Skip to content
Snippets Groups Projects
Commit 0910a960 authored by ale's avatar ale
Browse files

streamline metadata processing using interfaces

parent 30145e43
No related branches found
No related tags found
No related merge requests found
...@@ -14,6 +14,12 @@ import ( ...@@ -14,6 +14,12 @@ import (
"time" "time"
) )
// googleBooksRefiner is the metadata refiner backed by Google Books.
// It carries no state.
type googleBooksRefiner struct{}

// Name returns the short identifier of this refiner, which is used to
// label it in log output.
func (r *googleBooksRefiner) Name() string {
	const name = "gbooks"
	return name
}
type atomResultEntry struct { type atomResultEntry struct {
GoogleId string `xml:"id"` GoogleId string `xml:"id"`
Title string `xml:"http://purl.org/dc/terms title"` Title string `xml:"http://purl.org/dc/terms title"`
...@@ -142,7 +148,7 @@ func googleBooksGet(uri string) (*http.Response, error) { ...@@ -142,7 +148,7 @@ func googleBooksGet(uri string) (*http.Response, error) {
return nil, errors.New("deadline exceeded") return nil, errors.New("deadline exceeded")
} }
func LookupGoogleBooks(m *Metadata) ([]*Metadata, error) { func (r *googleBooksRefiner) Lookup(m *Metadata) ([]*Metadata, error) {
qstr := googleBooksQuery(m) qstr := googleBooksQuery(m)
if qstr == "" { if qstr == "" {
return nil, errors.New("insufficient metadata for query") return nil, errors.New("insufficient metadata for query")
...@@ -173,7 +179,7 @@ var imageUnavailableMD5 = [16]byte{ ...@@ -173,7 +179,7 @@ var imageUnavailableMD5 = [16]byte{
0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad, 0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57, 0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad, 0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57,
} }
func GetGoogleBooksCover(m *Metadata) ([]byte, error) { func (r *googleBooksRefiner) GetBookCover(m *Metadata) ([]byte, error) {
gbid := getGoogleBooksId(m) gbid := getGoogleBooksId(m)
if gbid == "" { if gbid == "" {
return nil, errors.New("no ID") return nil, errors.New("no ID")
......
...@@ -11,6 +11,22 @@ import ( ...@@ -11,6 +11,22 @@ import (
"github.com/meskio/epubgo" "github.com/meskio/epubgo"
) )
// MetadataProvider generates metadata for a book from the local
// filesystem.
type MetadataProvider interface {
	// Name returns a short identifier for the provider, used to label
	// it in log output.
	Name() string

	// Lookup returns the metadata found for the book file at path,
	// whose type is filetype. A nil result with a nil error means the
	// provider has nothing to offer for this file.
	Lookup(storage *FileStorage, path string, filetype string) (*Metadata, error)

	// GetBookCover returns the path of a cover image for the book
	// file at path, or an empty string when none is available.
	GetBookCover(storage *FileStorage, path string) (string, error)
}
// MetadataRefiner improves on existing metadata and may provide more
// than one result to choose from. It usually involves talking to a
// remote service.
type MetadataRefiner interface {
	// Name returns a short identifier for the refiner, used to label
	// it in log output.
	Name() string

	// Lookup returns the candidate metadata records matching m. More
	// than one candidate may be returned, in which case the caller
	// has to pick one.
	Lookup(m *Metadata) ([]*Metadata, error)

	// GetBookCover returns the raw cover image data for the book
	// described by m.
	GetBookCover(m *Metadata) ([]byte, error)
}
type MetadataSource struct { type MetadataSource struct {
Name string Name string
ID string ID string
...@@ -30,10 +46,18 @@ type Metadata struct { ...@@ -30,10 +46,18 @@ type Metadata struct {
} }
// Sufficient returns true if the object contains enough information. // Sufficient returns true if the object contains enough information.
// If this check does not pass, the book won't be added to the database.
func (m *Metadata) Sufficient() bool { func (m *Metadata) Sufficient() bool {
return m.Title != "" return m.Title != ""
} }
// Complete reports whether the information about this book is good
// enough that remote checks can be skipped: the metadata must have a
// title, at least one creator and at least one ISBN.
func (m *Metadata) Complete() bool {
	if m.Title == "" {
		return false
	}
	if len(m.Creator) == 0 {
		return false
	}
	return len(m.ISBN) != 0
}
// Uniques returns the list of possible unique tokens for this book. // Uniques returns the list of possible unique tokens for this book.
func (m *Metadata) Uniques() []string { func (m *Metadata) Uniques() []string {
var out []string var out []string
...@@ -220,23 +244,45 @@ func parseAnything(filename string) (*Metadata, error) { ...@@ -220,23 +244,45 @@ func parseAnything(filename string) (*Metadata, error) {
}, nil }, nil
} }
func Parse(filename string) (*Book, string, error) { type fileProvider struct{}
func (p *fileProvider) Lookup(storage *FileStorage, path, filetype string) (*Metadata, error) {
path = storage.Abs(path)
var m *Metadata var m *Metadata
var err error var err error
ext := strings.ToLower(filepath.Ext(filename)) switch filetype {
switch ext {
case ".epub": case ".epub":
m, err = parseEpub(filename) m, err = parseEpub(path)
case ".mobi": case ".mobi":
m, err = parseMobi(filename) m, err = parseMobi(path)
case ".pdf": case ".pdf":
m, err = parseAnything(filename) m, err = parseAnything(path)
default: default:
return nil, "", errors.New("unsupported file format") return nil, errors.New("unsupported file format")
} }
if err != nil { if err != nil {
return nil, "", err return nil, err
} }
return &Book{Metadata: m}, ext, nil return m, nil
}
// GetBookCover returns the path of the cover image saved next to the
// book (the book path with ".cover.png" appended) if it exists in
// storage, and an empty string otherwise. The returned error is
// always nil.
func (p *fileProvider) GetBookCover(storage *FileStorage, path string) (string, error) {
	candidate := path + ".cover.png"
	if !storage.Exists(candidate) {
		return "", nil
	}
	return candidate, nil
}
// Name returns the short identifier of this provider, which is used
// to label it in log output.
func (p *fileProvider) Name() string {
	const name = "file"
	return name
}
// GetFileType returns the lower-cased extension of path, including
// the leading dot, when it is one of the supported ebook formats
// (".epub", ".mobi" or ".pdf"). Any other extension, including a
// missing one, yields an "unsupported file format" error and an empty
// file type.
func GetFileType(path string) (string, error) {
	// Extensions are matched case-insensitively, so "Book.EPUB" is
	// reported as ".epub".
	filetype := strings.ToLower(filepath.Ext(path))
	switch filetype {
	case ".epub", ".mobi", ".pdf":
		return filetype, nil
	default:
		return "", errors.New("unsupported file format")
	}
}
...@@ -82,3 +82,28 @@ func opfMetadataPath(epubPath string) string { ...@@ -82,3 +82,28 @@ func opfMetadataPath(epubPath string) string {
func opfCoverPath(epubPath string) string { func opfCoverPath(epubPath string) string {
return filepath.Join(filepath.Dir(epubPath), "cover.jpg") return filepath.Join(filepath.Dir(epubPath), "cover.jpg")
} }
// opfProvider is the metadata provider that reads the OPF metadata
// file associated with a book. It carries no state.
type opfProvider struct{}

// Lookup returns the metadata parsed from the OPF file associated
// with the book at path. When no such file exists in storage it
// returns a nil result and a nil error. The filetype argument is not
// used.
func (p *opfProvider) Lookup(storage *FileStorage, path, filetype string) (*Metadata, error) {
	if storage.Exists(opfMetadataPath(path)) {
		// The existence check uses the path as given; opening the
		// file uses the absolute form obtained from storage.
		meta, err := opfOpen(opfMetadataPath(storage.Abs(path)))
		if err != nil {
			return nil, err
		}
		return meta, nil
	}
	return nil, nil
}
// GetBookCover returns the cover image path derived from the book
// path by opfCoverPath if that file exists in storage, and an empty
// string otherwise. The returned error is always nil.
func (p *opfProvider) GetBookCover(storage *FileStorage, path string) (string, error) {
	candidate := opfCoverPath(path)
	if !storage.Exists(candidate) {
		return "", nil
	}
	return candidate, nil
}
// Name returns the short identifier of this provider, which is used
// to label it in log output.
func (p *opfProvider) Name() string {
	const name = "opf"
	return name
}
...@@ -42,9 +42,11 @@ type fileAndBook struct { ...@@ -42,9 +42,11 @@ type fileAndBook struct {
} }
type updateContext struct { type updateContext struct {
db *Database db *Database
storage *FileStorage storage *FileStorage
chooser MetadataChooserFunc chooser MetadataChooserFunc
providers []MetadataProvider
refiners []MetadataRefiner
} }
func (uc *updateContext) dbFileScanner(fileCh chan fileData) { func (uc *updateContext) dbFileScanner(fileCh chan fileData) {
...@@ -151,52 +153,94 @@ func (uc *updateContext) extractor(fileCh chan fileData, outCh chan fileAndBook) ...@@ -151,52 +153,94 @@ func (uc *updateContext) extractor(fileCh chan fileData, outCh chan fileAndBook)
} }
func (uc *updateContext) parseMeta(f fileData) (*Book, string, error) { func (uc *updateContext) parseMeta(f fileData) (*Book, string, error) {
// Attempt direct metadata extraction. filetype, err := GetFileType(f.path)
book, filetype, err := Parse(uc.storage.Abs(f.path))
if err != nil { if err != nil {
return nil, "", err return nil, "", err
} }
// Check if a Calibre OPF file exists. // Attempt metadata extraction from the providers. The first
if opfmeta, err := opfOpen(opfMetadataPath(uc.storage.Abs(f.path))); err == nil { // match returned stops the iteration. At the same time, look
book.Metadata.Merge(opfmeta) // for a cover image until one is found.
} else { var meta *Metadata
// No local metadata, use Google Books to retrieve var coverPath string
// more information on the book. Ask the user to
// choose in case there are multiple results. for _, provider := range uc.providers {
candidates, err := LookupGoogleBooks(book.Metadata) if meta == nil {
if err == nil && len(candidates) > 0 { meta, err = provider.Lookup(uc.storage, f.path, filetype)
if len(candidates) == 1 { if err != nil {
log.Printf("found Google Books match: %s", candidates[0].String()) log.Printf("%s: %s: could not parse: %v", f.path, provider.Name(), err)
book.Metadata.Merge(candidates[0]) } else if meta != nil {
} else if uc.chooser != nil { log.Printf("%s: identified by: %s", f.path, provider.Name())
if userchoice := uc.chooser(f.path, candidates); userchoice != nil { }
book.Metadata.Merge(userchoice) }
if coverPath == "" {
coverPath, err = provider.GetBookCover(uc.storage, f.path)
if err != nil {
log.Printf("%s: %s: could not fetch cover image at %s", f.path, provider.Name(), err)
} else if coverPath != "" {
log.Printf("%s: cover image found by: %s", f.path, provider.Name())
}
}
}
if meta == nil {
return nil, "", errors.New("no metadata could be identified")
}
// If the book cover couldn't be found locally, prepare to
// download it. It's possible that we've already done this, so
// check in the storage first (TODO: this check isn't useful,
// if the cover exists it should have been emitted by the
// fileProvider above).
localCoverPath := f.path + ".cover.png"
if coverPath == "" && uc.storage.Exists(localCoverPath) {
coverPath = localCoverPath
}
// Only run remote checks if the metadata isn't complete.
if !meta.Complete() {
// Integrate metadata using the refiners. We check them all,
// and merge their results into the metadata object. The user
// is prompted if a choice is necessary. Search for a book
// cover only until one is found.
for _, refiner := range uc.refiners {
candidates, err := refiner.Lookup(meta)
if err == nil && len(candidates) > 0 {
if len(candidates) == 1 {
log.Printf("found match from %s: %s", refiner.Name(), candidates[0].String())
meta.Merge(candidates[0])
} else if uc.chooser != nil {
if userchoice := uc.chooser(f.path, candidates); userchoice != nil {
meta.Merge(userchoice)
}
}
}
if coverPath == "" {
if coverData, err := refiner.GetBookCover(meta); err == nil {
if imgf, err := os.Create(uc.storage.Abs(localCoverPath)); err != nil {
log.Printf("Error saving cover image: %v", err)
} else {
imgf.Write(coverData)
imgf.Close()
coverPath = localCoverPath
}
} }
} }
} }
} }
// Check if the book metadata looks ok. If not, don't even // Check if the book metadata looks ok. If not, don't even
// bother looking for a cover image. // bother looking for a cover image.
if !book.Metadata.Sufficient() { if !meta.Sufficient() {
return nil, "", errors.New("insufficient metadata") return nil, "", errors.New("insufficient metadata")
} }
// Try to find a cover image. Look on the local filesystem // Create a Book with no ID (yet).
// first, otherwise check Google Books. book := &Book{
localCoverPath := opfCoverPath(f.path) Metadata: meta,
if uc.storage.Exists(localCoverPath) { CoverPath: coverPath,
book.CoverPath = localCoverPath
} else if imageData, err := GetGoogleBooksCover(book.Metadata); err == nil {
imageFileName := f.path + ".cover.png"
if imgf, err := os.Create(uc.storage.Abs(imageFileName)); err != nil {
log.Printf("Could not save cover image for %d: %v", book.Id, err)
} else {
imgf.Write(imageData)
imgf.Close()
book.CoverPath = imageFileName
}
} }
return book, filetype, nil return book, filetype, nil
...@@ -251,6 +295,19 @@ func (db *Database) Update(dir string, chooser MetadataChooserFunc) { ...@@ -251,6 +295,19 @@ func (db *Database) Update(dir string, chooser MetadataChooserFunc) {
db: db, db: db,
chooser: chooser, chooser: chooser,
storage: NewFileStorage(dir), storage: NewFileStorage(dir),
// Calibre/OPF must be first, so we don't attempt to
// parse the file itself.
providers: []MetadataProvider{
&opfProvider{},
&fileProvider{},
},
// Check Google Books when the metadata is not
// sufficient to fully describe the book.
refiners: []MetadataRefiner{
&googleBooksRefiner{},
},
} }
var wg sync.WaitGroup var wg sync.WaitGroup
......
...@@ -20,6 +20,30 @@ func createTestFs(fs map[string]string) string { ...@@ -20,6 +20,30 @@ func createTestFs(fs map[string]string) string {
return base return base
} }
// checkDbPathIntegrity verifies that the paths stored in the database
// are relative. It scans every record in FileBucket and BookBucket
// and reports a test error for each file path or book cover path that
// starts with "/". A record that cannot be decoded aborts the test.
func checkDbPathIntegrity(t *testing.T, db *Database) {
	// Files should have relative paths.
	for i := db.Scan(FileBucket); i.Valid(); i.Next() {
		var f File
		if err := i.Value(&f); err != nil {
			t.Fatal(err)
		}
		if strings.HasPrefix(f.Path, "/") {
			t.Errorf("file has absolute path: %v", f.Path)
		}
	}

	// Book cover images should have relative paths. A book without a
	// cover has an empty CoverPath, which never matches the prefix.
	for i := db.Scan(BookBucket); i.Valid(); i.Next() {
		var b Book
		if err := i.Value(&b); err != nil {
			t.Fatal(err)
		}
		if strings.HasPrefix(b.CoverPath, "/") {
			t.Errorf("book cover has absolute path: %v", b.CoverPath)
		}
	}
}
func TestDatabase_Update(t *testing.T) { func TestDatabase_Update(t *testing.T) {
util.WalkerDefaultMinSize = 0 util.WalkerDefaultMinSize = 0
...@@ -59,30 +83,26 @@ func TestDatabase_Update(t *testing.T) { ...@@ -59,30 +83,26 @@ func TestDatabase_Update(t *testing.T) {
db.Update(tmpdir, chooser) db.Update(tmpdir, chooser)
testDb("second update") testDb("second update")
if chooserCalled {
t.Errorf("chooser function was called")
}
// Check that the test file is there. // Check that the test file is there.
if _, err := db.GetFile("book/Test Ebook.pdf"); err != nil { if _, err := db.GetFile("book/Test Ebook.pdf"); err != nil {
t.Errorf("test file is not in the database") t.Errorf("test file is not in the database")
} }
// Files should have relative paths. checkDbPathIntegrity(t, db)
for i := db.Scan(FileBucket); i.Valid(); i.Next() { }
var f File
if err := i.Value(&f); err != nil {
t.Fatal(err)
}
if strings.HasPrefix(f.Path, "/") {
t.Errorf("file has absolute path: %v", f.Path)
}
}
// Book cover images should have relative paths. func TestDatabase_UpdateEpub(t *testing.T) {
for i := db.Scan(BookBucket); i.Valid(); i.Next() { util.WalkerDefaultMinSize = 0
var b Book
if err := i.Value(&b); err != nil { td, db := newTestDatabase(t)
t.Fatal(err) defer td.Close()
}
if b.CoverPath != "" && strings.HasPrefix(b.CoverPath, "/") { // Read the test epub from testdata/.
t.Errorf("file has absolute path: %v", b.CoverPath) db.Update("testdata", nil)
}
} checkDbPathIntegrity(t, db)
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment