Commit 0910a960 authored by ale

streamline metadata processing using interfaces

parent 30145e43
......@@ -14,6 +14,12 @@ import (
"time"
)
// googleBooksRefiner is the Google Books metadata refiner. It carries
// no state of its own.
type googleBooksRefiner struct{}

// Name returns the short identifier of this refiner.
func (r *googleBooksRefiner) Name() string {
	const refinerName = "gbooks"
	return refinerName
}
type atomResultEntry struct {
GoogleId string `xml:"id"`
Title string `xml:"http://purl.org/dc/terms title"`
......@@ -142,7 +148,7 @@ func googleBooksGet(uri string) (*http.Response, error) {
return nil, errors.New("deadline exceeded")
}
func LookupGoogleBooks(m *Metadata) ([]*Metadata, error) {
func (r *googleBooksRefiner) Lookup(m *Metadata) ([]*Metadata, error) {
qstr := googleBooksQuery(m)
if qstr == "" {
return nil, errors.New("insufficient metadata for query")
......@@ -173,7 +179,7 @@ var imageUnavailableMD5 = [16]byte{
0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad, 0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57,
}
func GetGoogleBooksCover(m *Metadata) ([]byte, error) {
func (r *googleBooksRefiner) GetBookCover(m *Metadata) ([]byte, error) {
gbid := getGoogleBooksId(m)
if gbid == "" {
return nil, errors.New("no ID")
......
......@@ -11,6 +11,22 @@ import (
"github.com/meskio/epubgo"
)
// MetadataProvider generates metadata from the local filesystem.
type MetadataProvider interface {
	// Name returns a short identifier for the provider.
	Name() string

	// Lookup returns the metadata found for the file at path, whose
	// type is given by filetype. A nil *Metadata together with a nil
	// error means the provider has nothing to offer for this file.
	Lookup(storage *FileStorage, path, filetype string) (*Metadata, error)

	// GetBookCover returns the path of a cover image for the file at
	// path, or the empty string if none is available.
	GetBookCover(storage *FileStorage, path string) (string, error)
}
// MetadataRefiner improves on existing metadata and may provide more
// than one result to choose from. It usually involves talking to a
// remote service.
type MetadataRefiner interface {
	// Name returns a short identifier for the refiner.
	Name() string

	// Lookup returns the candidate matches found for meta. More than
	// one candidate may be returned.
	Lookup(meta *Metadata) ([]*Metadata, error)

	// GetBookCover returns the cover image data for the book
	// described by meta.
	GetBookCover(meta *Metadata) ([]byte, error)
}
type MetadataSource struct {
Name string
ID string
......@@ -30,10 +46,18 @@ type Metadata struct {
}
// Sufficient reports whether the object holds the minimum information
// required, which is a non-empty title. Books failing this check are
// not added to the database.
func (m *Metadata) Sufficient() bool {
	hasTitle := len(m.Title) > 0
	return hasTitle
}
// Complete reports whether the information about this book is good
// enough that remote checks can be skipped: it needs a title, at least
// one creator and at least one ISBN.
func (m *Metadata) Complete() bool {
	if m.Title == "" {
		return false
	}
	if len(m.Creator) == 0 {
		return false
	}
	return len(m.ISBN) != 0
}
// Uniques returns the list of possible unique tokens for this book.
func (m *Metadata) Uniques() []string {
var out []string
......@@ -220,23 +244,45 @@ func parseAnything(filename string) (*Metadata, error) {
}, nil
}
func Parse(filename string) (*Book, string, error) {
// fileProvider is the metadata provider named "file". It has no fields;
// its Lookup method parses the book file itself, dispatching on the
// file type.
type fileProvider struct{}
func (p *fileProvider) Lookup(storage *FileStorage, path, filetype string) (*Metadata, error) {
path = storage.Abs(path)
var m *Metadata
var err error
ext := strings.ToLower(filepath.Ext(filename))
switch ext {
switch filetype {
case ".epub":
m, err = parseEpub(filename)
m, err = parseEpub(path)
case ".mobi":
m, err = parseMobi(filename)
m, err = parseMobi(path)
case ".pdf":
m, err = parseAnything(filename)
m, err = parseAnything(path)
default:
return nil, "", errors.New("unsupported file format")
return nil, errors.New("unsupported file format")
}
if err != nil {
return nil, "", err
return nil, err
}
return &Book{Metadata: m}, ext, nil
return m, nil
}
// GetBookCover returns the storage path of the "<path>.cover.png"
// image that sits next to the book, or the empty string when storage
// has no such file. The returned error is always nil.
func (p *fileProvider) GetBookCover(storage *FileStorage, path string) (string, error) {
	candidate := path + ".cover.png"
	if !storage.Exists(candidate) {
		// No cover image has been saved for this book.
		return "", nil
	}
	return candidate, nil
}
// Name returns the short identifier of this provider.
func (p *fileProvider) Name() string {
	const providerName = "file"
	return providerName
}
// GetFileType returns the lowercased extension of path (including the
// leading dot) when it is one of the supported book formats: ".epub",
// ".mobi" or ".pdf". Any other extension yields an error.
func GetFileType(path string) (string, error) {
	switch ext := strings.ToLower(filepath.Ext(path)); ext {
	case ".epub", ".mobi", ".pdf":
		return ext, nil
	default:
		return "", errors.New("unsupported file format")
	}
}
......@@ -82,3 +82,28 @@ func opfMetadataPath(epubPath string) string {
// opfCoverPath returns the path of the "cover.jpg" file located in the
// same directory as epubPath.
func opfCoverPath(epubPath string) string {
	bookDir := filepath.Dir(epubPath)
	return filepath.Join(bookDir, "cover.jpg")
}
// opfProvider is the metadata provider named "opf". It has no fields;
// its methods look for a Calibre OPF metadata file and a cover image
// stored alongside the book.
type opfProvider struct{}
// Lookup reads the OPF metadata file associated with path. It returns
// (nil, nil) when storage holds no such file, and (nil, err) when the
// file exists but cannot be opened. The filetype argument is not used.
func (p *opfProvider) Lookup(storage *FileStorage, path, filetype string) (*Metadata, error) {
	if found := storage.Exists(opfMetadataPath(path)); !found {
		return nil, nil
	}

	// The existence check uses the storage-relative path, while the
	// file is opened through its absolute path.
	absOpf := opfMetadataPath(storage.Abs(path))
	meta, err := opfOpen(absOpf)
	if err != nil {
		return nil, err
	}
	return meta, nil
}
// GetBookCover returns the path given by opfCoverPath for the book at
// path, or the empty string when storage has no such file. The
// returned error is always nil.
func (p *opfProvider) GetBookCover(storage *FileStorage, path string) (string, error) {
	candidate := opfCoverPath(path)
	if !storage.Exists(candidate) {
		// No cover image next to the book.
		return "", nil
	}
	return candidate, nil
}
// Name returns the short identifier of this provider.
func (p *opfProvider) Name() string {
	const providerName = "opf"
	return providerName
}
......@@ -42,9 +42,11 @@ type fileAndBook struct {
}
type updateContext struct {
db *Database
storage *FileStorage
chooser MetadataChooserFunc
db *Database
storage *FileStorage
chooser MetadataChooserFunc
providers []MetadataProvider
refiners []MetadataRefiner
}
func (uc *updateContext) dbFileScanner(fileCh chan fileData) {
......@@ -151,52 +153,94 @@ func (uc *updateContext) extractor(fileCh chan fileData, outCh chan fileAndBook)
}
func (uc *updateContext) parseMeta(f fileData) (*Book, string, error) {
// Attempt direct metadata extraction.
book, filetype, err := Parse(uc.storage.Abs(f.path))
filetype, err := GetFileType(f.path)
if err != nil {
return nil, "", err
}
// Check if a Calibre OPF file exists.
if opfmeta, err := opfOpen(opfMetadataPath(uc.storage.Abs(f.path))); err == nil {
book.Metadata.Merge(opfmeta)
} else {
// No local metadata, use Google Books to retrieve
// more information on the book. Ask the user to
// choose in case there are multiple results.
candidates, err := LookupGoogleBooks(book.Metadata)
if err == nil && len(candidates) > 0 {
if len(candidates) == 1 {
log.Printf("found Google Books match: %s", candidates[0].String())
book.Metadata.Merge(candidates[0])
} else if uc.chooser != nil {
if userchoice := uc.chooser(f.path, candidates); userchoice != nil {
book.Metadata.Merge(userchoice)
// Attempt metadata extraction from the providers. The first
// match returned stops the iteration. At the same time, look
// for a cover image until one is found.
var meta *Metadata
var coverPath string
for _, provider := range uc.providers {
if meta == nil {
meta, err = provider.Lookup(uc.storage, f.path, filetype)
if err != nil {
log.Printf("%s: %s: could not parse: %v", f.path, provider.Name(), err)
} else if meta != nil {
log.Printf("%s: identified by: %s", f.path, provider.Name())
}
}
if coverPath == "" {
coverPath, err = provider.GetBookCover(uc.storage, f.path)
if err != nil {
log.Printf("%s: %s: could not fetch cover image at %s", f.path, provider.Name(), err)
} else if coverPath != "" {
log.Printf("%s: cover image found by: %s", f.path, provider.Name())
}
}
}
if meta == nil {
return nil, "", errors.New("no metadata could be identified")
}
// If the book cover couldn't be found locally, prepare to
// download it. It's possible that we've already done this, so
// check in the storage first (TODO: this check isn't useful,
// if the cover exists it should have been emitted by the
// fileProvider above).
localCoverPath := f.path + ".cover.png"
if coverPath == "" && uc.storage.Exists(localCoverPath) {
coverPath = localCoverPath
}
// Only run remote checks if the metadata isn't complete.
if !meta.Complete() {
// Integrate metadata using the refiners. We check them all,
// and merge their results into the metadata object. The user
// is prompted if a choice is necessary. Search for a book
// cover only until one is found.
for _, refiner := range uc.refiners {
candidates, err := refiner.Lookup(meta)
if err == nil && len(candidates) > 0 {
if len(candidates) == 1 {
log.Printf("found match from %s: %s", refiner.Name(), candidates[0].String())
meta.Merge(candidates[0])
} else if uc.chooser != nil {
if userchoice := uc.chooser(f.path, candidates); userchoice != nil {
meta.Merge(userchoice)
}
}
}
if coverPath == "" {
if coverData, err := refiner.GetBookCover(meta); err == nil {
if imgf, err := os.Create(uc.storage.Abs(localCoverPath)); err != nil {
log.Printf("Error saving cover image: %v", err)
} else {
imgf.Write(coverData)
imgf.Close()
coverPath = localCoverPath
}
}
}
}
}
// Check if the book metadata looks ok. If not, don't even
// bother looking for a cover image.
if !book.Metadata.Sufficient() {
if !meta.Sufficient() {
return nil, "", errors.New("insufficient metadata")
}
// Try to find a cover image. Look on the local filesystem
// first, otherwise check Google Books.
localCoverPath := opfCoverPath(f.path)
if uc.storage.Exists(localCoverPath) {
book.CoverPath = localCoverPath
} else if imageData, err := GetGoogleBooksCover(book.Metadata); err == nil {
imageFileName := f.path + ".cover.png"
if imgf, err := os.Create(uc.storage.Abs(imageFileName)); err != nil {
log.Printf("Could not save cover image for %d: %v", book.Id, err)
} else {
imgf.Write(imageData)
imgf.Close()
book.CoverPath = imageFileName
}
// Create a Book with no ID (yet).
book := &Book{
Metadata: meta,
CoverPath: coverPath,
}
return book, filetype, nil
......@@ -251,6 +295,19 @@ func (db *Database) Update(dir string, chooser MetadataChooserFunc) {
db: db,
chooser: chooser,
storage: NewFileStorage(dir),
// Calibre/OPF must be first, so we don't attempt to
// parse the file itself.
providers: []MetadataProvider{
&opfProvider{},
&fileProvider{},
},
// Check Google Books when the metadata is not
// sufficient to fully describe the book.
refiners: []MetadataRefiner{
&googleBooksRefiner{},
},
}
var wg sync.WaitGroup
......
......@@ -20,6 +20,30 @@ func createTestFs(fs map[string]string) string {
return base
}
// checkDbPathIntegrity verifies that the paths stored in db are
// relative: it scans every File record and every Book record and
// reports a test error for each file path or book cover path that
// starts with "/". A record that cannot be decoded aborts the test.
func checkDbPathIntegrity(t *testing.T, db *Database) {
	// Attribute failures to the calling test rather than this helper.
	t.Helper()

	// Files should have relative paths.
	for i := db.Scan(FileBucket); i.Valid(); i.Next() {
		var f File
		if err := i.Value(&f); err != nil {
			t.Fatal(err)
		}
		if strings.HasPrefix(f.Path, "/") {
			t.Errorf("file has absolute path: %v", f.Path)
		}
	}

	// Book cover images should have relative paths. An empty cover
	// path never has the "/" prefix, so it needs no separate check.
	for i := db.Scan(BookBucket); i.Valid(); i.Next() {
		var b Book
		if err := i.Value(&b); err != nil {
			t.Fatal(err)
		}
		if strings.HasPrefix(b.CoverPath, "/") {
			t.Errorf("book cover has absolute path: %v", b.CoverPath)
		}
	}
}
func TestDatabase_Update(t *testing.T) {
util.WalkerDefaultMinSize = 0
......@@ -59,30 +83,26 @@ func TestDatabase_Update(t *testing.T) {
db.Update(tmpdir, chooser)
testDb("second update")
if chooserCalled {
t.Errorf("chooser function was called")
}
// Check that the test file is there.
if _, err := db.GetFile("book/Test Ebook.pdf"); err != nil {
t.Errorf("test file is not in the database")
}
// Files should have relative paths.
for i := db.Scan(FileBucket); i.Valid(); i.Next() {
var f File
if err := i.Value(&f); err != nil {
t.Fatal(err)
}
if strings.HasPrefix(f.Path, "/") {
t.Errorf("file has absolute path: %v", f.Path)
}
}
checkDbPathIntegrity(t, db)
}
// Book cover images should have relative paths.
for i := db.Scan(BookBucket); i.Valid(); i.Next() {
var b Book
if err := i.Value(&b); err != nil {
t.Fatal(err)
}
if b.CoverPath != "" && strings.HasPrefix(b.CoverPath, "/") {
t.Errorf("file has absolute path: %v", b.CoverPath)
}
}
// TestDatabase_UpdateEpub runs a database update over the testdata
// directory, without a chooser function, and then checks that the
// stored paths are relative.
func TestDatabase_UpdateEpub(t *testing.T) {
	// NOTE(review): presumably lets the walker pick up small fixture
	// files; confirm against the util package.
	util.WalkerDefaultMinSize = 0

	tmp, db := newTestDatabase(t)
	defer tmp.Close()

	// Read the test epub from testdata/.
	const dataDir = "testdata"
	db.Update(dataDir, nil)

	checkDbPathIntegrity(t, db)
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment