diff --git a/database.go b/database.go index 1c92cac505749129aec4ac3d1b41c5ea860f8b28..a6c4a77a90043771fe74ef3f053f9c32cd0b4ef4 100644 --- a/database.go +++ b/database.go @@ -36,6 +36,25 @@ func (id BookId) Key() []byte { return buf.Bytes() } +type Book struct { + Id BookId + CoverPath string + Metadata *Metadata +} + +type File struct { + Path string + FileType string + Mtime time.Time + Size int64 + Error bool + Id BookId +} + +func (f *File) HasChanged(info os.FileInfo) bool { + return !info.ModTime().Equal(f.Mtime) || info.Size() != f.Size +} + func init() { // Seed the RNG to a random value. var seed int64 @@ -52,27 +71,52 @@ func ParseID(s string) BookId { return BookId(id) } +// The structure that gets actually indexed. +type flatBook struct { + Title string `json:"title"` + Author []string `json:"author"` + Description string `json:"description"` + ISBN []string `json:"isbn"` + Unique []string `json:"_unique"` +} + +func (f *flatBook) Type() string { + return "ebook" +} + +func flatten(book *Book) *flatBook { + return &flatBook{ + Title: book.Metadata.Title, + Author: book.Metadata.Creator, + Description: book.Metadata.Description, + ISBN: book.Metadata.ISBN, + Unique: book.Metadata.Uniques(), + } +} + +var defaultTextAnalyzer = "standard" + func metadataDocumentMapping() *bleve.DocumentMapping { md := bleve.NewDocumentMapping() - titleFieldMapping := bleve.NewTextFieldMapping() - titleFieldMapping.Analyzer = "en" - titleFieldMapping.Store = false - md.AddFieldMappingsAt("Title", titleFieldMapping) + textFieldMapping := bleve.NewTextFieldMapping() + textFieldMapping.Store = false + textFieldMapping.Analyzer = defaultTextAnalyzer authorFieldMapping := bleve.NewTextFieldMapping() authorFieldMapping.Store = false - md.AddFieldMappingsAt("Creator", authorFieldMapping) + authorFieldMapping.Analyzer = "simple" - nostoreFieldMapping := bleve.NewTextFieldMapping() - nostoreFieldMapping.Store = false - nostoreFieldMapping.IncludeInAll = false - 
md.AddFieldMappingsAt("Description", nostoreFieldMapping) - md.AddFieldMappingsAt("ISBN", nostoreFieldMapping) + keywordFieldMapping := bleve.NewTextFieldMapping() + keywordFieldMapping.Store = false + keywordFieldMapping.Analyzer = "keyword" + keywordFieldMapping.IncludeInAll = false - for _, ignore := range []string{"Sources", "Date", "Publisher", "Format", "Keywords", "Language"} { - md.AddSubDocumentMapping(ignore, bleve.NewDocumentDisabledMapping()) - } + md.AddFieldMappingsAt("title", textFieldMapping) + md.AddFieldMappingsAt("author", authorFieldMapping) + md.AddFieldMappingsAt("description", textFieldMapping) + md.AddFieldMappingsAt("isbn", keywordFieldMapping) + md.AddFieldMappingsAt("_unique", keywordFieldMapping) return md } @@ -80,34 +124,11 @@ func metadataDocumentMapping() *bleve.DocumentMapping { func defaultIndexMapping() *bleve.IndexMapping { i := bleve.NewIndexMapping() i.AddDocumentMapping("ebook", metadataDocumentMapping()) - i.DefaultAnalyzer = "en" + i.DefaultAnalyzer = defaultTextAnalyzer + i.DefaultType = "ebook" return i } -type Book struct { - Id BookId - // Path string - CoverPath string - Metadata *Metadata -} - -func (b *Book) Type() string { - return "ebook" -} - -type File struct { - Path string - FileType string - Mtime time.Time - Size int64 - Error bool - Id BookId -} - -func (f *File) HasChanged(info os.FileInfo) bool { - return !info.ModTime().Equal(f.Mtime) || info.Size() != f.Size -} - type Database struct { leveldb *levigo.DB leveldbCache *levigo.Cache @@ -219,7 +240,7 @@ func (db *Database) PutBook(b *Book) error { if err := db.Put(BookBucket, b.Id.Key(), b); err != nil { return err } - return db.index.Index(b.Id.String(), b.Metadata) + return db.index.Index(b.Id.String(), flatten(b)) } func fileBookKey(path string, bookid BookId) []byte { @@ -356,35 +377,17 @@ func (db *Database) Autocomplete(term string) (*SearchResult, error) { } // Find a book matching the given metadata, if possible. 
-func (db *Database) Find(m *Metadata) (*Book, error) { +func (db *Database) Find(uniqueIds []string) (*Book, error) { + var queries []bleve.Query var query bleve.Query - if len(m.ISBN) > 0 { - var queries []bleve.Query - for _, isbn := range m.ISBN { - q := bleve.NewTermQuery(isbn) - q.SetField("ISBN") - queries = append(queries, q) - } + for _, u := range uniqueIds { + queries = append(queries, bleve.NewTermQuery(u).SetField("_unique")) + } + if len(queries) > 0 { query = bleve.NewDisjunctionQuery(queries) } else { - var queries []bleve.Query - if m.Title != "" { - q := bleve.NewMatchQuery(m.Title) - q.SetField("Title") - queries = append(queries, q) - } - if len(m.Creator) > 0 { - for _, a := range m.Creator { - q := bleve.NewMatchQuery(a) - q.SetField("Creator") - queries = append(queries, q) - } - } - if len(queries) == 0 { - return nil, errors.New("insufficient metadata for query") - } - query = bleve.NewConjunctionQuery(queries) + return nil, errors.New("insufficient metadata for query") } search := bleve.NewSearchRequest(query) @@ -392,16 +395,11 @@ func (db *Database) Find(m *Metadata) (*Book, error) { if err != nil { return nil, err } - for _, r := range result.Hits { - book, err := db.GetBook(ParseID(r.ID)) - if err != nil { - continue - } - if book.Metadata.Equals(m) { - return book, nil - } + if len(result.Hits) == 0 { + return nil, errors.New("no matches found") } - return nil, errors.New("no matches found") + + return db.GetBook(ParseID(result.Hits[0].ID)) } func bktToKey(bucket, key []byte) []byte { diff --git a/database_test.go b/database_test.go index 7e5b17742b11feb64e3ccbb05691f420f212e2f0..4d207364c0b480814ea0da6bf5fbf5966a6b5830 100644 --- a/database_test.go +++ b/database_test.go @@ -5,6 +5,8 @@ import ( "io/ioutil" "os" "testing" + + "github.com/blevesearch/bleve" ) type testDatabase struct { @@ -26,16 +28,25 @@ func newTestDatabase(t *testing.T) (*testDatabase, *Database) { } book := testEbook() - if err = db.PutBook(book); err != nil { + if err := db.PutBook(book); err != nil { 
t.Fatalf("PutBook(): %v", err) } - if err = db.PutFile(testEpubFile(path, book.Id)); err != nil { + if err := db.PutFile(testEpubFile(path, book.Id)); err != nil { t.Fatalf("PutFile(): %v", err) } return &testDatabase{db: db, path: path, refbookid: book.Id}, db } +func newTestDatabase2(t *testing.T) (*testDatabase, *Database) { + td, db := newTestDatabase(t) + book := testEbook2() + if err := db.PutBook(book); err != nil { + t.Fatalf("PutBook(): %v", err) + } + return td, db +} + func testEpubFile(dir string, bookid BookId) *File { f, _ := ioutil.TempFile(dir, "ebook-") io.WriteString(f, "epub\n") @@ -52,14 +63,26 @@ func testEbook() *Book { return &Book{ Id: NewID(), Metadata: &Metadata{ - Title: "20,000 Leagues under the sea", + Title: "Twenty Thousand Leagues Under The Sea", Creator: []string{"Jules Verne"}, - ISBN: []string{"1234"}, + ISBN: []string{"1234", "2345"}, Description: "A pretty cool book.", }, } } +func testEbook2() *Book { + return &Book{ + Id: NewID(), + Metadata: &Metadata{ + Title: "Around The World In Eighty Days", + Creator: []string{"Jules Verne"}, + ISBN: []string{"5678"}, + Description: "It's about balloons.", + }, + } +} + func TestDatabase_Get(t *testing.T) { td, db := newTestDatabase(t) defer td.Close() @@ -109,50 +132,100 @@ func TestDatabase_BookFileRelation(t *testing.T) { } } -func TestDatabase_Search(t *testing.T) { - td, db := newTestDatabase(t) +func TestDatabase_BleveLowLevelSearch(t *testing.T) { + td, db := newTestDatabase2(t) defer td.Close() - r, err := db.Search("jules verne", 0, 100) - if err != nil { - t.Fatal(err) - } - if r.NumResults != 1 { - t.Fatalf("NumResults != 1 - %#v", r) - } - if len(r.Results) != 1 { - t.Fatalf("len(Results) != 1 - %#v", r) + doSearch := func(tag string, query bleve.Query, numExpectedResults int) { + req := bleve.NewSearchRequestOptions(query, 100, 0, false) + result, err := db.index.Search(req) + if err != nil { + t.Fatalf("%s: %v", tag, err) + } + if int(result.Total) != numExpectedResults { + 
t.Errorf("%s: got %d results, expected %d", tag, result.Total, numExpectedResults) + } } - r, err = db.Search("italo calvino", 0, 100) - if err != nil { - t.Fatal(err) - } - if r.NumResults > 0 { - t.Fatalf("unexpected results: %v", r) + doSearch("match_query", + bleve.NewMatchQuery("Leagues Under The Sea"), + 1) + doSearch("match_query_with_field", + bleve.NewMatchQuery("Leagues Under The Sea").SetField("title"), + 1) + doSearch("match_query_with_field_2", + bleve.NewMatchQuery("Jules Verne").SetField("author"), + 2) + //doSearch("match_query_with_field_2_AND", + // bleve.NewMatchQuery("Hugo Verne").SetField("author"), + // 0) + doSearch("match_query_with_wrong_field", + bleve.NewMatchQuery("Leagues Under The Sea").SetField("author"), + 0) + doSearch("query_string_precise", + bleve.NewQueryStringQuery("+title:Leagues +author:verne"), + 1) + doSearch("isbn_term_query", + bleve.NewTermQuery("1234").SetField("isbn"), + 1) + doSearch("isbn_term_query_2", + bleve.NewTermQuery("2345").SetField("isbn"), + 1) +} + +func TestDatabase_Search(t *testing.T) { + td, db := newTestDatabase2(t) + defer td.Close() + + doSearch := func(query string, numExpectedResults int) { + r, err := db.Search(query, 0, 100) + if err != nil { + t.Fatalf("Search(%s) failed: %v", query, err) + } + if r.NumResults != numExpectedResults { + t.Errorf("Search(%s): got %d results, expecting %d\n%#v", query, r.NumResults, numExpectedResults, r.Results) + } } + + doSearch("jules verne", 2) + doSearch("italo calvino", 0) + + doSearch("Twenty Thousand Leagues", 1) + doSearch("\"Twenty Thousand Leagues\"", 1) + doSearch("title:\"Twenty Thousand Leagues\"", 1) + doSearch("author:\"Jules Verne\"", 2) + + doSearch("author:verne", 2) + doSearch("+title:Leagues +author:Verne", 1) + doSearch("title:verne", 0) + doSearch("author:vernes", 0) } func TestDatabase_Find(t *testing.T) { - td, db := newTestDatabase(t) + td, db := newTestDatabase2(t) defer td.Close() + book, _ := db.GetBook(td.refbookid) + m := 
book.Metadata // Find using ISBN. - book := testEbook() - if _, err := db.Find(book.Metadata); err != nil { - t.Fatal("With ISBN: ", err) + if result, err := db.Find(m.Uniques()); err != nil { + t.Errorf("With ISBN: %v", err) + } else if result.Id != book.Id { + t.Errorf("Bad match with ISBN: got=%d, expected=%d", result.Id, book.Id) } // Find using title/author. - book.Metadata.ISBN = nil - if _, err := db.Find(book.Metadata); err != nil { - t.Fatal("With title/author: ", err) + m.ISBN = nil + if result, err := db.Find(m.Uniques()); err != nil { + t.Errorf("With title/author: %v", err) + } else if result.Id != book.Id { + t.Errorf("Bad match with title/author: got=%d, expected=%d", result.Id, book.Id) } // Find only using title (should fail). - book.Metadata.Creator = nil - if result, err := db.Find(book.Metadata); err == nil { - t.Fatalf("Title only: no error, result = %v", result) + m.Creator = nil + if result, err := db.Find(m.Uniques()); err == nil { + t.Errorf("Title only: no error, result = %v", result) } } @@ -160,7 +233,7 @@ func TestDatabase_Autocomplete(t *testing.T) { td, db := newTestDatabase(t) defer td.Close() - r, err := db.Autocomplete("jul") + r, err := db.Autocomplete("jules") if err != nil { t.Fatal(err) } diff --git a/htdocs/templates/book.html b/htdocs/templates/book.html index 00d50c8917802c5e8bbf10562cff1212d6b2041d..aa5d2fa550139334e868e21547712ae3f1812077 100644 --- a/htdocs/templates/book.html +++ b/htdocs/templates/book.html @@ -14,7 +14,7 @@ </h1> <p class="book-authors"> - {{range $i, $a := .Book.Metadata.Creator}}{{if gt $i 0}}, {{end}}<a href="/search?q=Author%3A{{$a}}"><b>{{$a}}</b></a>{{end}} + {{range $i, $a := .Book.Metadata.Creator}}{{if gt $i 0}}, {{end}}<a href="/search?q=author%3A%22{{$a}}%22"><b>{{$a}}</b></a>{{end}} </p> <p> diff --git a/metadata.go b/metadata.go index 4746bd3d0d370dc42585711ce59b9d9a471e0e0d..b90d6aa9d564d68a6d810546abefa51a0af0648c 100644 --- a/metadata.go +++ b/metadata.go @@ -46,30 +46,6 @@ func (m 
*Metadata) Uniques() []string { return out } -func parseUniqueId(u string) (*Metadata, error) { - var m Metadata - p := strings.SplitN(u, ":", 2) - if len(p) < 2 { - return nil, errors.New("unparseable unique id") - } - switch p[0] { - case "isbn": - m.ISBN = []string{p[1]} - case "sig": - sp := strings.Split(p[1], "|") - if len(sp) < 2 { - return nil, errors.New("unparseable 'sig' unique id") - } - m.Title = sp[0] - if sp[1] != "" { - m.Creator = sp[1:len(sp)] - } - default: - return nil, errors.New("unknown unique id type") - } - return &m, nil -} - func listsOverlap(a, b []string) bool { for _, aa := range a { for _, bb := range b { diff --git a/sync.go b/sync.go index 7b4ce8464ebae72628a69194f0691ddfabf84b39..1e27496fd0dd170b87261811709ca0146eceb506 100644 --- a/sync.go +++ b/sync.go @@ -235,21 +235,9 @@ func (l *syncServer) handleDiffRequest(w http.ResponseWriter, req *http.Request) var resp diffResponse for _, c := range diffreq.Candidates { - // For every unique ID, decode it into a template - // Metadata object and see if we can find a match in - // the database. - found := false - for _, unique := range c.Unique { - meta, err := parseUniqueId(unique) - if err != nil { - continue - } - if _, err := l.db.Find(meta); err == nil { - found = true - break - } - } - if !found { + // Search for matches in the database by looking for + // unique ids. + if _, err := l.db.Find(c.Unique); err != nil { resp.Missing = append(resp.Missing, c.Id) } } @@ -274,7 +262,7 @@ func (l *syncServer) handleSyncUpload(w http.ResponseWriter, req *http.Request) } // Check again that we don't have this book. 
- if _, err := l.db.Find(&md); err == nil { + if _, err := l.db.Find(md.Uniques()); err == nil { log.Printf("attempt to upload duplicate: %#v", &md) http.Error(w, "Duplicate", http.StatusConflict) return diff --git a/sync_test.go b/sync_test.go index 82e66ba02638b8a034530d5f6c39c76f927ced98..aeb3cc7018b34b5d8ce0f36c80b51422896d5aaa 100644 --- a/sync_test.go +++ b/sync_test.go @@ -59,7 +59,7 @@ func TestSync_Sync(t *testing.T) { metatmpl := &Metadata{ ISBN: []string{strconv.Itoa(i + 1)}, } - if _, err := db2.Find(metatmpl); err != nil { + if _, err := db2.Find(metatmpl.Uniques()); err != nil { t.Errorf("Book %d missing from db2: %v", i+1, err) } } diff --git a/update.go b/update.go index 80688094acc3e44f5d50b49cd3f62207ed331a7e..46035cb294d231f8b16daf1cfdbff9193ca16171 100644 --- a/update.go +++ b/update.go @@ -203,7 +203,7 @@ func dbwriter(db *Database, ch chan fileAndBook) { // existing book. if pair.f.id == 0 { log.Printf("potential new book: %#v", pair.b.Metadata) - if match, err := db.Find(pair.b.Metadata); err == nil { + if match, err := db.Find(pair.b.Metadata.Uniques()); err == nil { log.Printf("%s matches existing book %d", pair.f.path, match.Id) // Ignore new metadata. 
pair.b = match diff --git a/update_test.go b/update_test.go new file mode 100644 index 0000000000000000000000000000000000000000..59b6c0dc97a72552d18d13aa2f89d377556dabc7 --- /dev/null +++ b/update_test.go @@ -0,0 +1,58 @@ +package liber + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "git.autistici.org/ale/liber/util" +) + +func createTestFs(fs map[string]string) string { + base, _ := ioutil.TempDir("", "test-fs-") + for path, contents := range fs { + path = filepath.Join(base, path) + os.MkdirAll(filepath.Dir(path), 0700) + ioutil.WriteFile(path, []byte(contents), 0700) + } + return base +} + +func TestDatabase_Update(t *testing.T) { + util.WalkerDefaultMinSize = 0 + + td, db := newTestDatabase(t) + defer td.Close() + + // Make the test book a pdf so we don't attempt to parse it. + tmpdir := createTestFs(map[string]string{ + "book/Test Ebook.pdf": "foo", + "book/metadata.opf": testOpf, + "book/cover.jpg": "jpeg", + }) + defer os.RemoveAll(tmpdir) + + chooserCalled := false + chooser := func(path string, choices []*Metadata) *Metadata { + chooserCalled = true + return nil + } + + testDb := func(tag string) { + // The test ebook added in newTestDatabase should not be there + // any more. + if _, err := db.GetBook(td.refbookid); err == nil { + t.Errorf("%s: test book still in database", tag) + } + // Test OPF ebook should have been found by Update. 
+ if result, err := db.Search("isbn:9781939293015", 0, 1); err != nil || result.NumResults != 1 { + t.Errorf("%s: new book not found in database", tag) + } + } + + db.Update(tmpdir, chooser) + testDb("first update") + db.Update(tmpdir, chooser) + testDb("second update") +} diff --git a/util/file_walk.go b/util/file_walk.go index 9190a0ef58e43700f87da03689a83bed675c60e2..b9e5f3e3cdc6791fbfbf4fbeb52f2e0d68f9da66 100644 --- a/util/file_walk.go +++ b/util/file_walk.go @@ -5,6 +5,8 @@ import ( "path/filepath" ) +var WalkerDefaultMinSize int64 = 4096 + type Walker struct { Exclude, Include []string MinSize int64 @@ -67,6 +69,6 @@ func (w *Walker) Walk(root string, walkFn filepath.WalkFunc) error { func NewDefaultWalker() *Walker { return &Walker{ Include: []string{"*.epub", "*.mobi", "*.pdf"}, - MinSize: 65535, + MinSize: WalkerDefaultMinSize, } } diff --git a/web_test.go b/web_test.go index 1562f98f16de3dd5c5467b3e8987824d7db21822..aafefe04fe3bf93f9a3491aa7ca84204ecadcb24 100644 --- a/web_test.go +++ b/web_test.go @@ -70,7 +70,7 @@ func TestWeb_Search(t *testing.T) { t.Fatalf("Bad HTTP response: %s\n%s", resp.Status, readTestResponseData(resp, t)) } data := readTestResponseData(resp, t) - if !strings.Contains(data, "20,000 Leagues") { + if !strings.Contains(data, "Twenty Thousand Leagues") { t.Fatalf("Response does not contain book title:\n%s", data) } } @@ -88,7 +88,7 @@ func TestWeb_ShowBook(t *testing.T) { t.Fatalf("Bad HTTP response: %s\n%s", resp.Status, readTestResponseData(resp, t)) } data := readTestResponseData(resp, t) - if !strings.Contains(data, "20,000 Leagues") { + if !strings.Contains(data, "Twenty Thousand Leagues") { t.Fatalf("Response does not contain book title:\n%s", data) } }