Commit a86cd6dd authored by ale

various indexing fixes

Use a separate, simpler struct for indexing. Store unique IDs in the
index directly instead of parsing them back into a Metadata object.
Add more thorough tests.
parent bc535848
......@@ -36,6 +36,25 @@ func (id BookId) Key() []byte {
return buf.Bytes()
}
// Book is a library entry: an identifier, a cover path and the book's
// metadata.
type Book struct {
// Id identifies the book; PutBook uses it both as the database key
// (Id.Key()) and as the search-index document id (Id.String()).
Id BookId
// CoverPath is the path of the cover image (presumably a filesystem
// path — TODO confirm).
CoverPath string
// Metadata holds the descriptive data; flatten reads Title, Creator,
// Description, ISBN and Uniques() from it when the book is indexed.
Metadata *Metadata
}
// File records what is known about a single file of the library.
type File struct {
// Path is the location of the file.
Path string
// FileType names the file's format (exact set of values not visible
// here — TODO confirm).
FileType string
// Mtime and Size are the recorded modification time and size;
// HasChanged compares them against a fresh os.FileInfo.
Mtime time.Time
Size int64
// Error is a failure flag; NOTE(review): its precise meaning is not
// visible in this chunk — confirm against the code that sets it.
Error bool
// Id is a BookId, presumably that of the book this file belongs to —
// TODO confirm.
Id BookId
}
// HasChanged reports whether info no longer matches the recorded state of
// the file, i.e. whether its modification time or its size differ from the
// stored Mtime and Size.
func (f *File) HasChanged(info os.FileInfo) bool {
	if sameMtime := info.ModTime().Equal(f.Mtime); !sameMtime {
		return true
	}
	return info.Size() != f.Size
}
func init() {
// Seed the RNG to a random value.
var seed int64
......@@ -52,27 +71,52 @@ func ParseID(s string) BookId {
return BookId(id)
}
// flatBook is the structure that actually gets indexed: PutBook passes
// flatten(book) to the search index instead of the Book or its Metadata.
// The JSON tags are the lower-case names that metadataDocumentMapping
// attaches field mappings to.
type flatBook struct {
// Title is copied from Metadata.Title.
Title string `json:"title"`
// Author is copied from Metadata.Creator.
Author []string `json:"author"`
// Description is copied from Metadata.Description.
Description string `json:"description"`
// ISBN is copied from Metadata.ISBN.
ISBN []string `json:"isbn"`
// Unique holds the result of Metadata.Uniques(); Find queries it through
// the "_unique" field.
Unique []string `json:"_unique"`
}
// Type returns the document type name of an indexed book, "ebook", which is
// the name defaultIndexMapping registers the document mapping under.
func (fb *flatBook) Type() string {
	return "ebook"
}
// flatten builds the indexable flatBook for a book by copying the title,
// creators, description and ISBNs from its metadata and adding the
// metadata's unique ids. book.Metadata must not be nil.
func flatten(book *Book) *flatBook {
	md := book.Metadata
	flat := new(flatBook)
	flat.Title = md.Title
	flat.Author = md.Creator
	flat.Description = md.Description
	flat.ISBN = md.ISBN
	flat.Unique = md.Uniques()
	return flat
}
var defaultTextAnalyzer = "standard"
// metadataDocumentMapping builds the bleve document mapping that
// defaultIndexMapping registers for "ebook" documents. Every field mapping
// created here has Store set to false.
//
// NOTE(review): mappings are attached both under capitalised names
// ("Title", "Creator", "Description", "ISBN", plus the disabled
// sub-documents) and under the lower-case names matching flatBook's JSON
// tags. The capitalised ones look like leftovers from indexing Metadata
// directly — confirm whether they are still needed.
func metadataDocumentMapping() *bleve.DocumentMapping {
md := bleve.NewDocumentMapping()
// Text mapping with the "en" analyzer, attached to "Title".
titleFieldMapping := bleve.NewTextFieldMapping()
titleFieldMapping.Analyzer = "en"
titleFieldMapping.Store = false
md.AddFieldMappingsAt("Title", titleFieldMapping)
// Generic text mapping using the package default analyzer; attached to
// "title" and "description" below.
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.Store = false
textFieldMapping.Analyzer = defaultTextAnalyzer
// Author mapping. Note that it is attached to "Creator" before its
// analyzer is set to "simple"; the same mapping value is attached again
// to "author" below.
authorFieldMapping := bleve.NewTextFieldMapping()
authorFieldMapping.Store = false
md.AddFieldMappingsAt("Creator", authorFieldMapping)
authorFieldMapping.Analyzer = "simple"
// Mapping excluded from the composite "all" field (IncludeInAll = false),
// attached to "Description" and "ISBN".
nostoreFieldMapping := bleve.NewTextFieldMapping()
nostoreFieldMapping.Store = false
nostoreFieldMapping.IncludeInAll = false
md.AddFieldMappingsAt("Description", nostoreFieldMapping)
md.AddFieldMappingsAt("ISBN", nostoreFieldMapping)
// Mapping with the "keyword" analyzer, also excluded from "all"; used for
// "isbn" and "_unique", which are looked up with term queries.
keywordFieldMapping := bleve.NewTextFieldMapping()
keywordFieldMapping.Store = false
keywordFieldMapping.Analyzer = "keyword"
keywordFieldMapping.IncludeInAll = false
// Disable indexing for these sub-documents altogether.
for _, ignore := range []string{"Sources", "Date", "Publisher", "Format", "Keywords", "Language"} {
md.AddSubDocumentMapping(ignore, bleve.NewDocumentDisabledMapping())
}
// Mappings for the lower-case field names used by flatBook's JSON tags.
md.AddFieldMappingsAt("title", textFieldMapping)
md.AddFieldMappingsAt("author", authorFieldMapping)
md.AddFieldMappingsAt("description", textFieldMapping)
md.AddFieldMappingsAt("isbn", keywordFieldMapping)
md.AddFieldMappingsAt("_unique", keywordFieldMapping)
return md
}
......@@ -80,34 +124,11 @@ func metadataDocumentMapping() *bleve.DocumentMapping {
// defaultIndexMapping returns the index mapping used for the book index: it
// registers metadataDocumentMapping under the "ebook" type, makes "ebook"
// the default document type, and uses defaultTextAnalyzer as the default
// analyzer.
func defaultIndexMapping() *bleve.IndexMapping {
	i := bleve.NewIndexMapping()
	i.AddDocumentMapping("ebook", metadataDocumentMapping())
	// The former `DefaultAnalyzer = "en"` assignment was a dead store: it was
	// overwritten by this line before anything could read it.
	i.DefaultAnalyzer = defaultTextAnalyzer
	i.DefaultType = "ebook"
	return i
}
// Book is a library entry: an identifier, a cover path and the book's
// metadata.
type Book struct {
// Id identifies the book; PutBook uses it both as the database key
// (Id.Key()) and as the search-index document id (Id.String()).
Id BookId
// Path string
// CoverPath is the path of the cover image (presumably a filesystem
// path — TODO confirm).
CoverPath string
// Metadata holds the descriptive data of the book.
Metadata *Metadata
}
// Type returns the document type name of a book, "ebook", which is the name
// defaultIndexMapping registers the document mapping under.
func (book *Book) Type() string {
	return "ebook"
}
// File records what is known about a single file of the library.
type File struct {
// Path is the location of the file.
Path string
// FileType names the file's format (exact set of values not visible
// here — TODO confirm).
FileType string
// Mtime and Size are the recorded modification time and size;
// HasChanged compares them against a fresh os.FileInfo.
Mtime time.Time
Size int64
// Error is a failure flag; NOTE(review): its precise meaning is not
// visible in this chunk — confirm against the code that sets it.
Error bool
// Id is a BookId, presumably that of the book this file belongs to —
// TODO confirm.
Id BookId
}
// HasChanged reports whether info no longer matches the recorded state of
// the file, i.e. whether its modification time or its size differ from the
// stored Mtime and Size.
func (f *File) HasChanged(info os.FileInfo) bool {
	if sameMtime := info.ModTime().Equal(f.Mtime); !sameMtime {
		return true
	}
	return info.Size() != f.Size
}
type Database struct {
leveldb *levigo.DB
leveldbCache *levigo.Cache
......@@ -219,7 +240,7 @@ func (db *Database) PutBook(b *Book) error {
if err := db.Put(BookBucket, b.Id.Key(), b); err != nil {
return err
}
return db.index.Index(b.Id.String(), b.Metadata)
return db.index.Index(b.Id.String(), flatten(b))
}
func fileBookKey(path string, bookid BookId) []byte {
......@@ -356,35 +377,17 @@ func (db *Database) Autocomplete(term string) (*SearchResult, error) {
}
// Find a book matching the given metadata, if possible.
func (db *Database) Find(m *Metadata) (*Book, error) {
func (db *Database) Find(uniqueIds []string) (*Book, error) {
var queries []bleve.Query
var query bleve.Query
if len(m.ISBN) > 0 {
var queries []bleve.Query
for _, isbn := range m.ISBN {
q := bleve.NewTermQuery(isbn)
q.SetField("ISBN")
queries = append(queries, q)
}
for _, u := range uniqueIds {
queries = append(queries, bleve.NewTermQuery(u).SetField("_unique"))
}
if len(queries) > 0 {
query = bleve.NewDisjunctionQuery(queries)
} else {
var queries []bleve.Query
if m.Title != "" {
q := bleve.NewMatchQuery(m.Title)
q.SetField("Title")
queries = append(queries, q)
}
if len(m.Creator) > 0 {
for _, a := range m.Creator {
q := bleve.NewMatchQuery(a)
q.SetField("Creator")
queries = append(queries, q)
}
}
if len(queries) == 0 {
return nil, errors.New("insufficient metadata for query")
}
query = bleve.NewConjunctionQuery(queries)
query = queries[0]
}
search := bleve.NewSearchRequest(query)
......@@ -392,16 +395,11 @@ func (db *Database) Find(m *Metadata) (*Book, error) {
if err != nil {
return nil, err
}
for _, r := range result.Hits {
book, err := db.GetBook(ParseID(r.ID))
if err != nil {
continue
}
if book.Metadata.Equals(m) {
return book, nil
}
if len(result.Hits) == 0 {
return nil, errors.New("no matches found")
}
return nil, errors.New("no matches found")
return db.GetBook(ParseID(result.Hits[0].ID))
}
func bktToKey(bucket, key []byte) []byte {
......
......@@ -5,6 +5,8 @@ import (
"io/ioutil"
"os"
"testing"
"github.com/blevesearch/bleve"
)
type testDatabase struct {
......@@ -26,16 +28,25 @@ func newTestDatabase(t *testing.T) (*testDatabase, *Database) {
}
book := testEbook()
if err = db.PutBook(book); err != nil {
if err := db.PutBook(book); err != nil {
t.Fatalf("PutBook(): %v", err)
}
if err = db.PutFile(testEpubFile(path, book.Id)); err != nil {
if err := db.PutFile(testEpubFile(path, book.Id)); err != nil {
t.Fatalf("PutFile(): %v", err)
}
return &testDatabase{db: db, path: path, refbookid: book.Id}, db
}
// newTestDatabase2 returns the fixture built by newTestDatabase with the
// second test book (testEbook2) stored in it as well. The test is aborted if
// the extra book cannot be stored.
func newTestDatabase2(t *testing.T) (*testDatabase, *Database) {
	td, db := newTestDatabase(t)
	if err := db.PutBook(testEbook2()); err != nil {
		t.Fatalf("PutBook(): %v", err)
	}
	return td, db
}
func testEpubFile(dir string, bookid BookId) *File {
f, _ := ioutil.TempFile(dir, "ebook-")
io.WriteString(f, "epub\n")
......@@ -52,14 +63,26 @@ func testEbook() *Book {
return &Book{
Id: NewID(),
Metadata: &Metadata{
Title: "20,000 Leagues under the sea",
Title: "Twenty Thousand Leagues Under The Sea",
Creator: []string{"Jules Verne"},
ISBN: []string{"1234"},
ISBN: []string{"1234", "2345"},
Description: "A pretty cool book.",
},
}
}
// testEbook2 returns a second test book with a fresh id. It has the same
// creator as the book returned by testEbook but a different title, ISBN and
// description.
func testEbook2() *Book {
	meta := &Metadata{
		Title:       "Around The World In Eighty Days",
		Creator:     []string{"Jules Verne"},
		ISBN:        []string{"5678"},
		Description: "It's about balloons.",
	}
	return &Book{Id: NewID(), Metadata: meta}
}
func TestDatabase_Get(t *testing.T) {
td, db := newTestDatabase(t)
defer td.Close()
......@@ -109,50 +132,100 @@ func TestDatabase_BookFileRelation(t *testing.T) {
}
}
func TestDatabase_Search(t *testing.T) {
td, db := newTestDatabase(t)
func TestDatabase_BleveLowLevelSearch(t *testing.T) {
td, db := newTestDatabase2(t)
defer td.Close()
r, err := db.Search("jules verne", 0, 100)
if err != nil {
t.Fatal(err)
}
if r.NumResults != 1 {
t.Fatalf("NumResults != 1 - %#v", r)
}
if len(r.Results) != 1 {
t.Fatalf("len(Results) != 1 - %#v", r)
doSearch := func(tag string, query bleve.Query, numExpectedResults int) {
req := bleve.NewSearchRequestOptions(query, 100, 0, false)
result, err := db.index.Search(req)
if err != nil {
t.Fatalf("%s: %v", tag, err)
}
if int(result.Total) != numExpectedResults {
t.Errorf("%s: got %d results, expected %d", tag, result.Total, numExpectedResults)
}
}
r, err = db.Search("italo calvino", 0, 100)
if err != nil {
t.Fatal(err)
}
if r.NumResults > 0 {
t.Fatalf("unexpected results: %v", r)
doSearch("match_query",
bleve.NewMatchQuery("Leagues Under The Sea"),
1)
doSearch("match_query_with_field",
bleve.NewMatchQuery("Leagues Under The Sea").SetField("title"),
1)
doSearch("match_query_with_field_2",
bleve.NewMatchQuery("Jules Verne").SetField("author"),
2)
//doSearch("match_query_with_field_2_AND",
// bleve.NewMatchQuery("Hugo Verne").SetField("author"),
// 0)
doSearch("match_query_with_wrong_field",
bleve.NewMatchQuery("Leagues Under The Sea").SetField("author"),
0)
doSearch("query_string_precise",
bleve.NewQueryStringQuery("+title:Leagues +author:verne"),
1)
doSearch("isbn_term_query",
bleve.NewTermQuery("1234").SetField("isbn"),
1)
doSearch("isbn_term_query_2",
bleve.NewTermQuery("2345").SetField("isbn"),
1)
}
// TestDatabase_Search runs a series of query strings through
// Database.Search against the two-book fixture and checks the number of
// results reported for each.
func TestDatabase_Search(t *testing.T) {
	td, db := newTestDatabase2(t)
	defer td.Close()

	cases := []struct {
		query string
		want  int
	}{
		{"jules verne", 2},
		{"italo calvino", 0},
		{"Twenty Thousand Leagues", 1},
		{"\"Twenty Thousand Leagues\"", 1},
		{"title:\"Twenty Thousand Leagues\"", 1},
		{"author:\"Jules Verne\"", 2},
		{"author:verne", 2},
		{"+title:Leagues +author:Verne", 1},
		{"title:verne", 0},
		{"author:vernes", 0},
	}

	for _, tc := range cases {
		r, err := db.Search(tc.query, 0, 100)
		if err != nil {
			// A failing search aborts the whole test, as before.
			t.Fatalf("Search(%s) failed: %v", tc.query, err)
		}
		if r.NumResults != tc.want {
			t.Errorf("Search(%s): got %d results, expecting %d\n%#v", tc.query, r.NumResults, tc.want, r.Results)
		}
	}
}
func TestDatabase_Find(t *testing.T) {
td, db := newTestDatabase(t)
td, db := newTestDatabase2(t)
defer td.Close()
book, _ := db.GetBook(td.refbookid)
m := book.Metadata
// Find using ISBN.
book := testEbook()
if _, err := db.Find(book.Metadata); err != nil {
t.Fatal("With ISBN: ", err)
if result, err := db.Find(m.Uniques()); err != nil {
t.Errorf("With ISBN: %v", err)
} else if result.Id != book.Id {
t.Errorf("Bad match with ISBN: got=%d, expected=%d", result.Id, book.Id)
}
// Find using title/author.
book.Metadata.ISBN = nil
if _, err := db.Find(book.Metadata); err != nil {
t.Fatal("With title/author: ", err)
m.ISBN = nil
if result, err := db.Find(m.Uniques()); err != nil {
t.Errorf("With title/author: %v", err)
} else if result.Id != book.Id {
t.Errorf("Bad match with title/author: got=%d, expected=%d", result.Id, book.Id)
}
// Find only using title (should fail).
book.Metadata.Creator = nil
if result, err := db.Find(book.Metadata); err == nil {
t.Fatalf("Title only: no error, result = %v", result)
m.Creator = nil
if result, err := db.Find(m.Uniques()); err == nil {
t.Errorf("Title only: no error, result = %v", result)
}
}
......@@ -160,7 +233,7 @@ func TestDatabase_Autocomplete(t *testing.T) {
td, db := newTestDatabase(t)
defer td.Close()
r, err := db.Autocomplete("jul")
r, err := db.Autocomplete("jules")
if err != nil {
t.Fatal(err)
}
......
......@@ -14,7 +14,7 @@
</h1>
<p class="book-authors">
{{range $i, $a := .Book.Metadata.Creator}}{{if gt $i 0}}, {{end}}<a href="/search?q=Author%3A{{$a}}"><b>{{$a}}</b></a>{{end}}
{{range $i, $a := .Book.Metadata.Creator}}{{if gt $i 0}}, {{end}}<a href="/search?q=author%3A%22{{$a}}%22"><b>{{$a}}</b></a>{{end}}
</p>
<p>
......
......@@ -46,30 +46,6 @@ func (m *Metadata) Uniques() []string {
return out
}
// parseUniqueId decodes a unique id of the form "<type>:<value>" into a
// template Metadata object.
//
// Supported types:
//   - "isbn": the value becomes the only ISBN.
//   - "sig":  the value is split on "|"; the first part is the title and the
//     remaining parts are the creators (left unset when the first creator is
//     empty). At least one "|" is required.
//
// An error is returned when there is no ":" separator, when a "sig" value
// has no "|", or when the type is unknown.
func parseUniqueId(u string) (*Metadata, error) {
	sep := strings.Index(u, ":")
	if sep < 0 {
		return nil, errors.New("unparseable unique id")
	}
	kind, value := u[:sep], u[sep+1:]

	if kind == "isbn" {
		return &Metadata{ISBN: []string{value}}, nil
	}
	if kind != "sig" {
		return nil, errors.New("unknown unique id type")
	}

	fields := strings.Split(value, "|")
	if len(fields) < 2 {
		return nil, errors.New("unparseable 'sig' unique id")
	}
	md := &Metadata{Title: fields[0]}
	if fields[1] != "" {
		md.Creator = fields[1:]
	}
	return md, nil
}
func listsOverlap(a, b []string) bool {
for _, aa := range a {
for _, bb := range b {
......
......@@ -235,21 +235,9 @@ func (l *syncServer) handleDiffRequest(w http.ResponseWriter, req *http.Request)
var resp diffResponse
for _, c := range diffreq.Candidates {
// For every unique ID, decode it into a template
// Metadata object and see if we can find a match in
// the database.
found := false
for _, unique := range c.Unique {
meta, err := parseUniqueId(unique)
if err != nil {
continue
}
if _, err := l.db.Find(meta); err == nil {
found = true
break
}
}
if !found {
// Search for matches in the database by looking for
// unique ids.
if _, err := l.db.Find(c.Unique); err != nil {
resp.Missing = append(resp.Missing, c.Id)
}
}
......@@ -274,7 +262,7 @@ func (l *syncServer) handleSyncUpload(w http.ResponseWriter, req *http.Request)
}
// Check again that we don't have this book.
if _, err := l.db.Find(&md); err == nil {
if _, err := l.db.Find(md.Uniques()); err == nil {
log.Printf("attempt to upload duplicate: %#v", &md)
http.Error(w, "Duplicate", http.StatusConflict)
return
......
......@@ -59,7 +59,7 @@ func TestSync_Sync(t *testing.T) {
metatmpl := &Metadata{
ISBN: []string{strconv.Itoa(i + 1)},
}
if _, err := db2.Find(metatmpl); err != nil {
if _, err := db2.Find(metatmpl.Uniques()); err != nil {
t.Errorf("Book %d missing from db2: %v", i+1, err)
}
}
......
......@@ -203,7 +203,7 @@ func dbwriter(db *Database, ch chan fileAndBook) {
// existing book.
if pair.f.id == 0 {
log.Printf("potential new book: %#v", pair.b.Metadata)
if match, err := db.Find(pair.b.Metadata); err == nil {
if match, err := db.Find(pair.b.Metadata.Uniques()); err == nil {
log.Printf("%s matches existing book %d", pair.f.path, match.Id)
// Ignore new metadata.
pair.b = match
......
package liber
import (
"io/ioutil"
"os"
"path/filepath"
"testing"
"git.autistici.org/ale/liber/util"
)
// createTestFs materialises fs in a fresh temporary directory and returns
// that directory's path. Keys of fs are file paths relative to the new
// directory (parent directories are created as needed) and values are the
// file contents. The caller is responsible for removing the directory.
//
// Since this helper has no *testing.T to report to, any filesystem error
// causes a panic instead of being silently ignored; the partially created
// tree is removed first.
func createTestFs(fs map[string]string) string {
	base, err := ioutil.TempDir("", "test-fs-")
	if err != nil {
		panic(err)
	}
	for path, contents := range fs {
		path = filepath.Join(base, path)
		err := os.MkdirAll(filepath.Dir(path), 0700)
		if err == nil {
			err = ioutil.WriteFile(path, []byte(contents), 0700)
		}
		if err != nil {
			// Do not leave a half-built fixture behind.
			os.RemoveAll(base)
			panic(err)
		}
	}
	return base
}
// TestDatabase_Update runs Database.Update twice over a small on-disk
// fixture and checks after each run that the book added by newTestDatabase
// is gone and that the book described by the fixture's OPF metadata can be
// found by ISBN.
func TestDatabase_Update(t *testing.T) {
	// Lower the walker's default minimum size for this test (the fixture
	// files are only a few bytes long) and restore it afterwards so the
	// override does not leak into other tests.
	prevMinSize := util.WalkerDefaultMinSize
	util.WalkerDefaultMinSize = 0
	defer func() { util.WalkerDefaultMinSize = prevMinSize }()

	td, db := newTestDatabase(t)
	defer td.Close()

	// Make the test book a pdf so we don't attempt to parse it.
	tmpdir := createTestFs(map[string]string{
		"book/Test Ebook.pdf": "foo",
		"book/metadata.opf":   testOpf,
		"book/cover.jpg":      "jpeg",
	})
	defer os.RemoveAll(tmpdir)

	// NOTE(review): chooserCalled is recorded but never asserted on —
	// confirm whether the test should check that the chooser is (not) used.
	chooserCalled := false
	chooser := func(path string, choices []*Metadata) *Metadata {
		chooserCalled = true
		return nil
	}

	testDb := func(tag string) {
		// The test ebook added in newTestDatabase should not be there
		// any more.
		if _, err := db.GetBook(td.refbookid); err == nil {
			t.Errorf("%s: test book still in database", tag)
		}
		// Test OPF ebook should have been found by Update.
		if result, err := db.Search("isbn:9781939293015", 0, 1); err != nil || result.NumResults != 1 {
			t.Errorf("%s: new book not found in database", tag)
		}
	}

	// The second run checks that repeating Update leaves the same state.
	db.Update(tmpdir, chooser)
	testDb("first update")
	db.Update(tmpdir, chooser)
	testDb("second update")
}
......@@ -5,6 +5,8 @@ import (
"path/filepath"
)
var WalkerDefaultMinSize int64 = 4096
type Walker struct {
Exclude, Include []string
MinSize int64
......@@ -67,6 +69,6 @@ func (w *Walker) Walk(root string, walkFn filepath.WalkFunc) error {
// NewDefaultWalker returns a Walker that includes "*.epub", "*.mobi" and
// "*.pdf" files and whose MinSize is the current value of
// WalkerDefaultMinSize. The variable is read on every call, so overriding it
// (as tests do) affects walkers created afterwards.
func NewDefaultWalker() *Walker {
	return &Walker{
		Include: []string{"*.epub", "*.mobi", "*.pdf"},
		// Only one MinSize key may appear in the literal; the hard-coded
		// 65535 is superseded by the configurable default.
		MinSize: WalkerDefaultMinSize,
	}
}
......@@ -70,7 +70,7 @@ func TestWeb_Search(t *testing.T) {
t.Fatalf("Bad HTTP response: %s\n%s", resp.Status, readTestResponseData(resp, t))
}
data := readTestResponseData(resp, t)
if !strings.Contains(data, "20,000 Leagues") {
if !strings.Contains(data, "Twenty Thousand Leagues") {
t.Fatalf("Response does not contain book title:\n%s", data)
}
}
......@@ -88,7 +88,7 @@ func TestWeb_ShowBook(t *testing.T) {
t.Fatalf("Bad HTTP response: %s\n%s", resp.Status, readTestResponseData(resp, t))
}
data := readTestResponseData(resp, t)
if !strings.Contains(data, "20,000 Leagues") {
if !strings.Contains(data, "Twenty Thousand Leagues") {
t.Fatalf("Response does not contain book title:\n%s", data)
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment