Commit a5e06fb5 authored by ale

Autodetect ISBN numbers in the text if not found in metadata

parent 825467fc
package liber
import (
"io"
"io/ioutil"
"regexp"
"strconv"
"strings"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/meskio/epubgo"
)
// isbnRx matches candidate ISBN-10 / ISBN-13 numbers in free text.
//
// The pattern consists of:
//   - an optional "ISBN", "ISBN-10" or "ISBN-13" label, optionally
//     followed by whitespace and a colon;
//   - capture group 1: either 9 or 12 digits, each optionally followed
//     by a hyphen, and then a final digit or 'x'/'X' check character;
//   - a trailing character that is neither a hyphen nor a digit, or
//     the end of the input. This trailing character is part of the
//     overall match but not of capture group 1.
//
// The regexp only selects candidates: it does not verify the check digit.
var isbnRx = regexp.MustCompile(`(?:ISBN(?:-10|-13)?\s*:?\s*)?((?:(?:[\d]-?){9}|(?:[\d]-?){12})[\dxX])(?:[^-\d]|$)`)
// findISBNInEpub scans the pages listed in the spine of an EPUB and
// returns every valid ISBN found in their contents, in the order in
// which they were encountered. It returns nil if the spine cannot be
// obtained. Pages that cannot be opened are silently skipped, since
// the search is best-effort.
func findISBNInEpub(epub *epubgo.Epub) []string {
	spine, err := epub.Spine()
	if err != nil {
		return nil
	}

	var isbn []string
	for {
		// Per the epubgo documentation, the iterator returned by
		// Spine() is already positioned on the first page, so the
		// current page has to be scanned before advancing with
		// Next(); calling Next() first would skip the first page.
		if r, err := spine.Open(); err == nil {
			isbn = append(isbn, findISBNInPage(r)...)
			r.Close()
		}
		// Next() returns an error once there are no more pages.
		if spine.Next() != nil {
			break
		}
	}
	return isbn
}
// findISBNInPage reads the whole content of r and returns the ISBN
// candidates matched by isbnRx that also pass validateIsbn, in order
// of appearance. The returned strings are the matched text as-is
// (hyphens included). It returns nil on read error or when nothing
// valid is found.
func findISBNInPage(r io.Reader) []string {
	content, err := ioutil.ReadAll(r)
	if err != nil {
		return nil
	}

	var found []string
	matches := isbnRx.FindAllSubmatch(content, -1)
	for idx := range matches {
		groups := matches[idx]
		// Group 1 holds the bare number, without label or trailer.
		if len(groups) <= 1 {
			continue
		}
		candidate := string(groups[1])
		if !validateIsbn(candidate) {
			continue
		}
		found = append(found, candidate)
	}
	return found
}
// validateIsbn10 reports whether isbn is a well-formed ISBN-10 with a
// correct check digit.
//
// Hyphens are ignored wherever they appear. Once hyphens are removed
// the input must consist of exactly ten symbols: nine decimal digits
// followed by a check character, which is either a decimal digit or
// 'X' / 'x' (standing for the value 10). Anything else, including the
// empty string, is rejected.
//
// The checksum is the sum of every symbol multiplied by a weight that
// goes from 10 (first symbol) down to 1 (check character); the number
// is valid when that sum is a multiple of 11.
func validateIsbn10(isbn string) bool {
	var sum int
	// pos counts the symbols seen so far, hyphens excluded, so that
	// positional rules do not depend on where the hyphens are.
	var pos int
	for _, v := range isbn {
		if v == '-' {
			continue
		}
		if pos >= 10 {
			// Too many symbols for an ISBN-10.
			return false
		}

		var digit int
		if pos == 9 && (v == 'X' || v == 'x') {
			// 'X' is only meaningful as the check character.
			digit = 10
		} else {
			d, err := strconv.Atoi(string(v))
			if err != nil {
				return false
			}
			digit = d
		}

		sum += (10 - pos) * digit
		pos++
	}
	return pos == 10 && sum%11 == 0
}
// validateIsbn13 reports whether isbn is a well-formed ISBN-13 with a
// correct check digit.
//
// Hyphens are ignored wherever they appear. Once hyphens are removed
// the input must consist of exactly thirteen decimal digits; anything
// else, including the empty string, is rejected.
//
// The checksum is the sum of the digits, weighted alternately by 1 and
// 3 starting from the first digit; the number is valid when that sum
// is a multiple of 10.
func validateIsbn13(isbn string) bool {
	var sum int
	// pos counts the digits seen so far, hyphens excluded, so that the
	// alternating weights do not depend on where the hyphens are.
	var pos int
	for _, v := range isbn {
		if v == '-' {
			continue
		}
		if pos >= 13 {
			// Too many digits for an ISBN-13.
			return false
		}

		digit, err := strconv.Atoi(string(v))
		if err != nil {
			return false
		}

		if pos%2 == 0 {
			sum += digit
		} else {
			sum += 3 * digit
		}
		pos++
	}
	return pos == 13 && sum%10 == 0
}
// validateIsbn reports whether isbn is a valid ISBN-10 or ISBN-13.
// Hyphens are stripped first; the length of what remains selects the
// checksum algorithm (10 -> validateIsbn10, 13 -> validateIsbn13).
// Any other length is rejected.
func validateIsbn(isbn string) bool {
	stripped := strings.Replace(isbn, "-", "", -1)
	if len(stripped) == 10 {
		return validateIsbn10(stripped)
	}
	if len(stripped) == 13 {
		return validateIsbn13(stripped)
	}
	return false
}
package liber
import (
"reflect"
"strings"
"testing"
)
// isbnTestData is the table driving TestISBN_Detect. Each entry pairs
// an input text with the list of ISBNs that findISBNInPage is expected
// to return for it; a nil expectation means no ISBN should be reported.
var isbnTestData = []struct {
// input is the text handed to findISBNInPage.
input string
// expected is the exact result the detection should produce.
expected []string
}{
// Inputs that should not yield any ISBN: plain text, and a valid
// ISBN embedded in a longer run of digits.
{"not-an-isbn", nil},
{"123456780-330-28498-312345678", nil},
// Valid ISBN numbers.
{"ISBN: 0-330-28498-3", []string{"0-330-28498-3"}},
{"ISBN : 1-58182-008-9", []string{"1-58182-008-9"}},
{"ISBN-10: 2-226-05257-7", []string{"2-226-05257-7"}},
{"ISBN 3-7965-1900-8", []string{"3-7965-1900-8"}},
{"ISBN 4-19-830127-1", []string{"4-19-830127-1"}},
{"ISBN 5-85270-001-0", []string{"5-85270-001-0"}},
{"ISBN 978-600-119-125-1", []string{"978-600-119-125-1"}},
{"978-601-7151-13-3", []string{"978-601-7151-13-3"}},
{"ISBN-13: 978-602-8328-22-7", []string{"978-602-8328-22-7"}},
{"ISBN 978-603-500-045-1", []string{"978-603-500-045-1"}},
{"ISBN 605-384-057-2", []string{"605-384-057-2"}},
{"ISBN 978-606-8126-35-7", []string{"978-606-8126-35-7"}},
{"ISBN 978-607-455-035-1", []string{"978-607-455-035-1"}},
{"ISBN: 978-608-203-023-4", []string{"978-608-203-023-4"}},
{"ISBN 978-612-45165-9-7", []string{"978-612-45165-9-7"}},
{"ISBN 978-614-404-018-8", []string{"978-614-404-018-8"}},
{"ISBN 978-615-5014-99-4", []string{"978-615-5014-99-4"}},
{"ISBN 7-301-10299-2", []string{"7-301-10299-2"}},
{"ISBN 80-85983-44-3", []string{"80-85983-44-3"}},
{"81-7215-399-6", []string{"81-7215-399-6"}},
{"82-530-0983-6", []string{"82-530-0983-6"}},
{"eISBN : 83-08-01587-5", []string{"83-08-01587-5"}},
{"eISBN : 84-86546-08-7", []string{"84-86546-08-7"}},
{"ISBN 85-7531-015-1", []string{"85-7531-015-1"}},
{"ISBN 86-341-0846-5", []string{"86-341-0846-5"}},
{"ISBN 87-595-2277-1", []string{"87-595-2277-1"}},
{"88-04-47328-2", []string{"88-04-47328-2"}},
{"ISBN 90-5691-187-2", []string{"90-5691-187-2"}},
{"ISBN 91-1-811692-2", []string{"91-1-811692-2"}},
{"ISBN 92-67-10370-9", []string{"92-67-10370-9"}},
{"ISBN 93-5025-214-7", []string{"93-5025-214-7"}},
{"ISBN 950-04-0442-7", []string{"950-04-0442-7"}},
{"ISBN 951-0-11369-7", []string{"951-0-11369-7"}},
{"ISBN 952-471-294-6", []string{"952-471-294-6"}},
{"ISBN 953-157-105-8", []string{"953-157-105-8"}},
{"ISBN 954-430-603-X", []string{"954-430-603-X"}},
{"ISBN 955-20-3051-X", []string{"955-20-3051-X"}},
{"ISBN 956-7291-48-9", []string{"956-7291-48-9"}},
{"ISBN 957-01-7429-3", []string{"957-01-7429-3"}},
{"ISBN 958-04-6278-X", []string{"958-04-6278-X"}},
{"ISBN 959-10-0363-3", []string{"959-10-0363-3"}},
{"ISBN 961-6403-23-0", []string{"961-6403-23-0"}},
{"ISBN 962-04-0195-6", []string{"962-04-0195-6"}},
{"ISBN 978-988-00-3827-3", []string{"978-988-00-3827-3"}},
{"ISBN: 978-9928400529", []string{"978-9928400529"}},
{"ISBN: 978-9929801646", []string{"978-9929801646"}},
{"ISBN: 978-9930943106", []string{"978-9930943106"}},
// Multiple results.
{"ISBN: 0-330-28498-3 and ISBN : 1-58182-008-9",
[]string{"0-330-28498-3", "1-58182-008-9"}},
// Invalid ISBN numbers (wrong check digit).
{"ISBN 961-6403-23-1", nil},
{"ISBN 962-04-0195-1", nil},
{"ISBN: 978-9928400521", nil},
{"ISBN: 978-9929801641", nil},
{"ISBN: 978-993094310X", nil},
}
// TestISBN_Detect feeds every input of isbnTestData to findISBNInPage
// and reports each case whose result differs from the expected list.
func TestISBN_Detect(t *testing.T) {
	for idx := range isbnTestData {
		tc := isbnTestData[idx]
		got := findISBNInPage(strings.NewReader(tc.input))
		if reflect.DeepEqual(got, tc.expected) {
			continue
		}
		t.Errorf("input: %s, got: %v, expected: %v", tc.input, got, tc.expected)
	}
}
......@@ -3,6 +3,7 @@ package liber
import (
"errors"
"fmt"
"log"
"path/filepath"
"regexp"
"strings"
......@@ -192,6 +193,14 @@ func parseEpub(filename string) (*Metadata, error) {
}
}
// If we haven't found an ISBN, look for it in the book text.
if len(m.ISBN) == 0 {
if isbn := findISBNInEpub(e); len(isbn) > 0 {
log.Printf("found ISBN in book text: %s", strings.Join(isbn, ", "))
m.ISBN = isbn
}
}
m.Sources = []MetadataSource{{
Name: "epub",
ID: filename,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment