Newer
Older
package liber
import (
	"crypto/md5"
	"encoding/xml"
	"errors"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"
	"time"
)
// googleAPIKey points at the value of the "google-api-key" command-line
// flag. It defaults to the empty string, meaning that no API key has been
// configured.
var googleAPIKey = flag.String(
	"google-api-key",
	"",
	"Google API key",
)
// googleBooksRefiner is a stateless type; its methods talk to Google Books.
type googleBooksRefiner struct{}

// Name returns the short identifier of this refiner, "gbooks".
func (r *googleBooksRefiner) Name() string {
	const refinerName = "gbooks"
	return refinerName
}
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// atomResultEntry is the decoding target for one <entry> element of the
// Atom feed parsed by googleBooksParse. Apart from the entry id, every field
// is read from an element in the http://purl.org/dc/terms namespace, as
// spelled out in the struct tags.
type atomResultEntry struct {
// Text of the entry's <id> element; ToMetadata records it as the ID of
// the "gbooks" source.
GoogleId string `xml:"id"`
// Single-valued title element.
Title string `xml:"http://purl.org/dc/terms title"`
// Raw date text; ToMetadata passes it through toYear.
Date string `xml:"http://purl.org/dc/terms date"`
// The remaining elements may repeat: each occurrence becomes one slice item.
Creator []string `xml:"http://purl.org/dc/terms creator"`
// Only the first description is kept by ToMetadata.
Description []string `xml:"http://purl.org/dc/terms description"`
Language []string `xml:"http://purl.org/dc/terms language"`
// ToMetadata keeps only the identifiers that start with "ISBN:".
Identifier []string `xml:"http://purl.org/dc/terms identifier"`
Publisher []string `xml:"http://purl.org/dc/terms publisher"`
Format []string `xml:"http://purl.org/dc/terms format"`
}
// ToMetadata converts a decoded Atom entry into a freshly allocated
// Metadata value.
//
// Title, creators, languages, publishers and formats are copied across; the
// date goes through toYear; only the first description (if any) is kept; and
// every identifier carrying an "ISBN:" prefix is added, prefix removed, to
// the ISBN list. The result carries a single "gbooks" source whose ID is the
// entry's Google id.
func (e *atomResultEntry) ToMetadata() *Metadata {
	const isbnPrefix = "ISBN:"

	md := new(Metadata)
	md.Title = e.Title
	md.Date = toYear(e.Date)
	md.Creator = e.Creator
	md.Language = e.Language
	md.Publisher = e.Publisher
	md.Format = e.Format
	md.Sources = []MetadataSource{{Name: "gbooks", ID: e.GoogleId}}

	// The feed may carry several descriptions; only the first one is used.
	if len(e.Description) != 0 {
		md.Description = e.Description[0]
	}

	// Identifiers of other kinds are skipped.
	for _, ident := range e.Identifier {
		if !strings.HasPrefix(ident, isbnPrefix) {
			continue
		}
		md.ISBN = append(md.ISBN, ident[len(isbnPrefix):])
	}

	return md
}
// atomResult is the decoding target for the top-level <feed> element of an
// Atom document (namespace http://www.w3.org/2005/Atom).
type atomResult struct {
// XMLName pins the expected name and namespace of the root element.
XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
// Entries collects the <entry> children of the feed, one item each.
Entries []*atomResultEntry `xml:"entry"`
}
// googleBooksQuery builds an appropriate query for Google Books, trying to
// be as specific as possible based on the provided metadata. Strategies, in
// priority order:
//
//  1. ISBNs present: one "isbn:<value>" term per ISBN, joined with spaces;
//     nothing else is considered.
//  2. Title and/or creators present: an `intitle:"<title>"` term plus one
//     "inauthor:<word>" term for every whitespace-separated word of every
//     creator name, joined with "+".
//  3. Otherwise: the free-form keywords, joined with spaces.
//
// The empty string is returned when none of the strategies yields a term.
func googleBooksQuery(m *Metadata) string {
	// If the ISBN is specified, query it directly.
	if len(m.ISBN) > 0 {
		terms := make([]string, 0, len(m.ISBN))
		for _, isbn := range m.ISBN {
			terms = append(terms, fmt.Sprintf("isbn:%s", isbn))
		}
		return strings.Join(terms, " ")
	}

	// If we have explicitly tagged metadata, use it.
	var terms []string
	if len(m.Title) > 0 {
		terms = append(terms, fmt.Sprintf("intitle:\"%s\"", m.Title))
	}
	for _, author := range m.Creator {
		// Each word of the author's name becomes its own search term.
		for _, word := range strings.Fields(author) {
			terms = append(terms, fmt.Sprintf("inauthor:%s", word))
		}
	}
	if len(terms) > 0 {
		return strings.Join(terms, "+")
	}

	// Try to make a generic query using keywords.
	if len(m.Keywords) > 0 {
		return strings.Join(m.Keywords, " ")
	}
	return ""
}
// googleBooksParse reads an Atom feed from in and returns one Metadata per
// feed entry, in document order. A decoding failure is returned unchanged
// together with a nil slice; a feed without entries yields a nil slice and a
// nil error.
func googleBooksParse(in io.Reader) ([]*Metadata, error) {
	feed := new(atomResult)
	decoder := xml.NewDecoder(in)
	if err := decoder.Decode(feed); err != nil {
		return nil, err
	}

	var books []*Metadata
	for i := range feed.Entries {
		books = append(books, feed.Entries[i].ToMetadata())
	}
	return books, nil
}
// gbSemaphore limits the number of concurrent queries to Google. A slot is
// taken by sending on the channel and given back by receiving from it, so at
// most cap(gbSemaphore) calls to googleBooksGet are in flight at once.
var gbSemaphore = make(chan bool, 3)

// googleBooksGet makes an HTTP GET query to Google Books, with exponential
// backoff in case of rate limiting.
//
// A 200 response is returned as-is and the caller must close its body. A 403
// is treated as rate limiting: the request is retried after a pause that
// grows by half on every retry, up to maxBackoff. Any other status, or a
// transport error, aborts immediately with an error. When no attempt has
// succeeded by the overall deadline, a "deadline exceeded" error is returned.
//
// The semaphore slot is held for the whole call, pauses included.
//
// NOTE(review): http.Get carries no per-request timeout, so a stalled
// connection can outlive the deadline while still holding a slot.
func googleBooksGet(uri string) (*http.Response, error) {
	gbSemaphore <- true
	defer func() {
		<-gbSemaphore
	}()

	// Set an overall deadline of 15 minutes (900 seconds) across attempts.
	timeout := 900 * time.Second
	// Pause before the first retry; grows by 50% after every 403.
	// NOTE(review): the initial value is an assumption, confirm it.
	backoff := 500 * time.Millisecond
	maxBackoff := 60 * time.Second
	deadline := time.Now().Add(timeout)

	for time.Now().Before(deadline) {
		resp, err := http.Get(uri)
		if err != nil {
			return nil, err
		}

		switch resp.StatusCode {
		case 403:
			// We've been ratelimited. Back off.
			resp.Body.Close()
			time.Sleep(backoff)
			backoff += backoff / 2
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
			continue
		case 200:
			return resp, nil
		default:
			resp.Body.Close()
			return nil, fmt.Errorf("HTTP status %s", resp.Status)
		}
	}
	return nil, errors.New("deadline exceeded")
}
// Lookup searches Google Books for volumes matching m and returns the
// metadata decoded from the Atom response.
//
// The search string is derived from m by googleBooksQuery; when that yields
// nothing, an error is returned without contacting Google. The string is
// sent as the "q" request parameter and, when the google-api-key flag is
// non-empty, its value is sent as the "key" parameter. Errors from the HTTP
// request are logged and returned; decoding errors are returned as-is.
func (r *googleBooksRefiner) Lookup(m *Metadata) ([]*Metadata, error) {
	qstr := googleBooksQuery(m)
	if qstr == "" {
		return nil, errors.New("insufficient metadata for query")
	}
	log.Printf("googlebooks query: %s", qstr)

	// Assemble the request parameters; Encode takes care of URL-escaping.
	values := url.Values{}
	values.Set("q", qstr)
	if *googleAPIKey != "" {
		values.Set("key", *googleAPIKey)
	}
	uri := "http://books.google.com/books/feeds/volumes?" + values.Encode()

	resp, err := googleBooksGet(uri)
	if err != nil {
		log.Printf("googlebooks error: %v", err)
		return nil, err
	}
	defer resp.Body.Close()

	return googleBooksParse(resp.Body)
}
// getGoogleBooksId returns the last path element of the ID of the first
// source named "gbooks" in m.Sources, or the empty string when m has no
// such source.
func getGoogleBooksId(m *Metadata) string {
	for _, src := range m.Sources {
		if src.Name != "gbooks" {
			continue
		}
		// The first matching source wins; later ones are never inspected.
		return filepath.Base(src.ID)
	}
	return ""
}
// imageUnavailableMD5 is the MD5 digest of the "image not available"
// placeholder image served by Google Books. It lets the placeholder be told
// apart from a real cover, so that it does not use space on the filesystem.
var imageUnavailableMD5 = [16]byte{
	0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad,
	0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57,
}
// GetBookCover downloads the front-cover image of the Google Books volume
// referenced by m and returns its raw bytes.
//
// An error is returned when m carries no Google Books ID, when the download
// or the read of the response body fails, or when the downloaded bytes hash
// to imageUnavailableMD5, i.e. Google served its "image not available"
// placeholder.
func (r *googleBooksRefiner) GetBookCover(m *Metadata) ([]byte, error) {
	volumeID := getGoogleBooksId(m)
	if volumeID == "" {
		return nil, errors.New("no ID")
	}

	coverURL := fmt.Sprintf("http://books.google.com/books?id=%s&printsec=frontcover&img=1", volumeID)
	resp, err := googleBooksGet(coverURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	image, err := ioutil.ReadAll(resp.Body)
	switch {
	case err != nil:
		return nil, err
	case md5.Sum(image) == imageUnavailableMD5:
		// The placeholder is reported as an error, not returned as a cover.
		return nil, errors.New("image unavailable")
	}
	return image, nil
}