package liber import ( "crypto/md5" "encoding/xml" "errors" "flag" "fmt" "io" "io/ioutil" "net/http" "net/url" "path/filepath" "strings" "time" ) var googleAPIKey = flag.String("google-api-key", "", "Google API key") type googleBooksRefiner struct{} func (r *googleBooksRefiner) Name() string { return "gbooks" } type atomResultEntry struct { GoogleId string `xml:"id"` Title string `xml:"http://purl.org/dc/terms title"` Date string `xml:"http://purl.org/dc/terms date"` Creator []string `xml:"http://purl.org/dc/terms creator"` Description []string `xml:"http://purl.org/dc/terms description"` Language []string `xml:"http://purl.org/dc/terms language"` Identifier []string `xml:"http://purl.org/dc/terms identifier"` Publisher []string `xml:"http://purl.org/dc/terms publisher"` Format []string `xml:"http://purl.org/dc/terms format"` } func (e *atomResultEntry) ToMetadata() *Metadata { m := &Metadata{ Title: e.Title, Date: toYear(e.Date), Creator: e.Creator, Language: e.Language, Publisher: e.Publisher, Format: e.Format, Sources: []MetadataSource{{ Name: "gbooks", ID: e.GoogleId, }}, } if len(e.Description) > 0 { m.Description = e.Description[0] } for _, id := range e.Identifier { if strings.HasPrefix(id, "ISBN:") { m.ISBN = append(m.ISBN, strings.TrimPrefix(id, "ISBN:")) } } return m } type atomResult struct { XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"` Entries []*atomResultEntry `xml:"entry"` } // Build an appropriate query for Google Books, trying to be as // specific as possible based on the provided metadata. func googleBooksQuery(m *Metadata) string { var query []string // If the ISBN is specified, query it directly. if len(m.ISBN) > 0 { for _, isbn := range m.ISBN { query = append(query, fmt.Sprintf("isbn:%s", isbn)) } return strings.Join(query, " ") } // If we have explicitly tagged metadata, use it. if len(m.Title) > 0 { for _, s := range strings.Fields(m.Title) { query = append(query, fmt.Sprintf("intitle:%s", s)) } } if len(m.Creator) > 0 { for _, author := range m.Creator { for _, s := range strings.Fields(author) { query = append(query, fmt.Sprintf("inauthor:%s", s)) } } } if query != nil { return strings.Join(query, "+") } // Try to make a generic query using keywords. if len(m.Keywords) > 0 { return strings.Join(m.Keywords, " ") } return "" } // Parse the ATOM response and decode metadata. func googleBooksParse(in io.Reader) ([]*Metadata, error) { var result atomResult if err := xml.NewDecoder(in).Decode(&result); err != nil { return nil, err } var out []*Metadata for _, e := range result.Entries { out = append(out, e.ToMetadata()) } return out, nil } // Limit the number of concurrent queries to Google. var gbSemaphore = make(chan bool, 3) // Make a HTTP GET query to Google Books, with exponential backoff in // case of rate limiting. func googleBooksGet(uri string) (*http.Response, error) { gbSemaphore <- true defer func() { <-gbSemaphore }() // Set a deadline to 10 minutes. timeout := 900 * time.Second backoff := 500 * time.Millisecond maxBackoff := 60 * time.Second deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { resp, err := http.Get(uri) if err != nil { return nil, err } switch resp.StatusCode { case 403: // We've been ratelimited. Back off. resp.Body.Close() time.Sleep(backoff) backoff += backoff / 2 if backoff > maxBackoff { backoff = maxBackoff } continue case 200: return resp, nil default: resp.Body.Close() return nil, fmt.Errorf("HTTP status %s", resp.Status) } } return nil, errors.New("deadline exceeded") } func (r *googleBooksRefiner) Lookup(m *Metadata) ([]*Metadata, error) { qstr := googleBooksQuery(m) if qstr == "" { return nil, errors.New("insufficient metadata for query") } values := make(url.Values) values.Set("q", qstr) if *googleAPIKey != "" { values.Set("key", *googleAPIKey) } uri := "http://books.google.com/books/feeds/volumes?" + values.Encode() resp, err := googleBooksGet(uri) if err != nil { return nil, err } defer resp.Body.Close() return googleBooksParse(resp.Body) } func getGoogleBooksId(m *Metadata) string { for _, s := range m.Sources { if s.Name == "gbooks" { return filepath.Base(s.ID) } } return "" } // MD5 of the "image not available" placeholder image served by Google // Books. We'd like to avoid it using space on the filesystem. var imageUnavailableMD5 = [16]byte{ 0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad, 0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57, } func (r *googleBooksRefiner) GetBookCover(m *Metadata) ([]byte, error) { gbid := getGoogleBooksId(m) if gbid == "" { return nil, errors.New("no ID") } uri := fmt.Sprintf("http://books.google.com/books?id=%s&printsec=frontcover&img=1", gbid) resp, err := googleBooksGet(uri) if err != nil { return nil, err } defer resp.Body.Close() data, err := ioutil.ReadAll(resp.Body) if err != nil { return nil, err } if md5.Sum(data) == imageUnavailableMD5 { return nil, errors.New("image unavailable") } return data, nil }