Skip to content
Snippets Groups Projects
googlebooks.go 5.13 KiB
Newer Older
  • Learn to ignore specific revisions
  • ale's avatar
    ale committed
    package liber
    
    import (
    	"crypto/md5"
    	"encoding/xml"
    	"errors"
    	"flag"
    	"fmt"
    	"io"
    	"io/ioutil"
    	"net/http"
    	"net/url"
    	"path/filepath"
    	"strings"
    	"time"
    )
    
    
    // googleAPIKey holds the value of the -google-api-key command-line flag.
    // It defaults to the empty string.
    var googleAPIKey = flag.String(
    	"google-api-key",
    	"",
    	"Google API key",
    )
    
    
    // googleBooksRefiner is the refiner identified as "gbooks". It has no
    // fields, so the zero value is ready to use.
    type googleBooksRefiner struct{}

    // Name returns the short identifier of this refiner.
    func (*googleBooksRefiner) Name() string { return "gbooks" }
    
    
    ale's avatar
    ale committed
    // atomResultEntry is the decoded form of a single entry of the Atom
    // feed. The struct tags drive encoding/xml: GoogleId is read from the
    // entry's <id> element, every other field from the element of the same
    // name in the "http://purl.org/dc/terms" namespace.
    type atomResultEntry struct {
    	// GoogleId is the raw text of the entry's <id> element.
    	GoogleId    string   `xml:"id"`
    	Title       string   `xml:"http://purl.org/dc/terms title"`
    	Date        string   `xml:"http://purl.org/dc/terms date"`
    	// The remaining elements may repeat within an entry, hence slices.
    	Creator     []string `xml:"http://purl.org/dc/terms creator"`
    	Description []string `xml:"http://purl.org/dc/terms description"`
    	Language    []string `xml:"http://purl.org/dc/terms language"`
    	Identifier  []string `xml:"http://purl.org/dc/terms identifier"`
    	Publisher   []string `xml:"http://purl.org/dc/terms publisher"`
    	Format      []string `xml:"http://purl.org/dc/terms format"`
    }
    
    // ToMetadata converts a decoded feed entry into a Metadata record.
    //
    // Title, creators, languages, publishers and formats are copied over, the
    // date is passed through toYear, and only the first description (if any)
    // is kept. Identifiers starting with "ISBN:" are collected, prefix
    // removed, into the ISBN list. The result carries a single source named
    // "gbooks" whose ID is the entry's id.
    func (e *atomResultEntry) ToMetadata() *Metadata {
    	const isbnPrefix = "ISBN:"

    	m := new(Metadata)
    	m.Title = e.Title
    	m.Date = toYear(e.Date)
    	m.Creator = e.Creator
    	m.Language = e.Language
    	m.Publisher = e.Publisher
    	m.Format = e.Format
    	m.Sources = []MetadataSource{{Name: "gbooks", ID: e.GoogleId}}

    	if descs := e.Description; len(descs) != 0 {
    		m.Description = descs[0]
    	}

    	for _, ident := range e.Identifier {
    		if !strings.HasPrefix(ident, isbnPrefix) {
    			continue
    		}
    		m.ISBN = append(m.ISBN, ident[len(isbnPrefix):])
    	}
    	return m
    }
    
    // atomResult is the top-level document decoded from the response: a
    // <feed> element in the Atom namespace ("http://www.w3.org/2005/Atom").
    type atomResult struct {
    	// XMLName pins the expected root element name and namespace.
    	XMLName xml.Name           `xml:"http://www.w3.org/2005/Atom feed"`
    	// Entries holds one item per <entry> child of the feed.
    	Entries []*atomResultEntry `xml:"entry"`
    }
    
    // googleBooksQuery builds the Google Books search string for m, using the
    // most specific information available:
    //
    //   - if m has ISBNs, one "isbn:" term per ISBN, joined with spaces;
    //   - otherwise one "intitle:" term per word of the title followed by one
    //     "inauthor:" term per word of every creator, joined with "+";
    //   - otherwise the keywords, joined with spaces.
    //
    // An empty string means m offers nothing to search on.
    func googleBooksQuery(m *Metadata) string {
    	var terms []string

    	// ISBNs identify a book directly, so they win over everything else.
    	if len(m.ISBN) != 0 {
    		for _, isbn := range m.ISBN {
    			terms = append(terms, fmt.Sprintf("isbn:%s", isbn))
    		}
    		return strings.Join(terms, " ")
    	}

    	// Explicitly tagged metadata: title words first, then author words.
    	for _, word := range strings.Fields(m.Title) {
    		terms = append(terms, fmt.Sprintf("intitle:%s", word))
    	}
    	for _, author := range m.Creator {
    		for _, word := range strings.Fields(author) {
    			terms = append(terms, fmt.Sprintf("inauthor:%s", word))
    		}
    	}
    	if len(terms) != 0 {
    		// NOTE(review): this branch joins with "+" while the other two use
    		// " "; Lookup sends the result through url.Values, which
    		// percent-encodes a literal "+" - confirm that is intended.
    		return strings.Join(terms, "+")
    	}

    	// Last resort: a generic keyword query.
    	if len(m.Keywords) != 0 {
    		return strings.Join(m.Keywords, " ")
    	}
    	return ""
    }
    
    // googleBooksParse decodes an Atom feed read from in and converts each of
    // its entries to Metadata, keeping the order of the feed. A decoding
    // failure is returned unchanged together with a nil slice.
    func googleBooksParse(in io.Reader) ([]*Metadata, error) {
    	feed := new(atomResult)
    	if err := xml.NewDecoder(in).Decode(feed); err != nil {
    		return nil, err
    	}

    	var books []*Metadata
    	for i := range feed.Entries {
    		books = append(books, feed.Entries[i].ToMetadata())
    	}
    	return books, nil
    }
    
    // gbSemaphore limits the number of concurrent queries to Google: every
    // in-flight googleBooksGet call holds one of its three slots.
    var gbSemaphore = make(chan bool, 3)

    // googleBooksGet performs an HTTP GET on uri, retrying with a growing
    // backoff for as long as the server answers 403 (rate limiting).
    //
    // On a 200 answer the response is returned and the caller must close its
    // body. Any other status produces an "HTTP status ..." error, errors from
    // the HTTP client are returned as-is, and "deadline exceeded" is returned
    // once the overall time budget is used up. The semaphore slot is held for
    // the whole call, backoff pauses included.
    func googleBooksGet(uri string) (*http.Response, error) {
    	gbSemaphore <- true
    	defer func() { <-gbSemaphore }()

    	const (
    		// Overall time budget for the call, retries included (15 minutes).
    		timeout = 900 * time.Second
    		// Upper bound for the pause between two attempts.
    		maxBackoff = 60 * time.Second
    	)

    	backoff := 500 * time.Millisecond
    	deadline := time.Now().Add(timeout)

    	for {
    		remaining := time.Until(deadline)
    		if remaining <= 0 {
    			break
    		}

    		// http.Get never times out, so a stalled connection would outlive
    		// the deadline and pin a semaphore slot forever. Issue the request
    		// through a copy of the default client bounded by the time left
    		// (the copy keeps any transport/redirect/jar customisation).
    		client := *http.DefaultClient
    		if client.Timeout <= 0 || client.Timeout > remaining {
    			client.Timeout = remaining
    		}

    		resp, err := client.Get(uri)
    		if err != nil {
    			return nil, err
    		}

    		switch resp.StatusCode {
    		case http.StatusOK:
    			return resp, nil

    		case http.StatusForbidden:
    			// We've been ratelimited. Back off, but never sleep past the
    			// deadline.
    			resp.Body.Close()
    			pause := backoff
    			if left := time.Until(deadline); pause > left {
    				pause = left
    			}
    			time.Sleep(pause)

    			// Grow the pause by 50% per attempt, up to maxBackoff.
    			backoff += backoff / 2
    			if backoff > maxBackoff {
    				backoff = maxBackoff
    			}

    		default:
    			resp.Body.Close()
    			return nil, fmt.Errorf("HTTP status %s", resp.Status)
    		}
    	}
    	return nil, errors.New("deadline exceeded")
    }
    
    
    func (r *googleBooksRefiner) Lookup(m *Metadata) ([]*Metadata, error) {
    
    ale's avatar
    ale committed
    	qstr := googleBooksQuery(m)
    	if qstr == "" {
    		return nil, errors.New("insufficient metadata for query")
    	}
    	values := make(url.Values)
    	values.Set("q", qstr)
    
    	if *googleAPIKey != "" {
    		values.Set("key", *googleAPIKey)
    	}
    
    ale's avatar
    ale committed
    	uri := "http://books.google.com/books/feeds/volumes?" + values.Encode()
    	resp, err := googleBooksGet(uri)
    	if err != nil {
    		return nil, err
    	}
    	defer resp.Body.Close()
    	return googleBooksParse(resp.Body)
    }
    
    // getGoogleBooksId returns the Google Books volume ID recorded in m: the
    // last path element of the ID of the first "gbooks" source that has one.
    // It returns "" when m has no such source.
    func getGoogleBooksId(m *Metadata) string {
    	for _, s := range m.Sources {
    		if s.Name != "gbooks" {
    			continue
    		}
    		// filepath.Base maps "" to "." and an all-separator path to a lone
    		// separator. Neither is an ID, and handing them back would defeat
    		// callers that compare the result against "".
    		id := filepath.Base(s.ID)
    		if id == "." || id == string(filepath.Separator) {
    			continue
    		}
    		return id
    	}
    	return ""
    }
    
    // imageUnavailableMD5 is the MD5 digest of the "image not available"
    // placeholder image served by Google Books. Covers matching it are
    // recognised so that they do not take up space on the filesystem.
    var imageUnavailableMD5 = [16]byte{
    	0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad,
    	0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57,
    }
    
    
    func (r *googleBooksRefiner) GetBookCover(m *Metadata) ([]byte, error) {
    
    ale's avatar
    ale committed
    	gbid := getGoogleBooksId(m)
    	if gbid == "" {
    		return nil, errors.New("no ID")
    	}
    
    	uri := fmt.Sprintf("http://books.google.com/books?id=%s&printsec=frontcover&img=1", gbid)
    	resp, err := googleBooksGet(uri)
    	if err != nil {
    		return nil, err
    	}
    	defer resp.Body.Close()
    
    	data, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		return nil, err
    	}
    
    	if md5.Sum(data) == imageUnavailableMD5 {
    		return nil, errors.New("image unavailable")
    	}
    
    	return data, nil
    }