googlebooks.go

package liber

import (
	"crypto/md5"
	"encoding/xml"
	"errors"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"
	"time"
)

// googleAPIKey points at the value of the -google-api-key command-line
// flag. The default is the empty string, in which case Lookup does not
// add a "key" parameter to its requests.
var googleAPIKey = flag.String(
	"google-api-key",
	"",
	"Google API key",
)

// googleBooksRefiner looks up book metadata and cover images on Google
// Books. The type has no fields.
type googleBooksRefiner struct{}

// Name returns the short identifier of this refiner, "gbooks". The
// same string tags the metadata sources produced from Google Books
// results.
func (r *googleBooksRefiner) Name() string {
	const name = "gbooks"
	return name
}

// atomResultEntry holds the fields decoded from a single entry of the
// Atom feed returned by Google Books (see atomResult).
//
// GoogleId is read from the entry's "id" element; every other field is
// read from the element named in its tag within the
// http://purl.org/dc/terms namespace. Fields declared as slices
// collect every occurrence of a repeated element.
//
// ToMetadata keeps only the first Description, and only those
// Identifier values that start with "ISBN:".
type atomResultEntry struct {
	GoogleId    string   `xml:"id"`
	Title       string   `xml:"http://purl.org/dc/terms title"`
	Date        string   `xml:"http://purl.org/dc/terms date"`
	Creator     []string `xml:"http://purl.org/dc/terms creator"`
	Description []string `xml:"http://purl.org/dc/terms description"`
	Language    []string `xml:"http://purl.org/dc/terms language"`
	Identifier  []string `xml:"http://purl.org/dc/terms identifier"`
	Publisher   []string `xml:"http://purl.org/dc/terms publisher"`
	Format      []string `xml:"http://purl.org/dc/terms format"`
}

// ToMetadata converts the feed entry into a Metadata record.
//
// The result carries a single source named "gbooks" whose ID is the
// entry's id element. The date is passed through toYear. Only the
// first description (if any) is kept, and only identifiers carrying
// the "ISBN:" prefix are retained, with the prefix removed.
func (e *atomResultEntry) ToMetadata() *Metadata {
	const isbnPrefix = "ISBN:"

	meta := new(Metadata)
	meta.Title = e.Title
	meta.Date = toYear(e.Date)
	meta.Creator = e.Creator
	meta.Language = e.Language
	meta.Publisher = e.Publisher
	meta.Format = e.Format
	meta.Sources = []MetadataSource{{Name: "gbooks", ID: e.GoogleId}}

	if len(e.Description) != 0 {
		meta.Description = e.Description[0]
	}

	for _, ident := range e.Identifier {
		if !strings.HasPrefix(ident, isbnPrefix) {
			// Not an ISBN; other identifier kinds are dropped.
			continue
		}
		meta.ISBN = append(meta.ISBN, ident[len(isbnPrefix):])
	}
	return meta
}

// atomResult is the top-level document of a Google Books response.
// The XMLName tag makes decoding fail unless the root element is
// "feed" in the http://www.w3.org/2005/Atom namespace. Entries
// receives one atomResultEntry per "entry" child element.
type atomResult struct {
	XMLName xml.Name           `xml:"http://www.w3.org/2005/Atom feed"`
	Entries []*atomResultEntry `xml:"entry"`
}

// googleBooksQuery builds the most specific Google Books query string
// that the provided metadata allows. Strategies are tried in order,
// and the first one that produces anything wins:
//
//  1. one "isbn:" term per ISBN, joined with spaces;
//  2. a quoted "intitle:" term for the title plus one "inauthor:"
//     term per whitespace-separated word of every creator, joined
//     with "+";
//  3. the keywords, joined with spaces.
//
// The empty string is returned when none of the above applies.
func googleBooksQuery(m *Metadata) string {
	// ISBNs are the most precise handle we can have: use them alone.
	if len(m.ISBN) > 0 {
		terms := make([]string, 0, len(m.ISBN))
		for _, isbn := range m.ISBN {
			terms = append(terms, "isbn:"+isbn)
		}
		return strings.Join(terms, " ")
	}

	// Next best: explicitly tagged title and authors.
	var terms []string
	if m.Title != "" {
		terms = append(terms, fmt.Sprintf(`intitle:"%s"`, m.Title))
	}
	for _, author := range m.Creator {
		for _, word := range strings.Fields(author) {
			terms = append(terms, "inauthor:"+word)
		}
	}
	if len(terms) > 0 {
		return strings.Join(terms, "+")
	}

	// Last resort: a generic keyword search.
	if len(m.Keywords) == 0 {
		return ""
	}
	return strings.Join(m.Keywords, " ")
}

// googleBooksParse decodes an Atom feed read from in and converts each
// of its entries to Metadata, preserving the order of the feed.
//
// A decoding error is returned as is. A feed without entries yields a
// nil slice and a nil error.
func googleBooksParse(in io.Reader) ([]*Metadata, error) {
	feed := new(atomResult)
	if err := xml.NewDecoder(in).Decode(feed); err != nil {
		return nil, err
	}

	var books []*Metadata
	for i := range feed.Entries {
		books = append(books, feed.Entries[i].ToMetadata())
	}
	return books, nil
}

// gbSemaphore limits the number of concurrent queries to Google.
// googleBooksGet sends a value on entry and receives one on return, so
// the channel's capacity (3) bounds how many calls can be in progress
// at once; further callers block until a slot is released.
var gbSemaphore = make(chan bool, 3)

// googleBooksGet makes an HTTP GET request to Google Books, retrying
// with exponential backoff for as long as the server answers with
// 403 Forbidden, which is treated as rate limiting.
//
// Concurrency is bounded by gbSemaphore; the slot is held for the
// whole call, including the time spent sleeping between retries.
//
// On success (status 200) the response is returned and the caller is
// responsible for closing its body. Transport errors, and any status
// other than 200 or 403, are returned immediately without retrying.
// If the server is still rate limiting when the overall deadline
// would be hit, a "deadline exceeded" error is returned.
func googleBooksGet(uri string) (*http.Response, error) {
	gbSemaphore <- true
	defer func() {
		<-gbSemaphore
	}()

	const (
		// Give up after 15 minutes overall.
		timeout = 15 * time.Minute
		// The delay starts here and grows by 50% on each retry...
		initialBackoff = 500 * time.Millisecond
		// ...until it reaches this ceiling.
		maxBackoff = 60 * time.Second
	)

	backoff := initialBackoff
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		resp, err := http.Get(uri)
		if err != nil {
			return nil, err
		}
		switch resp.StatusCode {
		case http.StatusForbidden:
			// We've been ratelimited. Back off.
			resp.Body.Close()
			// No new attempt is ever started after the deadline, so a
			// sleep that would carry us past it is pure waste (and
			// keeps a semaphore slot busy): give up right away.
			if deadline.Sub(time.Now()) <= backoff {
				return nil, errors.New("deadline exceeded")
			}
			time.Sleep(backoff)
			backoff += backoff / 2
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
			continue
		case http.StatusOK:
			return resp, nil
		default:
			resp.Body.Close()
			return nil, fmt.Errorf("HTTP status %s", resp.Status)
		}
	}
	return nil, errors.New("deadline exceeded")
}

// Lookup searches Google Books for volumes matching m and returns the
// metadata of every entry in the response feed.
//
// The query is built by googleBooksQuery; if it comes back empty an
// error is returned without contacting the service. When the
// -google-api-key flag is set, its value is sent as the "key"
// parameter. The query and any request error are logged.
//
// NOTE(review): the request goes to an http:// URL, so the API key is
// sent unencrypted unless the server redirects first. Confirm whether
// the endpoint can be switched to https.
func (r *googleBooksRefiner) Lookup(m *Metadata) ([]*Metadata, error) {
	query := googleBooksQuery(m)
	if len(query) == 0 {
		return nil, errors.New("insufficient metadata for query")
	}
	log.Printf("googlebooks query: %s", query)

	params := url.Values{}
	params.Set("q", query)
	if key := *googleAPIKey; key != "" {
		params.Set("key", key)
	}

	resp, err := googleBooksGet("http://books.google.com/books/feeds/volumes?" + params.Encode())
	if err != nil {
		log.Printf("googlebooks error: %v", err)
		return nil, err
	}
	defer resp.Body.Close()

	return googleBooksParse(resp.Body)
}

// getGoogleBooksId returns the Google Books identifier recorded in m,
// or the empty string if there is none.
//
// The identifier is the last path element of the ID of the first
// source named "gbooks" that has a non-empty ID. Sources with an empty
// ID are skipped: filepath.Base("") is ".", which would otherwise be
// handed back to callers as if it were a usable identifier.
func getGoogleBooksId(m *Metadata) string {
	for _, s := range m.Sources {
		if s.Name != "gbooks" || s.ID == "" {
			continue
		}
		return filepath.Base(s.ID)
	}
	return ""
}

// imageUnavailableMD5 is the MD5 digest of the "image not available"
// placeholder image served by Google Books. GetBookCover compares
// every download against it and reports the placeholder as an error,
// so that it never ends up taking space on the filesystem.
var imageUnavailableMD5 = [16]byte{
	0x0d, 0xe4, 0x38, 0x3e, 0xba, 0xd0, 0xad, 0xad,
	0x5e, 0xeb, 0x89, 0x75, 0xcd, 0x79, 0x66, 0x57,
}

// GetBookCover downloads the front cover image of the book described
// by m from Google Books and returns the raw image bytes.
//
// The book is identified through getGoogleBooksId; a "no ID" error is
// returned when m carries no usable Google Books identifier. Errors
// from the request or from reading the body are returned as is. If the
// downloaded data is the known "image not available" placeholder
// (recognised by its MD5 digest), an "image unavailable" error is
// returned instead of the data.
func (r *googleBooksRefiner) GetBookCover(m *Metadata) ([]byte, error) {
	gbid := getGoogleBooksId(m)
	if gbid == "" {
		return nil, errors.New("no ID")
	}

	// The ID originates outside this function, so escape it rather
	// than splicing it into the query string verbatim. Identifiers
	// made only of letters, digits, '-', '_', '.' and '~' are left
	// untouched by QueryEscape.
	uri := fmt.Sprintf("http://books.google.com/books?id=%s&printsec=frontcover&img=1", url.QueryEscape(gbid))
	resp, err := googleBooksGet(uri)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	// Google answers with a placeholder picture rather than an error
	// when it has no cover; do not pass that off as a real cover.
	if md5.Sum(data) == imageUnavailableMD5 {
		return nil, errors.New("image unavailable")
	}

	return data, nil
}