# external.py

import hashlib
import os
import re
import subprocess
import urllib
import urllib2
from lxml import etree
from djrandom import utils


class AlbumImageDiskCache(object):
    """Cache album art on disk.

    Files are saved and converted to JPEG using Imagemagick.  Negative
    matches are saved as empty files.

    If you want to periodically retry 'missed' entries (to recover from
    temporary errors, for example), you can simply run:

        find $DIR -type f -size 0 -mtime +$DAYS -exec rm -f '{}' +

    """

    def __init__(self, root):
        """Remember `root`, the base directory of the cache."""
        self.root = root

    def _path(self, artist, album):
        """Return the cache file path for an artist/album pair.

        The file name is derived from the SHA-1 of 'artist|album'; the
        on-disk layout below `root` is decided by utils.generate_path().
        """
        key = '%s|%s' % (artist, album)
        # sha1() needs bytes.  Text keys are encoded explicitly so that
        # non-ASCII tags do not fail on the implicit ASCII conversion;
        # byte strings and pure-ASCII text hash exactly as before.
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        album_hash = hashlib.sha1(key).hexdigest()
        return utils.generate_path(self.root, album_hash)

    def has(self, artist, album):
        """Return True if a cache entry (positive or negative) exists."""
        return os.path.exists(self._path(artist, album))

    def get(self, artist, album):
        """Return the path of the cached image, or None.

        None is returned both for negative matches (empty files) and
        when there is no cache entry at all.
        """
        path = self._path(artist, album)
        try:
            size = os.path.getsize(path)
        except OSError:
            # No entry (or unreadable entry): treat it as "no image"
            # rather than propagating a filesystem error to the caller.
            return None
        if size == 0:
            return None
        return path

    def set_negative_match(self, artist, album):
        """Record that no image is available by writing an empty file."""
        with open(self._path(artist, album), 'w'):
            pass

    def download(self, artist, album, url):
        """Download url and convert to JPEG.

        Raises whatever urllib2.urlopen() raises on network errors,
        OSError if the converter cannot be started, and
        subprocess.CalledProcessError if the conversion fails.  In every
        failure case the temporary file is removed, and a failed
        conversion does not leave a cache entry behind.
        """
        path = self._path(artist, album)
        tmpf = path + '.tmp'
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
            # Image data is binary: text mode could corrupt it.
            with open(tmpf, 'wb') as fd:
                fd.write(data)
            try:
                subprocess.check_call(
                    ['/usr/bin/convert', tmpf, '-quality', '75',
                     'jpeg:%s' % path])
            except subprocess.CalledProcessError:
                # Do not let a partial output pass for a valid entry.
                if os.path.exists(path):
                    os.unlink(path)
                raise
        finally:
            if os.path.exists(tmpf):
                os.unlink(tmpf)


class AlbumImageRetriever(object):
    """Look up album cover art via the audioscrobbler web service.

    Results (including misses) are stored in an AlbumImageDiskCache
    rooted at `cache_root`, so each album is only looked up once.
    """

    # XPath (relative to the response root) of the cover image URL.
    _IMAGE_XPATH = 'album/image[@size="extralarge"]'

    def __init__(self, api_key, cache_root):
        self.api_key = api_key
        self.cache = AlbumImageDiskCache(cache_root)

    @staticmethod
    def _utf8(value):
        """Return `value` as a byte string, encoding text as UTF-8."""
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    @staticmethod
    def _candidate_queries(artist, album):
        """Return the (artist, album) pairs to try, in order."""
        queries = [(artist, album)]
        # Fix a minor annoyance that is popular in ID3 tags: if the
        # album name ends in a number, it might be part of a series;
        # in that case, try again without the number.
        m = re.search(r'^(.+) \d+$', album)
        if m:
            queries.append((artist, m.group(1)))
        return queries

    def _get_album_info(self, artist, album):
        """Fetch and parse the album.getInfo response for an album.

        Returns the root element of the parsed XML document.  Network
        and parse errors propagate to the caller.
        """
        args = {'method': 'album.getInfo',
                'artist': artist,
                'album': album,
                'api_key': self.api_key}
        # Encode explicitly: urlencode() on Python 2 chokes on
        # non-ASCII unicode values.
        query = urllib.urlencode(
            dict((key, self._utf8(value)) for key, value in args.items()))
        response = urllib2.urlopen(
            'http://ws.audioscrobbler.com/2.0/?%s' % query)
        try:
            return etree.fromstring(response.read())
        finally:
            response.close()

    def get_album_image(self, artist, album):
        """Return the path of the cached cover image, or None.

        On a cache miss every candidate query is tried in turn; the
        first image found is downloaded into the cache.  A negative
        match is recorded only if no candidate produced an image and at
        least one of them was actually answered by the service, so that
        a run consisting only of errors is retried on the next call.
        """
        if not self.cache.has(artist, album):
            got_answer = False
            for query_artist, query_album in self._candidate_queries(
                    artist, album):
                try:
                    xml = self._get_album_info(query_artist, query_album)
                    images = xml.xpath(self._IMAGE_XPATH)
                    # The element may be present but empty.
                    url = images[0].text if images else None
                    if url:
                        self.cache.download(artist, album, url)
                        break
                    got_answer = True
                except Exception:
                    # Best effort: a failed lookup or download just
                    # means we move on to the next candidate.
                    continue
            else:
                if got_answer:
                    try:
                        self.cache.set_negative_match(artist, album)
                    except EnvironmentError:
                        # Failing to record the miss must not break
                        # the lookup itself.
                        pass
        if not self.cache.has(artist, album):
            return None
        return self.cache.get(artist, album)


if __name__ == '__main__':
    # Command-line entry point: print the cached image path (or None)
    # for the album given on the command line.
    import sys
    if len(sys.argv) != 4:
        # Exit with a usage message instead of an unpacking traceback.
        sys.exit('Usage: %s API_KEY ARTIST ALBUM' % sys.argv[0])
    api_key, artist, album = sys.argv[1:]
    air = AlbumImageRetriever(api_key, '/var/tmp/album-image-cache')
    # The parenthesized single-argument form prints the same output
    # under the Python 2 print statement.
    print(air.get_album_image(artist, album))