Select Git revision
crawler_test.go
-
ale authored
This change allows more complex scope boundaries, including loosening edges a bit to include related resources of HTML pages (which makes for more complete archives if desired).
ale authored: This change allows more complex scope boundaries, including loosening edges a bit to include related resources of HTML pages (which makes for more complete archives if desired).
external.py 2.46 KiB
import hashlib
import os
import subprocess
import urllib
import urllib2
from lxml import etree
from djrandom import utils
class AlbumImageDiskCache(object):
    """Cache album art on disk.

    Files are saved and converted to JPEG using Imagemagick. Negative
    matches are saved as empty files.
    """

    def __init__(self, root):
        """Remember `root`, which is handed to `utils.generate_path`
        together with each entry's hash to build the entry's file path.
        """
        self.root = root

    def _path(self, artist, album):
        """Return the cache file path for an artist/album pair.

        The path is produced by `utils.generate_path` from the cache root
        and the SHA-1 hex digest of the string 'artist|album'.
        """
        key = '%s|%s' % (artist, album)
        # hashlib needs bytes. Text keys are encoded as UTF-8 so that
        # non-ASCII names do not fail; byte strings are hashed unchanged,
        # so existing entries keep their location.
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        album_hash = hashlib.sha1(key).hexdigest()
        return utils.generate_path(self.root, album_hash)

    def has(self, artist, album):
        """Return True if an entry (image or negative match) exists."""
        return os.path.exists(self._path(artist, album))

    def get(self, artist, album):
        """Return the path of the cached image, or None.

        None is returned both for a negative match (empty file) and when
        there is no entry at all; use `has` to tell the two apart.
        """
        path = self._path(artist, album)
        try:
            size = os.path.getsize(path)
        except OSError:
            # No entry on disk (or it is unreadable): nothing to serve.
            return None
        if size == 0:
            return None
        return path

    def set_negative_match(self, artist, album):
        """Record that no image is available by creating an empty file."""
        with open(self._path(artist, album), 'w'):
            pass

    def download(self, artist, album, url):
        """Download url and convert to JPEG.

        The converted image only appears at the cache path once the
        conversion has succeeded; temporary files are always removed.

        Raises:
            subprocess.CalledProcessError: if `convert` exits non-zero.
            Any error raised by `urllib2.urlopen` or by file operations.
        """
        path = self._path(artist, album)
        tmp_in = path + '.tmp'
        tmp_out = path + '.out.tmp'

        # Fetch first, so a failed download leaves nothing on disk.
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()

        try:
            # Image data is binary: never write it through a text-mode file.
            with open(tmp_in, 'wb') as fd:
                fd.write(data)
            subprocess.check_call(
                ['/usr/bin/convert', tmp_in, '-quality', '75',
                 'jpeg:%s' % tmp_out])
            # Move the finished file into place so that a failed or partial
            # conversion is never mistaken for a valid cache entry.
            os.rename(tmp_out, path)
        finally:
            for leftover in (tmp_in, tmp_out):
                if os.path.exists(leftover):
                    os.unlink(leftover)
class AlbumImageRetriever(object):
    """Fetch album cover images through the audioscrobbler (Last.fm)
    `album.getInfo` web service, keeping results in an on-disk cache.
    """

    def __init__(self, api_key, cache_root):
        """Store the web service API key and create the disk cache.

        `cache_root` is passed straight to `AlbumImageDiskCache`.
        """
        self.api_key = api_key
        self.cache = AlbumImageDiskCache(cache_root)

    def _get_album_info(self, artist, album):
        """Query `album.getInfo` and return the parsed XML root element.

        Raises whatever `urllib2.urlopen` or `etree.fromstring` raise on
        network or parse errors.
        """
        args = {'method': 'album.getInfo',
                'artist': artist,
                'album': album,
                'api_key': self.api_key}
        # urlencode() applies str() to each value, which fails on non-ASCII
        # unicode text; encode such values as UTF-8 first. Byte strings and
        # non-string values are passed through untouched.
        encoded = {}
        for name, value in args.items():
            if not isinstance(value, bytes) and hasattr(value, 'encode'):
                value = value.encode('utf-8')
            encoded[name] = value
        response = urllib2.urlopen(
            'http://ws.audioscrobbler.com/2.0/?%s' % urllib.urlencode(encoded))
        try:
            return etree.fromstring(response.read())
        finally:
            # Release the connection even when reading or parsing fails.
            response.close()

    def get_album_image(self, artist, album):
        """Return the local path of the album image, or None.

        On a cache miss the web service is queried: an 'extralarge' image
        is downloaded into the cache, while a response without a usable
        image URL is cached as a negative match. Any error during lookup
        or download yields None without caching anything, so the lookup
        is attempted again on the next call.
        """
        if not self.cache.has(artist, album):
            try:
                xml = self._get_album_info(artist, album)
                images = etree.XPath('album/image[@size="extralarge"]')(xml)
                url = images[0].text if images else None
                if url is not None:
                    url = url.strip()
                if url:
                    self.cache.download(artist, album, url)
                else:
                    # Either there is no <image> element, or it has no
                    # text: both mean there is nothing to download.
                    self.cache.set_negative_match(artist, album)
            except Exception:
                # Best effort: report "no image" on failure, but let
                # KeyboardInterrupt / SystemExit propagate.
                return None
        return self.cache.get(artist, album)
if __name__ == '__main__':
    # Command-line entry point; expects exactly three arguments:
    #   API_KEY ARTIST ALBUM
    # Prints what get_album_image() returns: the cached image path, or None.
    import sys
    (api_key, artist, album) = sys.argv[1:]
    retriever = AlbumImageRetriever(api_key, '/var/tmp/album-image-cache')
    print(retriever.get_album_image(artist, album))