From 467095484ef1a98bd63f41d6622a5241e22b38fe Mon Sep 17 00:00:00 2001 From: ale <ale@incal.net> Date: Sat, 8 Oct 2011 16:47:55 +0200 Subject: [PATCH] support more than one deduplication strategy --- server/djrandom/database.py | 9 +- server/djrandom/fingerprint/cheap_dedup.py | 120 ++++++++++++++++++ server/djrandom/fingerprint/compare_songs.py | 56 ++++++++ server/djrandom/fingerprint/dedup.py | 77 +---------- .../fingerprint/resolve_duplicates.py | 45 +++++++ 5 files changed, 232 insertions(+), 75 deletions(-) create mode 100644 server/djrandom/fingerprint/cheap_dedup.py create mode 100644 server/djrandom/fingerprint/compare_songs.py create mode 100644 server/djrandom/fingerprint/resolve_duplicates.py diff --git a/server/djrandom/database.py b/server/djrandom/database.py index dc5f4cb..4db52f4 100644 --- a/server/djrandom/database.py +++ b/server/djrandom/database.py @@ -20,7 +20,12 @@ class SetTextFactory(PoolListener): def init_db(uri): # Import all ORM modules here, so that 'create_all' can find them. from djrandom.model import mp3, playlist - engine = create_engine(uri, listeners=[SetTextFactory()], - pool_recycle=1800) + if uri.startswith('mysql://'): + engine = create_engine(uri, listeners=[SetTextFactory()], + pool_recycle=1800) + else: + engine = create_engine(uri, pool_recycle=1800) Session.configure(bind=engine) Base.metadata.create_all(engine) + return engine + diff --git a/server/djrandom/fingerprint/cheap_dedup.py b/server/djrandom/fingerprint/cheap_dedup.py new file mode 100644 index 0000000..603c6e7 --- /dev/null +++ b/server/djrandom/fingerprint/cheap_dedup.py @@ -0,0 +1,120 @@ +import base64 +import os +import optparse +import logging +import json +import sys +import time +import traceback +from djrandom import daemonize +from djrandom import utils +from djrandom.database import init_db, Session +from djrandom.model.mp3 import MP3, Fingerprint +from djrandom.fingerprint.resolve_duplicates import Resolver +from sqlalchemy import * + +log = logging.getLogger(__name__) + + +class CheapDeDuper(object): + """Will find _identical_ duplicates (same bitrate).""" + + def __init__(self): + self._resolver = Resolver() + + def dedupe_fp(self, engine): + count, errs = 0, 0 + codes = {} + print 'loading all fingerprints' + + # Skip the ORM and directly query the SQL layer. + q = select([Fingerprint.sha1, Fingerprint.echoprint_fp], + (MP3.sha1 == Fingerprint.sha1) + & (MP3.state == MP3.READY) + & (MP3.has_fingerprint == True)) + for row in engine.execute(q): + try: + enc_code = str(json.loads(row.echoprint_fp)['code']) + code = base64.urlsafe_b64decode(enc_code) + if code: + codes.setdefault(code, []).append(row.sha1) + except KeyError: + continue + except Exception, e: + traceback.print_exc() + errs += 1 + continue + count += 1 + if count % 100 == 0: + sys.stdout.write('%d \r' % count) + sys.stdout.flush() + + print '\n%d fingerprints, %d errors' % (count, errs) + Session.remove() + + print 'done, scanning for duplicates' + for code, hashes in codes.iteritems(): + if len(hashes) > 1: + self._dedup_songs([MP3.query.get(x) for x in hashes]) + + def dedupe(self, dry_run): + dupes = [] + last_mp3 = None + mp3s = MP3.query.filter( + (MP3.state == MP3.READY) + & (MP3.artist != None) & (MP3.artist != '') + & (MP3.title != None) & (MP3.title != '') + & (MP3.has_fingerprint == True) + ).order_by(asc(MP3.artist), asc(MP3.title)) + for mp3 in mp3s: + if last_mp3: + if (mp3.artist == last_mp3.artist and + mp3.title == last_mp3.title): + dupes.append(mp3) + else: + if len(dupes) > 1: + self._dedup_songs(dupes) + dupes = [] + last_mp3 = mp3 + + def _dedup_songs(self, songs): + def _toutf8(x): + try: + return x.encode('utf-8') + except: + return '???' + print 'dedup group:' + for s in songs: + fp = json.loads(s.get_fingerprint()).get('code') + print ' - %s / %s / %s' % (_toutf8(s.artist), _toutf8(s.title), _toutf8(s.album)) + print ' [%s]' % str(fp)[:128] + best = self._resolver.resolve_dupes([s.sha1 for s in songs]) + print '\n * best: %s\n' % (best, ) + + +def run_cheap_deduper(db_url, dry_run): + engine = init_db(db_url) + dup = CheapDeDuper() + #dup.dedupe(dry_run) + dup.dedupe_fp(engine) + + +def main(): + parser = optparse.OptionParser() + parser.add_option('--db_url') + parser.add_option('--apply', action='store_true') + daemonize.add_standard_options(parser) + utils.read_config_defaults( + parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf')) + opts, args = parser.parse_args() + if not opts.db_url: + parser.error('Must provide --db_url') + if args: + parser.error('Too many arguments') + + daemonize.daemonize(opts, run_cheap_deduper, + (opts.db_url, not opts.apply)) + + +if __name__ == '__main__': + main() diff --git a/server/djrandom/fingerprint/compare_songs.py b/server/djrandom/fingerprint/compare_songs.py new file mode 100644 index 0000000..d6f7e1b --- /dev/null +++ b/server/djrandom/fingerprint/compare_songs.py @@ -0,0 +1,56 @@ +import eyeD3 +from djrandom.model.mp3 import MP3 + + +# Monkey-patch eyeD3 so that it does not look at file extensions to +# figure out if something is an MP3 or not. +eyeD3.tag.isMp3File = lambda x: True + + +def _compare_score(a, b): + a_bitrate, a_duration, a_nmeta = a[0] + b_bitrate, b_duration, b_nmeta = b[0] + res = cmp(a_bitrate, b_bitrate) + if res == 0: + res = cmp(a_duration, b_duration) + if res == 0: + res = cmp(a_nmeta, b_nmeta) + return res + + +def get_song_score(mp3): + try: + af = eyeD3.Mp3AudioFile(mp3.path) + except: + return (0, 0, 0) + + # Get encoding parameters. + bitrate = af.getBitRate()[1] + duration = 30 * (int(af.getPlayTime()) / 30) # round to 30 secs + + # Count metadata tags. + try: + tag = af.getTag() + has_album = not (not tag.getAlbum()) + has_artist = not (not tag.getArtist()) + has_title = not (not tag.getTitle()) + has_genre = not (not tag.getGenre()) + has_year = not (not tag.getYear()) + has_tracknum = (tag.getTrackNum()[0] is not None) + has_images = not (not tag.getImages()) + num_meta = (4 * int(has_images) + + 2 * sum(map(int, (has_album, has_artist, has_title))) + + sum(map(int, (has_genre, has_year, has_tracknum)))) + except: + num_meta = 0 + + return (bitrate, duration, num_meta) + + +def sort_songs(hashes=None, mp3s=None): + assert hashes or mp3s + if mp3s is None: + mp3s = MP3.query.filter(MP3.sha1.in_(hashes)) + return sorted(((get_song_score(x), x.sha1) for x in mp3s), + cmp=_compare_score, reverse=True) + diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py index 652369f..d3dc02d 100644 --- a/server/djrandom/fingerprint/dedup.py +++ b/server/djrandom/fingerprint/dedup.py @@ -9,19 +9,15 @@ from djrandom import daemonize from djrandom import utils from djrandom.model.mp3 import MP3 from djrandom.database import Session, init_db +from djrandom.fingerprint.resolve_duplicates import Resolver log = logging.getLogger(__name__) -# Monkey-patch eyeD3 so that it does not look at file extensions to -# figure out if something is an MP3 or not. -eyeD3.tag.isMp3File = lambda x: True - class DeDuper(object): def __init__(self): - self.songs_to_remove = {} - self.dupes_cache = set() + self._resolver = Resolver() def _generate_code_json(self, jsondata, track_id): """Parse the JSON string output of echoprint-codegen, and return @@ -149,76 +145,11 @@ class DeDuper(object): track_sha1, score, original_score, mp3.artist, mp3.title)) # Actually de-duplicate the songs we've found. - self._resolve_dupes([x[0] for x in dupes]) + self._resolver.resolve_dupes([x[0] for x in dupes]) return True - def _get_song_score(self, mp3): - af = eyeD3.Mp3AudioFile(mp3.path) - - # Get encoding parameters. - bitrate = af.getBitRate()[1] - duration = 30 * (int(af.getPlayTime()) / 30) # round to 30 secs - - # Count metadata tags. - tag = af.getTag() - has_album = not (not tag.getAlbum()) - has_artist = not (not tag.getArtist()) - has_title = not (not tag.getTitle()) - has_genre = not (not tag.getGenre()) - has_year = not (not tag.getYear()) - has_tracknum = (tag.getTrackNum()[0] is not None) - has_images = not (not tag.getImages()) - num_meta = (4 * int(has_images) - + 2 * sum(map(int, (has_album, has_artist, has_title))) - + sum(map(int, (has_genre, has_year, has_tracknum)))) - - return (bitrate, duration, num_meta) - - def _resolve_dupes(self, hashes): - """Perform best duplicate selection and remove dupes from db.""" - hashes_key = ','.join(sorted(hashes)) - log.debug('remove_dupes(%s)' % hashes_key) - if hashes_key in self.dupes_cache: - return self.dupes_cache[hashes_key] - - def _compare_score(a, b): - a_bitrate, a_duration, a_nmeta = a[0] - b_bitrate, b_duration, b_nmeta = b[0] - res = cmp(a_bitrate, b_bitrate) - if res == 0: - res = cmp(a_duration, b_duration) - if res == 0: - res = cmp(a_nmeta, b_nmeta) - return res - - # Compute 'score' for each song and sort them. - scores = [] - mp3s = MP3.query.filter(MP3.sha1.in_(hashes)) - for mp3 in mp3s: - scores.append((self._get_song_score(mp3), mp3.sha1)) - scores.sort(cmp=_compare_score, reverse=True) - best_song = scores[0][1] - log.debug('remove_dupes: best song is %s' % best_song) - log.debug('remove_dupes: score dump:') - for score, sha1 in scores: - bitrate, duration, nmeta = score - log.debug(' * (bitrate=%s, duration=%s, nmeta=%s) %s' % ( - bitrate, duration, nmeta, sha1)) - - # Remove all the other songs. - songs_to_remove = dict((x, best_song) for x in hashes if x != best_song) - log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove)) - self.songs_to_remove.update(songs_to_remove) - self.dupes_cache[hashes_key] = best_song - return best_song - def _cleanup(self): - for sha1, duplicate_of in self.songs_to_remove.iteritems(): - # Mark the MP3 as duplicate, remove the associated file. - mp3 = MP3.query.get(sha1) - mp3.mark_as_duplicate(duplicate_of) - Session.add(mp3) - Session.commit() + self._resolver.commit() def run_deduper(db_url, dry_run): diff --git a/server/djrandom/fingerprint/resolve_duplicates.py b/server/djrandom/fingerprint/resolve_duplicates.py new file mode 100644 index 0000000..f635ceb --- /dev/null +++ b/server/djrandom/fingerprint/resolve_duplicates.py @@ -0,0 +1,45 @@ +import logging +from djrandom.database import Session +from djrandom.model.mp3 import MP3 +from djrandom.fingerprint.compare_songs import sort_songs + +log = logging.getLogger(__name__) + + +class Resolver(object): + + def __init__(self): + self._to_remove = {} + self._cache = {} + + def resolve_dupes(self, hashes): + """Perform best duplicate selection and remove dupes from db.""" + hashes_key = ','.join(sorted(hashes)) + log.debug('remove_dupes(%s)' % hashes_key) + if hashes_key in self._cache: + return self._cache[hashes_key] + + # Compute 'score' for each song and sort them. + by_score = sort_songs(hashes=hashes) + best_song = by_score[0][1] + log.debug('remove_dupes: best song is %s' % best_song) + log.debug('remove_dupes: score dump:') + for score, sha1 in by_score: + bitrate, duration, nmeta = score + log.debug(' * (bitrate=%s, duration=%s, nmeta=%s) %s' % ( + bitrate, duration, nmeta, sha1)) + + # Remove all the other songs. + songs_to_remove = dict((x, best_song) for x in hashes if x != best_song) + log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove)) + self._to_remove.update(songs_to_remove) + self._cache[hashes_key] = best_song + return best_song + + def commit(self): + for sha1, duplicate_of in self._to_remove.iteritems(): + mp3 = MP3.query.get(sha1) + mp3.mark_as_duplicate(duplicate_of) + Session.add(mp3) + Session.commit() + -- GitLab