From 467095484ef1a98bd63f41d6622a5241e22b38fe Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 8 Oct 2011 16:47:55 +0200
Subject: [PATCH] support more than one deduplication strategy

---
 server/djrandom/database.py                   |   9 +-
 server/djrandom/fingerprint/cheap_dedup.py    | 120 ++++++++++++++++++
 server/djrandom/fingerprint/compare_songs.py  |  56 ++++++++
 server/djrandom/fingerprint/dedup.py          |  77 +----------
 .../fingerprint/resolve_duplicates.py         |  45 +++++++
 5 files changed, 232 insertions(+), 75 deletions(-)
 create mode 100644 server/djrandom/fingerprint/cheap_dedup.py
 create mode 100644 server/djrandom/fingerprint/compare_songs.py
 create mode 100644 server/djrandom/fingerprint/resolve_duplicates.py

diff --git a/server/djrandom/database.py b/server/djrandom/database.py
index dc5f4cb..4db52f4 100644
--- a/server/djrandom/database.py
+++ b/server/djrandom/database.py
@@ -20,7 +20,12 @@ class SetTextFactory(PoolListener):
 def init_db(uri):
     # Import all ORM modules here, so that 'create_all' can find them.
     from djrandom.model import mp3, playlist
-    engine = create_engine(uri, listeners=[SetTextFactory()],
-                           pool_recycle=1800)
+    if uri.startswith('mysql://'):
+        engine = create_engine(uri, listeners=[SetTextFactory()],
+                               pool_recycle=1800)
+    else:
+        engine = create_engine(uri, pool_recycle=1800)
     Session.configure(bind=engine)
     Base.metadata.create_all(engine)
+    return engine
+
diff --git a/server/djrandom/fingerprint/cheap_dedup.py b/server/djrandom/fingerprint/cheap_dedup.py
new file mode 100644
index 0000000..603c6e7
--- /dev/null
+++ b/server/djrandom/fingerprint/cheap_dedup.py
@@ -0,0 +1,120 @@
+import base64
+import os
+import optparse
+import logging
+import json
+import sys
+import time
+import traceback
+from djrandom import daemonize
+from djrandom import utils
+from djrandom.database import init_db, Session
+from djrandom.model.mp3 import MP3, Fingerprint
+from djrandom.fingerprint.resolve_duplicates import Resolver
+from sqlalchemy import *
+
+log = logging.getLogger(__name__)
+
+
+class CheapDeDuper(object):
+    """Will find _identical_ duplicates (same bitrate)."""
+
+    def __init__(self):
+        self._resolver = Resolver()
+
+    def dedupe_fp(self, engine):
+        """Group READY songs by identical echoprint code and resolve them."""
+        count, errs = 0, 0
+        codes = {}
+        print 'loading all fingerprints'
+        # Skip the ORM and directly query the SQL layer.
+        q = select([Fingerprint.sha1, Fingerprint.echoprint_fp],
+                    (MP3.sha1 == Fingerprint.sha1)
+                    & (MP3.state == MP3.READY)
+                    & (MP3.has_fingerprint == True))
+        for row in engine.execute(q):
+            try:
+                enc_code = str(json.loads(row.echoprint_fp)['code'])
+                code = base64.urlsafe_b64decode(enc_code)
+                if code:
+                    codes.setdefault(code, []).append(row.sha1)
+            except KeyError:
+                continue
+            except Exception:
+                traceback.print_exc()
+                errs += 1
+                continue
+            count += 1
+            if count % 100 == 0:
+                sys.stdout.write('%d  \r' % count)
+                sys.stdout.flush()
+
+        print '\n%d fingerprints, %d errors' % (count, errs)
+        Session.remove()
+        print 'done, scanning for duplicates'
+        for hashes in codes.itervalues():
+            if len(hashes) > 1:
+                self._dedup_songs([MP3.query.get(x) for x in hashes])
+
+    def dedupe(self, dry_run):
+        """Dedup songs sharing artist and title (dry_run is ignored)."""
+        mp3s = MP3.query.filter(
+            (MP3.state == MP3.READY)
+            & (MP3.artist != None) & (MP3.artist != '')
+            & (MP3.title != None) & (MP3.title != '')
+            & (MP3.has_fingerprint == True)
+            ).order_by(asc(MP3.artist), asc(MP3.title))
+        group = []
+        for mp3 in mp3s:
+            # Rows are sorted, so a change of key closes the current group.
+            if group and (mp3.artist, mp3.title) != (
+                    group[0].artist, group[0].title):
+                if len(group) > 1:
+                    self._dedup_songs(group)
+                group = []
+            group.append(mp3)
+        if len(group) > 1:
+            self._dedup_songs(group)
+
+    def _dedup_songs(self, songs):
+        """Print a duplicate group and the best copy picked by the resolver."""
+        def _toutf8(x):
+            try:
+                return x.encode('utf-8')
+            except Exception:
+                return '???'
+        print 'dedup group:'
+        for s in songs:
+            fp = json.loads(s.get_fingerprint()).get('code')
+            print '   - %s / %s / %s' % (_toutf8(s.artist), _toutf8(s.title), _toutf8(s.album))
+            print '     [%s]' % str(fp)[:128]
+        best = self._resolver.resolve_dupes([s.sha1 for s in songs])
+        print '\n  * best: %s\n' % (best, )
+
+
+def run_cheap_deduper(db_url, dry_run):
+    engine = init_db(db_url)
+    dup = CheapDeDuper()
+    # Only the fingerprint-based strategy runs; dry_run is currently unused.
+    dup.dedupe_fp(engine)
+
+
+def main():
+    parser = optparse.OptionParser()
+    parser.add_option('--db_url')
+    parser.add_option('--apply', action='store_true')
+    daemonize.add_standard_options(parser)
+    utils.read_config_defaults(
+        parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf'))
+    opts, args = parser.parse_args()
+    if not opts.db_url:
+        parser.error('Must provide --db_url')
+    if args:
+        parser.error('Too many arguments')
+    # The args tuple is (db_url, dry_run): dry_run is True unless --apply.
+    daemonize.daemonize(opts, run_cheap_deduper,
+                        (opts.db_url, not opts.apply))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/server/djrandom/fingerprint/compare_songs.py b/server/djrandom/fingerprint/compare_songs.py
new file mode 100644
index 0000000..d6f7e1b
--- /dev/null
+++ b/server/djrandom/fingerprint/compare_songs.py
@@ -0,0 +1,56 @@
+import eyeD3
+from djrandom.model.mp3 import MP3
+
+
+# Monkey-patch eyeD3 so that it does not look at file extensions to
+# figure out if something is an MP3 or not.
+eyeD3.tag.isMp3File = lambda x: True
+
+
+def _compare_score(a, b):
+    """cmp-style comparison of two (score, sha1) pairs by score only.
+
+    Scores are (bitrate, duration, num_meta) triples and are ordered
+    lexicographically in that field order.
+    """
+    a_rate, a_secs, a_meta = a[0]
+    b_rate, b_secs, b_meta = b[0]
+    return cmp((a_rate, a_secs, a_meta), (b_rate, b_secs, b_meta))
+
+
+def get_song_score(mp3):
+    """Score a song as (bitrate, duration, num_meta); zeros if unreadable."""
+    try:
+        af = eyeD3.Mp3AudioFile(mp3.path)
+    except Exception:
+        return (0, 0, 0)
+
+    # Get encoding parameters.
+    bitrate = af.getBitRate()[1]
+    duration = 30 * (int(af.getPlayTime()) // 30)  # round down to 30 secs
+
+    # Count metadata tags; images and the main tags weigh more.
+    try:
+        tag = af.getTag()
+        has_album = bool(tag.getAlbum())
+        has_artist = bool(tag.getArtist())
+        has_title = bool(tag.getTitle())
+        has_genre = bool(tag.getGenre())
+        has_year = bool(tag.getYear())
+        has_tracknum = (tag.getTrackNum()[0] is not None)
+        has_images = bool(tag.getImages())
+        num_meta = (4 * int(has_images)
+                    + 2 * sum(map(int, (has_album, has_artist, has_title)))
+                    + sum(map(int, (has_genre, has_year, has_tracknum))))
+    except Exception:
+        num_meta = 0
+    return (bitrate, duration, num_meta)
+
+
+def sort_songs(hashes=None, mp3s=None):
+    """Return (score, sha1) pairs for the given songs, best score first."""
+    assert hashes or mp3s
+    if mp3s is None:
+        mp3s = MP3.query.filter(MP3.sha1.in_(hashes))
+    scored = [(get_song_score(x), x.sha1) for x in mp3s]
+    return sorted(scored, cmp=_compare_score, reverse=True)
diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py
index 652369f..d3dc02d 100644
--- a/server/djrandom/fingerprint/dedup.py
+++ b/server/djrandom/fingerprint/dedup.py
@@ -9,19 +9,15 @@ from djrandom import daemonize
 from djrandom import utils
 from djrandom.model.mp3 import MP3
 from djrandom.database import Session, init_db
+from djrandom.fingerprint.resolve_duplicates import Resolver
 
 log = logging.getLogger(__name__)
 
-# Monkey-patch eyeD3 so that it does not look at file extensions to
-# figure out if something is an MP3 or not.
-eyeD3.tag.isMp3File = lambda x: True
-
 
 class DeDuper(object):
 
     def __init__(self):
-        self.songs_to_remove = {}
-        self.dupes_cache = set()
+        self._resolver = Resolver()
 
     def _generate_code_json(self, jsondata, track_id):
         """Parse the JSON string output of echoprint-codegen, and return
@@ -149,76 +145,11 @@ class DeDuper(object):
                     track_sha1, score, original_score, mp3.artist, mp3.title))
 
         # Actually de-duplicate the songs we've found.
-        self._resolve_dupes([x[0] for x in dupes])
+        self._resolver.resolve_dupes([x[0] for x in dupes])
         return True
 
-    def _get_song_score(self, mp3):
-        af = eyeD3.Mp3AudioFile(mp3.path)
-
-        # Get encoding parameters.
-        bitrate = af.getBitRate()[1]
-        duration = 30 * (int(af.getPlayTime()) / 30) # round to 30 secs
-
-        # Count metadata tags.
-        tag = af.getTag()
-        has_album = not (not tag.getAlbum())
-        has_artist = not (not tag.getArtist())
-        has_title = not (not tag.getTitle())
-        has_genre = not (not tag.getGenre())
-        has_year = not (not tag.getYear())
-        has_tracknum = (tag.getTrackNum()[0] is not None)
-        has_images = not (not tag.getImages())
-        num_meta = (4 * int(has_images) 
-                    + 2 * sum(map(int, (has_album, has_artist, has_title)))
-                    + sum(map(int, (has_genre, has_year, has_tracknum))))
-
-        return (bitrate, duration, num_meta)
-
-    def _resolve_dupes(self, hashes):
-        """Perform best duplicate selection and remove dupes from db."""
-        hashes_key = ','.join(sorted(hashes))
-        log.debug('remove_dupes(%s)' % hashes_key)
-        if hashes_key in self.dupes_cache:
-            return self.dupes_cache[hashes_key]
-
-        def _compare_score(a, b):
-            a_bitrate, a_duration, a_nmeta = a[0]
-            b_bitrate, b_duration, b_nmeta = b[0]
-            res = cmp(a_bitrate, b_bitrate)
-            if res == 0:
-                res = cmp(a_duration, b_duration)
-                if res == 0:
-                    res = cmp(a_nmeta, b_nmeta)
-            return res
-
-        # Compute 'score' for each song and sort them.
-        scores = []
-        mp3s = MP3.query.filter(MP3.sha1.in_(hashes))
-        for mp3 in mp3s:
-            scores.append((self._get_song_score(mp3), mp3.sha1))
-        scores.sort(cmp=_compare_score, reverse=True)
-        best_song = scores[0][1]
-        log.debug('remove_dupes: best song is %s' % best_song)
-        log.debug('remove_dupes: score dump:')
-        for score, sha1 in scores:
-            bitrate, duration, nmeta = score
-            log.debug(' * (bitrate=%s, duration=%s, nmeta=%s) %s' % (
-                    bitrate, duration, nmeta, sha1))
-
-        # Remove all the other songs.
-        songs_to_remove = dict((x, best_song) for x in hashes if x != best_song)
-        log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove))
-        self.songs_to_remove.update(songs_to_remove)
-        self.dupes_cache[hashes_key] = best_song
-        return best_song
-
     def _cleanup(self):
-        for sha1, duplicate_of in self.songs_to_remove.iteritems():
-            # Mark the MP3 as duplicate, remove the associated file.
-            mp3 = MP3.query.get(sha1)
-            mp3.mark_as_duplicate(duplicate_of)
-            Session.add(mp3)
-        Session.commit()
+        self._resolver.commit()
 
 
 def run_deduper(db_url, dry_run):
diff --git a/server/djrandom/fingerprint/resolve_duplicates.py b/server/djrandom/fingerprint/resolve_duplicates.py
new file mode 100644
index 0000000..f635ceb
--- /dev/null
+++ b/server/djrandom/fingerprint/resolve_duplicates.py
@@ -0,0 +1,45 @@
+import logging
+from djrandom.database import Session
+from djrandom.model.mp3 import MP3
+from djrandom.fingerprint.compare_songs import sort_songs
+
+log = logging.getLogger(__name__)
+
+
+class Resolver(object):
+    """Picks the best copy in each duplicate group; commit() applies it."""
+
+    def __init__(self):
+        self._to_remove = {}  # duplicate sha1 -> sha1 of the copy to keep
+        self._cache = {}      # comma-joined sorted sha1s -> best sha1
+
+    def resolve_dupes(self, hashes):
+        """Pick the best of hashes, queue the others; None if none match."""
+        hashes_key = ','.join(sorted(hashes))
+        log.debug('remove_dupes(%s)', hashes_key)
+        if hashes_key in self._cache:
+            return self._cache[hashes_key]
+        by_score = sort_songs(hashes=hashes) if hashes else []  # [] asserts
+        if not by_score:
+            return None
+        best_song = by_score[0][1]
+        log.debug('remove_dupes: best song is %s', best_song)
+        log.debug('remove_dupes: score dump:')
+        for (bitrate, duration, nmeta), sha1 in by_score:
+            log.debug(' * (bitrate=%s, duration=%s, nmeta=%s) %s',
+                      bitrate, duration, nmeta, sha1)
+        songs_to_remove = dict((x, best_song) for x in hashes if x != best_song)
+        log.info('remove_dupes: songs to remove: %s', songs_to_remove)
+        self._to_remove.update(songs_to_remove)
+        self._cache[hashes_key] = best_song
+        return best_song
+
+    def commit(self):
+        """Mark each queued song as a duplicate of its best copy."""
+        for sha1, duplicate_of in self._to_remove.iteritems():
+            mp3 = MP3.query.get(sha1)
+            if mp3 is None:
+                continue  # no such row; nothing to mark
+            mp3.mark_as_duplicate(duplicate_of)
+            Session.add(mp3)
+        Session.commit()
-- 
GitLab