From 6d0d09b6abb92bcdfb6226404e460ac87178bd57 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 25 Sep 2011 19:59:09 +0100
Subject: [PATCH] do not perform the same deduplication more than once

---
 server/djrandom/fingerprint/dedup.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py
index 8836662..0283051 100644
--- a/server/djrandom/fingerprint/dedup.py
+++ b/server/djrandom/fingerprint/dedup.py
@@ -21,6 +21,7 @@ class DeDuper(object):
 
     def __init__(self):
         self.songs_to_remove = {}
+        self.dupes_cache = {}  # sorted-hash key -> best song
 
     def _generate_code_json(self, jsondata, track_id):
         """Parse the JSON string output of echoprint-codegen, and return
@@ -81,13 +82,17 @@ class DeDuper(object):
         # Now dedupe by going through all our codes over again.
         log.debug('de-duping fingerprint database...')
         start = time.time()
+        dup_count = 0
         for sha1, code in self.codes.iteritems():
             results = fp.query_fp(code, local=True).results
             if len(results) < 2:
                 continue
-            self._dedupe_song(sha1, code, results)
+            if self._dedupe_song(sha1, code, results):
+                dup_count += 1
         elapsed = time.time() - start
         log.debug('de-duped fingerprint database in %g seconds' % elapsed)
+        log.debug('found %d duplicates' % dup_count)
+        return dup_count
 
     def _dedupe_song(self, sha1, code_string, results):
         """Find fingerprint matches and eventually de-duplicate a song.
@@ -170,7 +175,10 @@ class DeDuper(object):
 
     def _resolve_dupes(self, hashes):
         """Perform best duplicate selection and remove dupes from db."""
-        log.debug('remove_dupes(%s)' % ','.join(hashes))
+        hashes_key = ','.join(sorted(hashes))
+        log.debug('remove_dupes(%s)' % hashes_key)
+        if hashes_key in self.dupes_cache:
+            return self.dupes_cache[hashes_key]
 
         def _compare_score(a, b):
             a_bitrate, a_duration, a_nmeta = a[0]
@@ -200,6 +208,7 @@ class DeDuper(object):
         songs_to_remove = dict((x, best_song) for x in hashes if x != best_song)
         log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove))
         self.songs_to_remove.update(songs_to_remove)
+        self.dupes_cache[hashes_key] = best_song
         return best_song
 
     def _cleanup(self):
-- 
GitLab