From 6d0d09b6abb92bcdfb6226404e460ac87178bd57 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 25 Sep 2011 19:59:09 +0100
Subject: [PATCH] do not perform the same deduplication more than once

---
Review notes (ignored by git-am):
  * dupes_cache is a dict mapping the sorted, comma-joined hash list to the
    chosen best song. _resolve_dupes subscripts and assigns into it, which a
    set does not support, so it is initialised as {} rather than set().
  * Line breaks and indentation of this patch were reconstructed from the
    hunk line counts; verify against the pre-image before applying.

 server/djrandom/fingerprint/dedup.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py
index 8836662..0283051 100644
--- a/server/djrandom/fingerprint/dedup.py
+++ b/server/djrandom/fingerprint/dedup.py
@@ -21,6 +21,7 @@ class DeDuper(object):
 
     def __init__(self):
         self.songs_to_remove = {}
+        self.dupes_cache = {}
 
     def _generate_code_json(self, jsondata, track_id):
         """Parse the JSON string output of echoprint-codegen, and return
@@ -81,13 +82,17 @@ class DeDuper(object):
         # Now dedupe by going through all our codes over again.
         log.debug('de-duping fingerprint database...')
         start = time.time()
+        dup_count = 0
         for sha1, code in self.codes.iteritems():
             results = fp.query_fp(code, local=True).results
             if len(results) < 2:
                 continue
-            self._dedupe_song(sha1, code, results)
+            if self._dedupe_song(sha1, code, results):
+                dup_count += 1
         elapsed = time.time() - start
         log.debug('de-duped fingerprint database in %g seconds' % elapsed)
+        log.debug('found %d duplicates' % dup_count)
+        return dup_count
 
     def _dedupe_song(self, sha1, code_string, results):
         """Find fingerprint matches and eventually de-duplicate a song.
@@ -170,7 +175,10 @@ class DeDuper(object):
 
     def _resolve_dupes(self, hashes):
         """Perform best duplicate selection and remove dupes from db."""
-        log.debug('remove_dupes(%s)' % ','.join(hashes))
+        hashes_key = ','.join(sorted(hashes))
+        log.debug('remove_dupes(%s)' % hashes_key)
+        if hashes_key in self.dupes_cache:
+            return self.dupes_cache[hashes_key]
 
         def _compare_score(a, b):
             a_bitrate, a_duration, a_nmeta = a[0]
@@ -200,6 +208,7 @@ class DeDuper(object):
         songs_to_remove = dict((x, best_song) for x in hashes if x != best_song)
         log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove))
         self.songs_to_remove.update(songs_to_remove)
+        self.dupes_cache[hashes_key] = best_song
         return best_song
 
     def _cleanup(self):
-- 
GitLab