Commit 6d0d09b6 authored by ale

do not perform the same deduplication more than once

parent 0dadf13c
@@ -21,6 +21,7 @@ class DeDuper(object):
     def __init__(self):
         self.songs_to_remove = {}
+        self.dupes_cache = {}  # canonical group key -> chosen best song
 
     def _generate_code_json(self, jsondata, track_id):
         """Parse the JSON string output of echoprint-codegen, and return
@@ -81,13 +82,17 @@ class DeDuper(object):
         # Now dedupe by going through all our codes over again.
         log.debug('de-duping fingerprint database...')
         start = time.time()
+        dup_count = 0
         for sha1, code in self.codes.iteritems():
             results = fp.query_fp(code, local=True).results
             if len(results) < 2:
                 continue
-            self._dedupe_song(sha1, code, results)
+            if self._dedupe_song(sha1, code, results):
+                dup_count += 1
         elapsed = time.time() - start
         log.debug('de-duped fingerprint database in %g seconds' % elapsed)
+        log.debug('found %d duplicates' % dup_count)
+        return dup_count
 
     def _dedupe_song(self, sha1, code_string, results):
         """Find fingerprint matches and eventually de-duplicate a song.
@@ -170,7 +175,10 @@ class DeDuper(object):
     def _resolve_dupes(self, hashes):
         """Perform best duplicate selection and remove dupes from db."""
-        log.debug('remove_dupes(%s)' % ','.join(hashes))
+        hashes_key = ','.join(sorted(hashes))
+        log.debug('remove_dupes(%s)' % hashes_key)
+        if hashes_key in self.dupes_cache:
+            return self.dupes_cache[hashes_key]
 
         def _compare_score(a, b):
             a_bitrate, a_duration, a_nmeta = a[0]
@@ -200,6 +208,7 @@ class DeDuper(object):
         songs_to_remove = dict((x, best_song) for x in hashes if x != best_song)
         log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove))
         self.songs_to_remove.update(songs_to_remove)
+        self.dupes_cache[hashes_key] = best_song
         return best_song
 
     def _cleanup(self):
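
Taken together, the change makes _resolve_dupes memoized: it builds an order-independent key from the group's hashes, returns the previously chosen winner when the same group shows up again, and records the decision on the first pass. A minimal standalone sketch of that pattern, assuming a hypothetical pick_best() in place of the bitrate/duration/metadata scoring elided from the hunk above:

class DeDuper(object):
    def __init__(self):
        self.songs_to_remove = {}
        self.dupes_cache = {}  # canonical group key -> best song hash

    def _resolve_dupes(self, hashes):
        # The same group of songs, in any order, always maps to one cache entry.
        hashes_key = ','.join(sorted(hashes))
        if hashes_key in self.dupes_cache:
            return self.dupes_cache[hashes_key]
        best_song = self.pick_best(hashes)  # stand-in for the real scoring
        self.songs_to_remove.update(
            dict((x, best_song) for x in hashes if x != best_song))
        self.dupes_cache[hashes_key] = best_song
        return best_song

    def pick_best(self, hashes):
        # Hypothetical placeholder; the real code compares bitrate, duration and metadata.
        return min(hashes)

With this in place, calling _resolve_dupes(['b', 'a']) after _resolve_dupes(['a', 'b']) is a cache hit: the group is not re-scored and songs_to_remove is not rebuilt for it, which is what the commit message promises.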