diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py
index dbd342fe58e0101e9cd24405efbe06ca9dfe894f..2d1ef5762bfd382bb3ca0eae1cbb703ee582466a 100644
--- a/server/djrandom/fingerprint/dedup.py
+++ b/server/djrandom/fingerprint/dedup.py
@@ -3,105 +3,155 @@ import os
 import optparse
 import logging
 import json
+import time
 from djrandom import daemonize
 from djrandom import utils
 from djrandom.model.mp3 import MP3
 from djrandom.database import Session, init_db
 
+log = logging.getLogger(__name__)
 
-# Taken from 'fastingest.py', with minor changes.
-def generate_code_json(jdata, track_id):
-    c = json.loads(jdata)
-    if "code" not in c:
-        return {}
-
-    code = c["code"]
-    m = c["metadata"]
-    length = m["duration"]
-    version = m["version"]
-    artist = m.get("artist", None)
-    title = m.get("title", None)
-    release = m.get("release", None)
-    decoded = fp.decode_code_string(code)
+
+
+class DeDuper(object):
+    """Find songs whose echoprint fingerprints match each other."""
+
+    def _generate_code_json(self, jsondata, track_id):
+        """Parse the JSON string output of echoprint-codegen, and return
+        a structure that fp.ingest() can deal with.
+
+        Taken from 'fastingest.py', with minor changes.
+        """
+        c = json.loads(jsondata)
+        if "code" not in c:
+            return {}
+
+        code = c["code"]
+        m = c["metadata"]
+        length = m["duration"]
+        version = m["version"]
+        artist = m.get("artist", None)
+        title = m.get("title", None)
+        release = m.get("release", None)
+        decoded = fp.decode_code_string(code)
+
+        data = {"track_id": track_id,
+                "fp": decoded,
+                "length": length,
+                "codever": "%.2f" % version}
+        if artist: data["artist"] = artist
+        if release: data["release"] = release
+        if title: data["track"] = title
+        return data
+
+    def dedupe(self):
+        """Ingest all known fingerprints, then scan them for duplicates."""
+        self._ingest()
+        self._scan_for_dupes()
+
+    def _ingest(self):
+        """Load all known fingerprints into the db.
+
+        Creates the {sha1: code_string} self.codes dictionary.
+        """
+        self.codes = {}
+        log.debug('loading in-memory fingerprint database...')
+        start = time.time()
+        fp.erase_database(local=True, really_delete=True)
+        # NOTE(review): only MP3s whose artist is u'bonobo' are loaded; this
+        # looks like a leftover debugging filter -- confirm before relying on it.
+        mp3s = MP3.query.filter(
+            (MP3.artist == u'bonobo')
+            & (MP3.ready == True) & (MP3.error == False)
+            & (MP3.echoprint_fp != None))
+        for mp3 in mp3s:
+            code = self._generate_code_json(mp3.echoprint_fp, mp3.sha1)
+            if not code:
+                continue
+            self.codes[mp3.sha1] = code['fp']
+            fp.ingest(code, do_commit=False, local=True)
+        elapsed = time.time() - start
+        log.debug('loaded in-memory fingerprint database in %g seconds' % elapsed)
+
+    def _scan_for_dupes(self):
+        """Query every ingested code and de-dupe songs with several matches."""
+        log.debug('de-duping fingerprint database...')
+        start = time.time()
+        for sha1, code in self.codes.iteritems():
+            results = fp.query_fp(code, local=True).results
+            if len(results) < 2:
+                continue
+            self._dedupe_song(sha1, code, results)
+        elapsed = time.time() - start
+        log.debug('de-duped fingerprint database in %g seconds' % elapsed)
+
+    def _dedupe_song(self, sha1, code_string, results):
+        """Find fingerprint matches and eventually de-duplicate a song.
 
-    data = {"track_id": track_id,
-            "fp": decoded,
-            "length": length,
-            "codever": "%.2f" % version
-            }
-    if artist: data["artist"] = artist
-    if release: data["release"] = release
-    if title: data["track"] = title
-    return data
-
-
-def dedupe_db():
-
-    codes = {}
-
-    # Load all known fingerprints into the db.
-    mp3s = MP3.query.filter(
-        (MP3.artist == u'bonobo')
-        & (MP3.ready == True) & (MP3.error == False)
-        & (MP3.echoprint_fp != None))
-    for mp3 in mp3s:
-        code = generate_code_json(mp3.echoprint_fp, mp3.sha1)
-        if not code:
-            continue
-        codes[mp3.sha1] = code['fp']
-        fp.ingest([code], do_commit=False, local=True)
-
-    # Now dedupe by going through all our codes over again.
-    for sha1, code in codes.iteritems():
-        results = fp.query_fp(code, local=True).results
-        if len(results) < 2:
-            continue
-        print_scores(sha1, code, results)
-
-
-def print_scores(sha1, code_string, results):
-    elbow = 10
-    code_len = len(code_string.split(' ')) / 2
-    actual_scores = {}
-    original_scores = {}
-    for entry in results:
-        track_id = entry['track_id']
-        track_code = fp.local_fp_code_for_track_id(track_id)
-        actual_scores[track_id] = fp.actual_matches(code_string, track_code, elbow=elbow)
-        original_scores[track_id] = entry['score']
-
-    # Histogram-based score computation. Only keep the highest per-track score.
-    sorted_actual_scores = sorted(actual_scores.iteritems(), key=lambda (k, v): v, reverse=True)
-    new_sorted_actual_scores = []
-    existing_track_ids = set()
-    for trid, score in sorted_actual_scores:
-        track_id = trid.split('-')[0]
-        if track_id not in existing_track_ids:
-            existing_track_ids.add(track_id)
-            new_sorted_actual_scores.append((trid, score))
-
-    orig = MP3.query.get(sha1)
-    print "\nSONG: %s/%s (%s) code_len=%d" % (orig.artist, orig.title, sha1, code_len)
-
-    top_score = new_sorted_actual_scores[0][1]
-    for track_id, score in new_sorted_actual_scores:
-        track_sha1 = track_id.split('-')[0]
-        if track_sha1 == sha1:
-            continue
-        if score < code_len * 0.1:
-            continue
-        if score < top_score / 2:
-            continue
-        #if (top_score - score) < (top_score / 2):
-        #    continue
-        mp3 = MP3.query.get(track_sha1)
-        print ' --> %s (%s orig:%s), %s/%s' % (track_sha1, score, original_scores[track_id],
-                                               mp3.artist, mp3.title)
+        Returns True if de-duplication was performed, False otherwise.
+        """
+        elbow = 10
+        code_len = len(code_string.split(' ')) / 2
+        actual_scores = {}
+        original_scores = {}
+        for entry in results:
+            track_id = entry['track_id']
+            track_code = fp.local_fp_code_for_track_id(track_id)
+            actual_scores[track_id] = fp.actual_matches(
+                code_string, track_code, elbow=elbow)
+            original_scores[track_id] = entry['score']
+
+        # Histogram-based score computation. Only keep the highest per-track score.
+        sorted_actual_scores = sorted(actual_scores.iteritems(),
+                                      key=lambda (k, v): v, reverse=True)
+        new_sorted_actual_scores = []
+        existing_track_ids = set()
+        for trid, score in sorted_actual_scores:
+            track_id = trid.split('-')[0]
+            if track_id not in existing_track_ids:
+                existing_track_ids.add(track_id)
+                new_sorted_actual_scores.append((trid, score))
+
+        dupes = []
+
+        top_score = new_sorted_actual_scores[0][1]
+        for track_id, score in new_sorted_actual_scores:
+            track_sha1 = track_id.split('-')[0]
+            if score < code_len * 0.1:
+                continue
+            if score < top_score / 2:
+                continue
+            #if (top_score - score) < (top_score / 2):
+            #    continue
+            dupes.append((track_sha1, score, original_scores[track_id]))
+
+        if len(dupes) < 2:
+            # Only one fingerprint matches. Good!
+            return False
+
+        # Print out some debugging information.
+        orig = MP3.query.get(sha1)
+        log.info("duplicates for '%s/%s' (%s) code_len=%d:" % (
+            orig.artist, orig.title, sha1, code_len))
+        for track_sha1, score, original_score in dupes:
+            mp3 = MP3.query.get(track_sha1)
+            log.info(' --> %s (%s orig:%s), %s/%s' % (
+                track_sha1, score, original_score, mp3.artist, mp3.title))
+
+        # Actually de-duplicate the songs we've found.
+        self._resolve_dupes([x[0] for x in dupes])
+        return True
+
+    def _resolve_dupes(self, hashes):
+        """Perform best duplicate selection and remove dupes from db."""
+        # NOTE(review): for now this only logs the hashes; nothing is removed.
+        log.debug('remove_dupes(%s)' % ','.join(hashes))
+
 
 
 def run_deduper(db_url):
     init_db(db_url)
-    dedupe_db()
+    dup = DeDuper()
+    dup.dedupe()
 
 
 def main():