From 98b7f66c262a6732547bc57f9f048c81599ff6d0 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 24 Sep 2011 22:36:55 +0100
Subject: [PATCH] add the basics of a deduper tool

---
 server/djrandom/fingerprint/dedup.py | 124 +++++++++++++++++++++++++++
 server/setup.py                      |   1 +
 2 files changed, 125 insertions(+)
 create mode 100644 server/djrandom/fingerprint/dedup.py

diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py
new file mode 100644
index 0000000..8f30a74
--- /dev/null
+++ b/server/djrandom/fingerprint/dedup.py
@@ -0,0 +1,124 @@
+import fp
+import os
+import optparse
+import logging
+import json
+from djrandom import daemonize
+from djrandom import utils
+from djrandom.model.mp3 import MP3
+from djrandom.database import Session, init_db
+
+
+# Taken from 'fastingest.py', with minor changes.
+def generate_code_json(jdata, track_id):
+    """Build a fingerprint record from a JSON-encoded fingerprint.
+
+    Args:
+      jdata: JSON string with a "code" entry and a "metadata" dict
+        providing at least "duration" and "version".
+      track_id: value stored under the record's "track_id" key.
+
+    Returns:
+      A dict with "track_id", "fp", "length" and "codever", plus
+      "artist", "release" and "track" when the metadata has non-empty
+      values for them. An empty dict if jdata has no "code" entry.
+    """
+    c = json.loads(jdata)
+    if "code" not in c:
+        return {}
+
+    code = c["code"]
+    m = c["metadata"]
+    length = m["duration"]
+    version = m["version"]
+    artist = m.get("artist", None)
+    title = m.get("title", None)
+    release = m.get("release", None)
+    decoded = fp.decode_code_string(code)
+
+    data = {"track_id": track_id,
+            "fp": decoded,
+            "length": length,
+            "codever": "%.2f" % version
+    }
+    # Optional tags are copied only when non-empty; the metadata "title"
+    # is stored under the "track" key.
+    if artist:
+        data["artist"] = artist
+    if release:
+        data["release"] = release
+    if title:
+        data["track"] = title
+    return data
+
+
+def dedupe_db():
+    """Print, for each fingerprinted MP3, the other tracks matching it.
+
+    Ingests the fingerprint of every ready, error-free MP3 into the local
+    'fp' index, then queries the index with each fingerprint and prints
+    the ids and scores of the other tracks returned.
+    """
+    codes = {}
+
+    # Load all known fingerprints into the local fingerprint index.
+    # NOTE(review): '== True' / '!= None' look like SQLAlchemy column
+    # expressions that build SQL; do not rewrite them with 'is'.
+    mp3s = MP3.query.filter(
+        (MP3.ready == True) & (MP3.error == False)
+        & (MP3.echoprint_fp != None))
+    for mp3 in mp3s:
+        code = generate_code_json(mp3.echoprint_fp, mp3.sha1)
+        if not code:
+            # The stored fingerprint has no "code": there is nothing to
+            # index or to look up later, so skip this track.
+            continue
+        codes[mp3.sha1] = code
+        fp.ingest([code], do_commit=False, local=True)
+    fp.commit()
+
+    # Now dedupe by going through all our codes over again.
+    for sha1, code in codes.iteritems():
+        # NOTE(review): the whole record is passed to query_fp(); confirm
+        # it does not expect only the decoded string in code["fp"].
+        results = fp.query_fp(code, local=True)
+        if len(results) < 2:
+            continue
+        print 'SHA1: %s' % sha1
+        for track_id, score in results:
+            if track_id == sha1:
+                continue
+            # The format arguments must be a tuple: without the
+            # parentheses '%' only receives track_id and raises TypeError.
+            print ' --> %s (%f)' % (track_id, score)
+
+
+def run_deduper(db_url):
+    """Initialize the database at db_url, then run the dedupe pass."""
+    init_db(db_url)
+    dedupe_db()
+
+
+def main():
+    """Command-line entry point: parse the options and run the deduper.
+
+    Exits through parser.error() when --db_url is missing or when
+    positional arguments are given.
+    """
+    parser = optparse.OptionParser()
+    parser.add_option('--db_url')
+    daemonize.add_standard_options(parser)
+    utils.read_config_defaults(
+        parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf'))
+    opts, args = parser.parse_args()
+    if not opts.db_url:
+        parser.error('Must provide --db_url')
+    if args:
+        parser.error('Too many arguments')
+
+    daemonize.daemonize(opts, run_deduper,
+                        (opts.db_url,))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/server/setup.py b/server/setup.py
index 4da0494..4f8a61a 100644
--- a/server/setup.py
+++ b/server/setup.py
@@ -18,6 +18,7 @@ setup(
       "djrandom-receiver = djrandom.receiver.receiver:main",
       "djrandom-scanner = djrandom.scanner.scanner:main",
       "djrandom-fingerprinter = djrandom.fingerprint.fingerprint:main",
+      "djrandom-dedup = djrandom.fingerprint.dedup:main",
       "djrandom-streamer = djrandom.stream.stream:main",
       "djrandom-frontend = djrandom.frontend.frontend:main",
       "djrandom-update-markov = djrandom.model.markov:main",
-- 
GitLab