Skip to content
Snippets Groups Projects
Commit 98b7f66c authored by ale's avatar ale
Browse files

add the basics of a deduper tool

parent cdde4f3e
No related branches found
No related tags found
No related merge requests found
import fp
import os
import optparse
import logging
import json
from djrandom import daemonize
from djrandom import utils
from djrandom.model.mp3 import MP3
from djrandom.database import Session, init_db
# Taken from 'fastingest.py', with minor changes.
def generate_code_json(jdata, track_id):
    """Build a fingerprint record from a JSON-encoded codegen result.

    `jdata` is a JSON string. If the decoded object has no "code" key an
    empty dict is returned. Otherwise the result carries the given
    `track_id`, the code string passed through `fp.decode_code_string`,
    the "duration" and "version" entries of the "metadata" object, and
    any non-empty artist / release / title metadata (title is stored
    under the "track" key).
    """
    parsed = json.loads(jdata)
    if "code" not in parsed:
        return {}

    raw_code = parsed["code"]
    meta = parsed["metadata"]
    duration = meta["duration"]
    codever = meta["version"]

    # Optional metadata: (key in the metadata, key in the output record).
    optional = [
        ("artist", meta.get("artist", None)),
        ("release", meta.get("release", None)),
        ("track", meta.get("title", None)),
    ]

    record = {
        "track_id": track_id,
        "fp": fp.decode_code_string(raw_code),
        "length": duration,
        "codever": "%.2f" % codever,
    }
    for out_key, value in optional:
        if value:
            record[out_key] = value
    return record
def dedupe_db():
    """Print groups of tracks whose fingerprints match other tracks.

    Every MP3 that is ready, error-free and has an echoprint fingerprint
    is ingested into the local fingerprint index, keyed by its SHA1.
    Each fingerprint is then queried against that index, and for every
    track with at least two results the other matching track ids are
    printed together with their scores.
    """
    codes = {}

    # Load all known fingerprints into the local fingerprint index.
    # NOTE: '== True' / '!= None' are SQLAlchemy column expressions and
    # must not be rewritten as 'is' comparisons.
    mp3s = MP3.query.filter(
        (MP3.ready == True) & (MP3.error == False)
        & (MP3.echoprint_fp != None))
    for mp3 in mp3s:
        code = generate_code_json(mp3.echoprint_fp, mp3.sha1)
        if not code:
            # generate_code_json() returns {} when the stored fingerprint
            # has no "code" entry: there is nothing to ingest or query.
            continue
        codes[mp3.sha1] = code
        fp.ingest([code], do_commit=False, local=True)
    fp.commit()

    # Now dedupe by going through all our codes over again.
    for sha1, code in codes.items():
        results = fp.query_fp(code, local=True)
        # A single result is presumably the track matching itself.
        if len(results) < 2:
            continue
        print('SHA1: %s' % sha1)
        for track_id, score in results:
            if track_id == sha1:
                continue
            # Both values must be passed as one tuple to the format string.
            print(' --> %s (%f)' % (track_id, score))
def run_deduper(db_url):
    """Initialize the database at `db_url`, then run the dedupe pass."""
    init_db(db_url)
    dedupe_db()
def main():
    """Command-line entry point for the deduper.

    Parses the command line (with defaults read from the file named by
    $DJRANDOM_CONF, or /etc/djrandom.conf), requires --db_url, rejects
    positional arguments, and hands `run_deduper` to the daemonize helper.
    """
    parser = optparse.OptionParser()
    parser.add_option('--db_url')
    daemonize.add_standard_options(parser)
    utils.read_config_defaults(
        parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf'))
    opts, args = parser.parse_args()
    if not opts.db_url:
        parser.error('Must provide --db_url')
    if args:
        parser.error('Too many arguments')

    daemonize.daemonize(opts, run_deduper,
                        (opts.db_url,))


if __name__ == '__main__':
    main()
...@@ -18,6 +18,7 @@ setup( ...@@ -18,6 +18,7 @@ setup(
"djrandom-receiver = djrandom.receiver.receiver:main", "djrandom-receiver = djrandom.receiver.receiver:main",
"djrandom-scanner = djrandom.scanner.scanner:main", "djrandom-scanner = djrandom.scanner.scanner:main",
"djrandom-fingerprinter = djrandom.fingerprint.fingerprint:main", "djrandom-fingerprinter = djrandom.fingerprint.fingerprint:main",
"djrandom-dedup = djrandom.fingerprint.dedup:main",
"djrandom-streamer = djrandom.stream.stream:main", "djrandom-streamer = djrandom.stream.stream:main",
"djrandom-frontend = djrandom.frontend.frontend:main", "djrandom-frontend = djrandom.frontend.frontend:main",
"djrandom-update-markov = djrandom.model.markov:main", "djrandom-update-markov = djrandom.model.markov:main",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment