# dedup.py
import fp
import os
import optparse
import logging
import json
from djrandom import daemonize
from djrandom import utils
from djrandom.model.mp3 import MP3
from djrandom.database import Session, init_db
# Taken from 'fastingest.py', with minor changes.
def generate_code_json(jdata, track_id):
    """Convert a JSON-encoded fingerprint blob into a fingerprint record.

    Args:
        jdata: JSON text, parsed with ``json.loads``.  The parsed object is
            expected to carry a "code" entry and a "metadata" mapping that
            holds at least "duration" and "version"; "artist", "title" and
            "release" are optional.
        track_id: Identifier stored verbatim under the "track_id" key.

    Returns:
        An empty dict when the parsed object has no "code" entry.  Otherwise
        a dict with the keys "track_id", "fp", "length" and "codever", plus
        "artist", "release" and "track" for each optional metadata field
        that is present and truthy.

    Raises:
        KeyError: if "metadata", "duration" or "version" is missing.
    """
    record = json.loads(jdata)
    if "code" not in record:
        return {}

    # Required fields are read before decoding so that a malformed record
    # fails on the missing key first.
    encoded = record["code"]
    meta = record["metadata"]
    duration = meta["duration"]
    codegen_version = meta["version"]

    result = {
        "track_id": track_id,
        # Decoding is delegated to the fp module; the decoded format is
        # whatever fp.decode_code_string returns.
        "fp": fp.decode_code_string(encoded),
        "length": duration,
        "codever": "%.2f" % codegen_version,
    }

    # Optional metadata: note that the "title" field is stored as "track".
    for source_key, target_key in (("artist", "artist"),
                                   ("release", "release"),
                                   ("title", "track")):
        value = meta.get(source_key, None)
        if value:
            result[target_key] = value
    return result
def dedupe_db():
    """Print every track whose fingerprint also matches other tracks.

    Works in two passes:

    1. Every MP3 row that is ready, error-free and has an echoprint
       fingerprint is converted with ``generate_code_json`` and ingested
       through ``fp.ingest(..., local=True)``.  The decoded fingerprint of
       each track is remembered, keyed by the track's SHA1.
    2. Each remembered fingerprint is queried back with ``fp.query_fp``.
       When the query yields at least two results, the SHA1 is printed
       followed by every other matching track id and its score.

    Output goes to stdout; nothing is returned.
    """
    codes = {}

    # Load all known fingerprints into the fingerprint index.
    # NOTE: '== True', '== False' and '!= None' are deliberate here: they
    # build column expressions for the query filter and must not be
    # rewritten as 'is' / 'is not' comparisons.
    mp3s = MP3.query.filter(
        (MP3.ready == True) & (MP3.error == False)
        & (MP3.echoprint_fp != None))
    for mp3 in mp3s:
        code = generate_code_json(mp3.echoprint_fp, mp3.sha1)
        if not code:
            # Fingerprint blob without a "code" entry: nothing to ingest.
            continue
        codes[mp3.sha1] = code['fp']
        fp.ingest([code], do_commit=False, local=True)

    # Now dedupe by going through all our codes over again.
    for sha1, code in codes.items():
        results = fp.query_fp(code, local=True)
        # Fewer than two results means there is no second track to report
        # (presumably the only match is the track itself).
        if len(results) < 2:
            continue
        print('SHA1: %s' % sha1)
        for track_id, score in results:
            if track_id == sha1:
                continue
            # The arguments must be a tuple: '%' binds tighter than ',',
            # so "fmt % track_id, score" would format with track_id alone
            # and raise "not enough arguments for format string".
            print(' --> %s (%f)' % (track_id, score))
def run_deduper(db_url):
    """Initialise the database from *db_url*, then run the dedupe pass.

    Args:
        db_url: Database URL handed unchanged to ``init_db``.
    """
    # The database must be initialised before dedupe_db() issues queries.
    init_db(db_url)
    dedupe_db()
def main():
    """Command-line entry point: parse options and start the deduper.

    Registers ``--db_url`` alongside the standard daemonize options, reads
    option defaults from the file named by the DJRANDOM_CONF environment
    variable (falling back to /etc/djrandom.conf), validates the command
    line, and hands ``run_deduper`` to ``daemonize.daemonize``.
    """
    option_parser = optparse.OptionParser()
    option_parser.add_option('--db_url')
    daemonize.add_standard_options(option_parser)

    config_path = os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf')
    utils.read_config_defaults(option_parser, config_path)

    options, positional = option_parser.parse_args()
    # parser.error() prints the usage message and exits the process.
    if not options.db_url:
        option_parser.error('Must provide --db_url')
    if positional:
        option_parser.error('Too many arguments')

    deduper_args = (options.db_url,)
    daemonize.daemonize(options, run_deduper, deduper_args)
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()