Skip to content
Snippets Groups Projects
Commit 46709548 authored by ale's avatar ale
Browse files

support more than one deduplication strategy

parent 4fd75399
Branches
No related tags found
No related merge requests found
......@@ -20,7 +20,12 @@ class SetTextFactory(PoolListener):
def init_db(uri):
    """Create the engine for `uri`, bind the session and create all tables.

    MySQL connections additionally get the SetTextFactory pool listener
    attached; every engine uses a 30-minute connection recycle.
    Returns the configured SQLAlchemy engine.
    """
    # Import all ORM modules here, so that 'create_all' can find them.
    from djrandom.model import mp3, playlist
    engine_kwargs = {'pool_recycle': 1800}
    if uri.startswith('mysql://'):
        engine_kwargs['listeners'] = [SetTextFactory()]
    engine = create_engine(uri, **engine_kwargs)
    Session.configure(bind=engine)
    Base.metadata.create_all(engine)
    return engine
import base64
import os
import optparse
import logging
import json
import sys
import time
import traceback
from djrandom import daemonize
from djrandom import utils
from djrandom.database import init_db, Session
from djrandom.model.mp3 import MP3, Fingerprint
from djrandom.fingerprint.resolve_duplicates import Resolver
from sqlalchemy import *
log = logging.getLogger(__name__)
class CheapDeDuper(object):
"""Will find _identical_ duplicates (same bitrate)."""
def __init__(self):
self._resolver = Resolver()
def dedupe_fp(self, engine):
count, errs = 0, 0
codes = {}
print 'loading all fingerprints'
# Skip the ORM and directly query the SQL layer.
q = select([Fingerprint.sha1, Fingerprint.echoprint_fp],
(MP3.sha1 == Fingerprint.sha1)
& (MP3.state == MP3.READY)
& (MP3.has_fingerprint == True))
for row in engine.execute(q):
try:
enc_code = str(json.loads(row.echoprint_fp)['code'])
code = base64.urlsafe_b64decode(enc_code)
if code:
codes.setdefault(code, []).append(row.sha1)
except KeyError:
continue
except Exception, e:
traceback.print_exc()
errs += 1
continue
count += 1
if count % 100 == 0:
sys.stdout.write('%d \r' % count)
sys.stdout.flush()
print '\n%d fingerprints, %d errors' % (count, errs)
Session.remove()
print 'done, scanning for duplicates'
for code, hashes in codes.iteritems():
if len(hashes) > 1:
self._dedup_songs([MP3.query.get(x) for x in hashes])
def dedupe(self, dry_run):
dupes = []
last_mp3 = None
mp3s = MP3.query.filter(
(MP3.state == MP3.READY)
& (MP3.artist != None) & (MP3.artist != '')
& (MP3.title != None) & (MP3.title != '')
& (MP3.has_fingerprint == True)
).order_by(asc(MP3.artist), asc(MP3.title))
for mp3 in mp3s:
if last_mp3:
if (mp3.artist == last_mp3.artist and
mp3.title == last_mp3.title):
dupes.append(mp3)
else:
if len(dupes) > 1:
self._dedup_songs(dupes)
dupes = []
last_mp3 = mp3
def _dedup_songs(self, songs):
def _toutf8(x):
try:
return x.encode('utf-8')
except:
return '???'
print 'dedup group:'
for s in songs:
fp = json.loads(s.get_fingerprint()).get('code')
print ' - %s / %s / %s' % (_toutf8(s.artist), _toutf8(s.title), _toutf8(s.album))
print ' [%s]' % str(fp)[:128]
best = self._resolver.resolve_dupes([s.sha1 for s in songs])
print '\n * best: %s\n' % (best, )
def run_cheap_deduper(db_url, dry_run):
    """Open the database at `db_url` and run the fingerprint-based dedup."""
    engine = init_db(db_url)
    deduper = CheapDeDuper()
    # The (artist, title) pass is disabled; only the fingerprint pass runs.
    #deduper.dedupe(dry_run)
    deduper.dedupe_fp(engine)
def main():
    """Command-line entry point for the cheap de-duplicator."""
    parser = optparse.OptionParser()
    parser.add_option('--db_url')
    parser.add_option('--apply', action='store_true')
    daemonize.add_standard_options(parser)
    config_path = os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf')
    utils.read_config_defaults(parser, config_path)
    opts, extra_args = parser.parse_args()
    if not opts.db_url:
        parser.error('Must provide --db_url')
    if extra_args:
        parser.error('Too many arguments')
    # Without --apply the deduper runs in dry-run mode.
    daemonize.daemonize(opts, run_cheap_deduper, (opts.db_url, not opts.apply))


if __name__ == '__main__':
    main()
import eyeD3
from djrandom.model.mp3 import MP3
# Monkey-patch eyeD3 so that it does not look at file extensions to
# figure out if something is an MP3 or not.
# NOTE(review): files are addressed by sha1 elsewhere in this project, so
# they presumably lack a .mp3 suffix -- confirm before removing this patch.
eyeD3.tag.isMp3File = lambda x: True
def _compare_score(a, b):
a_bitrate, a_duration, a_nmeta = a[0]
b_bitrate, b_duration, b_nmeta = b[0]
res = cmp(a_bitrate, b_bitrate)
if res == 0:
res = cmp(a_duration, b_duration)
if res == 0:
res = cmp(a_nmeta, b_nmeta)
return res
def get_song_score(mp3):
    """Return a quality score tuple for `mp3`: (bitrate, duration, num_meta).

    `duration` is the play time rounded down to a multiple of 30 seconds;
    `num_meta` weighs embedded images at 4, the album/artist/title tags at
    2 each and genre/year/track-number at 1 each.  A file that cannot be
    opened scores (0, 0, 0); a file whose tag cannot be read scores 0
    metadata points.  Bigger tuples compare as better.
    """
    try:
        af = eyeD3.Mp3AudioFile(mp3.path)
    except Exception:
        # Unreadable / corrupt file: rank it last.  (Was a bare except,
        # which also swallowed KeyboardInterrupt.)
        return (0, 0, 0)
    # Get encoding parameters.
    bitrate = af.getBitRate()[1]
    # Explicit floor division: identical on Python 2 ints, and still an
    # int (not a float) if this ever runs on Python 3.
    duration = 30 * (int(af.getPlayTime()) // 30)  # round to 30 secs
    # Count metadata tags.
    try:
        tag = af.getTag()
        has_album = not (not tag.getAlbum())
        has_artist = not (not tag.getArtist())
        has_title = not (not tag.getTitle())
        has_genre = not (not tag.getGenre())
        has_year = not (not tag.getYear())
        has_tracknum = (tag.getTrackNum()[0] is not None)
        has_images = not (not tag.getImages())
        num_meta = (4 * int(has_images)
                    + 2 * sum(map(int, (has_album, has_artist, has_title)))
                    + sum(map(int, (has_genre, has_year, has_tracknum))))
    except Exception:
        # Broken or absent tag: no metadata points.
        num_meta = 0
    return (bitrate, duration, num_meta)
def sort_songs(hashes=None, mp3s=None):
    """Return [(score, sha1), ...] sorted best-first.

    Exactly one of `hashes` (iterable of sha1 digests, looked up in the
    database) or `mp3s` (iterable of MP3 objects) should be given; `mp3s`
    wins if both are.  Scores come from get_song_score().

    The Python-2-only `cmp=` keyword was replaced with `key=` on the
    score tuple: the old comparator inspected only element 0 of each
    entry, field by field, which is exactly lexicographic tuple
    comparison, so the ordering (including stability among equal
    scores) is unchanged and the code also works on Python 3.
    """
    assert hashes or mp3s
    if mp3s is None:
        mp3s = MP3.query.filter(MP3.sha1.in_(hashes))
    return sorted(((get_song_score(x), x.sha1) for x in mp3s),
                  key=lambda entry: entry[0], reverse=True)
......@@ -9,19 +9,15 @@ from djrandom import daemonize
from djrandom import utils
from djrandom.model.mp3 import MP3
from djrandom.database import Session, init_db
from djrandom.fingerprint.resolve_duplicates import Resolver
log = logging.getLogger(__name__)
# Monkey-patch eyeD3 so that it does not look at file extensions to
# figure out if something is an MP3 or not.
# NOTE(review): stored files are addressed by sha1, so they presumably
# have no .mp3 suffix -- confirm before removing this patch.
eyeD3.tag.isMp3File = lambda x: True
class DeDuper(object):
def __init__(self):
    # sha1 -> sha1 of the better copy that replaces it (see _cleanup).
    self.songs_to_remove = {}
    # NOTE(review): _resolve_dupes subscripts and assigns into this like
    # a dict, but it is initialised as a set here -- a cache hit would
    # raise TypeError.  Probably meant to be {}; confirm.
    self.dupes_cache = set()
    self._resolver = Resolver()
def _generate_code_json(self, jsondata, track_id):
"""Parse the JSON string output of echoprint-codegen, and return
......@@ -149,76 +145,11 @@ class DeDuper(object):
track_sha1, score, original_score, mp3.artist, mp3.title))
# Actually de-duplicate the songs we've found.
self._resolve_dupes([x[0] for x in dupes])
self._resolver.resolve_dupes([x[0] for x in dupes])
return True
def _get_song_score(self, mp3):
    """Compute a (bitrate, duration, num_meta) quality tuple for `mp3`.

    Duration is rounded down to a multiple of 30 seconds.  The metadata
    count weighs embedded images at 4 points, album/artist/title at 2
    points each, and genre/year/track-number at 1 point each.
    """
    audio = eyeD3.Mp3AudioFile(mp3.path)
    # Encoding parameters.
    bitrate = audio.getBitRate()[1]
    duration = 30 * (int(audio.getPlayTime()) / 30)  # round to 30 secs
    # Probe each tag field; bool() of a missing/empty value is False.
    tag = audio.getTag()
    has_album = bool(tag.getAlbum())
    has_artist = bool(tag.getArtist())
    has_title = bool(tag.getTitle())
    has_genre = bool(tag.getGenre())
    has_year = bool(tag.getYear())
    has_tracknum = tag.getTrackNum()[0] is not None
    has_images = bool(tag.getImages())
    # Weighted sum of the booleans (True counts as 1 in arithmetic).
    num_meta = (4 * has_images
                + 2 * (has_album + has_artist + has_title)
                + (has_genre + has_year + has_tracknum))
    return (bitrate, duration, num_meta)
def _resolve_dupes(self, hashes):
    """Perform best duplicate selection and remove dupes from db.

    `hashes` is a list of sha1 digests believed to be copies of the same
    song.  Returns the sha1 of the best-scoring copy and queues every
    other hash in `self.songs_to_remove` for the cleanup pass.
    """
    # Canonical memoization key: sorted, comma-joined hash list.
    hashes_key = ','.join(sorted(hashes))
    log.debug('remove_dupes(%s)' % hashes_key)
    # NOTE(review): dupes_cache is created as set() in __init__ but is
    # subscripted and assigned like a dict below -- the first cache hit
    # (and the assignment at the end) will raise TypeError; confirm.
    if hashes_key in self.dupes_cache:
        return self.dupes_cache[hashes_key]

    def _compare_score(a, b):
        # Three-way compare on the (bitrate, duration, nmeta) score tuple;
        # bitrate dominates, then duration, then metadata count.
        a_bitrate, a_duration, a_nmeta = a[0]
        b_bitrate, b_duration, b_nmeta = b[0]
        res = cmp(a_bitrate, b_bitrate)
        if res == 0:
            res = cmp(a_duration, b_duration)
        if res == 0:
            res = cmp(a_nmeta, b_nmeta)
        return res

    # Compute 'score' for each song and sort them (best first).
    scores = []
    mp3s = MP3.query.filter(MP3.sha1.in_(hashes))
    for mp3 in mp3s:
        scores.append((self._get_song_score(mp3), mp3.sha1))
    scores.sort(cmp=_compare_score, reverse=True)
    best_song = scores[0][1]
    log.debug('remove_dupes: best song is %s' % best_song)
    log.debug('remove_dupes: score dump:')
    for score, sha1 in scores:
        bitrate, duration, nmeta = score
        log.debug(' * (bitrate=%s, duration=%s, nmeta=%s) %s' % (
            bitrate, duration, nmeta, sha1))
    # Remove all the other songs.
    songs_to_remove = dict((x, best_song) for x in hashes if x != best_song)
    log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove))
    self.songs_to_remove.update(songs_to_remove)
    self.dupes_cache[hashes_key] = best_song
    return best_song
def _cleanup(self):
    """Apply the queued removals: mark duplicates in the db and commit."""
    for sha1, duplicate_of in self.songs_to_remove.iteritems():
        # Mark the MP3 as duplicate, remove the associated file.
        mp3 = MP3.query.get(sha1)
        mp3.mark_as_duplicate(duplicate_of)
        Session.add(mp3)
    # One commit for the whole batch.
    Session.commit()
    # Also flush whatever the shared Resolver has accumulated.
    self._resolver.commit()
def run_deduper(db_url, dry_run):
......
import logging
from djrandom.database import Session
from djrandom.model.mp3 import MP3
from djrandom.fingerprint.compare_songs import sort_songs
log = logging.getLogger(__name__)
class Resolver(object):
    """Picks the best copy among duplicate songs and queues the rest for removal."""

    def __init__(self):
        # sha1 -> sha1 of the best copy that supersedes it.
        self._to_remove = {}
        # Memoizes resolve_dupes() results, keyed by the sorted hash list.
        self._cache = {}

    def resolve_dupes(self, hashes):
        """Perform best duplicate selection and remove dupes from db."""
        hashes_key = ','.join(sorted(hashes))
        log.debug('remove_dupes(%s)' % hashes_key)
        if hashes_key in self._cache:
            return self._cache[hashes_key]
        # Score every song and pick the top of the best-first ordering.
        by_score = sort_songs(hashes=hashes)
        best_song = by_score[0][1]
        log.debug('remove_dupes: best song is %s' % best_song)
        log.debug('remove_dupes: score dump:')
        for (bitrate, duration, nmeta), sha1 in by_score:
            log.debug(' * (bitrate=%s, duration=%s, nmeta=%s) %s' % (
                bitrate, duration, nmeta, sha1))
        # Every other hash gets mapped to the winner for later removal.
        songs_to_remove = {}
        for candidate in hashes:
            if candidate != best_song:
                songs_to_remove[candidate] = best_song
        log.info('remove_dupes: songs to remove: %s' % str(songs_to_remove))
        self._to_remove.update(songs_to_remove)
        self._cache[hashes_key] = best_song
        return best_song

    def commit(self):
        """Mark all queued duplicates in the database and commit once."""
        for sha1, duplicate_of in self._to_remove.iteritems():
            mp3 = MP3.query.get(sha1)
            mp3.mark_as_duplicate(duplicate_of)
            Session.add(mp3)
        Session.commit()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment