diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py index bc68f60591973626eb51ce777798da23ed436d29..aff488d914a6f724c4af6cc659368a4a6c72d3f3 100644 --- a/server/djrandom/fingerprint/dedup.py +++ b/server/djrandom/fingerprint/dedup.py @@ -20,8 +20,10 @@ log = logging.getLogger(__name__) class DeDuper(object): - def __init__(self, engine): + def __init__(self, engine, filter_artist=None, filter_title=None): self._engine = engine + self._filter_artist = filter_artist + self._filter_title = filter_title self._resolver = Resolver() def _generate_code_json(self, echoprint_fp, sha1): @@ -58,10 +60,15 @@ class DeDuper(object): start = time.time() fp.erase_database(local=True, really_delete=True) # Skip the ORM and directly query the SQL layer. - q = select([Fingerprint.sha1, Fingerprint.echoprint_fp], - (MP3.sha1 == Fingerprint.sha1) - & (MP3.state == MP3.READY) - & (MP3.has_fingerprint == True)) + qargs = ((MP3.sha1 == Fingerprint.sha1) + & (MP3.state == MP3.READY) + & (MP3.has_fingerprint == True)) + if self._filter_artist: + qargs = qargs & (MP3.artist == self._filter_artist) + if self._filter_title: + qargs = qargs & (MP3.title == self._filter_title) + log.debug('query: %s', qargs) + q = select([Fingerprint.sha1, Fingerprint.echoprint_fp], qargs) count = 0 for row in self._engine.execute(q): count += 1 @@ -74,11 +81,12 @@ class DeDuper(object): self.codes[row.sha1] = code['fp'] fp.ingest(code, do_commit=False, local=True) elapsed = time.time() - start - log.debug('loaded in-memory fingerprint database in %g seconds' % elapsed) + log.debug('loaded in-memory fingerprint database in %g seconds, ' + '%d fingerprints', elapsed, count) def _scan_for_dupes(self): # Now dedupe by going through all our codes over again. - log.debug('de-duping fingerprint database...') + log.debug('de-duping fingerprint database (%d codes)...', len(self.codes)) start = time.time() dup_count = 0 for sha1, code in self.codes.iteritems(): @@ -153,9 +161,9 @@ class DeDuper(object): self._resolver.commit() -def run_deduper(db_url, solr_url, dry_run): +def run_deduper(db_url, solr_url, dry_run, filter_artist, filter_title): engine = init_db(db_url, solr_url) - dup = DeDuper(engine) + dup = DeDuper(engine, filter_artist, filter_title) dup.dedupe(dry_run) @@ -164,6 +172,8 @@ def main(): parser.add_option('--db_url') parser.add_option('--solr_url', default='http://localhost:8080/solr') parser.add_option('--apply', action='store_true') + parser.add_option('--filter_artist') + parser.add_option('--filter_title') daemonize.add_standard_options(parser) utils.read_config_defaults( parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf')) @@ -174,7 +184,8 @@ def main(): parser.error('Too many arguments') daemonize.daemonize(opts, run_deduper, - (opts.db_url, opts.solr_url, not opts.apply)) + (opts.db_url, opts.solr_url, not opts.apply, + opts.filter_artist, opts.filter_title)) if __name__ == '__main__':