Skip to content
Snippets Groups Projects
Commit d8fa3178 authored by ale
Browse files

Add the ability to filter by artist/title, for testing purposes

parent d9275a4c
No related branches found
No related tags found
No related merge requests found
...@@ -20,8 +20,10 @@ log = logging.getLogger(__name__) ...@@ -20,8 +20,10 @@ log = logging.getLogger(__name__)
class DeDuper(object): class DeDuper(object):
def __init__(self, engine): def __init__(self, engine, filter_artist=None, filter_title=None):
self._engine = engine self._engine = engine
self._filter_artist = filter_artist
self._filter_title = filter_title
self._resolver = Resolver() self._resolver = Resolver()
def _generate_code_json(self, echoprint_fp, sha1): def _generate_code_json(self, echoprint_fp, sha1):
...@@ -58,10 +60,15 @@ class DeDuper(object): ...@@ -58,10 +60,15 @@ class DeDuper(object):
start = time.time() start = time.time()
fp.erase_database(local=True, really_delete=True) fp.erase_database(local=True, really_delete=True)
# Skip the ORM and directly query the SQL layer. # Skip the ORM and directly query the SQL layer.
q = select([Fingerprint.sha1, Fingerprint.echoprint_fp], qargs = ((MP3.sha1 == Fingerprint.sha1)
(MP3.sha1 == Fingerprint.sha1)
& (MP3.state == MP3.READY) & (MP3.state == MP3.READY)
& (MP3.has_fingerprint == True)) & (MP3.has_fingerprint == True))
if self._filter_artist:
qargs = qargs & (MP3.artist == self._filter_artist)
if self._filter_title:
qargs = qargs & (MP3.title == self._filter_title)
log.debug('query: %s', qargs)
q = select([Fingerprint.sha1, Fingerprint.echoprint_fp], qargs)
count = 0 count = 0
for row in self._engine.execute(q): for row in self._engine.execute(q):
count += 1 count += 1
...@@ -74,11 +81,12 @@ class DeDuper(object): ...@@ -74,11 +81,12 @@ class DeDuper(object):
self.codes[row.sha1] = code['fp'] self.codes[row.sha1] = code['fp']
fp.ingest(code, do_commit=False, local=True) fp.ingest(code, do_commit=False, local=True)
elapsed = time.time() - start elapsed = time.time() - start
log.debug('loaded in-memory fingerprint database in %g seconds' % elapsed) log.debug('loaded in-memory fingerprint database in %g seconds, '
'%d fingerprints', elapsed, count)
def _scan_for_dupes(self): def _scan_for_dupes(self):
# Now dedupe by going through all our codes over again. # Now dedupe by going through all our codes over again.
log.debug('de-duping fingerprint database...') log.debug('de-duping fingerprint database (%d codes)...', len(self.codes))
start = time.time() start = time.time()
dup_count = 0 dup_count = 0
for sha1, code in self.codes.iteritems(): for sha1, code in self.codes.iteritems():
...@@ -153,9 +161,9 @@ class DeDuper(object): ...@@ -153,9 +161,9 @@ class DeDuper(object):
self._resolver.commit() self._resolver.commit()
def run_deduper(db_url, solr_url, dry_run): def run_deduper(db_url, solr_url, dry_run, filter_artist, filter_title):
engine = init_db(db_url, solr_url) engine = init_db(db_url, solr_url)
dup = DeDuper(engine) dup = DeDuper(engine, filter_artist, filter_title)
dup.dedupe(dry_run) dup.dedupe(dry_run)
...@@ -164,6 +172,8 @@ def main(): ...@@ -164,6 +172,8 @@ def main():
parser.add_option('--db_url') parser.add_option('--db_url')
parser.add_option('--solr_url', default='http://localhost:8080/solr') parser.add_option('--solr_url', default='http://localhost:8080/solr')
parser.add_option('--apply', action='store_true') parser.add_option('--apply', action='store_true')
parser.add_option('--filter_artist')
parser.add_option('--filter_title')
daemonize.add_standard_options(parser) daemonize.add_standard_options(parser)
utils.read_config_defaults( utils.read_config_defaults(
parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf')) parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf'))
...@@ -174,7 +184,8 @@ def main(): ...@@ -174,7 +184,8 @@ def main():
parser.error('Too many arguments') parser.error('Too many arguments')
daemonize.daemonize(opts, run_deduper, daemonize.daemonize(opts, run_deduper,
(opts.db_url, opts.solr_url, not opts.apply)) (opts.db_url, opts.solr_url, not opts.apply,
opts.filter_artist, opts.filter_title))
if __name__ == '__main__': if __name__ == '__main__':
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment