Skip to content
Snippets Groups Projects
Commit d8fa3178 authored by ale's avatar ale
Browse files

Add the ability to filter by artist/title, for testing purposes.

parent d9275a4c
No related branches found
No related tags found
No related merge requests found
......@@ -20,8 +20,10 @@ log = logging.getLogger(__name__)
class DeDuper(object):
def __init__(self, engine):
def __init__(self, engine, filter_artist=None, filter_title=None):
self._engine = engine
self._filter_artist = filter_artist
self._filter_title = filter_title
self._resolver = Resolver()
def _generate_code_json(self, echoprint_fp, sha1):
......@@ -58,10 +60,15 @@ class DeDuper(object):
start = time.time()
fp.erase_database(local=True, really_delete=True)
# Skip the ORM and directly query the SQL layer.
q = select([Fingerprint.sha1, Fingerprint.echoprint_fp],
(MP3.sha1 == Fingerprint.sha1)
& (MP3.state == MP3.READY)
& (MP3.has_fingerprint == True))
qargs = ((MP3.sha1 == Fingerprint.sha1)
& (MP3.state == MP3.READY)
& (MP3.has_fingerprint == True))
if self._filter_artist:
qargs = qargs & (MP3.artist == self._filter_artist)
if self._filter_title:
qargs = qargs & (MP3.title == self._filter_title)
log.debug('query: %s', qargs)
q = select([Fingerprint.sha1, Fingerprint.echoprint_fp], qargs)
count = 0
for row in self._engine.execute(q):
count += 1
......@@ -74,11 +81,12 @@ class DeDuper(object):
self.codes[row.sha1] = code['fp']
fp.ingest(code, do_commit=False, local=True)
elapsed = time.time() - start
log.debug('loaded in-memory fingerprint database in %g seconds' % elapsed)
log.debug('loaded in-memory fingerprint database in %g seconds, '
'%d fingerprints', elapsed, count)
def _scan_for_dupes(self):
# Now dedupe by going through all our codes over again.
log.debug('de-duping fingerprint database...')
log.debug('de-duping fingerprint database (%d codes)...', len(self.codes))
start = time.time()
dup_count = 0
for sha1, code in self.codes.iteritems():
......@@ -153,9 +161,9 @@ class DeDuper(object):
self._resolver.commit()
def run_deduper(db_url, solr_url, dry_run):
def run_deduper(db_url, solr_url, dry_run, filter_artist, filter_title):
engine = init_db(db_url, solr_url)
dup = DeDuper(engine)
dup = DeDuper(engine, filter_artist, filter_title)
dup.dedupe(dry_run)
......@@ -164,6 +172,8 @@ def main():
parser.add_option('--db_url')
parser.add_option('--solr_url', default='http://localhost:8080/solr')
parser.add_option('--apply', action='store_true')
parser.add_option('--filter_artist')
parser.add_option('--filter_title')
daemonize.add_standard_options(parser)
utils.read_config_defaults(
parser, os.getenv('DJRANDOM_CONF', '/etc/djrandom.conf'))
......@@ -174,7 +184,8 @@ def main():
parser.error('Too many arguments')
daemonize.daemonize(opts, run_deduper,
(opts.db_url, opts.solr_url, not opts.apply))
(opts.db_url, opts.solr_url, not opts.apply,
opts.filter_artist, opts.filter_title))
if __name__ == '__main__':
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment