From f00e38b8c645bd4f95331cb8576dcc5f1a1ef034 Mon Sep 17 00:00:00 2001 From: ale <ale@incal.net> Date: Sat, 8 Oct 2011 18:51:43 +0100 Subject: [PATCH] update SOLR on mp3 state changes (with a little refactoring) --- server/djrandom/database.py | 14 ++++++-- server/djrandom/fingerprint/cheap_dedup.py | 13 ++++--- server/djrandom/fingerprint/dedup.py | 7 ++-- .../fingerprint/resolve_duplicates.py | 4 ++- server/djrandom/frontend/frontend.py | 2 +- server/djrandom/frontend/search.py | 1 - server/djrandom/model/indexer.py | 36 +++++++++++++++++++ server/djrandom/scanner/indexer.py | 19 ---------- server/djrandom/scanner/scanner.py | 12 +++---- 9 files changed, 69 insertions(+), 39 deletions(-) create mode 100644 server/djrandom/model/indexer.py delete mode 100644 server/djrandom/scanner/indexer.py diff --git a/server/djrandom/database.py b/server/djrandom/database.py index 4db52f4..25dd138 100644 --- a/server/djrandom/database.py +++ b/server/djrandom/database.py @@ -2,6 +2,7 @@ from sqlalchemy import create_engine from sqlalchemy.interfaces import PoolListener from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.ext.declarative import declarative_base +from djrandom.model.indexer import Indexer Session = scoped_session(sessionmaker(autocommit=False, @@ -9,23 +10,32 @@ Session = scoped_session(sessionmaker(autocommit=False, Base = declarative_base() Base.query = Session.query_property() +indexer = Indexer() + # We are storing paths as binary blobs, without forcing a charset -# encoding. SQLAlchemy needs this class to safely do so. +# encoding. SQLAlchemy/MySQL needs this class to safely do so. class SetTextFactory(PoolListener): def connect(self, dbapi_con, con_record): dbapi_con.text_factory = str -def init_db(uri): +def init_db(uri, solr_url=None): # Import all ORM modules here, so that 'create_all' can find them. 
from djrandom.model import mp3, playlist + if uri.startswith('mysql://'): engine = create_engine(uri, listeners=[SetTextFactory()], pool_recycle=1800) else: engine = create_engine(uri, pool_recycle=1800) + Session.configure(bind=engine) Base.metadata.create_all(engine) + + # You can omit 'solr_url' if the program uses the db read-only. + if solr_url: + indexer.set_url(solr_url) + return engine diff --git a/server/djrandom/fingerprint/cheap_dedup.py b/server/djrandom/fingerprint/cheap_dedup.py index 603c6e7..296a349 100644 --- a/server/djrandom/fingerprint/cheap_dedup.py +++ b/server/djrandom/fingerprint/cheap_dedup.py @@ -91,17 +91,22 @@ class CheapDeDuper(object): best = self._resolver.resolve_dupes([s.sha1 for s in songs]) print '\n * best: %s\n' % (best, ) + def commit(self): + self._resolver.commit() -def run_cheap_deduper(db_url, dry_run): - engine = init_db(db_url) + +def run_cheap_deduper(db_url, solr_url, dry_run): + engine = init_db(db_url, solr_url) dup = CheapDeDuper() - #dup.dedupe(dry_run) dup.dedupe_fp(engine) + if not dry_run: + dup.commit() def main(): parser = optparse.OptionParser() parser.add_option('--db_url') + parser.add_option('--solr_url', default='http://localhost:8080/solr') parser.add_option('--apply', action='store_true') daemonize.add_standard_options(parser) utils.read_config_defaults( @@ -113,7 +118,7 @@ def main(): parser.error('Too many arguments') daemonize.daemonize(opts, run_cheap_deduper, - (opts.db_url, not opts.apply)) + (opts.db_url, opts.solr_url, not opts.apply)) if __name__ == '__main__': diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py index d3dc02d..01aa1bc 100644 --- a/server/djrandom/fingerprint/dedup.py +++ b/server/djrandom/fingerprint/dedup.py @@ -152,8 +152,8 @@ class DeDuper(object): self._resolver.commit() -def run_deduper(db_url, dry_run): - init_db(db_url) +def run_deduper(db_url, solr_url, dry_run): + init_db(db_url, solr_url) dup = DeDuper() dup.dedupe(dry_run) @@ 
-161,6 +161,7 @@ def run_deduper(db_url, dry_run): def main(): parser = optparse.OptionParser() parser.add_option('--db_url') + parser.add_option('--solr_url', default='http://localhost:8080/solr') parser.add_option('--apply', action='store_true') daemonize.add_standard_options(parser) utils.read_config_defaults( @@ -172,7 +173,7 @@ def main(): parser.error('Too many arguments') daemonize.daemonize(opts, run_deduper, - (opts.db_url, not opts.apply)) + (opts.db_url, opts.solr_url, not opts.apply)) if __name__ == '__main__': diff --git a/server/djrandom/fingerprint/resolve_duplicates.py b/server/djrandom/fingerprint/resolve_duplicates.py index f635ceb..3e17258 100644 --- a/server/djrandom/fingerprint/resolve_duplicates.py +++ b/server/djrandom/fingerprint/resolve_duplicates.py @@ -1,5 +1,5 @@ import logging -from djrandom.database import Session +from djrandom.database import Session, indexer from djrandom.model.mp3 import MP3 from djrandom.fingerprint.compare_songs import sort_songs @@ -41,5 +41,7 @@ class Resolver(object): mp3 = MP3.query.get(sha1) mp3.mark_as_duplicate(duplicate_of) Session.add(mp3) + indexer.add_mp3(mp3) Session.commit() + indexer.commit() diff --git a/server/djrandom/frontend/frontend.py b/server/djrandom/frontend/frontend.py index 8eca9b3..d5bcb7c 100644 --- a/server/djrandom/frontend/frontend.py +++ b/server/djrandom/frontend/frontend.py @@ -20,7 +20,7 @@ log = logging.getLogger(__name__) def run_frontend(port, solr_url, db_url, lastfm_api_key, album_art_dir, email_sender, markov_data_file, do_profile): - init_db(db_url) + init_db(db_url, solr_url) svcs['searcher'] = Searcher(solr_url) svcs['album_images'] = AlbumImageRetriever(lastfm_api_key, album_art_dir) diff --git a/server/djrandom/frontend/search.py b/server/djrandom/frontend/search.py index 9f96651..28f6219 100644 --- a/server/djrandom/frontend/search.py +++ b/server/djrandom/frontend/search.py @@ -1,6 +1,5 @@ import solr from collections import defaultdict -from djrandom.model.mp3 
import MP3 class Searcher(object): diff --git a/server/djrandom/model/indexer.py b/server/djrandom/model/indexer.py new file mode 100644 index 0000000..7022ba0 --- /dev/null +++ b/server/djrandom/model/indexer.py @@ -0,0 +1,36 @@ +import solr + + +class Indexer(object): + """A very simple wrapper for Solr that supports lazy initialization.""" + + def __init__(self, solr_url=None): + self._solr = None + self._solr_url = solr_url + + def set_url(self, solr_url): + self._solr_url = solr_url + + def _get_solr(self): + if not self._solr: + self._solr = solr.Solr(self._solr_url, timeout=30) + return self._solr + + def add_mp3(self, mp3): + # Almost equal to mp3.to_dict() but not exactly (SOLR calls 'id' + # what the database calls 'sha1'). + if mp3.state == mp3.READY: + self._get_solr().add({ + 'id': mp3.sha1, + 'artist': mp3.artist, + 'album': mp3.album, + 'title': mp3.title, + 'genre': mp3.genre}) + else: + self._get_solr().delete(mp3.sha1) + + def del_mp3(self, mp3): + self._get_solr().delete(mp3.sha1) + + def commit(self): + self._get_solr().commit() diff --git a/server/djrandom/scanner/indexer.py b/server/djrandom/scanner/indexer.py deleted file mode 100644 index 110e51c..0000000 --- a/server/djrandom/scanner/indexer.py +++ /dev/null @@ -1,19 +0,0 @@ -import solr - - -class Indexer(object): - - def __init__(self, solr_url): - self.solr = solr.Solr(solr_url, timeout=30) - - def add_mp3(self, mp3): - # _almost_ equal to mp3.to_dict() but not quite (sha1/id mismatch). 
- doc = {'id': mp3.sha1, - 'artist': mp3.artist, - 'album': mp3.album, - 'title': mp3.title, - 'genre': mp3.genre} - self.solr.add(doc) - - def commit(self): - self.solr.commit() diff --git a/server/djrandom/scanner/scanner.py b/server/djrandom/scanner/scanner.py index 6617a59..d7e4ca4 100644 --- a/server/djrandom/scanner/scanner.py +++ b/server/djrandom/scanner/scanner.py @@ -6,9 +6,8 @@ import traceback from djrandom import daemonize from djrandom import utils from djrandom.model.mp3 import MP3 -from djrandom.database import Session, init_db +from djrandom.database import Session, init_db, indexer from djrandom.scanner import metadata -from djrandom.scanner import indexer log = logging.getLogger(__name__) @@ -19,16 +18,12 @@ class BadMetadataError(Exception): class Scanner(object): - def __init__(self, solr_url): - self.idx = indexer.Indexer(solr_url) - def process(self, mp3): mp3_info = metadata.analyze_mp3(mp3.path) if not mp3_info['artist'] or not mp3_info['title']: raise BadMetadataError() for key, value in mp3_info.iteritems(): setattr(mp3, key, value) - self.idx.add_mp3(mp3) def scan_db(self, run_once): """Scan the database for new files.""" @@ -39,7 +34,7 @@ class Scanner(object): if run_once: break Session.remove() - self.idx.commit() + indexer.commit() time.sleep(60) continue log.info('processing %s' % mp3.sha1) @@ -52,12 +47,13 @@ class Scanner(object): except Exception, e: log.error(traceback.format_exc()) mp3.state = MP3.ERROR + indexer.add_mp3(mp3) Session.add(mp3) Session.commit() def run_scanner(solr_url, db_url, run_once): - init_db(db_url) + init_db(db_url, solr_url) scanner = Scanner(solr_url) scanner.scan_db(run_once) -- GitLab