Skip to content
Snippets Groups Projects
Commit f00e38b8 authored by ale's avatar ale
Browse files

update SOLR on mp3 state changes (with a little refactoring)

parent 0d4a7122
No related branches found
No related tags found
No related merge requests found
...@@ -2,6 +2,7 @@ from sqlalchemy import create_engine ...@@ -2,6 +2,7 @@ from sqlalchemy import create_engine
from sqlalchemy.interfaces import PoolListener from sqlalchemy.interfaces import PoolListener
from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from djrandom.model.indexer import Indexer
Session = scoped_session(sessionmaker(autocommit=False, Session = scoped_session(sessionmaker(autocommit=False,
...@@ -9,23 +10,32 @@ Session = scoped_session(sessionmaker(autocommit=False, ...@@ -9,23 +10,32 @@ Session = scoped_session(sessionmaker(autocommit=False,
Base = declarative_base() Base = declarative_base()
Base.query = Session.query_property() Base.query = Session.query_property()
indexer = Indexer()
# We are storing paths as binary blobs, without forcing a charset # We are storing paths as binary blobs, without forcing a charset
# encoding. SQLAlchemy needs this class to safely do so. # encoding. SQLAlchemy/MySQL needs this class to safely do so.
class SetTextFactory(PoolListener): class SetTextFactory(PoolListener):
def connect(self, dbapi_con, con_record): def connect(self, dbapi_con, con_record):
dbapi_con.text_factory = str dbapi_con.text_factory = str
def init_db(uri): def init_db(uri, solr_url=None):
# Import all ORM modules here, so that 'create_all' can find them. # Import all ORM modules here, so that 'create_all' can find them.
from djrandom.model import mp3, playlist from djrandom.model import mp3, playlist
if uri.startswith('mysql://'): if uri.startswith('mysql://'):
engine = create_engine(uri, listeners=[SetTextFactory()], engine = create_engine(uri, listeners=[SetTextFactory()],
pool_recycle=1800) pool_recycle=1800)
else: else:
engine = create_engine(uri, pool_recycle=1800) engine = create_engine(uri, pool_recycle=1800)
Session.configure(bind=engine) Session.configure(bind=engine)
Base.metadata.create_all(engine) Base.metadata.create_all(engine)
# You can omit 'solr_url' if the program uses the db read-only.
if solr_url:
indexer.set_url(solr_url)
return engine return engine
...@@ -91,17 +91,22 @@ class CheapDeDuper(object): ...@@ -91,17 +91,22 @@ class CheapDeDuper(object):
best = self._resolver.resolve_dupes([s.sha1 for s in songs]) best = self._resolver.resolve_dupes([s.sha1 for s in songs])
print '\n * best: %s\n' % (best, ) print '\n * best: %s\n' % (best, )
def commit(self):
    # Persist the duplicate-resolution results accumulated by the
    # resolver (presumably writes to the DB/index — confirm in Resolver).
    self._resolver.commit()
def run_cheap_deduper(db_url, dry_run):
engine = init_db(db_url) def run_cheap_deduper(db_url, solr_url, dry_run):
engine = init_db(db_url, solr_url)
dup = CheapDeDuper() dup = CheapDeDuper()
#dup.dedupe(dry_run)
dup.dedupe_fp(engine) dup.dedupe_fp(engine)
if not dry_run:
dup.commit()
def main(): def main():
parser = optparse.OptionParser() parser = optparse.OptionParser()
parser.add_option('--db_url') parser.add_option('--db_url')
parser.add_option('--solr_url', default='http://localhost:8080/solr')
parser.add_option('--apply', action='store_true') parser.add_option('--apply', action='store_true')
daemonize.add_standard_options(parser) daemonize.add_standard_options(parser)
utils.read_config_defaults( utils.read_config_defaults(
...@@ -113,7 +118,7 @@ def main(): ...@@ -113,7 +118,7 @@ def main():
parser.error('Too many arguments') parser.error('Too many arguments')
daemonize.daemonize(opts, run_cheap_deduper, daemonize.daemonize(opts, run_cheap_deduper,
(opts.db_url, not opts.apply)) (opts.db_url, opts.solr_url, not opts.apply))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -152,8 +152,8 @@ class DeDuper(object): ...@@ -152,8 +152,8 @@ class DeDuper(object):
self._resolver.commit() self._resolver.commit()
def run_deduper(db_url, dry_run): def run_deduper(db_url, solr_url, dry_run):
init_db(db_url) init_db(db_url, solr_url)
dup = DeDuper() dup = DeDuper()
dup.dedupe(dry_run) dup.dedupe(dry_run)
...@@ -161,6 +161,7 @@ def run_deduper(db_url, dry_run): ...@@ -161,6 +161,7 @@ def run_deduper(db_url, dry_run):
def main(): def main():
parser = optparse.OptionParser() parser = optparse.OptionParser()
parser.add_option('--db_url') parser.add_option('--db_url')
parser.add_option('--solr_url', default='http://localhost:8080/solr')
parser.add_option('--apply', action='store_true') parser.add_option('--apply', action='store_true')
daemonize.add_standard_options(parser) daemonize.add_standard_options(parser)
utils.read_config_defaults( utils.read_config_defaults(
...@@ -172,7 +173,7 @@ def main(): ...@@ -172,7 +173,7 @@ def main():
parser.error('Too many arguments') parser.error('Too many arguments')
daemonize.daemonize(opts, run_deduper, daemonize.daemonize(opts, run_deduper,
(opts.db_url, not opts.apply)) (opts.db_url, opts.solr_url, not opts.apply))
if __name__ == '__main__': if __name__ == '__main__':
......
import logging import logging
from djrandom.database import Session from djrandom.database import Session, indexer
from djrandom.model.mp3 import MP3 from djrandom.model.mp3 import MP3
from djrandom.fingerprint.compare_songs import sort_songs from djrandom.fingerprint.compare_songs import sort_songs
...@@ -41,5 +41,7 @@ class Resolver(object): ...@@ -41,5 +41,7 @@ class Resolver(object):
mp3 = MP3.query.get(sha1) mp3 = MP3.query.get(sha1)
mp3.mark_as_duplicate(duplicate_of) mp3.mark_as_duplicate(duplicate_of)
Session.add(mp3) Session.add(mp3)
indexer.add_mp3(mp3)
Session.commit() Session.commit()
indexer.commit()
...@@ -20,7 +20,7 @@ log = logging.getLogger(__name__) ...@@ -20,7 +20,7 @@ log = logging.getLogger(__name__)
def run_frontend(port, solr_url, db_url, lastfm_api_key, album_art_dir, def run_frontend(port, solr_url, db_url, lastfm_api_key, album_art_dir,
email_sender, markov_data_file, do_profile): email_sender, markov_data_file, do_profile):
init_db(db_url) init_db(db_url, solr_url)
svcs['searcher'] = Searcher(solr_url) svcs['searcher'] = Searcher(solr_url)
svcs['album_images'] = AlbumImageRetriever(lastfm_api_key, album_art_dir) svcs['album_images'] = AlbumImageRetriever(lastfm_api_key, album_art_dir)
......
import solr import solr
from collections import defaultdict from collections import defaultdict
from djrandom.model.mp3 import MP3
class Searcher(object): class Searcher(object):
......
import solr
class Indexer(object):
    """A very simple wrapper for Solr that supports lazy initialization.

    The Solr connection is opened only on first use, so a module-level
    instance can be created at import time and configured later via
    set_url() (e.g. from init_db()).
    """

    def __init__(self, solr_url=None):
        # The connection is created lazily by _get_solr().
        self._solr = None
        self._solr_url = solr_url

    def set_url(self, solr_url):
        """Set (or replace) the Solr endpoint URL before first use."""
        self._solr_url = solr_url

    def _get_solr(self):
        """Return the Solr client, creating it on first call.

        Raises:
            RuntimeError: if no Solr URL has been configured yet, instead
                of failing obscurely inside solr.Solr(None).
        """
        if not self._solr:
            if not self._solr_url:
                raise RuntimeError('Indexer used before a Solr URL was set')
            self._solr = solr.Solr(self._solr_url, timeout=30)
        return self._solr

    def add_mp3(self, mp3):
        """Index an MP3, or drop it from the index if it is not READY.

        Almost equal to mp3.to_dict() but not exactly (SOLR calls 'id'
        what the database calls 'sha1').
        """
        if mp3.state == mp3.READY:
            self._get_solr().add({
                'id': mp3.sha1,
                'artist': mp3.artist,
                'album': mp3.album,
                'title': mp3.title,
                'genre': mp3.genre})
        else:
            # Non-READY states (duplicates, errors) are removed so stale
            # entries do not linger in search results.
            self._get_solr().delete(mp3.sha1)

    def del_mp3(self, mp3):
        """Remove an MP3 from the index unconditionally."""
        self._get_solr().delete(mp3.sha1)

    def commit(self):
        """Commit pending index changes to Solr."""
        self._get_solr().commit()
import solr
class Indexer(object):
    """Eager wrapper around a Solr connection for indexing MP3 metadata."""

    def __init__(self, solr_url):
        # Connect immediately; the URL must be known at construction time.
        self.solr = solr.Solr(solr_url, timeout=30)

    def add_mp3(self, mp3):
        # _almost_ equal to mp3.to_dict() but not quite (sha1/id mismatch).
        self.solr.add({'id': mp3.sha1,
                       'artist': mp3.artist,
                       'album': mp3.album,
                       'title': mp3.title,
                       'genre': mp3.genre})

    def commit(self):
        # Flush pending additions to the Solr index.
        self.solr.commit()
...@@ -6,9 +6,8 @@ import traceback ...@@ -6,9 +6,8 @@ import traceback
from djrandom import daemonize from djrandom import daemonize
from djrandom import utils from djrandom import utils
from djrandom.model.mp3 import MP3 from djrandom.model.mp3 import MP3
from djrandom.database import Session, init_db from djrandom.database import Session, init_db, indexer
from djrandom.scanner import metadata from djrandom.scanner import metadata
from djrandom.scanner import indexer
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
...@@ -19,16 +18,12 @@ class BadMetadataError(Exception): ...@@ -19,16 +18,12 @@ class BadMetadataError(Exception):
class Scanner(object): class Scanner(object):
def __init__(self, solr_url):
self.idx = indexer.Indexer(solr_url)
def process(self, mp3): def process(self, mp3):
mp3_info = metadata.analyze_mp3(mp3.path) mp3_info = metadata.analyze_mp3(mp3.path)
if not mp3_info['artist'] or not mp3_info['title']: if not mp3_info['artist'] or not mp3_info['title']:
raise BadMetadataError() raise BadMetadataError()
for key, value in mp3_info.iteritems(): for key, value in mp3_info.iteritems():
setattr(mp3, key, value) setattr(mp3, key, value)
self.idx.add_mp3(mp3)
def scan_db(self, run_once): def scan_db(self, run_once):
"""Scan the database for new files.""" """Scan the database for new files."""
...@@ -39,7 +34,7 @@ class Scanner(object): ...@@ -39,7 +34,7 @@ class Scanner(object):
if run_once: if run_once:
break break
Session.remove() Session.remove()
self.idx.commit() indexer.commit()
time.sleep(60) time.sleep(60)
continue continue
log.info('processing %s' % mp3.sha1) log.info('processing %s' % mp3.sha1)
...@@ -52,12 +47,13 @@ class Scanner(object): ...@@ -52,12 +47,13 @@ class Scanner(object):
except Exception, e: except Exception, e:
log.error(traceback.format_exc()) log.error(traceback.format_exc())
mp3.state = MP3.ERROR mp3.state = MP3.ERROR
indexer.add_mp3(mp3)
Session.add(mp3) Session.add(mp3)
Session.commit() Session.commit()
def run_scanner(solr_url, db_url, run_once): def run_scanner(solr_url, db_url, run_once):
init_db(db_url) init_db(db_url, solr_url)
scanner = Scanner(solr_url) scanner = Scanner(solr_url)
scanner.scan_db(run_once) scanner.scan_db(run_once)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment