Skip to content
Snippets Groups Projects
Commit f00e38b8 authored by ale's avatar ale
Browse files

update SOLR on mp3 state changes (with a little refactoring)

parent 0d4a7122
No related branches found
No related tags found
No related merge requests found
...@@ -2,6 +2,7 @@ from sqlalchemy import create_engine ...@@ -2,6 +2,7 @@ from sqlalchemy import create_engine
from sqlalchemy.interfaces import PoolListener from sqlalchemy.interfaces import PoolListener
from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from djrandom.model.indexer import Indexer
Session = scoped_session(sessionmaker(autocommit=False, Session = scoped_session(sessionmaker(autocommit=False,
...@@ -9,23 +10,32 @@ Session = scoped_session(sessionmaker(autocommit=False, ...@@ -9,23 +10,32 @@ Session = scoped_session(sessionmaker(autocommit=False,
Base = declarative_base() Base = declarative_base()
Base.query = Session.query_property() Base.query = Session.query_property()
indexer = Indexer()
# We are storing paths as binary blobs, without forcing a charset # We are storing paths as binary blobs, without forcing a charset
# encoding. SQLAlchemy needs this class to safely do so. # encoding. SQLAlchemy/MySQL needs this class to safely do so.
class SetTextFactory(PoolListener): class SetTextFactory(PoolListener):
def connect(self, dbapi_con, con_record): def connect(self, dbapi_con, con_record):
dbapi_con.text_factory = str dbapi_con.text_factory = str
def init_db(uri): def init_db(uri, solr_url=None):
# Import all ORM modules here, so that 'create_all' can find them. # Import all ORM modules here, so that 'create_all' can find them.
from djrandom.model import mp3, playlist from djrandom.model import mp3, playlist
if uri.startswith('mysql://'): if uri.startswith('mysql://'):
engine = create_engine(uri, listeners=[SetTextFactory()], engine = create_engine(uri, listeners=[SetTextFactory()],
pool_recycle=1800) pool_recycle=1800)
else: else:
engine = create_engine(uri, pool_recycle=1800) engine = create_engine(uri, pool_recycle=1800)
Session.configure(bind=engine) Session.configure(bind=engine)
Base.metadata.create_all(engine) Base.metadata.create_all(engine)
# You can omit 'solr_url' if the program uses the db read-only.
if solr_url:
indexer.set_url(solr_url)
return engine return engine
...@@ -91,17 +91,22 @@ class CheapDeDuper(object): ...@@ -91,17 +91,22 @@ class CheapDeDuper(object):
best = self._resolver.resolve_dupes([s.sha1 for s in songs]) best = self._resolver.resolve_dupes([s.sha1 for s in songs])
print '\n * best: %s\n' % (best, ) print '\n * best: %s\n' % (best, )
def commit(self):
    # Persist the duplicate-resolution results accumulated by the
    # resolver (presumably writes to the DB/index — confirm in Resolver).
    self._resolver.commit()
def run_cheap_deduper(db_url, dry_run):
engine = init_db(db_url) def run_cheap_deduper(db_url, solr_url, dry_run):
engine = init_db(db_url, solr_url)
dup = CheapDeDuper() dup = CheapDeDuper()
#dup.dedupe(dry_run)
dup.dedupe_fp(engine) dup.dedupe_fp(engine)
if not dry_run:
dup.commit()
def main(): def main():
parser = optparse.OptionParser() parser = optparse.OptionParser()
parser.add_option('--db_url') parser.add_option('--db_url')
parser.add_option('--solr_url', default='http://localhost:8080/solr')
parser.add_option('--apply', action='store_true') parser.add_option('--apply', action='store_true')
daemonize.add_standard_options(parser) daemonize.add_standard_options(parser)
utils.read_config_defaults( utils.read_config_defaults(
...@@ -113,7 +118,7 @@ def main(): ...@@ -113,7 +118,7 @@ def main():
parser.error('Too many arguments') parser.error('Too many arguments')
daemonize.daemonize(opts, run_cheap_deduper, daemonize.daemonize(opts, run_cheap_deduper,
(opts.db_url, not opts.apply)) (opts.db_url, opts.solr_url, not opts.apply))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -152,8 +152,8 @@ class DeDuper(object): ...@@ -152,8 +152,8 @@ class DeDuper(object):
self._resolver.commit() self._resolver.commit()
def run_deduper(db_url, dry_run): def run_deduper(db_url, solr_url, dry_run):
init_db(db_url) init_db(db_url, solr_url)
dup = DeDuper() dup = DeDuper()
dup.dedupe(dry_run) dup.dedupe(dry_run)
...@@ -161,6 +161,7 @@ def run_deduper(db_url, dry_run): ...@@ -161,6 +161,7 @@ def run_deduper(db_url, dry_run):
def main(): def main():
parser = optparse.OptionParser() parser = optparse.OptionParser()
parser.add_option('--db_url') parser.add_option('--db_url')
parser.add_option('--solr_url', default='http://localhost:8080/solr')
parser.add_option('--apply', action='store_true') parser.add_option('--apply', action='store_true')
daemonize.add_standard_options(parser) daemonize.add_standard_options(parser)
utils.read_config_defaults( utils.read_config_defaults(
...@@ -172,7 +173,7 @@ def main(): ...@@ -172,7 +173,7 @@ def main():
parser.error('Too many arguments') parser.error('Too many arguments')
daemonize.daemonize(opts, run_deduper, daemonize.daemonize(opts, run_deduper,
(opts.db_url, not opts.apply)) (opts.db_url, opts.solr_url, not opts.apply))
if __name__ == '__main__': if __name__ == '__main__':
......
import logging import logging
from djrandom.database import Session from djrandom.database import Session, indexer
from djrandom.model.mp3 import MP3 from djrandom.model.mp3 import MP3
from djrandom.fingerprint.compare_songs import sort_songs from djrandom.fingerprint.compare_songs import sort_songs
...@@ -41,5 +41,7 @@ class Resolver(object): ...@@ -41,5 +41,7 @@ class Resolver(object):
mp3 = MP3.query.get(sha1) mp3 = MP3.query.get(sha1)
mp3.mark_as_duplicate(duplicate_of) mp3.mark_as_duplicate(duplicate_of)
Session.add(mp3) Session.add(mp3)
indexer.add_mp3(mp3)
Session.commit() Session.commit()
indexer.commit()
...@@ -20,7 +20,7 @@ log = logging.getLogger(__name__) ...@@ -20,7 +20,7 @@ log = logging.getLogger(__name__)
def run_frontend(port, solr_url, db_url, lastfm_api_key, album_art_dir, def run_frontend(port, solr_url, db_url, lastfm_api_key, album_art_dir,
email_sender, markov_data_file, do_profile): email_sender, markov_data_file, do_profile):
init_db(db_url) init_db(db_url, solr_url)
svcs['searcher'] = Searcher(solr_url) svcs['searcher'] = Searcher(solr_url)
svcs['album_images'] = AlbumImageRetriever(lastfm_api_key, album_art_dir) svcs['album_images'] = AlbumImageRetriever(lastfm_api_key, album_art_dir)
......
import solr import solr
from collections import defaultdict from collections import defaultdict
from djrandom.model.mp3 import MP3
class Searcher(object): class Searcher(object):
......
import solr
class Indexer(object):
    """A very simple wrapper for Solr that supports lazy initialization.

    The Solr connection is opened only on first use, so a module-level
    instance can be created at import time and configured later via
    set_url() (e.g. from init_db()).
    """

    def __init__(self, solr_url=None):
        # The connection is created lazily by _get_solr().
        self._solr = None
        self._solr_url = solr_url

    def set_url(self, solr_url):
        """Set (or replace) the Solr endpoint URL before first use."""
        self._solr_url = solr_url

    def _get_solr(self):
        """Return the Solr client, creating it on first call.

        Raises:
            RuntimeError: if no Solr URL has been configured yet, instead
                of failing obscurely inside solr.Solr(None).
        """
        if not self._solr:
            if not self._solr_url:
                raise RuntimeError('Indexer used before a Solr URL was set')
            self._solr = solr.Solr(self._solr_url, timeout=30)
        return self._solr

    def add_mp3(self, mp3):
        """Index an MP3, or drop it from the index if it is not READY.

        Almost equal to mp3.to_dict() but not exactly (SOLR calls 'id'
        what the database calls 'sha1').
        """
        if mp3.state == mp3.READY:
            self._get_solr().add({
                'id': mp3.sha1,
                'artist': mp3.artist,
                'album': mp3.album,
                'title': mp3.title,
                'genre': mp3.genre})
        else:
            # Non-READY states (duplicates, errors) are removed so stale
            # entries do not linger in search results.
            self._get_solr().delete(mp3.sha1)

    def del_mp3(self, mp3):
        """Remove an MP3 from the index unconditionally."""
        self._get_solr().delete(mp3.sha1)

    def commit(self):
        """Commit pending index changes to Solr."""
        self._get_solr().commit()
import solr
class Indexer(object):
    """Eager wrapper around a Solr connection for indexing MP3 metadata."""

    def __init__(self, solr_url):
        # Connect immediately; the URL must be known at construction time.
        self.solr = solr.Solr(solr_url, timeout=30)

    def add_mp3(self, mp3):
        # _almost_ equal to mp3.to_dict() but not quite (sha1/id mismatch).
        self.solr.add({'id': mp3.sha1,
                       'artist': mp3.artist,
                       'album': mp3.album,
                       'title': mp3.title,
                       'genre': mp3.genre})

    def commit(self):
        # Flush pending additions to the Solr index.
        self.solr.commit()
...@@ -6,9 +6,8 @@ import traceback ...@@ -6,9 +6,8 @@ import traceback
from djrandom import daemonize from djrandom import daemonize
from djrandom import utils from djrandom import utils
from djrandom.model.mp3 import MP3 from djrandom.model.mp3 import MP3
from djrandom.database import Session, init_db from djrandom.database import Session, init_db, indexer
from djrandom.scanner import metadata from djrandom.scanner import metadata
from djrandom.scanner import indexer
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
...@@ -19,16 +18,12 @@ class BadMetadataError(Exception): ...@@ -19,16 +18,12 @@ class BadMetadataError(Exception):
class Scanner(object): class Scanner(object):
def __init__(self, solr_url):
self.idx = indexer.Indexer(solr_url)
def process(self, mp3): def process(self, mp3):
mp3_info = metadata.analyze_mp3(mp3.path) mp3_info = metadata.analyze_mp3(mp3.path)
if not mp3_info['artist'] or not mp3_info['title']: if not mp3_info['artist'] or not mp3_info['title']:
raise BadMetadataError() raise BadMetadataError()
for key, value in mp3_info.iteritems(): for key, value in mp3_info.iteritems():
setattr(mp3, key, value) setattr(mp3, key, value)
self.idx.add_mp3(mp3)
def scan_db(self, run_once): def scan_db(self, run_once):
"""Scan the database for new files.""" """Scan the database for new files."""
...@@ -39,7 +34,7 @@ class Scanner(object): ...@@ -39,7 +34,7 @@ class Scanner(object):
if run_once: if run_once:
break break
Session.remove() Session.remove()
self.idx.commit() indexer.commit()
time.sleep(60) time.sleep(60)
continue continue
log.info('processing %s' % mp3.sha1) log.info('processing %s' % mp3.sha1)
...@@ -52,12 +47,13 @@ class Scanner(object): ...@@ -52,12 +47,13 @@ class Scanner(object):
except Exception, e: except Exception, e:
log.error(traceback.format_exc()) log.error(traceback.format_exc())
mp3.state = MP3.ERROR mp3.state = MP3.ERROR
indexer.add_mp3(mp3)
Session.add(mp3) Session.add(mp3)
Session.commit() Session.commit()
def run_scanner(solr_url, db_url, run_once): def run_scanner(solr_url, db_url, run_once):
init_db(db_url) init_db(db_url, solr_url)
scanner = Scanner(solr_url) scanner = Scanner(solr_url)
scanner.scan_db(run_once) scanner.scan_db(run_once)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment