Replace the MP3 `ready`/`error` boolean columns with a single one-character
`state` column (INCOMING, READY, ERROR, DUPLICATE), add a `duplicate_of`
column and an MP3.get_songs_for_album() helper, and update the deduper,
fingerprinter, frontend, receiver and scanner to use the new column.

diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py
index 2d1ef5762bfd382bb3ca0eae1cbb703ee582466a..e25dad22f9dcbaefea6d8cbf91dfa35bf2aa3206 100644
--- a/server/djrandom/fingerprint/dedup.py
+++ b/server/djrandom/fingerprint/dedup.py
@@ -57,7 +57,7 @@ class DeDuper(object):
         fp.erase_database(local=True, really_delete=True)
         mp3s = MP3.query.filter(
             (MP3.artist == u'bonobo')
-            & (MP3.ready == True) & (MP3.error == False)
+            & (MP3.state == MP3.READY)
             & (MP3.echoprint_fp != None))
         for mp3 in mp3s:
             code = self._generate_code_json(mp3.echoprint_fp, mp3.sha1)
@@ -140,6 +140,18 @@ class DeDuper(object):
         """Perform best duplicate selection and remove dupes from db."""
         log.debug('remove_dupes(%s)' % ','.join(hashes))
 
+        # Compute 'score' for each song and sort them.
+        scores = []
+        mp3s = MP3.query.filter(MP3.sha1.in_(hashes))
+        for mp3 in mp3s:
+            scores.append((self._get_song_score(mp3), mp3.sha1))
+        scores.sort(key=lambda x: x[0])
+        best_song = scores[0][1]
+        log.debug('remove_dupes: best song is %s' % best_song)
+
+        # Remove all the other songs.
+        songs_to_remove = [x for x in hashes if x != best_song]
+
 
 
 def run_deduper(db_url):
diff --git a/server/djrandom/fingerprint/fingerprint.py b/server/djrandom/fingerprint/fingerprint.py
index 14e6c44583954164e069b5e015416e74a23190ea..8249395dfe6ff86d47e89b18e4279ef6e9064317 100644
--- a/server/djrandom/fingerprint/fingerprint.py
+++ b/server/djrandom/fingerprint/fingerprint.py
@@ -31,7 +31,8 @@ class Fingerprinter(object):
     def compute_fingerprints(self, run_once):
         """Compute fingerprints of new files."""
         while True:
-            mp3 = MP3.query.filter(MP3.echoprint_fp == None
+            mp3 = MP3.query.filter((MP3.state == MP3.READY)
+                                   & (MP3.echoprint_fp == None)
                                    ).limit(1).first()
             if not mp3:
                 if run_once:
diff --git a/server/djrandom/frontend/api_views.py b/server/djrandom/frontend/api_views.py
index e4cf6aec432a7cfe9466bd68738c235d354ef530..065007516fd1c00cca2f6b5adfcd7bedcba0d346 100644
--- a/server/djrandom/frontend/api_views.py
+++ b/server/djrandom/frontend/api_views.py
@@ -28,8 +28,7 @@ def artist_albums_json(artist):
 @app.route('/json/album/<artist>/<album>')
 @require_auth
 def album_songs_json(artist, album):
-    songs = [x.to_dict() for x in MP3.query.filter_by(
-        error=False, ready=True, artist=artist, album=album)]
+    songs = [x.to_dict() for x in MP3.get_songs_for_album(artist, album)]
     return jsonify(songs=songs)
 
 
diff --git a/server/djrandom/model/mp3.py b/server/djrandom/model/mp3.py
index 68657c7686cea86ead20af9d0bcbf72708937b0f..10bdfc819b2a80119fe6a9dd332a02216c9d757d 100644
--- a/server/djrandom/model/mp3.py
+++ b/server/djrandom/model/mp3.py
@@ -13,9 +13,13 @@ class MP3(Base):
 
     __tablename__ = 'mp3'
 
+    INCOMING = 'I'
+    READY = 'R'
+    ERROR = 'E'
+    DUPLICATE = 'D'
+
     sha1 = Column(String(40), primary_key=True)
-    ready = Column(Boolean, default=False, index=True)
-    error = Column(Boolean, default=False, index=True)
+    state = Column(String(1), default=INCOMING, index=True)
     path = Column(String(1024))
     size = Column(Integer())
     artist = Column(Unicode(256))
@@ -25,37 +29,52 @@ class MP3(Base):
     uploaded_at = Column(DateTime())
     play_count = Column(Integer(), default=0)
     echoprint_fp = deferred(Column(Text()))
+    duplicate_of = Column(String(40))
 
     def __init__(self, **kw):
         for k, v in kw.items():
             setattr(self, k, v)
 
     def to_dict(self):
-        return {'title': self.title,
+        data = {'title': self.title,
                 'artist': self.artist,
                 'album': self.album,
                 'genre': self.genre,
                 'sha1': self.sha1,
                 'size': self.size,
                 'uploaded_at': self.uploaded_at}
+        if self.state == self.DUPLICATE:
+            data['duplicate_of'] = self.duplicate_of
+        return data
 
     @classmethod
     def last_uploaded(cls, n=10):
         """Return the N last uploaded songs."""
-        return cls.query.filter_by(ready=True).order_by(
+        return cls.query.filter_by(state=cls.READY).order_by(
             desc(cls.uploaded_at)).limit(n)
 
     @classmethod
     def get_random_songs(cls, n=10):
         """Return N completely random songs."""
         results = []
-        num_songs = cls.query.filter_by(ready=True).count()
-        for idx in xrange(n):
-            song = cls.query.filter_by(ready=True).limit(1).offset(
-                random.randint(0, num_songs - 1)).one()
-            results.append(song)
+        num_songs = cls.query.filter_by(state=cls.READY).count()
+        if not num_songs:
+            return results
+        fraction = float(n) / num_songs
+        while len(results) < n:
+            tmprows = Session.query(cls.sha1).filter(
+                (cls.state == cls.READY) & (func.rand() < fraction)
+                ).limit(n - len(results))
+            for row in tmprows:
+                results.append(row[0])
         return results
 
+    @classmethod
+    def get_songs_for_album(cls, artist, album):
+        """Return the READY songs for the given artist and album."""
+        return cls.query.filter_by(
+            state=cls.READY, artist=artist, album=album)
+
 
 class PlayLog(Base):
 
diff --git a/server/djrandom/receiver/receiver.py b/server/djrandom/receiver/receiver.py
index 3b2c92d8a1adc94ad8a24a5d6475bf95c3db1e0c..21238fbc191a94b4c8c137aca02b3aa93a59e89f 100644
--- a/server/djrandom/receiver/receiver.py
+++ b/server/djrandom/receiver/receiver.py
@@ -32,6 +32,7 @@ def check(sha1):
 
 def _upload_mp3(incoming_fd, sha1):
     mp3 = MP3(path=utils.generate_path(storage_root, sha1),
+              state=MP3.INCOMING,
               sha1=sha1,
               uploaded_at=datetime.now())
     with open(mp3.path, 'w') as fd:
diff --git a/server/djrandom/scanner/scanner.py b/server/djrandom/scanner/scanner.py
index b87239c41cb340613af7d689302ac479a5c4d706..9fdfddd927bf37541b4fbdf76908513708ad24c1 100644
--- a/server/djrandom/scanner/scanner.py
+++ b/server/djrandom/scanner/scanner.py
@@ -27,7 +27,7 @@ class Scanner(object):
     def scan_db(self, run_once):
         """Scan the database for new files."""
         while True:
-            mp3 = MP3.query.filter_by(ready=False, error=False
+            mp3 = MP3.query.filter_by(state=MP3.INCOMING
                                       ).limit(1).first()
             if not mp3:
                 if run_once:
@@ -39,10 +39,10 @@ class Scanner(object):
             log.info('processing %s' % mp3.sha1)
             try:
                 self.process(mp3)
-                mp3.ready = True
+                mp3.state = MP3.READY
             except Exception, e:
                 log.error(traceback.format_exc())
-                mp3.error = True
+                mp3.state = MP3.ERROR
             Session.add(mp3)
             Session.commit()
 