diff --git a/server/djrandom/fingerprint/dedup.py b/server/djrandom/fingerprint/dedup.py index 826276b9f11a4e5ec1954ed72745163b4c823086..f963562e395f0456d74e237bcb5f44ffc0ed6d31 100644 --- a/server/djrandom/fingerprint/dedup.py +++ b/server/djrandom/fingerprint/dedup.py @@ -192,8 +192,15 @@ class DeDuper(object): bitrate, duration, nmeta, sha1)) # Remove all the other songs. - songs_to_remove = [x for x in hashes if x != best_song] + songs_to_remove = set(x for x in hashes if x != best_song) log.info('remove_dupes: songs to remove: %s' % (','.join(songs_to_remove))) + for mp3 in mp3s: + if mp3.sha1 not in songs_to_remove: + continue + # Mark the MP3 as a duplicate of best_song and move its file into DUPLICATE_DIR (move errors are silently ignored). + mp3.mark_as_duplicate(best_song) + Session.add(mp3) + Session.commit() def run_deduper(db_url): diff --git a/server/djrandom/model/mp3.py b/server/djrandom/model/mp3.py index 0504ec44ddec57cb1a0f3d81438e5f68040736e9..9732597cb1057eb282845f3c80e8d7617be2aada 100644 --- a/server/djrandom/model/mp3.py +++ b/server/djrandom/model/mp3.py @@ -1,9 +1,14 @@ +import os import random +import shutil from sqlalchemy.orm import deferred from sqlalchemy import * from datetime import datetime, timedelta from djrandom.database import Base, Session +# Stage duplicate files to this directory, pending cleanup. +DUPLICATE_DIR = '/var/tmp/djrandom-duplicates' + class MP3(Base): """A single MP3. @@ -47,6 +52,17 @@ class MP3(Base): data['duplicate_of'] = self.duplicate_of return data + def mark_as_duplicate(self, duplicate_of): + self.state = self.DUPLICATE + self.duplicate_of = duplicate_of + try: + if not os.path.isdir(DUPLICATE_DIR): + os.makedirs(DUPLICATE_DIR) + shutil.move(self.path, + os.path.join(DUPLICATE_DIR, self.sha1)) + except: + pass + @classmethod def last_uploaded(cls, n=10): """Return the N last uploaded songs."""