Skip to content
Snippets Groups Projects
Commit e618e850 authored by ale's avatar ale
Browse files

remove duplicates when training the markov model

parent 5b74a80c
No related branches found
No related tags found
No related merge requests found
......@@ -80,6 +80,20 @@ class MP3(Base):
except:
pass
@classmethod
def deduplicate(cls, hashes):
    """Replace every SHA1 in `hashes` with the SHA1 of its original song.

    Each non-empty hash is looked up with `cls.query.get`; while the
    record found is in the `cls.DUPLICATE` state, its `duplicate_of`
    link is followed to the next record.

    Args:
        hashes: iterable of SHA1 strings; entries may be None or empty.

    Returns:
        A list with one entry per input hash, in the same order. An
        entry is the SHA1 of the first non-duplicate record reached, or
        None when the input was empty, no record was found, the chain
        ended on a missing record, or the chain loops back on itself.
    """
    result = []
    for sha1 in hashes:
        resolved = None
        if sha1:
            # SHA1s already visited on this chain: a record marked as a
            # duplicate of itself (or an A -> B -> A loop) would
            # otherwise keep the while loop spinning forever.
            seen = set()
            mp3 = cls.query.get(sha1)
            while mp3 and mp3.state == cls.DUPLICATE:
                if mp3.sha1 in seen:
                    # Cyclic chain: there is no original to report.
                    mp3 = None
                    break
                seen.add(mp3.sha1)
                mp3 = cls.query.get(mp3.duplicate_of)
            if mp3:
                resolved = mp3.sha1
        result.append(resolved)
    return result
def get_fingerprint(self):
if self.has_fingerprint:
return self.echoprint_fp.echoprint_fp
......@@ -180,7 +194,8 @@ class PlayLog(Base):
hashes = ([None] * (n - len(hashes))) + hashes
else:
hashes = [None] * n
yield (plog.sha1, hashes)
target = MP3.deduplicate([plog.sha1])[0]
yield (target, MP3.deduplicate(hashes))
@classmethod
def top_songs_for_user(cls, userid, days=30, n=10):
......
......@@ -7,18 +7,19 @@ from djrandom.model.mp3 import MP3, PlayLog, SearchLog, DUPLICATE_DIR
from djrandom.model.playlist import Playlist
class MP3Test(DbTestCase):
def _create_mp3(self, sha1='1234', **kw):
def create_mp3(sha1='1234', **kw):
    """Build an MP3 test fixture together with the attributes used for it.

    Extra keyword arguments override or extend the default attributes.
    Returns a `(mp3, attrs)` tuple; the MP3 is additionally given the
    path '/storage/' + sha1, which is not part of `attrs`.
    """
    attrs = {
        'title': u'title',
        'artist': u'artist',
        'album': u'album',
        'genre': u'genre',
        'sha1': sha1,
        'size': 2601,
        'uploaded_at': datetime(2011, 10, 10, 9, 18, 0),
    }
    attrs.update(kw)
    storage_path = '/storage/' + sha1
    mp3 = MP3(path=storage_path, **attrs)
    return mp3, attrs
class MP3Test(DbTestCase):
def test_mp3_std(self):
# Simple tests building and serializing an MP3 object.
mp3, mp3_data = self._create_mp3()
mp3, mp3_data = create_mp3()
Session.add(mp3)
Session.commit()
......@@ -43,7 +44,7 @@ class MP3Test(DbTestCase):
os.path.join(DUPLICATE_DIR, '1234'))
self.mox.ReplayAll()
mp3, _ = self._create_mp3()
mp3, _ = create_mp3()
mp3.mark_as_duplicate('2345')
self.assertEquals('2345', mp3.duplicate_of)
......@@ -51,7 +52,7 @@ class MP3Test(DbTestCase):
def test_mp3_fingerprint(self):
fp = 'a fingerprint'
mp3, _ = self._create_mp3()
mp3, _ = create_mp3()
mp3.set_fingerprint(fp)
Session.add(mp3)
Session.commit()
......@@ -61,9 +62,9 @@ class MP3Test(DbTestCase):
self.assertEquals(fp, mp3b.get_fingerprint())
def test_mp3_get_with_no_fingerprint(self):
mp3_1, _ = self._create_mp3('1001')
mp3_2, _ = self._create_mp3('1002')
mp3_3, _ = self._create_mp3('1003')
mp3_1, _ = create_mp3('1001')
mp3_2, _ = create_mp3('1002')
mp3_3, _ = create_mp3('1003')
mp3_1.state = MP3.READY
mp3_2.state = MP3.BAD_METADATA
for x in (mp3_1, mp3_2, mp3_3):
......@@ -81,8 +82,8 @@ class MP3Test(DbTestCase):
pass
def test_mp3_get_songs_for_album(self):
mp3_1, _ = self._create_mp3('1001', album=u'other album', state=MP3.READY)
mp3_2, _ = self._create_mp3('1002', state=MP3.READY)
mp3_1, _ = create_mp3('1001', album=u'other album', state=MP3.READY)
mp3_2, _ = create_mp3('1002', state=MP3.READY)
Session.add(mp3_1)
Session.add(mp3_2)
Session.commit()
......@@ -91,9 +92,9 @@ class MP3Test(DbTestCase):
self.assertEquals(['1002'], [x.sha1 for x in results])
#def test_mp3_get_random_songs(self):
# mp3_1, _ = self._create_mp3('1001', state=MP3.READY)
# mp3_2, _ = self._create_mp3('1002', state=MP3.READY)
# mp3_3, _ = self._create_mp3('1003', state=MP3.READY)
# mp3_1, _ = create_mp3('1001', state=MP3.READY)
# mp3_2, _ = create_mp3('1002', state=MP3.READY)
# mp3_3, _ = create_mp3('1003', state=MP3.READY)
# for x in (mp3_1, mp3_2, mp3_3):
# Session.add(x)
# Session.commit()
......@@ -125,6 +126,11 @@ class PlayLogTest(DbTestCase):
self.assertEquals(2, result[0][1])
def test_playlog_generate_tuples(self):
for i in range(1, 5):
mp3, _ = create_mp3(unicode(i))
Session.add(mp3)
Session.commit()
result = list(PlayLog.generate_tuples(n=2))
result.sort()
expected = [(u'1', [None]),
......@@ -134,6 +140,24 @@ class PlayLogTest(DbTestCase):
(u'4', [u'3'])]
self.assertEquals(expected, result)
def test_playlog_generate_tuples_with_duplicates(self):
    """Tuples from generate_tuples must refer to originals, not duplicates.

    Songs '1'..'3' are stored as regular MP3s, while '4' is stored in
    the DUPLICATE state pointing at '2'. The play log entries consumed
    by generate_tuples are presumably set up by the test case fixture
    (not visible here).
    """
    for number in range(1, 4):
        song, _ = create_mp3(unicode(number))
        Session.add(song)
    duplicate, _ = create_mp3(u'4', state=MP3.DUPLICATE,
                              duplicate_of=u'2')
    Session.add(duplicate)
    Session.commit()

    # Sort the generated tuples so the comparison does not depend on
    # the order in which they are produced.
    result = sorted(PlayLog.generate_tuples(n=2))
    expected = [
        (u'1', [None]),
        (u'1', [u'2']),
        (u'2', [u'1']),
        (u'2', [u'3']),
        (u'3', [u'2']),
    ]
    self.assertEquals(expected, result)
class PlayListTest(DbTestCase):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment