From ae835481385bcaee24ade3257fed4e54e625961c Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 21 Jan 2012 16:42:26 +0100
Subject: [PATCH] run the feature extractor in a separate process to avoid
 memory leaks

---
 server/djrandom/mood/feature_extraction.py | 26 ++++++----------------
 server/djrandom/mood/mood_scanner.py       | 13 +++++++----
 server/setup.py                            |  1 +
 3 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/server/djrandom/mood/feature_extraction.py b/server/djrandom/mood/feature_extraction.py
index 1818608..4aec137 100644
--- a/server/djrandom/mood/feature_extraction.py
+++ b/server/djrandom/mood/feature_extraction.py
@@ -76,6 +76,7 @@ def _vector_from_file(path):
     tnet.updControl('mrs_natural/inSamples', factor * 512)
 
     tnet.tick()
+
     return tnet.getControl('mrs_realvec/processedData').to_realvec()
 
 
@@ -84,26 +85,13 @@ def vector_from_file(path):
         return _vector_from_file(mp3_path)
 
 
-if __name__ == '__main__':
-    import sys, time, marsyas_utils
+def main():
+    import sys, marsyas_utils
 
     result = vector_from_file(sys.argv[1])
-
-    print str(result)[:512]
-
-    print 'serialization benchmark...'
-
-    n = 5000
-    start = time.time()
-    for i in xrange(n):
-        rstr = marsyas_utils.serialize_realvec(result)
-    end = time.time()
-    print 'serialization speed: %g iter/sec' % (n / (end - start))
-
-    start = time.time()
-    for i in xrange(n):
-        result2 = marsyas_utils.deserialize_realvec(rstr)
-    end = time.time()
-    print 'deserialization speed: %g iter/sec' % (n / (end - start))
+    rstr = marsyas_utils.serialize_realvec(result)
+    sys.stdout.write(rstr)
 
 
+if __name__ == '__main__':
+    main()
diff --git a/server/djrandom/mood/mood_scanner.py b/server/djrandom/mood/mood_scanner.py
index 09868c5..18515d6 100644
--- a/server/djrandom/mood/mood_scanner.py
+++ b/server/djrandom/mood/mood_scanner.py
@@ -9,19 +9,24 @@ from djrandom import utils
 from djrandom.model.mp3 import MP3
 from djrandom.database import Session, init_db
 from djrandom.model import processor
-from djrandom.mood import feature_extraction
-from djrandom.mood import marsyas_utils
 
 log = logging.getLogger(__name__)
 
 
+# We run extract_features as an external program because Marsyas has
+# a nasty tendency to leak memory during analysis...
+def get_features(path):
+    # check_output raises CalledProcessError on a non-zero exit status, so a
+    # crashed extractor is reported instead of returning truncated output.
+    return subprocess.check_output(['djrandom-mood-extract-features', path])
+
+
 class TimbreFeatureExtractor(processor.Processor):
 
     def process(self, mp3):
         log.info('extracting features from %s' % mp3.sha1)
         try:
-            timbre_vector = feature_extraction.vector_from_file(str(mp3.path))
-            vector_str = marsyas_utils.serialize_realvec(timbre_vector)
+            vector_str = get_features(mp3.path)
         except Exception, e:
             log.error('error processing %s: %s' % (mp3.sha1, e))
             return
diff --git a/server/setup.py b/server/setup.py
index f842c30..0dc5dbd 100644
--- a/server/setup.py
+++ b/server/setup.py
@@ -33,6 +33,7 @@ setup(
       "djrandom-metadata-fixer = djrandom.metadata_fixer.metadata_fixer:main",
       "djrandom-solr-fixer = djrandom.model.verify:main",
       "djrandom-mood-scanner = djrandom.mood.mood_scanner:main",
+      "djrandom-mood-extract-features = djrandom.mood.feature_extraction:main",
     ],
   },
   )
-- 
GitLab