Improved collection indexing

I can now build an index over an entire collection more quickly. Disk access is still the bottleneck, but I think this was a good challenge anyway.
2010-08-12 08:27:21 -04:00 · 2010-08-12 08:27:21 -04:00 · 71b3289f62
parent 52b1236b29
commit 71b3289f62
4 changed files with 67 additions and 16 deletions
--- a/libsaria/collection/collection.py
+++ b/libsaria/collection/collection.py
@ -1,42 +1,45 @@
 # Bryan Schumaker (8/8/2010)

 import libsaria
-import table

 class Collection:
 	def __init__(self):
-		self.table = table.Table()
+		pass


 	def scan(self, path):
 		print "Scanning path:", path
-		self.clear()
+		self.reset()
 		self.update(path)


-	def clear(self):
+	def reset(self):
 		print "Erasing collection ... "
+		from table import Table
+		from index import Index
+		self.table = Table()
+		self.index = Index()


 	def insert(self, file):
-		tags = file.tag()
-		id = self.table.insert(tags)
-		print id, tags
-
+		tags  = file.tag()
+		tuple = (tags.artist, tags.album, tags.title)
+		id = self.table.insert(tuple)
+		self.index.insert(id, tuple)


 	def update(self, path):
 		FileRef = libsaria.collection.FileRef
 		join    = libsaria.path.join
+		insert  = self.insert

 		for root,dirs,files in libsaria.path.walk(path):
 			for file in files:
 				file = join(root,file)
 				try:
-					tagfile = FileRef(file)
-					self.insert(tagfile)
+					insert(FileRef(file))
 				except Exception,e:
-					print e
+					pass



--- a/libsaria/collection/index.py
+++ b/libsaria/collection/index.py
@ -0,0 +1,49 @@
+# Bryan Schumaker (8/10/2010)
+
+# Translation table shared by every Index; built by Index.setup() on first insert
+ttable = None
+# Unused by Index; name kept for importers (string.translate rejects dict tables)
+translate = None
+
+class Index:
+	"""Map every word, and every character of every word, found in tags to ids."""
+
+	def __init__(self):
+		# token (word or single character) -> set of ids whose tags contain it
+		self.tokens = dict()
+
+	def setup(self):
+		"""Build the module-wide ttable used to normalise a tag before splitting."""
+		global ttable
+		import string
+		space = ord(" ")
+		strip = u"\"#$%&'*+<=>@[]^`{|}~.?!"
+		split = u"-\\/,:;()_~+"
+
+		table = dict((ord(c), None) for c in strip)
+		# Applied after strip, so "~" and "+" end up as separators, not deleted
+		table.update((ord(c), space) for c in split)
+		table.update((ord(u), ord(l)) for u, l in
+				zip(string.ascii_uppercase, string.ascii_lowercase))
+		ttable = table
+
+	def insert(self, id, tags):
+		"""Record id under every word, and every character, of each tag.
+
+		Each tag is expected to be a unicode string; None or empty tags are skipped.
+		"""
+		if ttable is None:
+			self.setup()
+		tokens = self.tokens
+
+		for tag in tags:
+			if not tag:
+				continue
+			# The unicode method accepts a dict table; string.translate does not
+			for word in tag.translate(ttable).split():
+				# One set per token: a shared set would leak ids between tokens
+				tokens.setdefault(word, set()).add(id)
+				for l in word:
+					tokens.setdefault(l, set()).add(id)
+
+
--- a/libsaria/collection/table.py
+++ b/libsaria/collection/table.py
@ -6,9 +6,8 @@ class Table(dict):
 		self.next_id = 0


-	def insert(self, tag):
+	def insert(self, tags):
 		id = self.next_id
-		self[id] = (tag.artist, tag.album, tag.title)
+		self[id] = tags
 		self.next_id += 1
-		print id, self.next_id
 		return id
--- a/tests/collection.py
+++ b/tests/collection.py
@ -2,8 +2,8 @@

 from libsaria import collection

-#src = "~/Music"
+src = "~/Music"
 #src = "~/Music/Foo Fighters"
-src = "~/Music/Foo Fighters/Foo Fighters"
+#src = "~/Music/Foo Fighters/Foo Fighters"

 collection.new_source(src, bg=False)