Improved collection indexing

I can now build an index over an entire collection faster.  The
bottleneck is still disk access, but I think this was a worthwhile
challenge.
This commit is contained in:
Bryan Schumaker 2010-08-12 08:27:21 -04:00
parent 52b1236b29
commit 71b3289f62
4 changed files with 67 additions and 16 deletions

View File

@ -1,42 +1,45 @@
# Bryan Schumaker (8/8/2010)
import libsaria
import table
# NOTE(review): this hunk is a rendered diff whose +/- markers and indentation
# were lost, so replaced and replacement lines sit side by side (for example
# `self.clear()` next to `self.reset()`, and two `def` lines in a row). Which
# line of each pair is live is not visible here -- confirm against the
# repository before relying on the comments below.
class Collection:
# Constructor. Only a `table` assignment and a bare `pass` are visible here;
# `index` is assigned in reset() below.
def __init__(self):
self.table = table.Table()
pass
# Print the path being scanned, empty the collection, then repopulate it by
# walking `path` with update().
def scan(self, path):
print "Scanning path:", path
self.clear()
self.reset()
self.update(path)
# Replace `self.table` and `self.index` with fresh Table / Index instances.
# The imports are done inside the method body rather than at module level.
def clear(self):
def reset(self):
print "Erasing collection ... "
from table import Table
from index import Index
self.table = Table()
self.index = Index()
# Read the tags of `file` via file.tag(), store them in the table and pass the
# id returned by the table on to the index together with the
# (artist, album, title) tuple.
def insert(self, file):
tags = file.tag()
id = self.table.insert(tags)
print id, tags
tags = file.tag()
tuple = (tags.artist, tags.album, tags.title)
id = self.table.insert(tuple)
self.index.insert(id, tuple)
# Walk `path` with libsaria.path.walk and insert a FileRef for every file
# found. FileRef, join and self.insert are bound to local names before the loop.
def update(self, path):
FileRef = libsaria.collection.FileRef
join = libsaria.path.join
insert = self.insert
for root,dirs,files in libsaria.path.walk(path):
for file in files:
file = join(root,file)
# Any exception raised while wrapping or inserting a single file is caught
# here, so the walk carries on with the next file.
try:
tagfile = FileRef(file)
self.insert(tagfile)
insert(FileRef(file))
except Exception,e:
print e
pass

View File

@ -0,0 +1,49 @@
# Bryan Schumaker (8/10/2010)
# Both globals are filled in lazily by Index.setup() on the first insert().
ttable = None      # {code point: replacement code point, or None to delete}
translate = None   # callable: translate(text, table) -> normalised text


class Index:
    """Inverted index from normalised tokens to the ids that contain them.

    ``tokens`` maps every lower-cased word found in the inserted tags, and
    every individual character of those words, to the set of ids whose tags
    contain it.
    """

    def __init__(self):
        self.tokens = dict()

    def setup(self):
        """Build the module-level translation table and ``translate`` helper.

        The table deletes the characters in ``strip``, turns the characters
        in ``split`` into spaces and lower-cases ASCII letters.
        """
        # Without this declaration the assignments below would only create
        # locals and the module-level placeholders would stay None.
        global ttable, translate
        import string

        space = ord(" ")
        strip = u"\"#$%&'*+<=>@[]^`{|}~.?!"
        split = u"-\\/,:;()_~+"

        table = dict((ord(c), None) for c in strip)
        # Applied after the strip entries on purpose: "~" and "+" are in both
        # lists and must end up as separators, not be deleted.
        table.update((ord(c), space) for c in split)
        table.update((ord(c), ord(c.lower())) for c in string.ascii_uppercase)

        def apply_table(text, table):
            # A dict table is only understood by the string's own
            # translate() method, not by the Python 2 string.translate().
            return text.translate(table)

        translate = apply_table
        ttable = table

    def insert(self, id, tags):
        """Index every word, and every character of each word, under ``id``.

        ``id`` is the key to record; ``tags`` is an iterable of text values.
        Empty or ``None`` tags are skipped.
        """
        if ttable is None:
            self.setup()
        tokens = self.tokens
        for tag in tags:
            if not tag:
                continue
            for word in translate(tag, ttable).split():
                # Every token owns its own set, so adding an id to one
                # token never leaks into another.
                tokens.setdefault(word, set()).add(id)
                for letter in word:
                    tokens.setdefault(letter, set()).add(id)

View File

@ -6,9 +6,8 @@ class Table(dict):
self.next_id = 0
# Store the given value under the next free integer id, advance the counter
# and return the id that was used.
# NOTE(review): two `def insert` lines and two `self[id] = ...` lines are
# visible because this is a rendered diff with the +/- markers stripped; which
# of each pair is live (and whether the `print` survives) should be confirmed
# against the repository.
def insert(self, tag):
def insert(self, tags):
id = self.next_id
self[id] = (tag.artist, tag.album, tag.title)
self[id] = tags
self.next_id += 1
print id, self.next_id
return id

View File

@ -2,8 +2,8 @@
from libsaria import collection
# `src` is the path handed to collection.new_source() below; the commented-out
# assignments are alternative paths.
# NOTE(review): this is a rendered diff with the +/- markers stripped, so more
# than one uncommented `src` line is visible -- confirm which one is live.
#src = "~/Music"
src = "~/Music"
#src = "~/Music/Foo Fighters"
src = "~/Music/Foo Fighters/Foo Fighters"
#src = "~/Music/Foo Fighters/Foo Fighters"
collection.new_source(src, bg=False)