From 71b3289f6212343e6d14e7b4ae48311bf72003dd Mon Sep 17 00:00:00 2001
From: Bryan Schumaker
Date: Thu, 12 Aug 2010 08:27:21 -0400
Subject: [PATCH] Improved collection indexing

I can now create an index over an entire collection faster.  The
bottleneck is still disk accesses, but I still think this was a good
challenge.
---
Review notes (text in this section is ignored by git-am):
 - This patch was recovered from a whitespace-collapsed copy.  Line
   breaks and indentation are reconstructed, and the collection.py hunk
   range is recounted (it was -1,42 +1,45) because five whitespace-only
   context lines could not be placed.
 - index.py differs from the originally recorded version: setup() now
   binds the module-level ttable/translate, the table is applied with
   unicode.translate(), every token owns its own id set, and single
   letters are indexed under the letter instead of the word.  These
   failures were previously hidden by the "except Exception,e: pass"
   in Collection.update().

 libsaria/collection/collection.py | 25 +++++++++-------
 libsaria/collection/index.py      | 49 +++++++++++++++++++++++++++++++
 libsaria/collection/table.py      |  5 ++--
 tests/collection.py               |  4 +--
 4 files changed, 67 insertions(+), 16 deletions(-)
 create mode 100644 libsaria/collection/index.py

diff --git a/libsaria/collection/collection.py b/libsaria/collection/collection.py
index 024db409..cd1f2f42 100644
--- a/libsaria/collection/collection.py
+++ b/libsaria/collection/collection.py
@@ -1,37 +1,40 @@
 # Bryan Schumaker (8/8/2010)
 
 import libsaria
-import table
 
 class Collection:
     def __init__(self):
-        self.table = table.Table()
+        pass
 
 
     def scan(self, path):
         print "Scanning path:", path
-        self.clear()
+        self.reset()
         self.update(path)
 
 
-    def clear(self):
+    def reset(self):
         print "Erasing collection ... "
+        from table import Table
+        from index import Index
+        self.table = Table()
+        self.index = Index()
 
 
     def insert(self, file):
-        tags = file.tag()
-        id = self.table.insert(tags)
-        print id, tags
-
+        tags = file.tag()
+        tuple = (tags.artist, tags.album, tags.title)
+        id = self.table.insert(tuple)
+        self.index.insert(id, tuple)
 
     def update(self, path):
         FileRef = libsaria.collection.FileRef
         join = libsaria.path.join
+        insert = self.insert
         for root,dirs,files in libsaria.path.walk(path):
             for file in files:
                 file = join(root,file)
                 try:
-                    tagfile = FileRef(file)
-                    self.insert(tagfile)
+                    insert(FileRef(file))
                 except Exception,e:
-                    print e
+                    pass
diff --git a/libsaria/collection/index.py b/libsaria/collection/index.py
new file mode 100644
index 00000000..3f346e70
--- /dev/null
+++ b/libsaria/collection/index.py
@@ -0,0 +1,49 @@
+# Bryan Schumaker (8/10/2010)
+
+# Both names are bound lazily by Index.setup() on the first insert.
+ttable = None      # {ordinal: replacement ordinal, None = delete}
+translate = None   # callable applying ttable to a unicode string
+
+class Index:
+    """Inverted index mapping search tokens to sets of ids."""
+
+    def __init__(self):
+        self.tokens = dict()
+
+    def setup(self):
+        """Build the module-wide character translation table."""
+        # Without this declaration the assignments below would only
+        # create locals and insert() would still see None.
+        global ttable, translate
+        import string
+        space = ord(" ")
+        strip = u"\"#$%&'*+<=>@[]^`{|}~.?!"
+        split = u"-\/,:;()_~+"
+        upper = string.uppercase
+        lower = string.lowercase
+        # string.translate() cannot take a dict table, but
+        # unicode.translate() can.  Assumes tags are unicode objects.
+        translate = unicode.translate
+
+        ttable = dict((ord(c),None) for c in strip)
+        splitt = dict((ord(c),space) for c in split)
+        lowert = dict((ord(c),ord(lower[i])) for i,c in enumerate(upper))
+        # Merged last so characters listed in both strip and split
+        # ("+" and "~") become separators instead of being dropped.
+        ttable.update(splitt)
+        ttable.update(lowert)
+
+    def insert(self, id, tags):
+        """Record id under every word and every letter of each tag."""
+        tokens = self.tokens
+
+        if ttable is None:
+            self.setup()
+
+        for tag in tags:
+            for word in translate(tag,ttable).split():
+                # Every token owns its set; one shared set object would
+                # leak ids added later into every token sharing it.
+                tokens.setdefault(word, set()).add(id)
+                for l in word:
+                    tokens.setdefault(l, set()).add(id)
diff --git a/libsaria/collection/table.py b/libsaria/collection/table.py
index a9dcf08f..1a2ff56b 100644
--- a/libsaria/collection/table.py
+++ b/libsaria/collection/table.py
@@ -6,9 +6,8 @@ class Table(dict):
         self.next_id = 0
 
 
-    def insert(self, tag):
+    def insert(self, tags):
         id = self.next_id
-        self[id] = (tag.artist, tag.album, tag.title)
+        self[id] = tags
         self.next_id += 1
-        print id, self.next_id
         return id
diff --git a/tests/collection.py b/tests/collection.py
index 313aa630..80fd02d2 100644
--- a/tests/collection.py
+++ b/tests/collection.py
@@ -2,8 +2,8 @@
 
 from libsaria import collection
 
-#src = "~/Music"
+src = "~/Music"
 #src = "~/Music/Foo Fighters"
-src = "~/Music/Foo Fighters/Foo Fighters"
+#src = "~/Music/Foo Fighters/Foo Fighters"
 
 collection.new_source(src, bg=False)