ocarina/libsaria/collection/index.py

71 lines
1.4 KiB
Python

# Bryan Schumaker (8/10/2010)
import re
# Unbound unicode methods kept as module-level shortcuts.
translate = unicode.translate
split = unicode.split
# Code point of the space character; every split character is mapped to it.
space_ord = ord(" ")
# Characters that are deleted outright when text is formatted.
stripc = u"\"#$%&'*+<=>@[]^`{|}~.?!"
# Characters that are turned into a space (i.e. act as word separators).
# NOTE(review): "\/" is not a recognized escape sequence, so this literal
# holds a backslash followed by a slash.  "~" and "+" are also listed in
# stripc; the split mapping wins because format_once() applies it after the
# strip mapping.
splitc = u"-\/,:;()_~+"
# Translation table consumed by format_rest(); None until format_once()
# builds it.
ttable = None
def format_once(text):
    """Build the translation table, then format ``text``.

    This is the first-call implementation behind the module-level
    ``format`` name.  It fills in ``ttable`` and rebinds ``format`` to
    ``format_rest`` so that later calls skip the table construction.

    The table maps code points as follows (later entries override earlier
    ones):
      1. every character in ``stripc`` is deleted,
      2. every character in ``splitc`` becomes a space,
      3. every uppercase letter becomes its lowercase counterpart.

    text -- unicode string to normalize.
    Returns the list of words produced by ``format_rest(text)``.
    """
    import string
    # Both names must be declared global: without ``format`` here the
    # assignment below would only create a local variable and the table
    # would be rebuilt on every single call.
    global ttable, format

    table = dict((ord(c), None) for c in stripc)
    table.update((ord(c), space_ord) for c in splitc)
    table.update(
        (ord(upper), ord(lower))
        for upper, lower in zip(string.uppercase, string.lowercase)
    )
    ttable = table

    format = format_rest
    return format_rest(text)
def format_rest(text):
    """Normalize ``text`` with the prepared table and split it into words.

    Applies the module-level ``ttable`` via ``text.translate`` and splits
    the result on whitespace.

    text -- string to normalize.
    Returns a list of words.
    """
    cleaned = text.translate(ttable)
    words = cleaned.split()
    return words
# ``format`` is the entry point used by Index below.  It starts out bound to
# format_once, which builds the translation table before formatting.
# Note that this name shadows the ``format`` builtin inside this module.
format = format_once
class Index(dict):
    """Word index: maps each formatted word to the set of ids tagged with it.

    Words are produced by the module-level ``format`` function, so keys are
    whatever that function returns for the inserted tags.
    """

    def __init__(self):
        dict.__init__(self)

    def insert(self, id, tags):
        """Record ``id`` under every word found in ``tags``.

        id   -- value stored in the per-word sets (parameter name kept for
                backward compatibility even though it shadows the builtin).
        tags -- iterable of strings; each one is split into words with
                ``format``.
        """
        for tag in tags:
            for word in format(tag):
                # setdefault creates the set the first time a word is seen.
                self.setdefault(word, set()).add(id)

    def filter(self, text):
        """Return the ids matching every search term in ``text``.

        ``text`` is converted to unicode and split into terms with
        ``format``.  A term matches an indexed word when it occurs anywhere
        inside that word; an id is returned only if each term matched at
        least one word the id was inserted under.

        Returns a new set of ids (empty when ``text`` yields no terms).
        """
        terms = format(unicode(text))

        visible = None
        for term in terms:
            # Plain substring containment: the terms come out of format(),
            # which strips or splits on every regex metacharacter, so this
            # is equivalent to the regex search without the regex overhead.
            matches = set()
            for key in self:
                if term in key:
                    matches.update(self[key])

            if visible is None:
                visible = matches
            else:
                visible.intersection_update(matches)

            # Once the intersection is empty no further term can add ids.
            if not visible:
                break

        if visible is None:
            return set()
        return visible