# Bryan Schumaker (8/10/2010)
"""Word index mapping normalized tag words to the ids that carry them.

Text is normalized by ``format``: punctuation listed in ``stripc`` is
deleted, characters listed in ``splitc`` become word separators, ASCII
upper-case letters are lower-cased, and the result is split on whitespace.
"""

import re
import string

try:
    unicode
except NameError:
    # Python 3 has no separate ``unicode`` type; ``str`` plays that role.
    unicode = str

# Kept as module-level names for any external code that imports them.
translate = unicode.translate
split = unicode.split

space_ord = ord(" ")

# Characters deleted outright during normalization.
stripc = u"\"#$%&'*+<=>@[]^`{|}~.?!"
# Characters turned into a space (i.e. word separators).  "~" and "+" appear
# in both strings; the split mapping is applied last, so they act as
# separators.
splitc = u"-\\/,:;()_~+"

# Translation table for ``unicode.translate``; built lazily on first use.
ttable = None


def _build_table():
    """Return the ordinal -> replacement mapping used by ``format_rest``."""
    table = dict((ord(c), None) for c in stripc)
    # Later updates win, so split characters override strip characters.
    table.update((ord(c), space_ord) for c in splitc)
    table.update(
        (ord(upper), ord(lower))
        for upper, lower in zip(string.ascii_uppercase,
                                string.ascii_lowercase)
    )
    return table


def format_once(text):
    """Normalize *text*, building the translation table on the first call.

    After the table exists, the module-level name ``format`` is rebound to
    ``format_rest`` so later lookups of ``format`` skip this setup step.
    Calling this function again (e.g. through a stale reference) is safe:
    the table is only built once.

    Returns the list of normalized words in *text*.
    """
    # Both names must be declared global: without ``format`` here the
    # rebinding below would only create a local variable and the setup
    # would be repeated on every call.
    global ttable, format
    if ttable is None:
        ttable = _build_table()
    format = format_rest
    return format_rest(text)


def format_rest(text):
    """Normalize *text* with the already-built table and split into words.

    Expects ``ttable`` to have been initialised by ``format_once``.
    """
    return text.translate(ttable).split()


# Public entry point; starts as the initialising variant and is swapped for
# ``format_rest`` after the first call.
format = format_once


class Index(dict):
    """Dictionary of ``word -> set(ids)`` built from tag strings."""

    def __init__(self):
        dict.__init__(self)

    def insert(self, id, tags):
        """Register *id* under every normalized word of every tag in *tags*.

        *tags* is an iterable of text strings; each is passed to ``format``.
        """
        for tag in tags:
            for word in format(tag):
                self.setdefault(word, set()).add(id)

    def filter(self, text):
        """Return the set of ids matching every search term in *text*.

        *text* is converted to text and normalized with ``format``.  A term
        matches an indexed word when it occurs anywhere inside that word.
        An id is returned only if each term matches at least one of its
        words.  When *text* yields no terms, every indexed id is returned.
        """
        terms = format(unicode(text))

        # Nothing to search on: everything is visible.
        if not terms:
            everything = set()
            for ids in self.values():
                everything.update(ids)
            return everything

        visible = None
        for term in terms:
            matched = set()
            for word, ids in self.items():
                # ``format`` removes every regex metacharacter, so a plain
                # substring test is equivalent to a regex search here and
                # cannot be broken by special characters.
                if term in word:
                    matched.update(ids)
            if visible is None:
                visible = matched
            else:
                visible.intersection_update(matched)
            if not visible:
                # The intersection can never grow again.
                break
        return visible