# ocarina/libsaria/collection/index.py

# Bryan Schumaker (8/10/2010)

import re

# Unbound unicode methods, kept available as module-level aliases.
translate = unicode.translate
split     = unicode.split
# Code point that every separator character is translated to.
space_ord = ord(" ")
# Characters that are deleted from the text outright.
stripc    = u"\"#$%&'*+<=>@[]^`{|}~.?!"
# Characters that are replaced by a space so that they separate words.
# "~" and "+" are also listed in stripc; the split mapping is applied after
# the strip mapping, so those two end up as spaces.  The backslash is spelled
# "\\" because "\/" is not a valid escape sequence (the string is unchanged).
splitc    = u"-\\/,:;()_~+"

# Translation table handed to unicode.translate(); stays None until the
# first call to format() builds it.
ttable = None

def format_once(text):
	"""Build the translation table, then split `text` into words.

	This is the initial binding of the module-level name `format`.  On its
	first call it fills in `ttable` and rebinds `format` to format_rest(),
	so the table is only ever built once.

	text    -- unicode string to format.
	Returns the same list of words that format_rest(text) returns.
	"""
	import string
	# `format` has to be declared global as well: without it the assignment
	# at the end only creates a local name and the table would be rebuilt on
	# every single call.
	global ttable, format

	# Assemble the table locally and publish it in one assignment.
	# Order matters: later updates override earlier entries ("~" and "+").
	table = dict((ord(c), None) for c in stripc)
	table.update((ord(c), space_ord) for c in splitc)
	# The ascii_* constants are used because string.uppercase/lowercase
	# depend on the locale and are not guaranteed to line up index by index.
	table.update(zip(map(ord, string.ascii_uppercase),
			map(ord, string.ascii_lowercase)))
	ttable = table

	format = format_rest
	return format_rest(text)

def format_rest(text):
	"""Split `text` into lowercase words using the prebuilt `ttable`.

	Characters in stripc are removed, characters in splitc become spaces,
	ASCII uppercase letters are lowercased, and the result is split on
	whitespace.  `ttable` must already have been built by format_once().

	text    -- unicode string to format.
	Returns a list of unicode words (empty when nothing is left).
	"""
	return text.translate(ttable).split()

# Start with the table-building variant; it swaps itself out on first use.
format = format_once


class Index(dict):
	"""Inverted index: maps each formatted word to the set of ids stored under it."""

	def __init__(self):
		dict.__init__(self)

	def insert(self, tags, id):
		"""Record `id` under every word of every tag.

		tags    -- iterable of strings; each one is passed through format()
		           to obtain its words.
		id      -- hashable value added to the set kept for each word.
		"""
		get = self.get
		for tag in tags:
			for word in format(tag):
				ids = get(word)
				# Identity test: only a missing word yields None here.
				if ids is None:
					self[word] = set([id])
				else:
					ids.add(id)

	def filter(self, text):
		"""Return the ids that match every search term in `text`.

		`text` is converted to unicode and split into terms with format().
		A term matches an indexed word when re.search(term, word) succeeds;
		an id matches a term when it is stored under at least one matching
		word.

		text    -- search string.
		Returns a new set holding the ids that matched all terms; the set is
		empty when `text` produces no terms.
		"""
		terms  = format(unicode(text))
		search = re.search

		visible = None
		for term in terms:
			# Union of the ids of every indexed word this term matches.
			matched = set()
			for key in self:
				if search(term, key):
					matched.update(self[key])
			# The first term seeds the result, later terms narrow it down.
			if visible is None:
				visible = matched
			else:
				visible.intersection_update(matched)

		if visible is None:
			return set()
		return visible