ocarina/lib/filter.cpp

/*
 * Copyright 2013 (c) Anna Schumaker.
 */

#include <filter.h>
#include <print.h>

#include <algorithm>
#include <list>
#include <map>
#include <set>

/*
 * Index that add_substrings() fills: each key is a prefix of a normalized
 * word, and track ids are inserted into the matching entry.
 * NOTE(review): the meaning of the "" constructor argument is defined by
 * Database and is not visible in this file -- confirm.
 */
static Database<IndexEntry> filter_index("");
/* Maps a raw word to its normalized form as computed by lower_text(). */
static std::map<std::string, std::string> lowercase_cache;
/* Number of times lower_text() found its input already in lowercase_cache. */
static unsigned int lowercase_cache_hits = 0;

/*
 * Break text into words and append them, in order, to ret.
 *
 * Any of the characters  \ / , ; ( ) _ - ~ + "  as well as space and tab
 * ends the current word.  Runs of separators never produce empty words,
 * and every other character is copied into the word unchanged.
 * Words already present in ret are left untouched.
 */
static void split_text(const std::string &text, std::list<std::string> &ret)
{
	static const char separators[] = "\\/,;()_-~+\" \t";
	std::string::size_type start;
	std::string::size_type stop;

	/* Jump to the first character that belongs to a word. */
	start = text.find_first_not_of(separators);
	while (start != std::string::npos) {
		/*
		 * stop is npos when the word runs to the end of text;
		 * substr() clamps the length in that case.
		 */
		stop = text.find_first_of(separators, start);
		ret.push_back(text.substr(start, stop - start));
		start = text.find_first_not_of(separators, stop);
	}
}

/*
 * Normalize a single word and append the result to ret.
 *
 * ASCII upper case letters are folded to lower case, ASCII lower case
 * letters and digits are kept, and every other character is dropped.
 * A word made up only of dropped characters therefore normalizes to an
 * empty string, which is still appended.
 *
 * Results are remembered in lowercase_cache; a repeated word is answered
 * from the cache and counted in lowercase_cache_hits.
 */
static void lower_text(const std::string &text, std::list<std::string> &ret)
{
	std::map<std::string, std::string>::iterator cached;
	std::string::const_iterator pos;
	std::string lowered;

	cached = lowercase_cache.find(text);
	if (cached != lowercase_cache.end()) {
		lowercase_cache_hits++;
		ret.push_back(cached->second);
		return;
	}

	for (pos = text.begin(); pos != text.end(); ++pos) {
		const char ch = *pos;

		if ((ch >= 'A') && (ch <= 'Z'))
			lowered += static_cast<char>(ch - 'A' + 'a');
		else if (((ch >= 'a') && (ch <= 'z')) ||
			 ((ch >= '0') && (ch <= '9')))
			lowered += ch;
	}

	lowercase_cache[text] = lowered;
	ret.push_back(lowered);
}

/*
 * Turn text into a list of normalized words: split it with split_text(),
 * then pass each word through lower_text(), which appends to ret.
 */
static void parse_text(const std::string &text, std::list<std::string> &ret)
{
	std::list<std::string> words;

	split_text(text, words);

	std::list<std::string>::iterator word = words.begin();
	while (word != words.end()) {
		lower_text(*word, ret);
		++word;
	}
}

/*
 * Record track_id under every non-empty prefix of text ("a", "ab", "abc"
 * for "abc").  An empty text adds nothing.
 *
 * For each prefix the id is inserted into the entry returned by
 * filter_index.find(); if that call throws, a new IndexEntry holding the
 * prefix and the id is inserted instead.
 * NOTE(review): presumably find() throws when the key is missing --
 * confirm against Database.
 */
static void add_substrings(const std::string &text, unsigned int track_id)
{
	const std::string::size_type length = text.size();

	for (std::string::size_type end = 1; end <= length; ++end) {
		std::string prefix(text, 0, end);

		try {
			filter_index.find(prefix).insert(track_id);
		} catch (...) {
			filter_index.insert(IndexEntry(prefix, track_id));
		}
	}
}

/*
 * Index text for track_id: the text is parsed into normalized words and
 * every prefix of every word is associated with the track.
 */
void filter :: add(const std::string &text, unsigned int track_id)
{
	std::list<std::string> words;

	parse_text(text, words);

	for (std::list<std::string>::iterator word = words.begin();
			word != words.end(); ++word)
		add_substrings(*word, track_id);
}

/*
 * Narrow res down to the ids that are also stored under text in
 * filter_index.  On return res holds the intersection of its previous
 * contents with the ids found for text.
 */
static void find_intersection(std::string &text, std::set<unsigned int> &res)
{
	/*
	 * Look the term up exactly once and iterate that single object, so
	 * begin() and end() are guaranteed to refer to the same set and no
	 * extra copy of the set is made.
	 */
	const std::set<unsigned int> &terms = filter_index[text];
	std::set<unsigned int> tmp;

	std::set_intersection(terms.begin(), terms.end(),
		res.begin(), res.end(),
		std::inserter(tmp, tmp.begin()));
	res.swap(tmp);
}

/*
 * Look up the ids matching every word in text.
 *
 * The text is parsed into normalized words.  res is assigned the ids
 * stored under the first word and then intersected with the ids of each
 * following word.  When parsing yields no words at all, res is left
 * exactly as the caller passed it in.
 */
void filter :: search(const std::string &text, std::set<unsigned int> &res)
{
	std::list<std::string> words;

	parse_text(text, words);
	if (words.empty())
		return;

	std::list<std::string>::iterator word = words.begin();
	res = filter_index[*word];

	/* Each remaining word can only shrink the result. */
	while (++word != words.end())
		find_intersection(*word, res);
}

/*
 * Print the number of entries in the lowercase cache and how many
 * lookups were served from it.
 */
void filter :: print_cache_stats()
{
	/*
	 * size() returns a size_type, which is wider than unsigned int on
	 * common 64-bit targets; convert explicitly so the argument matches
	 * the %u conversion.
	 * NOTE(review): assumes print() uses printf-style formatting --
	 * confirm against print.h.
	 */
	print("Lowercase cache size: %u\n",
			static_cast<unsigned int>(lowercase_cache.size()));
	print("Lowercase cache hits: %u\n", lowercase_cache_hits);
}

#ifdef CONFIG_TEST
/*
 * Return a reference to the file-local filter_index.  Only compiled when
 * CONFIG_TEST is defined.
 */
Database<IndexEntry> &filter :: get_index()
{
	return filter_index;
}
#endif /* CONFIG_TEST */