filter: Implement text filtering

Because every programming project needs a test case centered around Discworld quotes ... Signed-off-by: Anna Schumaker <schumaker.anna@gmail.com>
2013-08-25 10:33:48 -04:00 · 2013-08-25 10:33:48 -04:00 · 2b15048777
parent 379a96fe13
commit 2b15048777
8 changed files with 344 additions and 16 deletions
--- a/10
+++ b/10
@ -26,6 +26,7 @@ class Config:
 		self.ENV      = CONFIG_ENV
 		self.DATABASE = False
 		self.FILE     = False
+		self.FILTER   = False
 		self.INDEX    = False
 		self.TEST     = False

@ -34,13 +35,16 @@ class Config:

 	def reconfigure(self):
 		env.Replace( CCFLAGS = self.ENV )
-		if self.DATABASE: env.Append( CCFLAGS = "-DCONFIG_DATABASE" )
-		if self.FILE:     env.Append( CCFLAGS = "-DCONFIG_FILE" )
-		if self.TEST:     env.Append( CCFLAGS = "-DCONFIG_TEST" )
+		if self.DATABASE: env.Append( CCFLAGS = [ "-DCONFIG_DATABASE" ])
+		if self.FILE:     env.Append( CCFLAGS = [ "-DCONFIG_FILE" ])
+		if self.FILTER:   env.Append( CCFLAGS = [ "-DCONFIG_FILTER" ])
+		if self.INDEX:    env.Append( CCFLAGS = [ "-DCONFIG_INDEX" ])
+		if self.TEST:     env.Append( CCFLAGS = [ "-DCONFIG_TEST" ])

 	def reset(self):
 		self.DATABASE = False
 		self.FILE     = False
+		self.FILTER   = False
 		self.INDEX    = False
 		self.TEST     = False
 		self.reconfigure()
--- a/design/filter.txt
+++ b/design/filter.txt
@ -8,30 +8,45 @@
 index

 Filter: (lib/filter.cpp)
-	Filtering is used to generate a subset of songs for easier searching.
+	Filtering is used to generate the subset of songs displayed by the UI
+	that users can choose from.  The inverted index is generated at startup
+	so there is no need for a remove() function, since it will be wiped
+	the next time the application starts.

 - Index:
-	map<string, string> lowercase_cache;
-	map<string, set<string>> substring_cache;
 	Index filter_index("");
+	map<string, string> lowercase_cache;
+	unsigned int lowercase_cache_hits;

 - Parsing:
 	1) Convert the provided string into a list of words, using whitespace
-	   and the following characters as delimiters: \/,;()_~+"
+	   and the following characters as delimiters: \/,;()_-~+"

 	For each word:
 	    2) Check the lowercase_cache to see if we have seen the word before,
 	       a) If we have, return the stored string
 	       b) Convert the string to lowercase and strip out remaining
 	          special characters.  Add the result to the lowercase_cache;
-	    3) Check the substring_cache to see if we have seen the word before,
-	       a) If we have, use the substring set returned
-	       b) Break the word into substrings from the front only.  For
-	          example:  "dalek" would contain the substrings
-	          {d, da, dal, dale, dalek}.  Add to the substring cache.

 - API:
-	filter :: add(string, track_id);
-		Parses the string and adds the track_id to the index.
+	void filter :: add(string, track_id);
+		Parse the string into substrings following the "Parsing"
+		section (above).  Add each (substring, track_id) pair to the
+		filter_index.
+
+		To generate substrings, iterate over the word starting from
+		the front.  For example: "dalek" would contain the substrings
+		{d, da, dal, dale, dalek}.
+
 	void filter :: search(string, set<track_id> &);
-		Parse the string and fill in the set with matching tracks.
+		Parse the string into words following the "Parsing" section
+		(above).  We want to find track_ids that match ALL of the
+		words, so take the intersection of the sets returned by the
+		filter_index for each word.
+
+	void filter :: print_cache_stats();
+		Print cache hit and size information.
+
+	Index &filter :: get_index();
+		Return a reference to the index storing all the filter data.
+		(Only available if -DCONFIG_TEST is set)
--- a/include/filter.h
+++ b/include/filter.h
@ -0,0 +1,23 @@
+/*
+ * Copyright 2013 (c) Anna Schumaker.
+ */
+#ifndef OCARINA_FILTER_H
+#define OCARINA_FILTER_H
+
+#include <index.h>
+#include <string>
+
+/*
+ * Text filtering API.  Strings are split into normalized words and the
+ * leading substrings of every word are indexed against an id.
+ */
+namespace filter {
+
+	/* Index the leading substrings of each word of the string under the given id. */
+	void add(const std::string &, unsigned int);
+
+	/*
+	 * Fill the set with the ids that match every word of the string.
+	 * The set is left untouched when the string contains no words.
+	 */
+	void search(const std::string &, std::set<unsigned int> &);
+
+	/* Print the size and the hit count of the lowercase cache. */
+	void print_cache_stats();
+
+#ifdef CONFIG_TEST
+	/* Return the index backing the filter so tests can inspect it. */
+	Index &get_index();
+#endif /* CONFIG_TEST */
+
+}
+
+#endif /* OCARINA_FILTER_H */
--- a/lib/Sconscript
+++ b/lib/Sconscript
@ -3,6 +3,12 @@ Import("env", "CONFIG")

 build = []

+# filter.cpp uses the Index class, so enabling the filter also switches on
+# CONFIG.INDEX.  This check has to run before the CONFIG.INDEX check below.
+if CONFIG.FILTER:
+	CONFIG.INDEX = True
+	build += [ env.Object("filter.cpp") ]
+
+####################
+
 if CONFIG.DATABASE:
 	CONFIG.FILE = True
 	build += [ env.Object("database.cpp") ]
@ -11,6 +17,7 @@ if CONFIG.INDEX:
 	CONFIG.FILE = True
 	build += [ env.Object("index.cpp") ]

+####################

 if CONFIG.FILE:
 	CONFIG.package("glib-2.0")
--- a/lib/filter.cpp
+++ b/lib/filter.cpp
@ -0,0 +1,144 @@
+/*
+ * Copyright 2013 (c) Anna Schumaker.
+ */
+
+#include <filter.h>
+#include <index.h>
+#include <print.h>
+
+#include <algorithm>
+#include <list>
+#include <map>
+#include <set>
+
+/* Index from each inserted substring to the track ids added under it. */
+static Index filter_index("");
+/* Maps a word, exactly as split out of the text, to its normalized form. */
+static std::map<std::string, std::string> lowercase_cache;
+/* Number of lower_text() calls answered from lowercase_cache. */
+static unsigned int lowercase_cache_hits = 0;
+
+/*
+ * Break text into words and append them, in order, to ret.
+ *
+ * Whitespace and the characters \ / , ; ( ) _ - ~ + " separate words.
+ * Runs of delimiters never produce empty words.
+ */
+static void split_text(const std::string &text, std::list<std::string> &ret)
+{
+	std::string word;
+
+	for (std::string::size_type i = 0; i < text.size(); i++) {
+		const char c = text[i];
+
+		switch (c) {
+		case '\\':
+		case '/':
+		case ',':
+		case ';':
+		case '(':
+		case ')':
+		case '_':
+		case '-':
+		case '~':
+		case '+':
+		case '"':
+		case ' ':
+		case '\t':
+		/*
+		 * Line breaks and the remaining whitespace characters are
+		 * delimiters as well; otherwise they would end up inside a word.
+		 */
+		case '\n':
+		case '\r':
+		case '\v':
+		case '\f':
+			if (!word.empty()) {
+				ret.push_back(word);
+				word.clear();
+			}
+			break;
+		default:
+			word += c;
+		}
+	}
+
+	/* Flush the last word when the text does not end with a delimiter. */
+	if (!word.empty())
+		ret.push_back(word);
+}
+
+/*
+ * Normalize one word and append the result to ret.
+ *
+ * ASCII letters are lowercased, digits are kept and every other character
+ * is dropped.  Results are remembered in lowercase_cache, keyed by the
+ * original spelling; lowercase_cache_hits counts the calls answered from
+ * the cache.
+ *
+ * A word with nothing left after normalization (for example "...") is
+ * cached but not appended.  add_substrings() never indexes the empty
+ * string, so passing it on as a search term could only discard matches.
+ */
+static void lower_text(const std::string &text, std::list<std::string> &ret)
+{
+	std::map<std::string, std::string>::iterator it = lowercase_cache.find(text);
+
+	if (it != lowercase_cache.end()) {
+		lowercase_cache_hits++;
+		if (!it->second.empty())
+			ret.push_back(it->second);
+		return;
+	}
+
+	std::string word;
+	for (std::string::size_type i = 0; i < text.size(); i++) {
+		const char c = text[i];
+
+		if ( ((c >= 'a') && (c <= 'z')) || ((c >= '0') && (c <= '9')) )
+			word += c;
+		else if ( (c >= 'A') && (c <= 'Z') )
+			/* The sum is an int; narrow it back to char explicitly. */
+			word += static_cast<char>(c + ('a' - 'A'));
+	}
+
+	lowercase_cache[text] = word;
+	if (!word.empty())
+		ret.push_back(word);
+}
+
+/*
+ * Split text into words with split_text() and append the normalized form
+ * of each word, as produced by lower_text(), to ret.
+ */
+static void parse_text(const std::string &text, std::list<std::string> &ret)
+{
+	std::list<std::string> words;
+	split_text(text, words);
+
+	std::list<std::string>::iterator word = words.begin();
+	while (word != words.end()) {
+		lower_text(*word, ret);
+		++word;
+	}
+}
+
+/*
+ * Insert every leading substring of text into filter_index under track_id,
+ * shortest first: "dalek" yields d, da, dal, dale, dalek.
+ */
+static void add_substrings(const std::string &text, unsigned int track_id)
+{
+	unsigned int len = 0;
+
+	while (len < text.size()) {
+		len++;
+		filter_index.insert(text.substr(0, len), track_id);
+	}
+}
+
+/*
+ * Index text under track_id: parse it into normalized words and add the
+ * leading substrings of every word to the filter index.
+ */
+void filter :: add(const std::string &text, unsigned int track_id)
+{
+	std::list<std::string> words;
+	parse_text(text, words);
+
+	std::list<std::string>::iterator word = words.begin();
+	while (word != words.end()) {
+		add_substrings(*word, track_id);
+		++word;
+	}
+}
+
+/*
+ * Narrow res down to the ids that are also indexed under text.
+ *
+ * The set for text is looked up once and bound by const reference.  This
+ * avoids copying it and guarantees that both iterators handed to
+ * std::set_intersection() belong to the same set object.
+ */
+static void find_intersection(std::string &text, std::set<unsigned int> &res)
+{
+	const std::set<unsigned int> &terms = filter_index[text];
+	std::set<unsigned int> tmp;
+
+	std::set_intersection(terms.begin(), terms.end(),
+		res.begin(), res.end(),
+		std::inserter(tmp, tmp.begin()));
+	res.swap(tmp);
+}
+
+/*
+ * Store in res the ids that match every term parsed from text.
+ */
+void filter :: search(const std::string &text, std::set<unsigned int> &res)
+{
+	std::list<std::string> terms;
+	parse_text(text, terms);
+
+	/* Nothing to look up: leave res exactly as the caller passed it. */
+	if (terms.empty())
+		return;
+
+	/* Seed with the matches of the first term, then narrow with the rest. */
+	std::list<std::string>::iterator term = terms.begin();
+	res = filter_index[*term];
+
+	++term;
+	while (term != terms.end()) {
+		find_intersection(*term, res);
+		++term;
+	}
+}
+
+/*
+ * Print the number of entries in lowercase_cache and the number of
+ * lookups it has answered.
+ */
+void filter :: print_cache_stats()
+{
+	/*
+	 * size() returns a size_type, which is wider than the unsigned int
+	 * that %u reads on LP64 targets, so convert explicitly.
+	 * NOTE(review): assumes print() takes printf-style varargs -- confirm
+	 * against print.h.
+	 */
+	print("Lowercase cache size: %u\n",
+		static_cast<unsigned int>(lowercase_cache.size()));
+	print("Lowercase cache hits: %u\n", lowercase_cache_hits);
+}
+
+#ifdef CONFIG_TEST
+/* Give CONFIG_TEST builds direct access to the index backing the filter. */
+Index &filter :: get_index()
+{
+	return filter_index;
+}
+#endif /* CONFIG_TEST */
--- a/tests/Sconscript
+++ b/tests/Sconscript
@ -43,7 +43,7 @@ Export("Test")


 # Read SConscript files
-scripts = [ "basic", "database", "file", "index" ]
+scripts = [ "basic", "database", "file", "filter", "index" ]
 for s in scripts:
 	CONFIG.reset()
 	CONFIG.TEST = True
--- a/tests/filter/Sconscript
+++ b/tests/filter/Sconscript
@ -0,0 +1,6 @@
+#!/usr/bin/python
+Import("Test", "CONFIG")
+
+# Turn on the FILTER option; lib/Sconscript builds filter.cpp when it is set.
+CONFIG.FILTER = True
+
+# Presumably registers a test named "filter" built from filter.cpp -- see the
+# Test helper exported by tests/Sconscript.
+Test("filter", "filter.cpp")
--- a/tests/filter/filter.cpp
+++ b/tests/filter/filter.cpp
@ -0,0 +1,129 @@
+/*
+ * Copyright 2013 (c) Anna Schumaker.
+ */
+#include <filter.h>
+#include <print.h>
+#include <vector>
+
+/*
+ * Strings fed to the filter.  The position of a quote in this array is the
+ * id it is added under (see test_0) and the id used to print search
+ * results (see test_search).
+ */
+std::string quotes [] = {
+	"What heroes like best is themselves.",
+	"The sun rose slowly, as if wasn't sure it was worth all the effort",
+	"Of course I'm sane, when trees start talking to me, I don't talk back",
+	"Darkness isn't the opposite of light, it's simply its absence",
+	"Time passed, which, basically, is its job",
+	"Million-to-one chances crop up nine times out of ten",
+	"CATS ARE NICE",
+	"Death isn't cruel - merely terribly, terribly good at his job",
+	"Thunder rolled ... it rolled a six",
+	"DROP THE SCYTHE, AND TURN AROUND SLOWLY",
+	"Time is like a drug.  Too much of it kills you",
+	"Gravity is a habit that is hard to shake off",
+	"You do not ask people like that what they are thinking about in case "
+		"they turn around very slowly and say 'You'",
+	"Do unto others before they do unto you",
+	"Not a man to mince words.  People, yes.  But not words",
+	"An elf's strength lay in persuading others they were weak",
+	"May you live in interesting times",
+	"WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN.  AND I'M NOT SURE EVEN "
+		"ABOUT THAT",
+	"I AM DEATH, NOT TAXES.  I TURN UP ONLY ONCE",
+	"All tribal myths are true, for a given value of 'true'",
+	"The Truth Shall Make Ye Fret",
+	"When you look into the abyss, it's not supposed to wave back",
+	"I have no use for people who have learned the limits of the possible",
+	"Speak softly and employ a huge man with a crowbar",
+	"Truly, the leopard can change his shorts",
+	"+++Divide By Cucumber Error, Please Reinstall Universe And Reboot+++",
+	"+++Whoops!  Here comes the cheese! +++",
+	"Bring out yer dead, bring out yer living dead",
+	"1. ALL FUNGI ARE EDIBLE.  2. SOME FUNGI ARE NOT EDIBLE MORE THAN ONCE.",
+	"A lot of farming is about manure",
+	"I am very attached to my fingers, and I like to think of them as attached to me",
+	"There be a lot o' men who became heroes cuz they wuz too scared tae run",
+	"If only the pawns united, make talked the rooks round, the whole board "
+		"could've been a republic in a dozen moves",
+	"Always remember that the crowd that applauds your coronation is the same "
+		"crowd that will applaud your beheading.  People like a show.",
+};
+
+/* Number of entries in quotes, derived from the array size. */
+static const unsigned int num_quotes = sizeof(quotes) / sizeof(std::string);
+
+/*
+ * Print every key of the index on a single line, each preceded by a space.
+ */
+void print_keys(Index &index)
+{
+	print("Found keys:");
+
+	std::set<std::string>::iterator key = index.keys_begin();
+	while (key != index.keys_end()) {
+		print(" %s", key->c_str());
+		key++;
+	}
+
+	print("\n");
+}
+
+/*
+ * Dump the index: first the list of keys, then one line per key with the
+ * ids stored under it, separated by spaces.
+ */
+void print_index(Index &index)
+{
+	std::set<std::string>::iterator s_it;
+
+	print("=== Printing index ===\n");
+	print_keys(index);
+
+	for (s_it = index.keys_begin(); s_it != index.keys_end(); s_it++) {
+		std::string key = *s_it;
+		/*
+		 * Look the set up once so that every iterator below refers to
+		 * the same object.
+		 */
+		const std::set<unsigned int> &ids = index[key];
+		std::set<unsigned int>::const_iterator u_it;
+
+		print("index[%s] = {", key.c_str());
+
+		for (u_it = ids.begin(); u_it != ids.end(); u_it++) {
+			if (u_it != ids.begin())
+				print(" ");
+			/* The ids are unsigned, so print them with %u. */
+			print("%u", *u_it);
+		}
+
+		print("}\n");
+	}
+	print("\n");
+}
+
+/*
+ * Run one search and print the query, the number of matches and the quote
+ * stored at each matching id.
+ */
+void test_search(const std::string &text)
+{
+	std::set<unsigned int> results;
+	std::set<unsigned int>::iterator it;
+
+	filter :: search(text, results);
+
+	/* size() returns a size_type; convert it to what %u expects. */
+	print("Search for: \"%s\" returned %u matches:\n",
+		text.c_str(), static_cast<unsigned int>(results.size()));
+
+	for (it = results.begin(); it != results.end(); it++)
+		print("\t%s\n", quotes[*it].c_str());
+	print("\n");
+}
+
+/*
+ * Add every quote to the filter under its array position, then dump the
+ * resulting index and the cache statistics.
+ */
+void test_0()
+{
+	unsigned int id = 0;
+
+	while (id < num_quotes) {
+		filter :: add(quotes[id], id);
+		id++;
+	}
+
+	Index &index = filter :: get_index();
+	print_index(index);
+
+	filter :: print_cache_stats();
+	print("\n");
+}
+
+/*
+ * Run a fixed list of searches in order, then print the cache statistics.
+ */
+void test_1()
+{
+	static const char *queries[] = {
+		"",
+		"Rincewind",
+		"Rincewind Twoflower Luggage",
+		"the",
+		"the is",
+		"THE IS",
+		"th i",
+		"th i even",
+		"Th/i-eVEn",
+		"whoops",
+	};
+	const unsigned int num_queries = sizeof(queries) / sizeof(queries[0]);
+
+	for (unsigned int q = 0; q < num_queries; q++)
+		test_search(queries[q]);
+
+	filter :: print_cache_stats();
+}
+
+/*
+ * Fill the filter with the quotes (test_0) before running the searches
+ * against it (test_1).  The command line arguments are not used.
+ */
+int main(int argc, char **argv)
+{
+	test_0();
+	test_1();
+
+	return 0;
+}