From 2b150487771383c4cf3cd84d9e2bc22cc0701a89 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Sun, 25 Aug 2013 10:33:48 -0400 Subject: [PATCH] filter: Implement text filtering Because every programming project needs a test case centered around Discworld quotes ... Signed-off-by: Anna Schumaker --- config | 10 ++- design/filter.txt | 39 +++++++---- include/filter.h | 23 +++++++ lib/Sconscript | 7 ++ lib/filter.cpp | 144 ++++++++++++++++++++++++++++++++++++++++ tests/Sconscript | 2 +- tests/filter/Sconscript | 6 ++ tests/filter/filter.cpp | 129 +++++++++++++++++++++++++++++++++++ 8 files changed, 344 insertions(+), 16 deletions(-) create mode 100644 include/filter.h create mode 100644 lib/filter.cpp create mode 100644 tests/filter/Sconscript create mode 100644 tests/filter/filter.cpp diff --git a/config b/config index d42a244f..f83365bb 100644 --- a/config +++ b/config @@ -26,6 +26,7 @@ class Config: self.ENV = CONFIG_ENV self.DATABASE = False self.FILE = False + self.FILTER = False self.INDEX = False self.TEST = False @@ -34,13 +35,16 @@ class Config: def reconfigure(self): env.Replace( CCFLAGS = self.ENV ) - if self.DATABASE: env.Append( CCFLAGS = "-DCONFIG_DATABASE" ) - if self.FILE: env.Append( CCFLAGS = "-DCONFIG_FILE" ) - if self.TEST: env.Append( CCFLAGS = "-DCONFIG_TEST" ) + if self.DATABASE: env.Append( CCFLAGS = [ "-DCONFIG_DATABASE" ]) + if self.FILE: env.Append( CCFLAGS = [ "-DCONFIG_FILE" ]) + if self.FILTER: env.Append( CCFLAGS = [ "-DCONFIG_FILTER" ]) + if self.INDEX: env.Append( CCFLAGS = [ "-DCONFIG_INDEX" ]) + if self.TEST: env.Append( CCFLAGS = [ "-DCONFIG_TEST" ]) def reset(self): self.DATABASE = False self.FILE = False + self.FILTER = False self.INDEX = False self.TEST = False self.reconfigure() diff --git a/design/filter.txt b/design/filter.txt index 9bcbf007..3fab0114 100644 --- a/design/filter.txt +++ b/design/filter.txt @@ -8,30 +8,45 @@ index Filter: (lib/filter.cpp) - Filtering is used to generate a subset of songs for easier searching. 
+ Filtering is used to generate a subset of songs displayed by the UI + that users can choose from. The inverted index is generated at startup + so there is no need for a remove() function, since it will be wiped + the next time the application starts. - Index: - map lowercase_cache; - map> substring_cache; Index filter_index(""); + map lowercase_cache; + unsigned int lowercase_cache_hits; - Parsing: 1) Convert the provided string into a list of words, using whitespace - and the following characters as delimiters: \/,;()_~+" + and the following characters as delimiters: \/,;()_-~+" For each word: 2) Check the lowercase_cache to see if we have seen the word before, a) If we have, return the stored string b) Convert the string to lowercase and strip out remaining special characters. Add the result to the lowercase_cache; - 3) Check the substring_cache to see if we have seen the word before, - a) If we have, use the substring set returned - b) Break the word into substrings from the front only. For - example: "dalek" would contain the substrings - {d, da, dal, dale, dalek}. Add to the substring cache. - API: - filter :: add(string, track_id); - Parses the string and adds the track_id to the index. + void filter :: add(string, track_id); + Parse the string into substrings following the "Parsing" + section (above). Add each (substring, track_id) pair to the + filter_index. + + To generate substrings, iterate over the word starting from + the front. For example: "dalek" would contain the substrings + {d, da, dal, dale, dalek}. + void filter :: search(string, set &); - Parse the string and fill in the set with matching tracks. + Parse the string into substrings following the "Parsing" + section (above). We want to find track_ids that match ALL + substrings, so take the intersection of all sets returned by + the filter_index for a given substring. + + void filter :: print_cache_stats(); + Print cache hit and size information. 
+ + void filter :: get_index(); + Return the index storing all the filter data. + (Only available if -DCONFIG_TEST is set) diff --git a/include/filter.h b/include/filter.h new file mode 100644 index 00000000..d16c8bf7 --- /dev/null +++ b/include/filter.h @@ -0,0 +1,23 @@ +/* + * Copyright 2013 (c) Anna Schumaker. + */ +#ifndef OCARINA_FILTER_H +#define OCARINA_FILTER_H + +#include +#include + +namespace filter { + + void add(const std::string &, unsigned int); + void search(const std::string &, std::set &); + + void print_cache_stats(); + +#ifdef CONFIG_TEST + Index &get_index(); +#endif /* CONFIG_TEST */ + +}; + +#endif /* OCARINA_FILTER_H */ diff --git a/lib/Sconscript b/lib/Sconscript index ec997b2a..7442f3a3 100644 --- a/lib/Sconscript +++ b/lib/Sconscript @@ -3,6 +3,12 @@ Import("env", "CONFIG") build = [] +if CONFIG.FILTER: + CONFIG.INDEX = True + build += [ env.Object("filter.cpp") ] + +#################### + if CONFIG.DATABASE: CONFIG.FILE = True build += [ env.Object("database.cpp") ] @@ -11,6 +17,7 @@ if CONFIG.INDEX: CONFIG.FILE = True build += [ env.Object("index.cpp") ] +#################### if CONFIG.FILE: CONFIG.package("glib-2.0") diff --git a/lib/filter.cpp b/lib/filter.cpp new file mode 100644 index 00000000..4cc9a756 --- /dev/null +++ b/lib/filter.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2013 (c) Anna Schumaker. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +static Index filter_index(""); +static std::map lowercase_cache; +static unsigned int lowercase_cache_hits = 0; + +static void split_text(const std::string &text, std::list &ret) +{ + std::string word; + char c; + + for (unsigned int i = 0; i < text.size(); i++) { + c = text[i]; + + switch (c) { + case '\\': + case '/': + case ',': + case ';': + case '(': + case ')': + case '_': + case '-': + case '~': + case '+': + case '"': + case ' ': + case ' ': + if (word != "") { + ret.push_back(word); + word = ""; + } + break; + default: + word += c; + }; + } + + if (word != "") + ret.push_back(word); +} + +static void lower_text(const std::string &text, std::list &ret) +{ + char c; + std::string word; + std::map::iterator it = lowercase_cache.find(text); + + if (it != lowercase_cache.end()) { + lowercase_cache_hits++; + ret.push_back(it->second); + return; + } + + for (unsigned int i = 0; i < text.size(); i++) { + c = text[i]; + if ( (c >= 'a') && (c <= 'z') ) + word += c; + else if ( (c >= 'A') && (c <= 'Z') ) + word += (c + ('a' - 'A')); + else if ( (c >= '0') && (c <= '9') ) + word += c; + } + + lowercase_cache[text] = word; + ret.push_back(word); +} + +static void parse_text(const std::string &text, std::list &ret) +{ + std::list split; + std::list::iterator it; + + split_text(text, split); + for (it = split.begin(); it != split.end(); it++) + lower_text(*it, ret); +} + +static void add_substrings(const std::string &text, unsigned int track_id) +{ + for (unsigned int i = 1; i <= text.size(); i++) + filter_index.insert(text.substr(0, i), track_id); +} + +void filter :: add(const std::string &text, unsigned int track_id) +{ + std::list parsed; + std::list::iterator it; + + parse_text(text, parsed); + for (it = parsed.begin(); it != parsed.end(); it++) + add_substrings(*it, track_id); +} + +static void find_intersection(std::string &text, std::set &res) +{ + std::set terms = 
filter_index[text]; + std::set tmp; + + set_intersection(filter_index[text].begin(), filter_index[text].end(), + res.begin(), res.end(), + std::inserter >(tmp, tmp.begin())); + res.swap(tmp); +} + +void filter :: search(const std::string &text, std::set &res) +{ + std::list parsed; + std::list::iterator it; + + parse_text(text, parsed); + if (parsed.size() == 0) + return; + + it = parsed.begin(); + res = filter_index[*it]; + + for (it++; it != parsed.end(); it++) + find_intersection(*it, res); +} + +void filter :: print_cache_stats() +{ + print("Lowercase cache size: %u\n", lowercase_cache.size()); + print("Lowercase cache hits: %u\n", lowercase_cache_hits); +} + +#ifdef CONFIG_TEST +Index &filter :: get_index() +{ + return filter_index; +} +#endif /* CONFIG_TEST */ diff --git a/tests/Sconscript b/tests/Sconscript index 7d7af1d4..77d207a9 100644 --- a/tests/Sconscript +++ b/tests/Sconscript @@ -43,7 +43,7 @@ Export("Test") # Read SConscript files -scripts = [ "basic", "database", "file", "index" ] +scripts = [ "basic", "database", "file", "filter", "index" ] for s in scripts: CONFIG.reset() CONFIG.TEST = True diff --git a/tests/filter/Sconscript b/tests/filter/Sconscript new file mode 100644 index 00000000..ea122ae8 --- /dev/null +++ b/tests/filter/Sconscript @@ -0,0 +1,6 @@ +#!/usr/bin/python +Import("Test", "CONFIG") + +CONFIG.FILTER = True + +Test("filter", "filter.cpp") diff --git a/tests/filter/filter.cpp b/tests/filter/filter.cpp new file mode 100644 index 00000000..ba214651 --- /dev/null +++ b/tests/filter/filter.cpp @@ -0,0 +1,129 @@ +/* + * Copyright 2013 (c) Anna Schumaker. 
+ */ +#include +#include +#include + +std::string quotes [] = { + "What heroes like best is themselves.", + "The sun rose slowly, as if wasn't sure it was worth all the effort", + "Of course I'm sane, when trees start talking to me, I don't talk back", + "Darkness isn't the opposite of light, it's simply its absence", + "Time passed, which, basically, is its job", + "Million-to-one chances crop up nine times out of ten", + "CATS ARE NICE", + "Death isn't cruel - merely terribly, terribly good at his job", + "Thunder rolled ... it rolled a six", + "DROP THE SCYTHE, AND TURN AROUND SLOWLY", + "Time is like a drug. Too much of it kills you", + "Gravity is a habit that is hard to shake off", + "You do not ask people like that what they are thinking about in case " + "they turn around very slowly and say 'You'", + "Do unto others before they do unto you", + "Not a man to mince words. People, yes. But not words", + "An elf's strength lay in persuading others they were weak", + "May you live in interesting times", + "WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN " + "ABOUT THAT", + "I AM DEATH, NOT TAXES. I TURN UP ONLY ONCE", + "All tribal myths are true, for a given value of 'true'", + "The Truth Shall Make Ye Fret", + "When you look into the abyss, it's not supposed to wave back", + "I have no use for people who have learned the limits of the possible", + "Speak softly and employ a huge man with a crowbar", + "Truly, the leopard can change his shorts", + "+++Divide By Cucumber Error, Please Reinstall Universe And Reboot+++", + "+++Whoops! Here comes the cheese! +++", + "Bring out yer dead, bring out yer living dead", + "1. ALL FUNGI ARE EDIBLE. 2. 
SOME FUNGI ARE NOT EDIBLE MORE THAN ONCE.", + "A lot of farming is about manure", + "I am very attached to my fingers, and I like to think of them as attached to me", + "There be a lot o' men who became heroes cuz they wuz too scared tae run", + "If only the pawns united, make talked the rooks round, the whole board " + "could've been a republic in a dozen moves", + "Always remember that the crowd that applauds your coronation is the same " + "crowd that will applaud your beheading. People like a show.", +}; + +static const unsigned int num_quotes = sizeof(quotes) / sizeof(std::string); + +void print_keys(Index &index) +{ + std::set::iterator it; + + print("Found keys:"); + for (it = index.keys_begin(); it != index.keys_end(); it++) + print(" %s", it->c_str()); + print("\n"); +} + +void print_index(Index &index) +{ + std::set::iterator s_it; + std::set::iterator u_it; + + print("=== Printing index ===\n"); + print_keys(index); + + for (s_it = index.keys_begin(); s_it != index.keys_end(); s_it++) { + std::string key = *s_it; + print("index[%s] = {", key.c_str()); + + for (u_it = index[key].begin(); u_it != index[key].end(); u_it++) { + if (u_it != index[key].begin()) + print(" "); + print("%d", *u_it); + } + + print("}\n"); + } + print("\n"); +} + +void test_search(const std::string &text) +{ + std::set results; + std::set::iterator it; + + filter :: search(text, results); + + print("Search for: \"%s\" returned %u matches:\n", + text.c_str(), results.size()); + + for (it = results.begin(); it != results.end(); it++) + print("\t%s\n", quotes[*it].c_str()); + print("\n"); +} + +void test_0() +{ + for (unsigned int i = 0; i < num_quotes; i++) + filter :: add(quotes[i], i); + print_index(filter :: get_index()); + filter :: print_cache_stats(); + print("\n"); +} + +void test_1() +{ + test_search(""); + test_search("Rincewind"); + test_search("Rincewind Twoflower Luggage"); + test_search("the"); + test_search("the is"); + test_search("THE IS"); + test_search("th i"); + 
test_search("th i even"); + test_search("Th/i-eVEn"); + test_search("whoops"); + filter :: print_cache_stats(); +} + +int main(int argc, char **argv) +{ + test_0(); + test_1(); + + return 0; +}