filter: Implement text filtering

Because every programming project needs a test case centered around
Discworld quotes ...

Signed-off-by: Anna Schumaker <schumaker.anna@gmail.com>
This commit is contained in:
Anna Schumaker 2013-08-25 10:33:48 -04:00 committed by Anna Schumaker
parent 379a96fe13
commit 2b15048777
8 changed files with 344 additions and 16 deletions

10
config
View File

@ -26,6 +26,7 @@ class Config:
self.ENV = CONFIG_ENV
self.DATABASE = False
self.FILE = False
self.FILTER = False
self.INDEX = False
self.TEST = False
@ -34,13 +35,16 @@ class Config:
def reconfigure(self):
env.Replace( CCFLAGS = self.ENV )
if self.DATABASE: env.Append( CCFLAGS = "-DCONFIG_DATABASE" )
if self.FILE: env.Append( CCFLAGS = "-DCONFIG_FILE" )
if self.TEST: env.Append( CCFLAGS = "-DCONFIG_TEST" )
if self.DATABASE: env.Append( CCFLAGS = [ "-DCONFIG_DATABASE" ])
if self.FILE: env.Append( CCFLAGS = [ "-DCONFIG_FILE" ])
if self.FILTER: env.Append( CCFLAGS = [ "-DCONFIG_FILTER" ])
if self.INDEX: env.Append( CCFLAGS = [ "-DCONFIG_INDEX" ])
if self.TEST: env.Append( CCFLAGS = [ "-DCONFIG_TEST" ])
def reset(self):
self.DATABASE = False
self.FILE = False
self.FILTER = False
self.INDEX = False
self.TEST = False
self.reconfigure()

View File

@ -8,30 +8,45 @@
index
Filter: (lib/filter.cpp)
Filtering is used to generate a subset of songs for easier searching.
Filtering is used to generate a subset of songs displayed by the UI
that users can choose from. The inverted index is generated at startup
so there is no need for a remove() function, since it will be wiped
the next time the application starts.
- Index:
map<string, string> lowercase_cache;
map<string, set<string>> substring_cache;
Index filter_index("");
map<string, string> lowercase_cache;
unsigned int lowercase_cache_hits;
- Parsing:
1) Convert the provided string into a list of words, using whitespace
and the following characters as delimiters: \/,;()_~+"
and the following characters as delimiters: \/,;()_-~+"
For each word:
2) Check the lowercase_cache to see if we have seen the word before,
a) If we have, return the stored string
b) Otherwise, convert the string to lowercase and strip out remaining
special characters. Add the result to the lowercase_cache.
3) Check the substring_cache to see if we have seen the word before,
a) If we have, use the substring set returned
b) Break the word into substrings from the front only. For
example: "dalek" would contain the substrings
{d, da, dal, dale, dalek}. Add to the substring cache.
- API:
filter :: add(string, track_id);
Parses the string and adds the track_id to the index.
void filter :: add(string, track_id);
Parse the string into substrings following the "Parsing"
section (above). Add each (substring, track_id) pair to the
filter_index.
To generate substrings, iterate over the word starting from
the front. For example: "dalek" would contain the substrings
{d, da, dal, dale, dalek}.
void filter :: search(string, set<track_id> &);
Parse the string and fill in the set with matching tracks.
Parse the string into substrings following the "Parsing"
section (above). We want to find track_ids that match ALL
substrings, so take the intersection of all sets returned by
the filter_index for a given substring.
void filter :: print_cache_stats();
Print cache hit and size information.
Index &filter :: get_index();
Return the index storing all the filter data.
(Only available if -DCONFIG_TEST is set)

23
include/filter.h Normal file
View File

@ -0,0 +1,23 @@
/*
* Copyright 2013 (c) Anna Schumaker.
*/
#ifndef OCARINA_FILTER_H
#define OCARINA_FILTER_H
#include <index.h>
#include <string>
/*
 * Text filtering API: tracks are indexed by the words of a string and
 * can then be looked up by word prefixes.
 */
namespace filter {
/*
 * Split the string into words, normalize each word, and index every
 * leading substring of each word under the given track id.
 */
void add(const std::string &, unsigned int);
/*
 * Parse the string the same way as add() and store in the set the track
 * ids that match every resulting word. The set is left untouched when
 * the string contains no words.
 */
void search(const std::string &, std::set<unsigned int> &);
/* Print the size and hit count of the lowercase conversion cache. */
void print_cache_stats();
#ifdef CONFIG_TEST
/* Return the index holding all filter data (test builds only). */
Index &get_index();
#endif /* CONFIG_TEST */
};
#endif /* OCARINA_FILTER_H */

View File

@ -3,6 +3,12 @@ Import("env", "CONFIG")
build = []
if CONFIG.FILTER:
CONFIG.INDEX = True
build += [ env.Object("filter.cpp") ]
####################
if CONFIG.DATABASE:
CONFIG.FILE = True
build += [ env.Object("database.cpp") ]
@ -11,6 +17,7 @@ if CONFIG.INDEX:
CONFIG.FILE = True
build += [ env.Object("index.cpp") ]
####################
if CONFIG.FILE:
CONFIG.package("glib-2.0")

144
lib/filter.cpp Normal file
View File

@ -0,0 +1,144 @@
/*
* Copyright 2013 (c) Anna Schumaker.
*/
#include <filter.h>
#include <index.h>
#include <print.h>
#include <algorithm>
#include <list>
#include <map>
#include <set>
/* Index that add_substrings() fills with (word prefix, track id) pairs. */
static Index filter_index("");
/* Maps a raw word to its normalized form, as computed by lower_text(). */
static std::map<std::string, std::string> lowercase_cache;
/* Number of lower_text() calls that were answered from lowercase_cache. */
static unsigned int lowercase_cache_hits = 0;
/*
 * Split text into words and append them, in order, to ret.
 *
 * Words are separated by whitespace and by the characters \ / , ; ( ) _ - ~ + "
 * Runs of consecutive delimiters never produce empty words. All other
 * characters (including remaining punctuation) are kept as part of the
 * word; lower_text() strips them later.
 *
 * Whitespace is spelled with escape sequences so that the tab case
 * cannot silently turn into a second ' ' label (a duplicate case label
 * does not compile) when an editor or tool rewrites literal tabs.
 */
static void split_text(const std::string &text, std::list<std::string> &ret)
{
	std::string word;

	for (std::string::size_type i = 0; i < text.size(); i++) {
		const char c = text[i];

		switch (c) {
		case '\\':
		case '/':
		case ',':
		case ';':
		case '(':
		case ')':
		case '_':
		case '-':
		case '~':
		case '+':
		case '"':
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
		case '\v':
			/* Delimiter: flush the word collected so far, if any. */
			if (!word.empty()) {
				ret.push_back(word);
				word.clear();
			}
			break;
		default:
			word += c;
		}
	}

	/* The text may end without a trailing delimiter. */
	if (!word.empty())
		ret.push_back(word);
}
/*
 * Normalize a single word and append the result to ret.
 *
 * Normalizing keeps ASCII letters (converted to lowercase) and ASCII
 * digits, and drops every other character. Results are memoized in
 * lowercase_cache, keyed by the raw word; lowercase_cache_hits counts
 * how often the cache answered the request.
 *
 * A word made up only of dropped characters (for example "...")
 * normalizes to the empty string. Such a result is still cached but is
 * NOT appended to ret: nothing is ever indexed under the empty string,
 * so passing it on as a search term would make every search containing
 * stray punctuation come back empty.
 */
static void lower_text(const std::string &text, std::list<std::string> &ret)
{
	std::map<std::string, std::string>::iterator it = lowercase_cache.find(text);
	if (it != lowercase_cache.end()) {
		lowercase_cache_hits++;
		if (!it->second.empty())
			ret.push_back(it->second);
		return;
	}

	std::string word;
	for (std::string::size_type i = 0; i < text.size(); i++) {
		const char c = text[i];

		if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
			word += c;
		else if (c >= 'A' && c <= 'Z')
			word += static_cast<char>(c - 'A' + 'a');
	}

	lowercase_cache[text] = word;
	if (!word.empty())
		ret.push_back(word);
}
/*
 * Turn text into normalized words: split it with split_text(), then run
 * every piece through lower_text(), which appends its output to ret.
 */
static void parse_text(const std::string &text, std::list<std::string> &ret)
{
	std::list<std::string> words;
	split_text(text, words);

	std::list<std::string>::iterator cur = words.begin();
	while (cur != words.end()) {
		lower_text(*cur, ret);
		++cur;
	}
}
/*
 * Insert every leading substring of text into filter_index for
 * track_id. For "dalek" the inserted keys are d, da, dal, dale, dalek.
 * An empty text inserts nothing.
 */
static void add_substrings(const std::string &text, unsigned int track_id)
{
	std::string prefix;

	for (std::string::size_type pos = 0; pos < text.size(); ++pos) {
		/* Grow the prefix by one character per round. */
		prefix += text[pos];
		filter_index.insert(prefix, track_id);
	}
}
/*
 * Index track_id under every word of text: the text is parsed into
 * normalized words and each word's leading substrings are added to the
 * filter index.
 */
void filter :: add(const std::string &text, unsigned int track_id)
{
	std::list<std::string> words;
	parse_text(text, words);

	std::list<std::string>::iterator word = words.begin();
	while (word != words.end()) {
		add_substrings(*word, track_id);
		++word;
	}
}
/*
 * Narrow res down to the ids that are also stored in the index under
 * text, i.e. res = res INTERSECT filter_index[text].
 *
 * The index entry is looked up once and bound by const reference, so
 * begin() and end() are guaranteed to come from the same set and no
 * copy of the set is made.
 */
static void find_intersection(std::string &text, std::set<unsigned int> &res)
{
	const std::set<unsigned int> &terms = filter_index[text];
	std::set<unsigned int> tmp;

	std::set_intersection(terms.begin(), terms.end(),
			res.begin(), res.end(),
			std::inserter(tmp, tmp.begin()));
	res.swap(tmp);
}
/*
 * Find the track ids that match every word of text.
 *
 * The text is parsed into normalized words. res is overwritten with the
 * index entry of the first word and then intersected with the entry of
 * each following word. When parsing yields no words at all, res is left
 * exactly as the caller passed it in.
 */
void filter :: search(const std::string &text, std::set<unsigned int> &res)
{
	std::list<std::string> terms;
	parse_text(text, terms);
	if (terms.empty())
		return;

	std::list<std::string>::iterator term = terms.begin();
	res = filter_index[*term];
	++term;

	while (term != terms.end()) {
		find_intersection(*term, res);
		++term;
	}
}
/*
 * Print the number of entries in lowercase_cache and the number of
 * lookups it has answered.
 *
 * size() returns a size_t, which does not match the %u conversion on
 * platforms where size_t is wider than unsigned int, so the value is
 * converted explicitly (print() is assumed to follow printf-style
 * formatting - confirm against print.h).
 */
void filter :: print_cache_stats()
{
	print("Lowercase cache size: %u\n",
			static_cast<unsigned int>(lowercase_cache.size()));
	print("Lowercase cache hits: %u\n", lowercase_cache_hits);
}
#ifdef CONFIG_TEST
/*
 * Give callers direct access to the index that backs the filter.
 * Only compiled when CONFIG_TEST is defined.
 */
Index &filter :: get_index()
{
return filter_index;
}
#endif /* CONFIG_TEST */

View File

@ -43,7 +43,7 @@ Export("Test")
# Read SConscript files
scripts = [ "basic", "database", "file", "index" ]
scripts = [ "basic", "database", "file", "filter", "index" ]
for s in scripts:
CONFIG.reset()
CONFIG.TEST = True

6
tests/filter/Sconscript Normal file
View File

@ -0,0 +1,6 @@
#!/usr/bin/python
# Build script for the filter test.
Import("Test", "CONFIG")
# Turn on the filter feature flag for this test's build.
CONFIG.FILTER = True
# Register the "filter" test built from filter.cpp
# (Test is imported from the parent script; presumably it builds and runs the test - verify there).
Test("filter", "filter.cpp")

129
tests/filter/filter.cpp Normal file
View File

@ -0,0 +1,129 @@
/*
* Copyright 2013 (c) Anna Schumaker.
*/
#include <filter.h>
#include <print.h>
#include <vector>
/*
 * Test corpus. test_0() adds every entry to the filter using its array
 * position as the track id, and test_search() maps result ids back to
 * entries of this array.
 *
 * Adjacent string literals with no comma between them are a single entry
 * that has been wrapped over two source lines.
 */
std::string quotes [] = {
"What heroes like best is themselves.",
"The sun rose slowly, as if wasn't sure it was worth all the effort",
"Of course I'm sane, when trees start talking to me, I don't talk back",
"Darkness isn't the opposite of light, it's simply its absence",
"Time passed, which, basically, is its job",
"Million-to-one chances crop up nine times out of ten",
"CATS ARE NICE",
"Death isn't cruel - merely terribly, terribly good at his job",
"Thunder rolled ... it rolled a six",
"DROP THE SCYTHE, AND TURN AROUND SLOWLY",
"Time is like a drug. Too much of it kills you",
"Gravity is a habit that is hard to shake off",
"You do not ask people like that what they are thinking about in case "
"they turn around very slowly and say 'You'",
"Do unto others before they do unto you",
"Not a man to mince words. People, yes. But not words",
"An elf's strength lay in persuading others they were weak",
"May you live in interesting times",
"WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN "
"ABOUT THAT",
"I AM DEATH, NOT TAXES. I TURN UP ONLY ONCE",
"All tribal myths are true, for a given value of 'true'",
"The Truth Shall Make Ye Fret",
"When you look into the abyss, it's not supposed to wave back",
"I have no use for people who have learned the limits of the possible",
"Speak softly and employ a huge man with a crowbar",
"Truly, the leopard can change his shorts",
"+++Divide By Cucumber Error, Please Reinstall Universe And Reboot+++",
"+++Whoops! Here comes the cheese! +++",
"Bring out yer dead, bring out yer living dead",
"1. ALL FUNGI ARE EDIBLE. 2. SOME FUNGI ARE NOT EDIBLE MORE THAN ONCE.",
"A lot of farming is about manure",
"I am very attached to my fingers, and I like to think of them as attached to me",
"There be a lot o' men who became heroes cuz they wuz too scared tae run",
"If only the pawns united, make talked the rooks round, the whole board "
"could've been a republic in a dozen moves",
"Always remember that the crowd that applauds your coronation is the same "
"crowd that will applaud your beheading. People like a show.",
};
/* Number of entries in quotes, derived from the array size. */
static const unsigned int num_quotes = sizeof(quotes) / sizeof(std::string);
/*
 * Print every key of the index on one line, each preceded by a space,
 * after a "Found keys:" label.
 */
void print_keys(Index &index)
{
	print("Found keys:");

	std::set<std::string>::iterator key = index.keys_begin();
	while (key != index.keys_end()) {
		print(" %s", key->c_str());
		++key;
	}

	print("\n");
}
/*
 * Dump the whole index: first the list of keys (see print_keys()), then
 * one "index[key] = {id id ...}" line per key.
 *
 * The id set of a key is looked up once per key and bound by const
 * reference, rather than being fetched again for every begin()/end()
 * call of the inner loop. Ids are unsigned, so they are printed with %u.
 */
void print_index(Index &index)
{
	std::set<std::string>::iterator s_it;

	print("=== Printing index ===\n");
	print_keys(index);

	for (s_it = index.keys_begin(); s_it != index.keys_end(); s_it++) {
		std::string key = *s_it;
		const std::set<unsigned int> &ids = index[key];
		std::set<unsigned int>::const_iterator u_it;

		print("index[%s] = {", key.c_str());
		for (u_it = ids.begin(); u_it != ids.end(); u_it++) {
			/* Separate ids with a single space, none before the first. */
			if (u_it != ids.begin())
				print(" ");
			print("%u", *u_it);
		}
		print("}\n");
	}

	print("\n");
}
/*
 * Run one search through the filter and print the query, the number of
 * matches, and the quote belonging to each returned id.
 *
 * results.size() is a size_t; it is converted to unsigned int so that
 * it matches the %u conversion on platforms where size_t is wider
 * (print() is assumed to follow printf-style formatting).
 */
void test_search(const std::string &text)
{
	std::set<unsigned int> results;
	std::set<unsigned int>::iterator it;

	filter :: search(text, results);

	print("Search for: \"%s\" returned %u matches:\n",
			text.c_str(), static_cast<unsigned int>(results.size()));
	for (it = results.begin(); it != results.end(); it++)
		print("\t%s\n", quotes[*it].c_str());
	print("\n");
}
/*
 * Test 0: add every quote to the filter (track id == array position),
 * then print the resulting index and the cache statistics.
 */
void test_0()
{
	unsigned int id = 0;
	while (id < num_quotes) {
		filter :: add(quotes[id], id);
		++id;
	}

	print_index(filter :: get_index());
	filter :: print_cache_stats();
	print("\n");
}
/*
 * Test 1: run a fixed list of queries (empty, unknown words, single and
 * multiple words, mixed case, prefixes, embedded delimiters) and then
 * print the cache statistics.
 */
void test_1()
{
	static const char *queries[] = {
		"",
		"Rincewind",
		"Rincewind Twoflower Luggage",
		"the",
		"the is",
		"THE IS",
		"th i",
		"th i even",
		"Th/i-eVEn",
		"whoops",
	};
	const unsigned int num_queries = sizeof(queries) / sizeof(queries[0]);

	for (unsigned int i = 0; i < num_queries; i++)
		test_search(queries[i]);

	filter :: print_cache_stats();
}
/*
 * Test entry point. The order matters: test_0() fills the filter that
 * the searches in test_1() run against. Command line arguments are not
 * used.
 */
int main(int argc, char **argv)
{
test_0();
test_1();
return 0;
}