filter: Implement text filtering
Because every programming project needs a test case centered around Discworld quotes ... Signed-off-by: Anna Schumaker <schumaker.anna@gmail.com>
This commit is contained in:
parent
379a96fe13
commit
2b15048777
10
config
10
config
|
@ -26,6 +26,7 @@ class Config:
|
|||
self.ENV = CONFIG_ENV
|
||||
self.DATABASE = False
|
||||
self.FILE = False
|
||||
self.FILTER = False
|
||||
self.INDEX = False
|
||||
self.TEST = False
|
||||
|
||||
|
@ -34,13 +35,16 @@ class Config:
|
|||
|
||||
def reconfigure(self):
|
||||
env.Replace( CCFLAGS = self.ENV )
|
||||
if self.DATABASE: env.Append( CCFLAGS = "-DCONFIG_DATABASE" )
|
||||
if self.FILE: env.Append( CCFLAGS = "-DCONFIG_FILE" )
|
||||
if self.TEST: env.Append( CCFLAGS = "-DCONFIG_TEST" )
|
||||
if self.DATABASE: env.Append( CCFLAGS = [ "-DCONFIG_DATABASE" ])
|
||||
if self.FILE: env.Append( CCFLAGS = [ "-DCONFIG_FILE" ])
|
||||
if self.FILTER: env.Append( CCFLAGS = [ "-DCONFIG_FILTER" ])
|
||||
if self.INDEX: env.Append( CCFLAGS = [ "-DCONFIG_INDEX" ])
|
||||
if self.TEST: env.Append( CCFLAGS = [ "-DCONFIG_TEST" ])
|
||||
|
||||
def reset(self):
|
||||
self.DATABASE = False
|
||||
self.FILE = False
|
||||
self.FILTER = False
|
||||
self.INDEX = False
|
||||
self.TEST = False
|
||||
self.reconfigure()
|
||||
|
|
|
@ -8,30 +8,45 @@
|
|||
index
|
||||
|
||||
Filter: (lib/filter.cpp)
|
||||
Filtering is used to generate a subset of songs for easier searching.
|
||||
Filtering is used to generate a subset of songs displayed by the UI
|
||||
that users can choose from. The inverted index is generated at startup
|
||||
so there is no need for a remove() function, since it will be wiped
|
||||
the next time the application starts.
|
||||
|
||||
- Index:
|
||||
map<string, string> lowercase_cache;
|
||||
map<string, set<string>> substring_cache;
|
||||
Index filter_index("");
|
||||
map<string, string> lowercase_cache;
|
||||
unsigned int lowercase_cache_hits;
|
||||
|
||||
- Parsing:
|
||||
1) Convert the provided string into a list of words, using whitespace
|
||||
and the following characters as delimiters: \/,;()_~+"
|
||||
and the following characters as delimiters: \/,;()_-~+"
|
||||
|
||||
For each word:
|
||||
2) Check the lowercase_cache to see if we have seen the word before,
|
||||
a) If we have, return the stored string
|
||||
b) Otherwise, convert the string to lowercase and strip out remaining
|
||||
special characters. Add the result to the lowercase_cache;
|
||||
3) Check the substring_cache to see if we have seen the word before,
|
||||
a) If we have, use the substring set returned
|
||||
b) Break the word into substrings from the front only. For
|
||||
example: "dalek" would contain the substrings
|
||||
{d, da, dal, dale, dalek}. Add to the substring cache.
|
||||
|
||||
- API:
|
||||
filter :: add(string, track_id);
|
||||
Parses the string and adds the track_id to the index.
|
||||
void filter :: add(string, track_id);
|
||||
Parse the string into substrings following the "Parsing"
|
||||
section (above). Add each (substring, track_id) pair to the
|
||||
filter_index.
|
||||
|
||||
To generate substrings, iterate over the word starting from
|
||||
the front. For example: "dalek" would contain the substrings
|
||||
{d, da, dal, dale, dalek}.
|
||||
|
||||
void filter :: search(string, set<track_id> &);
|
||||
Parse the string and fill in the set with matching tracks.
|
||||
Parse the string into substrings following the "Parsing"
|
||||
section (above). We want to find track_ids that match ALL
|
||||
substrings, so take the intersection of all sets returned by
|
||||
the filter_index for a given substring.
|
||||
|
||||
void filter :: print_cache_stats();
|
||||
Print cache hit and size information.
|
||||
|
||||
Index &filter :: get_index();
|
||||
Return the index storing all the filter data.
|
||||
(Only available if -DCONFIG_TEST is set)
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
/*
|
||||
* Copyright 2013 (c) Anna Schumaker.
|
||||
*/
|
||||
#ifndef OCARINA_FILTER_H
|
||||
#define OCARINA_FILTER_H
|
||||
|
||||
#include <index.h>
|
||||
#include <string>
|
||||
|
||||
namespace filter {

	/*
	 * Parse the given text into normalized words and record every
	 * leading substring of each word against the given track id.
	 */
	void add(const std::string &, unsigned int);

	/*
	 * Parse the given text and fill the set with the ids matching
	 * every parsed term.  The set is left untouched when the text
	 * contains no searchable terms.
	 */
	void search(const std::string &, std::set<unsigned int> &);

	/* Print the size and hit count of the lowercase cache. */
	void print_cache_stats();

#ifdef CONFIG_TEST
	/* Expose the underlying index so tests can inspect it. */
	Index &get_index();
#endif /* CONFIG_TEST */

}
|
||||
|
||||
#endif /* OCARINA_FILTER_H */
|
|
@ -3,6 +3,12 @@ Import("env", "CONFIG")
|
|||
|
||||
build = []
|
||||
|
||||
if CONFIG.FILTER:
|
||||
CONFIG.INDEX = True
|
||||
build += [ env.Object("filter.cpp") ]
|
||||
|
||||
####################
|
||||
|
||||
if CONFIG.DATABASE:
|
||||
CONFIG.FILE = True
|
||||
build += [ env.Object("database.cpp") ]
|
||||
|
@ -11,6 +17,7 @@ if CONFIG.INDEX:
|
|||
CONFIG.FILE = True
|
||||
build += [ env.Object("index.cpp") ]
|
||||
|
||||
####################
|
||||
|
||||
if CONFIG.FILE:
|
||||
CONFIG.package("glib-2.0")
|
||||
|
|
|
@ -0,0 +1,144 @@
|
|||
/*
|
||||
* Copyright 2013 (c) Anna Schumaker.
|
||||
*/
|
||||
|
||||
#include <filter.h>
|
||||
#include <index.h>
|
||||
#include <print.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
static Index filter_index("");
|
||||
static std::map<std::string, std::string> lowercase_cache;
|
||||
static unsigned int lowercase_cache_hits = 0;
|
||||
|
||||
/*
 * Split `text` into words and append each non-empty word to `ret`.
 *
 * Words are separated by spaces, tabs and any of the characters
 *	\ / , ; ( ) _ - ~ + "
 * Runs of delimiters never produce empty words.  Case and any other
 * punctuation are left untouched.
 */
static void split_text(const std::string &text, std::list<std::string> &ret)
{
	std::string word;

	for (std::string::size_type i = 0; i < text.size(); i++) {
		const char c = text[i];

		switch (c) {
		case '\\':
		case '/':
		case ',':
		case ';':
		case '(':
		case ')':
		case '_':
		case '-':
		case '~':
		case '+':
		case '"':
		case ' ':
		/*
		 * The tab is written as an escape so it cannot be turned
		 * into a second ' ' label (a duplicate case value) by
		 * whitespace conversion.
		 */
		case '\t':
			if (!word.empty()) {
				ret.push_back(word);
				word.clear();
			}
			break;
		default:
			word += c;
		}
	}

	/* Flush the final word when the text does not end in a delimiter. */
	if (!word.empty())
		ret.push_back(word);
}
|
||||
|
||||
static void lower_text(const std::string &text, std::list<std::string> &ret)
|
||||
{
|
||||
char c;
|
||||
std::string word;
|
||||
std::map<std::string, std::string>::iterator it = lowercase_cache.find(text);
|
||||
|
||||
if (it != lowercase_cache.end()) {
|
||||
lowercase_cache_hits++;
|
||||
ret.push_back(it->second);
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < text.size(); i++) {
|
||||
c = text[i];
|
||||
if ( (c >= 'a') && (c <= 'z') )
|
||||
word += c;
|
||||
else if ( (c >= 'A') && (c <= 'Z') )
|
||||
word += (c + ('a' - 'A'));
|
||||
else if ( (c >= '0') && (c <= '9') )
|
||||
word += c;
|
||||
}
|
||||
|
||||
lowercase_cache[text] = word;
|
||||
ret.push_back(word);
|
||||
}
|
||||
|
||||
static void parse_text(const std::string &text, std::list<std::string> &ret)
|
||||
{
|
||||
std::list<std::string> split;
|
||||
std::list<std::string>::iterator it;
|
||||
|
||||
split_text(text, split);
|
||||
for (it = split.begin(); it != split.end(); it++)
|
||||
lower_text(*it, ret);
|
||||
}
|
||||
|
||||
static void add_substrings(const std::string &text, unsigned int track_id)
|
||||
{
|
||||
for (unsigned int i = 1; i <= text.size(); i++)
|
||||
filter_index.insert(text.substr(0, i), track_id);
|
||||
}
|
||||
|
||||
void filter :: add(const std::string &text, unsigned int track_id)
|
||||
{
|
||||
std::list<std::string> parsed;
|
||||
std::list<std::string>::iterator it;
|
||||
|
||||
parse_text(text, parsed);
|
||||
for (it = parsed.begin(); it != parsed.end(); it++)
|
||||
add_substrings(*it, track_id);
|
||||
}
|
||||
|
||||
static void find_intersection(std::string &text, std::set<unsigned int> &res)
|
||||
{
|
||||
std::set<unsigned int> terms = filter_index[text];
|
||||
std::set<unsigned int> tmp;
|
||||
|
||||
set_intersection(filter_index[text].begin(), filter_index[text].end(),
|
||||
res.begin(), res.end(),
|
||||
std::inserter<std::set<unsigned int> >(tmp, tmp.begin()));
|
||||
res.swap(tmp);
|
||||
}
|
||||
|
||||
void filter :: search(const std::string &text, std::set<unsigned int> &res)
|
||||
{
|
||||
std::list<std::string> parsed;
|
||||
std::list<std::string>::iterator it;
|
||||
|
||||
parse_text(text, parsed);
|
||||
if (parsed.size() == 0)
|
||||
return;
|
||||
|
||||
it = parsed.begin();
|
||||
res = filter_index[*it];
|
||||
|
||||
for (it++; it != parsed.end(); it++)
|
||||
find_intersection(*it, res);
|
||||
}
|
||||
|
||||
void filter :: print_cache_stats()
|
||||
{
|
||||
print("Lowercase cache size: %u\n", lowercase_cache.size());
|
||||
print("Lowercase cache hits: %u\n", lowercase_cache_hits);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_TEST
/*
 * Hand out a reference to the file-local filter_index.  Only compiled
 * when CONFIG_TEST is defined.
 */
Index &filter :: get_index()
{
	return filter_index;
}
#endif /* CONFIG_TEST */
|
|
@ -43,7 +43,7 @@ Export("Test")
|
|||
|
||||
|
||||
# Read SConscript files
|
||||
scripts = [ "basic", "database", "file", "index" ]
|
||||
scripts = [ "basic", "database", "file", "filter", "index" ]
|
||||
for s in scripts:
|
||||
CONFIG.reset()
|
||||
CONFIG.TEST = True
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
#!/usr/bin/python
|
||||
Import("Test", "CONFIG")
|
||||
|
||||
CONFIG.FILTER = True
|
||||
|
||||
Test("filter", "filter.cpp")
|
|
@ -0,0 +1,129 @@
|
|||
/*
|
||||
* Copyright 2013 (c) Anna Schumaker.
|
||||
*/
|
||||
#include <filter.h>
|
||||
#include <print.h>
|
||||
#include <vector>
|
||||
|
||||
std::string quotes [] = {
|
||||
"What heroes like best is themselves.",
|
||||
"The sun rose slowly, as if wasn't sure it was worth all the effort",
|
||||
"Of course I'm sane, when trees start talking to me, I don't talk back",
|
||||
"Darkness isn't the opposite of light, it's simply its absence",
|
||||
"Time passed, which, basically, is its job",
|
||||
"Million-to-one chances crop up nine times out of ten",
|
||||
"CATS ARE NICE",
|
||||
"Death isn't cruel - merely terribly, terribly good at his job",
|
||||
"Thunder rolled ... it rolled a six",
|
||||
"DROP THE SCYTHE, AND TURN AROUND SLOWLY",
|
||||
"Time is like a drug. Too much of it kills you",
|
||||
"Gravity is a habit that is hard to shake off",
|
||||
"You do not ask people like that what they are thinking about in case "
|
||||
"they turn around very slowly and say 'You'",
|
||||
"Do unto others before they do unto you",
|
||||
"Not a man to mince words. People, yes. But not words",
|
||||
"An elf's strength lay in persuading others they were weak",
|
||||
"May you live in interesting times",
|
||||
"WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN "
|
||||
"ABOUT THAT",
|
||||
"I AM DEATH, NOT TAXES. I TURN UP ONLY ONCE",
|
||||
"All tribal myths are true, for a given value of 'true'",
|
||||
"The Truth Shall Make Ye Fret",
|
||||
"When you look into the abyss, it's not supposed to wave back",
|
||||
"I have no use for people who have learned the limits of the possible",
|
||||
"Speak softly and employ a huge man with a crowbar",
|
||||
"Truly, the leopard can change his shorts",
|
||||
"+++Divide By Cucumber Error, Please Reinstall Universe And Reboot+++",
|
||||
"+++Whoops! Here comes the cheese! +++",
|
||||
"Bring out yer dead, bring out yer living dead",
|
||||
"1. ALL FUNGI ARE EDIBLE. 2. SOME FUNGI ARE NOT EDIBLE MORE THAN ONCE.",
|
||||
"A lot of farming is about manure",
|
||||
"I am very attached to my fingers, and I like to think of them as attached to me",
|
||||
"There be a lot o' men who became heroes cuz they wuz too scared tae run",
|
||||
"If only the pawns united, make talked the rooks round, the whole board "
|
||||
"could've been a republic in a dozen moves",
|
||||
"Always remember that the crowd that applauds your coronation is the same "
|
||||
"crowd that will applaud your beheading. People like a show.",
|
||||
};
|
||||
|
||||
static const unsigned int num_quotes = sizeof(quotes) / sizeof(std::string);
|
||||
|
||||
/*
 * Print every key of `index` on a single line, each preceded by a
 * space, after a "Found keys:" label.
 */
void print_keys(Index &index)
{
	print("Found keys:");

	std::set<std::string>::iterator key = index.keys_begin();
	while (key != index.keys_end()) {
		print(" %s", key->c_str());
		key++;
	}

	print("\n");
}
|
||||
|
||||
/*
 * Dump `index`: a header, the list of keys, then one line per key of
 * the form "index[key] = {id id ...}".
 */
void print_index(Index &index)
{
	std::set<std::string>::iterator s_it;
	std::set<unsigned int>::const_iterator u_it;

	print("=== Printing index ===\n");
	print_keys(index);

	for (s_it = index.keys_begin(); s_it != index.keys_end(); s_it++) {
		const std::string key = *s_it;
		/*
		 * Look the key up once instead of once per begin()/end()
		 * call, so all iterators belong to the same set.
		 */
		const std::set<unsigned int> &ids = index[key];

		print("index[%s] = {", key.c_str());

		for (u_it = ids.begin(); u_it != ids.end(); u_it++) {
			if (u_it != ids.begin())
				print(" ");
			/* The ids are unsigned, so use "%u" rather than "%d". */
			print("%u", *u_it);
		}

		print("}\n");
	}
	print("\n");
}
|
||||
|
||||
void test_search(const std::string &text)
|
||||
{
|
||||
std::set<unsigned int> results;
|
||||
std::set<unsigned int>::iterator it;
|
||||
|
||||
filter :: search(text, results);
|
||||
|
||||
print("Search for: \"%s\" returned %u matches:\n",
|
||||
text.c_str(), results.size());
|
||||
|
||||
for (it = results.begin(); it != results.end(); it++)
|
||||
print("\t%s\n", quotes[*it].c_str());
|
||||
print("\n");
|
||||
}
|
||||
|
||||
void test_0()
|
||||
{
|
||||
for (unsigned int i = 0; i < num_quotes; i++)
|
||||
filter :: add(quotes[i], i);
|
||||
print_index(filter :: get_index());
|
||||
filter :: print_cache_stats();
|
||||
print("\n");
|
||||
}
|
||||
|
||||
void test_1()
|
||||
{
|
||||
test_search("");
|
||||
test_search("Rincewind");
|
||||
test_search("Rincewind Twoflower Luggage");
|
||||
test_search("the");
|
||||
test_search("the is");
|
||||
test_search("THE IS");
|
||||
test_search("th i");
|
||||
test_search("th i even");
|
||||
test_search("Th/i-eVEn");
|
||||
test_search("whoops");
|
||||
filter :: print_cache_stats();
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
test_0();
|
||||
test_1();
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue