filter: Implement text filtering
Because every programming project needs a test case centered around Discworld quotes ... Signed-off-by: Anna Schumaker <schumaker.anna@gmail.com>
This commit is contained in:
parent
379a96fe13
commit
2b15048777
10
config
10
config
|
@ -26,6 +26,7 @@ class Config:
|
||||||
self.ENV = CONFIG_ENV
|
self.ENV = CONFIG_ENV
|
||||||
self.DATABASE = False
|
self.DATABASE = False
|
||||||
self.FILE = False
|
self.FILE = False
|
||||||
|
self.FILTER = False
|
||||||
self.INDEX = False
|
self.INDEX = False
|
||||||
self.TEST = False
|
self.TEST = False
|
||||||
|
|
||||||
|
@ -34,13 +35,16 @@ class Config:
|
||||||
|
|
||||||
def reconfigure(self):
|
def reconfigure(self):
|
||||||
env.Replace( CCFLAGS = self.ENV )
|
env.Replace( CCFLAGS = self.ENV )
|
||||||
if self.DATABASE: env.Append( CCFLAGS = "-DCONFIG_DATABASE" )
|
if self.DATABASE: env.Append( CCFLAGS = [ "-DCONFIG_DATABASE" ])
|
||||||
if self.FILE: env.Append( CCFLAGS = "-DCONFIG_FILE" )
|
if self.FILE: env.Append( CCFLAGS = [ "-DCONFIG_FILE" ])
|
||||||
if self.TEST: env.Append( CCFLAGS = "-DCONFIG_TEST" )
|
if self.FILTER: env.Append( CCFLAGS = [ "-DCONFIG_FILTER" ])
|
||||||
|
if self.INDEX: env.Append( CCFLAGS = [ "-DCONFIG_INDEX" ])
|
||||||
|
if self.TEST: env.Append( CCFLAGS = [ "-DCONFIG_TEST" ])
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.DATABASE = False
|
self.DATABASE = False
|
||||||
self.FILE = False
|
self.FILE = False
|
||||||
|
self.FILTER = False
|
||||||
self.INDEX = False
|
self.INDEX = False
|
||||||
self.TEST = False
|
self.TEST = False
|
||||||
self.reconfigure()
|
self.reconfigure()
|
||||||
|
|
|
@ -8,30 +8,45 @@
|
||||||
index
|
index
|
||||||
|
|
||||||
Filter: (lib/filter.cpp)
|
Filter: (lib/filter.cpp)
|
||||||
Filtering is used to generate a subset of songs for easier searching.
|
Filtering is used to generate a subset of songs displayed by the UI
|
||||||
|
that users can choose from. The inverted index is generated at startup
|
||||||
|
so there is no need for a remove() function, since it will be wiped
|
||||||
|
the next time the application starts.
|
||||||
|
|
||||||
- Index:
|
- Index:
|
||||||
map<string, string> lowercase_cache;
|
|
||||||
map<string, set<string>> substring_cache;
|
|
||||||
Index filter_index("");
|
Index filter_index("");
|
||||||
|
map<string, string> lowercase_cache;
|
||||||
|
unsigned int lowercase_cache_hits;
|
||||||
|
|
||||||
- Parsing:
|
- Parsing:
|
||||||
1) Convert the provided string into a list of words, using whitespace
|
1) Convert the provided string into a list of words, using whitespace
|
||||||
and the following characters as delimiters: \/,;()_~+"
|
and the following characters as delimiters: \/,;()_-~+"
|
||||||
|
|
||||||
For each word:
|
For each word:
|
||||||
2) Check the lowercase_cache to see if we have seen the word before,
|
2) Check the lowercase_cache to see if we have seen the word before,
|
||||||
a) If we have, return the stored string
|
a) If we have, return the stored string
|
||||||
b) Convert the string to lowercase and strip out remaining
|
b) Otherwise, convert the string to lowercase and strip out remaining
|
||||||
special characters. Add the result to the lowercase_cache;
|
special characters. Add the result to the lowercase_cache;
|
||||||
3) Check the substring_cache to see if we have seen the word before,
|
|
||||||
a) If we have, use the substring set returned
|
|
||||||
b) Break the word into substrings from the front only. For
|
|
||||||
example: "dalek" would contain the substrings
|
|
||||||
{d, da, dal, dale, dalek}. Add to the substring cache.
|
|
||||||
|
|
||||||
- API:
|
- API:
|
||||||
filter :: add(string, track_id);
|
void filter :: add(string, track_id);
|
||||||
Parses the string and adds the track_id to the index.
|
Parse the string into substrings following the "Parsing"
|
||||||
|
section (above). Add each (substring, track_id) pair to the
|
||||||
|
filter_index.
|
||||||
|
|
||||||
|
To generate substrings, iterate over the word starting from
|
||||||
|
the front. For example: "dalek" would contain the substrings
|
||||||
|
{d, da, dal, dale, dalek}.
|
||||||
|
|
||||||
void filter :: search(string, set<track_id> &);
|
void filter :: search(string, set<track_id> &);
|
||||||
Parse the string and fill in the set with matching tracks.
|
Parse the string into substrings following the "Parsing"
|
||||||
|
section (above). We want to find track_ids that match ALL
|
||||||
|
substrings, so take the intersection of all sets returned by
|
||||||
|
the filter_index for a given substring.
|
||||||
|
|
||||||
|
void filter :: print_cache_stats();
|
||||||
|
Print cache hit and size information.
|
||||||
|
|
||||||
|
Index &filter :: get_index();
|
||||||
|
Return the index storing all the filter data.
|
||||||
|
(Only available if -DCONFIG_TEST is set)
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2013 (c) Anna Schumaker.
|
||||||
|
*/
|
||||||
|
#ifndef OCARINA_FILTER_H
#define OCARINA_FILTER_H

#include <index.h>
#include <set>
#include <string>

/*
 * Text filtering: maps word prefixes to the ids of the tracks whose
 * text contains a word starting with that prefix.
 */
namespace filter {

	/*
	 * Split the text into words, normalise each word to lowercase
	 * letters and digits, and register every leading substring of
	 * every word for the given track id.
	 */
	void add(const std::string &, unsigned int);

	/*
	 * Split and normalise the text the same way as add(), then fill
	 * the set with the ids that match ALL of the resulting words.
	 * The set is left untouched when the text contains no words.
	 */
	void search(const std::string &, std::set<unsigned int> &);

	/* Print the size of the lowercase cache and its hit count. */
	void print_cache_stats();

#ifdef CONFIG_TEST
	/* Expose the underlying index so tests can inspect it. */
	Index &get_index();
#endif /* CONFIG_TEST */

}

#endif /* OCARINA_FILTER_H */
|
|
@ -3,6 +3,12 @@ Import("env", "CONFIG")
|
||||||
|
|
||||||
build = []
|
build = []
|
||||||
|
|
||||||
|
if CONFIG.FILTER:
|
||||||
|
CONFIG.INDEX = True
|
||||||
|
build += [ env.Object("filter.cpp") ]
|
||||||
|
|
||||||
|
####################
|
||||||
|
|
||||||
if CONFIG.DATABASE:
|
if CONFIG.DATABASE:
|
||||||
CONFIG.FILE = True
|
CONFIG.FILE = True
|
||||||
build += [ env.Object("database.cpp") ]
|
build += [ env.Object("database.cpp") ]
|
||||||
|
@ -11,6 +17,7 @@ if CONFIG.INDEX:
|
||||||
CONFIG.FILE = True
|
CONFIG.FILE = True
|
||||||
build += [ env.Object("index.cpp") ]
|
build += [ env.Object("index.cpp") ]
|
||||||
|
|
||||||
|
####################
|
||||||
|
|
||||||
if CONFIG.FILE:
|
if CONFIG.FILE:
|
||||||
CONFIG.package("glib-2.0")
|
CONFIG.package("glib-2.0")
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2013 (c) Anna Schumaker.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <filter.h>
#include <index.h>
#include <print.h>

#include <algorithm>
#include <iterator>
#include <list>
#include <map>
#include <set>
#include <string>
|
||||||
|
|
||||||
|
/* Maps an original word to its normalised (lowercase, stripped) form. */
typedef std::map<std::string, std::string> LowercaseCache;

/* Index from word prefix to the set of track ids containing it. */
static Index filter_index("");

/* Results of previous lower_text() conversions, keyed by input word. */
static LowercaseCache lowercase_cache;

/* Number of lower_text() calls answered from lowercase_cache. */
static unsigned int lowercase_cache_hits = 0;
|
||||||
|
|
||||||
|
/*
 * Split text into words and append them, in order, to ret.
 *
 * Spaces, tabs and the characters \ / , ; ( ) _ - ~ + " act as
 * delimiters.  Runs of delimiters never produce empty words, and the
 * words are appended unmodified (no case folding happens here).
 */
static void split_text(const std::string &text, std::list<std::string> &ret)
{
	std::string word;

	for (std::string::size_type i = 0; i < text.size(); i++) {
		const char c = text[i];

		switch (c) {
		case '\\':
		case '/':
		case ',':
		case ';':
		case '(':
		case ')':
		case '_':
		case '-':
		case '~':
		case '+':
		case '"':
		case ' ':
		case '\t':
			/* Delimiter: flush the word collected so far, if any. */
			if (!word.empty()) {
				ret.push_back(word);
				word.clear();
			}
			break;
		default:
			word += c;
		}
	}

	/* The text may end without a trailing delimiter. */
	if (!word.empty())
		ret.push_back(word);
}
|
||||||
|
|
||||||
|
static void lower_text(const std::string &text, std::list<std::string> &ret)
|
||||||
|
{
|
||||||
|
char c;
|
||||||
|
std::string word;
|
||||||
|
std::map<std::string, std::string>::iterator it = lowercase_cache.find(text);
|
||||||
|
|
||||||
|
if (it != lowercase_cache.end()) {
|
||||||
|
lowercase_cache_hits++;
|
||||||
|
ret.push_back(it->second);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < text.size(); i++) {
|
||||||
|
c = text[i];
|
||||||
|
if ( (c >= 'a') && (c <= 'z') )
|
||||||
|
word += c;
|
||||||
|
else if ( (c >= 'A') && (c <= 'Z') )
|
||||||
|
word += (c + ('a' - 'A'));
|
||||||
|
else if ( (c >= '0') && (c <= '9') )
|
||||||
|
word += c;
|
||||||
|
}
|
||||||
|
|
||||||
|
lowercase_cache[text] = word;
|
||||||
|
ret.push_back(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void parse_text(const std::string &text, std::list<std::string> &ret)
|
||||||
|
{
|
||||||
|
std::list<std::string> split;
|
||||||
|
std::list<std::string>::iterator it;
|
||||||
|
|
||||||
|
split_text(text, split);
|
||||||
|
for (it = split.begin(); it != split.end(); it++)
|
||||||
|
lower_text(*it, ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void add_substrings(const std::string &text, unsigned int track_id)
|
||||||
|
{
|
||||||
|
for (unsigned int i = 1; i <= text.size(); i++)
|
||||||
|
filter_index.insert(text.substr(0, i), track_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
void filter :: add(const std::string &text, unsigned int track_id)
|
||||||
|
{
|
||||||
|
std::list<std::string> parsed;
|
||||||
|
std::list<std::string>::iterator it;
|
||||||
|
|
||||||
|
parse_text(text, parsed);
|
||||||
|
for (it = parsed.begin(); it != parsed.end(); it++)
|
||||||
|
add_substrings(*it, track_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void find_intersection(std::string &text, std::set<unsigned int> &res)
|
||||||
|
{
|
||||||
|
std::set<unsigned int> terms = filter_index[text];
|
||||||
|
std::set<unsigned int> tmp;
|
||||||
|
|
||||||
|
set_intersection(filter_index[text].begin(), filter_index[text].end(),
|
||||||
|
res.begin(), res.end(),
|
||||||
|
std::inserter<std::set<unsigned int> >(tmp, tmp.begin()));
|
||||||
|
res.swap(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
void filter :: search(const std::string &text, std::set<unsigned int> &res)
|
||||||
|
{
|
||||||
|
std::list<std::string> parsed;
|
||||||
|
std::list<std::string>::iterator it;
|
||||||
|
|
||||||
|
parse_text(text, parsed);
|
||||||
|
if (parsed.size() == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
it = parsed.begin();
|
||||||
|
res = filter_index[*it];
|
||||||
|
|
||||||
|
for (it++; it != parsed.end(); it++)
|
||||||
|
find_intersection(*it, res);
|
||||||
|
}
|
||||||
|
|
||||||
|
void filter :: print_cache_stats()
|
||||||
|
{
|
||||||
|
print("Lowercase cache size: %u\n", lowercase_cache.size());
|
||||||
|
print("Lowercase cache hits: %u\n", lowercase_cache_hits);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEST
/*
 * Test-only accessor: hands out a reference to the file-local
 * filter_index so the test program can inspect it.
 */
Index &filter::get_index()
{
	Index &index = filter_index;

	return index;
}
#endif /* CONFIG_TEST */
|
|
@ -43,7 +43,7 @@ Export("Test")
|
||||||
|
|
||||||
|
|
||||||
# Read SConscript files
|
# Read SConscript files
|
||||||
scripts = [ "basic", "database", "file", "index" ]
|
scripts = [ "basic", "database", "file", "filter", "index" ]
|
||||||
for s in scripts:
|
for s in scripts:
|
||||||
CONFIG.reset()
|
CONFIG.reset()
|
||||||
CONFIG.TEST = True
|
CONFIG.TEST = True
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
Import("Test", "CONFIG")
|
||||||
|
|
||||||
|
CONFIG.FILTER = True
|
||||||
|
|
||||||
|
Test("filter", "filter.cpp")
|
|
@ -0,0 +1,129 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2013 (c) Anna Schumaker.
|
||||||
|
*/
|
||||||
|
#include <filter.h>
|
||||||
|
#include <print.h>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/*
 * Test corpus: each quote is added to the filter with its array
 * position as the track id.  Adjacent string literals are one entry.
 */
std::string quotes[] = {
	/*  0 */ "What heroes like best is themselves.",
	/*  1 */ "The sun rose slowly, as if wasn't sure it was worth all the effort",
	/*  2 */ "Of course I'm sane, when trees start talking to me, I don't talk back",
	/*  3 */ "Darkness isn't the opposite of light, it's simply its absence",
	/*  4 */ "Time passed, which, basically, is its job",
	/*  5 */ "Million-to-one chances crop up nine times out of ten",
	/*  6 */ "CATS ARE NICE",
	/*  7 */ "Death isn't cruel - merely terribly, terribly good at his job",
	/*  8 */ "Thunder rolled ... it rolled a six",
	/*  9 */ "DROP THE SCYTHE, AND TURN AROUND SLOWLY",
	/* 10 */ "Time is like a drug. Too much of it kills you",
	/* 11 */ "Gravity is a habit that is hard to shake off",
	/* 12 */ "You do not ask people like that what they are thinking about in case "
		 "they turn around very slowly and say 'You'",
	/* 13 */ "Do unto others before they do unto you",
	/* 14 */ "Not a man to mince words. People, yes. But not words",
	/* 15 */ "An elf's strength lay in persuading others they were weak",
	/* 16 */ "May you live in interesting times",
	/* 17 */ "WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN "
		 "ABOUT THAT",
	/* 18 */ "I AM DEATH, NOT TAXES. I TURN UP ONLY ONCE",
	/* 19 */ "All tribal myths are true, for a given value of 'true'",
	/* 20 */ "The Truth Shall Make Ye Fret",
	/* 21 */ "When you look into the abyss, it's not supposed to wave back",
	/* 22 */ "I have no use for people who have learned the limits of the possible",
	/* 23 */ "Speak softly and employ a huge man with a crowbar",
	/* 24 */ "Truly, the leopard can change his shorts",
	/* 25 */ "+++Divide By Cucumber Error, Please Reinstall Universe And Reboot+++",
	/* 26 */ "+++Whoops! Here comes the cheese! +++",
	/* 27 */ "Bring out yer dead, bring out yer living dead",
	/* 28 */ "1. ALL FUNGI ARE EDIBLE. 2. SOME FUNGI ARE NOT EDIBLE MORE THAN ONCE.",
	/* 29 */ "A lot of farming is about manure",
	/* 30 */ "I am very attached to my fingers, and I like to think of them as attached to me",
	/* 31 */ "There be a lot o' men who became heroes cuz they wuz too scared tae run",
	/* 32 */ "If only the pawns united, make talked the rooks round, the whole board "
		 "could've been a republic in a dozen moves",
	/* 33 */ "Always remember that the crowd that applauds your coronation is the same "
		 "crowd that will applaud your beheading. People like a show.",
};

/* Number of entries in quotes[]. */
static const unsigned int num_quotes = sizeof(quotes) / sizeof(quotes[0]);
|
||||||
|
|
||||||
|
/*
 * Print all keys of the index on a single line, each preceded by a
 * space, after the "Found keys:" label.
 */
void print_keys(Index &index)
{
	print("Found keys:");

	std::set<std::string>::iterator key = index.keys_begin();
	while (key != index.keys_end()) {
		print(" %s", key->c_str());
		++key;
	}

	print("\n");
}
|
||||||
|
|
||||||
|
/*
 * Dump the whole index: first the list of keys, then one line per key
 * of the form "index[key] = {id id ...}".
 */
void print_index(Index &index)
{
	std::set<std::string>::iterator s_it;
	std::set<unsigned int>::const_iterator u_it;

	print("=== Printing index ===\n");
	print_keys(index);

	for (s_it = index.keys_begin(); s_it != index.keys_end(); ++s_it) {
		std::string key = *s_it;
		/*
		 * Look the set up once per key so begin() and end() are
		 * taken from the same object.
		 */
		const std::set<unsigned int> &ids = index[key];

		print("index[%s] = {", key.c_str());

		for (u_it = ids.begin(); u_it != ids.end(); ++u_it) {
			if (u_it != ids.begin())
				print(" ");
			/* The ids are unsigned, so print them with %u. */
			print("%u", *u_it);
		}

		print("}\n");
	}
	print("\n");
}
|
||||||
|
|
||||||
|
void test_search(const std::string &text)
|
||||||
|
{
|
||||||
|
std::set<unsigned int> results;
|
||||||
|
std::set<unsigned int>::iterator it;
|
||||||
|
|
||||||
|
filter :: search(text, results);
|
||||||
|
|
||||||
|
print("Search for: \"%s\" returned %u matches:\n",
|
||||||
|
text.c_str(), results.size());
|
||||||
|
|
||||||
|
for (it = results.begin(); it != results.end(); it++)
|
||||||
|
print("\t%s\n", quotes[*it].c_str());
|
||||||
|
print("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_0()
|
||||||
|
{
|
||||||
|
for (unsigned int i = 0; i < num_quotes; i++)
|
||||||
|
filter :: add(quotes[i], i);
|
||||||
|
print_index(filter :: get_index());
|
||||||
|
filter :: print_cache_stats();
|
||||||
|
print("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_1()
|
||||||
|
{
|
||||||
|
test_search("");
|
||||||
|
test_search("Rincewind");
|
||||||
|
test_search("Rincewind Twoflower Luggage");
|
||||||
|
test_search("the");
|
||||||
|
test_search("the is");
|
||||||
|
test_search("THE IS");
|
||||||
|
test_search("th i");
|
||||||
|
test_search("th i even");
|
||||||
|
test_search("Th/i-eVEn");
|
||||||
|
test_search("whoops");
|
||||||
|
filter :: print_cache_stats();
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
test_0();
|
||||||
|
test_1();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue