filter: Update design and rewrite the unit test

- filter :: add() now returns the lowercased text
- Don't cache lowercased strings
- Remove functions depending on CONFIG_TEST

Signed-off-by: Anna Schumaker <schumaker.anna@gmail.com>
This commit is contained in:
Anna Schumaker 2014-03-15 20:44:07 -04:00 committed by Anna Schumaker
parent 7eddaac14e
commit 58ed47b37c
11 changed files with 231 additions and 1004 deletions

51
DESIGN
View File

@ -423,54 +423,37 @@ Index:
Filter: (lib/filter.cpp)
Filter:
Filtering is used to generate a subset of songs, displayed by the UI,
that users can choose from. The inverted index is generated at startup
so there is no need for a remove() function, since it will be wiped
the next time the application starts.
- Index:
Database<database :: IndexEntry> filter_index;
map<string, string> lowercase_cache;
unsigned int lowercase_cache_hits;
- Parsing:
1) Convert the provided string into a list of words, using whitespace
and the following characters as delimiters: \/,;()_-~+"
For each word:
2) Check the lowercase_cache to see if we have seen the word before,
a) If we have, return the stored string
b) Convert the string to lowercase and strip out remaining
special characters. Add the result to the lowercase_cache;
- Scan over the input text to create a list of words using the following
characters as delimiters: \/,;()_-~+"
- While scanning, convert the string to lowercase and strip out any
other special characters.
- API:
void filter :: add(string, track_id);
Parse the string into substrings following the "Parsing"
section (above). Add each (substring, track_id) pair to the
filter_index.
std::string filter :: add(const std::string &key, unsigned int track_id);
Parse the key into words following the "Parsing" section above.
Generate substrings for each word and add each (substring,
track_id) pair to the index. Return the lowercased text to the
caller.
To generate substrings, iterate over the word starting from
the front. For example: "goron" would contain the substrings
{g, go, gor, goro, goron}.
void filter :: search(string, set<track_id> &);
Parse the string into substrings following the "Parsing"
section (above). We want to find track_ids that match ALL
substrings, so take the intersection of all sets returned by
the filter_index for a given substring.
std::string filter :: lowercase(const std::string &text);
Parse the text into lowercased words following the "Parsing"
section above. Return the lowercased string to the caller.
std::string filter :: to_lowercase(const std::string &string);
Split the string into words following step 1 of "Parsing"
(above). Assemble and return a result string using the lower
case cache to convert each term to lowercase.
void filter :: print_cache_stats();
Print cache hit and size information.
void filter :: get_index();
This function only exists if CONFIG_TEST is enabled.
Return the index storing all the filter data.
void filter :: search(const std::string &text, std::set<track_id> &res);
This function finds all track_ids matching the input text.
Parse the string into substrings and take the intersection of
all sets returned by the index for each substring.

View File

@ -9,11 +9,9 @@
namespace filter {
void add(const std::string &, unsigned int);
std::string add(const std::string &, unsigned int);
void search(const std::string &, std::set<unsigned int> &);
std::string to_lowercase(const std::string &);
void print_cache_stats();
std::string lowercase(const std::string &);
};

View File

@ -6,7 +6,7 @@
#include <ocarina.h>
#include <playqueue.h>
#include <set>
class Tab {
private:

View File

@ -8,13 +8,10 @@
#include <algorithm>
#include <list>
#include <map>
static Index filter_index("", false);
static std::map<std::string, std::string> lowercase_cache;
static unsigned int lowercase_cache_hits = 0;
static void split_text(const std::string &text, std::list<std::string> &ret)
static void parse_text(const std::string &text, std::list<std::string> &ret)
{
std::string word;
char c;
@ -22,6 +19,17 @@ static void split_text(const std::string &text, std::list<std::string> &ret)
for (unsigned int i = 0; i < text.size(); i++) {
c = text[i];
if ( (c >= 'a') && (c <= 'z') ) {
word += c;
continue;
} else if ( (c >= 'A') && (c <= 'Z') ) {
word += (c + ('a' - 'A'));
continue;
} else if ( (c >= '0') && (c <= '9') ) {
word += c;
continue;
}
switch (c) {
case '\\':
case '/':
@ -40,9 +48,8 @@ static void split_text(const std::string &text, std::list<std::string> &ret)
ret.push_back(word);
word = "";
}
break;
default:
word += c;
break;
};
}
@ -50,42 +57,6 @@ static void split_text(const std::string &text, std::list<std::string> &ret)
ret.push_back(word);
}
/*
 * Lowercase a single word and append the result to ret.
 *
 * Only ASCII letters and digits are kept; every other character is dropped.
 * Results are remembered in lowercase_cache (keyed by the original text) and
 * each lookup that finds a remembered entry bumps lowercase_cache_hits.
 */
static void lower_text(const std::string &text, std::list<std::string> &ret)
{
	std::map<std::string, std::string>::iterator cached;
	std::string lowered;

	cached = lowercase_cache.find(text);
	if (cached != lowercase_cache.end()) {
		/* Seen before: reuse the stored conversion. */
		lowercase_cache_hits++;
		ret.push_back(cached->second);
		return;
	}

	for (std::string::const_iterator pos = text.begin(); pos != text.end(); ++pos) {
		const char ch = *pos;
		const bool is_upper = (ch >= 'A') && (ch <= 'Z');
		const bool is_lower = (ch >= 'a') && (ch <= 'z');
		const bool is_digit = (ch >= '0') && (ch <= '9');

		if (is_upper)
			lowered += static_cast<char>(ch - 'A' + 'a');
		else if (is_lower || is_digit)
			lowered += ch;
	}

	lowercase_cache[text] = lowered;
	ret.push_back(lowered);
}
/*
 * Break text into pieces with split_text(), then pass each piece through
 * lower_text() so the converted words accumulate in ret.
 */
static void parse_text(const std::string &text, std::list<std::string> &ret)
{
	std::list<std::string> words;

	split_text(text, words);

	std::list<std::string>::iterator cur = words.begin();
	while (cur != words.end()) {
		lower_text(*cur, ret);
		cur++;
	}
}
static void add_substrings(const std::string &text, unsigned int track_id)
{
std::string substr;
@ -95,7 +66,21 @@ static void add_substrings(const std::string &text, unsigned int track_id)
}
}
void filter :: add(const std::string &text, unsigned int track_id)
/*
 * Join a list of words into one string, separated by single spaces.
 *
 * text: the words to join, in order.
 * Returns an empty string for an empty list; otherwise the words joined
 * with exactly one space between neighbours (no leading or trailing space).
 *
 * The list is taken by const reference so callers do not pay for a copy of
 * every word just to read them.
 */
static std::string reassemble_text(const std::list<std::string> &text)
{
	std::string res;
	std::list<std::string>::const_iterator it;

	for (it = text.begin(); it != text.end(); ++it) {
		/* Separator goes before every word except the first. */
		if (it != text.begin())
			res += " ";
		res += *it;
	}
	return res;
}
std::string filter :: add(const std::string &text, unsigned int track_id)
{
std::list<std::string> parsed;
std::list<std::string>::iterator it;
@ -103,6 +88,7 @@ void filter :: add(const std::string &text, unsigned int track_id)
parse_text(text, parsed);
for (it = parsed.begin(); it != parsed.end(); it++)
add_substrings(*it, track_id);
return reassemble_text(parsed);
}
static void find_intersection(std::string &text, std::set<unsigned int> &res)
@ -136,24 +122,9 @@ void filter :: search(const std::string &text, std::set<unsigned int> &res)
find_intersection(*it, res);
}
std::string filter :: to_lowercase(const std::string &text)
std::string filter :: lowercase(const std::string &text)
{
std::string res = "";
std::list<std::string> parsed;
std::list<std::string>::iterator it;
parse_text(text, parsed);
for (it = parsed.begin(); it != parsed.end(); it++) {
if (it != parsed.begin())
res += " ";
res += *it;
}
return res;
}
void filter :: print_cache_stats()
{
print("Lowercase cache size: %u\n", lowercase_cache.size());
print("Lowercase cache hits: %u\n", lowercase_cache_hits);
return reassemble_text(parsed);
}

View File

@ -47,7 +47,7 @@ library :: AGInfo :: AGInfo(DB_Type type, TagLib :: Tag *tag)
else
throw -E_INVAL;
key_lower = filter :: to_lowercase(name);
key_lower = filter :: lowercase(name);
}
library :: AGInfo :: AGInfo(DB_Type type, const std::string &str)
@ -55,7 +55,7 @@ library :: AGInfo :: AGInfo(DB_Type type, const std::string &str)
{
if ((db_type == DB_ARTIST) || (db_type == DB_GENRE)) {
name = str;
key_lower = filter :: to_lowercase(name);
key_lower = filter :: lowercase(name);
} else
throw -E_INVAL;
}
@ -68,7 +68,7 @@ const std::string library :: AGInfo :: primary_key()
void library :: AGInfo :: read(File &f)
{
name = f.getline();
key_lower = filter :: to_lowercase(name);
key_lower = filter :: lowercase(name);
}
void library :: AGInfo :: write(File &f)
@ -91,13 +91,13 @@ library :: Album :: Album(TagLib :: Tag *tag, unsigned int artist)
: name(tag->album().stripWhiteSpace().to8Bit(true)),
year(tag->year()), artist_id(artist)
{
name_lower = filter :: to_lowercase(name);
name_lower = filter :: lowercase(name);
}
library :: Album :: Album(const std::string &str, unsigned int yr, unsigned int artist)
: name(str), year(yr), artist_id(artist)
{
name_lower = filter :: to_lowercase(name);
name_lower = filter :: lowercase(name);
}
const std::string library :: Album :: primary_key()
@ -111,7 +111,7 @@ void library :: Album :: read(File &f)
{
f >> artist_id >> year;
name = f.getline();
name_lower = filter :: to_lowercase(name);
name_lower = filter :: lowercase(name);
}
void library :: Album :: write(File &f)
@ -176,7 +176,7 @@ library :: Track :: Track(TagLib :: Tag *tag, TagLib :: AudioProperties *audio,
full_path = path;
filepath = path.substr(library_db.at(library_id)->root_path.size() + 1);
title_lower = filter :: to_lowercase(title);
title_lower = filter :: lowercase(title);
minutes = length / 60;
seconds = length % 60;
@ -200,7 +200,7 @@ library :: Track :: Track(struct ImportData *data, unsigned int lib,
full_path = data->filepath;
filepath = full_path.substr(library_db.at(library_id)->root_path.size() + 1);
title_lower = filter :: to_lowercase(title);
title_lower = filter :: lowercase(title);
minutes = length / 60;
seconds = length % 60;
@ -224,7 +224,7 @@ void library :: Track :: read(File &f)
length_str = f.getline();
title = f.getline();
filepath = f.getline();
title_lower = filter :: to_lowercase(title);
title_lower = filter :: lowercase(title);
full_path = library_db.at(library_id)->root_path + "/" + filepath;
library_db.at(library_id)->size++;
}

View File

@ -8,8 +8,8 @@ if sys.argv.count("tests") > 0:
src = SConscript("src/Sconscript")
tests = [ "version", "file", "db_entry", "database", "index" ]
#scripts = [ "filter", "idle", "playlist", "library", "playqueue", "deck", "audio", "gui" ]
tests = [ "version", "file", "db_entry", "database", "index", "filter" ]
#scripts = [ "idle", "playlist", "library", "playqueue", "deck", "audio", "gui" ]
prev = None

71
tests/filter Executable file
View File

@ -0,0 +1,71 @@
#!/bin/bash
# Copyright 2014 (c) Anna Schumaker
. $(dirname $0)/_functions
# Check the output of "filter.run -a" for input $1 against the expected text $2.
# The comparison itself is delegated to test_equal (sourced from _functions).
test_add()
{
	test_equal "./src/filter.run -a $1" "$2"
}
# Check the output of "filter.run -l" for input $1 against the expected text $2.
# The comparison itself is delegated to test_equal (sourced from _functions).
test_lowercase()
{
	test_equal "./src/filter.run -l $1" "$2"
}
# Run one input ($1) through both the add and the lowercase checks; both are
# expected to produce the same text ($2).
test_text()
{
	test_add "$1" "$2"
	test_lowercase "$1" "$2"
}
# Run "filter.run -s" with the search text $1 and compare the output with $2.
#
# The count handed to filter.run is the number of non-blank lines in
# $DATA_DIR/filter.txt minus one (the first line of the file looks like a
# header rather than an entry -- TODO confirm against filter.run).
test_search()
{
	local num

	# "cat -b" numbers non-blank lines; the number on the last line is the total.
	# $DATA_DIR is quoted so a path containing spaces is not split into words.
	num=$(cat -b "$DATA_DIR/filter.txt" | tail -n 1 | awk '{print $1}')

	# Arithmetic expansion instead of "let": "let" exits non-zero when the
	# result is 0, which would abort the script under "set -e".
	num=$(( num - 1 ))

	test_equal "./src/filter.run -s $num $1" "$2"
}
# Add / lowercase cases.  test_text runs every input through both test_add and
# test_lowercase, so each row below checks that the two modes agree.
# The rows cover: whitespace-only input, input spanning a newline, mixed case,
# punctuation that is stripped from inside a word ("?", "!"), and delimiter
# characters that split words ("(", ")", ";", "-"), including repeated ones.
new_test "Filter Add and Lowercase Test"
test_text " " ""
test_text " test
text" "test text"
test_text "test text" "test text"
test_text "Test Text" "test text"
test_text "Test? Text!" "test text"
test_text "Test?123 Text!456" "test123 text456"
test_text "Test?123 Text!456" "test123 text456"
test_text "Test(text);123-456" "test text 123 456"
test_text "Test((text));;123--456" "test text 123 456"
echo
new_test "Filter Search Test"

# Build the data file used by test_search.  Paths are quoted so a DATA_DIR
# containing spaces still works, and the file is opened once for the whole
# group of lines instead of once per echo.
file="$DATA_DIR/filter.txt"
mkdir -p "$DATA_DIR"
{
	# First line is a leading "0"; test_search subtracts one from the line
	# count, so this line is presumably a header -- TODO confirm.
	echo "0"
	echo "It's dangerous to go alone! Take this..."
	echo "DODONGO DISLIKES SMOKE."
	echo "I am Error."
	echo "Error knows a secret."
	echo "Hey, you pay, then you can open the chests!"
	echo "And the Master Sword sleeps again... FOREVER!"
	echo "Link checked the chest. Wow! This is a nice chest!"
	echo "Hey! Listen! Hey! Listen! Watch out!"
	echo "You killed the Deku Tree? How could you?!"
	echo "You've met with a terrible fate, haven't you?"
	echo "Believe in your strengths... Believe..."
	echo "Tingle! Tingle! Kooloo-Limpah!"
	echo "Well excuse me, Princess!"
} > "$file"

# Expected ids match the zero-based position of each sentence after the first
# line of the file (e.g. "I am Error." is 2, "Error knows a secret." is 3).
test_search "error" "2 3"
test_search "the" "4 5 6 8"
test_search "the ch" "4 6"
test_search "the CH" "4 6"
test_search "the ch y" "4"

View File

@ -1,6 +0,0 @@
#!/usr/bin/python
# SCons script for the filter unit test.
# Bring in the "Test" and "CONFIG" names exported by the calling SCons script.
Import("Test", "CONFIG")
# Turn on the FILTER flag of the shared build configuration
# (presumably selects the filter sources for the build -- verify in the parent script).
CONFIG.FILTER = True
# Register the "filter" test, built from filter.cpp.
Test("filter", "filter.cpp")

View File

@ -1,131 +0,0 @@
/*
* Copyright 2013 (c) Anna Schumaker.
*/
#include <filter.h>
#include <print.h>
#include <vector>
/*
 * Sample text fed to the filter by the tests below.  The position of a quote
 * in this array doubles as its track id: test_0() adds quotes[i] with id i and
 * test_search() maps result ids back to quotes[id].
 *
 * Entries written as two adjacent string literals (no comma between them) are
 * a single quote split across source lines.
 */
std::string quotes [] = {
"What heroes like best is themselves.",
"The sun rose slowly, as if wasn't sure it was worth all the effort",
"Of course I'm sane, when trees start talking to me, I don't talk back",
"Darkness isn't the opposite of light, it's simply its absence",
"Time passed, which, basically, is its job",
"Million-to-one chances crop up nine times out of ten",
"CATS ARE NICE",
"Death isn't cruel - merely terribly, terribly good at his job",
"Thunder rolled ... it rolled a six",
"DROP THE SCYTHE, AND TURN AROUND SLOWLY",
"Time is like a drug. Too much of it kills you",
"Gravity is a habit that is hard to shake off",
"You do not ask people like that what they are thinking about in case "
"they turn around very slowly and say 'You'",
"Do unto others before they do unto you",
"Not a man to mince words. People, yes. But not words",
"An elf's strength lay in persuading others they were weak",
"May you live in interesting times",
"WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN "
"ABOUT THAT",
"I AM DEATH, NOT TAXES. I TURN UP ONLY ONCE",
"All tribal myths are true, for a given value of 'true'",
"The Truth Shall Make Ye Fret",
"When you look into the abyss, it's not supposed to wave back",
"I have no use for people who have learned the limits of the possible",
"Speak softly and employ a huge man with a crowbar",
"Truly, the leopard can change his shorts",
"+++Divide By Cucumber Error, Please Reinstall Universe And Reboot+++",
"+++Whoops! Here comes the cheese! +++",
"Bring out yer dead, bring out yer living dead",
"1. ALL FUNGI ARE EDIBLE. 2. SOME FUNGI ARE NOT EDIBLE MORE THAN ONCE.",
"A lot of farming is about manure",
"I am very attached to my fingers, and I like to think of them as attached to me",
"There be a lot o' men who became heroes cuz they wuz too scared tae run",
"If only the pawns united, make talked the rooks round, the whole board "
"could've been a republic in a dozen moves",
"Always remember that the crowd that applauds your coronation is the same "
"crowd that will applaud your beheading. People like a show.",
};
/* Number of entries in quotes[], derived from the array size. */
static const unsigned int num_quotes = sizeof(quotes) / sizeof(std::string);
/*
 * Dump an index database: first its keys (via print_keys()), then one
 * "index[key] = <entry>" line per entry, followed by a blank line.
 */
void print_index(Database<IndexEntry> &db)
{
	db.print_keys();

	unsigned int pos = db.first();
	while (pos <= db.last()) {
		print("index[%s] = ", db[pos].primary_key.c_str());
		db[pos].print();
		print("\n");
		pos = db.next(pos);
	}

	print("\n");
}
/*
 * Search the filter for text and print the number of matches followed by
 * the quote belonging to each matching id.
 */
void test_search(const std::string &text)
{
	std::set<unsigned int> results;
	std::set<unsigned int>::iterator it;

	filter :: search(text, results);

	/*
	 * size() yields a size_t, which need not have the same width as the
	 * unsigned int that "%u" expects; convert explicitly so the argument
	 * always matches the format.
	 */
	print("Search for: \"%s\" returned %u matches:\n",
	      text.c_str(), static_cast<unsigned int>(results.size()));

	for (it = results.begin(); it != results.end(); it++)
		print("\t%s\n", quotes[*it].c_str());
	print("\n");
}
/* Print text next to the result of running it through filter :: to_lowercase(). */
void test_lowercase(const std::string &text)
{
	const std::string lowered(filter :: to_lowercase(text));

	print("Lowercasing: \"%s\" returned: \"%s\"\n",
	      text.c_str(), lowered.c_str());
}
/*
 * Add every quote to the filter, using its array position as the id, then
 * print the resulting index and the cache statistics.
 */
void test_0()
{
	unsigned int id = 0;

	while (id < num_quotes) {
		filter :: add(quotes[id], id);
		id++;
	}

	print_index(filter :: get_index());
	filter :: print_cache_stats();
	print("\n");
}
/*
 * Run a fixed series of searches (empty text, words not in any quote,
 * mixed case, partial words, delimiter characters) and then print the
 * cache statistics.
 */
void test_1()
{
	static const char *const searches[] = {
		"",
		"Rincewind",
		"Rincewind Twoflower Luggage",
		"the",
		"the is",
		"THE IS",
		"th i",
		"th i even",
		"Th/i-eVEn",
		"whoops",
	};
	const unsigned int count = sizeof(searches) / sizeof(searches[0]);

	for (unsigned int i = 0; i < count; i++)
		test_search(searches[i]);

	filter :: print_cache_stats();
	print("\n");
}
/*
 * Run a fixed series of strings through test_lowercase() and then print the
 * cache statistics.
 */
void test_2()
{
	static const char *const inputs[] = {
		"",
		"Rincewind",
		"Rincewind Twoflower Luggage",
		"the",
		"the is",
		"THE IS",
		"th i",
		"th i even",
		"Th/i-eVen",
		"whoops",
		"WHOOPS",
	};
	const unsigned int count = sizeof(inputs) / sizeof(inputs[0]);

	for (unsigned int i = 0; i < count; i++)
		test_lowercase(inputs[i]);

	filter :: print_cache_stats();
}
/*
 * Entry point: run the three test groups in order (test_0 fills the filter,
 * test_1 searches it, test_2 exercises lowercasing) and exit with status 0.
 */
int main(int argc, char **argv)
{
	void (*const groups[])() = { test_0, test_1, test_2 };
	const unsigned int count = sizeof(groups) / sizeof(groups[0]);

	for (unsigned int i = 0; i < count; i++)
		groups[i]();

	return 0;
}

View File

@ -1,757 +0,0 @@
Found keys: 1 2 a ab abo abou about abs abse absen absenc absence aby abys abyss al all alw alwa alway always am an and ap app appl appla applau applaud applauds ar are aro arou aroun around as ask at att atta attac attach attache attached b ba bac back bas basi basic basica basical basicall basically be bec beca becam became bee been bef befo befor before beh behe behea behead beheadi beheadin beheading bes best bo boa boar board br bri brin bring bu but by c ca can cas case cat cats ch cha chan chanc chance chances chang change che chee chees cheese co com come comes cor coro coron corona coronat coronati coronatio coronation cou coul could couldv couldve cour cours course cr cro crop crow crowb crowba crowbar crowd cru crue cruel cu cuc cucu cucum cucumb cucumbe cucumber cuz d da dar dark darkn darkne darknes darkness de dea dead deat death di div divi divid divide do don dont doz doze dozen dr dro drop dru drug e ed edi edib edibl edible ef eff effo effor effort el elf elfs em emp empl emplo employ er err erro error ev eve even f fa far farm farmi farmin farming fi fin fing finge finger fingers fo for fr fre fret fu fun fung fungi g gi giv give given go goo good gr gra grav gravi gravit gravity h ha hab habi habit har hard hav have he her here hero heroe heroes hi him his hu hug huge i if im in int inte inter intere interes interest interesti interestin interesting into is isn isnt it its j jo job k ki kil kill kills l la lay le lea lear learn learne learned leo leop leopa leopar leopard li lig ligh light lik like lim limi limit limits liv live livi livin living lo loo look lot m ma mak make man manu manur manure may me men mer mere merel merely mi mil mill milli millio million min minc mince mo mor more mov move moves mu muc much my myt myth myths n ni nic nice nin nine no not o of off on onc once one onl only op opp oppo oppos opposi opposit opposite ot oth othe other others ou out p pa pas pass passe passed paw pawn pawns pe peo peop peopl people per pers 
persu persua persuad persuadi persuadin persuading pl ple plea pleas please po pos poss possi possib possibl possible r re reb rebo reboo reboot rei rein reins reinst reinsta reinstal reinstall rem reme remem rememb remembe remember rep repu repub republ republi republic ro rol roll rolle rolled roo rook rooks ros rose rou roun round ru run s sa sam same san sane say sc sca scar scare scared scy scyt scyth scythe sh sha shak shake shal shall sho shor short shorts show si sim simp simpl simply six sl slo slow slowl slowly so sof soft softl softly som some sp spe spea speak st sta star start str stre stren streng strengt strength su sun sup supp suppo suppos suppose supposed sur sure t ta tae tal talk talke talked talki talkin talking tax taxe taxes te ten ter terr terri terrib terribl terribly th tha than that the them thems themse themsel themselv themselve themselves ther there they thi thin think thinki thinkin thinking thu thun thund thunde thunder ti tim time times to too tr tre tree trees tri trib triba tribal tru true trul truly trut truth tu tur turn u un unc unce uncer uncert uncerta uncertai uncertain uncertaint uncertainty uni unit unite united univ unive univer univers universe unt unto up us use v va val valu value ve ver very w wa was wasn wasnt wav wave we wea weak wer were wh wha what whe when whi whic which who whol whole whoo whoop whoops wi wil will wit with wo wor word words wort worth wu wuz y ye yer yes yo you your
index[w] = {0 1 2 4 12 14 15 17 21 22 23 26 31 32 33}
index[wh] = {0 2 4 12 21 22 26 31 32}
index[wha] = {0 12}
index[what] = {0 12}
index[h] = {0 7 11 17 22 23 24 26 31}
index[he] = {0 17 26 31}
index[her] = {0 17 26 31}
index[hero] = {0 31}
index[heroe] = {0 31}
index[heroes] = {0 31}
index[l] = {0 3 10 12 15 16 21 22 24 27 29 30 31 33}
index[li] = {0 3 10 12 16 22 27 30 33}
index[lik] = {0 10 12 30 33}
index[like] = {0 10 12 30 33}
index[b] = {0 2 4 13 14 21 25 27 31 32 33}
index[be] = {0 13 31 32 33}
index[bes] = {0}
index[best] = {0}
index[i] = {0 1 2 3 4 7 8 10 11 12 15 16 17 18 21 22 29 30 32 33}
index[is] = {0 3 4 7 10 11 17 29 33}
index[t] = {0 1 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 24 26 28 30 31 32 33}
index[th] = {0 1 3 8 9 11 12 13 15 17 20 21 22 24 26 28 30 31 32 33}
index[the] = {0 1 3 9 12 13 15 20 21 22 24 26 30 31 32 33}
index[them] = {0 30}
index[thems] = {0}
index[themse] = {0}
index[themsel] = {0}
index[themselv] = {0}
index[themselve] = {0}
index[themselves] = {0}
index[s] = {1 2 3 8 9 11 12 15 17 20 21 23 24 28 31 33}
index[su] = {1 17 21}
index[sun] = {1}
index[r] = {1 8 25 31 32 33}
index[ro] = {1 8 32}
index[ros] = {1}
index[rose] = {1}
index[sl] = {1 9 12}
index[slo] = {1 9 12}
index[slow] = {1 9 12}
index[slowl] = {1 9 12}
index[slowly] = {1 9 12}
index[a] = {1 3 6 7 8 9 10 11 12 14 15 17 18 19 21 23 25 28 29 30 31 32 33}
index[as] = {1 12 30}
index[if] = {1 32}
index[wa] = {1 21}
index[was] = {1}
index[wasn] = {1}
index[wasnt] = {1}
index[sur] = {1 17}
index[sure] = {1 17}
index[it] = {1 3 4 8 10 21}
index[wo] = {1 14}
index[wor] = {1 14}
index[wort] = {1}
index[worth] = {1}
index[al] = {1 19 28 33}
index[all] = {1 19 28}
index[e] = {1 15 17 23 25 28}
index[ef] = {1}
index[eff] = {1}
index[effo] = {1}
index[effor] = {1}
index[effort] = {1}
index[o] = {2 3 5 10 11 13 15 18 19 22 27 28 29 30 31 32}
index[of] = {2 3 5 10 11 19 22 29 30}
index[c] = {2 5 6 7 12 23 24 25 26 31 32 33}
index[co] = {2 26 32 33}
index[cou] = {2 32}
index[cour] = {2}
index[cours] = {2}
index[course] = {2}
index[im] = {2 17}
index[sa] = {2 12 33}
index[san] = {2}
index[sane] = {2}
index[whe] = {2 21}
index[when] = {2 21}
index[tr] = {2 19 20 24}
index[tre] = {2}
index[tree] = {2}
index[trees] = {2}
index[st] = {2 15}
index[sta] = {2}
index[star] = {2}
index[start] = {2}
index[ta] = {2 18 31 32}
index[tal] = {2 32}
index[talk] = {2 32}
index[talki] = {2}
index[talkin] = {2}
index[talking] = {2}
index[to] = {2 5 10 11 14 21 30 31}
index[m] = {2 5 7 10 14 16 19 20 23 28 29 30 31 32}
index[me] = {2 7 30 31}
index[d] = {2 3 7 9 10 12 13 18 25 27 32}
index[do] = {2 12 13 32}
index[don] = {2}
index[dont] = {2}
index[ba] = {2 4 21}
index[bac] = {2 21}
index[back] = {2 21}
index[da] = {3}
index[dar] = {3}
index[dark] = {3}
index[darkn] = {3}
index[darkne] = {3}
index[darknes] = {3}
index[darkness] = {3}
index[isn] = {3 7}
index[isnt] = {3 7}
index[op] = {3}
index[opp] = {3}
index[oppo] = {3}
index[oppos] = {3}
index[opposi] = {3}
index[opposit] = {3}
index[opposite] = {3}
index[lig] = {3}
index[ligh] = {3}
index[light] = {3}
index[its] = {3 4 21}
index[si] = {3 8}
index[sim] = {3}
index[simp] = {3}
index[simpl] = {3}
index[simply] = {3}
index[ab] = {3 12 17 21 29}
index[abs] = {3}
index[abse] = {3}
index[absen] = {3}
index[absenc] = {3}
index[absence] = {3}
index[ti] = {4 5 10 16}
index[tim] = {4 5 10 16}
index[time] = {4 5 10 16}
index[p] = {4 12 14 15 22 25 32 33}
index[pa] = {4 32}
index[pas] = {4}
index[pass] = {4}
index[passe] = {4}
index[passed] = {4}
index[whi] = {4}
index[whic] = {4}
index[which] = {4}
index[bas] = {4}
index[basi] = {4}
index[basic] = {4}
index[basica] = {4}
index[basical] = {4}
index[basicall] = {4}
index[basically] = {4}
index[j] = {4 7}
index[jo] = {4 7}
index[job] = {4 7}
index[mi] = {5 14}
index[mil] = {5}
index[mill] = {5}
index[milli] = {5}
index[millio] = {5}
index[million] = {5}
index[on] = {5 18 28 32}
index[one] = {5}
index[ch] = {5 24 26}
index[cha] = {5 24}
index[chan] = {5 24}
index[chanc] = {5}
index[chance] = {5}
index[chances] = {5}
index[cr] = {5 7 23 33}
index[cro] = {5 23 33}
index[crop] = {5}
index[u] = {5 13 17 18 22 25 32}
index[up] = {5 18}
index[n] = {5 6 12 14 17 18 21 22 28}
index[ni] = {5 6}
index[nin] = {5}
index[nine] = {5}
index[times] = {5 16}
index[ou] = {5 27}
index[out] = {5 27}
index[te] = {5 7}
index[ten] = {5}
index[ca] = {6 12 24}
index[cat] = {6}
index[cats] = {6}
index[ar] = {6 9 12 19 28}
index[are] = {6 12 19 28}
index[nic] = {6}
index[nice] = {6}
index[de] = {7 18 27}
index[dea] = {7 18 27}
index[deat] = {7 18}
index[death] = {7 18}
index[cru] = {7}
index[crue] = {7}
index[cruel] = {7}
index[mer] = {7}
index[mere] = {7}
index[merel] = {7}
index[merely] = {7}
index[ter] = {7}
index[terr] = {7}
index[terri] = {7}
index[terrib] = {7}
index[terribl] = {7}
index[terribly] = {7}
index[g] = {7 11 19}
index[go] = {7}
index[goo] = {7}
index[good] = {7}
index[at] = {7 30}
index[hi] = {7 17 24}
index[his] = {7 24}
index[thu] = {8}
index[thun] = {8}
index[thund] = {8}
index[thunde] = {8}
index[thunder] = {8}
index[rol] = {8}
index[roll] = {8}
index[rolle] = {8}
index[rolled] = {8}
index[six] = {8}
index[dr] = {9 10}
index[dro] = {9}
index[drop] = {9}
index[sc] = {9 31}
index[scy] = {9}
index[scyt] = {9}
index[scyth] = {9}
index[scythe] = {9}
index[an] = {9 12 15 17 23 25 30}
index[and] = {9 12 17 23 25 30}
index[tu] = {9 12 18}
index[tur] = {9 12 18}
index[turn] = {9 12 18}
index[aro] = {9 12}
index[arou] = {9 12}
index[aroun] = {9 12}
index[around] = {9 12}
index[dru] = {10}
index[drug] = {10}
index[too] = {10 31}
index[mu] = {10}
index[muc] = {10}
index[much] = {10}
index[k] = {10}
index[ki] = {10}
index[kil] = {10}
index[kill] = {10}
index[kills] = {10}
index[y] = {10 12 13 14 16 20 21 27 33}
index[yo] = {10 12 13 16 21 33}
index[you] = {10 12 13 16 21 33}
index[gr] = {11}
index[gra] = {11}
index[grav] = {11}
index[gravi] = {11}
index[gravit] = {11}
index[gravity] = {11}
index[ha] = {11 22}
index[hab] = {11}
index[habi] = {11}
index[habit] = {11}
index[tha] = {11 12 17 28 33}
index[that] = {11 12 17 33}
index[har] = {11}
index[hard] = {11}
index[sh] = {11 20 24 33}
index[sha] = {11 20}
index[shak] = {11}
index[shake] = {11}
index[off] = {11}
index[no] = {12 14 17 18 21 22 28}
index[not] = {12 14 17 18 21 28}
index[ask] = {12}
index[pe] = {12 14 15 22 33}
index[peo] = {12 14 22 33}
index[peop] = {12 14 22 33}
index[peopl] = {12 14 22 33}
index[people] = {12 14 22 33}
index[they] = {12 13 15 31}
index[thi] = {12 30}
index[thin] = {12 30}
index[think] = {12 30}
index[thinki] = {12}
index[thinkin] = {12}
index[thinking] = {12}
index[abo] = {12 17 29}
index[abou] = {12 17 29}
index[about] = {12 17 29}
index[in] = {12 15 16 21 32}
index[cas] = {12}
index[case] = {12}
index[v] = {12 19 30}
index[ve] = {12 30}
index[ver] = {12 30}
index[very] = {12 30}
index[say] = {12}
index[un] = {13 17 25 32}
index[unt] = {13}
index[unto] = {13}
index[ot] = {13 15}
index[oth] = {13 15}
index[othe] = {13 15}
index[other] = {13 15}
index[others] = {13 15}
index[bef] = {13}
index[befo] = {13}
index[befor] = {13}
index[before] = {13}
index[ma] = {14 16 20 23 29 32}
index[man] = {14 23 29}
index[min] = {14}
index[minc] = {14}
index[mince] = {14}
index[word] = {14}
index[words] = {14}
index[ye] = {14 20 27}
index[yes] = {14}
index[bu] = {14}
index[but] = {14}
index[el] = {15}
index[elf] = {15}
index[elfs] = {15}
index[str] = {15}
index[stre] = {15}
index[stren] = {15}
index[streng] = {15}
index[strengt] = {15}
index[strength] = {15}
index[la] = {15}
index[lay] = {15}
index[per] = {15}
index[pers] = {15}
index[persu] = {15}
index[persua] = {15}
index[persuad] = {15}
index[persuadi] = {15}
index[persuadin] = {15}
index[persuading] = {15}
index[we] = {15}
index[wer] = {15}
index[were] = {15}
index[wea] = {15}
index[weak] = {15}
index[may] = {16}
index[liv] = {16 27}
index[live] = {16}
index[int] = {16 21}
index[inte] = {16}
index[inter] = {16}
index[intere] = {16}
index[interes] = {16}
index[interest] = {16}
index[interesti] = {16}
index[interestin] = {16}
index[interesting] = {16}
index[wi] = {17 23 33}
index[wit] = {17 23}
index[with] = {17 23}
index[him] = {17}
index[here] = {17 26}
index[ev] = {17}
index[eve] = {17}
index[even] = {17}
index[unc] = {17}
index[unce] = {17}
index[uncer] = {17}
index[uncert] = {17}
index[uncerta] = {17}
index[uncertai] = {17}
index[uncertain] = {17}
index[uncertaint] = {17}
index[uncertainty] = {17}
index[am] = {18 30}
index[tax] = {18}
index[taxe] = {18}
index[taxes] = {18}
index[onl] = {18 32}
index[only] = {18 32}
index[onc] = {18 28}
index[once] = {18 28}
index[tri] = {19}
index[trib] = {19}
index[triba] = {19}
index[tribal] = {19}
index[my] = {19 30}
index[myt] = {19}
index[myth] = {19}
index[myths] = {19}
index[tru] = {19 20 24}
index[true] = {19}
index[f] = {19 20 22 28 29 30}
index[fo] = {19 22}
index[for] = {19 22}
index[gi] = {19}
index[giv] = {19}
index[give] = {19}
index[given] = {19}
index[va] = {19}
index[val] = {19}
index[valu] = {19}
index[value] = {19}
index[trut] = {20}
index[truth] = {20}
index[shal] = {20}
index[shall] = {20}
index[mak] = {20 32}
index[make] = {20 32}
index[fr] = {20}
index[fre] = {20}
index[fret] = {20}
index[lo] = {21 29 31}
index[loo] = {21}
index[look] = {21}
index[into] = {21}
index[aby] = {21}
index[abys] = {21}
index[abyss] = {21}
index[sup] = {21}
index[supp] = {21}
index[suppo] = {21}
index[suppos] = {21}
index[suppose] = {21}
index[supposed] = {21}
index[wav] = {21}
index[wave] = {21}
index[hav] = {22}
index[have] = {22}
index[us] = {22}
index[use] = {22}
index[who] = {22 26 31 32}
index[le] = {22 24}
index[lea] = {22}
index[lear] = {22}
index[learn] = {22}
index[learne] = {22}
index[learned] = {22}
index[lim] = {22}
index[limi] = {22}
index[limit] = {22}
index[limits] = {22}
index[po] = {22}
index[pos] = {22}
index[poss] = {22}
index[possi] = {22}
index[possib] = {22}
index[possibl] = {22}
index[possible] = {22}
index[sp] = {23}
index[spe] = {23}
index[spea] = {23}
index[speak] = {23}
index[so] = {23 28}
index[sof] = {23}
index[soft] = {23}
index[softl] = {23}
index[softly] = {23}
index[em] = {23}
index[emp] = {23}
index[empl] = {23}
index[emplo] = {23}
index[employ] = {23}
index[hu] = {23}
index[hug] = {23}
index[huge] = {23}
index[crow] = {23 33}
index[crowb] = {23}
index[crowba] = {23}
index[crowbar] = {23}
index[trul] = {24}
index[truly] = {24}
index[leo] = {24}
index[leop] = {24}
index[leopa] = {24}
index[leopar] = {24}
index[leopard] = {24}
index[can] = {24}
index[chang] = {24}
index[change] = {24}
index[sho] = {24 33}
index[shor] = {24}
index[short] = {24}
index[shorts] = {24}
index[di] = {25}
index[div] = {25}
index[divi] = {25}
index[divid] = {25}
index[divide] = {25}
index[by] = {25}
index[cu] = {25 31}
index[cuc] = {25}
index[cucu] = {25}
index[cucum] = {25}
index[cucumb] = {25}
index[cucumbe] = {25}
index[cucumber] = {25}
index[er] = {25}
index[err] = {25}
index[erro] = {25}
index[error] = {25}
index[pl] = {25}
index[ple] = {25}
index[plea] = {25}
index[pleas] = {25}
index[please] = {25}
index[re] = {25 32 33}
index[rei] = {25}
index[rein] = {25}
index[reins] = {25}
index[reinst] = {25}
index[reinsta] = {25}
index[reinstal] = {25}
index[reinstall] = {25}
index[uni] = {25 32}
index[univ] = {25}
index[unive] = {25}
index[univer] = {25}
index[univers] = {25}
index[universe] = {25}
index[reb] = {25}
index[rebo] = {25}
index[reboo] = {25}
index[reboot] = {25}
index[whoo] = {26}
index[whoop] = {26}
index[whoops] = {26}
index[com] = {26}
index[come] = {26}
index[comes] = {26}
index[che] = {26}
index[chee] = {26}
index[chees] = {26}
index[cheese] = {26}
index[br] = {27}
index[bri] = {27}
index[brin] = {27}
index[bring] = {27}
index[yer] = {27}
index[dead] = {27}
index[livi] = {27}
index[livin] = {27}
index[living] = {27}
index[1] = {28}
index[fu] = {28}
index[fun] = {28}
index[fung] = {28}
index[fungi] = {28}
index[ed] = {28}
index[edi] = {28}
index[edib] = {28}
index[edibl] = {28}
index[edible] = {28}
index[2] = {28}
index[som] = {28}
index[some] = {28}
index[mo] = {28 32}
index[mor] = {28}
index[more] = {28}
index[than] = {28}
index[lot] = {29 31}
index[fa] = {29}
index[far] = {29}
index[farm] = {29}
index[farmi] = {29}
index[farmin] = {29}
index[farming] = {29}
index[manu] = {29}
index[manur] = {29}
index[manure] = {29}
index[att] = {30}
index[atta] = {30}
index[attac] = {30}
index[attach] = {30}
index[attache] = {30}
index[attached] = {30}
index[fi] = {30}
index[fin] = {30}
index[fing] = {30}
index[finge] = {30}
index[finger] = {30}
index[fingers] = {30}
index[ther] = {31}
index[there] = {31}
index[men] = {31}
index[bec] = {31}
index[beca] = {31}
index[becam] = {31}
index[became] = {31}
index[cuz] = {31}
index[wu] = {31}
index[wuz] = {31}
index[sca] = {31}
index[scar] = {31}
index[scare] = {31}
index[scared] = {31}
index[tae] = {31}
index[ru] = {31}
index[run] = {31}
index[paw] = {32}
index[pawn] = {32}
index[pawns] = {32}
index[unit] = {32}
index[unite] = {32}
index[united] = {32}
index[talke] = {32}
index[talked] = {32}
index[roo] = {32}
index[rook] = {32}
index[rooks] = {32}
index[rou] = {32}
index[roun] = {32}
index[round] = {32}
index[whol] = {32}
index[whole] = {32}
index[bo] = {32}
index[boa] = {32}
index[boar] = {32}
index[board] = {32}
index[coul] = {32}
index[could] = {32}
index[couldv] = {32}
index[couldve] = {32}
index[bee] = {32}
index[been] = {32}
index[rep] = {32}
index[repu] = {32}
index[repub] = {32}
index[republ] = {32}
index[republi] = {32}
index[republic] = {32}
index[doz] = {32}
index[doze] = {32}
index[dozen] = {32}
index[mov] = {32}
index[move] = {32}
index[moves] = {32}
index[alw] = {33}
index[alwa] = {33}
index[alway] = {33}
index[always] = {33}
index[rem] = {33}
index[reme] = {33}
index[remem] = {33}
index[rememb] = {33}
index[remembe] = {33}
index[remember] = {33}
index[crowd] = {33}
index[ap] = {33}
index[app] = {33}
index[appl] = {33}
index[appla] = {33}
index[applau] = {33}
index[applaud] = {33}
index[applauds] = {33}
index[your] = {33}
index[cor] = {33}
index[coro] = {33}
index[coron] = {33}
index[corona] = {33}
index[coronat] = {33}
index[coronati] = {33}
index[coronatio] = {33}
index[coronation] = {33}
index[sam] = {33}
index[same] = {33}
index[wil] = {33}
index[will] = {33}
index[beh] = {33}
index[behe] = {33}
index[behea] = {33}
index[behead] = {33}
index[beheadi] = {33}
index[beheadin] = {33}
index[beheading] = {33}
index[show] = {33}
Lowercase cache size: 260
Lowercase cache hits: 112
Search for: "" returned 0 matches:
Search for: "Rincewind" returned 0 matches:
Search for: "Rincewind Twoflower Luggage" returned 0 matches:
Search for: "the" returned 16 matches:
What heroes like best is themselves.
The sun rose slowly, as if wasn't sure it was worth all the effort
Darkness isn't the opposite of light, it's simply its absence
DROP THE SCYTHE, AND TURN AROUND SLOWLY
You do not ask people like that what they are thinking about in case they turn around very slowly and say 'You'
Do unto others before they do unto you
An elf's strength lay in persuading others they were weak
The Truth Shall Make Ye Fret
When you look into the abyss, it's not supposed to wave back
I have no use for people who have learned the limits of the possible
Truly, the leopard can change his shorts
+++Whoops! Here comes the cheese! +++
I am very attached to my fingers, and I like to think of them as attached to me
There be a lot o' men who became heroes cuz they wuz too scared tae run
If only the pawns united, make talked the rooks round, the whole board could've been a republic in a dozen moves
Always remember that the crowd that applauds your coronation is the same crowd that will applaud your beheading. People like a show.
Search for: "the is" returned 3 matches:
What heroes like best is themselves.
Darkness isn't the opposite of light, it's simply its absence
Always remember that the crowd that applauds your coronation is the same crowd that will applaud your beheading. People like a show.
Search for: "THE IS" returned 3 matches:
What heroes like best is themselves.
Darkness isn't the opposite of light, it's simply its absence
Always remember that the crowd that applauds your coronation is the same crowd that will applaud your beheading. People like a show.
Search for: "th i" returned 13 matches:
What heroes like best is themselves.
The sun rose slowly, as if wasn't sure it was worth all the effort
Darkness isn't the opposite of light, it's simply its absence
Thunder rolled ... it rolled a six
Gravity is a habit that is hard to shake off
You do not ask people like that what they are thinking about in case they turn around very slowly and say 'You'
An elf's strength lay in persuading others they were weak
WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN ABOUT THAT
When you look into the abyss, it's not supposed to wave back
I have no use for people who have learned the limits of the possible
I am very attached to my fingers, and I like to think of them as attached to me
If only the pawns united, make talked the rooks round, the whole board could've been a republic in a dozen moves
Always remember that the crowd that applauds your coronation is the same crowd that will applaud your beheading. People like a show.
Search for: "th i even" returned 1 matches:
WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN ABOUT THAT
Search for: "Th/i-eVEn" returned 1 matches:
WITH HIM HERE, EVEN UNCERTAINTY IS UNCERTAIN. AND I'M NOT SURE EVEN ABOUT THAT
Search for: "whoops" returned 1 matches:
+++Whoops! Here comes the cheese! +++
Lowercase cache size: 269
Lowercase cache hits: 121
Lowercasing: "" returned: ""
Lowercasing: "Rincewind" returned: "rincewind"
Lowercasing: "Rincewind Twoflower Luggage" returned: "rincewind twoflower luggage"
Lowercasing: "the" returned: "the"
Lowercasing: "the is" returned: "the is"
Lowercasing: "THE IS" returned: "the is"
Lowercasing: "th i" returned: "th i"
Lowercasing: "th i even" returned: "th i even"
Lowercasing: "Th/i-eVen" returned: "th i even"
Lowercasing: "whoops" returned: "whoops"
Lowercasing: "WHOOPS" returned: "whoops"
Lowercase cache size: 271
Lowercase cache hits: 138

98
tests/src/filter.cpp Normal file
View File

@ -0,0 +1,98 @@
/*
* Copyright 2014 (c) Anna Schumaker.
* Test the filtering code
*/
#include <file.h>
#include <filter.h>
#include <print.h>
#include <string>
#include <stdlib.h>
#include <unistd.h>
/* Operation selected on the command line; see main(). */
enum action_t {
	ADD,		/* -a: hand the text to filter :: add() and print the result */
	LOWERCASE,	/* -l: hand the text to filter :: lowercase() and print the result */
	SEARCH		/* -s N: load N lines of filter.txt, then search for the text */
};
/*
 * Pass text to filter :: add() using track id 0, then print the string
 * it returns on a line of its own.
 */
void add_text(const std::string &text)
{
	print("%s\n", filter :: add(text, 0).c_str());
}
/*
 * Run text through filter :: lowercase() and print the result on a
 * line of its own.
 */
void to_lowercase(const std::string &text)
{
	const std::string lowered(filter :: lowercase(text));

	print("%s\n", lowered.c_str());
}
/*
 * Open the data file "filter.txt" and feed its first n lines to
 * filter :: add(), using each line's zero-based position as the
 * track id.  Does nothing if the file cannot be opened for reading.
 */
void read_file(unsigned int n)
{
	File f("filter.txt", FILE_TYPE_DATA);

	if (!f.open(OPEN_READ))
		return;

	unsigned int track = 0;
	while (track < n) {
		std::string line = f.getline();
		filter :: add(line, track);
		track++;
	}

	f.close();
}
/*
 * Search for text with filter :: search() and print the resulting ids
 * on a single line, separated by spaces.  Nothing at all is printed
 * (not even a newline) when the result set is empty.
 */
void do_search(const std::string &text)
{
	std::set<unsigned int> matches;

	filter :: search(text, matches);
	if (matches.empty())
		return;

	/* The first id is printed bare, every later one gets a leading space. */
	std::set<unsigned int>::iterator cur = matches.begin();
	print("%u", *cur);
	while (++cur != matches.end())
		print(" %u", *cur);
	print("\n");
}
/*
 * Command-line driver for the filter tests.
 *
 * Options (the last one given wins):
 *   -a    pass the text to add_text() (default)
 *   -l    pass the text to to_lowercase()
 *   -s N  read N lines of filter.txt with read_file(), then pass the
 *         text to do_search()
 *
 * The text is built from all remaining arguments.  Each argument is
 * preceded by a single space, so non-empty text always starts with one.
 *
 * Always returns 0.
 */
int main(int argc, char **argv)
{
	/*
	 * getopt() returns an int.  Keeping the result in a plain char
	 * makes the comparison against -1 always true on platforms where
	 * char is unsigned, so the option loop would never terminate.
	 */
	int c;
	/* Only meaningful for SEARCH; initialized so it is never read unset. */
	unsigned int n = 0;
	action_t action = ADD;

	while ((c = getopt(argc, argv, "als:")) != -1) {
		switch (c) {
		case 'a':
			action = ADD;
			break;
		case 'l':
			action = LOWERCASE;
			break;
		case 's':
			action = SEARCH;
			n = atoi(optarg);
			break;
		}
	}

	/* Join the non-option arguments, each one prefixed by a space. */
	std::string text;
	for (int i = optind; i < argc; i++) {
		text += " ";
		text += argv[i];
	}

	switch (action) {
	case ADD:
		add_text(text);
		break;
	case LOWERCASE:
		to_lowercase(text);
		break;
	case SEARCH:
		read_file(n);
		do_search(text);
		break;
	}

	return 0;
}