97d889531a
This patch updates the design so indexes are built upon databases using a special IndexEntry. I also updated the tests/index/ test to match the new system, but I have not updated filter or group tests yet. Signed-off-by: Anna Schumaker <schumaker.anna@gmail.com>
151 lines
3.1 KiB
C++
151 lines
3.1 KiB
C++
/*
|
|
* Copyright 2013 (c) Anna Schumaker.
|
|
*/
|
|
|
|
#include <filter.h>
|
|
#include <print.h>
|
|
|
|
#include <algorithm>
|
|
#include <list>
|
|
#include <map>
|
|
#include <set>
|
|
|
|
/*
 * Index used by this file: add_substrings() stores an IndexEntry for every
 * prefix of every parsed word, and search() looks terms up in it.
 * NOTE(review): the meaning of the "" constructor argument is not visible
 * here -- confirm against the Database class.
 */
static Database<IndexEntry> filter_index("");
|
|
/*
 * Cache filled by lower_text(): maps a word exactly as it was passed in
 * to its normalized (lowercase letters and digits only) form.
 */
static std::map<std::string, std::string> lowercase_cache;
|
|
/* Number of lower_text() calls that were answered from lowercase_cache. */
static unsigned int lowercase_cache_hits = 0;
|
|
|
|
/*
 * Split text into words and append them, in order, to ret.
 *
 * Words are separated by any of:  \ / , ; ( ) _ - ~ + "  space and tab.
 * Consecutive separators never produce empty words, and ret is only
 * appended to (existing entries are kept).
 */
static void split_text(const std::string &text, std::list<std::string> &ret)
{
	std::string word;

	for (std::string::size_type i = 0; i < text.size(); i++) {
		const char c = text[i];

		switch (c) {
		case '\\':
		case '/':
		case ',':
		case ';':
		case '(':
		case ')':
		case '_':
		case '-':
		case '~':
		case '+':
		case '"':
		case ' ':
		case '\t':
			/* Separator: flush the word collected so far, if any. */
			if (!word.empty()) {
				ret.push_back(word);
				word.clear();
			}
			break;
		default:
			word += c;
		}
	}

	/* The text may end without a trailing separator. */
	if (!word.empty())
		ret.push_back(word);
}
|
|
|
|
static void lower_text(const std::string &text, std::list<std::string> &ret)
|
|
{
|
|
char c;
|
|
std::string word;
|
|
std::map<std::string, std::string>::iterator it = lowercase_cache.find(text);
|
|
|
|
if (it != lowercase_cache.end()) {
|
|
lowercase_cache_hits++;
|
|
ret.push_back(it->second);
|
|
return;
|
|
}
|
|
|
|
for (unsigned int i = 0; i < text.size(); i++) {
|
|
c = text[i];
|
|
if ( (c >= 'a') && (c <= 'z') )
|
|
word += c;
|
|
else if ( (c >= 'A') && (c <= 'Z') )
|
|
word += (c + ('a' - 'A'));
|
|
else if ( (c >= '0') && (c <= '9') )
|
|
word += c;
|
|
}
|
|
|
|
lowercase_cache[text] = word;
|
|
ret.push_back(word);
|
|
}
|
|
|
|
static void parse_text(const std::string &text, std::list<std::string> &ret)
|
|
{
|
|
std::list<std::string> split;
|
|
std::list<std::string>::iterator it;
|
|
|
|
split_text(text, split);
|
|
for (it = split.begin(); it != split.end(); it++)
|
|
lower_text(*it, ret);
|
|
}
|
|
|
|
static void add_substrings(const std::string &text, unsigned int track_id)
|
|
{
|
|
std::string substr;
|
|
for (unsigned int i = 1; i <= text.size(); i++) {
|
|
substr = text.substr(0, i);
|
|
try {
|
|
filter_index.find(substr).insert(track_id);
|
|
} catch (...) {
|
|
filter_index.insert(IndexEntry(substr, track_id));
|
|
}
|
|
}
|
|
}
|
|
|
|
void filter :: add(const std::string &text, unsigned int track_id)
|
|
{
|
|
std::list<std::string> parsed;
|
|
std::list<std::string>::iterator it;
|
|
|
|
parse_text(text, parsed);
|
|
for (it = parsed.begin(); it != parsed.end(); it++)
|
|
add_substrings(*it, track_id);
|
|
}
|
|
|
|
static void find_intersection(std::string &text, std::set<unsigned int> &res)
|
|
{
|
|
std::set<unsigned int> terms = filter_index[text];
|
|
std::set<unsigned int> tmp;
|
|
|
|
set_intersection(filter_index[text].begin(), filter_index[text].end(),
|
|
res.begin(), res.end(),
|
|
std::inserter<std::set<unsigned int> >(tmp, tmp.begin()));
|
|
res.swap(tmp);
|
|
}
|
|
|
|
void filter :: search(const std::string &text, std::set<unsigned int> &res)
|
|
{
|
|
std::list<std::string> parsed;
|
|
std::list<std::string>::iterator it;
|
|
|
|
parse_text(text, parsed);
|
|
if (parsed.size() == 0)
|
|
return;
|
|
|
|
it = parsed.begin();
|
|
res = filter_index[*it];
|
|
|
|
for (it++; it != parsed.end(); it++)
|
|
find_intersection(*it, res);
|
|
}
|
|
|
|
void filter :: print_cache_stats()
|
|
{
|
|
print("Lowercase cache size: %u\n", lowercase_cache.size());
|
|
print("Lowercase cache hits: %u\n", lowercase_cache_hits);
|
|
}
|
|
|
|
#ifdef CONFIG_TEST
/*
 * Only built when CONFIG_TEST is defined: hands out a reference to the
 * file-local filter_index so it can be inspected from outside this file.
 */
Database<IndexEntry> &filter :: get_index()
{
	return filter_index;
}
#endif /* CONFIG_TEST */
|