/* * Copyright 2013 (c) Anna Schumaker. */ #include #include #include #include #include #include static Index filter_index("", false); static std::map lowercase_cache; static unsigned int lowercase_cache_hits = 0; static void split_text(const std::string &text, std::list &ret) { std::string word; char c; for (unsigned int i = 0; i < text.size(); i++) { c = text[i]; switch (c) { case '\\': case '/': case ',': case ';': case '(': case ')': case '_': case '-': case '~': case '+': case '"': case ' ': case ' ': if (word != "") { ret.push_back(word); word = ""; } break; default: word += c; }; } if (word != "") ret.push_back(word); } static void lower_text(const std::string &text, std::list &ret) { char c; std::string word; std::map::iterator it = lowercase_cache.find(text); if (it != lowercase_cache.end()) { lowercase_cache_hits++; ret.push_back(it->second); return; } for (unsigned int i = 0; i < text.size(); i++) { c = text[i]; if ( (c >= 'a') && (c <= 'z') ) word += c; else if ( (c >= 'A') && (c <= 'Z') ) word += (c + ('a' - 'A')); else if ( (c >= '0') && (c <= '9') ) word += c; } lowercase_cache[text] = word; ret.push_back(word); } static void parse_text(const std::string &text, std::list &ret) { std::list split; std::list::iterator it; split_text(text, split); for (it = split.begin(); it != split.end(); it++) lower_text(*it, ret); } static void add_substrings(const std::string &text, unsigned int track_id) { std::string substr; for (unsigned int i = 1; i <= text.size(); i++) { substr = text.substr(0, i); filter_index.insert(substr, track_id); } } void filter :: add(const std::string &text, unsigned int track_id) { std::list parsed; std::list::iterator it; parse_text(text, parsed); for (it = parsed.begin(); it != parsed.end(); it++) add_substrings(*it, track_id); } static void find_intersection(std::string &text, std::set &res) { Index::iterator it = filter_index.find(text); std::set tmp; set_intersection(it->values.begin(), it->values.end(), res.begin(), res.end(), std::inserter >(tmp, tmp.begin())); res.swap(tmp); } void filter :: search(const std::string &text, std::set &res) { std::list parsed; std::list::iterator it; parse_text(text, parsed); if (parsed.size() == 0) return; it = parsed.begin(); try { res = filter_index.find(*it)->values; } catch (...) { return; } for (it++; it != parsed.end(); it++) find_intersection(*it, res); } std::string filter :: to_lowercase(const std::string &text) { std::string res = ""; std::list parsed; std::list::iterator it; parse_text(text, parsed); for (it = parsed.begin(); it != parsed.end(); it++) { if (it != parsed.begin()) res += " "; res += *it; } return res; } void filter :: print_cache_stats() { print("Lowercase cache size: %u\n", lowercase_cache.size()); print("Lowercase cache hits: %u\n", lowercase_cache_hits); }