// Copyright (c) 2011 Bryan Schumaker. #include #include #include #include #include using namespace std; static map > format_cache; static map > substr_cache; static map lc_cache; static unsigned int format_hits = 0; static unsigned int substr_hits = 0; static unsigned int lc_hits = 0; static const unsigned int TM_SECOND = 1; static const unsigned int TM_MINUTE = 60; static const unsigned int TM_HOUR = TM_MINUTE * 60; static const unsigned int TM_DAY = TM_HOUR * 24; void do_format(const string &text, map >::iterator &words, map::iterator &lc) { string word; string lc_string; set word_set; pair< map >::iterator, bool > ret1; pair< map::iterator, bool > ret2; char c, diff = 'a' - 'A'; for (unsigned int i = 0; i < text.size(); i++) { c = text[i]; // Character already lower case if ( (c >= 'a') && (c <= 'z') ) word += c; // Convert uppercase to lowercase else if ( (c >= 'A') && (c <= 'Z') ) word += (c + diff); // Keep numbers else if ( (c >= '0') && (c <= '9') ) word += c; else { // These characters indicate a new word switch (c) { case '-': case '\\': case '/': case ',': case ';': case '(': case ')': case '_': case '~': case '+': case '"': case ' ': if (word != "") { word_set.insert(word); if (lc_string == "") lc_string = word; else lc_string += " " + word; } word = ""; break; default: break; }; } } if (word != "") { word_set.insert(word); if (lc_string == "") lc_string = word; else lc_string += " " + word; } ret1 = format_cache.insert( pair >(text, word_set) ); ret2 = lc_cache.insert( pair(text, lc_string) ); words = ret1.first; lc = ret2.first; } void find_unique_words(const string &text, map >::iterator &words) { map::iterator lc; do_format(text, words, lc); } static set *gen_substrs(const string &word) { string key; set substrs; pair< map >::iterator, bool > ret; for (unsigned int i = 1; i <= word.size(); i++) { key = word.substr(0, i); substrs.insert(key); } ret = substr_cache.insert( pair >(word, substrs)); return &(ret.first->second); } static set *find_substrs(const string &word) { map >::iterator it; it = substr_cache.find(word); if (it == substr_cache.end()) return gen_substrs(word); else { substr_hits++; return &(it->second); } } void find_lowercase(const string &text, map::iterator &lc) { map >::iterator words; do_format(text, words, lc); } unsigned int add_to_stream(stringstream &stream, const unsigned int factor, string field, unsigned int length, unsigned int total) { unsigned int res = length / factor; if (res == 0) return 0; if (total != 0) stream << ", "; stream << res << " " << field; if (res > 1) stream << "s"; return res * factor;; } namespace libsaria { set *format_text(const string &text) { map >::iterator it; it = format_cache.find(text); /* Not found in cache... */ if (it == format_cache.end()) find_unique_words(text, it); else format_hits++; return &(it->second); } void format_substrs(const string &text, set &res) { set *words, *substrs; set::iterator it; words = format_text(text); for (it = words->begin(); it != words->end(); it++) { substrs = find_substrs(*it); res.insert(substrs->begin(), substrs->end()); } } string *lowercase(const string &text) { map::iterator it; it = lc_cache.find(text); /* Not found in cache */ if (it == lc_cache.end()) find_lowercase(text, it); else lc_hits++; return &(it->second); } string length_string(unsigned int len) { unsigned int tot; stringstream stream; if (len == 0) return ""; tot = add_to_stream(stream, TM_DAY, "day", len, 0); tot += add_to_stream(stream, TM_HOUR, "hour", len - tot, tot); tot += add_to_stream(stream, TM_MINUTE, "minute", len - tot, tot); add_to_stream(stream, TM_SECOND, "second", len - tot, tot); return stream.str(); } void print_format_stats() { println("Format cache hits: %u size: %u", format_hits, format_cache.size()); println("Substring cache hits: %u size: %u", substr_hits, substr_cache.size()); println("Lowercase cache hits: %u size: %u", lc_hits, lc_cache.size()); } } /* Namespace: libsaria */