210 lines
4.6 KiB
C++
210 lines
4.6 KiB
C++
// Copyright (c) 2011 Bryan Schumaker.
|
|
#include <format.h>
|
|
#include <print.h>
|
|
|
|
#include <map>
|
|
#include <set>
|
|
#include <sstream>
|
|
using namespace std;
|
|
|
|
static map<string, set<string> > format_cache;
|
|
static map<string, set<string> > substr_cache;
|
|
static map<string, string> lc_cache;
|
|
static unsigned int format_hits = 0;
|
|
static unsigned int substr_hits = 0;
|
|
static unsigned int lc_hits = 0;
|
|
|
|
static const unsigned int TM_SECOND = 1;
|
|
static const unsigned int TM_MINUTE = 60;
|
|
static const unsigned int TM_HOUR = TM_MINUTE * 60;
|
|
static const unsigned int TM_DAY = TM_HOUR * 24;
|
|
|
|
void do_format(const string &text,
|
|
map<string, set<string> >::iterator &words,
|
|
map<string, string>::iterator &lc)
|
|
{
|
|
string word;
|
|
string lc_string;
|
|
set<string> word_set;
|
|
pair< map<string, set<string> >::iterator, bool > ret1;
|
|
pair< map<string, string>::iterator, bool > ret2;
|
|
char c, diff = 'a' - 'A';
|
|
|
|
for (unsigned int i = 0; i < text.size(); i++) {
|
|
c = text[i];
|
|
// Character already lower case
|
|
if ( (c >= 'a') && (c <= 'z') )
|
|
word += c;
|
|
// Convert uppercase to lowercase
|
|
else if ( (c >= 'A') && (c <= 'Z') )
|
|
word += (c + diff);
|
|
// Keep numbers
|
|
else if ( (c >= '0') && (c <= '9') )
|
|
word += c;
|
|
else {
|
|
// These characters indicate a new word
|
|
switch (c) {
|
|
case '-':
|
|
case '\\':
|
|
case '/':
|
|
case ',':
|
|
case ';':
|
|
case '(':
|
|
case ')':
|
|
case '_':
|
|
case '~':
|
|
case '+':
|
|
case '"':
|
|
case ' ':
|
|
if (word != "") {
|
|
word_set.insert(word);
|
|
if (lc_string == "")
|
|
lc_string = word;
|
|
else
|
|
lc_string += " " + word;
|
|
}
|
|
word = "";
|
|
break;
|
|
default:
|
|
break;
|
|
};
|
|
}
|
|
}
|
|
|
|
if (word != "") {
|
|
word_set.insert(word);
|
|
if (lc_string == "")
|
|
lc_string = word;
|
|
else
|
|
lc_string += " " + word;
|
|
}
|
|
|
|
ret1 = format_cache.insert( pair<string, set<string> >(text, word_set) );
|
|
ret2 = lc_cache.insert( pair<string, string>(text, lc_string) );
|
|
words = ret1.first;
|
|
lc = ret2.first;
|
|
}
|
|
|
|
void find_unique_words(const string &text,
|
|
map<string, set<string> >::iterator &words)
|
|
{
|
|
map<string, string>::iterator lc;
|
|
do_format(text, words, lc);
|
|
}
|
|
|
|
static set<string> *gen_substrs(const string &word)
|
|
{
|
|
string key;
|
|
set<string> substrs;
|
|
pair< map<string, set<string> >::iterator, bool > ret;
|
|
|
|
for (unsigned int i = 1; i <= word.size(); i++) {
|
|
key = word.substr(0, i);
|
|
substrs.insert(key);
|
|
}
|
|
|
|
ret = substr_cache.insert( pair<string, set<string> >(word, substrs));
|
|
return &(ret.first->second);
|
|
}
|
|
|
|
static set<string> *find_substrs(const string &word)
|
|
{
|
|
map<string, set<string> >::iterator it;
|
|
it = substr_cache.find(word);
|
|
if (it == substr_cache.end())
|
|
return gen_substrs(word);
|
|
else {
|
|
substr_hits++;
|
|
return &(it->second);
|
|
}
|
|
}
|
|
|
|
void find_lowercase(const string &text,
|
|
map<string, string>::iterator &lc)
|
|
{
|
|
map<string, set<string> >::iterator words;
|
|
do_format(text, words, lc);
|
|
}
|
|
|
|
/*
 * Append "<count> <field>" to stream, where count is the number of
 * whole multiples of factor contained in length.  Nothing is written
 * when count is zero.
 *
 * stream: destination for the text
 * factor: size of one unit
 * field:  singular unit name; an "s" is appended when count > 1
 * length: amount to convert
 * total:  when non-zero, ", " is written before the new text
 *
 * Returns the part of length that was written (count * factor), or 0
 * when nothing was written.
 */
unsigned int add_to_stream(stringstream &stream, const unsigned int factor,
		string field, unsigned int length, unsigned int total)
{
	const unsigned int count = length / factor;

	if (count != 0) {
		if (total != 0)
			stream << ", ";

		stream << count << " " << field;
		if (count > 1)
			stream << "s";
	}

	/* count == 0 makes this 0 as well */
	return count * factor;
}
|
|
|
|
namespace libsaria
|
|
{
|
|
|
|
set<string> *format_text(const string &text)
|
|
{
|
|
map<string, set<string> >::iterator it;
|
|
it = format_cache.find(text);
|
|
|
|
/* Not found in cache... */
|
|
if (it == format_cache.end())
|
|
find_unique_words(text, it);
|
|
else
|
|
format_hits++;
|
|
return &(it->second);
|
|
}
|
|
|
|
void format_substrs(const string &text, set<string> &res)
|
|
{
|
|
set<string> *words, *substrs;
|
|
set<string>::iterator it;
|
|
|
|
words = format_text(text);
|
|
for (it = words->begin(); it != words->end(); it++) {
|
|
substrs = find_substrs(*it);
|
|
res.insert(substrs->begin(), substrs->end());
|
|
}
|
|
}
|
|
|
|
string *lowercase(const string &text)
|
|
{
|
|
map<string, string>::iterator it;
|
|
it = lc_cache.find(text);
|
|
|
|
/* Not found in cache */
|
|
if (it == lc_cache.end())
|
|
find_lowercase(text, it);
|
|
else
|
|
lc_hits++;
|
|
return &(it->second);
|
|
}
|
|
|
|
string length_string(unsigned int len)
|
|
{
|
|
unsigned int tot;
|
|
stringstream stream;
|
|
|
|
if (len == 0)
|
|
return "";
|
|
|
|
tot = add_to_stream(stream, TM_DAY, "day", len, 0);
|
|
tot += add_to_stream(stream, TM_HOUR, "hour", len - tot, tot);
|
|
tot += add_to_stream(stream, TM_MINUTE, "minute", len - tot, tot);
|
|
|
|
add_to_stream(stream, TM_SECOND, "second", len - tot, tot);
|
|
return stream.str();
|
|
}
|
|
|
|
void print_format_stats()
|
|
{
|
|
println("Format cache hits: %u size: %u", format_hits, format_cache.size());
|
|
println("Substring cache hits: %u size: %u", substr_hits, substr_cache.size());
|
|
println("Lowercase cache hits: %u size: %u", lc_hits, lc_cache.size());
|
|
}
|
|
|
|
} /* Namespace: libsaria */
|