I probably should have done this earlier... oh well Signed-off-by: Bryan Schumaker <bjschuma@gmail.com>
130 lines
2.7 KiB
C++
130 lines
2.7 KiB
C++
// Copyright (c) 2011 Bryan Schumaker.
|
|
#include <libsaria/format.h>
|
|
#include <libsaria/print.h>
|
|
|
|
#include <map>
|
|
#include <set>
|
|
using namespace std;
|
|
|
|
/*
 * File-local caches filled by do_format(); both are keyed by the
 * original, unmodified text.
 */

/* text -> set of unique lowercase words found in that text */
static std::map<std::string, std::set<std::string> > format_cache;

/* text -> lowercase words of that text joined by single spaces */
static std::map<std::string, std::string> lc_cache;

/* Number of lookups answered directly from each cache */
static unsigned int format_hits = 0;
static unsigned int lc_hits = 0;
|
|
|
|
void do_format(const string &text,
|
|
map<string, set<string> >::iterator &words,
|
|
map<string, string>::iterator &lc)
|
|
{
|
|
string word;
|
|
string lc_string;
|
|
set<string> word_set;
|
|
pair< map<string, set<string> >::iterator, bool > ret1;
|
|
pair< map<string, string>::iterator, bool > ret2;
|
|
char c, diff = 'a' - 'A';
|
|
|
|
for (unsigned int i = 0; i < text.size(); i++) {
|
|
c = text[i];
|
|
// Character already lower case
|
|
if ( (c >= 'a') && (c <= 'z') )
|
|
word += c;
|
|
// Convert uppercase to lowercase
|
|
else if ( (c >= 'A') && (c <= 'Z') )
|
|
word += (c + diff);
|
|
// Keep numbers
|
|
else if ( (c >= '0') && (c <= '9') )
|
|
word += c;
|
|
else {
|
|
// These characters indicate a new word
|
|
switch (c) {
|
|
case '-':
|
|
case '\\':
|
|
case '/':
|
|
case ',':
|
|
case ';':
|
|
case '(':
|
|
case ')':
|
|
case '_':
|
|
case '~':
|
|
case '+':
|
|
case '"':
|
|
case ' ':
|
|
if (word != "") {
|
|
word_set.insert(word);
|
|
if (lc_string == "")
|
|
lc_string = word;
|
|
else
|
|
lc_string += " " + word;
|
|
}
|
|
word = "";
|
|
break;
|
|
default:
|
|
break;
|
|
};
|
|
}
|
|
}
|
|
|
|
if (word != "") {
|
|
word_set.insert(word);
|
|
if (lc_string == "")
|
|
lc_string = word;
|
|
else
|
|
lc_string += " " + word;
|
|
}
|
|
|
|
ret1 = format_cache.insert( pair<string, set<string> >(text, word_set) );
|
|
ret2 = lc_cache.insert( pair<string, string>(text, lc_string) );
|
|
words = ret1.first;
|
|
lc = ret2.first;
|
|
}
|
|
|
|
void find_unique_words(const string &text,
|
|
map<string, set<string> >::iterator &words)
|
|
{
|
|
map<string, string>::iterator lc;
|
|
do_format(text, words, lc);
|
|
}
|
|
|
|
void find_lowercase(const string &text,
|
|
map<string, string>::iterator &lc)
|
|
{
|
|
map<string, set<string> >::iterator words;
|
|
do_format(text, words, lc);
|
|
}
|
|
|
|
namespace libsaria
|
|
{
|
|
|
|
set<string> *format_text(const string &text)
|
|
{
|
|
map<string, set<string> >::iterator it;
|
|
it = format_cache.find(text);
|
|
|
|
/* Not found in cache... */
|
|
if (it == format_cache.end())
|
|
find_unique_words(text, it);
|
|
else
|
|
format_hits++;
|
|
return &(it->second);
|
|
}
|
|
|
|
string *lowercase(const string &text)
|
|
{
|
|
map<string, string>::iterator it;
|
|
it = lc_cache.find(text);
|
|
|
|
/* Not found in cache */
|
|
if (it == lc_cache.end())
|
|
find_lowercase(text, it);
|
|
else
|
|
lc_hits++;
|
|
return &(it->second);
|
|
}
|
|
|
|
void print_format_stats()
|
|
{
|
|
println("Format cache hits: %u size: %u", format_hits, format_cache.size());
|
|
println("Lowercase cache hits: %u size: %u", lc_hits, lc_cache.size());
|
|
}
|
|
|
|
} /* Namespace: libsaria */
|