core/string: Add unicode handling to string_lowercase()

This helps improve filtering, since string_lowercase() now drops all
modifiers attached to characters (such as the accent over an 'e').

Signed-off-by: Anna Schumaker <Anna@OcarinaProject.net>
This commit is contained in:
Anna Schumaker 2015-10-16 13:21:50 -04:00
parent 91bbad90ff
commit 6b25987785
3 changed files with 36 additions and 28 deletions

View File

@ -39,29 +39,40 @@ gchar *string_sec2str_long(unsigned int sec)
return res;
}
/*
 * NOTE(review): this span looks like a rendered diff hunk whose +/- markers
 * were lost. Lines of _drop_special() (presumably the removed helper) and
 * of __string_get_char() (presumably the added helper) are interleaved, so
 * the text as shown is not one compilable function. Read the two groups of
 * lines separately.
 *
 * _drop_special(str): compacts str in place. ASCII alphanumeric bytes are
 * kept, a ' ' is kept only when something has already been kept and the
 * last kept byte is not itself a ' ', and every other byte is dropped. The
 * bytes left over after compaction are overwritten with '\0'.
 *
 * __string_get_char(str, cur, res): decodes the UTF-8 character at cur and
 * returns the code point that should be emitted for it:
 *   - 0 for characters reported by g_unichar_ismark() or g_unichar_ispunct()
 *   - 0 for whitespace when nothing precedes res in str, or when the byte
 *     at the previous character position is already ' '; otherwise ' '
 *   - g_unichar_tolower(c) for everything else
 * str and res are only used to look up the character preceding res; the
 * visible caller passes the start of its buffer and its write position.
 */
static void _drop_special(gchar *str)
static gunichar __string_get_char(const gchar *str, const gchar *cur,
const gchar *res)
{
unsigned int i, j = 0;
gunichar c = g_utf8_get_char(cur);
/* Start of the character just before res inside str; checked for NULL below. */
gchar *prev = g_utf8_find_prev_char(str, res);
for (i = 0; i < strlen(str); i++) {
if (g_ascii_isalnum(str[i]))
str[j++] = str[i];
/* Keep a space only after at least one kept byte, and never twice in a row. */
else if ((j > 0) && (str[i] == ' ') && (str[j - 1] != ' '))
str[j++] = str[i];
if (g_unichar_ismark(c))
return '\0';
if (g_unichar_ispunct(c))
return '\0';
if (g_unichar_isspace(c)) {
/* Suppress a leading space and collapse runs of whitespace into one ' '. */
if (!prev || (*prev == ' '))
return '\0';
return ' ';
}
/* NUL-fill the bytes that compaction left behind. */
while (j < i)
str[j++] = '\0';
return g_unichar_tolower(c);
}
/*
 * string_lowercase(str): builds a lowercased, filtered copy of str and
 * returns it. The buffer comes from g_ascii_strdown() / g_utf8_normalize();
 * presumably the caller releases it with g_free() - confirm against callers.
 *
 * NOTE(review): this span looks like a rendered diff hunk whose +/- markers
 * were lost, so two versions of the body are interleaved (note the two
 * declarations of res):
 *   - older lines: g_ascii_strdown(), g_strdelimit() turning a fixed set of
 *     separators into ' ', _drop_special(), then g_strstrip()
 *   - newer lines: g_utf8_normalize(), a per-character loop that asks
 *     __string_get_char() what to emit and writes it back into the same
 *     buffer at j, a terminating '\0', then g_strchomp()
 *
 * NOTE(review): `*j = c` stores a gunichar through a gchar pointer, so only
 * a single byte is written per kept character. Any kept character outside
 * ASCII looks like it would be truncated - confirm; g_unichar_to_utf8() may
 * be what is intended here.
 *
 * NOTE(review): g_utf8_normalize() is documented to return NULL for invalid
 * UTF-8, and res is dereferenced without a check - confirm that inputs are
 * always valid UTF-8.
 */
gchar *string_lowercase(const gchar *str)
{
gchar *res = g_ascii_strdown(str, -1);
gchar *res = g_utf8_normalize(str, -1, G_NORMALIZE_DEFAULT);
/* i is the read position and j is the write position, both inside res. */
gchar *i, *j = res;
gunichar c;
g_strdelimit(res, "\\/,;()_-~+\" ", ' ');
_drop_special(res);
g_strstrip(res);
for (i = res; *i != '\0'; i = g_utf8_next_char(i)) {
c = __string_get_char(res, i, j);
/* A zero return means "emit nothing for this character". */
if (c) {
*j = c;
j = g_utf8_next_char(j);
}
}
/* Terminate at the write position, then trim trailing whitespace. */
*j = '\0';
g_strchomp(res);
return res;
}

View File

@ -37,7 +37,7 @@ static void test_addition()
do_test_add("Believe in your strengths... Believe...",
"believe in your strengths believe");
do_test_add("Tingle! Tingle! Kooloo-Limpah!",
"tingle tingle kooloo limpah");
"tingle tingle kooloolimpah");
do_test_add("Well excuse me, Princess!",
"well excuse me princess");
}

View File

@ -55,19 +55,16 @@ void test_sec2str_long()
void test_lowercase()
{
str_test_equal(string_lowercase(" "), "");
str_test_equal(string_lowercase(" test \
text"), "test text");
str_test_equal(string_lowercase("test/text"), "test text");
str_test_equal(string_lowercase("Test, Text"), "test text");
str_test_equal(string_lowercase("Test? Text!"), "test text");
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456");
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456");
str_test_equal(string_lowercase("Test(text);123-456"), "test text 123 456");
str_test_equal(string_lowercase("Test((text));;123--456"), "test text 123 456");
str_test_equal(string_lowercase("! __test(TEXT) + __test(\"TEXT\")"),
"test text test text");
str_test_equal(string_lowercase("Test~tEXt\\123"), "test text 123");
str_test_equal(string_lowercase(""), "");
str_test_equal(string_lowercase(" "), "");
str_test_equal(string_lowercase("5:02 PM"), "502 pm");
str_test_equal(string_lowercase("T.N.T."), "tnt");
str_test_equal(string_lowercase("#1 Zero"), "1 zero");
str_test_equal(string_lowercase("Don't Stop"), "dont stop");
str_test_equal(string_lowercase("100,000 Years"), "100000 years");
str_test_equal(string_lowercase("Les Misérable"), "les miserable");
str_test_equal(string_lowercase("Kryptonite (live)"), "kryptonite live");
str_test_equal(string_lowercase("This Time [Hidden]"), "this time hidden");
}
void test_compare()