core/string: Add unicode handling to string_lowercase()

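This helps improve filtering: the string is now Unicode-normalized and
all combining marks (such as the accent over an 'e') are dropped, so
accented and unaccented spellings produce the same lowercase text.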

Signed-off-by: Anna Schumaker <Anna@OcarinaProject.net>
This commit is contained in:
Anna Schumaker 2015-10-16 13:21:50 -04:00
parent 91bbad90ff
commit 6b25987785
3 changed files with 36 additions and 28 deletions

View File

@ -39,29 +39,40 @@ gchar *string_sec2str_long(unsigned int sec)
return res; return res;
} }
static void _drop_special(gchar *str) static gunichar __string_get_char(const gchar *str, const gchar *cur,
const gchar *res)
{ {
unsigned int i, j = 0; gunichar c = g_utf8_get_char(cur);
gchar *prev = g_utf8_find_prev_char(str, res);
for (i = 0; i < strlen(str); i++) { if (g_unichar_ismark(c))
if (g_ascii_isalnum(str[i])) return '\0';
str[j++] = str[i]; if (g_unichar_ispunct(c))
else if ((j > 0) && (str[i] == ' ') && (str[j - 1] != ' ')) return '\0';
str[j++] = str[i]; if (g_unichar_isspace(c)) {
if (!prev || (*prev == ' '))
return '\0';
return ' ';
} }
return g_unichar_tolower(c);
while (j < i)
str[j++] = '\0';
} }
gchar *string_lowercase(const gchar *str) gchar *string_lowercase(const gchar *str)
{ {
gchar *res = g_ascii_strdown(str, -1); gchar *res = g_utf8_normalize(str, -1, G_NORMALIZE_DEFAULT);
gchar *i, *j = res;
gunichar c;
g_strdelimit(res, "\\/,;()_-~+\" ", ' '); for (i = res; *i != '\0'; i = g_utf8_next_char(i)) {
_drop_special(res); c = __string_get_char(res, i, j);
g_strstrip(res); if (c) {
*j = c;
j = g_utf8_next_char(j);
}
}
*j = '\0';
g_strchomp(res);
return res; return res;
} }

View File

@ -37,7 +37,7 @@ static void test_addition()
do_test_add("Believe in your strengths... Believe...", do_test_add("Believe in your strengths... Believe...",
"believe in your strengths believe"); "believe in your strengths believe");
do_test_add("Tingle! Tingle! Kooloo-Limpah!", do_test_add("Tingle! Tingle! Kooloo-Limpah!",
"tingle tingle kooloo limpah"); "tingle tingle kooloolimpah");
do_test_add("Well excuse me, Princess!", do_test_add("Well excuse me, Princess!",
"well excuse me princess"); "well excuse me princess");
} }

View File

@ -55,19 +55,16 @@ void test_sec2str_long()
void test_lowercase() void test_lowercase()
{ {
str_test_equal(string_lowercase(" "), ""); str_test_equal(string_lowercase(""), "");
str_test_equal(string_lowercase(" test \ str_test_equal(string_lowercase(" "), "");
text"), "test text"); str_test_equal(string_lowercase("5:02 PM"), "502 pm");
str_test_equal(string_lowercase("test/text"), "test text"); str_test_equal(string_lowercase("T.N.T."), "tnt");
str_test_equal(string_lowercase("Test, Text"), "test text"); str_test_equal(string_lowercase("#1 Zero"), "1 zero");
str_test_equal(string_lowercase("Test? Text!"), "test text"); str_test_equal(string_lowercase("Don't Stop"), "dont stop");
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456"); str_test_equal(string_lowercase("100,000 Years"), "100000 years");
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456"); str_test_equal(string_lowercase("Les Misérable"), "les miserable");
str_test_equal(string_lowercase("Test(text);123-456"), "test text 123 456"); str_test_equal(string_lowercase("Kryptonite (live)"), "kryptonite live");
str_test_equal(string_lowercase("Test((text));;123--456"), "test text 123 456"); str_test_equal(string_lowercase("This Time [Hidden]"), "this time hidden");
str_test_equal(string_lowercase("! __test(TEXT) + __test(\"TEXT\")"),
"test text test text");
str_test_equal(string_lowercase("Test~tEXt\\123"), "test text 123");
} }
void test_compare() void test_compare()