From 6b2598778517d69a17ea0707f9152ca132208103 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 16 Oct 2015 13:21:50 -0400 Subject: [PATCH] core/string: Add unicode handling to string_lowercase() This helps improve filtering, since I drop all modifications to characters (such as accents over an 'e'). Signed-off-by: Anna Schumaker --- core/string.c | 39 +++++++++++++++++++++++++-------------- tests/core/filter.cpp | 2 +- tests/core/string.c | 23 ++++++++++------------- 3 files changed, 36 insertions(+), 28 deletions(-) diff --git a/core/string.c b/core/string.c index be6ad6ba..407ffcfb 100644 --- a/core/string.c +++ b/core/string.c @@ -39,29 +39,40 @@ gchar *string_sec2str_long(unsigned int sec) return res; } -static void _drop_special(gchar *str) +static gunichar __string_get_char(const gchar *str, const gchar *cur, + const gchar *res) { - unsigned int i, j = 0; + gunichar c = g_utf8_get_char(cur); + gchar *prev = g_utf8_find_prev_char(str, res); - for (i = 0; i < strlen(str); i++) { - if (g_ascii_isalnum(str[i])) - str[j++] = str[i]; - else if ((j > 0) && (str[i] == ' ') && (str[j - 1] != ' ')) - str[j++] = str[i]; + if (g_unichar_ismark(c)) + return '\0'; + if (g_unichar_ispunct(c)) + return '\0'; + if (g_unichar_isspace(c)) { + if (!prev || (*prev == ' ')) + return '\0'; + return ' '; } - - while (j < i) - str[j++] = '\0'; + return g_unichar_tolower(c); } gchar *string_lowercase(const gchar *str) { - gchar *res = g_ascii_strdown(str, -1); + gchar *res = g_utf8_normalize(str, -1, G_NORMALIZE_DEFAULT); + gchar *i, *j = res; + gunichar c; - g_strdelimit(res, "\\/,;()_-~+\" ", ' '); - _drop_special(res); - g_strstrip(res); + for (i = res; *i != '\0'; i = g_utf8_next_char(i)) { + c = __string_get_char(res, i, j); + if (c) { + *j = c; + j = g_utf8_next_char(j); + } + } + *j = '\0'; + g_strchomp(res); return res; } diff --git a/tests/core/filter.cpp b/tests/core/filter.cpp index a5cf4c4b..c657ac40 100644 --- a/tests/core/filter.cpp +++ b/tests/core/filter.cpp @@ -37,7 +37,7 @@ static void test_addition() do_test_add("Believe in your strengths... Believe...", "believe in your strengths believe"); do_test_add("Tingle! Tingle! Kooloo-Limpah!", - "tingle tingle kooloo limpah"); + "tingle tingle kooloolimpah"); do_test_add("Well excuse me, Princess!", "well excuse me princess"); } diff --git a/tests/core/string.c b/tests/core/string.c index b23d7401..0a26152e 100644 --- a/tests/core/string.c +++ b/tests/core/string.c @@ -55,19 +55,16 @@ void test_sec2str_long() void test_lowercase() { - str_test_equal(string_lowercase(" "), ""); - str_test_equal(string_lowercase(" test \ - text"), "test text"); - str_test_equal(string_lowercase("test/text"), "test text"); - str_test_equal(string_lowercase("Test, Text"), "test text"); - str_test_equal(string_lowercase("Test? Text!"), "test text"); - str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456"); - str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456"); - str_test_equal(string_lowercase("Test(text);123-456"), "test text 123 456"); - str_test_equal(string_lowercase("Test((text));;123--456"), "test text 123 456"); - str_test_equal(string_lowercase("! __test(TEXT) + __test(\"TEXT\")"), - "test text test text"); - str_test_equal(string_lowercase("Test~tEXt\\123"), "test text 123"); + str_test_equal(string_lowercase(""), ""); + str_test_equal(string_lowercase(" "), ""); + str_test_equal(string_lowercase("5:02 PM"), "502 pm"); + str_test_equal(string_lowercase("T.N.T."), "tnt"); + str_test_equal(string_lowercase("#1 Zero"), "1 zero"); + str_test_equal(string_lowercase("Don't Stop"), "dont stop"); + str_test_equal(string_lowercase("100,000 Years"), "100000 years"); + str_test_equal(string_lowercase("Les Misérable"), "les miserable"); + str_test_equal(string_lowercase("Kryptonite (live)"), "kryptonite live"); + str_test_equal(string_lowercase("This Time [Hidden]"), "this time hidden"); } void test_compare()