core/string: Add unicode handling to string_lowercase()
This helps improve filtering, since combining marks (such as the accent over an 'e') are now dropped from characters. Signed-off-by: Anna Schumaker <Anna@OcarinaProject.net>
This commit is contained in:
parent
91bbad90ff
commit
6b25987785
|
@ -39,29 +39,40 @@ gchar *string_sec2str_long(unsigned int sec)
|
|||
return res;
|
||||
}
|
||||
|
||||
static void _drop_special(gchar *str)
|
||||
static gunichar __string_get_char(const gchar *str, const gchar *cur,
|
||||
const gchar *res)
|
||||
{
|
||||
unsigned int i, j = 0;
|
||||
gunichar c = g_utf8_get_char(cur);
|
||||
gchar *prev = g_utf8_find_prev_char(str, res);
|
||||
|
||||
for (i = 0; i < strlen(str); i++) {
|
||||
if (g_ascii_isalnum(str[i]))
|
||||
str[j++] = str[i];
|
||||
else if ((j > 0) && (str[i] == ' ') && (str[j - 1] != ' '))
|
||||
str[j++] = str[i];
|
||||
if (g_unichar_ismark(c))
|
||||
return '\0';
|
||||
if (g_unichar_ispunct(c))
|
||||
return '\0';
|
||||
if (g_unichar_isspace(c)) {
|
||||
if (!prev || (*prev == ' '))
|
||||
return '\0';
|
||||
return ' ';
|
||||
}
|
||||
|
||||
while (j < i)
|
||||
str[j++] = '\0';
|
||||
return g_unichar_tolower(c);
|
||||
}
|
||||
|
||||
gchar *string_lowercase(const gchar *str)
|
||||
{
|
||||
gchar *res = g_ascii_strdown(str, -1);
|
||||
gchar *res = g_utf8_normalize(str, -1, G_NORMALIZE_DEFAULT);
|
||||
gchar *i, *j = res;
|
||||
gunichar c;
|
||||
|
||||
g_strdelimit(res, "\\/,;()_-~+\" ", ' ');
|
||||
_drop_special(res);
|
||||
g_strstrip(res);
|
||||
for (i = res; *i != '\0'; i = g_utf8_next_char(i)) {
|
||||
c = __string_get_char(res, i, j);
|
||||
if (c) {
|
||||
*j = c;
|
||||
j = g_utf8_next_char(j);
|
||||
}
|
||||
}
|
||||
*j = '\0';
|
||||
|
||||
g_strchomp(res);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ static void test_addition()
|
|||
do_test_add("Believe in your strengths... Believe...",
|
||||
"believe in your strengths believe");
|
||||
do_test_add("Tingle! Tingle! Kooloo-Limpah!",
|
||||
"tingle tingle kooloo limpah");
|
||||
"tingle tingle kooloolimpah");
|
||||
do_test_add("Well excuse me, Princess!",
|
||||
"well excuse me princess");
|
||||
}
|
||||
|
|
|
@ -55,19 +55,16 @@ void test_sec2str_long()
|
|||
|
||||
void test_lowercase()
|
||||
{
|
||||
str_test_equal(string_lowercase(" "), "");
|
||||
str_test_equal(string_lowercase(" test \
|
||||
text"), "test text");
|
||||
str_test_equal(string_lowercase("test/text"), "test text");
|
||||
str_test_equal(string_lowercase("Test, Text"), "test text");
|
||||
str_test_equal(string_lowercase("Test? Text!"), "test text");
|
||||
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456");
|
||||
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456");
|
||||
str_test_equal(string_lowercase("Test(text);123-456"), "test text 123 456");
|
||||
str_test_equal(string_lowercase("Test((text));;123--456"), "test text 123 456");
|
||||
str_test_equal(string_lowercase("! __test(TEXT) + __test(\"TEXT\")"),
|
||||
"test text test text");
|
||||
str_test_equal(string_lowercase("Test~tEXt\\123"), "test text 123");
|
||||
str_test_equal(string_lowercase(""), "");
|
||||
str_test_equal(string_lowercase(" "), "");
|
||||
str_test_equal(string_lowercase("5:02 PM"), "502 pm");
|
||||
str_test_equal(string_lowercase("T.N.T."), "tnt");
|
||||
str_test_equal(string_lowercase("#1 Zero"), "1 zero");
|
||||
str_test_equal(string_lowercase("Don't Stop"), "dont stop");
|
||||
str_test_equal(string_lowercase("100,000 Years"), "100000 years");
|
||||
str_test_equal(string_lowercase("Les Misérable"), "les miserable");
|
||||
str_test_equal(string_lowercase("Kryptonite (live)"), "kryptonite live");
|
||||
str_test_equal(string_lowercase("This Time [Hidden]"), "this time hidden");
|
||||
}
|
||||
|
||||
void test_compare()
|
||||
|
|
Loading…
Reference in New Issue