core/string: Add unicode handling to string_lowercase()
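This helps improve filtering: the string is now Unicode-normalized and all modifications to characters (such as the accent over an 'e') are dropped, so that, for example, "Misérable" is filtered as "miserable".

Signed-off-by: Anna Schumaker <Anna@OcarinaProject.net>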
This commit is contained in:
parent
91bbad90ff
commit
6b25987785
|
@ -39,29 +39,40 @@ gchar *string_sec2str_long(unsigned int sec)
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void _drop_special(gchar *str)
|
static gunichar __string_get_char(const gchar *str, const gchar *cur,
|
||||||
|
const gchar *res)
|
||||||
{
|
{
|
||||||
unsigned int i, j = 0;
|
gunichar c = g_utf8_get_char(cur);
|
||||||
|
gchar *prev = g_utf8_find_prev_char(str, res);
|
||||||
|
|
||||||
for (i = 0; i < strlen(str); i++) {
|
if (g_unichar_ismark(c))
|
||||||
if (g_ascii_isalnum(str[i]))
|
return '\0';
|
||||||
str[j++] = str[i];
|
if (g_unichar_ispunct(c))
|
||||||
else if ((j > 0) && (str[i] == ' ') && (str[j - 1] != ' '))
|
return '\0';
|
||||||
str[j++] = str[i];
|
if (g_unichar_isspace(c)) {
|
||||||
|
if (!prev || (*prev == ' '))
|
||||||
|
return '\0';
|
||||||
|
return ' ';
|
||||||
}
|
}
|
||||||
|
return g_unichar_tolower(c);
|
||||||
while (j < i)
|
|
||||||
str[j++] = '\0';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
gchar *string_lowercase(const gchar *str)
|
gchar *string_lowercase(const gchar *str)
|
||||||
{
|
{
|
||||||
gchar *res = g_ascii_strdown(str, -1);
|
gchar *res = g_utf8_normalize(str, -1, G_NORMALIZE_DEFAULT);
|
||||||
|
gchar *i, *j = res;
|
||||||
|
gunichar c;
|
||||||
|
|
||||||
g_strdelimit(res, "\\/,;()_-~+\" ", ' ');
|
for (i = res; *i != '\0'; i = g_utf8_next_char(i)) {
|
||||||
_drop_special(res);
|
c = __string_get_char(res, i, j);
|
||||||
g_strstrip(res);
|
if (c) {
|
||||||
|
*j = c;
|
||||||
|
j = g_utf8_next_char(j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*j = '\0';
|
||||||
|
|
||||||
|
g_strchomp(res);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ static void test_addition()
|
||||||
do_test_add("Believe in your strengths... Believe...",
|
do_test_add("Believe in your strengths... Believe...",
|
||||||
"believe in your strengths believe");
|
"believe in your strengths believe");
|
||||||
do_test_add("Tingle! Tingle! Kooloo-Limpah!",
|
do_test_add("Tingle! Tingle! Kooloo-Limpah!",
|
||||||
"tingle tingle kooloo limpah");
|
"tingle tingle kooloolimpah");
|
||||||
do_test_add("Well excuse me, Princess!",
|
do_test_add("Well excuse me, Princess!",
|
||||||
"well excuse me princess");
|
"well excuse me princess");
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,19 +55,16 @@ void test_sec2str_long()
|
||||||
|
|
||||||
void test_lowercase()
|
void test_lowercase()
|
||||||
{
|
{
|
||||||
str_test_equal(string_lowercase(" "), "");
|
str_test_equal(string_lowercase(""), "");
|
||||||
str_test_equal(string_lowercase(" test \
|
str_test_equal(string_lowercase(" "), "");
|
||||||
text"), "test text");
|
str_test_equal(string_lowercase("5:02 PM"), "502 pm");
|
||||||
str_test_equal(string_lowercase("test/text"), "test text");
|
str_test_equal(string_lowercase("T.N.T."), "tnt");
|
||||||
str_test_equal(string_lowercase("Test, Text"), "test text");
|
str_test_equal(string_lowercase("#1 Zero"), "1 zero");
|
||||||
str_test_equal(string_lowercase("Test? Text!"), "test text");
|
str_test_equal(string_lowercase("Don't Stop"), "dont stop");
|
||||||
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456");
|
str_test_equal(string_lowercase("100,000 Years"), "100000 years");
|
||||||
str_test_equal(string_lowercase("Test?123 Text!456"), "test123 text456");
|
str_test_equal(string_lowercase("Les Misérable"), "les miserable");
|
||||||
str_test_equal(string_lowercase("Test(text);123-456"), "test text 123 456");
|
str_test_equal(string_lowercase("Kryptonite (live)"), "kryptonite live");
|
||||||
str_test_equal(string_lowercase("Test((text));;123--456"), "test text 123 456");
|
str_test_equal(string_lowercase("This Time [Hidden]"), "this time hidden");
|
||||||
str_test_equal(string_lowercase("! __test(TEXT) + __test(\"TEXT\")"),
|
|
||||||
"test text test text");
|
|
||||||
str_test_equal(string_lowercase("Test~tEXt\\123"), "test text 123");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void test_compare()
|
void test_compare()
|
||||||
|
|
Loading…
Reference in New Issue