diff --git a/emmental/format.py b/emmental/format.py
index 16b5dc6..7a842f5 100644
--- a/emmental/format.py
+++ b/emmental/format.py
@@ -1,5 +1,8 @@
 # Copyright 2022 (c) Anna Schumaker
 """Helper functions for formatting strings."""
+import re
+
+IGNORE_WORDS = set(["a", "an", "the", ""])
 
 
 def search(input: str) -> str | None:
@@ -19,3 +22,14 @@ def search(input: str) -> str | None:
         input += "*"
 
     return input
+
+
+def sort_key(input: str) -> tuple:
+    """Translate the input string into a sort key."""
+    if len(input) == 0:
+        return ()
+    input = re.sub(r"[\"\'’“”]", "", input.casefold())
+    res = re.split(r"[ /_-]", input)
+    if len(res) > 1 and res[0] in IGNORE_WORDS:
+        res = res[1:]
+    return tuple(res)
diff --git a/tests/test_format.py b/tests/test_format.py
index 14318ab..afdc3f3 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -18,3 +18,30 @@ class TestFormatter(unittest.TestCase):
         self.assertEqual(format.search("*Test$"), "*test")
         self.assertEqual(format.search("^"), "*")
         self.assertEqual(format.search("$"), "*")
+
+    def test_ignore_words(self):
+        """Test words that aren't included at the start of the sort key."""
+        self.assertSetEqual(format.IGNORE_WORDS, set(["a", "an", "the", ""]))
+
+    def test_sort_key(self):
+        """Test translating a string into a sort key."""
+        self.assertEqual(format.sort_key(""), ())
+        self.assertEqual(format.sort_key("Test Text"), ("test", "text"))
+        self.assertEqual(format.sort_key("Tést Téxt"), ("tést", "téxt"))
+        self.assertEqual(format.sort_key("A Test"), ("test",))
+        self.assertEqual(format.sort_key("A"), ("a",))
+        self.assertEqual(format.sort_key("An Extra Test"), ("extra", "test",))
+        self.assertEqual(format.sort_key("An"), ("an",))
+        self.assertEqual(format.sort_key("The Test"), ("test",))
+        self.assertEqual(format.sort_key("The"), ("the",))
+        self.assertEqual(format.sort_key("Test The"), ("test", "the"))
+        self.assertEqual(format.sort_key("Test-Text"), ("test", "text"))
+        self.assertEqual(format.sort_key("Test_Text"), ("test", "text"))
+        self.assertEqual(format.sort_key("\"Test\" Text"), ("test", "text"))
+        self.assertEqual(format.sort_key("“Test” Text"), ("test", "text"))
+        self.assertEqual(format.sort_key("'Test' Text"), ("test", "text"))
+        self.assertEqual(format.sort_key("Fish N’ Chips"),
+                         ("fish", "n", "chips"))
+        self.assertEqual(format.sort_key("ac/dc"), ("ac", "dc"))
+        self.assertEqual(format.sort_key("/a/B/c/D"),
+                         ("a", "b", "c", "d"))