From f4b44952a3c07bd0a4d3b00955cddd50dc795cf6 Mon Sep 17 00:00:00 2001 From: Mohamed Amgd Date: Mon, 27 Nov 2023 20:29:35 +0200 Subject: [PATCH 1/4] added documentation for word to letters method --- .../seen_arabic/arabic_services/ArabicServices.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java index 0db7509..83c65b4 100644 --- a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java +++ b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java @@ -88,6 +88,17 @@ public static String tashfeer(String text) { return newSentence.toString().trim(); } + /** + * Word to letters. + * + * For example + * text: "هذه جملة" + * to + * resulting string: "هاء ذال هاء جيم ميم لام تاء_مربوطة" + * + * @param text The input string {@link String} + * @return The resulting string {@link String} + */ public static String wordToLetters(String word) { StringBuilder newWord = new StringBuilder(); From d5727d7c90256ac9d284adf8b917ed20dd605e23 Mon Sep 17 00:00:00 2001 From: Mohamed Amgd Date: Thu, 30 Nov 2023 00:48:43 +0200 Subject: [PATCH 2/4] Remove Arabic Affixes Fixes #3 --- .../arabic_services/ArabicServices.java | 17 +++ .../seen_arabic/arabic_services/Data.java | 14 +++ .../arabic_services/ArabicServicesTest.java | 117 ++++++++++++++++++ 3 files changed, 148 insertions(+) diff --git a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java index 83c65b4..2d476cd 100644 --- a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java +++ b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java @@ -124,6 +124,23 @@ public static String wordToLetters(String word) { return newWord.toString().trim(); } + public static String removeArabicAffixes(String word) { + if 
(Data.ARABIC_PREFIXES.contains(word.substring(0, 2))) { + // For: ALEF & LAM + word = word.substring(2); + } else if (Data.ARABIC_PREFIXES.contains(word.substring(0, 1))) { + word = word.substring(1); + } + + if (Data.ARABIC_SUFFIXES.contains(word.substring(word.length() - 2))) { + word = word.substring(0, word.length() - 2); + } else if (Data.ARABIC_SUFFIXES.contains(word.substring(word.length() - 1))) { + word = word.substring(0, word.length() - 1); + } + + return word.trim(); + } + private static String handleNoonIssue(String text) { String arabicLetters = String.join("", Data.LETTERS_DICT.keySet()) + "ـ"; String regex = Data.NOON + "(" + "?=[^" + arabicLetters + "]" + ")|" + Data.NOON + "\\z"; diff --git a/src/main/java/io/github/seen_arabic/arabic_services/Data.java b/src/main/java/io/github/seen_arabic/arabic_services/Data.java index fad177c..15eb726 100644 --- a/src/main/java/io/github/seen_arabic/arabic_services/Data.java +++ b/src/main/java/io/github/seen_arabic/arabic_services/Data.java @@ -11,6 +11,20 @@ class Data { static final List YAA = Arrays.asList('ى', 'ئ'); static final List WAW = Arrays.asList('ؤ'); + /** + * List of common Arabic prefixes. These prefixes are used in the + * removeArabicAffixes method + * to identify and remove them from the beginning of Arabic words. + */ + public static final List ARABIC_PREFIXES = Arrays.asList("أ", "ا", "إ", "ال", "ي", "ت", "ن", "ب"); + + /** + * List of common Arabic suffixes. These suffixes are used in the + * removeArabicAffixes method + * to identify and remove them from the end of Arabic words.
+ */ + public static final List ARABIC_SUFFIXES = Arrays.asList("ة", "ه", "ي", "ى", "ية", "ين", "ون", "هم"); + static final String TEXT_NULL_MESSAGE = "text must be not null"; static final Map LETTERS_DICT; diff --git a/src/test/java/io/github/seen_arabic/arabic_services/ArabicServicesTest.java b/src/test/java/io/github/seen_arabic/arabic_services/ArabicServicesTest.java index 567cf9e..63ea4ea 100644 --- a/src/test/java/io/github/seen_arabic/arabic_services/ArabicServicesTest.java +++ b/src/test/java/io/github/seen_arabic/arabic_services/ArabicServicesTest.java @@ -124,4 +124,121 @@ private void itShouldHandleInputWithSpaces() { String result = ArabicServices.wordToLetters(input); assertEquals("هاء ذال هاء جيم ميم لام تاء_مربوطة ألف خاء راء ألف_لينة", result); } + + @Test + public void testRemoveArabicAffixes() { + itShouldRemoveAlfPrefixFromAWord(); + itShouldRemoveAlefPrefixAndTaaSuffixFromAWord(); + itShouldRemoveAlefHamzaBelowPrefixFromAWord(); + itShouldRemoveAlPrefixFromAWord(); + itShouldRemoveYaPrefixFromAWord(); + itShouldRemoveTaPrefixFromAWord(); + itShouldRemoveNunPrefixFromAWord(); + itShouldRemoveBaPrefixFromAWord(); + itShouldRemoveTaSuffixFromAWord(); + itShouldRemoveHaSuffixFromAWord(); + itShouldRemoveYaSuffixFromAWord(); + itShouldRemoveAlefMaksuraSuffixFromAWord(); + itShouldRemoveYaAlefSuffixFromAWord(); + itShouldRemoveYaNunSuffixFromAWord(); + itShouldRemoveWawNunSuffixFromAWord(); + itShouldRemoveHumSuffixFromAWord(); + } + + private void itShouldRemoveAlfPrefixFromAWord() { + String word = "أمل"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("مل", result); + } + + private void itShouldRemoveAlefPrefixAndTaaSuffixFromAWord() { + String word = "امرأة"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("مرأ", result); + } + + private void itShouldRemoveAlefHamzaBelowPrefixFromAWord() { + String word = "إنسان"; + String result = ArabicServices.removeArabicAffixes(word); + 
assertEquals("نسان", result); + } + + private void itShouldRemoveAlPrefixFromAWord() { + String word = "الكتاب"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("كتاب", result); + } + + private void itShouldRemoveYaPrefixFromAWord() { + String word = "يوم"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("وم", result); + } + + private void itShouldRemoveTaPrefixFromAWord() { + String word = "تفاح"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("فاح", result); + } + + private void itShouldRemoveNunPrefixFromAWord() { + String word = "نجم"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("جم", result); + } + + private void itShouldRemoveBaPrefixFromAWord() { + String word = "بيت"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("يت", result); + } + + private void itShouldRemoveTaSuffixFromAWord() { + String word = "كتابة"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("كتاب", result); + } + + private void itShouldRemoveHaSuffixFromAWord() { + String word = "جديه"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("جدي", result); + } + + private void itShouldRemoveYaSuffixFromAWord() { + String word = "ذهبي"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("ذهب", result); + } + + private void itShouldRemoveAlefMaksuraSuffixFromAWord() { + String word = "منزلي"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("منزل", result); + } + + private void itShouldRemoveYaAlefSuffixFromAWord() { + String word = "علمية"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("علم", result); + } + + private void itShouldRemoveYaNunSuffixFromAWord() { + String word = "موظفين"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("موظف", result); + } + + private void 
itShouldRemoveWawNunSuffixFromAWord() { + String word = "موظفون"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("موظف", result); + } + + private void itShouldRemoveHumSuffixFromAWord() { + String word = "طلابهم"; + String result = ArabicServices.removeArabicAffixes(word); + assertEquals("طلاب", result); + } + } From d409ba4cc63d6381038d4eaa2c75740d9dd5a17b Mon Sep 17 00:00:00 2001 From: Mohamed Amgd Date: Thu, 30 Nov 2023 00:52:45 +0200 Subject: [PATCH 3/4] Added docs for Remove Arabic Affixes method --- .../arabic_services/ArabicServices.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java index 2d476cd..f86c97b 100644 --- a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java +++ b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java @@ -124,6 +124,20 @@ public static String wordToLetters(String word) { return newWord.toString().trim(); } + /** + * RemoveArabicAffixes + * + * Removes predefined affixes (prefixes and suffixes) from an Arabic word if it + * starts or ends with those affixes. + * This function is designed specifically for processing Arabic text, where + * certain affixes might need to be removed + * for linguistic, stylistic, or morphological reasons. + * + * @param word - The Arabic word from which the affixes are to be + * removed. {@link String} + * @returns The word after removing any matching affixes. Returns the + * original word if no affix matches are found. 
{@link String} + */ public static String removeArabicAffixes(String word) { if (Data.ARABIC_PREFIXES.contains(word.substring(0, 2))) { // For: ALEF & LAM From 959685c2588cbdbf37f8ad8bcad9d62ae83a28f5 Mon Sep 17 00:00:00 2001 From: Mohamed Amgd Date: Thu, 30 Nov 2023 09:41:58 +0200 Subject: [PATCH 4/4] minor fixes in docs --- .../github/seen_arabic/arabic_services/ArabicServices.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java index f86c97b..f159363 100644 --- a/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java +++ b/src/main/java/io/github/seen_arabic/arabic_services/ArabicServices.java @@ -96,7 +96,7 @@ public static String tashfeer(String text) { * to * resulting string: "هاء ذال هاء جيم ميم لام تاء_مربوطة" * - * @param text The input string {@link String} + * @param word The input string {@link String} * @return The resulting string {@link String} */ public static String wordToLetters(String word) { @@ -135,8 +135,8 @@ public static String wordToLetters(String word) { * * @param word - The Arabic word from which the affixes are to be * removed. {@link String} - * @returns The word after removing any matching affixes. Returns the - * original word if no affix matches are found. {@link String} + * @return The word after removing any matching affixes. Returns the + * original word if no affix matches are found. {@link String} */ public static String removeArabicAffixes(String word) { if (Data.ARABIC_PREFIXES.contains(word.substring(0, 2))) {