Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Arabic Affixes Fixes #3 #7

Merged
merged 4 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@ public static String tashfeer(String text) {
return newSentence.toString().trim();
}

/**
* Word to letters.
*
* For example
* text: "هذه جملة"
* to
* resulting string: "هاء ذال هاء جيم ميم لام تاء_مربوطة"
*
* @param word The input string {@link String}
* @return The resulting string {@link String}
*/
public static String wordToLetters(String word) {
StringBuilder newWord = new StringBuilder();

Expand All @@ -113,6 +124,37 @@ public static String wordToLetters(String word) {
return newWord.toString().trim();
}

/**
 * Removes at most one predefined prefix and at most one predefined suffix
 * from an Arabic word.
 *
 * <p>The prefix is handled first, then the suffix is looked up on whatever
 * remains. For both ends the two-character candidate is tried before the
 * one-character candidate, so that e.g. "ال" wins over the bare "ا".
 * Candidates are looked up in {@code Data.ARABIC_PREFIXES} and
 * {@code Data.ARABIC_SUFFIXES}.
 *
 * <p>Words that are too short to contain a given candidate are simply not
 * tested against it, so empty and one-character inputs are safe. Because
 * both ends are stripped independently, a very short word may be reduced to
 * the empty string.
 *
 * @param word the Arabic word from which the affixes are to be removed
 * @return the word after removing any matching affixes, trimmed; the original
 *         word (trimmed) if no affix matches
 * @throws NullPointerException if {@code word} is {@code null}
 */
public static String removeArabicAffixes(String word) {
    // Guard every substring call with a length check: the word may be shorter
    // than the affix being probed, either on input or after the prefix is cut.
    if (word.length() >= 2 && Data.ARABIC_PREFIXES.contains(word.substring(0, 2))) {
        // For: ALEF & LAM
        word = word.substring(2);
    } else if (!word.isEmpty() && Data.ARABIC_PREFIXES.contains(word.substring(0, 1))) {
        word = word.substring(1);
    }

    if (word.length() >= 2 && Data.ARABIC_SUFFIXES.contains(word.substring(word.length() - 2))) {
        word = word.substring(0, word.length() - 2);
    } else if (!word.isEmpty() && Data.ARABIC_SUFFIXES.contains(word.substring(word.length() - 1))) {
        word = word.substring(0, word.length() - 1);
    }

    return word.trim();
}

private static String handleNoonIssue(String text) {
String arabicLetters = String.join("", Data.LETTERS_DICT.keySet()) + "ـ";
String regex = Data.NOON + "(" + "?=[^" + arabicLetters + "]" + ")|" + Data.NOON + "\\z";
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/io/github/seen_arabic/arabic_services/Data.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@ class Data {
// Characters grouped under YAA: alef maksura (ى) and yaa with hamza above (ئ).
static final List<Character> YAA = Arrays.asList('ى', 'ئ');
// Characters grouped under WAW: waw with hamza above (ؤ).
static final List<Character> WAW = Arrays.asList('ؤ');

/**
* List of common Arabic prefixes. ArabicServices.removeArabicAffixes looks up
* the first two characters of a word in this list, then the first character,
* and strips the first match from the beginning of the word. The two-letter
* entry "ال" is therefore tried before the one-letter entries.
*
* NOTE(review): Arrays.asList returns a fixed-size list whose elements can
* still be replaced through set(); consider an unmodifiable wrapper for this
* public constant.
*/
public static final List<String> ARABIC_PREFIXES = Arrays.asList("أ", "ا", "إ", "ال", "ي", "ت", "ن", "ب");

/**
* List of common Arabic suffixes. ArabicServices.removeArabicAffixes looks up
* the last two characters of a word in this list, then the last character,
* and strips the first match from the end of the word. The two-letter entries
* ("ية", "ين", "ون", "هم") are therefore tried before the one-letter entries.
*
* NOTE(review): Arrays.asList returns a fixed-size list whose elements can
* still be replaced through set(); consider an unmodifiable wrapper for this
* public constant.
*/
public static final List<String> ARABIC_SUFFIXES = Arrays.asList("ة", "ه", "ي", "ى", "ية", "ين", "ون", "هم");

// Message text for a null text argument.
// NOTE(review): the code that reports this message is outside this view - confirm usage.
static final String TEXT_NULL_MESSAGE = "text must be not null";

static final Map<String, String> LETTERS_DICT;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,121 @@ private void itShouldHandleInputWithSpaces() {
String result = ArabicServices.wordToLetters(input);
assertEquals("هاء ذال هاء جيم ميم لام تاء_مربوطة ألف خاء راء ألف_لينة", result);
}

@Test
public void testRemoveArabicAffixes() {
    // Every scenario for removeArabicAffixes, in execution order: the eight
    // prefix cases first, then the eight suffix cases. The first failing
    // assertion aborts the remaining scenarios.
    Runnable[] scenarios = {
        this::itShouldRemoveAlfPrefixFromAWord,
        this::itShouldRemoveAlefPrefixAndTaaSuffixFromAWord,
        this::itShouldRemoveAlefHamzaBelowPrefixFromAWord,
        this::itShouldRemoveAlPrefixFromAWord,
        this::itShouldRemoveYaPrefixFromAWord,
        this::itShouldRemoveTaPrefixFromAWord,
        this::itShouldRemoveNunPrefixFromAWord,
        this::itShouldRemoveBaPrefixFromAWord,
        this::itShouldRemoveTaSuffixFromAWord,
        this::itShouldRemoveHaSuffixFromAWord,
        this::itShouldRemoveYaSuffixFromAWord,
        this::itShouldRemoveAlefMaksuraSuffixFromAWord,
        this::itShouldRemoveYaAlefSuffixFromAWord,
        this::itShouldRemoveYaNunSuffixFromAWord,
        this::itShouldRemoveWawNunSuffixFromAWord,
        this::itShouldRemoveHumSuffixFromAWord,
    };

    for (Runnable scenario : scenarios) {
        scenario.run();
    }
}

private void itShouldRemoveAlfPrefixFromAWord() {
    // Leading alef with hamza above (أ) is a one-letter prefix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("أمل");
    assertEquals("مل", stripped);
}

private void itShouldRemoveAlefPrefixAndTaaSuffixFromAWord() {
    // Both ends are stripped in one call: bare alef (ا) in front, taa marbuta (ة) at the back.
    String stripped = ArabicServices.removeArabicAffixes("امرأة");
    assertEquals("مرأ", stripped);
}

private void itShouldRemoveAlefHamzaBelowPrefixFromAWord() {
    // Leading alef with hamza below (إ) is a one-letter prefix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("إنسان");
    assertEquals("نسان", stripped);
}

private void itShouldRemoveAlPrefixFromAWord() {
    // The two-letter prefix "ال" is removed as a whole, not just its leading alef.
    String stripped = ArabicServices.removeArabicAffixes("الكتاب");
    assertEquals("كتاب", stripped);
}

private void itShouldRemoveYaPrefixFromAWord() {
    // Leading yaa (ي) is a one-letter prefix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("يوم");
    assertEquals("وم", stripped);
}

private void itShouldRemoveTaPrefixFromAWord() {
    // Leading taa (ت) is a one-letter prefix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("تفاح");
    assertEquals("فاح", stripped);
}

private void itShouldRemoveNunPrefixFromAWord() {
    // Leading noon (ن) is a one-letter prefix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("نجم");
    assertEquals("جم", stripped);
}

private void itShouldRemoveBaPrefixFromAWord() {
    // Leading baa (ب) is a one-letter prefix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("بيت");
    assertEquals("يت", stripped);
}

private void itShouldRemoveTaSuffixFromAWord() {
    // Trailing taa marbuta (ة) is a one-letter suffix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("كتابة");
    assertEquals("كتاب", stripped);
}

private void itShouldRemoveHaSuffixFromAWord() {
    // Trailing haa (ه) is a one-letter suffix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("جديه");
    assertEquals("جدي", stripped);
}

private void itShouldRemoveYaSuffixFromAWord() {
    // Trailing yaa (ي) is a one-letter suffix and gets cut.
    String stripped = ArabicServices.removeArabicAffixes("ذهبي");
    assertEquals("ذهب", stripped);
}

private void itShouldRemoveAlefMaksuraSuffixFromAWord() {
    // The fixture must end in alef maksura (ى, U+0649). The previous fixture
    // ended in yaa (ي, U+064A), so it repeated the yaa-suffix scenario and the
    // "ى" entry of the suffix list was never exercised.
    String word = "مستشفى";
    String result = ArabicServices.removeArabicAffixes(word);
    assertEquals("مستشف", result);
}

private void itShouldRemoveYaAlefSuffixFromAWord() {
    // Despite "YaAlef" in the method name, the suffix exercised here is the
    // two-letter "ية" (yaa followed by taa marbuta), removed as a whole.
    String stripped = ArabicServices.removeArabicAffixes("علمية");
    assertEquals("علم", stripped);
}

private void itShouldRemoveYaNunSuffixFromAWord() {
    // The two-letter suffix "ين" is removed as a whole.
    String stripped = ArabicServices.removeArabicAffixes("موظفين");
    assertEquals("موظف", stripped);
}

private void itShouldRemoveWawNunSuffixFromAWord() {
    // The two-letter suffix "ون" is removed as a whole.
    String stripped = ArabicServices.removeArabicAffixes("موظفون");
    assertEquals("موظف", stripped);
}

private void itShouldRemoveHumSuffixFromAWord() {
    // The two-letter suffix "هم" is removed as a whole.
    String stripped = ArabicServices.removeArabicAffixes("طلابهم");
    assertEquals("طلاب", stripped);
}

}