ICU-13219 add -u-dx- support to BreakIterator

unicode-org · Nov 16, 2023 · a7b3d95 · a7b3d95
1 parent 511e5ef
commit a7b3d95
Show file tree

Hide file tree

Showing 11 changed files with 260 additions and 32 deletions.
diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
@@ -1,4 +1,5 @@
 // © 2016 and later: Unicode, Inc. and others.
+//
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 *******************************************************************************
@@ -55,7 +56,7 @@ U_NAMESPACE_BEGIN
 // -------------------------------------
 
 BreakIterator*
-BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
+BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status)
 {
     char fnbuff[256];
     char ext[4]={'\0'};
@@ -116,8 +117,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
         return nullptr;
     }
 
-    // Create a RuleBasedBreakIterator
-    result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
+    {
+        const char* dxs = nullptr;
+        CharString dxsValue; // keep on the stack till we no longer need dxs.
+        // If it is word or line instance, try to get the value for dx
+        if (checkDX) {
+            UErrorCode dxsStatus = U_ZERO_ERROR;
+            CharStringByteSink dxsSink(&dxsValue);
+            loc.getKeywordValue("dx", dxsSink, dxsStatus);
+            if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) {
+                dxs = dxsValue.data();
+            }
+        }
+
+        // Create a RuleBasedBreakIterator
+        result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status);
+    }
 
     // If there is a result, set the valid locale and actual locale, and the kind
     if (U_SUCCESS(status) && result != nullptr) {
@@ -421,14 +436,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_CHARACTER:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
-            result = BreakIterator::buildInstance(loc, "grapheme", status);
+            result = BreakIterator::buildInstance(loc, "grapheme", false, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
     case UBRK_WORD:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
-            result = BreakIterator::buildInstance(loc, "word", status);
+            result = BreakIterator::buildInstance(loc, "word", true, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
@@ -454,7 +469,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
                     uprv_strcat(lb_lw, value.data());
                 }
             }
-            result = BreakIterator::buildInstance(loc, lb_lw, status);
+            result = BreakIterator::buildInstance(loc, lb_lw, true, status);
 
             UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
             UTRACE_EXIT_STATUS(status);
@@ -463,7 +478,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_SENTENCE:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
-            result = BreakIterator::buildInstance(loc, "sentence", status);
+            result = BreakIterator::buildInstance(loc, "sentence", false, status);
 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
             char ssKeyValue[kKeyValueLenMax] = {0};
             UErrorCode kvStatus = U_ZERO_ERROR;
@@ -482,7 +497,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_TITLE:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
-            result = BreakIterator::buildInstance(loc, "title", status);
+            result = BreakIterator::buildInstance(loc, "title", false, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;

diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
@@ -25,6 +25,7 @@
 #include "unicode/uchriter.h"
 #include "unicode/uclean.h"
 #include "unicode/udata.h"
+#include "unicode/uniset.h"
 
 #include "brkeng.h"
 #include "ucln_cmn.h"
@@ -89,9 +90,37 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
+                                               const char* dxs,
         UErrorCode &status) : RuleBasedBreakIterator(udm, status)
 {
+    // Delegates to the (udm, status) constructor, records the phrase-breaking flag
+    // and, when dxs is non-null, converts it into the fDX exclusion set.
+    //   dxs    nullptr, or 4-letter script codes joined by '-' (e.g. "hani-thai").
+    //   status set to U_ILLEGAL_ARGUMENT_ERROR when dxs is malformed, and to
+    //          U_MEMORY_ALLOCATION_ERROR when the set cannot be allocated.
     fIsPhraseBreaking = isPhraseBreaking;
+    if (U_FAILURE(status) || dxs == nullptr) {
+        return;
+    }
+    const size_t length = uprv_strlen(dxs);
+    // The value should be a list of 4 letter script codes joined by '-',
+    // i.e. 5 * k + 4 characters for k + 1 codes.
+    if (length % 5 != 4) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    const size_t items = 1 + length / 5;
+    // The length test alone would accept e.g. "thaiXarab"; require a real '-'
+    // after every code except the last one.
+    for (size_t i = 0; i + 1 < items; i++) {
+        if (dxs[i * 5 + 4] != '-') {
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+    }
+    // Change from "thai" to "[:thai:]" or
+    // "thai-arab" to "[[:thai:][:arab:]]"
+    UnicodeString udxs;
+    if (items > 1) {
+        udxs.append(u'[');
+    }
+    for (size_t i = 0; i < items; i++) {
+        udxs.append(u"[:", -1);
+        udxs.append(UnicodeString(dxs + i * 5, 4, US_INV));
+        udxs.append(u":]", -1);
+    }
+    if (items > 1) {
+        udxs.append(u']');
+    }
+    fDX = new UnicodeSet(udxs, status);
+    // A null result with a successful status means the allocation itself failed.
+    if (fDX == nullptr && U_SUCCESS(status)) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
 }
 
 //
@@ -198,7 +227,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator()
  * Simple Constructor with an error code.
  * Handles common initialization for all other constructors.
  */
-RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
+RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) {
     UErrorCode ec = U_ZERO_ERROR;
     if (status == nullptr) {
         status = &ec;
@@ -212,6 +241,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
     }
     fDictionaryCache = lpDictionaryCache.orphan();
     fBreakCache = lpBreakCache.orphan();
+    fDX = nullptr;
 
 #ifdef RBBI_DEBUG
     static UBool debugInitDone = false;
@@ -261,6 +291,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
     delete fDictionaryCache;
     fDictionaryCache = nullptr;
 
+    delete fDX;
+    fDX = nullptr;
+
     delete fLanguageBreakEngines;
     fLanguageBreakEngines = nullptr;
 
@@ -333,6 +366,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
     //       the assumption that the current position is on a rule boundary.
     fBreakCache->reset(fPosition, fRuleStatusIndex);
     fDictionaryCache->reset();
+    // Clone the source's exclusion set before releasing the one owned here, so the
+    // previously held set is not leaked and self-assignment stays safe.
+    UnicodeSet* dxCopy = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed();
+    delete fDX;
+    fDX = dxCopy;
 
     return *this;
 }
@@ -381,11 +415,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
         return false;
     }
 
+    // The exclusion sets must agree: either neither iterator has one, or both do
+    // and the sets compare equal. Check for nullptr before dereferencing so that
+    // an iterator with a set can be compared against one without.
+    if (fDX == nullptr || that2.fDX == nullptr) {
+        if (fDX != that2.fDX) {
+            return false;
+        }
+    } else if (!(*that2.fDX == *fDX)) {
+        return false;
+    }
     if (that2.fData == fData ||
         (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
             // The two break iterators are using the same rules.
             return true;
-        }
+    }
     return false;
 }
 
@@ -1298,6 +1336,10 @@ RuleBasedBreakIterator::getRules() const {
     }
 }
 
+// Reports whether code point c belongs to the dictionary break exclusion set.
+// Always false when no exclusion set (fDX) has been configured.
+bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) {
+    if (fDX == nullptr) {
+        return false;
+    }
+    return fDX->contains(c);
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp
@@ -156,17 +156,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
             break;
         }
 
-        // We now have a dictionary character. Get the appropriate language object
-        // to deal with it.
-        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
-            c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
-
-        // Ask the language object if there are any breaks. It will add them to the cache and
-        // leave the text pointer on the other side of its range, ready to search for the next one.
-        if (lbe != nullptr) {
-            foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+        // We now have a dictionary character.
+        // Handle dx (Dictionary break script exclusions) first if needed
+        if (fBI->excludedFromDictionaryBreak(c)) {
+            utext_next32(text);
+        } else {
+            // Get the appropriate language object to deal with it.
+            const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
+                c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
+
+            // Ask the language object if there are any breaks. It will add them to the cache and
+            // leave the text pointer on the other side of its range, ready to search for the next one.
+            if (lbe != nullptr) {
+                foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+            }
         }
-
         // Reload the loop variables for the next go-round
         c = utext_current32(text);
         category = ucptrie_get(fBI->fData->fTrie, c);

diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h
@@ -623,7 +623,7 @@ class U_COMMON_API BreakIterator : public UObject {
     virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
 
  private:
-    static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
+    static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status);
     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
     static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
 

diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
@@ -17,6 +17,7 @@
 #define RBBI_H
 
 #include "unicode/utypes.h"
+#include "unicode/uniset.h"
 
 #if U_SHOW_CPLUSPLUS_API
 
@@ -42,6 +43,7 @@ struct RBBIDataHeader;
 class  RBBIDataWrapper;
 class  UnhandledEngine;
 class  UStack;
+class  UnicodeSet;
 
 
 #ifndef U_HIDE_DRAFT_API
@@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      */
     UBool fIsPhraseBreaking = false;
 
+    /**
+     * A UnicodeSet for Dictionary Break Exclusion.
+     */
+    UnicodeSet* fDX = nullptr;
+private:
+
     //=======================================================================
     // constructors
     //=======================================================================
@@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      *        which will be responsible for closing it when it is no longer needed.
      * @param status Information on any errors encountered.
      * @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
+     * @param dxs nullptr, or the "Dictionary break script exclusions" value: 4-letter script codes joined by '-'.
      * @see udata_open
      * @see #getBinaryRules
      * @internal (private)
      */
-    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
+    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status);
 
     /** @internal */
     friend class RBBIRuleBuilder;
@@ -766,6 +775,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      * signature)
      */
 
+    /*
+     * Check whether the character should be excluded from dictionary-based text breaking.
+     * @internal (private)
+     */
+    bool excludedFromDictionaryBreak(int32_t c);
+
     typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
 
     template<typename RowType, PTrieFunc trieFunc>

diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
@@ -106,6 +106,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
     TESTCASE_AUTO(TestLineBreaks);
     TESTCASE_AUTO(TestSentBreaks);
     TESTCASE_AUTO(TestExtended);
+    TESTCASE_AUTO(TestDXLineBreaks);
+    TESTCASE_AUTO(TestDXWordBreaks);
 #endif
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
     TESTCASE_AUTO(TestMonkey);
@@ -3900,6 +3902,66 @@ void RBBITest::TestLineBreaks()
 #endif
 }
 
+void RBBITest::TestDXLineBreaks()
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    // Line break iterator created from a locale whose "dx" keyword lists the
+    // Han and Thai scripts.
+    UErrorCode status = U_ZERO_ERROR;
+    Locale locale("ja-u-dx-hani-thai");
+    std::unique_ptr<BreakIterator> bi(BreakIterator::createLineInstance(locale, status));
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
+    bi->setText(text);
+
+    // Gather every boundary, beginning with the one reported by first().
+    std::vector<int32_t> actuals;
+    actuals.push_back(bi->first());
+    for (int32_t pos = bi->next(); pos != BreakIterator::DONE; pos = bi->next()) {
+        actuals.push_back(pos);
+    }
+
+    // Expected: a boundary after each Han character, and the Thai run 16..32 unsplit.
+    const std::vector<int32_t> expected{ 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32 };
+    assertEquals(WHERE,
+                 static_cast<int32_t>(expected.size()),
+                 static_cast<int32_t>(actuals.size()));
+    // Element-wise comparison only makes sense when the counts agree.
+    if (expected.size() != actuals.size()) {
+        return;
+    }
+    for (size_t idx = 0; idx < expected.size(); ++idx) {
+        assertEquals(WHERE, expected[idx], actuals[idx]);
+    }
+#endif
+}
+
+void RBBITest::TestDXWordBreaks()
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    // Word break iterator created from a locale whose "dx" keyword lists the
+    // Han and Thai scripts.
+    UErrorCode status = U_ZERO_ERROR;
+    Locale locale("ja-u-dx-hani-thai");
+    std::unique_ptr<BreakIterator> bi(BreakIterator::createWordInstance(locale, status));
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
+    bi->setText(text);
+
+    // Gather every boundary, beginning with the one reported by first().
+    std::vector<int32_t> actuals;
+    actuals.push_back(bi->first());
+    for (int32_t pos = bi->next(); pos != BreakIterator::DONE; pos = bi->next()) {
+        actuals.push_back(pos);
+    }
+
+    // Expected: the Han run 6..16 and the Thai run 16..32 each form a single segment.
+    const std::vector<int32_t> expected{ 0, 5, 6, 16, 32 };
+    assertEquals(WHERE,
+                 static_cast<int32_t>(expected.size()),
+                 static_cast<int32_t>(actuals.size()));
+    // Element-wise comparison only makes sense when the counts agree.
+    if (expected.size() != actuals.size()) {
+        return;
+    }
+    for (size_t idx = 0; idx < expected.size(); ++idx) {
+        assertEquals(WHERE, expected[idx], actuals[idx]);
+    }
+#endif
+}
+
 void RBBITest::TestSentBreaks()
 {
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
@@ -99,6 +99,8 @@ class RBBITest: public IntlTest {
     void TestExternalBreakEngineWithFakeTaiLe();
     void TestExternalBreakEngineWithFakeYue();
 
+    void TestDXLineBreaks();
+    void TestDXWordBreaks();
 #if U_ENABLE_TRACING
     void TestTraceCreateCharacter();
     void TestTraceCreateWord();

diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
@@ -17,6 +17,7 @@
 import com.ibm.icu.impl.CSCharacterIterator;
 import com.ibm.icu.impl.CacheValue;
 import com.ibm.icu.impl.ICUDebug;
+import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.util.ICUCloneNotSupportedException;
 import com.ibm.icu.util.ULocale;