Skip to content

Commit

Permalink
ICU-22956 Use InCB for grapheme cluster segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Nov 12, 2024
1 parent 700c5e3 commit 0b9eb9c
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 88 deletions.
14 changes: 5 additions & 9 deletions icu4c/source/data/brkitr/rules/char.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
# From cldr/common/properties/segments/
# and issue CLDR-10994
#
$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];
$InCBConsonant = [\p{InCB=Consonant}];
$InCBExtend = [\p{InCB=Extend}];
$InCBLinker = [\p{InCB=Linker}];

# Korean Syllable Definitions
#
Expand Down Expand Up @@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT);
# GB 9b
$Prepend [^$Control $CR $LF];

# GB 9.3, from CLDR-10994
$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;
# GB 9c
$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant;

# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;
Expand Down
41 changes: 20 additions & 21 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1655,9 +1655,9 @@ class RBBICharMonkey: public RBBIMonkeyKind {
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fExtendedPictSet;
UnicodeSet *fViramaSet;
UnicodeSet *fLinkingConsonantSet;
UnicodeSet *fExtCccZwjSet;
UnicodeSet *fInCBLinkerSet;
UnicodeSet *fInCBConsonantSet;
UnicodeSet *fInCBExtendSet;
UnicodeSet *fAnySet;

const UnicodeString *fText;
Expand Down Expand Up @@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() {
fHangulSet->addAll(*fLVTSet);

fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
"\\p{Indic_Syllabic_Category=Virama}]", status);
fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
"\\p{Indic_Syllabic_Category=Consonant}]", status);
fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status);
fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status);
fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status);
fAnySet = new UnicodeSet(0, 0x10ffff);

// Create sets of characters, and add the names of the above character sets.
Expand All @@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() {
sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul");
sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ");
sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict");
sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama");
sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant");
sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj");
sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker");
sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant");
sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend");
sets.emplace_back(*fAnySet); classNames.emplace_back("Any");

if (U_FAILURE(status)) {
Expand Down Expand Up @@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
continue;
}

// Note: Viramas are also included in the ExtCccZwj class.
if (fLinkingConsonantSet->contains(c2)) {
if (fInCBConsonantSet->contains(c2)) {
int pi = p1;
bool sawVirama = false;
while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
if (fViramaSet->contains(fText->char32At(pi))) {
while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) ||
fInCBLinkerSet->contains(fText->char32At(pi)))) {
if (fInCBLinkerSet->contains(fText->char32At(pi))) {
sawVirama = true;
}
pi = fText->moveIndex32(pi, -1);
}
if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
continue;
if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) {
setAppliedRule(
p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})");
continue;
}
}

Expand Down Expand Up @@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() {
delete fAnySet;
delete fZWJSet;
delete fExtendedPictSet;
delete fViramaSet;
delete fLinkingConsonantSet;
delete fExtCccZwjSet;
delete fInCBLinkerSet;
delete fInCBConsonantSet;
delete fInCBExtendSet;
}

//------------------------------------------------------------------------------------------
Expand Down
16 changes: 7 additions & 9 deletions icu4c/source/test/testdata/break_rules/grapheme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];

Control = [[\p{Grapheme_Cluster_Break = Control}]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
Expand All @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
Extended_Pict = [:ExtPict:];

# Indic Sequences
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];

LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];

ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
InCBLinker = [\p{InCB=Linker}];
InCBConsonant = [\p{InCB=Consonant}];
InCBExtend = [\p{InCB=Extend}];

GB3: CR LF;
GB4: (Control | CR | LF) ÷;
Expand All @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;

GB11: Extended_Pict Extend* ZWJ Extended_Pict;
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
GB9: . (Extend | ZWJ);
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
GB9: . (Extend_ | ZWJ);

GB9a: . SpacingMark;
GB9b: Prepend .;
Expand Down
15 changes: 3 additions & 12 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,18 +169,9 @@
#
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>

#
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
# Sample Chars: LinkingConsonant: \u0915
# Virama: \u094d [also Extend]
# ExtCccZWJ: \u0308
# Extend but not ExtCCCZWJ \u093A
<char>
<data>•\u0915\u094d\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
# This test would have caught ICU-22956.
<data>•સૻ્સૻ•</data>

#
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
UnicodeSet fHangulSet;
UnicodeSet fZWJSet;
UnicodeSet fExtendedPictSet;
UnicodeSet fViramaSet;
UnicodeSet fLinkingConsonantSet;
UnicodeSet fExtCccZwjSet;
UnicodeSet fInCBLinkerSet;
UnicodeSet fInCBConsonantSet;
UnicodeSet fInCBExtendSet;
UnicodeSet fAnySet;


Expand Down Expand Up @@ -176,11 +176,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
fHangulSet.addAll(fLVTSet);

fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+ "\\p{Indic_Syllabic_Category=Virama}]");
fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+ "\\p{Indic_Syllabic_Category=Consonant}]");
fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]");
fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]");
fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]");
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");


Expand All @@ -196,9 +194,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
fSets.add(fAnySet); fClassNames.add("Any");
fSets.add(fZWJSet); fClassNames.add("ZWJ");
fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict");
fSets.add(fViramaSet); fClassNames.add("Virama");
fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant");
fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj");
fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker");
fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant");
fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend");
}


Expand Down Expand Up @@ -315,17 +313,18 @@ int next(int prevPos) {
}

// GB9c: scan backwards over InCB=Extend and InCB=Linker characters, noting whether a Linker was seen.
if (fLinkingConsonantSet.contains(c2)) {
if (fInCBConsonantSet.contains(c2)) {
int pi = p1;
boolean sawVirama = false;
while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
if (fViramaSet.contains(fText.codePointAt(pi))) {
while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) ||
fInCBLinkerSet.contains(fText.codePointAt(pi)))) {
if (fInCBLinkerSet.contains(fText.codePointAt(pi))) {
sawVirama = true;
}
pi = fText.offsetByCodePoints(pi, -1);
}
if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) {
setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant})");
continue;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];

Control = [[\p{Grapheme_Cluster_Break = Control}]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
Expand All @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
Extended_Pict = [:ExtPict:];

# Indic Sequences
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];

LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];

ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
InCBLinker = [\p{InCB=Linker}];
InCBConsonant = [\p{InCB=Consonant}];
InCBExtend = [\p{InCB=Extend}];

GB3: CR LF;
GB4: (Control | CR | LF) ÷;
Expand All @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;

GB11: Extended_Pict Extend* ZWJ Extended_Pict;
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
GB9: . (Extend | ZWJ);
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
GB9: . (Extend_ | ZWJ);

GB9a: . SpacingMark;
GB9b: Prepend .;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,18 +169,9 @@
#
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>

#
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
# Sample Chars: LinkingConsonant: \u0915
# Virama: \u094d [also Extend]
# ExtCccZWJ: \u0308
# Extend but not ExtCCCZWJ \u093A
<char>
<data>•\u0915\u094d\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
# This test would have caught ICU-22956.
<data>•સૻ્સૻ•</data>

#
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
Expand Down

0 comments on commit 0b9eb9c

Please sign in to comment.