-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Convert punctuation into Fullwidth near the CJK (#6)
* Auto correct punctuation into fullwidth. * Update document
- Loading branch information
Showing
10 changed files
with
114 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
package autocorrect | ||
|
||
import ( | ||
"regexp" | ||
) | ||
|
||
var ( | ||
fullwidthMaps = map[string]string{ | ||
",": ",", | ||
".": "。", | ||
";": ";", | ||
":": ":", | ||
"!": "!", | ||
"?": "?", | ||
"~": "~", | ||
// "(": "(", | ||
// ")": ")", | ||
} | ||
|
||
spcialPunctuations = `[.:]` | ||
normalPunctuations = `[,;\!\?~]` | ||
|
||
punctuationWithLeftCJKRe = regexp.MustCompile(normalPunctuations + `[` + cjk + `]+`) | ||
punctuationWithRightCJKRe = regexp.MustCompile(`[` + cjk + `]+` + normalPunctuations) | ||
punctuationWithSpeicalCJKRe = regexp.MustCompile(`[` + cjk + `]+` + spcialPunctuations + `[` + cjk + `]+`) | ||
punctuationWithSpeicalLastCJKRe = regexp.MustCompile(`[` + cjk + `]+` + spcialPunctuations + "$") | ||
punctuationsRe = regexp.MustCompile(`(` + spcialPunctuations + `|` + normalPunctuations + `)`) | ||
) | ||
|
||
// fullwidth correct punctuations near the CJK chars | ||
func fullwidth(text string) (out string) { | ||
out = text | ||
|
||
out = punctuationWithLeftCJKRe.ReplaceAllStringFunc(out, fullwidthReplacePart) | ||
out = punctuationWithRightCJKRe.ReplaceAllStringFunc(out, fullwidthReplacePart) | ||
out = punctuationWithSpeicalCJKRe.ReplaceAllStringFunc(out, fullwidthReplacePart) | ||
out = punctuationWithSpeicalLastCJKRe.ReplaceAllStringFunc(out, fullwidthReplacePart) | ||
|
||
return | ||
} | ||
|
||
func fullwidthReplacePart(part string) string { | ||
part = punctuationsRe.ReplaceAllStringFunc(part, func(str string) string { | ||
str = fullwidthMaps[str] | ||
return str | ||
}) | ||
|
||
return part | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package autocorrect | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func Test_fullwidth(t *testing.T) { | ||
cases := map[string]string{ | ||
"你好,这是一个句子.": "你好,这是一个句子。", | ||
"刚刚买了一部iPhone,好开心!": "刚刚买了一部 iPhone,好开心!", | ||
"蚂蚁集团上市后有多大的上涨空间?": "蚂蚁集团上市后有多大的上涨空间?", | ||
"我们需要一位熟悉 JavaScript、HTML5,至少理解一种框架(如 Backbone.js、AngularJS、React 等)的前端开发者.": "我们需要一位熟悉 JavaScript、HTML5,至少理解一种框架 (如 Backbone.js、AngularJS、React 等) 的前端开发者。", | ||
"蚂蚁疾奔:蚂蚁集团两地上市~全速推进!": "蚂蚁疾奔:蚂蚁集团两地上市~全速推进!", | ||
"蚂蚁集团是阿里巴巴(BABA.N)旗下金融科技子公司": "蚂蚁集团是阿里巴巴 (BABA.N) 旗下金融科技子公司", | ||
"Dollar的演示 $阿里巴巴.US$ 股票标签": "Dollar 的演示 $阿里巴巴.US$ 股票标签", | ||
} | ||
|
||
for source, exptected := range cases { | ||
actual := Format(source) | ||
assertEqual(t, exptected, actual) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters