Skip to content

Commit

Permalink
fix(itn): 七八公斤 => 7~8kg, 七八百块 => 700~800块 (#116)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingchensong authored Oct 12, 2023
1 parent 75ac443 commit fc1b2ca
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 0 deletions.
1 change: 1 addition & 0 deletions itn/chinese/data/measure/units_en.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
千卡 kcal
千克力 kgf
千克 kg
公斤 kg
千赫兹 khz
平方千米 km²
公里 km
Expand Down
14 changes: 14 additions & 0 deletions itn/chinese/rules/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ def build_tagger(self):
(number + accep('亿') + delete('零').ques).ques + number)
# 负的xxx 1.11, 1.01
number = sign.ques + number + (dot + digits.plus).ques
# approximate ranges: 五六万 => 50000~60000, 三五千 => 3000~5000,
# 六七百 => 600~700, 三四十 => 30~40
number |= add_weight(
(digit + insert("0~") + digit + cross("十", "0")) |
(digit + insert("00~") + digit + cross("百", "00")) |
(digit + insert("000~") + digit + cross("千", "000")) |
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
)
self.number = number.optimize()

# 十/百/千/万
Expand All @@ -87,6 +94,13 @@ def build_tagger(self):
(number_exclude_0_to_9 | digits) +
(dot + digits.plus).plus
)
# approximate ranges: 五六万 => 50000~60000, 三五千 => 3000~5000,
# 六七百 => 600~700, 三四十 => 30~40
number_exclude_0_to_9 |= add_weight(
(digit + insert("0~") + digit + cross("十", "0")) |
(digit + insert("00~") + digit + cross("百", "00")) |
(digit + insert("000~") + digit + cross("千", "000")) |
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
)
self.number_exclude_0_to_9 = (sign.ques + number_exclude_0_to_9).optimize() # noqa

# cardinal string like 127.0.0.1, used in ID, IP, etc.
Expand Down
3 changes: 3 additions & 0 deletions itn/chinese/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, exclude_one=True, enable_0_to_9=True):
def build_tagger(self):
units_en = string_file('itn/chinese/data/measure/units_en.tsv')
units_zh = string_file('itn/chinese/data/measure/units_zh.tsv')
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
to = cross('到', '~') | cross('到百分之', '~')

Expand All @@ -48,6 +49,8 @@ def build_tagger(self):

# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
measure = number + (to + number).ques + units
# two adjacent digits before a unit form a range: 七八公斤 => 7~8kg
measure |= add_weight(digit + insert("~") + digit + units, -1.0)
tagger = insert('value: "') + (measure | percent) + insert('"')

# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h
Expand Down
5 changes: 5 additions & 0 deletions itn/chinese/test/data/cardinal.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
幺幺零 => 110
幺二七点零点零点幺 => 127.0.0.1
这是手机一八五四四一三九一二一 => 这是手机18544139121
三五百 => 300~500
三五千 => 3000~5000
三五万 => 30000~50000
三四万 => 30000~40000
五六十 => 50~60
5 changes: 5 additions & 0 deletions itn/chinese/test/data/measure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,8 @@
百分之三十一到百分之百 => 31~100%
十一到一百千米每小时 => 11~100km/h
每小时三十到三百一十一千米 => 30~311km/h
七八公斤 => 7~8kg
五六十块钱 => 50~60块钱
三五百公里 => 300~500km
八九千美元 => $8000~9000
三四万吨 => 30000~40000吨

0 comments on commit fc1b2ca

Please sign in to comment.