-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathwords.py
89 lines (78 loc) · 2.55 KB
/
words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
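@File : words.py
@Time : 2019/12/22 8:48 PM
@Author : yizuotian
@Description : Character sets (digits, letters, Chinese characters, punctuation, currency symbols) exposed through the Word class.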
"""
import codecs
import os
class Word(object):
    """Provider of the character sets used by the project.

    The class methods return fixed character groups (digits, ASCII letters,
    Chinese characters, punctuation, currency symbols).  ``get_all_words``
    returns the complete character string read from ``all_words.txt``.
    """

    def __init__(self,
                 chinese_word=True,
                 alphabet=True,
                 digit=True,
                 punctuation=True,
                 currency=True
                 ):
        """
        Store the per-group selection flags on the instance.

        Note: ``get_all_words`` currently reads a prebuilt file and does not
        consult these flags.

        :param chinese_word: Chinese characters
        :param alphabet: English letters
        :param digit: digits
        :param punctuation: punctuation marks
        :param currency: currency symbols
        """
        self.chinese_word = chinese_word
        self.alphabet = alphabet
        self.digit = digit
        self.punctuation = punctuation
        self.currency = currency

    @staticmethod
    def _read_chars(filename):
        """
        Read a UTF-8 text file located in this module's directory and return
        its content as one string: every line is stripped of surrounding
        whitespace and the lines are concatenated without a separator.

        :param filename: file name relative to the directory of this module
        :return: concatenated, stripped lines
        """
        path = os.path.join(os.path.dirname(__file__), filename)
        # The context manager closes the handle even if reading/decoding fails.
        with codecs.open(path, mode='r', encoding='utf-8') as f:
            lines = f.readlines()
        return ''.join(line.strip() for line in lines)

    @classmethod
    def get_digits(cls):
        """Return the ten decimal digits as a string."""
        return '0123456789'

    @classmethod
    def get_alphabet(cls):
        """Return the lowercase then uppercase ASCII letters as a string."""
        return 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    @classmethod
    def get_chinese_words(cls):
        """
        Return the Chinese characters listed in ``char_std_5990.txt``
        (stored next to this module) as a single string.

        An earlier revision read ``chinese_word.txt`` instead.
        """
        return cls._read_chars('char_std_5990.txt')

    @classmethod
    def get_punctuations(cls):
        """Return the supported punctuation marks as a string."""
        # The double quote inside the literal must be escaped; unescaped it
        # terminates the string early and the module fails to parse.
        # NOTE(review): the embedded ASCII quote/backtick/bar characters may
        # originally have been full-width forms - confirm against
        # all_words.txt.
        return "。,、;:?!…-·ˉˇ¨‘'“”~‖∶\"'`|〃〔〕〈〉《》「」『』..〖〗【】()[]{}"

    @classmethod
    def get_currency(cls):
        """Return the supported currency symbols as a string."""
        return '$¥'

    def get_all_words(self):
        """
        Return the full character string: a single leading space followed by
        the stripped, concatenated lines of ``all_words.txt`` (stored next to
        this module).

        A previous, now disabled, implementation built the string from the
        ``get_*`` methods according to the constructor flags; the flags are
        not used here.
        """
        return ' ' + self._read_chars('all_words.txt')
if __name__ == "__main__":
    # Manual smoke check of the character sets.
    w = Word()
    # Read the file once and reuse the result for every check below.
    all_words = w.get_all_words()
    # True when the full character string contains no duplicate characters.
    print(len(all_words) == len(set(all_words)))
    print(w.get_chinese_words())
    print(all_words)
    # True when the space character is part of the full character string.
    print(' ' in all_words)