forked from OpenITI/barzakh
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_folder_for_disallowed_characters.py
172 lines (162 loc) · 4.17 KB
/
check_folder_for_disallowed_characters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from openiti.helper.ara import *
import unicodedata
import os
import re
# Table of characters that are considered valid in a text file.
# Each line holds one character, whitespace, and the character's Unicode name.
# Only the first whitespace-delimited field (the character itself) is used;
# the name is there for human readers.
_ALLOWED_CHARS_TABLE = """\
ء ARABIC LETTER HAMZA
آ ARABIC LETTER ALEF WITH MADDA ABOVE
أ ARABIC LETTER ALEF WITH HAMZA ABOVE
ؤ ARABIC LETTER WAW WITH HAMZA ABOVE
إ ARABIC LETTER ALEF WITH HAMZA BELOW
ئ ARABIC LETTER YEH WITH HAMZA ABOVE
ا ARABIC LETTER ALEF
ب ARABIC LETTER BEH
ة ARABIC LETTER TEH MARBUTA
ت ARABIC LETTER TEH
ث ARABIC LETTER THEH
ج ARABIC LETTER JEEM
ح ARABIC LETTER HAH
خ ARABIC LETTER KHAH
د ARABIC LETTER DAL
ذ ARABIC LETTER THAL
ر ARABIC LETTER REH
ز ARABIC LETTER ZAIN
س ARABIC LETTER SEEN
ش ARABIC LETTER SHEEN
ص ARABIC LETTER SAD
ض ARABIC LETTER DAD
ط ARABIC LETTER TAH
ظ ARABIC LETTER ZAH
ع ARABIC LETTER AIN
غ ARABIC LETTER GHAIN
ف ARABIC LETTER FEH
ق ARABIC LETTER QAF
ك ARABIC LETTER KAF
ل ARABIC LETTER LAM
م ARABIC LETTER MEEM
ن ARABIC LETTER NOON
ه ARABIC LETTER HEH
و ARABIC LETTER WAW
ى ARABIC LETTER ALEF MAKSURA
ي ARABIC LETTER YEH
٠ ARABIC-INDIC DIGIT ZERO
١ ARABIC-INDIC DIGIT ONE
٢ ARABIC-INDIC DIGIT TWO
٣ ARABIC-INDIC DIGIT THREE
٤ ARABIC-INDIC DIGIT FOUR
٥ ARABIC-INDIC DIGIT FIVE
٦ ARABIC-INDIC DIGIT SIX
٧ ARABIC-INDIC DIGIT SEVEN
٨ ARABIC-INDIC DIGIT EIGHT
٩ ARABIC-INDIC DIGIT NINE
# NUMBER SIGN
% PERCENT SIGN
( LEFT PARENTHESIS
) RIGHT PARENTHESIS
. FULL STOP
/ SOLIDUS
0 DIGIT ZERO
1 DIGIT ONE
2 DIGIT TWO
3 DIGIT THREE
4 DIGIT FOUR
5 DIGIT FIVE
6 DIGIT SIX
7 DIGIT SEVEN
8 DIGIT EIGHT
9 DIGIT NINE
: COLON
| VERTICAL LINE
~ TILDE
؟ ARABIC QUESTION MARK
، ARABIC COMMA
! EXCLAMATION MARK
$ DOLLAR SIGN
* ASTERISK
- HYPHEN-MINUS
« LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
» RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
؛ ARABIC SEMICOLON
! EXCLAMATION MARK
" QUOTATION MARK
, COMMA
= EQUALS SIGN
? QUESTION MARK
“ LEFT DOUBLE QUOTATION MARK
” RIGHT DOUBLE QUOTATION MARK
¶ PILCROW SIGN
¬ NOT SIGN
• BULLET
< LESS-THAN SIGN
> GREATER-THAN SIGN
{ LEFT CURLY BRACKET
} RIGHT CURLY BRACKET
ְ HEBREW POINT SHEVA
ֳ HEBREW POINT HATAF QAMATS
ִ HEBREW POINT HIRIQ
ֵ HEBREW POINT TSERE
ֶ HEBREW POINT SEGOL
ַ HEBREW POINT PATAH
ָ HEBREW POINT QAMATS
ֹ HEBREW POINT HOLAM
ּ HEBREW POINT DAGESH OR MAPIQ
א HEBREW LETTER ALEF
ב HEBREW LETTER BET
ג HEBREW LETTER GIMEL
ד HEBREW LETTER DALET
ה HEBREW LETTER HE
ו HEBREW LETTER VAV
ז HEBREW LETTER ZAYIN
ח HEBREW LETTER HET
ט HEBREW LETTER TET
י HEBREW LETTER YOD
ך HEBREW LETTER FINAL KAF
כ HEBREW LETTER KAF
ל HEBREW LETTER LAMED
ם HEBREW LETTER FINAL MEM
מ HEBREW LETTER MEM
ן HEBREW LETTER FINAL NUN
נ HEBREW LETTER NUN
ס HEBREW LETTER SAMEKH
ע HEBREW LETTER AYIN
ף HEBREW LETTER FINAL PE
פ HEBREW LETTER PE
ץ HEBREW LETTER FINAL TSADI
צ HEBREW LETTER TSADI
ק HEBREW LETTER QOF
ר HEBREW LETTER RESH
ש HEBREW LETTER SHIN
ת HEBREW LETTER TAV
"""

# Take the first whitespace-delimited field of every non-blank line. Splitting
# on any whitespace (rather than on a literal tab) keeps the Unicode names out
# of the character class even if the separator is a space.
_allowed_char_list = [
    line.split(maxsplit=1)[0]
    for line in _ALLOWED_CHARS_TABLE.splitlines()
    if line.strip()
]

# Compiled pattern matching runs of allowed characters. Every entry goes
# through re.escape so that characters with a special meaning inside a
# character class (e.g. "-") are always matched literally; dict.fromkeys
# drops duplicate table entries while keeping the table order.
allowed_chars = re.compile(
    "[{}]+".format(
        "".join(re.escape(c) for c in dict.fromkeys(_allowed_char_list))
    )
)
# Characters tolerated in addition to ``allowed_chars``: ASCII digits and
# letters, Latin transliteration letters, quotes, space, newline, tab and
# square brackets. Written as a raw string so that ``\[`` and ``\]`` reach the
# regex engine without triggering Python's invalid-escape-sequence warning.
_EXTRA_ALLOWED_CHARS = re.compile(
    r"[0-9a-zA-ZāĀēĒṭṬṯṮūŪīĪİıōŌṣṢšŠḍḌḏḎǧǦġĠḫḪḳḲẓẒčČñʿʾ' \"\n\t\[\]]+"
)


def get_all_non_allowed_chars_in_file(fp, print_output=False):
    """Return the distinct characters of a file that are not allowed.

    The file is read as UTF-8, passed through ``denoise`` and
    ``normalize_composites`` (presumably provided by the star import from
    ``openiti.helper.ara`` -- confirm), and reduced to its set of distinct
    characters. Every character matched by the module-level ``allowed_chars``
    pattern or by ``_EXTRA_ALLOWED_CHARS`` is then removed.

    Args:
        fp: path to the text file to check.
        print_output: if true, also print the resulting string.

    Returns:
        A string containing each remaining (non-allowed) character once,
        in sorted order.

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(fp, mode="r", encoding="utf-8") as file:
        text = file.read()
    text = normalize_composites(denoise(text))
    # Sorting makes the result independent of set iteration order, which
    # varies between interpreter runs for strings.
    distinct_chars = "".join(sorted(set(text)))
    filtered_chars = re.sub(allowed_chars, "", distinct_chars)
    filtered_chars = _EXTRA_ALLOWED_CHARS.sub("", filtered_chars)
    if print_output:
        print(filtered_chars)
    return filtered_chars
def get_all_non_allowed_chars_in_folder(folder):
    """Print every non-allowed character found in the files of a folder.

    Only regular files directly inside ``folder`` are checked (no recursion);
    files ending in .py, .yml, .docx or .md are ignored. For each checked
    file its name and the running count of distinct non-allowed characters
    are printed. Afterwards each character is printed with its Unicode name;
    characters for which ``unicodedata`` has no name are listed separately
    under "NOT FOUND:".

    Args:
        folder: path to the folder whose files should be checked.
    """
    skipped_suffixes = (".py", ".yml", ".docx", ".md")
    all_chars = set()
    for fn in os.listdir(folder):
        fp = os.path.join(folder, fn)
        if not os.path.isfile(fp) or fn.endswith(skipped_suffixes):
            continue
        print(fn)
        try:
            all_chars.update(get_all_non_allowed_chars_in_file(fp))
        except UnicodeDecodeError as err:
            # A single non-UTF-8 file (e.g. a binary) should not abort the
            # scan of the whole folder; report it and carry on.
            print("SKIPPED (not valid UTF-8):", fn, err)
            continue
        print(len(all_chars))

    # print the non-allowed characters in the folder:
    not_found = []
    for c in sorted(all_chars):
        # unicodedata.name returns the default instead of raising ValueError
        # when the character has no name.
        name = unicodedata.name(c, None)
        if name is None:
            not_found.append(c)
        else:
            print(c, "\t", name)
    if not_found:
        print("NOT FOUND:")
        for c in not_found:
            print(c)
if __name__ == "__main__":
    # Scan the current working directory only when executed as a script, so
    # that importing this module does not trigger a folder scan.
    get_all_non_allowed_chars_in_folder(".")