-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphoc_label_generator.py
58 lines (50 loc) · 1.91 KB
/
phoc_label_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
'''Generate the PHOC (Pyramidal Histogram Of Characters) label of a word.
The input is a word given as a string; the output is its PHOC label,
a binary vector whose length is
((2 + 3 + 4 + 5) * languageCharactersAndNumbersCount) + (2 * commonBigramCount)
With 256 characters and 100 bigrams: ((2+3+4+5) * 256) + (2*100) = 3784
With 45 characters and 50 bigrams (the sizes used by generate_45 / generate_50): ((2+3+4+5) * 45) + (2*50) = 730
Reference: https://ieeexplore.ieee.org/document/6857995/?part=1
2 - 'فسيك' + 'فيكهم'
3 - 'فسي' + 'كفي' + 'كهم'
4 - 'فس' + 'يك'+ 'في' + 'كهم'
5 - 'ف' + 'س' + 'ي'+ 'ك'+ 'فيكهم'
'''
def generate_45(word):
    '''Return a 45-element binary character-presence vector for ``word``.

    Slot ``i`` is set to 1 when ``word`` contains the character whose code
    point is ``ord('ا') + i``, i.e. one of the 45 consecutive code points
    U+0627 .. U+0653 of the Arabic Unicode block
    (https://en.wikipedia.org/wiki/Arabic_script_in_Unicode).

    Characters outside that window (spaces, digits, Latin letters, code
    points below U+0627 or above U+0653) are ignored instead of raising
    ``IndexError`` or silently wrapping around to an unrelated slot.

    word: the string (or word segment) to encode; may be empty.
    Returns a list of 45 ints, each 0 or 1.
    '''
    size = 45
    base = ord('ا')
    vector = [0] * size
    for char in word:
        offset = ord(char) - base
        # A negative offset would be treated by Python as an index from the
        # end of the list and mark the wrong character, so both bounds are
        # checked explicitly.
        if 0 <= offset < size:
            vector[offset] = 1
    return vector
def generate_50(word):
    '''Return a 50-element binary vector of the common bigrams found in ``word``.

    Slot ``i`` is set to 1 when the i-th entry of the fixed list of 50
    frequent bigrams occurs in ``word`` as two adjacent characters. The
    vector marks presence only; repeated occurrences are not counted.

    word: the string (or word segment) to scan; may be empty or a single
        character, in which case the result is all zeros.
    Returns a list of 50 ints, each 0 or 1.
    '''
    bigram = ['لم', 'لل', 'ين', 'لت', 'لي', 'يت', 'لع', 'هم', 'لن', 'تم', 'في', 'عل',
              'لب', 'ست', 'بي', 'يم', 'مت', 'ته', 'لح', 'لق', 'ما', 'لف', 'من', 'ها',
              'له', 'كم', 'يس', 'مل', 'بت', 'لك', 'نا', 'لس', 'يب', 'بع', 'مس', 'سب',
              'يع', 'تح', 'يل', 'فت', 'فل', 'مع', 'تع', 'لا', 'تن', 'تب', 'يح', 'يه',
              'لج', 'فع']
    positions = {pair: index for index, pair in enumerate(bigram)}
    vector_50 = [0] * len(bigram)
    # Slide a two-character window over the word. Looking up single
    # characters can never match a two-character bigram.
    for start in range(len(word) - 1):
        index = positions.get(word[start:start + 2])
        if index is not None:
            vector_50[index] = 1
    return vector_50
def phoc_generate_label(word, levels=(2, 3, 4, 5)):
    '''Build the PHOC label of ``word`` as a flat list of 0/1 values.

    For every pyramid level ``n`` in ``levels`` the word is cut into ``n``
    consecutive segments of ``len(word) // n`` characters each (the last
    segment also takes any leftover characters), and the ``generate_45``
    vector of each segment is appended. Finally the ``generate_50`` bigram
    vectors of the first and second half of the word are appended.

    With the default levels the result has (2+3+4+5) * 45 + 2 * 50 = 730
    entries. The previous loop, ``range(2, 5)``, stopped at level 4 and so
    produced only 505 entries, omitting level 5.

    word: the word to encode. It is lower-cased first.
    levels: iterable of positive ints, the pyramid levels to encode.
    Returns a list of ints, each 0 or 1.
    '''
    word = word.lower()
    length = len(word)
    vector = []
    for split in levels:
        parts = length // split
        # The first split-1 segments have exactly `parts` characters.
        for mul in range(split - 1):
            vector += generate_45(word[mul * parts:(mul + 1) * parts])
        # The last segment absorbs the remainder of the integer division.
        vector += generate_45(word[(split - 1) * parts:])
    # Append the most common 50 bigrams for each half (level-2 split).
    half = length // 2
    vector += generate_50(word[:half])
    vector += generate_50(word[half:])
    return vector