-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphos_label_generator.py
79 lines (59 loc) · 2.41 KB
/
phos_label_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Library imports
import csv
import numpy as np
def get_number_of_columns(csv_file):
    """Return the number of shape columns in the alphabet CSV.

    Input: name of a CSV file whose rows are ``letter, count, count, ...``.
    Output: number of fields in the first non-blank row, minus one for the
    leading letter column.

    Raises:
        ValueError: if the file contains no non-blank row.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires; UTF-8-sig transparently drops a leading BOM.
    with open(csv_file, encoding='UTF-8-sig', newline='') as file:
        reader = csv.reader(file, delimiter=',', skipinitialspace=True)
        # csv.reader yields [] for a blank line; skipping those avoids
        # reporting -1 columns when the file starts with an empty line.
        first_row = next((row for row in reader if row), None)
    if first_row is None:
        # Previously this surfaced as a bare StopIteration from next().
        raise ValueError(f"{csv_file!r} contains no data rows")
    return len(first_row) - 1
def create_alphabet_dictionary(csv_file):
    """Map every letter in the alphabet CSV to its row index.

    Input: name of a CSV file whose rows are ``letter, count, count, ...``.
    Output: dict mapping the first field of each row (the letter) to the
    zero-based index of that row among the non-blank rows. The value is a
    row index, not the shape-count vector itself.

    If a letter appears on several rows, the last occurrence wins.
    """
    letter_to_row = {}
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires; UTF-8-sig transparently drops a leading BOM.
    with open(csv_file, encoding='UTF-8-sig', newline='') as file:
        reader = csv.reader(file, delimiter=',', skipinitialspace=True)
        # csv.reader yields [] for a blank line, which used to raise
        # IndexError on row[0]. Blank rows are skipped and do not consume an
        # index. NOTE(review): the module-level np.genfromtxt load appears to
        # skip blank lines as well, which would keep indices aligned - confirm.
        non_blank_rows = (row for row in reader if row)
        for index, row in enumerate(non_blank_rows):
            letter_to_row[row[0]] = index
    return letter_to_row
# Module-level lookup tables, built once at import time from the alphabet CSV.
alphabet_csv = "Arabic_alphabet.csv"
# Number of shape-count columns (fields of the first row minus the letter column).
csv_num_cols = get_number_of_columns(alphabet_csv)
# Letter -> row index within the CSV.
alphabet_dict = create_alphabet_dictionary(alphabet_csv)
# Shape-count matrix: parse the CSV as integers, then drop column 0 (the
# letter column) so only the counts remain.
numpy_csv = np.delete(
    np.genfromtxt(alphabet_csv, dtype=int, encoding="utf-8-sig", delimiter=","),
    0,
    1,
)
def word_vector(word):
    """Sum the shape-count rows of every letter in a word segment.

    Input: a word segment (string); may be empty.
    Output: np-array of length ``csv_num_cols`` holding the element-wise sum
    of the ``numpy_csv`` rows for each letter (all zeros for an empty string).

    Raises:
        KeyError: if a letter is not a key of ``alphabet_dict``.
    """
    totals = np.zeros(csv_num_cols)
    # Resolve each character to its CSV row, then accumulate that row.
    for row_index in (alphabet_dict[char] for char in word):
        totals += numpy_csv[row_index]
    return totals
def phos_generate_label(word):
    """Build the PHOS vector of a word.

    Input: a word (string).
    Output: 1-D np-array made of the shape-count vector of the whole word
    followed by the vectors of its segments when the word is cut into 2, 3
    and 4 parts (10 segment vectors in total, concatenated in that order).
    """
    length = len(word)
    segments = [word_vector(word)]
    for level in range(2, 5):
        step = length // level
        # The first level-1 segments each span exactly `step` letters.
        for k in range(level - 1):
            segments.append(word_vector(word[k * step:(k + 1) * step]))
        # The last segment runs to the end of the word, so it absorbs the
        # leftover letters when the length is not divisible by `level`.
        segments.append(word_vector(word[(level - 1) * step:length]))
    return np.concatenate(segments, axis=0)
def gen_label(word_list):
    """Compute PHOS vectors for a collection of words.

    Input: a list of words (strings).
    Output: dict mapping each word to its PHOS vector; a word that occurs
    more than once yields a single entry.
    """
    return {word: phos_generate_label(word) for word in word_list}
def label_maker(word_txt):
    """Compute PHOS vectors for the words listed in a text file.

    Input: name of a text file with one entry per line; only the first
    whitespace-separated token of each line is used as the word.
    Output: dict mapping each word to its PHOS vector.

    Blank lines are ignored.
    """
    label = {}
    # Read as UTF-8 (dropping a BOM if present), matching how the alphabet CSV
    # is decoded; the locale default encoding can mis-decode the words on
    # platforms whose default is not UTF-8.
    with open(word_txt, "r", encoding="utf-8-sig") as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # A blank or whitespace-only line used to raise IndexError.
                continue
            word = tokens[0]
            label[word] = phos_generate_label(word)
    return label
#write_s_file(s_matrix_csv, s_matrix, word_list)