main.py
"""N-gram Language Model.
Usage:
main.py train [--n <n>] [--path <path>]
main.py generate [--lines <n>]
main.py perplexity [--path <path>]
main.py common [--number <n>]
main.py --help
Options:
--n <n> Number of n-gram
--path <path> Train/Test file path
--lines <n> No. of lines to be generated
--number <n> No. of n-gram to show
--help Show this screen
"""
'''
@Author: Touhidul Alam
'''
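# Example invocations (the file paths below are illustrative, not shipped with the repo):
#   python main.py train --n 3 --path shakespeare_train.txt
#   python main.py generate --lines 10
#   python main.py perplexity --path shakespeare_test.txt
#   python main.py common --number 5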
from lm import LanguageModel
from corpus import *
import pickle
from docopt import docopt
import click
def readFile(filename):
    """
    Read a file and return its contents tokenized line by line.

    Parameters
    ----------
    filename: path
        File location

    Returns
    -------
    list
        Tokenized data in a nested list form (one list of tokens per line)
    """
    data = []
    # tokenize() is provided by the corpus module
    with open(filename) as file:
        for line in file:
            data.append(tokenize(line))
    return data
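# Illustrative call (assuming a hypothetical input file and a word-level tokenize()):
#   readFile('tiny.txt') -> [['first', 'citizen'], ['speak', ',', 'speak', '.']]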
def loadPickle():
    """
    Load the model saved during training and return the stored object.
    TODO: The pickle filename is hard-coded for now.

    Returns
    -------
    LanguageModel
        Language model object saved from training
    """
    with open('trained_model_ngram.pkl', 'rb') as f:
        lm = pickle.load(f)
    return lm
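# Note: the filename above is the same one written by the `train` command in main() below.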
def main(args):
    """
    Main function of the program; operates based on the arguments provided.

    Train
        - Ask for the n-gram order
        - Ask for the training file path
        - Train the language model
        - Save the trained model
    Generate
        - Load the saved model from the pickle file
        - Ask whether to use beam search (y/n)
        - Ask for the beam size
        - Print one generated sentence in the terminal
        - Ask for the number of sentences to be written to file
        - Save that many sentences to a file (default: new_shakespeare.txt)
    Perplexity
        - Load the pickle file
        - Ask for the test set file path
        - Print the perplexity value
    Common
        - Load the pickle file
        - Ask for the number of most common n-grams
        - Print the most common n-grams with their occurrence counts
    """
    if args['train']:
        if not args['--n']:
            ngram = input("Please enter n for n-gram (Default: 3)-\n")
            if not ngram:
                ngram = 3
        else:
            ngram = args['--n']
        lm = LanguageModel(int(ngram))
        if not args['--path']:
            path = input("Please enter path of the file-\n")
        else:
            path = args['--path']
        lm.train(readFile(path))
        print("N-gram training completed")
        print("Saving the model")
        with open('trained_model_ngram.pkl', 'wb') as f:
            pickle.dump(lm, f)
        print("Model saved")
    if args['generate']:
        lm = loadPickle()
        if click.confirm('Do you want to generate with Beam search?', default=True):
            lm.beam_flag = True
            beam_size = input("Enter beam size (Default: 20)-\n")
            if beam_size:
                lm.beam_width = int(beam_size)
        else:
            lm.beam_flag = False
        print("Generating one sentence in terminal...")
        print(detokenize(lm.generate()))
        if not args['--lines']:
            noOfText = input("Enter number of generated sentences you want to save (Default: 10)-\n")
            if not noOfText:
                noOfText = 10
        else:
            noOfText = args['--lines']
        generated = []
        for g in range(0, int(noOfText)):
            generated.append(detokenize(lm.generate()))
        with open('new_shakespeare.txt', 'w') as f:
            for g in generated:
                f.write("%s\n" % g)
        print("Sentence file generated in current folder")
    if args['perplexity']:
        lm = loadPickle()
        if not args['--path']:
            path = input("Please enter path of the test file-\n")
        else:
            path = args['--path']
        print("Perplexity for {}-gram is {}".format(lm.ngram, lm.perplexity(readFile(path))))
    if args['common']:
        lm = loadPickle()
        if args['--number']:
            number = args['--number']
        else:
            number = 5
        lm.count_common_ngram(int(number))
if __name__ == '__main__':
    argument = docopt(__doc__)
    main(argument)