-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnlp.py
102 lines (81 loc) · 2.52 KB
/
nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
def getWordsList(filename):
    """Tokenize a text file into cleaned, stop-word-free words, line by line.

    Stop words are read from ``StopWords.txt`` in the current working
    directory (one word per line).  Each line of ``filename`` is lower-cased
    and split on whitespace; every token is then processed as follows:

    * a token containing any ASCII digit is dropped entirely;
    * every character other than ``a``-``z`` is stripped from the token
      (so pure punctuation tokens vanish and ``"hello,"`` becomes
      ``"hello"``);
    * a token that ends up empty, or that equals a stop word, is dropped.

    Args:
        filename: Path of the text file to process.

    Returns:
        A list with one entry per line of the file.  Each entry is the list
        of surviving words of that line, in their original order; it is an
        empty list when nothing on the line survives.

    Raises:
        OSError: If ``StopWords.txt`` or ``filename`` cannot be opened.
    """
    # strip() instead of chopping the last character: the final stop word
    # must not be truncated when the file has no trailing newline.
    # lower() because the tokens it is compared against are lower-cased.
    # A set gives O(1) membership tests in the per-token loop below.
    with open("StopWords.txt", 'r') as stop_file:
        stop_words = {entry.strip().lower() for entry in stop_file}

    cleaned_lines = []
    with open(filename, 'r') as text_file:
        for line in text_file:
            words = []
            # split() with no argument splits on any whitespace run, so
            # tab-separated words are not glued together and the trailing
            # newline never sticks to the last token.
            for token in line.lower().split():
                # Alphanumeric tokens ("42", "abc123") are discarded whole.
                if any('0' <= ch <= '9' for ch in token):
                    continue
                # Keep only the letters a-z; punctuation inside or around
                # the word is removed.
                word = "".join(ch for ch in token if 'a' <= ch <= 'z')
                if word and word not in stop_words:
                    words.append(word)
            # One entry per input line, even when the line yields no words.
            cleaned_lines.append(words)
    return cleaned_lines
def printWord(my_list):
    """Print the 100 most frequent words with their counts and probabilities.

    Args:
        my_list: A list of word lists (for example one list of words per
            line of text).  Every word in every inner list is counted.

    Returns:
        None.  Output goes to standard output: two header lines, then one
        row per word (most frequent first, at most 100 rows) showing the
        word, its count, and ``count / total number of words``.  With no
        words at all, only the header lines are printed.
    """
    from collections import Counter

    # Count the words directly; no placeholder entry is seeded, so no fake
    # "Word 0" row is printed and empty input cannot trigger a 0/0 division.
    counts = Counter(word for words in my_list for word in words)
    total_count = sum(counts.values())

    print("Now printing the 100 Top values!\n")
    print("Word: Count: Probability: ")
    # most_common() orders by descending count; ties keep first-seen order.
    for word, count in counts.most_common(100):
        print(word, " ", count, " ", count/total_count, "\n")
# Driver: collect the cleaned word lists of data1.txt .. data4.txt into one
# list and print the most frequent words across all four files.
l1 = []
for i in range(1, 5):
    l1.extend(getWordsList("data" + str(i) + ".txt"))
printWord(l1)