ii.py
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem import snowball
import json
##### stop words list
f = open("Stopword-List.txt", "r")
stop_words = []
for line in f:
    stop_words.append(line.strip())
f.close()
# retrieve stories text AS ONE STRING
stories = ""
for i in range(1, 51):
    ss = open("ShortStories/" + str(i) + ".txt", "r")
    for text in ss:
        stories = stories + text.strip() + " "
    ss.close()
#### form dictionary
# tokenize raw text (stories) - remove punctuation
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(stories)
# case fold to lowercase
lower_words = [w.lower() for w in tokens]
# remove stop words
words = [wrd for wrd in lower_words if wrd not in stop_words]
# stem and order alphabetically (sorted() already returns a list)
stemmer = snowball.SnowballStemmer('english')
dictionary = sorted(set(stemmer.stem(x) for x in words))
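# Illustrative check (assumption, not part of the original script): the
# Snowball stemmer conflates morphological variants, which is what lets the
# dictionary stay compact, e.g. "stories" and "story" should share a stem.
# print(stemmer.stem("stories"), stemmer.stem("story"))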
## stemmed vocabulary of each document: files[docid] holds the terms of
## story `docid` (index 0 is an empty placeholder so docids start at 1)
files = [[]]
for j in range(1, 51):
    text = ""
    f = open("ShortStories/" + str(j) + ".txt", "r")
    for line in f:
        text = text + line.strip() + " "
    f.close()
    # tokenize
    docid_tokens = tokenizer.tokenize(text)
    # case fold to lowercase
    docid_words = [w.lower() for w in docid_tokens]
    # remove stopwords
    wo_stopwords = [wrd for wrd in docid_words if wrd not in stop_words]
    # stem and order alphabetically
    docid_stem = sorted(set(stemmer.stem(x) for x in wo_stopwords))
    files.append(docid_stem)
#### inverted index
i_index = defaultdict(list)
for word in dictionary:
    templist = []
    for docid in range(1, 51):
        if word in files[docid]:
            templist.append(docid)
    df = len(templist)
    # value = [document frequency, postings list of docids] for 'word'
    i_index[word] = [df, templist]

# write the index as JSON; the `with` block closes the file automatically
with open('inverted_index.txt', 'w') as file1:
    file1.write(json.dumps(i_index))  # use `json.loads` to do the reverse
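
# --- Usage sketch (illustrative, not part of the original pipeline) ---
# A minimal example of reading the saved index back with `json.loads`, as the
# comment above suggests, and intersecting postings lists for a simple Boolean
# AND query. The query terms below are placeholders; they are stemmed with the
# same SnowballStemmer before lookup so they match the dictionary's keys.
with open('inverted_index.txt', 'r') as file2:
    loaded_index = json.loads(file2.read())

def boolean_and(index, terms):
    # stem each query term the same way the index was built
    stemmed = [stemmer.stem(t.lower()) for t in terms]
    # each stored value is [document frequency, [docids]]
    postings = [set(index[t][1]) for t in stemmed if t in index]
    if len(postings) < len(stemmed):
        return []  # some term never occurs in the collection
    return sorted(set.intersection(*postings))

# example query with placeholder terms
print(boolean_and(loaded_index, ["old", "man"]))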