pi.py
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import snowball
from collections import defaultdict
import json
##### stop words list
f = open("Stopword-List.txt", "r")
stop_words = []
for line in f:
    stop_words.append(line.strip())
f.close()
#### retrieve stories text as one string
stories = ""
for i in range(1, 51):
    ss = open("ShortStories/" + str(i) + ".txt", "r")
    for text in ss:
        stories = stories + text.strip() + " "
    ss.close()
#### form dictionary
# tokenize raw text (stories) - remove punctuation
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(stories)
# case fold to lowercase
lower_words = [w.lower() for w in tokens]
# remove stop words
words = set(wrd for wrd in lower_words if wrd not in stop_words)
dictionary = list(words)
print(dictionary)
## all file data at each index: files[docid] holds the token list of document docid
files = [[]]  # dummy entry at index 0 so document ids start at 1
for j in range(1, 51):
    text = ""
    f = open("ShortStories/" + str(j) + ".txt", "r")
    for line in f:
        text = text + line.strip() + " "
    f.close()
    docid_tokens = tokenizer.tokenize(text)
    # case fold to lowercase
    docid_words = [w.lower() for w in docid_tokens]
    print(docid_words)
    files.append(docid_words)
##### form positional index
p_index = defaultdict(dict)
for word in dictionary:
    docs = 0
    temp = {}
    for docid in range(1, 51):
        positions = [i for i, x in enumerate(files[docid]) if x == word]
        if len(positions) > 0:
            temp[docid] = [len(positions), positions]
            docs += 1
    p_index[word] = [docs, temp]
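# Each entry now has the shape:
#   p_index[word] = [document_frequency, {docid: [term_frequency, [position, ...]]}]
# where positions are 0-based token offsets within the document.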
with open('positional_index.txt', 'w') as file2:
    file2.write(json.dumps(p_index))  # use `json.loads` to do the reverse
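##### read the index back (sketch)
# A minimal sketch of the reverse step noted above: reload the saved index with
# `json.loads`. Note that JSON serialisation turns the integer docids into
# string keys, so a reloaded posting dict is keyed by "1".."50" rather than
# 1..50. The lookup term "story" below is only an assumed sample word and may
# not appear in the collection.
with open('positional_index.txt', 'r') as file3:
    loaded_index = json.loads(file3.read())
if "story" in loaded_index:
    doc_freq, postings = loaded_index["story"]
    print("document frequency:", doc_freq)
    print("postings (docid -> [term frequency, positions]):", postings)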