-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinverse_index_lab.py
83 lines (65 loc) · 2.96 KB
/
inverse_index_lab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# version code d345910f07ae
# NOTE(review): presumably a flag read by the course's submission tooling;
# nothing in this file reads it -- confirm before changing or removing.
coursera = 1
# Please fill out this stencil and submit using the provided submission script.
## 1: (Task 1) Movie Review
## Task 1
def movie_review(name):
    """
    Input: the name of a movie (accepted but not consulted when choosing)
    Output: one of the canned review strings, picked uniformly at random
            with randint
    """
    from random import randint

    options = ("See it!", "A gem!", "Ideological claptrap!")
    # randint is inclusive on both ends, so the upper bound is the last index.
    pick = randint(0, len(options) - 1)
    return options[pick]
## 2: (Task 2) Make Inverse Index
def makeInverseIndex(strlist):
    """
    Input: a list of documents as strings
    Output: a dictionary that maps each word in any document to the set consisting of the
            document ids (ie, the index in the strlist) for all documents containing the word.

    A "word" is a maximal run of non-whitespace characters (the documents are
    split with str.split()).  An occurrence of a string (e.g. "use") as a whole
    word is indexed; an occurrence of the string as a substring of a longer
    word (e.g. "because") is not.

    Empty or whitespace-only documents contribute no entries.

    Example:
    >>> makeInverseIndex(['hello world','hello','hello cat','hellolot of cats']) == {'hello': {0, 1, 2}, 'cat': {2}, 'of': {3}, 'world': {0}, 'cats': {3}, 'hellolot': {3}}
    True
    """
    index = {}
    for doc_id, document in enumerate(strlist):
        for word in document.split():
            # setdefault creates the posting set on first sight of the word;
            # add() then updates it in place instead of rebuilding the set.
            index.setdefault(word, set()).add(doc_id)
    return index
## 3: (Task 3) Or Search
def orSearch(inverseIndex, query):
    """
    Input: an inverse index, as created by makeInverseIndex, and a list of words to query
    Output: the set of document ids that contain _any_ of the specified words

    Words that do not appear in the index are ignored.  If the query is empty,
    or none of its words are indexed, the result is the empty set.  The
    returned set is always a new object, so mutating it never alters the index.

    >>> idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
    >>> orSearch(idx, ['Bach','the'])
    {0, 2, 3, 4, 5}
    >>> orSearch(idx, ['Johann', 'Carl'])
    {0, 2, 3, 4, 5}
    >>> orSearch(idx, ['Mozart'])
    set()
    """
    postings = [inverseIndex[word] for word in query if word in inverseIndex]
    # Start from an empty set instance: the unbound form set.union(*postings)
    # raises TypeError when there are no posting sets to combine.
    return set().union(*postings)
## 4: (Task 4) And Search
def andSearch(inverseIndex, query):
    """
    Input: an inverse index, as created by makeInverseIndex, and a list of words to query
    Output: the set of all document ids that contain _all_ of the specified words

    If any query word is absent from the index, no document can contain every
    word, so the result is the empty set.  An empty query also yields the
    empty set (the index alone cannot enumerate "all documents").  The
    returned set is always a new object, so mutating it never alters the index.

    >>> idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
    >>> andSearch(idx, ['Johann', 'the'])
    {2, 3}
    >>> andSearch(idx, ['Johann', 'Bach'])
    {0, 4}
    >>> andSearch(idx, ['Johann', 'Mozart'])
    set()
    """
    matches = None
    for word in query:
        doc_ids = inverseIndex.get(word)
        if not doc_ids:
            # An unindexed word rules out every document; silently skipping
            # it would wrongly widen the result to the remaining words.
            return set()
        # Copy the first posting set so the caller never receives an alias
        # into the index; later words only narrow the running intersection.
        matches = set(doc_ids) if matches is None else matches & doc_ids
    return set() if matches is None else matches