search.py
import re
import nltk
import sys
import getopt
import json
import heapq
import time
import math
from itertools import groupby, chain, islice
show_time = False
k = 10 # number of results to return
def usage():
"""Prints the proper format for calling this script."""
print "usage: " + sys.argv[0] + " -d dictionary-file -p postings-file -q file-of-queries -o output-file-of-results"
def load_args():
"""Attempts to parse command line arguments fed into the script when it was called.
Notifies the user of the correct format if parsing failed.
"""
dictionary_file = postings_file = queries_file = output_file = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'd:p:q:o:')
    except getopt.GetoptError:
usage()
sys.exit(2)
for o, a in opts:
if o == '-d':
dictionary_file = a
elif o == '-p':
postings_file = a
elif o == '-q':
queries_file = a
elif o == '-o':
output_file = a
else:
assert False, "unhandled option"
if dictionary_file is None or postings_file is None or queries_file is None or output_file is None:
usage()
sys.exit(2)
return dictionary_file, postings_file, queries_file, output_file
def process_queries(dictionary_file, postings_file, queries_file, output_file):
    """Processes every query in the queries file and writes one line of ranked docIDs per query to the output file."""
    begin = time.time() * 1000.0
    # load the document lengths and the term dictionary
    with open(dictionary_file) as dict_file:
        temp = json.load(dict_file)
    doc_length = temp[0]
    dictionary = temp[1]
    # open the postings file for random access and the output file for writing
    postings = open(postings_file)
    output = open(output_file, 'w')
    with open(queries_file) as queries:
        for query in queries:
            # each line of the queries file is one free-text query
            query_terms = normalize(query)
            single_term_query = len(query_terms) == 1
            doc_scores = {}
            # accumulate each query term's contribution to every matching document's score
            for term in query_terms:
                doc_scores = update_relevance(doc_scores, dictionary, postings, query_terms, term, single_term_query)
            # normalise each document's score by its length
            for key in doc_scores:
                doc_scores[key] /= doc_length[str(key)]
            results = first_k_most_relevant(doc_scores)
            output.write(" ".join(results))
            output.write("\n")
    postings.close()
    output.close()
    after = time.time() * 1000.0
    if show_time:
        print(after - begin)
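
# A sketch of the dictionary-file layout that process_queries assumes (the values
# below are invented for illustration): a JSON array whose first element maps each
# docID (as a string) to that document's length, and whose second element maps each
# stemmed term to [postings pointer, postings length in bytes, idf], e.g.
#
#   [
#     {"246391": 12.48, "246392": 7.03},
#     {"invest": [0, 38, 1.52], "market": [38, 44, 0.97]}
#   ]
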
def normalize(query):
""" Tokenizes and stems search query string
:param query: Search query string in plain English
:return: List of query terms that have been stemmed with the Porter stemmer
"""
query_tokens = nltk.word_tokenize(query)
stemmer = nltk.stem.PorterStemmer()
    query_terms = [stemmer.stem(word.lower()) for word in query_tokens]
return query_terms
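
# For example (roughly -- the exact output depends on the NLTK tokenizer and
# Porter stemmer versions installed), a query line such as
#   "Stemming running queries"
# would be tokenized, lowercased and stemmed into query terms along the lines of
#   ['stem', 'run', 'queri']
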
def update_relevance(doc_scores, dictionary, postings_file, query_terms, term, single_term_query):
""" Given a query token, updates the score of all documents with matching docIDs in the term's postings list
:param doc_scores: Dictionary containing current scores for all documents
    :param dictionary: Dictionary mapping each search token to a (pointer, length, idf) tuple.
                       The pointer is the byte offset at which the token's postings list starts in the postings file.
                       The length is the size of the token's postings list in bytes.
                       The idf is the pre-calculated idf weight for the given term.
:param postings_file: File object referencing the file containing the complete set of postings lists.
:param query_terms: List of query terms.
:param term: Current query term being processed
:param single_term_query: Boolean value indicating if query only contains a single term
:return: Dictionary containing updated scores for all documents
"""
    postings = read_postings(term, dictionary, postings_file)
    if not postings:
        # the term appears in no document, so no scores change
        return doc_scores
    # these factors depend only on the query term, so compute them once outside the loop
    tf_in_query = query_terms.count(term)
    term_idf = dictionary[term][2]
    weight_of_term_in_query = 1 if single_term_query else (1 + math.log10(tf_in_query)) * term_idf
    for docID, weight_of_term_in_doc in postings:
        # accumulate this term's contribution to the document's score
        if docID not in doc_scores:
            doc_scores[docID] = 0
        doc_scores[docID] += weight_of_term_in_doc * weight_of_term_in_query
    return doc_scores
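
# A small worked example of the accumulation above (numbers are invented):
# suppose term "invest" has idf 1.5, appears once in the query (tf_in_query = 1),
# and its posting for document 246391 carries a document-side weight of 2.0.
# For a multi-term query the contribution added to doc_scores[246391] is
#   2.0 * (1 + log10(1)) * 1.5 = 2.0 * 1.0 * 1.5 = 3.0,
# and process_queries later divides the accumulated score by the document's length.
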
def read_postings(term, dictionary, postings_file):
""" Gets own postings list from file and stores it in its attribute. For search token nodes only.
:param term: Current query term being processed
:param postings_file: File object referencing the file containing the complete set of postings lists.
    :param dictionary: Dictionary mapping each search token to a (pointer, length, idf) tuple.
                       The pointer is the byte offset at which the token's postings list starts in the postings file.
                       The length is the size of the token's postings list in bytes.
                       The idf is the pre-calculated idf weight for the given term.
:return Postings list of given term.
"""
if term in dictionary:
term_pointer = dictionary[term][0]
postings_length = dictionary[term][1]
postings_file.seek(term_pointer)
        postings = postings_file.read(postings_length).split()
        # each entry has the form "docID,tf"; parse it into [int docID, float tf]
        postings = [entry.split(",") for entry in postings]
        postings = [[int(docID), float(tf)] for docID, tf in postings]
return postings
else:
return []
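
# For illustration (invented numbers), a stored postings string such as
#   "246391,2.0 246463,1.3 247433,3.7"
# is parsed by read_postings into
#   [[246391, 2.0], [246463, 1.3], [247433, 3.7]]
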
def first_k_most_relevant(doc_scores):
"""If there are more than k documents containing terms in a query, return the k documents with the highest scores, tiebroken by least docID first.
If there are less than k documents, return them, sorted by highest scores, and tiebroken by least docID first. O(n) + O(k lg n)
:param doc_scores: A dictionary of docID to its corresponding document's score.
:return: List of k docIDs of string type, which are the most relevant (i.e. have the highest scores)
"""
    scores = [(-score, docID) for docID, score in doc_scores.items()]  # negate the scores so that heappop returns the highest-scoring document first
heapq.heapify(scores)
most_relevant_docs = []
for _ in range(k):
if not scores:
break
most_relevant_docs.append(heapq.heappop(scores))
if not most_relevant_docs:
return most_relevant_docs
    # also pull in any documents tied with the kth score, so ties can be broken by docID below
kth_score, kth_docID = most_relevant_docs[-1]
while scores:
next_score, next_docID = heapq.heappop(scores)
if next_score == kth_score:
most_relevant_docs.append((next_score, next_docID))
else:
break
return sort_relevant_docs(most_relevant_docs)
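
# To illustrate the tie handling (scores invented): if k were 2 and
# doc_scores = {3: 0.9, 7: 0.9, 11: 0.9, 5: 0.2}, the loop pops (-0.9, 3) and
# (-0.9, 7), the while-loop then also pulls in the tied (-0.9, 11), and
# sort_relevant_docs orders the three tied docIDs and trims the list back to
# k results, returning ['3', '7'].
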
def sort_relevant_docs(most_relevant_docs):
"""Given a list of tuples of documents in the format of (score, docID), sort them primarily by decreasing score, and tiebreak by increasing docID,
and then return up to the first k elements in the list.
:param most_relevant_docs: A list of tuples of documents and their scores, where each tuple contains (score, docID).
:return: List of docIDs of string type, which have been tiebroken through sorting with their docIDs
"""
grouped_relevant = groupby(most_relevant_docs, key=lambda score_doc_entry: score_doc_entry[0])
sorted_relevant = [sorted(equal_score_entries[1], key=lambda equal_score_entry: equal_score_entry[1]) for equal_score_entries in grouped_relevant]
flattened_relevant = chain.from_iterable(sorted_relevant)
trimmed_relevant = islice(flattened_relevant, k) # Takes first k elements from the iterable. If there are less than k elements, it stops when the iterable stops
    relevant_docIDs = [str(docID) for score, docID in trimmed_relevant]  # extract each docID and convert it to a string for output
    return relevant_docIDs
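
# For example (invented values), sort_relevant_docs([(-0.9, 7), (-0.9, 3), (-0.5, 11)])
# groups the two entries tied at -0.9, orders them by docID, and returns
# ['3', '7', '11'] (trimmed to the first k entries if there were more than k).
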
def main():
dictionary_file, postings_file, queries_file, output_file = load_args()
process_queries(dictionary_file, postings_file, queries_file, output_file)
if __name__ == "__main__":
main()