# textretrieval.py
# Place your imports here
import numpy as np
import nltk
import pandas as pd
from nltk.corpus import stopwords
import math
class TextRetrieval:
    # For preprocessing
    punctuations = ""
    stop_words = set()
    # For VSM definition
    vocab = np.zeros(200)
    dataset = None
    def __init__(self):
        ##
        # TODO: obtain the file "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv"
        # and store it locally in a location accessible directly by this script (e.g., the same directory; don't use absolute paths)
        ### TODO: Initialize punctuations (a string) and stop_words (a set)
        # Define self.punctuations to be any '"\,<>./?@#$%^&*_~/!()-[]{};:
        # Define self.stop_words from stopwords.words('english')
        nltk.download('stopwords')
        self.punctuations = "\"\'\\,<>./?@#$%^&*_~/!()-[]{};:"
        self.stop_words = set(stopwords.words('english'))
    def read_and_preprocess_Data_File(self):
        ### Reads the test.csv file and iterates over every document content (entry in column 2):
        ### removes leading and trailing spaces, transforms to lower case, removes punctuation, and removes stopwords
        ### Stores the formatted information in the same "dataset" object
        dataset = pd.read_csv(
            "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv", header=None)
        punctuations = self.punctuations
        stop_words = self.stop_words
        # dataset.head()
        for index, row in dataset.iterrows():
            line = row[2]
            # TODO: Implement removing stopwords and punctuation
            # 1. transform words to lower-case, remove leading and trailing spaces
            line = line.strip().lower()
            # 2. remove punctuation and numbers
            line = ''.join([char if char not in punctuations and not char.isdigit() else ' ' for char in line])
            # 3. split words and remove stop words
            words = line.split()
            words = [word for word in words if word not in stop_words]
            # 4. join words
            processed_line = ' '.join(words)
            # 5. save processed words in dataset
            dataset.at[index, 2] = processed_line
        self.dataset = dataset  # Set dataset as object attribute
    #### Bit Vector with Dot Product
    def build_vocabulary(self):  # ,collection):
        ### Returns an array of the 200 most frequent words in the collection
        ### dataset has to be read before calling the vocabulary construction
        # TODO: Create a vocabulary. Assume self.dataset has been preprocessed. Count the occurrence of the words in the dataset. Select the 200 most common words as your vocabulary vocab.
        word_count = {}
        # record the frequency of each word
        for index, row in self.dataset.iterrows():
            words = row[2].split()
            for word in words:
                if word in word_count:
                    word_count[word] += 1
                else:
                    word_count[word] = 1
        # Sort by frequency and select the top 200 most common words
        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
        vocab = [word for word, count in sorted_words[:200]]
        # If the number of unique words is less than 200, pad the vocabulary
        if len(vocab) < 200:
            vocab += ['<PAD>'] * (200 - len(vocab))
        # Ensure self.vocab is a NumPy array
        self.vocab = np.array(vocab)
    def text2BitVector(self, text):
        ### returns the bit vector representation of the text
        # TODO: Use self.vocab (assume self.vocab is created already) to transform the content of text into a bitVector
        # Use the order in the vocabulary to match the order in the bitVector
        text_words = set(text.split())
        bitVector = np.array([1 if word in text_words else 0 for word in self.vocab])
        return bitVector
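    # Example (hypothetical vocabulary, for illustration only):
    # with self.vocab = np.array(["gold", "olympic", "stocks"]),
    # text2BitVector("olympic gold medal") -> array([1, 1, 0])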
    def bit_vector_score(self, query, doc):
        ### query and doc are the space-separated lists of words in the query and document
        q = self.text2BitVector(query)
        d = self.text2BitVector(doc)
        # TODO: compute the relevance using q and d
        relevance = np.dot(q, d)
        return relevance
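    # The dot product of two bit vectors counts the vocabulary words shared by the
    # query and the document, so the relevance score is simply that overlap.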
    def adapt_vocab_query(self, query):
        ### Updates the vocabulary to add the words in the query
        # TODO: Use self.vocab and check whether the words in query are included in the vocabulary
        # If a word is not present, add it to the vocabulary (new size of vocabulary = original + # of words not in the vocabulary)
        # you can use a local variable vocab to work your changes and then update self.vocab
        query_words = set(query.split())
        missing_words = [word for word in query_words if word not in self.vocab]
        # Convert missing_words to a NumPy array and append to self.vocab
        if missing_words:
            self.vocab = np.append(self.vocab, missing_words)
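    # Note: growing the vocabulary changes the vector length, so any previously
    # computed self.IDF becomes stale; execute_search_TF_IDF recomputes it after
    # calling this method.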
    def execute_search_BitVec(self, query):
        ### executes the computation of the relevance score for each document
        ### but first it verifies the query words are in the vocabulary
        # e.g.: query = "olympic gold athens"
        self.adapt_vocab_query(query)  # Ensure query is part of the "common language" of documents and query
        relevances = np.zeros(self.dataset.shape[0])  # Initialize relevances of all documents to 0
        # TODO: Use self.vocab to compute the relevance/ranking score of each document in the dataset using bit_vector_score
        for index, row in self.dataset.iterrows():
            doc = row[2]
            relevances[index] = self.bit_vector_score(query, doc)
        return relevances  # in the same order as the documents in the dataset
    #### TF-IDF with Dot Product
    def compute_IDF(self, M, collection):
        ### M: number of documents in the collection; collection: the documents (i.e., column 3 (index 2) in the dataset)
        # To solve this question you should use self.vocab
        self.IDF = np.zeros(len(self.vocab))  # Initialize the IDFs to zero
        # TODO: for word in vocab: Compute the IDF frequency of each word in the vocabulary
        for i, word in enumerate(self.vocab):
            # compute the number of documents containing the word (document frequency)
            doc_freq = sum(1 for doc in collection if word in doc.split())
            # avoid a division-by-zero error in case doc_freq is 0
            if doc_freq > 0:
                self.IDF[i] = math.log((M + 1) / doc_freq)
            else:
                self.IDF[i] = 0
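    # This is a smoothed IDF, log((M + 1) / df): a word that appears in every
    # document still receives a small positive weight instead of exactly zero.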
    def text2TFIDF(self, text, applyBM25_and_IDF=False):
        ### returns the TF-IDF vector representation of the text
        # TODO: Use self.vocab and self.IDF to compute the TF-IDF representation of the text
        k = 3.5
        tfidfVector = np.zeros(len(self.vocab))
        # Split the text into words once, outside the loop
        text_words = text.split()
        for i, word in enumerate(self.vocab):
            if word in text_words:
                # TODO: Set the value of TF-IDF to be (temporarily) equal to the word count of word in the text
                term_freq = text_words.count(word)
                tfidfVector[i] = term_freq
                if applyBM25_and_IDF:
                    # TODO: update the value of the tfidfVector entry to be equal to BM-25 (of the word in the document) multiplied by the IDF of the word
                    tfidfVector[i] = self.IDF[i] * (k + 1) * term_freq / (term_freq + k)
        return tfidfVector
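    # Example (with the k = 3.5 used above): a word occurring 3 times in a document
    # yields a saturated TF of (k + 1) * 3 / (3 + k) = 4.5 * 3 / 6.5 ≈ 2.08, which is
    # then scaled by the word's IDF when applyBM25_and_IDF is True.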
    # grade (enter your code in this cell - DO NOT DELETE THIS LINE)
    def tfidf_score(self, query, doc, applyBM25_and_IDF=False):
        q = self.text2TFIDF(query)
        d = self.text2TFIDF(doc, applyBM25_and_IDF)
        # TODO: compute the relevance using q and d
        relevance = np.dot(q, d)
        return relevance
    def execute_search_TF_IDF(self, query):
        # DIFF: Compute IDF
        self.adapt_vocab_query(query)  # Ensure query is part of the "common language" of documents and query
        # global IDF
        self.compute_IDF(self.dataset.shape[0], self.dataset[2])
        # IDF is needed for TF-IDF and can be precomputed for all words in the vocabulary and a given fixed collection (this exercise)
        # For this function, you can use self.IDF and self.dataset
        relevances = np.zeros(self.dataset.shape[0])  # Initialize relevances of all documents to 0
        # TODO: Use self.vocab to compute the relevance/ranking score of each document in the dataset using tfidf_score
        for index, row in self.dataset.iterrows():
            doc = row[2]
            relevances[index] = self.tfidf_score(query, doc, True)
        return relevances  # in the same order as the documents in the dataset
if __name__ == '__main__':
    tr = TextRetrieval()
    tr.read_and_preprocess_Data_File()  # builds the collection
    tr.build_vocabulary()  # builds an initial vocabulary based on common words
    queries = ["olympic gold athens", "reuters stocks friday", "investment market prices"]
    print("#########\n")
    print("Results for BitVector")
    for query in queries:
        print("QUERY:", query)
        relevance_docs = tr.execute_search_BitVec(query)
        # TODO: Once the relevances are computed, print the top 5 most relevant documents and the bottom 5 least relevant (for your reference)
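        # A minimal sketch of the step the TODO above asks for, assuming
        # relevance_docs is a 1-D NumPy array aligned with the rows of tr.dataset:
        ranked = np.argsort(relevance_docs)[::-1]  # document indices, best first
        print("Top 5 documents:")
        for idx in ranked[:5]:
            print(f"  score={relevance_docs[idx]:.2f}  {tr.dataset.iloc[idx, 2]}")
        print("Bottom 5 documents:")
        for idx in ranked[-5:]:
            print(f"  score={relevance_docs[idx]:.2f}  {tr.dataset.iloc[idx, 2]}")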
print("#########\n")
print("Results for TF-IDF")
for query in queries:
print("QUERY:", query)
relevance_docs = tr.execute_search_TF_IDF(query)
# TODO: Once the relevances are computed, print the top 5 most relevant documents and the bottom 5 least relevant (for your reference)
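        # The same argsort-based sketch shown for the BitVector search above can be
        # reused here to list the top 5 and bottom 5 documents; only relevance_docs
        # changes, since it now holds the BM25-weighted TF-IDF scores.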