-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtext_preprocessing.rb
295 lines (273 loc) · 12.2 KB
/
text_preprocessing.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
require 'constants'
require 'edge'
require 'vertex'
require 'faster_csv'
class TextPreprocessing
=begin
Reads the review texts (the first column of every row) from the given CSV file
=end
# Collects the first column of every row of a CSV file.
#
# filename - path passed straight to FasterCSV.foreach
#
# Returns an Array holding row[0] of each row, in file order.
def fetch_data(filename)
  first_column = []
  FasterCSV.foreach(filename) { |csv_row| first_column.push(csv_row[0]) }
  first_column
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
pre-processes the review text and sends it in for graph formation and further analysis
=end
# Splits review/submission texts into clauses, cutting after every
# ".", "?", "!", "," or ";" (the punctuation mark stays attached to its clause).
#
# Before splitting, double quotes and parentheses are removed and, when the
# text contains a URL marker, remove_urls is applied.
#
# flag       - 0 for training: the clauses of all texts are pooled into one
#              flat array. Any other value (callers use 1) for testing: one
#              array of clauses is produced per text.
# text_array - Array of Strings (nil entries are treated as empty text).
#
# Returns an Array of Strings when flag == 0, otherwise an Array of Arrays of
# Strings with exactly one inner array per entry of text_array.
#
# Differences from the previous implementation (all bug fixes):
# * test mode (flag != 0) used to fall off the end of the method and return nil;
# * text following the last punctuation mark used to be dropped;
# * a punctuation mark at index 0 could be skipped because 0 doubled as the
#   "nothing found yet" sentinel;
# * the caller's strings are no longer modified in place.
def segment_text(flag, text_array)
  training = (flag == 0)
  pooled_clauses = []  # used when training
  per_review = []      # used when testing

  text_array.each do |raw_text|
    # String#delete returns a copy, so the caller's string is left untouched
    text = raw_text.to_s.delete("\"()")
    text = remove_urls(text) if text =~ %r{https?://}

    clauses = []
    # cut after the earliest punctuation mark until none is left
    while (cut = text.index(/[.?!,;]/))
      clauses << text[0..cut].strip
      text = text[(cut + 1)..-1]
    end
    # keep whatever follows the last mark; a text without any mark is stored
    # whole (even when empty), exactly as before
    remainder = text.strip
    clauses << remainder if clauses.empty? || !remainder.empty?

    if training
      pooled_clauses.concat(clauses)
    else
      per_review << clauses
    end
  end

  training ? pooled_clauses : per_review
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
* Reads the patterns from the csv file containing them.
* Returns an array of Edge objects, one per row; the state of their vertices (POSITIVE, NEGATED or SUGGESTIVE) is chosen from the file name.
=end
# Builds one Edge per row of a pattern CSV file.
#
# Each row's first cell is cut at its first "=": the part before it becomes the
# in-vertex phrase, the part starting two characters after it becomes the
# out-vertex phrase (both stripped).
#
# filename - CSV path; a name containing "prob" selects the NEGATED state,
#            otherwise one containing "suggest" selects SUGGESTIVE, otherwise
#            POSITIVE is used.
# pos      - object whose get_readable(token) returns the token in
#            "word/TAG" form (presumably a POS tagger - confirm with caller).
#
# Returns an Array of Edge objects in file order.
def read_patterns(filename, pos)
  state =
    if filename.include?("prob")
      NEGATED
    elsif filename.include?("suggest")
      SUGGESTIVE
    else
      POSITIVE
    end

  noun_markers = ["/NN", "/PRP", "/IN", "/EX", "/WP"]

  # Creates the Vertex for one side of a pattern; the vertex type is picked
  # from the tagged form of the phrase's first token, defaulting to NOUN.
  build_vertex = lambda do |phrase, tagged, slot|
    vertex_type =
      if noun_markers.any? { |marker| tagged.include?(marker) }
        NOUN
      elsif tagged.include?("/VB") || tagged.include?("MD")
        VERB
      elsif tagged.include?("JJ")
        ADJ
      elsif tagged.include?("/RB")
        ADV
      else
        NOUN
      end
    # everything after the first "/" is kept as the POS tag
    pos_tag = tagged[(tagged.index("/") + 1)..tagged.length]
    Vertex.new(phrase, vertex_type, slot, state, nil, nil, pos_tag)
  end

  patterns = []
  FasterCSV.foreach(filename) do |row|
    cell = row[0]
    in_phrase = cell[0..(cell.index("=") - 1)].strip
    out_phrase = cell[(cell.index("=") + 2)..cell.length].strip
    # only the first token of each phrase is tagged
    in_tagged = pos.get_readable(in_phrase.split(" ")[0])
    out_tagged = pos.get_readable(out_phrase.split(" ")[0])

    slot = patterns.length  # position of this pattern in the result
    edge = Edge.new("noun", NOUN)
    patterns << edge
    edge.in_vertex = build_vertex.call(in_phrase, in_tagged, slot)
    edge.out_vertex = build_vertex.call(out_phrase, out_tagged, slot)
  end
  patterns
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
Removes any urls in the text and returns the remaining text as it is
=end
# Drops every whitespace-separated token that contains a URL marker.
#
# Both "http://" and "https://" are recognised (previously only "http://" was,
# so secure links were left in the text).
#
# text - String to clean.
#
# Returns text itself, untouched, when it contains no URL marker. Otherwise
# returns a new String in which the surviving tokens are each preceded by a
# single space (so the result starts with a space and runs of whitespace are
# collapsed); this output format is kept unchanged for existing callers.
def remove_urls(text)
  url_markers = ["http://", "https://"]
  return text unless url_markers.any? { |marker| text.include?(marker) }

  kept_tokens = text.split(" ").reject do |token|
    url_markers.any? { |marker| token.include?(marker) }
  end
  kept_tokens.map { |token| " " + token }.join
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
Check for plagiarism after removing text within quotes for reviews
=end
# Removes every double-quoted segment (quotes included) from each review text.
#
# review_text - Array of Strings.
#
# Returns a new Array with one cleaned String per input row; the input strings
# are not modified. A lone, unmatched quote has no closing partner and is left
# in place.
#
# The previous implementation located the quoted text with String#index on
# Array#to_s output and looped while any quote remained. That crashed on
# Ruby 1.9+ (where Array#to_s is inspect), never terminated on an unmatched
# quote, and could delete an earlier unquoted copy of the quoted words.
def remove_text_within_quotes(review_text)
  review_text.map { |row| row.gsub(/"[^"]*"/, "") }
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
Looks for spelling mistakes in the text and fixes them using the raspell library available for ruby
=end
# Replaces misspelled tokens with the speller's first suggestion and
# lower-cases every token.
#
# review_text_array - Array of Strings, one per response.
# speller           - object responding to correct?(word) and
#                     suggestions(word) (an Array, possibly empty).
#
# Returns a new Array with one String per response. Tokens are split on
# whitespace and each is emitted with a single leading space, so a non-empty
# result starts with a space (format unchanged from the previous version).
# A misspelled token without suggestions is kept as it is.
def check_correct_spellings(review_text_array, speller)
  review_text_array.map do |review_text|
    fixed_tokens = review_text.split(" ").map do |token|
      word = token
      unless speller.correct?(token)
        # look the suggestions up once; the lookup used to be done twice
        best_guess = speller.suggestions(token).first
        word = best_guess unless best_guess.nil?
      end
      word.downcase
    end
    fixed_tokens.map { |word| " " + word }.join
  end
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
Removes punctuation marks like ".", ",", "?" etc. from "str" and returns the resulting string.
=end
public #The method was throwing a "NoMethodError: private method" error when called from a different class. Hence the "public" keyword.
# Removes the punctuation characters . , ? ! ; : ( ) [ ] from "str".
#
# str - String to clean; it is modified in place, as before.
#
# Returns the same String object.
#
# Two defects of the earlier elsif chain are fixed: semicolons were never
# removed (non-bang gsub whose result was discarded), and only the first kind
# of punctuation found was stripped, so "(a)" came back as "a)".
def contains_punct(str)
  punctuation = ".,?!;:()[]"
  # only touch the string when there is something to remove, so strings
  # without punctuation (including frozen ones) pass straight through
  str.delete!(punctuation) if str.count(punctuation) > 0
  str
end
# Tells whether "str" contains a curly brace or the two-character sequence
# backslash + "n" (a literal "\n", not a newline character).
#
# Returns true or false.
def contains_punct_bool(str)
  ["\\n", "}", "{"].any? { |marker| str.include?(marker) }
end
#------------------------------------------#------------------------------------------#------------------------------------------
=begin
Checking if "str" is a punctuation mark like ".", ",", "?" etc.
=end
# Tells whether "str" is exactly one of the punctuation marks
# "." "," "?" "!" ";" ":".
#
# Returns true or false.
def is_punct(str)
  [".", ",", "?", "!", ";", ":"].include?(str)
end
end #end of class