SimplifiedLesk.py
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
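
# Setup hint (not part of the original script): the NLTK resources used below
# must be downloaded once before running, e.g.:
#   import nltk
#   nltk.download('punkt')      # for word_tokenize
#   nltk.download('wordnet')    # for the wordnet corpus and WordNetLemmatizer
#   nltk.download('stopwords')  # for the stopword list
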
class SimplifiedLesk:
    # Names of the test datasets; each is loaded from data/cleaned/<name>.csv
    DATASET_NAMES = ["semeval2007", "semeval2013", "semeval2015",
                     "senseval2", "senseval3", "allTest"]

    def __init__(self):
        self.semeval2007 = None  # Test dataset
        self.semeval2013 = None  # Test dataset
        self.semeval2015 = None  # Test dataset
        self.senseval2 = None    # Test dataset
        self.senseval3 = None    # Test dataset
        self.allTest = None      # All test datasets combined

    def loadData(self):
        """
        Load the test datasets and remove rows with NaN values.
        """
        for name in self.DATASET_NAMES:
            data = pd.read_csv(f"data/cleaned/{name}.csv")
            setattr(self, name, data.dropna().reset_index(drop=True))

    @staticmethod
    def preprocess(text):
        """
        Preprocess the text: lowercase, tokenize, lemmatize, and remove
        stopwords, punctuation, forbidden tokens, and numbers.
        :param text: the string to process
        :return: the processed string as a list of unique tokens
        """
        # Lowercase and tokenize, then lemmatize each token individually
        # (WordNetLemmatizer expects single words, not whole sentences)
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text.lower())]
        # Remove stopwords, punctuation, forbidden tokens and numbers
        forbidden_words = {"``", "''", "'", "`", "'s"}
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens
                  if word not in stop_words
                  and word not in punctuation
                  and word not in forbidden_words
                  and not word.isnumeric()]
        # Remove duplicates (token order does not matter for overlap counting)
        return list(set(tokens))

    @staticmethod
    def computeOverlap(signature, context):
        """
        Count the words that occur in both lists.
        :param signature: list of words in the glosses and examples of a synset
        :param context: list of words in the context
        :return: number of words in common
        """
        overlap = 0
        for token in context:
            if token in signature:
                overlap += 1
        return overlap
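
    # Illustrative example (hypothetical values, not taken from the datasets):
    #   signature = ["deposit", "money", "bank", "institution"]
    #   context   = ["i", "deposited", "money", "at", "the", "bank"]
    #   computeOverlap(signature, context) -> 2, since "money" and "bank"
    #   appear in both lists.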

    def classify(self, row):
        """
        Classify a word with a sense class.
        :param row: contains <id, target_word, context_string>,
            where target_word is the word to be classified and
            context_string is its context
        :return: the predicted sense class of the word
        """
        word = row.target_word
        context = word_tokenize(row.context_string)
        synsets = wn.synsets(word)
        if len(synsets) > 0:
            # Default to the most frequent sense (WordNet lists synsets by frequency)
            bestSense = synsets[0].lemmas()[0].key()
            maxOverlap = 0
            for synset in synsets:
                # The signature is the synset's definition plus its usage examples
                signature = synset.definition()
                for example in synset.examples():
                    signature += " " + example
                signature = self.preprocess(signature)
                # Keep the sense whose signature overlaps most with the context
                overlap = self.computeOverlap(signature, context)
                if overlap > maxOverlap:
                    maxOverlap = overlap
                    bestSense = synset.lemmas()[0].key()
            return bestSense
        # The word has no synsets in WordNet, so no prediction can be made
        return "None"

    def runClassification(self):
        """
        Classify all the words in the test datasets and save the results to file.
        """
        for name in self.DATASET_NAMES:
            data = getattr(self, name)
            data["predicted"] = data.apply(self.classify, axis=1)
            data = data.drop(["target_word", "context_string"], axis=1)
            data.to_csv(f"results/lesk/lesk_{name}_predicted.txt",
                        sep=' ', header=False, index=False)
            setattr(self, name, data)
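
# Minimal usage sketch (assumes the cleaned CSVs exist under data/cleaned/ and
# that the results/lesk/ output directory has already been created):
if __name__ == "__main__":
    lesk = SimplifiedLesk()
    lesk.loadData()
    lesk.runClassification()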