-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlanguage_features.py
137 lines (107 loc) · 4.11 KB
/
language_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
def read_from_file(file_name):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Path to the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    print("Importing csv")
    return pd.read_csv(file_name)
def compute_question_length(input_data):
    """Return the character length of every question.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a 'Question' column of strings.

    Returns
    -------
    list[int]
        len() of each question, in row order.

    Note: iterating the Series directly (instead of positional
    ``questions[i]`` label lookups) works for any index, not just a
    freshly reset 0..n-1 RangeIndex.
    """
    return [len(question) for question in input_data['Question']]
def named_entity_recognition(input_data, entity_label_type):
    """Extract named entities of one label type from every question.

    Uses the module-global spaCy pipeline ``nlp`` (loaded in the
    ``__main__`` block), so this must run after that model is loaded.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a 'Question' column of strings.
    entity_label_type : str
        spaCy entity label to keep, e.g. "PERSON".

    Returns
    -------
    list[list[str]]
        For each question, the entity texts whose ``label_`` matches
        ``entity_label_type`` (possibly an empty list).
    """
    named_entities = []
    for question in input_data['Question']:
        doc = nlp(question)
        named_entities.append(
            [ent.text for ent in doc.ents if ent.label_ == entity_label_type]
        )
    return named_entities
def get_name_count(names):
    """Summarize per-question entity lists into counts and flags.

    Parameters
    ----------
    names : list[list[str]]
        One list of entity strings per question.

    Returns
    -------
    tuple[list[int], list[bool]]
        Per-question entity count, and whether each question has at
        least one entity.
    """
    name_count = [len(entry) for entry in names]
    name_boolean = [count > 0 for count in name_count]
    return name_count, name_boolean
def get_person_name_count_details(input_data):
    """Count PERSON entities in each question.

    Convenience wrapper: runs NER restricted to the "PERSON" label and
    reduces the result to per-question counts and booleans.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a 'Question' column of strings.

    Returns
    -------
    tuple[list[int], list[bool]]
        Number of person names per question, and whether each question
        mentions at least one person.
    """
    person_names = named_entity_recognition(input_data, "PERSON")
    return get_name_count(person_names)
def get_conjunction_phrases(input_data, dep_label="cc"):
    """Count coordinating-conjunction tokens in each question.

    Uses the module-global spaCy pipeline ``nlp``; each token whose
    dependency tag equals ``dep_label`` is counted.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a 'Question' column of strings.
    dep_label : str, optional
        spaCy dependency label to count. Defaults to "cc"
        (coordinating conjunction), preserving the original behavior.

    Returns
    -------
    list[int]
        Matching-token count per question, in row order.
    """
    conjunction_count = []
    for question in input_data["Question"]:
        doc = nlp(question)
        conjunction_count.append(
            sum(1 for token in doc if token.dep_ == dep_label)
        )
    return conjunction_count
def get_prepositional_phrases(input_data, dep_label="prep"):
    """Count prepositional tokens in each question.

    Uses the module-global spaCy pipeline ``nlp``; each token whose
    dependency tag equals ``dep_label`` is counted.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a 'Question' column of strings.
    dep_label : str, optional
        spaCy dependency label to count. Defaults to "prep"
        (prepositional modifier), preserving the original behavior.

    Returns
    -------
    list[int]
        Matching-token count per question, in row order.
    """
    preposition_count = []
    for question in input_data["Question"]:
        doc = nlp(question)
        preposition_count.append(
            sum(1 for token in doc if token.dep_ == dep_label)
        )
    return preposition_count
def get_math_symbol_count(input_data):
    """Count occurrences of arithmetic/comparison symbols per question.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a 'Question' column of strings.

    Returns
    -------
    list[int]
        Number of math-symbol characters in each question.
    """
    # Set membership is O(1) per character (the original list was O(k)).
    symbols = {'+', '-', '*', '/', '=', '>', '<', '^', '%'}
    return [
        sum(1 for character in question if character in symbols)
        for question in input_data['Question']
    ]
if __name__ == '__main__':
    # The spaCy pipeline is deliberately module-global: the NER and
    # dependency-parse helpers above read ``nlp`` directly.
    nlp = spacy.load("en_core_web_sm")

    frame = read_from_file("More_Processed_Data.csv")
    # Keep only questions answered by more than 5 students (rows with
    # 5 or fewer are dropped) and renumber rows from 0.
    frame = frame[frame['num students'] > 5].reset_index(drop=True)

    # Compute each language feature as a per-row list.
    ques_length = compute_question_length(frame)
    person_name_count, person_name_boolean = get_person_name_count_details(frame)
    conjunction_phrase_count = get_conjunction_phrases(frame)
    preposition_phrase_count = get_prepositional_phrases(frame)
    math_symbols = get_math_symbol_count(frame)

    # Assemble the output frame: original columns plus one column per feature.
    data_language_features = frame.copy()
    data_language_features['question_length'] = np.array(ques_length)
    data_language_features['person_name_count'] = np.array(person_name_count)
    data_language_features['person_name_boolean'] = np.array(person_name_boolean)
    data_language_features['conjunction_phrase_count'] = np.array(conjunction_phrase_count)
    data_language_features['preposition_phrase_count'] = np.array(preposition_phrase_count)
    data_language_features['math_symbols_count'] = np.array(math_symbols)

    print(data_language_features.head())
    data_language_features.to_csv("Language_Processed_Data.csv")