AutoGrade_Train.py
from Tools import word_count, words_per_sentence, voice, tense, grammar, spelling_mistakes, semantic_similarity, \
    vocabulary
from nltk import sent_tokenize
import csv
from openpyxl import load_workbook
import math
import os


def get_attributes(topic_essay, full_essay, word_limit):
    """Compute the feature vector for one essay as a list of [name, value] pairs."""
    attributes = []

    # ---------------Attribute 1-----------------
    # |1 - word_count / word_limit|: the further the essay strays from the word limit, the worse
    num_of_words = word_count.word_count(full_essay)
    word_count_limit_ratio = abs(1 - (num_of_words / word_limit))
    attributes.append(["Word count", num_of_words])
    attributes.append(["Word count limit ratio", word_count_limit_ratio])

    # ---------------Attribute 2-----------------
    # proportion of sentences longer than 15 words
    sentences = sent_tokenize(full_essay)
    attributes.append(["Sentence count", len(sentences)])
    sentence_length_value = words_per_sentence.long_sentences_score(sentences, 15)
    attributes.append(["Long sentences", sentence_length_value])

    # ---------------Attribute 3-----------------
    # voice: active-voice sentences / total sentences
    [active_voice, passive_voice] = voice.check_voice(full_essay)
    voice_ratio = 0
    if active_voice + passive_voice != 0:
        voice_ratio = active_voice / (active_voice + passive_voice)
    attributes.append(["Voice", voice_ratio])

    # ---------------Attribute 4-----------------
    # tense: verbs in the dominant tense / total verbs
    total_verbs, present, past, future = tense.check_tense(full_essay)
    dominant_tense_verbs = max(present, past, future)
    tense_ratio = 0
    if total_verbs != 0:
        tense_ratio = dominant_tense_verbs / total_verbs
    attributes.append(["Tense", tense_ratio])

    # ---------------Attribute 5-----------------
    # spelling errors: number of errors / number of words
    spell_errors, count = spelling_mistakes.get_spell_errors_count(full_essay)
    spell_errors_ratio = count / num_of_words
    attributes.append(["Spell errors", spell_errors_ratio])

    # ---------------Attribute 6-----------------
    # grammatical errors: number of errors / number of words
    grammar_error_score = grammar.check_grammar(sentences)
    grammar_error_ratio = grammar_error_score / num_of_words
    attributes.append(["Grammatical Errors", grammar_error_ratio])

    # ---------------Attribute 7-----------------
    # semantic similarity within the essay, scaled by log2(sentence count)
    essay_semantic_similarity = semantic_similarity.intra_para_semantic_similarity(full_essay)
    num_of_sentences = len(sentences)
    essay_semantic_similarity = essay_semantic_similarity * math.log(num_of_sentences, 2)
    attributes.append(["SS Essay", essay_semantic_similarity])

    # ---------------Attribute 8-----------------
    # semantic similarity between the topic and the essay, scaled the same way
    topic_essay_semantic_similarity = semantic_similarity.inter_para_semantic_similarity(topic_essay, full_essay)
    topic_essay_semantic_similarity = topic_essay_semantic_similarity * math.log(num_of_sentences, 2)
    attributes.append(["SS Topic Essay", topic_essay_semantic_similarity])

    # ---------------Attribute 9-----------------
    # vocabulary: distinct words used / word limit
    vocabulary_words, vocabulary_size = vocabulary.get_vocab(full_essay)
    attributes.append(["Vocabulary", vocabulary_size / word_limit])

    return attributes
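
# Illustration only: the list returned above has this [name, value] shape
# (the numbers here are made up, not real outputs):
#
#   get_attributes(topic, essay, 800)
#   -> [["Word count", 412], ["Word count limit ratio", 0.485],
#       ["Sentence count", 23], ["Long sentences", 0.61], ["Voice", 0.87],
#       ["Tense", 0.74], ["Spell errors", 0.012], ["Grammatical Errors", 0.019],
#       ["SS Essay", 1.93], ["SS Topic Essay", 1.41], ["Vocabulary", 0.22]]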


def write_to_csv(attr_with_values, csv_file_path):
    """Append one row of attribute values to the CSV, writing the header first if the file is new or empty."""
    attrs = [attr[0] for attr in attr_with_values]
    empty = True
    if os.path.exists(csv_file_path):
        with open(csv_file_path, newline='') as csvfile:
            reader = csv.reader(csvfile)
            if len(list(reader)) > 0:
                empty = False
    with open(csv_file_path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=attrs)
        if empty:
            writer.writeheader()
        data = {}
        for attr in attr_with_values:
            data[attr[0]] = attr[1]
        writer.writerow(data)
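
# For reference, train_data() below appends three label columns to the attributes,
# so the resulting CSV header reads:
#   Word count,Word count limit ratio,Sentence count,Long sentences,Voice,Tense,
#   Spell errors,Grammatical Errors,SS Essay,SS Topic Essay,Vocabulary,Score,Essay ID,Essay Set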


def train_data():
    """Read the labelled essays from the spreadsheet and append one feature row per essay to the CSV."""
    file_path = "Data/valid_set_set8.xlsx"
    wb = load_workbook(filename=file_path, read_only=True)
    ws = wb['valid_set']
    for row in ws.rows:
        essay_id = row[0].value
        essay_set = row[1].value
        essay = row[2].value
        score = row[3].value
        topic = ""
        # Only essay set 8 has a known prompt; rows from other sets are skipped below.
        if essay_set == 8:
            topic = "If you want a place in the sun, you will have to expect some blisters."
        print("------------------------------------------------------")
        print(essay_id)
        if topic != "":
            word_limit = 800
            attributes_with_values = get_attributes(topic, essay, word_limit)
            attributes_with_values.append(["Score", score])
            attributes_with_values.append(["Essay ID", essay_id])
            attributes_with_values.append(["Essay Set", essay_set])
            write_to_file = "data_test_set8.csv"
            write_to_csv(attributes_with_values, write_to_file)

# train_data()
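
# Suggested entry point (minimal sketch): guarding the call keeps the module
# importable without immediately running training. It assumes
# Data/valid_set_set8.xlsx with a 'valid_set' sheet exists, as referenced above.
if __name__ == "__main__":
    train_data()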