-
Notifications
You must be signed in to change notification settings - Fork 38
/
stanford_sentiment_extractor.py
65 lines (57 loc) · 2.22 KB
/
stanford_sentiment_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: UTF-8 -*-
import os
import sys
# Literal substring replacements applied to every selected sentence before it
# is looked up in the phrase dictionary: bracket placeholders become real
# brackets, and two escape-coded sequences are removed / turned into a space.
# NOTE(review): the escape-coded keys look like mis-encoded byte sequences as
# seen by Python 2 byte strings -- confirm they still match if the files are
# ever read as decoded text.
_DATASET_FIXES = {"\x83\xc2": "", "-LRB-": "(", "-RRB-": ")", "\xc3\x82\xc2\xa0": " "}

# Accepted values for the second command-line argument.
_SENTENCE_TYPES = ("full", "all")


def _read_fields(path, separator, skip_first_line=True, exact=True):
    """Yield the separator-split fields of every line in the file at *path*.

    Each line is stripped of surrounding whitespace before it is split.

    Args:
        path: File to read.
        separator: Field delimiter passed to ``str.split``.
        skip_first_line: Discard the first line of the file (presumably a
            column header) before yielding anything. An empty file simply
            yields nothing.
        exact: When true a line must have exactly two fields; when false it
            must have at least two (extra fields are left to the caller).

    Raises:
        ValueError: A line does not have the required number of fields.
    """
    with open(path, "r") as handle:
        if skip_first_line:
            next(handle, None)
        for line in handle:
            fields = line.strip().split(separator)
            if len(fields) < 2 or (exact and len(fields) > 2):
                raise ValueError("Unexpected file format")
            yield fields


def _apply_fixes(sentence):
    """Return *sentence* with every ``_DATASET_FIXES`` replacement applied."""
    for broken, fixed in _DATASET_FIXES.items():
        sentence = sentence.replace(broken, fixed)
    return sentence


def main(argv=None):
    """Print ``<label>\\t<text>`` lines for one split of the dataset.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name first),
            followed by ``split_type``, ``sentence_type`` and ``data_path``.
            Extra trailing arguments are ignored. Defaults to ``sys.argv``.

            * ``split_type`` is compared, as a string, with the second column
              of ``datasetSplit.txt`` to select sentences.
            * ``sentence_type`` is ``"full"`` (print each selected sentence
              with its label) or ``"all"`` (print every dictionary phrase
              that occurs as a substring of at least one selected sentence).
            * ``data_path`` is the directory holding ``datasetSplit.txt``,
              ``datasetSentences.txt``, ``sentiment_labels.txt`` and
              ``dictionary.txt``; a trailing path separator is optional.

    Returns:
        0 on success, 2 when the arguments are missing or invalid (a usage
        message is written to stderr in that case).

    Raises:
        ValueError: A data file line has an unexpected number of fields, or a
            sentiment value is not a number.
        KeyError: A sentence id is missing from the split file, a phrase id is
            missing from the label file, or (``"full"`` mode) a selected
            sentence is not present in the dictionary.
    """
    args = sys.argv if argv is None else list(argv)
    if len(args) < 4 or args[2] not in _SENTENCE_TYPES:
        prog = args[0] if args else "stanford_sentiment_extractor.py"
        sys.stderr.write(
            "usage: %s <split_type> <%s> <data_path>\n"
            % (prog, "|".join(_SENTENCE_TYPES))
        )
        return 2
    split_type, sentence_type, data_path = args[1], args[2], args[3]

    # read dataset split: sentence id -> split label
    split_by_id = {}
    split_file = os.path.join(data_path, "datasetSplit.txt")
    for fields in _read_fields(split_file, ",", exact=False):
        split_by_id[fields[0].strip()] = fields[1].strip()

    # read relevant sentences: only those whose id belongs to the wanted split
    sentences = []
    sentence_file = os.path.join(data_path, "datasetSentences.txt")
    for sentence_id, text in _read_fields(sentence_file, "\t"):
        if split_by_id[sentence_id] == split_type:
            sentences.append(_apply_fixes(text))

    # read sentiment labels: phrase id -> sentiment value
    label_by_phrase_id = {}
    label_file = os.path.join(data_path, "sentiment_labels.txt")
    for phrase_id, value in _read_fields(label_file, "|"):
        label_by_phrase_id[phrase_id] = float(value)

    # read the phrases: phrase text -> sentiment value (no line is skipped
    # at the top of this file, unlike the other three)
    phrases = {}
    dictionary_file = os.path.join(data_path, "dictionary.txt")
    for phrase, phrase_id in _read_fields(
        dictionary_file, "|", skip_first_line=False
    ):
        phrases[phrase] = label_by_phrase_id[phrase_id]

    # print the labels and sentences/phrases
    # A single parenthesised argument keeps print() valid, with the same
    # output, under both the Python 2 statement and the Python 3 function.
    if sentence_type == "full":
        for sentence in sentences:
            print(str(phrases[sentence]) + "\t" + sentence)
    else:  # "all"
        for phrase in phrases:
            if any(phrase in sentence for sentence in sentences):
                print(str(phrases[phrase]) + "\t" + phrase)
    return 0


if __name__ == "__main__":
    sys.exit(main())