-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataloaders.py
187 lines (141 loc) · 6.87 KB
/
dataloaders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
This file has utility functions that we expect to reuse in different experiments throughout the
whole project.
"""
# Package imports
import logging
import json
import os
from tqdm import tqdm # my addition
####################################################################################################
def read_swell_directory(path:str="."):
    """
    Read every Swell file stored in the sub-folders of `path`.

    The folder `path` itself and its "__pycache__" sub-folder are skipped; every other folder
    reached by walking `path` is treated as a Swell folder whose files are all read with
    `read_swell_file`. Files whose name starts with "._" are ignored.

    NOTE: If you're expecting to have extra folders or anything like that, you _will_ have to
          change this function to fix some hardcoded assumptions.

    INPUT:
        - path  A string containing the path to the folder containing the folders with the Swell
                files. So, for example if the current folder has "spIn_v2.0", "SW1203_v2.0", and
                "TISUS_v2.0" you would pass the value "." as an argument.
                Default: "."
    OUTPUT:
        - document_list  A list containing a dictionary for each document that was read without
                         errors. The keys for each document are the following:
                         - id               The Swell ID of the imported file
                         - text             The original text of the essay
                         - text_normalized  The normalized essay text
                         - metadata         The metadata imported as a dictionary
                         - svala_graph      The graph for Svala annotations for the current file
        - file_errors    A list containing the IDs of the files where we encountered errors. If a
                         failing file has no ID, its path is stored instead.
    RAISES:
        - Exception  If a Swell folder contains another folder, or if it contains no files.
    """
    # Initialize list of documents and of errors
    document_list = []
    file_errors = []

    # Compare normalized paths so that "data", "data/" and "./data" are all recognised as the
    # root folder, independently of the path separator used by the operating system
    skipped_folders = {
        os.path.normpath(path),
        os.path.normpath(os.path.join(path, "__pycache__")),
    }

    # Go through all directories in the current path
    for parent, subfolders, files in os.walk(path):
        # The root folder and its cache folder do not hold Swell files
        if os.path.normpath(parent) in skipped_folders:
            continue

        # NOTE: this was assuming that you do not change the folders for Swell. Things will break
        #       if either you have extra folders within the same Swell folders or if there are
        #       any empty folders
        if subfolders:
            raise Exception("Found a folder in one of the Swell directories")
        if not files:
            raise Exception("Found an empty folder in the given path")

        # Read every file of the current folder
        for file in tqdm(files, desc='Reading...'):
            # Skip files whose name starts with "._"
            if file.startswith('._'):
                continue

            curr_path = os.path.join(parent, file)
            document, error = read_swell_file(curr_path)

            if error:
                # A broken file may not have an ID, so fall back to its path
                file_errors.append(document.get("id", curr_path))
                continue

            document_list.append(document)

    return document_list , file_errors
####################################################################################################
# This reads a single Swell-Pilot file
def read_swell_file(path:str):
    """
    Read a single Swell-Pilot file given its path.

    The file is read line by line. Empty lines are skipped and every other line is recognised by
    its (case-insensitive) prefix: "essay id", "metadata", "source", "target" or "svala-graph".
    Everything that follows the "svala-graph" line is considered part of the graph, which is
    finally parsed as JSON.

    INPUT:
        path  A string containing the path to the file we want to read. The file has to be in the
              format of the files that Elena and Maria worked on (i.e. Swell-Pilot).
    OUTPUT:
        document    A dictionary with the following keys (a key is only present if the matching
                    line was found in the file):
                    - id               The Swell ID of the imported file
                    - text             The original text of the essay
                    - text_normalized  The normalized essay text
                    - metadata         The metadata imported as a dictionary
                    - svala_graph      The graph for Svala annotations for the current file. If
                                       the graph is not valid JSON, the raw string is kept.
        error_flag  A flag (0 or 1) to note whether there were any errors when reading the file.
                    It is raised when an unknown line is found, when the Svala graph is missing
                    and when the Svala graph cannot be parsed.
    """
    # Activate logger
    logger = logging.getLogger("swell_read_file")

    # Initialize stuff
    document = {}
    error_flag = 0
    # Pieces of the Svala graph; stays None until the "svala-graph" line is found
    graph_parts = None

    # Open the file
    with open(path, "r", encoding="utf-8") as F:
        for raw_line in F:
            # Remove leading/trailing newlines and whitespaces
            line = raw_line.strip("\n ")

            # Skip empty lines
            if not line:
                continue

            # Once the Svala graph starts, everything else belongs to it
            if graph_parts is not None:
                graph_parts.append(line)
                continue

            # The longest prefix we look for ("svala-graph") has 11 characters
            prefix = line[:11].lower()

            if prefix.startswith("svala-graph"):
                # Split only on the first colon: the graph itself is JSON and full of colons
                graph_parts = [line.partition(":")[2].strip(" ")]
            elif prefix.startswith("essay id"):
                document["id"] = line.partition(":")[2].strip(" ")
            elif prefix.startswith("metadata"):
                document["metadata"] = _parse_swell_metadata(line[9:])
            # Getting the original text
            elif prefix.startswith("source"):
                document["text"] = line[7:].strip()
            # Getting the normalized text
            elif prefix.startswith("target"):
                document["text_normalized"] = line[7:].strip()
            # If there's a line we don't know how to read, log a warning and raise the error flag
            else:
                error_flag = 1
                # The ID may not have been read yet, so fall back to the path of the file
                logger.warning("Weird line found in document %s", document.get("id", path))
                logger.warning("Content: %s", line)
                logger.warning(len(line))

    # A file without a Svala graph is an error, but it should not crash the whole import
    if graph_parts is None:
        logger.error("No svala graph found in document %s", document.get("id", path))
        return document, 1

    # Read the Svala graph
    document["svala_graph"] = "".join(graph_parts)
    try:
        document["svala_graph"] = json.loads(document["svala_graph"])
    # If we cannot read the Svala graph, log the error and raise the error flag
    except json.JSONDecodeError as E:
        error_flag = 1
        logger.error("Could not import svala graph from document %s", document.get("id", path))
        logger.error(E)

    return document, error_flag


def _parse_swell_metadata(raw_metadata):
    """
    Turn the `key="value"` pairs of a metadata line into a dictionary.

    INPUT:
        raw_metadata  The content of the metadata line after the "metadata:" prefix.
    OUTPUT:
        A dictionary mapping every key to its value, without the surrounding double quotes.
    """
    raw_values = {}
    current_key = None

    # The first token is not a key/value pair and is skipped
    # NOTE(review): presumably it is the opening tag name - confirm against the Swell files
    for token in raw_metadata.split()[1:]:
        if '="' in token:
            # Split only once so that values containing '="' do not break the parsing
            current_key, value = token.split('="', 1)
            raw_values[current_key] = value
        elif current_key is not None:
            # Values may contain whitespace, so this token continues the previous value
            raw_values[current_key] += " " + token
        # Tokens found before the first key cannot be attached to anything and are dropped

    # Strip the quotes at the very end, so multi-word values also lose their closing quote
    return {key: value.strip('"') for key, value in raw_values.items()}
####################################################################################################