-
Notifications
You must be signed in to change notification settings - Fork 0
/
post_process.py
109 lines (93 loc) · 3.42 KB
/
post_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#
# post_process.py
# Post process output JSON files and remove noisy ones
#
import json
import os
from os import walk
from tqdm import tqdm
import logging
# ---------------------------------------------------------------------------
# Logging configuration (runs at import time).
# Two independent file loggers are created; both files are opened in 'w'
# mode, so each one is truncated whenever this module is loaded.
# ---------------------------------------------------------------------------

# General log: timestamp, thread name, level and message on every line.
rootLogger = logging.getLogger('Post_Processing')
logFormatter = logging.Formatter(
    "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]\t%(message)s")
fileHandler = logging.FileHandler(
    "{0}/{1}.log".format('./', 'output_post_processing'), mode='w')
fileHandler.setFormatter(logFormatter)
rootLogger.addHandler(fileHandler)
rootLogger.setLevel(logging.INFO)

# Removed-tables log: bare message only, one line per record.
skippedLogger = logging.getLogger('Removed_tables')
skippedTables_logFormatter = logging.Formatter("%(message)s")
skippedTables_fileHandler = logging.FileHandler(
    "{0}/{1}.log".format('./', 'removed_tables_post_processing'), mode='w')
skippedTables_fileHandler.setFormatter(skippedTables_logFormatter)
skippedLogger.addHandler(skippedTables_fileHandler)
skippedLogger.setLevel(logging.INFO)

# No console handler is attached; to mirror rootLogger to the terminal, add a
# logging.StreamHandler using logFormatter here.
# Non-ASCII Unicode space characters (plus the line/paragraph separators),
# spelled with explicit escapes so they stay visible in editors and diffs and
# cannot be silently altered or split across lines by tooling.
# NOTE(review): the previous implementation listed 16 literal invisible
# characters that cannot be told apart in the source; confirm this set covers
# every character the pipeline needs to normalise.
_UNICODE_SPACES = (
    '\u00a0'  # no-break space
    '\u1680'  # ogham space mark
    '\u2000\u2001\u2002\u2003\u2004\u2005'  # en quad .. six-per-em space
    '\u2006\u2007\u2008\u2009\u200a'  # six-per-em .. hair space
    '\u2028'  # line separator
    '\u2029'  # paragraph separator
    '\u202f'  # narrow no-break space
    '\u205f'  # medium mathematical space
    '\u3000'  # ideographic space
)
_SPACE_TABLE = dict.fromkeys(map(ord, _UNICODE_SPACES), ' ')


def clean_str(str):
    """
    Normalise exotic whitespace in a string
    :param str: the text to clean (the name shadows the builtin; it is kept
        so existing keyword callers keep working)
    :return: the text with every character of ``_UNICODE_SPACES`` replaced by
        a plain ASCII space, and leading/trailing whitespace stripped
    """
    # One translate() pass replaces all listed characters at once.
    return str.translate(_SPACE_TABLE).strip()
def clean_report(report):
    """
    Clean the given report

    Every non-empty cell value is passed through ``clean_str``. A table with
    more than one completely empty row is dropped from the result and its
    title is written to the removed-tables log. A table with exactly one
    empty row keeps everything except that row. A row counts as empty when
    every one of its own cells has ``'V' == ""`` (a row with no cells counts
    as empty too), independent of the width of the first row. A table with
    no rows at all is kept unchanged.

    The tables in ``report`` are modified in place (cell values are rewritten
    and the single empty row is popped); only the returned list is new.

    :param report: list of tables; each table is a dict with a 'Cells' list
        of rows (each row a list of dicts holding the cell value under 'V')
        and a 'Title' that is read when the table is removed
    :return: list of the tables that were kept, in their original order
    """
    removed_indices = set()  # set gives O(1) membership in the final filter

    for table_idx, table in enumerate(report):
        rows = table['Cells']
        empty_row_idx = None
        num_empty_rows = 0

        for row_idx, row in enumerate(rows):
            row_is_empty = True
            for cell in row:
                if cell['V'] == "":
                    continue
                row_is_empty = False
                cell['V'] = clean_str(cell['V'])
            if row_is_empty:
                num_empty_rows += 1
                empty_row_idx = row_idx

        if num_empty_rows > 1:
            # More than one blank row: treat the whole table as noise.
            removed_indices.add(table_idx)
            skippedLogger.info('Skipped table: %s', table["Title"])
        elif empty_row_idx is not None:
            # Exactly one blank row: drop just that row and keep the table.
            rows.pop(empty_row_idx)

    return [
        table
        for table_idx, table in enumerate(report)
        if table_idx not in removed_indices
    ]
def batch_process(directory, output_directory='./output_cleaned'):
    """
    Batch post processing of JSON files

    Walks ``directory`` recursively, parses every file found as JSON, runs it
    through ``clean_report`` and writes the result under the same base name
    into ``output_directory``. Files that share a base name in different
    sub-directories are written to the same output path, so the later one
    wins.

    :param directory: The directory containing the json files
    :param output_directory: The directory the cleaned files are written to;
        it is created if it does not exist yet
    :return:
    """
    print('Getting filenames for post processing..')
    files = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in walk(directory)
        for filename in filenames
    ]

    # Without this the first write fails when the folder is missing.
    os.makedirs(output_directory, exist_ok=True)

    print('Processing files..')
    for file in tqdm(files):
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file, encoding='utf-8') as fp:
            report = json.load(fp)

        cleaned_data = clean_report(report)

        # basename() copes with any path separator, unlike splitting on '/'.
        output_filename = os.path.join(
            output_directory, os.path.basename(file))
        with open(output_filename, 'w', encoding='utf-8') as fout:
            json.dump(cleaned_data, fout)
if __name__ == "__main__":
    # Script entry point: post-process the files found under ./temp_output.
    batch_process(directory='./temp_output')