-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmakeCSV.py
225 lines (197 loc) · 8.61 KB
/
makeCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import os
import csv
import re
import sys
from pathlib import Path
def find_special_characters(text, logfile):
    """Log every special character (punctuation or whitespace) found in *text*.

    When at least one character from the special-character class occurs in
    *text*, a single message listing the matched characters is printed to
    stdout and appended to *logfile* as a one-column CSV row.

    Args:
        text: The string (here: a relative path) to inspect.
        logfile: Path of the log file to append to.

    Returns:
        None.
    """
    hits = "".join(re.findall(r"[!@#$%^&*()=+[\]{};':\"<>?~\s`]", text))
    # The log is opened in append mode even when nothing was found, so the
    # file exists after the call whether or not a message was written.
    with open(logfile, mode='a', newline='', encoding='utf-8') as handle:
        if not hits:
            return
        message = f"Special characters replaced [{hits}] in {text}"
        print(f"!!! {message}")
        csv.writer(handle).writerow([message])
def find_non_latin(text, logfile):
    """Log every character of *text* that lies outside the ASCII range.

    The character class matches anything that is not in U+0000-U+007F, so
    accented Latin letters are reported as well. When at least one such
    character is present, one message listing them is printed to stdout and
    appended to *logfile* as a one-column CSV row.

    Args:
        text: The string (here: a relative path) to inspect.
        logfile: Path of the log file to append to.

    Returns:
        None.
    """
    hits = "".join(re.findall(r'[^\u0000-\u007F]', text))
    # The log is opened in append mode even when nothing was found, so the
    # file exists after the call whether or not a message was written.
    with open(logfile, mode='a', newline='', encoding='utf-8') as handle:
        if not hits:
            return
        message = f"Non Latin characters found [{hits}] in {text}"
        print(f"!!! {message}")
        csv.writer(handle).writerow([message])
def gen_source_files(directory, bagged, log, makeMetaCSV):
    """Write ``metadata/source-metadata.csv`` listing the XML files under ``metadata``.

    Every ``.xml`` file found in ``<directory>/metadata`` (recursively) is
    checked for special and non-ASCII characters (findings go to *log*), has
    its special characters replaced by ``_`` and is written as one row with
    its path relative to the ``metadata`` folder.

    Only the real ``metadata`` folder is walked. A plain prefix test on the
    relative path would also pick up siblings such as ``metadata_old/`` or a
    top-level ``metadata-notes.xml`` (for which the row's filename would end
    up empty after dropping the first path component).

    Args:
        directory: Root of the transfer; must already contain ``metadata``.
        bagged: 'y' or 'n'; only forwarded to ``gen_metadata_files``.
        log: Path of the log file the character checks append to.
        makeMetaCSV: 'y' to continue with ``gen_metadata_files`` afterwards,
            anything else to go straight to the "process another?" prompt.
    """
    special_chars = r"[!@#$%^&*()=+[\]{};':\"<>?~\s`]"
    metadata_dir = directory+'/metadata'
    source_file = os.path.join(metadata_dir, "source-metadata.csv")
    with open(source_file, mode='w', newline='', encoding='utf-8') as file1:
        writer1 = csv.writer(file1)
        writer1.writerow(["filename","metadata","type"])
        for root, dirs, files in os.walk(metadata_dir, topdown=True):
            # Sort in place so nested folders are visited in a stable order.
            dirs.sort()
            files.sort()
            for file_name in files:
                if not file_name.endswith(".xml"):
                    continue
                full_path = os.path.join(root, file_name)
                # Relative to the transfer root, i.e. "metadata/...": this is
                # the form that shows up in the log messages.
                relative_path = os.path.relpath(full_path, directory)
                find_special_characters(relative_path, log)
                find_non_latin(relative_path, log)
                clean = re.sub(special_chars, "_", relative_path)
                # Drop the leading "metadata" component; always use "/".
                filename = "/".join(Path(clean).parts[1:])
                # NOTE(review): the header declares three columns but each
                # row carries two values (no "type") - confirm this is what
                # the consumer of source-metadata.csv expects.
                writer1.writerow(["objects",filename])
    print("source-metadata.csv created!")
    if makeMetaCSV == 'y':
        gen_metadata_files(directory, bagged, log)
    else:
        another(directory)
def gen_metadata_files(directory, bagged, log):
    """Write ``metadata/metadata.csv`` and ``metadata/rights.csv`` for a transfer.

    The tree under *directory* is walked in sorted order. Everything inside
    the top-level ``metadata`` folder is skipped.

    * Each directory is checked for special and non-ASCII characters
      (findings go to *log*) and written to ``metadata.csv`` with its
      special characters replaced by ``_``.
    * Each non-hidden file is checked for non-ASCII characters and written,
      unmodified, to both ``metadata.csv`` and ``rights.csv``; for bagged
      transfers the path is prefixed with ``data/``.

    The metadata folder is recognised by its first path component. A plain
    string-prefix test would also drop unrelated entries whose name merely
    begins with "metadata" (e.g. ``metadata_old/``).

    Args:
        directory: Root of the transfer; must already contain ``metadata``.
        bagged: 'y' when the transfer is bagged, otherwise 'n'.
        log: Path of the log file the character checks append to.
    """
    special_chars = r"[!@#$%^&*()=+[\]{};':\"<>?~\s`]"
    meta_file = os.path.join(directory+'/metadata', "metadata.csv")
    rights_file = os.path.join(directory+'/metadata', "rights.csv")
    with open(meta_file, mode='w', newline='', encoding='utf-8') as file1, \
            open(rights_file, mode='w', newline='', encoding='utf-8') as file2:
        writer1 = csv.writer(file1)
        writer1.writerow(["filename"])
        writer2 = csv.writer(file2)
        writer2.writerow([
            "file",
            "basis",
            "status",
            "jurisdiction",
            "determination_date",
            "start_date",
            "end_date",
            "grant_act",
            "grant_restriction",
        ])
        # Walk through the directory tree
        for root, dirs, files in os.walk(directory, topdown=True):
            # In-place sorts also fix the order in which os.walk descends.
            dirs.sort()
            files.sort()
            # Write directories
            for dir_name in dirs:
                full_path = os.path.join(root, dir_name)
                relative_path = os.path.relpath(full_path, directory)
                if Path(relative_path).parts[0] == "metadata":
                    continue
                find_special_characters(relative_path, log)
                find_non_latin(relative_path, log)
                clean_text = re.sub(special_chars, "_", relative_path)
                # NOTE(review): directory rows never get the "data/" prefix,
                # even for bagged transfers - confirm this is intended.
                writer1.writerow(["/".join(Path(clean_text).parts)])
            # Write files
            for file_name in files:
                full_path = os.path.join(root, file_name)
                parts = Path(os.path.relpath(full_path, directory)).parts
                # Skip the generated metadata folder and hidden files such
                # as .DS_Store.
                if parts[0] == "metadata" or file_name.startswith('.'):
                    continue
                filename = "/".join(parts)
                find_non_latin(filename, log)
                entry = 'data/'+filename if bagged == 'y' else filename
                writer1.writerow([entry])
                writer2.writerow([entry])
    print("metadata.csv created!")
    print("rights.csv created!")
    another(directory)
def another(directory):
    """Tidy up the log of the finished run and ask whether to process another directory.

    An empty ``<directory>/metadata/log.txt`` is deleted; a missing log is
    ignored instead of raising ``FileNotFoundError``. The user is then
    prompted until they answer 'y' (start over via ``main()``) or 'n'
    (print a goodbye and exit). Answers are compared case-insensitively and
    with surrounding whitespace removed.

    Args:
        directory: Root of the transfer that was just processed.

    Raises:
        SystemExit: When the user answers 'n'.
    """
    log_path = directory+'/metadata/log.txt'
    try:
        if os.path.getsize(log_path) == 0:
            os.remove(log_path)
    except FileNotFoundError:
        # No log was created during this run, so there is nothing to tidy.
        pass
    # No ValueError handler around input(): a closed stdin raises ValueError
    # on every call, and swallowing it here would make this loop spin forever.
    while True:
        answer = input("Would you like to process another directory? (y/n): ").strip().lower()
        if answer == 'y':
            # When main() returns, the loop asks again.
            main()
        elif answer == 'n':
            print("Okay bye!")
            sys.exit()
        else:
            print("You need to actually type either 'y' for yes or 'n' for no")
def _ask_yes_no(prompt):
    """Prompt until the answer is 'y' or 'n' (case-insensitive) and return that letter."""
    while True:
        answer = input(prompt).strip().lower()
        if answer in ('y', 'n'):
            return answer
        print("You need to actually type either 'y' for yes or 'n' for no")


def questions(directory):
    """Collect the options for one run interactively and start the CSV generation.

    Steps:
      1. Re-prompt for the directory until it contains an ``objects`` entry.
      2. Create ``metadata`` if missing; otherwise ask for confirmation
         before overwriting and exit when the user declines.
      3. Remove a ``log.txt`` left over from a previous run.
      4. If ``metadata`` contains ``.xml`` files, ask whether metadata.csv
         and rights.csv should be generated in addition to
         source-metadata.csv; without XML files they always are.
      5. When they are generated, ask whether the transfer is bagged.
      6. Call ``gen_source_files`` (XML present) or ``gen_metadata_files``.

    Every y/n question re-prompts on anything other than 'y' or 'n'. That
    includes the overwrite confirmation, so a typo there no longer aborts.

    Args:
        directory: Path entered by the user for the transfer root.

    Raises:
        SystemExit: When the user declines to overwrite existing metadata.
    """
    try:
        while not os.path.exists(directory+'/objects'):
            print("Your files need to be placed within a folder named 'objects' first.")
            directory = input("Please enter the directory path again: ").strip()

        # Prepare the metadata folder that receives the CSV files.
        metadata_dir = directory+'/metadata'
        if not os.path.exists(metadata_dir):
            os.mkdir(metadata_dir)
        elif _ask_yes_no("It looks like a metadata folder already exists. Your CSV files will get overwritten. Continue? (y/n): ") == 'n':
            print("Okay bye!")
            sys.exit()

        # Start every run with a fresh log.
        logfile = os.path.join(metadata_dir, "log.txt")
        if os.path.exists(logfile):
            os.remove(logfile)

        xmlExists = any(fname.endswith('.xml') for fname in os.listdir(metadata_dir))
        makeMetaCSV = 'y'
        if xmlExists:
            makeMetaCSV = _ask_yes_no("Looks like you're doing an XML Import. Generate source-metadata.csv, metadata.csv and rights.csv? (y/n): ")

        # The bagged question only matters when metadata.csv/rights.csv are written.
        bagged = 'n'
        if makeMetaCSV == 'y':
            bagged = _ask_yes_no("Is this a bagged transfer type (y/n): ")

        if xmlExists:
            gen_source_files(directory, bagged, logfile, makeMetaCSV)
        else:
            gen_metadata_files(directory, bagged, logfile)
    except FileNotFoundError:
        print("The directory path you provided does not exist.")
    except Exception as e:
        # Top-level boundary of one interactive run: report and return to the
        # caller. SystemExit is not an Exception, so it still propagates.
        print(f"An error occurred: {e}")
def main():
    """Print the banner, ask for the directory to process and hand it to ``questions``."""
    banner = """
__ ______ _______ ___ ___
.--------..---.-.| |--..-----.| || __|| | |
| || _ || < | -__|| ---||__ || | |
|__|__|__||___._||__|__||_____||______||_______| \\_____/
"""
    print(banner)
    # Surrounding whitespace is stripped from the entered path before use.
    questions(input("Please enter the directory path: ").strip())
# Script entry point: start the interactive session only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()