forked from digital-preservation/freud
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfreud.py
161 lines (124 loc) · 7.89 KB
/
freud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#Script to Analyse DROID reports - Freud
#This script analyses a DROID report to look for common issues which affect the ability to ingest into DRI. It can be run easily by running the batch file and then dragging in the file which you want to analyse.
#It then creates an Excel spreadsheet, in the directory which the program is running from, with a different worksheet for each issue highlighted.
import pandas as pd
import numpy as np
import os
# Load the DROID CSV export into pandas, keeping only the columns listed in
# columns_needed. The additional columns DROID emits for multiple
# identifications are not taken yet because they break the CSV read. A copy of
# the whitelist of formats accepted into DRI is loaded as well.
csvraw = input("Enter filepath of DROID csv to analyse: ")
# The entered path may be wrapped in double quotes (presumably from dragging
# the file onto the console); remove them along with any stray whitespace.
csvraw = csvraw.strip().strip('"')
columns_needed = [
    'ID', 'PARENT_ID', 'URI', 'FILE_PATH', 'NAME', 'METHOD', 'STATUS', 'SIZE',
    'TYPE', 'EXT', 'LAST_MODIFIED', 'EXTENSION_MISMATCH', 'SHA256_HASH',
    'FORMAT_COUNT', 'PUID', 'MIME_TYPE', 'FORMAT_NAME', 'FORMAT_VERSION',
]
csv = pd.read_csv(csvraw, usecols=columns_needed)
# The results workbook is named after the report file. Only a genuine ".csv"
# suffix is removed: str.rstrip('.csv') would strip any trailing run of the
# characters '.', 'c', 's' and 'v', turning e.g. "docs.csv" into "do".
droidname = os.path.basename(csvraw)
_stem, _suffix = os.path.splitext(droidname)
if _suffix.lower() == '.csv':
    droidname = _stem
results = pd.ExcelWriter(droidname + '_freudresults.xlsx', engine='xlsxwriter')
originalWhiteList = pd.read_csv('formats-whitelist.csv')
def unidentified():
    """Add the 'Unidentified_Formats' worksheet to the results workbook.

    Takes every row of the loaded DROID report whose FORMAT_COUNT is 0, sorts
    the selection by EXT and writes it starting on the second spreadsheet row.
    The first row is then filled with a bold, blue title banner.
    """
    sheet_name = 'Unidentified_Formats'
    no_format = csv['FORMAT_COUNT'] == 0
    rows = csv.loc[no_format].sort_values('EXT')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'UNIDENTIFIED FORMATS', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


unidentified()
def extension_only():
    """Add the 'Extension_Only_ID' worksheet to the results workbook.

    Takes every row of the loaded DROID report whose METHOD is "Extension",
    sorts the selection by PUID and writes it starting on the second
    spreadsheet row. The first row is then filled with a bold, blue title
    banner.
    """
    sheet_name = 'Extension_Only_ID'
    by_extension = csv['METHOD'] == "Extension"
    rows = csv.loc[by_extension].sort_values('PUID')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'EXTENSION ONLY IDENTIFICATION', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


extension_only()
def multiple():
    """Add the 'Multiple_ID' worksheet to the results workbook.

    Takes every row of the loaded DROID report whose FORMAT_COUNT is greater
    than 1, sorts the selection by EXT and writes it starting on the second
    spreadsheet row. The first row is then filled with a bold, blue title
    banner.
    """
    sheet_name = 'Multiple_ID'
    several_formats = csv['FORMAT_COUNT'] > 1
    rows = csv.loc[several_formats].sort_values('EXT')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write(
        'A1',
        'MULTIPLE IDENTIFICATION (Check original CSV for additional identifications)',
        banner,
    )
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


multiple()
def mismatch():
    """Add the 'Extension_Mismatch' worksheet to the results workbook.

    Takes every row of the loaded DROID report whose EXTENSION_MISMATCH value
    equals True, sorts the selection by PUID and writes it starting on the
    second spreadsheet row. The first row is then filled with a bold, blue
    title banner.
    """
    sheet_name = 'Extension_Mismatch'
    # Element-wise comparison against True is intentional here: it builds a
    # boolean mask over the column rather than testing a single value.
    flagged = csv['EXTENSION_MISMATCH'] == True
    rows = csv.loc[flagged].sort_values('PUID')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'EXTENSION MISMATCH', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


mismatch()
def container():
    """Add the 'Compressed_Container_Formats' worksheet to the results workbook.

    Takes every row of the loaded DROID report whose TYPE is 'Container',
    sorts the selection by PUID and writes it starting on the second
    spreadsheet row. The first row is then filled with a bold, blue title
    banner.
    """
    sheet_name = 'Compressed_Container_Formats'
    is_container = csv['TYPE'] == 'Container'
    rows = csv.loc[is_container].sort_values('PUID')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'COMPRESSED CONTAINER FORMATS', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


container()
def zerobyte():
    """Add the 'Zero_Byte_Files' worksheet to the results workbook.

    Takes every row of the loaded DROID report whose SIZE is 0, sorts the
    selection by EXT and writes it starting on the second spreadsheet row.
    The first row is then filled with a bold, blue title banner.
    """
    sheet_name = 'Zero_Byte_Files'
    is_empty = csv['SIZE'] == 0
    rows = csv.loc[is_empty].sort_values('EXT')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'ZERO BYTE FILES', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


zerobyte()
def duplicates():
    """Add the 'Duplicate_Files' worksheet to the results workbook.

    Considers only rows of the loaded DROID report whose TYPE is 'File' or
    'Container', keeps those whose SHA256_HASH value occurs more than once
    (every occurrence is kept, not just the repeats), sorts them by hash so
    matching files sit together, and writes them starting on the second
    spreadsheet row. The first row is then filled with a bold, blue title
    banner.
    """
    sheet_name = 'Duplicate_Files'
    candidates = csv.loc[csv['TYPE'].isin(['File', 'Container'])]
    # Rows without a hash cannot be compared. pandas treats missing values as
    # equal to one another in duplicated(), so leaving them in would report
    # every un-hashed file as a duplicate of every other un-hashed file.
    candidates = candidates.dropna(subset=['SHA256_HASH'])
    repeated = candidates['SHA256_HASH'].duplicated(keep=False)
    rows = candidates.loc[repeated, :].sort_values('SHA256_HASH')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'DUPLICATE FILES', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


duplicates()
def whiteListFormats():
    """Add the 'Formats_Not_On_White_List' worksheet to the results workbook.

    Takes every row of the loaded DROID report that has at least one
    identification (FORMAT_COUNT greater than 0) and whose PUID does not
    appear in the "label" column of the loaded whitelist, sorts the selection
    by PUID and writes it starting on the second spreadsheet row. The first
    row is then filled with a bold, blue title banner.
    """
    sheet_name = 'Formats_Not_On_White_List'
    accepted_puids = originalWhiteList["label"].values.tolist()
    identified = csv['FORMAT_COUNT'] > 0
    on_whitelist = csv['PUID'].isin(accepted_puids)
    rows = csv.loc[identified & ~on_whitelist].sort_values('PUID')
    # Register the banner style with the workbook before the data is written.
    banner = results.book.add_format({'bold': True, 'fg_color': '#4c9df7'})
    rows.to_excel(results, sheet_name=sheet_name, index=False, startcol=0, startrow=1)
    worksheet = results.sheets[sheet_name]
    worksheet.write('A1', 'FORMATS NOT ON WHITELIST', banner)
    # Blank cells carrying the same style extend the banner along the row.
    blank_cells = ['','','','','','','','','','','','','','','','','','','','','','','','','','','']
    worksheet.write_row('B1:X1', blank_cells, banner)


whiteListFormats()
# All worksheets have been added by the calls above; close the ExcelWriter to
# finish the workbook (in pandas, closing the writer is what saves the file).
results.close()