forked from humanconnectome/NIHToolbox2NDA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrosswalkscratch.py
89 lines (77 loc) · 4.86 KB
/
crosswalkscratch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#program to turn list of unzipped folders of templates etc from the NDA into something that more closely resembles a crosswalk
import os
import pandas as pd
from openpyxl import load_workbook
#specify path to the folder that contains the unzipped folders from the NDA - end path in '/' or
# tell me how I can make this code robust to variations
#inout='/yourpath/NIH_toolbox_crosswalk_docs/'
inout='/home/petra/UbWinSharedSpace1/redcap2nda_Lifespan2019/NIH_toolbox_crosswalk_docs/HCPD/'
#open one of the folders that the NDA sent and find the file that contains the list of vars in
#your file and the list of vars in the NDA as one-to-one pairs
#for example, we have files with the column headers "NDA Element" and "HCP-D Element"
#we also have files with the column headers "NDA Element" and "HCP-A Element"
#provide the name of local variable:
localvar="HCP-D Element"
# get list of files and folders in the inout directory
# The string containing the name of the new folders from the NDA will be assigned to
# variable called 'Instrument_Short' because after filtering out all the other
# files/folders (list1 and list2 below) thats what you'll have:
# the NDAs shorthand name for the NIH Toolbox Instrument
dirs=pd.DataFrame(os.listdir(inout),columns=['Instrument_Short'])
#initialize an empty crosswalk
crosswalk_meta=pd.DataFrame(columns=['Inst','Instrument_Short','key','template','requestfile','varmapfile'])
#create two lists of for file extensions and/or folders in inout that you don't want to be treated
# as something to be added to a crosswalk
list1=['zip','csv','xlsx','pptx'] #file extensions in this folder of folders from Leo that you want to ignore
list2=['drafts','HCPD','temp','added_tocrosswalk','dummypass','almost trash','prepped_structures'] #identify folders you want to ignore
# for the items not in list1 or list2, read the folder contents and turn them into something appendable/mergeable
# four possible files:
# Mapping Key has the Full Name of the NDA structure to which this stuff will be mapped, the short name,
# and the name on the Form as we will be uploading it (Inst)
# One file in this folder has an Instrument_short.structure name format
# Occasionally you'll get a formRequest file which will contains NDA verbal instructions that
# ultimately need to be translated into executable python code (more on this later).
# The template file is not used in the crosswalk, but may come in handy for debugging (i.e. if someone
# on either end of this process accidentally assigned the wrong aliases).
for i in dirs.Instrument_Short:
for j in list1:
if j in i:
print('skipping file '+i)
i='dummypass'
if i in list2:
if i not in 'dummypass':
print('skipping folder '+i)
else:
requestfile='' #need to initialize this particular var becaus some folders dont have requests
cw = pd.DataFrame(columns=['Instrument_Short', 'key', 'template', 'requestfile', 'varmapfile'])
haystack=os.listdir(inout+i)
for j, element in enumerate(haystack):
if "Key" in element:
key=element
elif "template" in element:
template=element
elif "01.xlsx" in element:
varmapfile=element
elif 'equest' in element:
requestfile=element
cw.loc[0, ['Instrument_Short', 'key','template','varmapfile','requestfile']] = [i, key,template,varmapfile,requestfile]
wb = load_workbook(inout + cw.Instrument_Short[0] + '/' + cw.varmapfile[0])
ws = wb.active
df = pd.DataFrame(ws.values)
df.columns = [df.iloc[0, 0], df.iloc[0, 1]]
df = df.loc[~(df['NDA Element'] == 'NDA Element')].copy()
df = df.loc[~(df[localvar] == 'PIN')].copy()
df['Instrument_Short'] = cw.Instrument_Short[0]
cw2 = pd.merge(cw, df, how='outer', on='Instrument_Short')
# pull the instrument name from the mapping key and merge it with the crosswalk_meta
wb2 = load_workbook(inout + cw.Instrument_Short[0] + '/' + cw.key[0])
ws2 = wb2.active
df2 = pd.DataFrame(ws2.values)
cw2['Inst'] = df2.iloc[df2.shape[0] - 1, 2] # if Leo changes the format of his mapping file, this will break...
# right now, it is grabbing the 3rd column of the last row of the Mapping File, which ccontains the instrument name
crosswalk_meta = pd.concat([crosswalk_meta, cw2], axis=0,sort=True)
#lastly grab the name of the structure from the varmapfile
crosswalk_meta['structure']=crosswalk_meta.varmapfile.str.split('.').str[-2]
# output this information to a csv in the inout directory so you can easily append it to
# the full existing/curated Crosswalkfile that you tweaked for your purposes (by renaming localvar, for example)
crosswalk_meta[['Inst','template','Instrument_Short','structure','NDA Element',localvar,'requestfile']].to_csv(inout+'crosswalk_part_'+localvar+'.csv',index=False)