update dbuild files

PNNL-CompBio · Apr 16, 2024 · 05301bc · 05301bc
1 parent ecc8dda
commit 05301bc
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 50 deletions.
diff --git a/bmd2Samps_v3/buildv3database.R b/bmd2Samps_v3/buildv3database.R
@@ -197,8 +197,8 @@ removeChemIdDuplicates<-function(tab, chemIds,cols=c('AUC_Norm','End_Point_Name'
 #' @param comptoxfiles Files downloaded from the EPA website
 #' @return data.frame
 getChemMetadata<-function(desc_file='ChemicalDescriptions.xlsx',chemicalClasses,
-                          comptoxfiles=c('CompToxChemicalsDashboard-Batch-Search_2021-11-05_17_57_46.xlsx',
-                                         'CCD-Batch-Search_2022-01-26_10_28_30.xlsx')){    ##This mapping file consumes data from OSU to match identifiers to CAS
+                          comptoxfile=''){#c('CompToxChemicalsDashboard-Batch-Search_2021-11-05_17_57_46.xlsx',
+                                         #'CCD-Batch-Search_2022-01-26_10_28_30.xlsx')){    ##This mapping file consumes data from OSU to match identifiers to CAS
 
      ##we have curated descriptions for each chemical
    curatedDesc <- rio::import(file=desc_file,which=1)%>%
@@ -210,9 +210,9 @@ getChemMetadata<-function(desc_file='ChemicalDescriptions.xlsx',chemicalClasses,
     ##here we join the chemical metadata from the comptox dashboard
 
 
-    chemMeta<-do.call(rbind,lapply(comptoxfiles,function(x) rio::import(paste0(data.dir,'/',x))|>#,
+    chemMeta<-rio::import(comptoxfile)|>#,
                                                                               #sheet='Main Data')%>%
-                                      select(required_comptox_columns)))%>%
+                                      select(required_comptox_columns)%>%
         rename(cas_number='INPUT')%>%
         left_join(chemicalClasses)%>% ##add in chemical class
         left_join(curatedDesc)%>%
@@ -245,9 +245,9 @@ getChemMetadata<-function(desc_file='ChemicalDescriptions.xlsx',chemicalClasses,
 #' getEndpointMetadata
 #' @param data.dir directory where standard endpoint data is stored
 #' @return data.frame
-getEndpointMetadata<-function(data.dir){
+getEndpointMetadata<-function(epfile){
                                         #here is our pre-defined dictionary
-    endpointDetails<-rio::import(paste0(data.dir,'/SuperEndpoint%20Mapping%202021NOV04.xlsx'),which=4)|>#,
+    endpointDetails<-rio::import(epfile,which=4)#paste0(data.dir,'/SuperEndpoint%20Mapping%202021NOV04.xlsx'),which=4)|>#,
                                        #sheet='Dictionary')%>%
         #subset(`Portal Display`=='Display')%>%
         rename(End_Point='Abbreviation',`End_Point_Name`='Simple name (<20char)')%>%
@@ -304,27 +304,33 @@ getNewChemicalClass<-function(data.dir){
 #' buildSampleData - takes the curated information and selects the data we need
 #' @param data.dir
 #' @return data.frame
-buildSampleData<-function(data.dir,chemMeta,sampTab){
+buildSampleData<-function(fses_files,chemMeta,sampMapping){
     ##New data provided by michael
-    fses1<-subset(sampTab,name=='fses1')[['location']]
-    sampChem <- rio::import(fses1)|>#paste0(data.dir,'/fses/fses_data_for_pnnl_4-27-2021.csv'))%>%
+
+    sampChem<-do.call(rbind,lapply(fses_files,function(fs){
+
+#    fses1<-subset(sampTab,name=='fses1')[['location']]
+        sc <- rio::import(fs)|>#paste0(data.dir,'/fses/fses_data_for_pnnl_4-27-2021.csv'))%>%
                                         #  sampChem<-read.csv(paste0(data.dir,'/pnnl_bioassay_sample_query_1-14-2021.csv'))%>%
-        dplyr::select(required_sample_columns)%>%
-        subset(SampleNumber!='None')%>%
-        subset(cas_number!='NULL')%>%
-        mutate(water_concentration_molar=stringr::str_replace_all(water_concentration_molar,'BLOD|NULL|nc:BDL',"0"))%>%
-        mutate(measurement_value_molar=stringr::str_replace_all(measurement_value_molar,'BLOD|NULL|BDL',"0"))%>%
-        mutate(water_concentration=stringr::str_replace_all(water_concentration,'BLOD|NULL|BDL',"0"))%>%
+            dplyr::select(required_sample_columns)%>%
+            subset(SampleNumber!='None')%>%
+            subset(cas_number!='NULL')%>%
+            mutate(water_concentration_molar=stringr::str_replace_all(water_concentration_molar,'BLOD|NULL|nc:BDL',"0"))%>%
+            mutate(measurement_value_molar=stringr::str_replace_all(measurement_value_molar,'BLOD|NULL|BDL',"0"))%>%
+            mutate(water_concentration=stringr::str_replace_all(water_concentration,'BLOD|NULL|BDL',"0"))%>%
                                         # subset(water_concentration_molar!='0.0')%>%
-        subset(!measurement_value_molar%in%c('0'))%>%
-        subset(!measurement_value%in%c("0","NULL",""))#%>%
+            subset(!measurement_value_molar%in%c('0'))%>%
+            subset(!measurement_value%in%c("0","NULL",""))#%>%
 #        select(-c(Sample_ID))#,Chemical_ID)) ##These two are added in the 4/27 version of the file
 
     ##data added 1/19/2022
-    fses2<-subset(sampTab,name=='fses2')[['location']]
-    newSamp <- rio::import(fses2)|>#paste0(data.dir,'/fses/FSES_indoor_outdoor_study.xlsx'))%>%
-        dplyr::select(required_sample_columns)%>%
-      mutate(LocationLon=LocationLon*-1)
+        #fses2<-subset(sampTab,name=='fses2')[['location']]
+        #newSamp <- rio::import(fses2)|>#paste0(data.dir,'/fses/FSES_indoor_outdoor_study.xlsx'))%>%
+                                        #dplyr::select(required_sample_columns)
+###if there is negative
+        if(any(sc$LocationLon>0))
+            sc<-sc|>mutate(LocationLon=LocationLon*-1)
+    })   )
 
     chemDat<-chemMeta%>%
         select(Chemical_ID,cas_number,AVERAGE_MASS)%>%#,PREFERRED_NAME,chemDescription)%>%
@@ -334,7 +340,7 @@ buildSampleData<-function(data.dir,chemMeta,sampTab){
         rbind(newSamp)
 
     ##This mapipng file maps tanguay lab identifiers to those in the anderson lab
-    sampMap<-subset(sampTab,data_type=='mapping')[['location']]
+    #sampMap<-subset(sampTab,data_type=='mapping')[['location']]
     ids<-sampIdMasterTable(finalSampChem$SampleNumber,sampMap)
 
     finalSampChem <- finalSampChem %>%
@@ -878,19 +884,21 @@ generateSummaryStats<-function(){
   write.table(chem.eps,paste0(out.dir,'chemCounts_v2.tsv'),row.names=F,col.names=T,sep='\t')
 }
 
-#' main method
+#' main methodbm
 #' Parsers arguments
 main<-function(){
     ##now we check for additional files
     parser <- ArgumentParser()
-    parser$add_argument('-s','--sample',dest='is_sample',action='store_true',default=FALSE)
-    parser$add_argument('-c','--chemical',dest='is_chem',default=FALSE,action='store_True',
-                        help='The subsequent files are chemical')
-    parser$add_argument('-d','--drcStat',dest='dose_res_stat',default=c('bmd','dose','fit'),help='which file is it')
-    parser$add_argument('-p','--sampMap',dest='sample_mapping_file',default='')
-    parser$add_argument('-m','--chemMap',dest='chem_meta_file',default='')
-    parser$add_argument('-e','--epMap',dest='endpoint_mapping_file',default='')
-    parser$add_argument('-l','--chemClass',dest='chem_class_file',default='')
+    parser$add_argument('-s','--sample',dest='is_sample',action='store_true',default=FALSE,help='Flag to indicate file are for samples')
+    parser$add_argument('-c','--chemical',dest='is_chem',default=FALSE,action='store_true',
+                        help='Flag to indicate file is for chemicals')
+    parser$add_argument('-d','--drcFile',dest='dose_res_stat',default'',help='dose response curve file')
+    parser$add_argument('-p','--sampMap',dest='sample_mapping_file',default='',help='Sample mapping file location')
+    parser$add_argument('-m','--chemMap',dest='chem_meta_file',default='',help='Chemical metadata file location')
+    parser$add_argument('-e','--epMap',dest='endpoint_mapping_file',default='',help='Endpoint naming file location')
+    parser$add_argument('-l','--chemClass',dest='chem_class_file',default='',help='chemical class file location')
+    parser$add_argument('-x','--compToxFile',dest='comp_tox_mapping',default='',help='comptox data file location')
+    parser$add_argument('-f','--sampleFiles',dest='sample_files',default='',help='comma dlimited list of fses files to merge' )
 
 
     args <- parser$parse_args()
@@ -899,16 +907,16 @@ main<-function(){
 
     chemClass<-masvChemClass(args$chem_class_file)
 
-    chemMeta<-getChemMetadata(args$chem_meta_file,chemClass)
+    chemMeta<-getChemMetadata(args$chem_meta_file,chemClass,args$comp_tox_mapping)
 
-    sampChem<-buildSampleData(args$sample_mapping_file,chemMeta)
+    sampChem<-buildSampleData(args$fses_files,chemMeta,args$sample_mapping_file)
 
     endpointDetails<-getEndpointMetadata(args$endpoint_mapping_file)%>%unique()
 
     ##now, depending on what combination of data type and statistic, we can redo file
 
 
-    generateSummaryStats()
+    #generateSummaryStats()
 
 }
 

diff --git a/data/srp_build_files.csv b/data/srp_build_files.csv
@@ -33,7 +33,6 @@ fit8,fit,extract,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/mai
 gene1,expression,zebrafish,,3
 chemMap,mapping,chemical,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/chemicalIdMapping.csv,1
 sampMap,mapping,sample,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/sampleIdMapping.csv,1
-fses1,fses,sample,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/fses/fses_data_for_pnnl_4-27-2021.csv,1
-fses2,fses,sample,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/fses/FSES_indoor_outdoor_study.xlsx,1
 class1,classification,chemical,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/MASV%20Classifications%202021.xlsx,1
-
+compTox,mapping,chemical,https://github.com/PNNL-CompBio/srpAnalytics/blob/main/data/CCDBatch_2022-01-26_AND_2021_11_05.xlsx,1
+endpointMap,mapping,endpoint,https://github.com/PNNL-CompBio/srpAnalytics/blob/main/data/SuperEndpoint%20Mapping%202021NOV04.xlsx,1
diff --git a/v3_buildScript.py b/v3_buildScript.py
@@ -25,16 +25,6 @@ def fitCurveFiles(morpho_behavior_tuples):
     '''
 
 
-def endpointMetadata(endpointmappingfile):
-    '''
-    retrieve endpoint mapping
-    '''
-
-
-def chemicalClass(chem_class_mapping_file):
-    '''
-    get chemical classes
-    '''
 
 def combineFiles(location_list,ftype):
     '''
@@ -72,12 +62,32 @@ def main():
     ##get output in bmds, fits, and curves
 
     #add chemical BMDS, fits, curves to existing data
+    chemfiles=[]
+    sampfiles=[]
     for st in ['chemical','extract']:
         for dt in ['bmd','fit','dose']:
             fdf = combineFiles(df.loc[df.sample_type==st].loc[df.data_type==dt],dt)
             fname = 'tmp_'+st+'_'+dt+'.csv'
             fdf.to_csv(fname,index=False)
-
-            ##now map sample information
-
+            if st=='chemical':
+                chemfiles.append(fname)
+            else:
+                sampfiles.append(fname)
+
+    ##now map sample information
+    smap = df.loc[df.name=='sampMap'].location
+    cmap = df.loc[df.name=='chemMap'].location
+    cclass = df.loc[df.name=='class1'].location
+    emap = df.loc[df.name=='endpointMap'].location
+    fses = ','.join(df.loc[df.data_type=='sample'].location)
+    ctfile = df.loc[df.name=='colmpTox'].location
+
+    ##call script with sample files
+    cmd = "Rscript bmd2Samps_v3/buildV3database.R --sample --drcFile="+','.join(sampfiles)+\
+        ' --sampMap='+smap+' --chemMap='+cmap+' --epMap='+emap+' --chemClass='+cclass+\
+        ' --compToxFile='+ctfile+' --sampleFiles='+fses
+    print(cmd)
+    os.system(cmd)
+    ##call script with chemical files
+
 main()