Skip to content

Commit

Permalink
update dbuild files
Browse files Browse the repository at this point in the history
  • Loading branch information
sgosline committed Apr 16, 2024
1 parent ecc8dda commit 05301bc
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 50 deletions.
76 changes: 42 additions & 34 deletions bmd2Samps_v3/buildv3database.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,8 @@ removeChemIdDuplicates<-function(tab, chemIds,cols=c('AUC_Norm','End_Point_Name'
#' @param comptoxfiles Files downloaded from the EPA website
#' @return data.frame
getChemMetadata<-function(desc_file='ChemicalDescriptions.xlsx',chemicalClasses,
comptoxfiles=c('CompToxChemicalsDashboard-Batch-Search_2021-11-05_17_57_46.xlsx',
'CCD-Batch-Search_2022-01-26_10_28_30.xlsx')){ ##This mapping file consumes data from OSU to match identifiers to CAS
comptoxfile=''){#c('CompToxChemicalsDashboard-Batch-Search_2021-11-05_17_57_46.xlsx',
#'CCD-Batch-Search_2022-01-26_10_28_30.xlsx')){ ##This mapping file consumes data from OSU to match identifiers to CAS

##we have curated descriptions for each chemical
curatedDesc <- rio::import(file=desc_file,which=1)%>%
Expand All @@ -210,9 +210,9 @@ getChemMetadata<-function(desc_file='ChemicalDescriptions.xlsx',chemicalClasses,
##here we join the chemical metadata from the comptox dashboard


chemMeta<-do.call(rbind,lapply(comptoxfiles,function(x) rio::import(paste0(data.dir,'/',x))|>#,
chemMeta<-rio::import(comptoxfile)|>#,
#sheet='Main Data')%>%
select(required_comptox_columns)))%>%
select(required_comptox_columns)%>%
rename(cas_number='INPUT')%>%
left_join(chemicalClasses)%>% ##add in chemical class
left_join(curatedDesc)%>%
Expand Down Expand Up @@ -245,9 +245,9 @@ getChemMetadata<-function(desc_file='ChemicalDescriptions.xlsx',chemicalClasses,
#' getEndpointMetadata
#' @param data.dir directory where standard endpoint data is stored
#' @return data.frame
getEndpointMetadata<-function(data.dir){
getEndpointMetadata<-function(epfile){
#here is our pre-defined dictionary
endpointDetails<-rio::import(paste0(data.dir,'/SuperEndpoint%20Mapping%202021NOV04.xlsx'),which=4)|>#,
endpointDetails<-rio::import(epfile,which=4)#paste0(data.dir,'/SuperEndpoint%20Mapping%202021NOV04.xlsx'),which=4)|>#,
#sheet='Dictionary')%>%
#subset(`Portal Display`=='Display')%>%
rename(End_Point='Abbreviation',`End_Point_Name`='Simple name (<20char)')%>%
Expand Down Expand Up @@ -304,27 +304,33 @@ getNewChemicalClass<-function(data.dir){
#' buildSampleData - takes the curated information and selects the data we need
#' @param data.dir
#' @return data.frame
buildSampleData<-function(data.dir,chemMeta,sampTab){
buildSampleData<-function(fses_files,chemMeta,sampMapping){
##New data provided by michael
fses1<-subset(sampTab,name=='fses1')[['location']]
sampChem <- rio::import(fses1)|>#paste0(data.dir,'/fses/fses_data_for_pnnl_4-27-2021.csv'))%>%

sampChem<-do.call(rbind,lapply(fses_files,function(fs){

# fses1<-subset(sampTab,name=='fses1')[['location']]
sc <- rio::import(fs)|>#paste0(data.dir,'/fses/fses_data_for_pnnl_4-27-2021.csv'))%>%
# sampChem<-read.csv(paste0(data.dir,'/pnnl_bioassay_sample_query_1-14-2021.csv'))%>%
dplyr::select(required_sample_columns)%>%
subset(SampleNumber!='None')%>%
subset(cas_number!='NULL')%>%
mutate(water_concentration_molar=stringr::str_replace_all(water_concentration_molar,'BLOD|NULL|nc:BDL',"0"))%>%
mutate(measurement_value_molar=stringr::str_replace_all(measurement_value_molar,'BLOD|NULL|BDL',"0"))%>%
mutate(water_concentration=stringr::str_replace_all(water_concentration,'BLOD|NULL|BDL',"0"))%>%
dplyr::select(required_sample_columns)%>%
subset(SampleNumber!='None')%>%
subset(cas_number!='NULL')%>%
mutate(water_concentration_molar=stringr::str_replace_all(water_concentration_molar,'BLOD|NULL|nc:BDL',"0"))%>%
mutate(measurement_value_molar=stringr::str_replace_all(measurement_value_molar,'BLOD|NULL|BDL',"0"))%>%
mutate(water_concentration=stringr::str_replace_all(water_concentration,'BLOD|NULL|BDL',"0"))%>%
# subset(water_concentration_molar!='0.0')%>%
subset(!measurement_value_molar%in%c('0'))%>%
subset(!measurement_value%in%c("0","NULL",""))#%>%
subset(!measurement_value_molar%in%c('0'))%>%
subset(!measurement_value%in%c("0","NULL",""))#%>%
# select(-c(Sample_ID))#,Chemical_ID)) ##These two are added in the 4/27 version of the file

##data added 1/19/2022
fses2<-subset(sampTab,name=='fses2')[['location']]
newSamp <- rio::import(fses2)|>#paste0(data.dir,'/fses/FSES_indoor_outdoor_study.xlsx'))%>%
dplyr::select(required_sample_columns)%>%
mutate(LocationLon=LocationLon*-1)
#fses2<-subset(sampTab,name=='fses2')[['location']]
#newSamp <- rio::import(fses2)|>#paste0(data.dir,'/fses/FSES_indoor_outdoor_study.xlsx'))%>%
#dplyr::select(required_sample_columns)
###if there is negative
if(any(sc$LocationLon>0))
sc<-sc|>mutate(LocationLon=LocationLon*-1)
}) )

chemDat<-chemMeta%>%
select(Chemical_ID,cas_number,AVERAGE_MASS)%>%#,PREFERRED_NAME,chemDescription)%>%
Expand All @@ -334,7 +340,7 @@ buildSampleData<-function(data.dir,chemMeta,sampTab){
rbind(newSamp)

##This mapipng file maps tanguay lab identifiers to those in the anderson lab
sampMap<-subset(sampTab,data_type=='mapping')[['location']]
#sampMap<-subset(sampTab,data_type=='mapping')[['location']]
ids<-sampIdMasterTable(finalSampChem$SampleNumber,sampMap)

finalSampChem <- finalSampChem %>%
Expand Down Expand Up @@ -878,19 +884,21 @@ generateSummaryStats<-function(){
write.table(chem.eps,paste0(out.dir,'chemCounts_v2.tsv'),row.names=F,col.names=T,sep='\t')
}

#' main method
#' main methodbm
#' Parsers arguments
main<-function(){
##now we check for additional files
parser <- ArgumentParser()
parser$add_argument('-s','--sample',dest='is_sample',action='store_true',default=FALSE)
parser$add_argument('-c','--chemical',dest='is_chem',default=FALSE,action='store_True',
help='The subsequent files are chemical')
parser$add_argument('-d','--drcStat',dest='dose_res_stat',default=c('bmd','dose','fit'),help='which file is it')
parser$add_argument('-p','--sampMap',dest='sample_mapping_file',default='')
parser$add_argument('-m','--chemMap',dest='chem_meta_file',default='')
parser$add_argument('-e','--epMap',dest='endpoint_mapping_file',default='')
parser$add_argument('-l','--chemClass',dest='chem_class_file',default='')
parser$add_argument('-s','--sample',dest='is_sample',action='store_true',default=FALSE,help='Flag to indicate file are for samples')
parser$add_argument('-c','--chemical',dest='is_chem',default=FALSE,action='store_true',
help='Flag to indicate file is for chemicals')
parser$add_argument('-d','--drcFile',dest='dose_res_stat',default'',help='dose response curve file')
parser$add_argument('-p','--sampMap',dest='sample_mapping_file',default='',help='Sample mapping file location')
parser$add_argument('-m','--chemMap',dest='chem_meta_file',default='',help='Chemical metadata file location')
parser$add_argument('-e','--epMap',dest='endpoint_mapping_file',default='',help='Endpoint naming file location')
parser$add_argument('-l','--chemClass',dest='chem_class_file',default='',help='chemical class file location')
parser$add_argument('-x','--compToxFile',dest='comp_tox_mapping',default='',help='comptox data file location')
parser$add_argument('-f','--sampleFiles',dest='sample_files',default='',help='comma dlimited list of fses files to merge' )


args <- parser$parse_args()
Expand All @@ -899,16 +907,16 @@ main<-function(){

chemClass<-masvChemClass(args$chem_class_file)

chemMeta<-getChemMetadata(args$chem_meta_file,chemClass)
chemMeta<-getChemMetadata(args$chem_meta_file,chemClass,args$comp_tox_mapping)

sampChem<-buildSampleData(args$sample_mapping_file,chemMeta)
sampChem<-buildSampleData(args$fses_files,chemMeta,args$sample_mapping_file)

endpointDetails<-getEndpointMetadata(args$endpoint_mapping_file)%>%unique()

##now, depending on what combination of data type and statistic, we can redo file


generateSummaryStats()
#generateSummaryStats()

}

Expand Down
5 changes: 2 additions & 3 deletions data/srp_build_files.csv
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ fit8,fit,extract,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/mai
gene1,expression,zebrafish,,3
chemMap,mapping,chemical,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/chemicalIdMapping.csv,1
sampMap,mapping,sample,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/sampleIdMapping.csv,1
fses1,fses,sample,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/fses/fses_data_for_pnnl_4-27-2021.csv,1
fses2,fses,sample,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/fses/FSES_indoor_outdoor_study.xlsx,1
class1,classification,chemical,https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data/MASV%20Classifications%202021.xlsx,1

compTox,mapping,chemical,https://github.com/PNNL-CompBio/srpAnalytics/blob/main/data/CCDBatch_2022-01-26_AND_2021_11_05.xlsx,1
endpointMap,mapping,endpoint,https://github.com/PNNL-CompBio/srpAnalytics/blob/main/data/SuperEndpoint%20Mapping%202021NOV04.xlsx,1
36 changes: 23 additions & 13 deletions v3_buildScript.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,6 @@ def fitCurveFiles(morpho_behavior_tuples):
'''


def endpointMetadata(endpointmappingfile):
'''
retrieve endpoint mapping
'''


def chemicalClass(chem_class_mapping_file):
'''
get chemical classes
'''

def combineFiles(location_list,ftype):
'''
Expand Down Expand Up @@ -72,12 +62,32 @@ def main():
##get output in bmds, fits, and curves

#add chemical BMDS, fits, curves to existing data
chemfiles=[]
sampfiles=[]
for st in ['chemical','extract']:
for dt in ['bmd','fit','dose']:
fdf = combineFiles(df.loc[df.sample_type==st].loc[df.data_type==dt],dt)
fname = 'tmp_'+st+'_'+dt+'.csv'
fdf.to_csv(fname,index=False)

##now map sample information

if st=='chemical':
chemfiles.append(fname)
else:
sampfiles.append(fname)

##now map sample information
smap = df.loc[df.name=='sampMap'].location
cmap = df.loc[df.name=='chemMap'].location
cclass = df.loc[df.name=='class1'].location
emap = df.loc[df.name=='endpointMap'].location
fses = ','.join(df.loc[df.data_type=='sample'].location)
ctfile = df.loc[df.name=='colmpTox'].location

##call script with sample files
cmd = "Rscript bmd2Samps_v3/buildV3database.R --sample --drcFile="+','.join(sampfiles)+\
' --sampMap='+smap+' --chemMap='+cmap+' --epMap='+emap+' --chemClass='+cclass+\
' --compToxFile='+ctfile+' --sampleFiles='+fses
print(cmd)
os.system(cmd)
##call script with chemical files

main()

0 comments on commit 05301bc

Please sign in to comment.