bundle update re-engineering

sigven · Feb 27, 2024 · 37a5ef9 · 37a5ef9
1 parent 2d83111
commit 37a5ef9
Show file tree

Hide file tree

Showing 90 changed files with 1,984 additions and 6,534 deletions.
diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py
@@ -9,8 +9,8 @@ def check_args(arg_dict):
 
     logger = getlogger("pcgr-validate-arguments-input-a")
     # Check the existence of required arguments
-    if arg_dict['pcgr_dir'] is None or not os.path.isdir(arg_dict['pcgr_dir']):
-        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
+    if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']):
+        err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})."
         error_message(err_msg, logger)
 
     if arg_dict['genome_assembly'] is None:
@@ -347,28 +347,28 @@ def verify_input_files(arg_dict):
             os.path.abspath(arg_dict["cpsr_report"]))
 
     # check the existence of base folder
-    base_dir = os.path.abspath(arg_dict["pcgr_dir"])
+    base_dir = os.path.abspath(arg_dict["refdata_dir"])
     if not os.path.isdir(base_dir):
         err_msg = "Base directory (" + str(base_dir) + ") does not exist"
         error_message(err_msg, logger)
 
     # check the existence of data folder within the base folder
-    db_dir = os.path.join(os.path.abspath(arg_dict["pcgr_dir"]), "data")
+    db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data")
     if not os.path.isdir(db_dir):
         err_msg = "Data directory (" + str(db_dir) + ") does not exist"
         error_message(err_msg, logger)
 
     # check the existence of specified assembly data folder within the base folder
     db_assembly_dir = os.path.join(os.path.abspath(
-        arg_dict["pcgr_dir"]), "data", arg_dict["genome_assembly"])
+        arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"])
     if not os.path.isdir(db_assembly_dir):
         err_msg = "Data directory for the specified genome assembly (" + str(
             db_assembly_dir) + ") does not exist"
         error_message(err_msg, logger)
 
     # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0)
     rel_notes_file = os.path.join(os.path.abspath(
-        arg_dict["pcgr_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
+        arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
     if not os.path.exists(rel_notes_file):
         err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)"
         error_message(err_msg, logger)
@@ -393,8 +393,8 @@ def verify_input_files(arg_dict):
       "input_cpsr_report_dir": input_cpsr_report_dir,
       "input_cna_plot_dir": input_cna_plot_dir,
       "panel_normal_vcf_dir": panel_normal_vcf_dir,
-      "db_dir": db_assembly_dir,
-      "base_dir": base_dir,
+      "db_assembly_dir": db_assembly_dir,
+      "refdata_dir": base_dir,
       "output_dir": output_dir_full,
       "output_vcf": output_vcf,
       "output_cna": output_cna,
@@ -421,8 +421,8 @@ def check_args_cpsr(arg_dict):
         err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
         error_message(err_msg,logger)
     ## Check that PCGR directory (with data bundle) is provided and exists
-    if arg_dict['pcgr_dir'] is None or not os.path.isdir(arg_dict['pcgr_dir']):
-        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
+    if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']):
+        err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})."
         error_message(err_msg,logger)
     ## Check that genome assembly is set
     if arg_dict['genome_assembly'] is None:
@@ -576,26 +576,26 @@ def verify_input_files_cpsr(arg_dict):
             error_message(err_msg,logger)
 
     ## check the existence of base folder
-    base_dir = os.path.abspath(arg_dict['pcgr_dir'])
+    base_dir = os.path.abspath(arg_dict['refdata_dir'])
     if not os.path.isdir(base_dir):
         err_msg = f"Base directory ({base_dir}) does not exist"
         error_message(err_msg,logger)
 
     ## check the existence of data folder within the base folder
-    db_dir = os.path.join(os.path.abspath(arg_dict['pcgr_dir']), 'data')
+    db_dir = os.path.join(os.path.abspath(arg_dict['refdata_dir']), 'data')
     if not os.path.isdir(db_dir):
         err_msg = f"Data directory ({db_dir}) does not exist"
         error_message(err_msg,logger)
 
     ## check the existence of specified assembly data folder within the base folder
-    db_assembly_dir = os.path.join(os.path.abspath(arg_dict['pcgr_dir']), 'data', arg_dict['genome_assembly'])
+    db_assembly_dir = os.path.join(os.path.abspath(arg_dict['refdata_dir']), 'data', arg_dict['genome_assembly'])
     if not os.path.isdir(db_assembly_dir):
         err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
         error_message(err_msg,logger)
 
     ## check the existence of RELEASE_NOTES
     rel_notes_file = os.path.join(os.path.abspath(
-        arg_dict["pcgr_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
+        arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
     if not os.path.exists(rel_notes_file):
         err_msg = 'The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/cpsr for instructions)'
         error_message(err_msg,logger)
@@ -617,8 +617,8 @@ def verify_input_files_cpsr(arg_dict):
     cpsr_paths = {
             "input_vcf_dir": input_vcf_dir,
             "input_customlist_dir": input_customlist_dir,
-            "db_dir": db_assembly_dir,
-            "base_dir": base_dir,
+            "db_assembly_dir": db_assembly_dir,
+            "refdata_dir": base_dir,
             "output_dir": output_dir_full,
             "output_vcf": output_vcf,
             "input_vcf_basename": input_vcf_basename,

diff --git a/pcgr/cna.py b/pcgr/cna.py
@@ -16,7 +16,7 @@
 def annotate_cna_segments(output_fname: str, 
                           output_dir: str, 
                           cna_segment_file: str, 
-                          pcgr_build_db_dir: str, 
+                          db_assembly_dir: str, 
                           build: str, 
                           sample_id: str,
                           n_copy_amplifications: int = 5,
@@ -28,7 +28,7 @@ def annotate_cna_segments(output_fname: str,
         output_fname (str): File name of the annotated output file.
         output_dir (str): Directory to save the annotated file.
         cna_segment_file (str): Path to the user-provided copy number aberrations segment file.
-        pcgr_build_db_dir (str): Path to the build-specific PCGR database directory.
+        db_assembly_dir (str): Path to the build-specific PCGR database directory.
         build (str): Genome assembly build of input data.
         sample_id( (str): Sample identifier
         output_dir (str): Directory to save the annotated file.
@@ -81,7 +81,7 @@ def annotate_cna_segments(output_fname: str,
 
     ## Check that 'End' of segments do not exceed chromosome lengths
     chromsizes_fname = \
-        os.path.join(pcgr_build_db_dir, 'chromsize.' + build + '.tsv')
+        os.path.join(db_assembly_dir, 'chromsize.' + build + '.tsv')
 
     check_file_exists(chromsizes_fname, logger)
     chromsizes = pd.read_csv(chromsizes_fname, sep="\t", header=None, names=['Chromosome', 'ChromLength'])
@@ -105,14 +105,14 @@ def annotate_cna_segments(output_fname: str,
     temp_files.append(cna_query_segment_bed.fn)
 
     ## annotate segments with cytobands
-    cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, pcgr_build_db_dir, logger)
+    cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, db_assembly_dir, logger)
 
     ## annotate with protein-coding transcripts
     cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df)
     temp_files.append(cna_query_segment_bed.fn)
 
     cna_query_segment_df = annotate_transcripts(
-       cna_query_segment_bed, output_dir, pcgr_build_db_dir, overlap_fraction=overlap_fraction, logger=logger)
+       cna_query_segment_bed, output_dir, db_assembly_dir, overlap_fraction=overlap_fraction, logger=logger)
 
     cna_query_segment_df['segment_length_mb'] = \
         ((cna_query_segment_df['segment_end'] - cna_query_segment_df['segment_start']) / 1e6).astype(float).round(5)
@@ -123,8 +123,8 @@ def annotate_cna_segments(output_fname: str,
     cna_actionable_dict = {}
 
     for db in ['cgi','civic']:
-        variant_fname = os.path.join(pcgr_build_db_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz")
-        clinical_fname = os.path.join(pcgr_build_db_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz")
+        variant_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz")
+        clinical_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz")
         logger.info(f"Loading copy-number biomarker evidence from {db} ..")
         biomarkers[db] = load_biomarkers(
             logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA')
@@ -193,14 +193,14 @@ def annotate_cna_segments(output_fname: str,
     return 0
 
 
-def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_dir: str, logger: logging.Logger) -> pd.DataFrame:
+def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame:
 
     pybedtools.set_tempdir(output_dir)    
     temp_files = []
 
     # BED file with cytoband annotations
     cytoband_bed_fname = \
-        os.path.join(pcgr_build_db_dir, 'misc','bed','cytoband', 'cytoband.bed.gz')
+        os.path.join(db_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz')
 
     cytoband_annotated_segments = pd.DataFrame()
 
@@ -263,7 +263,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_d
 
 
 def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
-                         pcgr_build_db_dir: str, overlap_fraction: float,
+                         db_assembly_dir: str, overlap_fraction: float,
                          logger: logging.Logger) -> pd.DataFrame:
 
     """
@@ -272,7 +272,7 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
     Parameters:
     - cna_segments_bt: A BedTool object representing the CNA segments.
     - output_dir: A string specifying the output directory (for temporary files generation).
-    - pcgr_build_db_dir: A string specifying the directory of the build-specific PCGR data bundle.
+    - db_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle.
     - overlap_fraction: A float representing the fraction of overlap required for annotation.
     
     Returns:
@@ -284,9 +284,9 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
 
     # BED file with protein-coding transcripts
     gene_transcript_bed_fname = \
-        os.path.join(pcgr_build_db_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz')
+        os.path.join(db_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz')
     gene_xref_tsv_fname = \
-        os.path.join(pcgr_build_db_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz")
+        os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz")
 
     cna_segments_annotated = pd.DataFrame()
 

diff --git a/pcgr/config.py b/pcgr/config.py
@@ -90,14 +90,16 @@ def create_config(arg_dict, workflow = "PCGR"):
             'exclude_pon': int(arg_dict['exclude_pon']),
             'exclude_likely_hom_germline': int(arg_dict['exclude_likely_hom_germline']),
             'exclude_likely_het_germline': int(arg_dict['exclude_likely_het_germline']),
+            'exclude_clinvar_germline': int(arg_dict['exclude_clinvar_germline']),
             'exclude_dbsnp_nonsomatic': int(arg_dict['exclude_dbsnp_nonsomatic']),
             'exclude_nonexonic': int(arg_dict['exclude_nonexonic'])
         }
         conf_options['somatic_snv']['msi'] = {
             'run': int(arg_dict['estimate_msi'])
         }
         conf_options['somatic_snv']['tmb'] = {
-            'run': int(arg_dict['estimate_tmb']),            
+            'run': int(arg_dict['estimate_tmb']), 
+            'tmb_display': arg_dict['tmb_display'],           
             'tmb_dp_min': arg_dict['tmb_dp_min'],
             'tmb_af_min': arg_dict['tmb_af_min']
         }
@@ -154,15 +156,21 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
     conf_data['software']['pcgr_version'] = pcgr_vars.PCGR_VERSION
     conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION
 
-    ## add paths to annotated files (VCF/TSV, CNA)
+    ## add paths to annotated files (VCF/TSV, CNA, TMB)
     conf_data['molecular_data'] = {}
     conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
     conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']
+
     conf_data['molecular_data']['fname_cna_tsv'] = "None"
     if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
         conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
         del conf_options['annotated_cna']
 
+    conf_data['molecular_data']['fname_tmb'] = "None"
+    if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
+        conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
+        del conf_options['fname_tmb']
+
     genome_assembly = conf_options['genome_assembly']
     del conf_options['sample_id']
     del conf_options['genome_assembly']

diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py
@@ -22,7 +22,7 @@ def get_args():
 
     program_description = "Cancer Predisposition Sequencing Reporter - report of " + \
        "clinically significant cancer-predisposing germline variants"
-    program_options = " --input_vcf <INPUT_VCF> --pcgr_dir <PCGR_DIR> --output_dir <OUTPUT_DIR> --genome_assembly " + \
+    program_options = " --input_vcf <INPUT_VCF> --refdata_dir <REFDATA_DIR> --output_dir <OUTPUT_DIR> --genome_assembly " + \
        " <GENOME_ASSEMBLY> --sample_id <SAMPLE_ID>"
 
     parser = argparse.ArgumentParser(description = program_description,
@@ -69,7 +69,7 @@ def get_args():
     optional_vep.add_argument('--vep_no_intergenic', action = "store_true", help="Skip intergenic variants during processing (option '--no_intergenic' in VEP), default: %(default)s")
 
     required.add_argument('--input_vcf', help='VCF input file with germline query variants (SNVs/InDels).', required = True)
-    required.add_argument('--pcgr_dir',help=f"Directory that contains the PCGR data bundle directory, e.g. ~/pcgr-{pcgr_vars.PCGR_VERSION}", required = True)
+    required.add_argument('--refdata_dir',help=f"Directory that contains the PCGR/CPSR reference data, e.g. ~/pcgr-data-{pcgr_vars.PCGR_VERSION}", required = True)
     required.add_argument('--output_dir',help='Output directory', required = True)
     required.add_argument('--genome_assembly',choices = ['grch37','grch38'], help='Genome assembly build: grch37 or grch38', required = True)
     required.add_argument('--sample_id',help="Sample identifier - prefix for output files", required = True)
@@ -124,7 +124,8 @@ def run_cpsr(conf_options, cpsr_paths):
             warn_message(warn_msg, logger)
 
     if not input_vcf == 'None':
-        data_dir = cpsr_paths['base_dir']
+        refdata_dir = cpsr_paths['refdata_dir']
+        db_assembly_dir = cpsr_paths['db_assembly_dir']
         output_dir = cpsr_paths['output_dir']
 
         check_subprocess(logger, f'mkdir -p {output_dir}', debug)
@@ -147,7 +148,7 @@ def run_cpsr(conf_options, cpsr_paths):
         ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci
         vcf_validate_command = (
                 f'cpsr_validate_input.py '
-                f'{data_dir} '
+                f'{refdata_dir} '
                 f'{input_vcf} '
                 f'{input_vcf_validated_uncompr} '
                 f'{input_customlist} '
@@ -192,7 +193,7 @@ def run_cpsr(conf_options, cpsr_paths):
             conf_options['gene_panel']['custom_list_bed'] = output_custom_bed
 
         ## Write YAML configuration file  - settings, path to files, reference bundle etc
-        yaml_data = populate_config_data(conf_options, data_dir, workflow = "CPSR", logger = logger)
+        yaml_data = populate_config_data(conf_options, refdata_dir, workflow = "CPSR", logger = logger)
         with open(yaml_fname, "w") as outfile:
             outfile.write(yaml.dump(yaml_data))
         outfile.close()
@@ -233,7 +234,8 @@ def run_cpsr(conf_options, cpsr_paths):
                 f'{"--debug" if debug else ""} '
                 f"--dbmts --gerp --tcga --gnomad_non_cancer --gene_transcript_xref "
                 f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} "
-                f"{os.path.join(data_dir, 'data', str(yaml_data['genome_assembly']))}"
+                f"{db_assembly_dir}"
+                #f"{os.path.join(refdata_dir, 'data', str(yaml_data['genome_assembly']))}"
                 )
         check_subprocess(logger, pcgr_vcfanno_command, debug)
         logger.info("Finished cpsr-vcfanno")
@@ -245,7 +247,7 @@ def run_cpsr(conf_options, cpsr_paths):
                 f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} 0 '
                 f'{yaml_data["conf"]["vep"]["vep_regulatory"]} 0 '
                 f'Any {yaml_data["conf"]["vep"]["vep_pick_order"]} '
-                f'{cpsr_paths["db_dir"]} --compress_output_vcf '
+                f'{cpsr_paths["db_assembly_dir"]} --compress_output_vcf '
                 f'--cpsr_yaml {yaml_fname} '
                 f'--cpsr {"--debug" if debug else ""}'
                 )
@@ -278,7 +280,7 @@ def run_cpsr(conf_options, cpsr_paths):
         logger.info("Appending ClinVar traits, official gene names, and protein domain annotations")        
         variant_set = \
            variant.append_annotations(
-              output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger)
+              output_pass_vcf2tsv_gz, db_assembly_dir = cpsr_paths["db_assembly_dir"], logger = logger)
         variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger)
         if {'GENOTYPE'}.issubset(variant_set.columns):
             if variant_set.loc[variant_set['GENOTYPE'] == '.'].empty and variant_set.loc[variant_set['GENOTYPE'] == 'undefined'].empty: