Skip to content

Commit

Permalink
bundle update re-engineering
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Feb 27, 2024
1 parent 2d83111 commit 37a5ef9
Show file tree
Hide file tree
Showing 90 changed files with 1,984 additions and 6,534 deletions.
32 changes: 16 additions & 16 deletions pcgr/arg_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ def check_args(arg_dict):

logger = getlogger("pcgr-validate-arguments-input-a")
# Check the existence of required arguments
if arg_dict['pcgr_dir'] is None or not os.path.isdir(arg_dict['pcgr_dir']):
err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']):
err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})."
error_message(err_msg, logger)

if arg_dict['genome_assembly'] is None:
Expand Down Expand Up @@ -347,28 +347,28 @@ def verify_input_files(arg_dict):
os.path.abspath(arg_dict["cpsr_report"]))

# check the existence of base folder
base_dir = os.path.abspath(arg_dict["pcgr_dir"])
base_dir = os.path.abspath(arg_dict["refdata_dir"])
if not os.path.isdir(base_dir):
err_msg = "Base directory (" + str(base_dir) + ") does not exist"
error_message(err_msg, logger)

# check the existence of data folder within the base folder
db_dir = os.path.join(os.path.abspath(arg_dict["pcgr_dir"]), "data")
db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data")
if not os.path.isdir(db_dir):
err_msg = "Data directory (" + str(db_dir) + ") does not exist"
error_message(err_msg, logger)

# check the existence of specified assembly data folder within the base folder
db_assembly_dir = os.path.join(os.path.abspath(
arg_dict["pcgr_dir"]), "data", arg_dict["genome_assembly"])
arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"])
if not os.path.isdir(db_assembly_dir):
err_msg = "Data directory for the specified genome assembly (" + str(
db_assembly_dir) + ") does not exist"
error_message(err_msg, logger)

# check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0)
rel_notes_file = os.path.join(os.path.abspath(
arg_dict["pcgr_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
if not os.path.exists(rel_notes_file):
err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)"
error_message(err_msg, logger)
Expand All @@ -393,8 +393,8 @@ def verify_input_files(arg_dict):
"input_cpsr_report_dir": input_cpsr_report_dir,
"input_cna_plot_dir": input_cna_plot_dir,
"panel_normal_vcf_dir": panel_normal_vcf_dir,
"db_dir": db_assembly_dir,
"base_dir": base_dir,
"db_assembly_dir": db_assembly_dir,
"refdata_dir": base_dir,
"output_dir": output_dir_full,
"output_vcf": output_vcf,
"output_cna": output_cna,
Expand All @@ -421,8 +421,8 @@ def check_args_cpsr(arg_dict):
err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
error_message(err_msg,logger)
## Check that PCGR directory (with data bundle) is provided and exists
if arg_dict['pcgr_dir'] is None or not os.path.isdir(arg_dict['pcgr_dir']):
err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']):
err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})."
error_message(err_msg,logger)
## Check that genome assembly is set
if arg_dict['genome_assembly'] is None:
Expand Down Expand Up @@ -576,26 +576,26 @@ def verify_input_files_cpsr(arg_dict):
error_message(err_msg,logger)

## check the existence of base folder
base_dir = os.path.abspath(arg_dict['pcgr_dir'])
base_dir = os.path.abspath(arg_dict['refdata_dir'])
if not os.path.isdir(base_dir):
err_msg = f"Base directory ({base_dir}) does not exist"
error_message(err_msg,logger)

## check the existence of data folder within the base folder
db_dir = os.path.join(os.path.abspath(arg_dict['pcgr_dir']), 'data')
db_dir = os.path.join(os.path.abspath(arg_dict['refdata_dir']), 'data')
if not os.path.isdir(db_dir):
err_msg = f"Data directory ({db_dir}) does not exist"
error_message(err_msg,logger)

## check the existence of specified assembly data folder within the base folder
db_assembly_dir = os.path.join(os.path.abspath(arg_dict['pcgr_dir']), 'data', arg_dict['genome_assembly'])
db_assembly_dir = os.path.join(os.path.abspath(arg_dict['refdata_dir']), 'data', arg_dict['genome_assembly'])
if not os.path.isdir(db_assembly_dir):
err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
error_message(err_msg,logger)

## check the existence of RELEASE_NOTES
rel_notes_file = os.path.join(os.path.abspath(
arg_dict["pcgr_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
if not os.path.exists(rel_notes_file):
err_msg = 'The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/cpsr for instructions)'
error_message(err_msg,logger)
Expand All @@ -617,8 +617,8 @@ def verify_input_files_cpsr(arg_dict):
cpsr_paths = {
"input_vcf_dir": input_vcf_dir,
"input_customlist_dir": input_customlist_dir,
"db_dir": db_assembly_dir,
"base_dir": base_dir,
"db_assembly_dir": db_assembly_dir,
"refdata_dir": base_dir,
"output_dir": output_dir_full,
"output_vcf": output_vcf,
"input_vcf_basename": input_vcf_basename,
Expand Down
26 changes: 13 additions & 13 deletions pcgr/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def annotate_cna_segments(output_fname: str,
output_dir: str,
cna_segment_file: str,
pcgr_build_db_dir: str,
db_assembly_dir: str,
build: str,
sample_id: str,
n_copy_amplifications: int = 5,
Expand All @@ -28,7 +28,7 @@ def annotate_cna_segments(output_fname: str,
output_fname (str): File name of the annotated output file.
output_dir (str): Directory to save the annotated file.
cna_segment_file (str): Path to the user-provided copy number aberrations segment file.
pcgr_build_db_dir (str): Path to the build-specific PCGR database directory.
db_assembly_dir (str): Path to the build-specific PCGR database directory.
build (str): Genome assembly build of input data.
sample_id( (str): Sample identifier
output_dir (str): Directory to save the annotated file.
Expand Down Expand Up @@ -81,7 +81,7 @@ def annotate_cna_segments(output_fname: str,

## Check that 'End' of segments do not exceed chromosome lengths
chromsizes_fname = \
os.path.join(pcgr_build_db_dir, 'chromsize.' + build + '.tsv')
os.path.join(db_assembly_dir, 'chromsize.' + build + '.tsv')

check_file_exists(chromsizes_fname, logger)
chromsizes = pd.read_csv(chromsizes_fname, sep="\t", header=None, names=['Chromosome', 'ChromLength'])
Expand All @@ -105,14 +105,14 @@ def annotate_cna_segments(output_fname: str,
temp_files.append(cna_query_segment_bed.fn)

## annotate segments with cytobands
cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, pcgr_build_db_dir, logger)
cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, db_assembly_dir, logger)

## annotate with protein-coding transcripts
cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df)
temp_files.append(cna_query_segment_bed.fn)

cna_query_segment_df = annotate_transcripts(
cna_query_segment_bed, output_dir, pcgr_build_db_dir, overlap_fraction=overlap_fraction, logger=logger)
cna_query_segment_bed, output_dir, db_assembly_dir, overlap_fraction=overlap_fraction, logger=logger)

cna_query_segment_df['segment_length_mb'] = \
((cna_query_segment_df['segment_end'] - cna_query_segment_df['segment_start']) / 1e6).astype(float).round(5)
Expand All @@ -123,8 +123,8 @@ def annotate_cna_segments(output_fname: str,
cna_actionable_dict = {}

for db in ['cgi','civic']:
variant_fname = os.path.join(pcgr_build_db_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz")
clinical_fname = os.path.join(pcgr_build_db_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz")
variant_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz")
clinical_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz")
logger.info(f"Loading copy-number biomarker evidence from {db} ..")
biomarkers[db] = load_biomarkers(
logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA')
Expand Down Expand Up @@ -193,14 +193,14 @@ def annotate_cna_segments(output_fname: str,
return 0


def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_dir: str, logger: logging.Logger) -> pd.DataFrame:
def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame:

pybedtools.set_tempdir(output_dir)
temp_files = []

# BED file with cytoband annotations
cytoband_bed_fname = \
os.path.join(pcgr_build_db_dir, 'misc','bed','cytoband', 'cytoband.bed.gz')
os.path.join(db_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz')

cytoband_annotated_segments = pd.DataFrame()

Expand Down Expand Up @@ -263,7 +263,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_d


def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
pcgr_build_db_dir: str, overlap_fraction: float,
db_assembly_dir: str, overlap_fraction: float,
logger: logging.Logger) -> pd.DataFrame:

"""
Expand All @@ -272,7 +272,7 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
Parameters:
- cna_segments_bt: A BedTool object representing the CNA segments.
- output_dir: A string specifying the output directory (for temporary files generation).
- pcgr_build_db_dir: A string specifying the directory of the build-specific PCGR data bundle.
- db_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle.
- overlap_fraction: A float representing the fraction of overlap required for annotation.
Returns:
Expand All @@ -284,9 +284,9 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,

# BED file with protein-coding transcripts
gene_transcript_bed_fname = \
os.path.join(pcgr_build_db_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz')
os.path.join(db_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz')
gene_xref_tsv_fname = \
os.path.join(pcgr_build_db_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz")
os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz")

cna_segments_annotated = pd.DataFrame()

Expand Down
12 changes: 10 additions & 2 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,16 @@ def create_config(arg_dict, workflow = "PCGR"):
'exclude_pon': int(arg_dict['exclude_pon']),
'exclude_likely_hom_germline': int(arg_dict['exclude_likely_hom_germline']),
'exclude_likely_het_germline': int(arg_dict['exclude_likely_het_germline']),
'exclude_clinvar_germline': int(arg_dict['exclude_clinvar_germline']),
'exclude_dbsnp_nonsomatic': int(arg_dict['exclude_dbsnp_nonsomatic']),
'exclude_nonexonic': int(arg_dict['exclude_nonexonic'])
}
conf_options['somatic_snv']['msi'] = {
'run': int(arg_dict['estimate_msi'])
}
conf_options['somatic_snv']['tmb'] = {
'run': int(arg_dict['estimate_tmb']),
'run': int(arg_dict['estimate_tmb']),
'tmb_display': arg_dict['tmb_display'],
'tmb_dp_min': arg_dict['tmb_dp_min'],
'tmb_af_min': arg_dict['tmb_af_min']
}
Expand Down Expand Up @@ -154,15 +156,21 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
conf_data['software']['pcgr_version'] = pcgr_vars.PCGR_VERSION
conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION

## add paths to annotated files (VCF/TSV, CNA)
## add paths to annotated files (VCF/TSV, CNA, TMB)
conf_data['molecular_data'] = {}
conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']

conf_data['molecular_data']['fname_cna_tsv'] = "None"
if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
del conf_options['annotated_cna']

conf_data['molecular_data']['fname_tmb'] = "None"
if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
del conf_options['fname_tmb']

genome_assembly = conf_options['genome_assembly']
del conf_options['sample_id']
del conf_options['genome_assembly']
Expand Down
18 changes: 10 additions & 8 deletions pcgr/cpsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_args():

program_description = "Cancer Predisposition Sequencing Reporter - report of " + \
"clinically significant cancer-predisposing germline variants"
program_options = " --input_vcf <INPUT_VCF> --pcgr_dir <PCGR_DIR> --output_dir <OUTPUT_DIR> --genome_assembly " + \
program_options = " --input_vcf <INPUT_VCF> --refdata_dir <REFDATA_DIR> --output_dir <OUTPUT_DIR> --genome_assembly " + \
" <GENOME_ASSEMBLY> --sample_id <SAMPLE_ID>"

parser = argparse.ArgumentParser(description = program_description,
Expand Down Expand Up @@ -69,7 +69,7 @@ def get_args():
optional_vep.add_argument('--vep_no_intergenic', action = "store_true", help="Skip intergenic variants during processing (option '--no_intergenic' in VEP), default: %(default)s")

required.add_argument('--input_vcf', help='VCF input file with germline query variants (SNVs/InDels).', required = True)
required.add_argument('--pcgr_dir',help=f"Directory that contains the PCGR data bundle directory, e.g. ~/pcgr-{pcgr_vars.PCGR_VERSION}", required = True)
required.add_argument('--refdata_dir',help=f"Directory that contains the PCGR/CPSR reference data, e.g. ~/pcgr-data-{pcgr_vars.PCGR_VERSION}", required = True)
required.add_argument('--output_dir',help='Output directory', required = True)
required.add_argument('--genome_assembly',choices = ['grch37','grch38'], help='Genome assembly build: grch37 or grch38', required = True)
required.add_argument('--sample_id',help="Sample identifier - prefix for output files", required = True)
Expand Down Expand Up @@ -124,7 +124,8 @@ def run_cpsr(conf_options, cpsr_paths):
warn_message(warn_msg, logger)

if not input_vcf == 'None':
data_dir = cpsr_paths['base_dir']
refdata_dir = cpsr_paths['refdata_dir']
db_assembly_dir = cpsr_paths['db_assembly_dir']
output_dir = cpsr_paths['output_dir']

check_subprocess(logger, f'mkdir -p {output_dir}', debug)
Expand All @@ -147,7 +148,7 @@ def run_cpsr(conf_options, cpsr_paths):
## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci
vcf_validate_command = (
f'cpsr_validate_input.py '
f'{data_dir} '
f'{refdata_dir} '
f'{input_vcf} '
f'{input_vcf_validated_uncompr} '
f'{input_customlist} '
Expand Down Expand Up @@ -192,7 +193,7 @@ def run_cpsr(conf_options, cpsr_paths):
conf_options['gene_panel']['custom_list_bed'] = output_custom_bed

## Write YAML configuration file - settings, path to files, reference bundle etc
yaml_data = populate_config_data(conf_options, data_dir, workflow = "CPSR", logger = logger)
yaml_data = populate_config_data(conf_options, refdata_dir, workflow = "CPSR", logger = logger)
with open(yaml_fname, "w") as outfile:
outfile.write(yaml.dump(yaml_data))
outfile.close()
Expand Down Expand Up @@ -233,7 +234,8 @@ def run_cpsr(conf_options, cpsr_paths):
f'{"--debug" if debug else ""} '
f"--dbmts --gerp --tcga --gnomad_non_cancer --gene_transcript_xref "
f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} "
f"{os.path.join(data_dir, 'data', str(yaml_data['genome_assembly']))}"
f"{db_assembly_dir}"
#f"{os.path.join(refdata_dir, 'data', str(yaml_data['genome_assembly']))}"
)
check_subprocess(logger, pcgr_vcfanno_command, debug)
logger.info("Finished cpsr-vcfanno")
Expand All @@ -245,7 +247,7 @@ def run_cpsr(conf_options, cpsr_paths):
f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} 0 '
f'{yaml_data["conf"]["vep"]["vep_regulatory"]} 0 '
f'Any {yaml_data["conf"]["vep"]["vep_pick_order"]} '
f'{cpsr_paths["db_dir"]} --compress_output_vcf '
f'{cpsr_paths["db_assembly_dir"]} --compress_output_vcf '
f'--cpsr_yaml {yaml_fname} '
f'--cpsr {"--debug" if debug else ""}'
)
Expand Down Expand Up @@ -278,7 +280,7 @@ def run_cpsr(conf_options, cpsr_paths):
logger.info("Appending ClinVar traits, official gene names, and protein domain annotations")
variant_set = \
variant.append_annotations(
output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger)
output_pass_vcf2tsv_gz, db_assembly_dir = cpsr_paths["db_assembly_dir"], logger = logger)
variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger)
if {'GENOTYPE'}.issubset(variant_set.columns):
if variant_set.loc[variant_set['GENOTYPE'] == '.'].empty and variant_set.loc[variant_set['GENOTYPE'] == 'undefined'].empty:
Expand Down
Loading

0 comments on commit 37a5ef9

Please sign in to comment.