Release 1.7.8 (#502)

* Dev gcgi 1378 new gsicapbench inputs (#486) * Update default sample names and placeholder IDs/purity * Simplify by removing the benchmark_params.json file * Add new JSON diff script and tests * Added patient study id raw * Updated changelog * Allowed tumour and normal ids to be manually specified for provenance helper * Added oncokb definitions to 40x assays * Dev-GCGI-1446_fusion-total (#498) * Debugging * Debugging * Update clinically relevant variants count to reflect unique fusions after filtering by OncoKB levels * Update test checksum --------- Co-authored-by: Oumaima Hamza <[email protected]> * self.patient_id_raw defaults to None when not in use -- instead of being undefined, which causes an error * Dev gcgi 1326 whizbam links for fusions (#487) * Dump to json output * Process fusions and generate blob URLs from json template * gzip instead of pysam * skip header to use index * debugging * Fix column names * debugging * Fix genes combination scenario * Get template json * Specify path to file under plugin dir * no message * template json path * debugging * Reformat breakpoint * Search for bam and bai files * Use Node.js script for compression * Get tumour ID * Add positional argument * Add compression script relative path * Fix path * Change to CommonJS syntax * Add js scripts for compression * Convert to to CommonJS syntax * convert to CommonJS syntax * Compression using pysam.bgzip * base64 encoding * Convert into URL safe base64 * Mimic the JavaScript behavior * Fix blob URL format * Fix base64 encode * Compress json not the path to the file * Change whizbam URL * Change csv to tsv file * Fix and refactor fusion test * Changelog * Respond to PR comments: Improve warning/error messages and replace hardcoded values with variables * Clean up process_fusion function by adding helper functions * Add custom error class * Add whizbam_project parameter to the fusion plugin * Optimizing checks for files existence and URL assignment * Fix warning logging 
--------- Co-authored-by: Oumaima Hamza <[email protected]> * Release 1.7.8 update (#500) * Update changelog and version.py * Revert "Update changelog and version.py" This reverts commit 97bf3d7. * Update changelog and djerba version * Revert "Update changelog and djerba version" This reverts commit 26e27ac. * Update changelog and djerba version --------- Co-authored-by: Oumaima Hamza <[email protected]> * Refactor fusion plugin test; move setup operations to setUp(self) * Remove self.maxDiff = None --------- Co-authored-by: Iain Bancarz PhD <[email protected]> Co-authored-by: Aqsa Alam <[email protected]> Co-authored-by: Aqsa Alam <[email protected]> Co-authored-by: Oumaima Hamza <[email protected]>
oicr-gsi · Dec 16, 2024 · 01357cf · 01357cf
1 parent cdaea31
commit 01357cf
Show file tree

Hide file tree

Showing 19 changed files with 455 additions and 120 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # CHANGELOG
 
+## 1.7.8: 2024-12-12
+- GCGI-1464: Standalone script to diff two Djerba JSON reports
+- GCGI-1454: Added OncoKB definitions to WGTS40X and WGS40X assays
+- GCGI-1469: Prevent provenance_helper from overwriting manually provided tumour and normal IDs
+- GCGI-1472: Add all patient IDs to case_overview (not just the initial ID)
+- GCGI-1326: Generate whizbam links to view fusions in split-view
+
 ## 1.7.7: 2024-11-11
 - GCGI-1424: Fix rendering of non-Latin characters
 - GCGI-1437: Remove unnecessary checks in OncoKB cache

diff --git a/setup.py b/setup.py
@@ -34,6 +34,7 @@
     version=__version__,
     scripts=[
         'src/bin/benchmark.py',
+        'src/bin/diff_reports.py',
         'src/bin/djerba.py',
         'src/bin/generate_ini.py',
         'src/bin/mini_djerba.py',

diff --git a/src/bin/diff_reports.py b/src/bin/diff_reports.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python3
+
+"""Diff two Djerba JSON reports"""
+
+import argparse
+import sys
+
+sys.path.pop(0) # do not import from script directory
+
+from djerba.util.benchmark import report_equivalence_tester
+from djerba.util.logger import logger
+from djerba.util.validator import path_validator
+
+def get_parser():
+    """Build and return the argparse parser for the report diff script"""
+    summary = 'Compare two JSON reports output by Djerba. Exit status is 0 if reports are equivalent, 1 otherwise. Run with --verbose for a summary and/or --print to view the full diff.'
+    cli = argparse.ArgumentParser(description=summary, epilog='Run with -h/--help for additional information')
+    # -r/--report uses action='append', so repeated occurrences accumulate into a list
+    add = cli.add_argument
+    add('-r', '--report', metavar='PATH', help='Path to Djerba JSON report file. Must be supplied exactly twice.', required=True, action='append')
+    add('-d', '--debug', action='store_true', help='More verbose logging')
+    add('-l', '--log-path', metavar='PATH', help='Output file for log messages; defaults to STDERR')
+    add('-p', '--print', action='store_true', help='Print a full diff to STDOUT; prints "NONE" if reports are identical')
+    add('-q', '--quiet', action='store_true', help='Quiet mode; logging errors only')
+    add('-v', '--verbose', action='store_true', help='Verbose logging')
+    return cli
+
+def main(args):
+    """Diff the two reports in args.report; exit with status 0 if equivalent, 1 otherwise"""
+    reports = args.report
+    if len(reports) != 2:
+        # -r/--report is an 'append' option, so the count must be checked by hand
+        print("ERROR: Must have exactly 2 JSON report files, each specified with -r/--report", file=sys.stderr)
+        sys.exit(1)
+    log_level = logger.get_log_level(args.debug, args.verbose, args.quiet)
+    if args.log_path:
+        path_validator(log_level).validate_output_file(args.log_path)
+    validator = path_validator(log_level, args.log_path)
+    for report in reports:
+        validator.validate_input_file(report)
+    delta_path = None # TODO make configurable
+    tester = report_equivalence_tester(reports, delta_path, log_level, args.log_path)
+    if args.print:
+        print(tester.get_diff_text())
+    # exit status reports the comparison result to the calling shell
+    sys.exit(0 if tester.is_equivalent() else 1)
+
+if __name__ == '__main__':
+    parser = get_parser()
+    # with no command-line arguments at all, print the help text to STDERR and exit non-zero
+    if len(sys.argv) == 1:
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+    main(parser.parse_args())
diff --git a/src/lib/djerba/core/constants.py b/src/lib/djerba/core/constants.py
@@ -112,3 +112,6 @@
 
 # JSON file suffix
 REPORT_JSON_SUFFIX = '_report.json'
+
+# root directory pattern for WHIZBAM files
+WHIZBAM_PATTERN_ROOT='/.mounts/labs/prod/whizbam'
diff --git a/src/lib/djerba/data/benchmark_params.json b/src/lib/djerba/data/benchmark_params.json
diff --git a/src/lib/djerba/helpers/provenance_helper/helper.py b/src/lib/djerba/helpers/provenance_helper/helper.py
@@ -99,12 +99,12 @@ def configure(self, config):
                     raise DjerbaProvenanceError(msg)
                 else:
                     wrapper.set_my_param(key, value)
-            elif value == None:
-                value = wrapper.get_my_string(key)
-                msg = "Overwriting null value for '{0}' in sample info ".format(key)+\
-                    "with user-defined value '{0}'".format(value)
-                self.logger.debug(msg)
-                sample_info[key] = value
+            elif wrapper.my_param_is_not_null(key):
+                user_value = wrapper.get_my_string(key)
+                msg = "Overwriting found value '{0}' for '{1}' in sample info with user-defined value '{2}'".format(value, key, user_value)
+                self.logger.warning(msg)
+                sample_info[key] = user_value
+
         # Write updated sample info as JSON
         self.write_sample_info(sample_info)
         return wrapper.get_config()
@@ -194,10 +194,11 @@ def read_provenance(self, study, donor, assay, samples):
         )
         names = reader.get_sample_names()
         ids = reader.get_identifiers()
+
         sample_info = {
             self.STUDY_TITLE: study,
             self.ROOT_SAMPLE_NAME: donor,
-            core_constants.PATIENT_STUDY_ID: ids.get(ini.PATIENT_ID),
+            core_constants.PATIENT_STUDY_ID: ids.get(ini.PATIENT_ID_RAW),
             core_constants.TUMOUR_ID: ids.get(ini.TUMOUR_ID),
             core_constants.NORMAL_ID: ids.get(ini.NORMAL_ID),
             ini.SAMPLE_NAME_WG_T: names.get(ini.SAMPLE_NAME_WG_T),

diff --git a/src/lib/djerba/plugins/fusion/constants.py b/src/lib/djerba/plugins/fusion/constants.py
@@ -8,6 +8,7 @@
 MAVIS_PATH = 'mavis_path'
 ARRIBA_PATH = 'arriba_path'
 ONCOTREE_CODE = 'oncotree_code'
+WHIZBAM_PROJECT = 'whizbam_project'
 ENTREZ_CONVERSION_PATH = 'entrez conv path'
 MIN_FUSION_READS = 'minimum fusion reads'
 
@@ -32,6 +33,7 @@
 DATA_FUSIONS_OLD = 'data_fusions.txt'
 DATA_FUSIONS_ANNOTATED = 'data_fusions_oncokb_annotated.txt'
 DATA_FUSIONS_NCCN_ANNOTATED = 'data_fusions_NCCN.txt'
+JSON_TO_BE_COMPRESSED = 'fusion_template_to_be_compressed.json'
 FUSION_INDEX = 3
 HUGO_SYMBOL = 'Hugo_Symbol'
 NCCN_RELEVANT_VARIANTS = 'nccn_relevant_variants'
diff --git a/src/lib/djerba/plugins/fusion/fusion_template_to_be_compressed.json b/src/lib/djerba/plugins/fusion/fusion_template_to_be_compressed.json
@@ -0,0 +1,47 @@
+{
+ "version": "2.15.11",
+ "showSampleNames": false,
+ "reference": {
+  "id": "hg38",
+  "name": "Human (GRCh38/hg38)",
+  "fastaURL": "https://igv-genepattern-org.s3.amazonaws.com/genomes/seq/hg38/hg38.fa",
+  "indexURL": "https://igv-genepattern-org.s3.amazonaws.com/genomes/seq/hg38/hg38.fa.fai",
+  "cytobandURL": "https://igv-genepattern-org.s3.amazonaws.com/genomes/hg38/cytoBandIdeo.txt.gz",
+  "aliasURL": "https://igv-genepattern-org.s3.amazonaws.com/genomes/hg38/hg38_alias.tab",
+  "chromSizesURL": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes",
+  "twoBitURL": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit",
+  "chromosomeOrder": "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY"
+ },
+ "locus": [],
+ "roi": [],
+ "tracks": [
+  {
+   "type": "sequence",
+   "order": -9007199254740991
+  },
+  {
+   "name": "",
+   "url": "",
+   "indexURL": "",
+   "format": "bam",
+   "type": "alignment",
+   "samplingDepth": 1000,
+   "sort": {
+    "position": 0,
+    "option": "BASE",
+    "direction": "DESC"
+   },
+   "order": 1
+  },
+  {
+   "name": "Refseq Genes",
+   "format": "refgene",
+   "url": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/ncbiRefSeq.txt.gz",
+   "indexed": false,
+   "order": 1000001,
+   "infoURL": "https://www.ncbi.nlm.nih.gov/gene/?term=$$",
+   "type": "annotation",
+   "height": 70
+  }
+ ]
+}