diff --git a/README.md b/README.md index 94329bce04..9f1d5d8091 100644 --- a/README.md +++ b/README.md @@ -17,4 +17,10 @@ Read more about our pipelines and repository on the [WARP documentation site](ht To contribute to WARP, please read the [contribution guidelines](https://broadinstitute.github.io/warp/docs/contribution/README). +### Citing WARP + +When citing WARP, please use the following: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + [![Build Status](https://img.shields.io/github/workflow/status/broadinstitute/warp/Deploy%20WARP%20Website?label=Website&logo=github&style=flat-square)](https://github.com/broadinstitute/warp/actions?query=workflow%3A%22Deploy+WARP+Website%22) diff --git a/pipelines/broad/annotation_filtration/AnnotationFiltration.changelog.md b/pipelines/broad/annotation_filtration/AnnotationFiltration.changelog.md index e87877e3db..2e661ad34d 100644 --- a/pipelines/broad/annotation_filtration/AnnotationFiltration.changelog.md +++ b/pipelines/broad/annotation_filtration/AnnotationFiltration.changelog.md @@ -1,3 +1,8 @@ +# 1.2.5 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 1.2.4 2022-11-09 (Date of Last Commit) diff --git a/pipelines/broad/annotation_filtration/AnnotationFiltration.wdl b/pipelines/broad/annotation_filtration/AnnotationFiltration.wdl index e7178ab9bc..af4fa32a27 100644 --- a/pipelines/broad/annotation_filtration/AnnotationFiltration.wdl +++ b/pipelines/broad/annotation_filtration/AnnotationFiltration.wdl @@ -4,7 +4,7 @@ import "../../../tasks/broad/Funcotator.wdl" as Funcotator workflow AnnotationFiltration { - String pipeline_version = "1.2.4" + String pipeline_version = "1.2.5" input { Array[File] vcfs @@ -15,7 +15,7 @@ workflow AnnotationFiltration { File ref_dict File? funcotator_interval_list - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" File? 
custom_data_source_tar_gz } diff --git a/pipelines/broad/annotation_filtration/test_inputs/Plumbing/hg38.json b/pipelines/broad/annotation_filtration/test_inputs/Plumbing/hg38.json index a464cbb5d3..d87a07c1ea 100644 --- a/pipelines/broad/annotation_filtration/test_inputs/Plumbing/hg38.json +++ b/pipelines/broad/annotation_filtration/test_inputs/Plumbing/hg38.json @@ -10,6 +10,6 @@ "AnnotationFiltration.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "AnnotationFiltration.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "AnnotationFiltration.gatk_docker": "us.gcr.io/broad-gatk/gatk:4.3.0.0", + "AnnotationFiltration.gatk_docker": "us.gcr.io/broad-gatk/gatk:4.5.0.0", "AnnotationFiltration.custom_data_source_tar_gz": "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124g.tar.gz" } diff --git a/pipelines/broad/annotation_filtration/test_inputs/Scientific/hg38.json b/pipelines/broad/annotation_filtration/test_inputs/Scientific/hg38.json index c3e324d5f2..6f36aeaafa 100644 --- a/pipelines/broad/annotation_filtration/test_inputs/Scientific/hg38.json +++ b/pipelines/broad/annotation_filtration/test_inputs/Scientific/hg38.json @@ -8,6 +8,6 @@ "AnnotationFiltration.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "AnnotationFiltration.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "AnnotationFiltration.gatk_docker": "us.gcr.io/broad-gatk/gatk:4.3.0.0", + "AnnotationFiltration.gatk_docker": "us.gcr.io/broad-gatk/gatk:4.5.0.0", "AnnotationFiltration.custom_data_source_tar_gz": "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124g.tar.gz" } diff --git a/pipelines/broad/arrays/imputation/Imputation.changelog.md b/pipelines/broad/arrays/imputation/Imputation.changelog.md index 4f87b2ac76..e96dabb6a6 100644 --- a/pipelines/broad/arrays/imputation/Imputation.changelog.md +++ b/pipelines/broad/arrays/imputation/Imputation.changelog.md @@ -1,3 +1,8 @@ +# 1.1.12 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 1.1.11 2023-08-01 (Date of last Commit) diff --git a/pipelines/broad/arrays/imputation/Imputation.wdl b/pipelines/broad/arrays/imputation/Imputation.wdl index 245ab01455..44d5a93cd0 100644 --- a/pipelines/broad/arrays/imputation/Imputation.wdl +++ b/pipelines/broad/arrays/imputation/Imputation.wdl @@ -6,7 +6,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Imputation { - String pipeline_version = "1.1.11" + String pipeline_version = "1.1.12" input { Int chunkLength = 25000000 diff --git a/pipelines/broad/arrays/single_sample/Arrays.changelog.md b/pipelines/broad/arrays/single_sample/Arrays.changelog.md index dc8494544b..1155468625 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.changelog.md +++ b/pipelines/broad/arrays/single_sample/Arrays.changelog.md @@ -1,3 +1,8 @@ +# 2.6.22 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 2.6.21 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/arrays/single_sample/Arrays.wdl b/pipelines/broad/arrays/single_sample/Arrays.wdl index e7693b152d..a59633f839 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.wdl +++ b/pipelines/broad/arrays/single_sample/Arrays.wdl @@ -23,7 +23,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Arrays { - String pipeline_version = "2.6.21" + String pipeline_version = "2.6.22" input { 
String chip_well_barcode diff --git a/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md b/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md index 0e36caabd0..a55bbaaeea 100644 --- a/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md +++ b/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md @@ -1,3 +1,8 @@ +# 1.16.4 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 1.16.3 2023-01-13 (Date of Last Commit) diff --git a/pipelines/broad/arrays/validate_chip/ValidateChip.wdl b/pipelines/broad/arrays/validate_chip/ValidateChip.wdl index a2db629677..a18dffecbb 100644 --- a/pipelines/broad/arrays/validate_chip/ValidateChip.wdl +++ b/pipelines/broad/arrays/validate_chip/ValidateChip.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/InternalArraysTasks.wdl" as InternalTasks workflow ValidateChip { - String pipeline_version = "1.16.3" + String pipeline_version = "1.16.4" input { String sample_alias diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md index 55378fdca6..d97dee1d00 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md @@ -1,3 +1,8 @@ +# 1.6.10 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 1.6.9 2023-09-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl index 87832c4c12..097f398553 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl @@ -7,7 +7,7 @@ import "https://raw.githubusercontent.com/broadinstitute/gatk/4.5.0.0/scripts/vc # Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) workflow JointGenotyping { - String pipeline_version = "1.6.9" + String pipeline_version = "1.6.10" input { File unpadded_intervals_file diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md index a6c388e712..55087513c4 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md @@ -1,3 +1,13 @@ +# 1.1.7 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + +# 1.1.6 +2023-02-06 (Date of Last Commit) + +* Updated VETS filtering pipeline to GATK version 4.5.0.0. Does not affect outputs. 
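The VETS bullet above corresponds to the UltimaGenomicsJointGenotyping.wdl hunks later in this diff: the single JointVcfFiltering call is split into a SNP pass and an INDEL pass, with the INDEL pass consuming the SNP-scored shards so one VCF carries both scores. A minimal sketch of that two-pass pattern, assuming only the GATK 4.5.0.0 JointVcfFiltering interface the hunks themselves use (input_vcfs/scored_vcfs, resource_args, and the --mode extra args); the shard, annotation, and resource inputs are placeholders:

```wdl
version 1.0

import "https://raw.githubusercontent.com/broadinstitute/gatk/4.5.0.0/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as Filtering

workflow TwoPassVetsSketch {
  input {
    Array[File] vcf_shards
    Array[File] vcf_shard_idxs
    File sites_only_vcf
    File sites_only_vcf_idx
    Array[String] snp_annotations
    Array[String] indel_annotations
    String snp_resource_args
    String indel_resource_args
  }

  # Pass 1: train on and score SNPs across the sharded callset.
  call Filtering.JointVcfFiltering as SNPs {
    input:
      input_vcfs = vcf_shards,
      input_vcf_idxs = vcf_shard_idxs,
      sites_only_vcf = sites_only_vcf,
      sites_only_vcf_idx = sites_only_vcf_idx,
      annotations = snp_annotations,
      resource_args = snp_resource_args,
      model_backend = "PYTHON_IFOREST",
      output_prefix = "callset",
      extract_extra_args = "--mode SNP",
      train_extra_args = "--mode SNP",
      score_extra_args = "--mode SNP",
      gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
  }

  # Pass 2: rescore the SNP-scored shards in INDEL mode.
  call Filtering.JointVcfFiltering as INDELs {
    input:
      input_vcfs = SNPs.scored_vcfs,
      input_vcf_idxs = SNPs.scored_vcf_idxs,
      sites_only_vcf = sites_only_vcf,
      sites_only_vcf_idx = sites_only_vcf_idx,
      annotations = indel_annotations,
      resource_args = indel_resource_args,
      model_backend = "PYTHON_IFOREST",
      output_prefix = "callset",
      extract_extra_args = "--mode INDEL",
      train_extra_args = "--mode INDEL",
      score_extra_args = "--mode INDEL",
      gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
  }

  output {
    Array[File] scored_vcfs = INDELs.scored_vcfs
    Array[File] scored_vcf_idxs = INDELs.scored_vcf_idxs
  }
}
```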
+ # 1.1.5 2023-09-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl index 48e5da0d28..7ad923f4a6 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl @@ -1,7 +1,7 @@ version 1.0 import "../../../../../../tasks/broad/JointGenotypingTasks.wdl" as Tasks -import "https://raw.githubusercontent.com/broadinstitute/gatk/4.3.0.0/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as Filtering +import "https://raw.githubusercontent.com/broadinstitute/gatk/4.5.0.0/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as Filtering import "../../../../../../tasks/broad/UltimaGenomicsGermlineFilteringThreshold.wdl" as FilteringThreshold @@ -11,7 +11,7 @@ import "../../../../../../tasks/broad/UltimaGenomicsGermlineFilteringThreshold.w # For choosing a filtering threshold (where on the ROC curve to filter) a sample with truth data is required. workflow UltimaGenomicsJointGenotyping { - String pipeline_version = "1.1.5" + String pipeline_version = "1.1.7" input { File unpadded_intervals_file @@ -51,10 +51,11 @@ workflow UltimaGenomicsJointGenotyping { String flow_order #inputs for training and applying filter model - String snp_annotations - String indel_annotations - Boolean use_allele_specific_annotations + Array[String] snp_annotations + Array[String] indel_annotations String model_backend + String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" Int? top_level_scatter_count Boolean? 
gather_vcfs @@ -154,24 +155,42 @@ workflow UltimaGenomicsJointGenotyping { disk_size_gb = medium_disk } - call Filtering.JointVcfFiltering as TrainAndApplyFilteringModel { + call Filtering.JointVcfFiltering as TrainAndApplyFilteringModelSNPs { input: - vcf = CalculateAverageAnnotations.output_vcf, - vcf_index = CalculateAverageAnnotations.output_vcf_index, + input_vcfs = CalculateAverageAnnotations.output_vcf, + input_vcf_idxs = CalculateAverageAnnotations.output_vcf_index, sites_only_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - snp_annotations = snp_annotations, - indel_annotations = indel_annotations, + sites_only_vcf_idx = SitesOnlyGatherVcf.output_vcf_index, + annotations = snp_annotations, + resource_args = snp_resource_args, model_backend = model_backend, - use_allele_specific_annotations = use_allele_specific_annotations, - basename = callset_name, - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + output_prefix = callset_name, + extract_extra_args = "--mode SNP", + train_extra_args = "--mode SNP", + score_extra_args = "--mode SNP", + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + + call Filtering.JointVcfFiltering as TrainAndApplyFilteringModelINDELs { + input: + input_vcfs = TrainAndApplyFilteringModelSNPs.scored_vcfs, + input_vcf_idxs = TrainAndApplyFilteringModelSNPs.scored_vcf_idxs, + sites_only_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_vcf_idx = SitesOnlyGatherVcf.output_vcf_index, + annotations = indel_annotations, + resource_args = indel_resource_args, + model_backend = model_backend, + output_prefix = callset_name, + extract_extra_args = "--mode INDEL", + train_extra_args = "--mode INDEL", + score_extra_args = "--mode INDEL", + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } call FilteringThreshold.ExtractOptimizeSingleSample as FindFilteringThresholdAndFilter { input: - input_vcf = TrainAndApplyFilteringModel.variant_scored_vcf, - input_vcf_index = TrainAndApplyFilteringModel.variant_scored_vcf_index, + input_vcf = TrainAndApplyFilteringModelINDELs.scored_vcfs, + input_vcf_index = TrainAndApplyFilteringModelINDELs.scored_vcf_idxs, base_file_name = callset_name, call_sample_name = call_sample_name, truth_vcf = truth_vcf, @@ -188,7 +207,7 @@ workflow UltimaGenomicsJointGenotyping { medium_disk = medium_disk } - scatter (idx in range(length(TrainAndApplyFilteringModel.variant_scored_vcf))) { + scatter (idx in range(length(TrainAndApplyFilteringModelINDELs.scored_vcfs))) { # For large callsets we need to collect metrics from the shards and gather them later. 
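 # Each scored shard below came through both filtering passes (SNP then INDEL),
 # so per-shard metrics reflect the fully scored callset.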
if (!is_small_callset) { call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json index a271e30aa6..3dc1a947c6 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json @@ -14,17 +14,17 @@ "UltimaGenomicsJointGenotyping.scatter_cross_check_fingerprints":false, "UltimaGenomicsJointGenotyping.unbounded_scatter_count_scale_factor":2.5, "UltimaGenomicsJointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", -"UltimaGenomicsJointGenotyping.snp_annotations": "-A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", -"UltimaGenomicsJointGenotyping.indel_annotations": "-A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE", +"UltimaGenomicsJointGenotyping.snp_annotations": ["AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE", "AVERAGE_ASSEMBLED_HAPS", "AVERAGE_FILTERED_HAPS"], +"UltimaGenomicsJointGenotyping.indel_annotations": ["AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE"], "UltimaGenomicsJointGenotyping.flow_order": "TGCA", "UltimaGenomicsJointGenotyping.ref_fasta_sdf": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/reference_sdf.tar", "UltimaGenomicsJointGenotyping.runs_file": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/runs.conservative.bed", "UltimaGenomicsJointGenotyping.annotation_intervals": ["gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/LCR-hs38.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/mappability.0.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/exome.twist.bed"], -"UltimaGenomicsJointGenotyping.use_allele_specific_annotations": true, "UltimaGenomicsJointGenotyping.truth_vcf":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_vcf_index":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_highconf_intervals": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed", "UltimaGenomicsJointGenotyping.call_sample_name": "NA12878", "UltimaGenomicsJointGenotyping.truth_sample_name": "HG001", -"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST" +"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST", +"UltimaGenomicsJointGenotyping.TrainAndApplyFilteringModelSNPs.train_runtime_attributes": {"additional_mem_gb":2} } \ No newline at end of file diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json 
b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json index a91bede656..9b6270b0b4 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json @@ -14,17 +14,18 @@ "UltimaGenomicsJointGenotyping.scatter_cross_check_fingerprints":false, "UltimaGenomicsJointGenotyping.unbounded_scatter_count_scale_factor":2.5, "UltimaGenomicsJointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", -"UltimaGenomicsJointGenotyping.snp_annotations": "-A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", -"UltimaGenomicsJointGenotyping.indel_annotations": "-A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE", +"UltimaGenomicsJointGenotyping.snp_annotations": ["AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE", "AVERAGE_ASSEMBLED_HAPS", "AVERAGE_FILTERED_HAPS"], +"UltimaGenomicsJointGenotyping.indel_annotations": ["AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE"], "UltimaGenomicsJointGenotyping.flow_order": "TGCA", "UltimaGenomicsJointGenotyping.ref_fasta_sdf": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/reference_sdf.tar", "UltimaGenomicsJointGenotyping.runs_file": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/runs.conservative.bed", "UltimaGenomicsJointGenotyping.annotation_intervals": ["gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/LCR-hs38.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/mappability.0.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/exome.twist.bed"], -"UltimaGenomicsJointGenotyping.use_allele_specific_annotations": true, "UltimaGenomicsJointGenotyping.truth_vcf":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_vcf_index":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_highconf_intervals": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed", "UltimaGenomicsJointGenotyping.call_sample_name": "NA12878", "UltimaGenomicsJointGenotyping.truth_sample_name": "HG001", -"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST" +"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST", +"UltimaGenomicsJointGenotyping.TrainAndApplyFilteringModelSNPs.extract_runtime_attributes": {"command_mem_gb":13, "additional_mem_gb":2}, +"UltimaGenomicsJointGenotyping.TrainAndApplyFilteringModelSNPs.train_runtime_attributes": {"command_mem_gb":13, "additional_mem_gb":2} } \ No newline at end of file diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.changelog.md 
b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.changelog.md index 61a0a0daed..9054632fd0 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.changelog.md @@ -1,3 +1,8 @@ +# 1.4.12 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 1.4.11 2023-09-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.wdl index a067eb2882..0cbbabc68d 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.wdl @@ -5,7 +5,7 @@ import "../../../../../../tasks/broad/JointGenotypingTasks.wdl" as Tasks # Joint Genotyping for hg38 Exomes and Whole Genomes (has not been tested on hg19) workflow JointGenotypingByChromosomePartOne { - String pipeline_version = "1.4.11" + String pipeline_version = "1.4.12" input { File unpadded_intervals_file diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.changelog.md index 4d158220e9..74948716f2 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.changelog.md @@ -1,3 +1,8 @@ +# 1.4.11 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0 + # 1.4.10 2023-09-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.wdl index ccb36af7d0..7e9e2b7bb1 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.wdl @@ -5,7 +5,7 @@ import "../../../../../../tasks/broad/JointGenotypingTasks.wdl" as Tasks # Joint Genotyping for hg38 Exomes and Whole Genomes (has not been tested on hg19) workflow JointGenotypingByChromosomePartTwo { - String pipeline_version = "1.4.10" + String pipeline_version = "1.4.11" input { String callset_name diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md index 7f3d1abfc6..58f1643d4e 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md @@ -1,10 +1,14 @@ +# 2.1.11 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. Header documentation change for RAW_GT_COUNT annotation. 
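Every GATK bump in this diff uses the same mechanism, visible in the ReblockGVCF hunk just below: the workflow pins one gatk_docker string (a literal or an overridable input) and threads it into the task's runtime block, so upgrading GATK is a one-line change per workflow. A minimal sketch of the pattern; the task name and command are illustrative, not from the repo:

```wdl
version 1.0

task PrintGatkVersion {
  input {
    # Pinned once; a caller or a test-inputs JSON can override it.
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
  }
  command <<<
    gatk --version
  >>>
  runtime {
    docker: gatk_docker
  }
  output {
    String version_info = read_string(stdout())
  }
}
```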
+ # 2.1.10 2023-12-14 (Date of Last Commit) * Updated GATK for Reblock task to version 4.5.0.0 * Added options to Reblock task to remove annotations and move filters to genotype level - # 2.1.9 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl index 69c0e37591..b27820b937 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl @@ -5,7 +5,7 @@ import "../../../../../../tasks/broad/Qc.wdl" as QC workflow ReblockGVCF { - String pipeline_version = "2.1.10" + String pipeline_version = "2.1.11" input { @@ -50,7 +50,7 @@ workflow ReblockGVCF { calling_interval_list_index = gvcf_index, is_gvcf = true, extra_args = "--no-overlaps", - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } output { diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md index 14369eee0e..c167d5d698 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md @@ -1,10 +1,14 @@ +# 3.1.18 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 3.1.17 2023-12-14 (Date of Last Commit) * Updated GATK for Reblock task to version 4.5.0.0 * Added options to Reblock task to remove annotations and move filters to genotype level - # 3.1.16 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl index 8be4e1a5fc..31900c8c6f 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl @@ -44,7 +44,7 @@ import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" # WORKFLOW DEFINITION workflow ExomeGermlineSingleSample { - String pipeline_version = "3.1.17" + String pipeline_version = "3.1.18" input { diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md index c011691d1f..bca633e289 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md @@ -1,3 +1,8 @@ +# 1.0.15 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. 
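The WholeGenomeGermlineSingleSample test inputs a little further down scale HaplotypeCaller memory through a Cromwell nested-input key ("WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier": 2) rather than by editing any WDL. A sketch of the task-side idiom that makes such a key work; the task name, base memory, and command are illustrative assumptions:

```wdl
version 1.0

task MemoryMultiplierSketch {
  input {
    File input_bam
    Int base_memory_mb = 6500      # illustrative default
    Float memory_multiplier = 1.0  # overridable from the inputs JSON
  }
  Int memory_mb = ceil(base_memory_mb * memory_multiplier)
  command <<<
    echo "would run with ~{memory_mb} MB on ~{basename(input_bam)}"
  >>>
  runtime {
    docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0"
    memory: "~{memory_mb} MiB"
  }
  output {
    String log = read_string(stdout())
  }
}
```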
+ # 1.0.14 2023-12-14 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl index 479e6b914a..8548e22478 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl @@ -50,7 +50,7 @@ workflow UltimaGenomicsWholeGenomeGermline { filtering_model_no_gt_name: "String describing the optional filtering model; default set to rf_model_ignore_gt_incl_hpol_runs" } - String pipeline_version = "1.0.14" + String pipeline_version = "1.0.15" References references = alignment_references.references diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index 052c66c391..38fdadfaa8 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -1,3 +1,8 @@ +# 3.1.19 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 3.1.18 2023-12-14 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl index 2b1fad60a3..23e9d76845 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl @@ -40,7 +40,7 @@ import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeGermlineSingleSample { - String pipeline_version = "3.1.18" + String pipeline_version = "3.1.19" input { diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json index 5d7841519a..94f90073c8 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json @@ -82,5 +82,6 @@ "agg_preemptible_tries": 3 }, - "WholeGenomeGermlineSingleSample.dragen_maximum_quality_mode": true + "WholeGenomeGermlineSingleSample.dragen_maximum_quality_mode": true, + "WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier":2 } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json index d52c139fd2..c4b9608f29 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json @@ -81,5 +81,6 @@ "agg_preemptible_tries": 3 }, - "WholeGenomeGermlineSingleSample.dragen_functional_equivalence_mode": true + 
"WholeGenomeGermlineSingleSample.dragen_functional_equivalence_mode": true, + "WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier":2 } diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md index 676deb72b6..12af2b9efb 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md @@ -1,3 +1,8 @@ +# 2.1.17 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. Header documentation change for RAW_GT_COUNT annotation. + # 2.1.16 2023-12-14 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 90dff51c08..27263d6150 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.1.16" + String pipeline_version = "2.1.17" input { @@ -183,7 +183,7 @@ workflow VariantCalling { calling_interval_list = calling_interval_list, is_gvcf = make_gvcf, extra_args = if (skip_reblocking == false) then "--no-overlaps" else "", - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0", preemptible_tries = agg_preemptible_tries } diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md index 53166ccfe6..129c68527a 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md @@ -1,3 +1,8 @@ +# 1.0.15 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.0.14 2023-12-14 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl index 3946c87545..dd411ed458 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl @@ -43,7 +43,7 @@ workflow UltimaGenomicsWholeGenomeCramOnly { save_bam_file: "If true, then save intermeidate ouputs used by germline pipeline (such as the output BAM) otherwise they won't be kept as outputs." 
} - String pipeline_version = "1.0.14" + String pipeline_version = "1.0.15" References references = alignment_references.references diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/test_inputs/Scientific/HCC1187.json b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/test_inputs/Scientific/HCC1187.json index 1dbd4f99e3..340a82d66e 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/test_inputs/Scientific/HCC1187.json +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/test_inputs/Scientific/HCC1187.json @@ -41,5 +41,6 @@ "ref_dbsnp": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", "ref_dbsnp_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", "wgs_coverage_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list" - } + }, + "UltimaGenomicsWholeGenomeCramOnly.AlignmentAndMarkDuplicates.MarkDuplicatesSpark.memory_mb":300000 } diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md index ba50f4ee6d..64fd9a9fb1 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md @@ -1,3 +1,8 @@ +# 1.12.16 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.12.15 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl index c7a35494db..af3b1e57ec 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/Qc.wdl" as Qc workflow IlluminaGenotypingArray { - String pipeline_version = "1.12.15" + String pipeline_version = "1.12.16" input { String sample_alias diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md index 3efa30e876..0ac74c9794 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md @@ -1,3 +1,8 @@ +# 1.1.10 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.1.9 2023-08-01 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl index 653b8cefad..3021fe6a4c 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl @@ -9,7 +9,7 @@ workflow BroadInternalImputation { description: "Push outputs of Imputation.wdl to TDR dataset table ImputationOutputsTable and split out Imputation arrays into ImputationWideOutputsTable." 
allowNestedInputs: true } - String pipeline_version = "1.1.9" + String pipeline_version = "1.1.10" input { # inputs to wrapper task diff --git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md index ba8009535f..f4b807441a 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md @@ -1,3 +1,8 @@ +# 1.1.6 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.1.5 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl index 185ab12d95..762eef0709 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl @@ -9,7 +9,7 @@ workflow BroadInternalArrays { description: "Push outputs of Arrays.wdl to TDR dataset table ArraysOutputsTable." } - String pipeline_version = "1.1.5" + String pipeline_version = "1.1.6" input { # inputs to wrapper task diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md index 69de3d250d..f30781fe04 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md @@ -1,3 +1,8 @@ +# 1.0.16 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.0.15 2023-12-14 (Date of Last Commit) diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl index fe1761415a..6d5a522cf8 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl @@ -6,7 +6,7 @@ import "../../../../../../../pipelines/broad/qc/CheckFingerprint.wdl" as FP workflow BroadInternalUltimaGenomics { - String pipeline_version = "1.0.15" + String pipeline_version = "1.0.16" input { diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md index 782832746f..45e54d0d46 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md @@ -1,3 +1,8 @@ +# 1.0.28 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0.
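The BroadInternal* wrappers versioned above share one shape: a thin workflow that runs the public pipeline and pushes its outputs to a TDR dataset table, with allowNestedInputs letting an inputs JSON reach into the wrapped calls. A minimal sketch of that wrapper shape; the input and output names are placeholders, not the real interfaces:

```wdl
version 1.0

workflow InternalWrapperSketch {
  meta {
    description: "Run a public WARP pipeline, then push its outputs to a TDR dataset table."
    # Lets an inputs JSON address nested calls, e.g. Wrapper.Pipeline.Task.some_input.
    allowNestedInputs: true
  }

  # Bumped in lockstep with the top entry of the wrapper's changelog.
  String pipeline_version = "1.0.0"

  input {
    String tdr_dataset_id
  }

  output {
    String version = pipeline_version
    String target_dataset = tdr_dataset_id
  }
}
```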
+ # 1.0.27 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl index 466ea8c070..293b03a33e 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl @@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow BroadInternalRNAWithUMIs { - String pipeline_version = "1.0.27" + String pipeline_version = "1.0.28" input { # input needs to be either "hg19" or "hg38" diff --git a/pipelines/broad/qc/CheckFingerprint.changelog.md b/pipelines/broad/qc/CheckFingerprint.changelog.md index cd3db0e0e3..e47ab3c0f4 100644 --- a/pipelines/broad/qc/CheckFingerprint.changelog.md +++ b/pipelines/broad/qc/CheckFingerprint.changelog.md @@ -1,3 +1,8 @@ +# 1.0.15 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.0.14 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/qc/CheckFingerprint.wdl b/pipelines/broad/qc/CheckFingerprint.wdl index 24d2c5f3ea..dd8d48b424 100644 --- a/pipelines/broad/qc/CheckFingerprint.wdl +++ b/pipelines/broad/qc/CheckFingerprint.wdl @@ -24,7 +24,7 @@ import "../../../tasks/broad/Qc.wdl" as Qc workflow CheckFingerprint { - String pipeline_version = "1.0.14" + String pipeline_version = "1.0.15" input { File? input_vcf diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md index 7a0b8b9a7d..f6fc478efb 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md @@ -1,10 +1,14 @@ +# 3.1.18 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 3.1.17 2023-12-14 (Date of Last Commit) * Updated GATK for Reblock task to version 4.5.0.0 * Added options to Reblock task to remove annotations and move filters to genotype level - # 3.1.16 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl index 46607e6499..a26b442550 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl @@ -7,7 +7,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow ExomeReprocessing { - String pipeline_version = "3.1.17" + String pipeline_version = "3.1.18" input { File? input_cram diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md index af5579f916..c33dcb6544 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md @@ -1,10 +1,14 @@ +# 3.1.20 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. 
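The reprocessing workflows versioned here all declare File? input_cram alongside an equivalent BAM input and run from whichever one the caller supplies. A minimal sketch of that optional-input pattern; using select_first to pick the defined file is an illustrative simplification, not the repo's exact logic:

```wdl
version 1.0

workflow OptionalInputSketch {
  input {
    # Exactly one of these is expected per run.
    File? input_cram
    File? input_bam
  }

  # select_first takes the first defined value and errors if both are missing.
  File source = select_first([input_cram, input_bam])

  output {
    String chosen = basename(source)
  }
}
```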
+ # 3.1.19 2023-12-14 (Date of Last Commit) * Updated GATK for Reblock task to version 4.5.0.0 * Added options to Reblock task to remove annotations and move filters to genotype level - # 3.1.18 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl index be94c054af..391afed2c2 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl @@ -5,7 +5,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalExomeReprocessing { - String pipeline_version = "3.1.19" + String pipeline_version = "3.1.20" input { diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md index 152a5ce375..21ed6a7962 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md @@ -1,10 +1,14 @@ +# 2.1.20 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 2.1.19 2023-12-14 (Date of Last Commit) * Updated GATK for Reblock task to version 4.5.0.0 * Added options to Reblock task to remove annotations and move filters to genotype level - # 2.1.18 2023-12-08 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl index a80fde6b94..f4dd1af8d9 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalWholeGenomeReprocessing { - String pipeline_version = "2.1.19" + String pipeline_version = "2.1.20" input { File? input_cram diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md index 83d465aae4..9b7163a4a0 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.1.19 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 3.1.18 2023-12-14 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl index f60603e7bd..6cfd2bbf39 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeReprocessing { - String pipeline_version = "3.1.18" + String pipeline_version = "3.1.19" input { File? input_cram diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md index bb98577d93..3007372a10 100644 --- a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md +++ b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md @@ -1,3 +1,8 @@ +# 1.0.16 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. 
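The CEMBA task hunks later in this diff pair the new GATK image with a disk heuristic: request a multiple of the input size, floored at 1 GB so tiny inputs still get a usable disk (2x for AddReadGroup, 4.5x for MethylationTypeCaller). A compact sketch of that runtime idiom; the task body is illustrative:

```wdl
version 1.0

task DiskSizingSketch {
  input {
    File input_file
    Float disk_multiplier = 2.0
  }
  Float input_size = size(input_file, "GB")
  command <<<
    ls -lh ~{input_file}
  >>>
  runtime {
    docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0"
    # 1 GB floor, then scale with the input, as in the CEMBA tasks below.
    disks: "local-disk " + ceil(disk_multiplier * (if input_size < 1 then 1 else input_size)) + " HDD"
    cpu: 1
  }
  output {
    String listing = read_string(stdout())
  }
}
```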
+ # 1.0.15 2023-07-27 (Date of Last Commit) diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl index d88fc95fed..9787fa6dcd 100644 --- a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl +++ b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl @@ -20,7 +20,7 @@ import "../../../tasks/broad/RNAWithUMIsTasks.wdl" as tasks workflow RNAWithUMIsPipeline { - String pipeline_version = "1.0.15" + String pipeline_version = "1.0.16" input { File? bam diff --git a/pipelines/cemba/cemba_methylcseq/CEMBA.changelog.md b/pipelines/cemba/cemba_methylcseq/CEMBA.changelog.md index 37c70e3f93..e778cd427c 100644 --- a/pipelines/cemba/cemba_methylcseq/CEMBA.changelog.md +++ b/pipelines/cemba/cemba_methylcseq/CEMBA.changelog.md @@ -1,3 +1,8 @@ +# 1.1.6 +2023-12-18 (Date of Last Commit) + +* Updated to GATK version 4.5.0.0. + # 1.1.5 2023-01-13 (Date of Last Commit) diff --git a/pipelines/cemba/cemba_methylcseq/CEMBA.wdl b/pipelines/cemba/cemba_methylcseq/CEMBA.wdl index 49998776cf..ab2df2802f 100644 --- a/pipelines/cemba/cemba_methylcseq/CEMBA.wdl +++ b/pipelines/cemba/cemba_methylcseq/CEMBA.wdl @@ -57,7 +57,7 @@ workflow CEMBA { } # version of this pipeline - String pipeline_version = "1.1.5" + String pipeline_version = "1.1.6" # trim off hardcoded sequence adapters call Trim as TrimAdapters { @@ -1008,7 +1008,7 @@ task AddReadGroup { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" # if the input size is less than 1 GB adjust to min input size of 1 GB # disks should be set to 2 * input file size disks: "local-disk " + ceil(2 * (if input_size < 1 then 1 else input_size)) + " HDD" @@ -1063,7 +1063,7 @@ task MethylationTypeCaller { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" # if the input size is less than 1 GB adjust to min input size of 1 GB disks: "local-disk " + ceil(4.5 * (if input_size < 1 then 1 else input_size)) + " HDD" cpu: 1 diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index ecd2478024..04ef2db368 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,8 +1,32 @@ + +# 3.3.0 +2024-02-28 (Date of Last Commit) + +* Added the gene expression library-level metrics CSV as output of the Multiome pipeline; this is produced by the Optimus subworkflow + +# 3.2.1 +2024-02-29 (Date of Last Commit) + +* Moved the disk and mem for the Multiome Join Barcodes task into the task inputs section + + +# 3.2.0 +2024-02-22 (Date of Last Commit) + +* Updated StarAlign.MergeStarOutput to add a shard number to the metrics files +* Removed ref_genome_fasta input from Multiome WDL and JSON + +# 3.1.3 +2024-02-07 (Date of Last Commit) + +* Updated the Metrics tasks to exclude mitochondrial genes from reads_mapped_uniquely, reads_mapped_multiple and reads_mapped_exonic, reads_mapped_exonic_as and reads_mapped_intergenic + # 3.1.2 2024-02-01 (Date of Last Commit) * Add new paired-tag task to parse sample barcodes from cell barcodes when preindexing is set to true; this does not affect the Multiome pipeline + # 3.1.1 2024-01-30 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 16113b5e8c..5db2c67c0b 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -6,7 +6,8 @@ import "../../../tasks/skylab/H5adUtils.wdl" as 
H5adUtils import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl" as CellBender workflow Multiome { - String pipeline_version = "3.1.2" + + String pipeline_version = "3.3.0" input { String input_id @@ -18,7 +19,6 @@ workflow Multiome { Array[File]? gex_i1_fastq File tar_star_reference File annotations_gtf - File ref_genome_fasta File? mt_genes Int tenx_chemistry_version = 3 Int emptydrops_lower = 100 @@ -61,7 +61,6 @@ workflow Multiome { output_bam_basename = input_id + "_gex", tar_star_reference = tar_star_reference, annotations_gtf = annotations_gtf, - ref_genome_fasta = ref_genome_fasta, mt_genes = mt_genes, tenx_chemistry_version = tenx_chemistry_version, whitelist = gex_whitelist, @@ -143,6 +142,7 @@ workflow Multiome { Array[File?] multimappers_Rescue_matrix = Optimus.multimappers_Rescue_matrix Array[File?] multimappers_PropUnique_matrix = Optimus.multimappers_PropUnique_matrix File? gex_aligner_metrics = Optimus.aligner_metrics + File? library_metrics = Optimus.library_metrics # cellbender outputs File? cell_barcodes_csv = CellBender.cell_csv diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index 13d51a928c..170caa2aed 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,3 +1,8 @@ +# 1.1.8 +2024-02-07 (Date of Last Commit) + +* Updated the Metrics tasks to exclude mitochondrial genes from reads_mapped_uniquely, reads_mapped_multiple and reads_mapped_exonic, reads_mapped_exonic_as and reads_mapped_intergenic + # 1.1.7 2024-02-01 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 4db04a9968..3dd81d7bf5 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -41,7 +41,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "1.1.7" + String pipeline_version = "1.1.8" parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json index 902b564388..7d15111f38 100644 --- a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -16,7 +16,6 @@ "Multiome.atac_r3_fastq":[ "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R3_atac.fastq.gz" ], - "Multiome.ref_genome_fasta":"gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", "Multiome.tar_bwa_reference":"gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", "Multiome.tar_star_reference":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", "Multiome.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", diff --git a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json index 846b91ed2d..a5ddf2c947 100644 --- a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json @@ -25,7 +25,6 @@ 
"gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_atac_S1_L001_R3_001.fastq.gz", "gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_atac_S1_L002_R3_001.fastq.gz" ], - "Multiome.ref_genome_fasta":"gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", "Multiome.tar_bwa_reference":"gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", "Multiome.tar_star_reference":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", "Multiome.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 9123a32d64..29100d3d80 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,23 @@ + +# 6.5.0 +2024-02-28 (Date of Last Commit) + +* Added a library-level metrics CSV as output of the Optimus workflow; this iteration includes read-level metrics + +# 6.4.1 +2024-02-29 (Date of Last Commit) +* Added mem and disk to inputs of Join Barcodes task of Multiome workflow; does not impact the Optimus workflow + + +# 6.4.0 +2024-02-21 (Date of Last Commit) +* Updated StarAlign.MergeStarOutput to add a shard number to the metrics files +* Removed ref_genome_fasta input from Optimus WDL and JSON + +# 6.3.6 +2024-02-07 (Date of Last Commit) +* Updated the Metrics tasks to exclude mitochondrial genes from reads_mapped_uniquely, reads_mapped_multiple and reads_mapped_exonic, reads_mapped_exonic_as and reads_mapped_intergenic + # 6.3.5 2024-01-30 (Date of Last Commit) * Added task GetNumSplits before FastqProcess ATAC task to determine the number of splits based on the bwa-mem2 machine specs diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index f4a07d840f..f5a9f9f304 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -29,7 +29,6 @@ workflow Optimus { # organism reference parameters File tar_star_reference File annotations_gtf - File ref_genome_fasta File? mt_genes String? 
soloMultiMappers @@ -65,7 +64,9 @@ # version of this pipeline - String pipeline_version = "6.3.5" + + String pipeline_version = "6.5.0" + # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays Array[Int] indices = range(length(r1_fastq)) @@ -86,7 +87,6 @@ workflow Optimus { input_name_metadata_field: "String that describes the metadata field containing the input_name" tar_star_reference: "star genome reference" annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)" - ref_genome_fasta: "genome fasta file (must match star reference)" whitelist: "10x genomics cell barcode whitelist" tenx_chemistry_version: "10X Genomics v2 (10 bp UMI) or v3 chemistry (12bp UMI)" force_no_check: "Set to true to override input checks and allow pipeline to proceed with invalid input" @@ -146,6 +146,7 @@ workflow Optimus { input: bam_input = MergeBam.output_bam, mt_genes = mt_genes, + original_gtf = annotations_gtf, input_id = input_id } @@ -166,7 +167,8 @@ workflow Optimus { summary = STARsoloFastq.summary, align_features = STARsoloFastq.align_features, umipercell = STARsoloFastq.umipercell, - input_id = input_id + input_id = input_id, + counting_mode = counting_mode } if (counting_mode == "sc_rna"){ call RunEmptyDrops.RunEmptyDrops { @@ -203,7 +205,11 @@ workflow Optimus { features = STARsoloFastq.features_sn_rna, matrix = STARsoloFastq.matrix_sn_rna, cell_reads = STARsoloFastq.cell_reads_sn_rna, - input_id = input_id + input_id = input_id, + counting_mode = "sc_rna", + summary = STARsoloFastq.summary_sn_rna, + align_features = STARsoloFastq.align_features_sn_rna, + umipercell = STARsoloFastq.umipercell_sn_rna } call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{ input: @@ -239,10 +246,12 @@ workflow Optimus { File gene_metrics = GeneMetrics.gene_metrics File? cell_calls = RunEmptyDrops.empty_drops_result File? aligner_metrics = MergeStarOutputs.cell_reads_out + File? library_metrics = MergeStarOutputs.library_metrics Array[File?] multimappers_EM_matrix = STARsoloFastq.multimappers_EM_matrix Array[File?] multimappers_Uniform_matrix = STARsoloFastq.multimappers_Uniform_matrix Array[File?] multimappers_Rescue_matrix = STARsoloFastq.multimappers_Rescue_matrix Array[File?]
multimappers_PropUnique_matrix = STARsoloFastq.multimappers_PropUnique_matrix + # h5ad File h5ad_output_file = final_h5ad_output diff --git a/pipelines/skylab/optimus/example_inputs/human_v2_example.json b/pipelines/skylab/optimus/example_inputs/human_v2_example.json index 04e54e6d80..0b0da39f58 100644 --- a/pipelines/skylab/optimus/example_inputs/human_v2_example.json +++ b/pipelines/skylab/optimus/example_inputs/human_v2_example.json @@ -15,6 +15,5 @@ "Optimus.tar_star_reference": "gs://gcp-public-data--broad-references/hg38/v0/star/star_2.7.9a_primary_gencode_human_v27.tar", "Optimus.input_id": "pbmc4k_human", "Optimus.chemistry": "tenX_v2", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf" } diff --git a/pipelines/skylab/optimus/example_inputs/human_v3_example.json b/pipelines/skylab/optimus/example_inputs/human_v3_example.json index 82dd8c219a..6a0e8edf98 100644 --- a/pipelines/skylab/optimus/example_inputs/human_v3_example.json +++ b/pipelines/skylab/optimus/example_inputs/human_v3_example.json @@ -15,6 +15,5 @@ "Optimus.tar_star_reference": "gs://gcp-public-data--broad-references/hg38/v0/star/star_2.7.9a_primary_gencode_human_v27.tar", "Optimus.input_id": "pbmc_human_v3", "Optimus.chemistry": "tenX_v3", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf" } diff --git a/pipelines/skylab/optimus/example_inputs/mouse_v2_example.json b/pipelines/skylab/optimus/example_inputs/mouse_v2_example.json index 45981c2ac7..8efad7a498 100644 --- a/pipelines/skylab/optimus/example_inputs/mouse_v2_example.json +++ b/pipelines/skylab/optimus/example_inputs/mouse_v2_example.json @@ -27,6 +27,5 @@ "Optimus.tar_star_reference": "gs://gcp-public-data--broad-references/mm10/v0/star/star_2.7.9a_primary_gencode_mouse_vM21.tar", "Optimus.input_id": "neurons2k_mouse", "Optimus.chemistry": "tenX_v2", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/mm10/v0/GRCm38.primary_assembly.genome.fa" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.gtf" } diff --git a/pipelines/skylab/optimus/example_inputs/mouse_v2_snRNA_example.json b/pipelines/skylab/optimus/example_inputs/mouse_v2_snRNA_example.json index 293c9f326f..e3b905f62d 100644 --- a/pipelines/skylab/optimus/example_inputs/mouse_v2_snRNA_example.json +++ b/pipelines/skylab/optimus/example_inputs/mouse_v2_snRNA_example.json @@ -24,7 +24,6 @@ "Optimus.input_id": "nuclei_2k_mouse", "Optimus.chemistry": "tenX_v2", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/mm10/v0/GRCm38.primary_assembly.genome.fa", "Optimus.counting_mode": "sn_rna", "Optimus.count_exons": true } diff --git 
a/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json index ff5a02caaf..612659d25c 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json @@ -15,6 +15,5 @@ "Optimus.input_id": "pbmc_human_v3", "Optimus.tenx_chemistry_version": "3", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf", - "Optimus.star_strand_mode": "Forward", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa" + "Optimus.star_strand_mode": "Forward" } diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json index bbf625ef27..0dc26af9fd 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json @@ -27,6 +27,5 @@ "Optimus.input_id": "neurons2k_mouse", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Unstranded", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/GRCm39/GRCm39.primary_assembly.genome.fa.gz" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf" } diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json index 239b7d1fcb..787a1a8347 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json @@ -24,7 +24,6 @@ "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Unstranded", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/GRCm39/GRCm39.primary_assembly.genome.fa.gz", "Optimus.counting_mode": "sn_rna", "Optimus.count_exons": true } diff --git a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json index 0f5ce301f1..773af4f2f4 100644 --- a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json +++ b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json @@ -15,8 +15,7 @@ "Optimus.input_id": "8k_pbmc", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Unstranded", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf" } diff --git a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json index 2581f222dc..98c9c9912d 100644 --- a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json +++ b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json @@ -15,8 +15,7 @@ "Optimus.input_id": 
"8k_pbmc", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Forward", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf", - "Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf" } diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index 06b2ec320b..cdff655469 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,25 @@ +# 0.3.0 + +2024-03-01 (Date of Last Commit) + +* Added the gene expression library-level metrics CSV as output of the Paired-tag pipeline; this is produced by the Optimus subworkflow + +# 0.2.0 +2024-02-29 (Date of Last Commit) +* Added mem and disk to inputs of Join Barcodes task of Multiome workflow; does not impact the Paired-tag workflow + + +# 0.1.0 +2024-02-22 (Date of Last Commit) + +* Updated StarAlign output metrics to include shard ids, which is called by Optimus +* Remove ref_genome_fasta from Optimus input + +# 0.0.7 +2024-02-07 (Date of Last Commit) + +* Updated the Metrics tasks to exclude mitochondrial genes from reads_mapped_uniquely, reads_mapped_multiple and reads_mapped_exonic, reads_mapped_exonic_as and reads_mapped_intergenic + # 0.0.6 2024-02-01 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index bc0e6763f7..05aa4867ee 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -5,7 +5,7 @@ import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing workflow PairedTag { - String pipeline_version = "0.0.6" + String pipeline_version = "0.3.0" input { String input_id @@ -17,7 +17,6 @@ workflow PairedTag { Array[File]? gex_i1_fastq File tar_star_reference File annotations_gtf - File ref_genome_fasta File? mt_genes Int tenx_chemistry_version = 3 Int emptydrops_lower = 100 @@ -55,7 +54,6 @@ workflow PairedTag { output_bam_basename = input_id + "_gex", tar_star_reference = tar_star_reference, annotations_gtf = annotations_gtf, - ref_genome_fasta = ref_genome_fasta, mt_genes = mt_genes, tenx_chemistry_version = tenx_chemistry_version, whitelist = gex_whitelist, @@ -127,5 +125,6 @@ workflow PairedTag { File gene_metrics_gex = Optimus.gene_metrics File? cell_calls_gex = Optimus.cell_calls File h5ad_output_file_gex = Optimus.h5ad_output_file + File? 
library_metrics = Optimus.library_metrics } } diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index f540bdc710..c02c3fd51b 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,30 @@ +# 3.1.2 + +2024-02-28 (Date of Last Commit) + +* Updated the Optimus workflow to produce a library-level metrics CSV; this does not impact the SlideSeq pipeline + +# 3.1.1 +2024-02-29 (Date of Last Commit) +* Added mem and disk to inputs of Join Barcodes task of Multiome workflow; does not impact the SlideSeq workflow + + +# 3.1.0 +2024-02-07 (Date of Last Commit) + +* Updated StarAlign output metrics to include shard IDs + +# 3.0.1 +2024-02-13 (Date of Last Commit) + +* Updated the Metrics tasks to exclude mitochondrial genes from reads_mapped_uniquely, reads_mapped_multiple and reads_mapped_exonic, reads_mapped_exonic_as and reads_mapped_intergenic; this does affect the SlideSeq workflow + +# 3.0.0 +2024-02-12 (Date of Last Commit) + +* Updated the SlideSeq WDL output to use the h5ad format in place of Loom + + # 2.1.6 2024-01-30 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index c469d7fe56..4f241b24d8 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -3,7 +3,7 @@ version 1.0 import "../../../tasks/skylab/StarAlign.wdl" as StarAlign import "../../../tasks/skylab/FastqProcessing.wdl" as FastqProcessing import "../../../tasks/skylab/Metrics.wdl" as Metrics -import "../../../tasks/skylab/LoomUtils.wdl" as LoomUtils +import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/CheckInputs.wdl" as OptimusInputChecks import "../../../tasks/skylab/MergeSortBam.wdl" as Merge @@ -23,7 +23,7 @@ import "../../../tasks/skylab/MergeSortBam.wdl" as Merge workflow SlideSeq { - String pipeline_version = "2.1.6" + String pipeline_version = "3.1.2" input { Array[File] r1_fastq @@ -50,8 +50,8 @@ workflow SlideSeq { } call StarAlign.STARGenomeRefVersion as ReferenceCheck { - input: - tar_star_reference = tar_star_reference + input: + tar_star_reference = tar_star_reference } call Metrics.FastqMetricsSlideSeq as FastqMetrics { @@ -91,11 +91,13 @@ workflow SlideSeq { call Metrics.CalculateGeneMetrics as GeneMetrics { input: bam_input = MergeBam.output_bam, + original_gtf = annotations_gtf, input_id = input_id } call Metrics.CalculateUMIsMetrics as UMIsMetrics { input: bam_input = MergeBam.output_bam, + original_gtf = annotations_gtf, input_id = input_id } @@ -114,7 +116,7 @@ workflow SlideSeq { input_id = input_id } if ( !count_exons ) { - call LoomUtils.OptimusLoomGeneration as SlideseqLoomGeneration{ + call H5adUtils.OptimusH5adGeneration as SlideseqH5adGeneration{ input: input_id = input_id, annotation_file = annotations_gtf, @@ -135,7 +137,7 @@ workflow SlideSeq { matrix = STARsoloFastqSlideSeq.matrix_sn_rna, input_id = input_id } - call LoomUtils.SingleNucleusOptimusLoomOutput as SlideseqLoomGenerationWithExons{ + call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{ input: input_id = input_id, annotation_file = annotations_gtf, @@ -149,10 +151,9 @@ workflow SlideSeq { gene_id_exon = MergeStarOutputsExons.col_index, pipeline_version = "SlideSeq_v~{pipeline_version}" } - } - File final_loom_output = select_first([SlideseqLoomGenerationWithExons.loom_output, SlideseqLoomGeneration.loom_output]) + File
final_h5ad_output = select_first([OptimusH5adGenerationWithExons.h5ad_output, SlideseqH5adGeneration.h5ad_output]) output { String pipeline_version_out = pipeline_version @@ -173,8 +174,7 @@ workflow SlideSeq { File fastq_reads_per_umi = FastqMetrics.numReads_perUMI - # loom - File? loom_output_file = final_loom_output - + # h5ad + File? h5ad_output_file = final_h5ad_output } } diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index b0e84df63f..1d030b4af5 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,4 +1,14 @@ -# 1.2.28 +# 1.3.1 +2024-02-28 (Date of Last Commit) + +* Updated the Optimus workflow to produce a library-level metrics CSV; this does not impact the Single-nucleus Multi Sample Smart-seq2 pipeline + +# 1.3.0 +2024-01-22 (Date of Last Commit) + +* Updated StarAlign output metrics to include shard ids + + # 1.2.28 2024-01-11 (Date of Last Commit) * Increased memory for MergeStarOutputs in StarAlign.wdl, RunEmptyDrops in RunEmptyDrops.wdl, OptimusH5ad in H5adUtils.wdl and GeneMetrics in Metrics.wdl diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index d0bf9dbb2f..de5824ae13 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -40,7 +40,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { String? input_id_metadata_field } # Version of this pipeline - String pipeline_version = "1.2.28" + String pipeline_version = "1.3.1" if (false) { String? 
none = "None" diff --git a/pipelines/skylab/snM3C/snM3C.changelog.md b/pipelines/skylab/snM3C/snM3C.changelog.md index b24145073b..9a77ee7eff 100644 --- a/pipelines/skylab/snM3C/snM3C.changelog.md +++ b/pipelines/skylab/snM3C/snM3C.changelog.md @@ -1,3 +1,21 @@ +# 3.0.0 +2024-02-23 (Date of Last Commit) + +* Updated the snM3C docker to include the latest changes to the CEMBA repository; this impacts the scientific outputs +* Added docker as a workflow-level input +* Reverted the Hisat alignments to use the --no-repeat-index parameter + +# 2.0.1 +2024-2-15 (Date of Last Commit) + +* Updated the snM3C task memory, disk, and CPUs + +# 2.0.0 +2024-2-13 (Date of Last Commit) + +* Merged several tasks in snM3C.wdl to reduce the cost of running this pipeline +* Removed several final outputs from snM3C.wdl + # 1.0.1 2024-01-31 (Date of Last Commit) @@ -6,4 +24,4 @@ # 1.0.0 2023-08-01 (Date of Last Commit) -* First release of the snM3C workflow \ No newline at end of file +* First release of the snM3C workflow diff --git a/pipelines/skylab/snM3C/snM3C.wdl b/pipelines/skylab/snM3C/snM3C.wdl index 3feefb6787..0413581aae 100644 --- a/pipelines/skylab/snM3C/snM3C.wdl +++ b/pipelines/skylab/snM3C/snM3C.wdl @@ -23,11 +23,13 @@ workflow snM3C { Int num_downstr_bases = 2 Int compress_level = 5 Int batch_number - + String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.3" + String single_end_hisat_cpu_platform = "Intel Ice Lake" + String merge_sort_analyze_cpu_platform = "Intel Ice Lake" } # version of the pipeline - String pipeline_version = "1.0.1" + String pipeline_version = "3.0.0" call Demultiplexing { input: @@ -35,6 +37,7 @@ workflow snM3C { fastq_input_read2 = fastq_input_read2, random_primer_indexes = random_primer_indexes, plate_id = plate_id, + docker = docker, batch_number = batch_number } @@ -49,9 +52,10 @@ workflow snM3C { r2_left_cut = r2_left_cut, r2_right_cut = r2_right_cut, min_read_length = min_read_length, + docker = docker, plate_id = plate_id } - + call Hisat_3n_pair_end_mapping_dna_mode { input: r1_trimmed_tar = Sort_and_trim_r1_and_r2.r1_trimmed_fq_tar, @@ -59,72 +63,40 @@ workflow snM3C { tarred_index_files = tarred_index_files, genome_fa = genome_fa, chromosome_sizes = chromosome_sizes, + docker = docker, plate_id = plate_id } - call Separate_unmapped_reads { + call Separate_and_split_unmapped_reads { input: hisat3n_bam_tar = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_paired_end_bam_tar, min_read_length = min_read_length, - plate_id = plate_id - } - - call Split_unmapped_reads { - input: - unmapped_fastq_tar = Separate_unmapped_reads.unmapped_fastq_tar, - min_read_length = min_read_length, - plate_id = plate_id + plate_id = plate_id, + docker = docker } - call Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name { + call hisat_single_end { input: - split_fq_tar = Split_unmapped_reads.split_fq_tar, + split_fq_tar = Separate_and_split_unmapped_reads.split_fq_tar, tarred_index_files = tarred_index_files, genome_fa = genome_fa, - plate_id = plate_id + plate_id = plate_id, + docker = docker, + single_end_hisat_cpu_platform = single_end_hisat_cpu_platform } - call remove_overlap_read_parts { + call merge_sort_analyze { input: - bam = Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name.merge_sorted_bam_tar, - plate_id = plate_id - } - - call merge_original_and_split_bam_and_sort_all_reads_by_name_and_position { - input: - bam = Separate_unmapped_reads.unique_bam_tar, - split_bam = remove_overlap_read_parts.output_bam_tar, - plate_id = 
plate_id - } - - call call_chromatin_contacts { - input: - name_sorted_bam = merge_original_and_split_bam_and_sort_all_reads_by_name_and_position.name_sorted_bam, - plate_id = plate_id - } - - call dedup_unique_bam_and_index_unique_bam { - input: - bam = merge_original_and_split_bam_and_sort_all_reads_by_name_and_position.position_sorted_bam, - plate_id = plate_id - } - - call unique_reads_allc { - input: - bam_and_index_tar = dedup_unique_bam_and_index_unique_bam.output_tar, - genome_fa = genome_fa, - num_upstr_bases = num_upstr_bases, - num_downstr_bases = num_downstr_bases, - compress_level = compress_level, - plate_id = plate_id - } - - call unique_reads_cgn_extraction { - input: - allc_tar = unique_reads_allc.allc, - tbi_tar = unique_reads_allc.tbi, - chrom_size_path = chromosome_sizes, - plate_id = plate_id + paired_end_unique_tar = Separate_and_split_unmapped_reads.unique_bam_tar, + read_overlap_tar = hisat_single_end.remove_overlaps_output_bam_tar, + genome_fa = genome_fa, + num_upstr_bases = num_upstr_bases, + num_downstr_bases = num_downstr_bases, + compress_level = compress_level, + chromosome_sizes = chromosome_sizes, + plate_id = plate_id, + docker = docker, + merge_sort_analyze_cpu_platform = merge_sort_analyze_cpu_platform } } @@ -132,35 +104,28 @@ workflow snM3C { input: trimmed_stats = Sort_and_trim_r1_and_r2.trim_stats_tar, hisat3n_stats = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_paired_end_stats_tar, - r1_hisat3n_stats = Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name.hisat3n_dna_split_reads_summary_R1_tar, - r2_hisat3n_stats = Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name.hisat3n_dna_split_reads_summary_R2_tar, - dedup_stats = dedup_unique_bam_and_index_unique_bam.dedup_stats_tar, - chromatin_contact_stats = call_chromatin_contacts.chromatin_contact_stats, - allc_uniq_reads_stats = unique_reads_allc.allc_uniq_reads_stats, - unique_reads_cgn_extraction_tbi = unique_reads_cgn_extraction.output_tbi_tar, - plate_id = plate_id + r1_hisat3n_stats = hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, + r2_hisat3n_stats = hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, + dedup_stats = merge_sort_analyze.dedup_stats_tar, + chromatin_contact_stats = merge_sort_analyze.chromatin_contact_stats, + allc_uniq_reads_stats = merge_sort_analyze.allc_uniq_reads_stats, + unique_reads_cgn_extraction_tbi = merge_sort_analyze.extract_allc_output_tbi_tar, + plate_id = plate_id, + docker = docker } output { File MappingSummary = summary.mapping_summary - Array[File] trimmed_stats = Sort_and_trim_r1_and_r2.trim_stats_tar - Array[File] r1_trimmed_fq = Sort_and_trim_r1_and_r2.r1_trimmed_fq_tar - Array[File] r2_trimmed_fq = Sort_and_trim_r1_and_r2.r2_trimmed_fq_tar - Array[File] hisat3n_stats_tar = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_paired_end_stats_tar - Array[File] hisat3n_bam_tar = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_paired_end_bam_tar - Array[File] unique_bam_tar = Separate_unmapped_reads.unique_bam_tar - Array[File] multi_bam_tar = Separate_unmapped_reads.multi_bam_tar - Array[File] unmapped_fastq_tar = Separate_unmapped_reads.unmapped_fastq_tar - Array[File] split_fq_tar = Split_unmapped_reads.split_fq_tar - Array[File] merge_sorted_bam_tar = Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name.merge_sorted_bam_tar - Array[File] name_sorted_bams = merge_original_and_split_bam_and_sort_all_reads_by_name_and_position.name_sorted_bam - Array[File] pos_sorted_bams = 
merge_original_and_split_bam_and_sort_all_reads_by_name_and_position.position_sorted_bam - Array[File] remove_overlap_read_parts_bam_tar = remove_overlap_read_parts.output_bam_tar - Array[File] dedup_unique_bam_and_index_unique_bam_tar = dedup_unique_bam_and_index_unique_bam.output_tar - Array[File] unique_reads_cgn_extraction_allc = unique_reads_cgn_extraction.output_allc_tar - Array[File] unique_reads_cgn_extraction_tbi = unique_reads_cgn_extraction.output_tbi_tar - Array[File] chromatin_contact_stats = call_chromatin_contacts.chromatin_contact_stats + Array[File] name_sorted_bams = merge_sort_analyze.name_sorted_bam + Array[File] unique_reads_cgn_extraction_allc= merge_sort_analyze.allc + Array[File] unique_reads_cgn_extraction_tbi = merge_sort_analyze.tbi Array[File] reference_version = Hisat_3n_pair_end_mapping_dna_mode.reference_version + Array[File] all_reads_dedup_contacts = merge_sort_analyze.all_reads_dedup_contacts + Array[File] all_reads_3C_contacts = merge_sort_analyze.all_reads_3C_contacts + Array[File] chromatin_contact_stats = merge_sort_analyze.chromatin_contact_stats + Array[File] unique_reads_cgn_extraction_allc_extract = merge_sort_analyze.extract_allc_output_allc_tar + Array[File] unique_reads_cgn_extraction_tbi_extract = merge_sort_analyze.extract_allc_output_tbi_tar + } } @@ -171,12 +136,13 @@ task Demultiplexing { File random_primer_indexes String plate_id Int batch_number + String docker + - String docker_image = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 50 + Int disk_size = 1000 Int mem_size = 10 Int preemptible_tries = 3 - Int cpu = 1 + Int cpu = 8 } command <<< @@ -233,10 +199,10 @@ task Demultiplexing { for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs done - + # Counter for the folder index folder_index=1 - + # Define lists of r1 and r2 fq files R1_files=($(ls | grep "\-R1.fq.gz")) R2_files=($(ls | grep "\-R2.fq.gz")) @@ -256,11 +222,11 @@ task Demultiplexing { done - echo "TAR files created successfully." + echo "TAR files created successfully." >>> runtime { - docker: docker_image + docker: docker disks: "local-disk ${disk_size} HDD" cpu: cpu memory: "${mem_size} GiB" @@ -285,11 +251,11 @@ task Sort_and_trim_r1_and_r2 { Int r2_right_cut Int min_read_length - Int disk_size = 50 - Int mem_size = 10 - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" + Int disk_size = 500 + Int mem_size = 16 + String docker Int preemptible_tries = 3 - Int cpu = 1 + Int cpu = 4 } command <<< @@ -369,11 +335,11 @@ task Hisat_3n_pair_end_mapping_dna_mode{ File chromosome_sizes String plate_id - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" + String docker Int disk_size = 1000 Int mem_size = 64 Int preemptible_tries = 3 - Int cpu = 16 + Int cpu = 48 } command <<< set -euo pipefail @@ -384,28 +350,19 @@ task Hisat_3n_pair_end_mapping_dna_mode{ echo "The reference is $BASE" > ~{plate_id}.reference_version.txt - mkdir reference/ - mkdir fastq/ - - cp ~{tarred_index_files} reference/ - cp ~{genome_fa} reference/ - cp ~{chromosome_sizes} reference/ - cp ~{r1_trimmed_tar} fastq/ - cp ~{r2_trimmed_tar} fastq/ - # untar the index files - cd reference/ echo "Untarring the index files" tar -zxvf ~{tarred_index_files} rm ~{tarred_index_files} + cp ~{genome_fa} . 
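+      # Everything is staged in the working directory: the untarred hisat-3n index,
+      # the FASTA copied above, and the fastqs untarred below, so the aligner can be
+      # pointed at /cromwell_root/$genome_fa_basename without a separate reference/ dir.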
+ #get the basename of the genome_fa file genome_fa_basename=$(basename ~{genome_fa} .fa) echo "samtools faidx $genome_fa_basename.fa" samtools faidx $genome_fa_basename.fa # untar the demultiplexed fastq files - cd ../fastq/ echo "Untarring the fastq files" tar -zxvf ~{r1_trimmed_tar} tar -zxvf ~{r2_trimmed_tar} @@ -416,30 +373,47 @@ task Hisat_3n_pair_end_mapping_dna_mode{ R1_files=($(ls | grep "\-R1_trimmed.fq.gz")) R2_files=($(ls | grep "\-R2_trimmed.fq.gz")) + echo "starting hisat" + + task() { + sample_id=$(basename "$file" "-R1_trimmed.fq.gz") + hisat-3n /cromwell_root/$genome_fa_basename \ + -q \ + -1 ${sample_id}-R1_trimmed.fq.gz \ + -2 ${sample_id}-R2_trimmed.fq.gz \ + --directional-mapping-reverse \ + --base-change C,T \ + --no-repeat-index \ + --no-spliced-alignment \ + --no-temp-splicesite \ + -t \ + --new-summary \ + --summary-file ${sample_id}.hisat3n_dna_summary.txt \ + --threads 8 | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" + } + for file in "${R1_files[@]}"; do - sample_id=$(basename "$file" "-R1_trimmed.fq.gz") - hisat-3n /cromwell_root/reference/$genome_fa_basename \ - -q \ - -1 ${sample_id}-R1_trimmed.fq.gz \ - -2 ${sample_id}-R2_trimmed.fq.gz \ - --directional-mapping-reverse \ - --base-change C,T \ - --no-repeat-index \ - --no-spliced-alignment \ - --no-temp-splicesite \ - -t \ - --new-summary \ - --summary-file ${sample_id}.hisat3n_dna_summary.txt \ - --threads 11 | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" + ( + echo "starting task $file.." + task "$file" + sleep $(( (RANDOM % 3) + 1)) + ) & + + if [[ $(jobs -r -p | wc -l) -ge 4 ]]; then + wait -n + fi done + # Wait for all background jobs to finish before continuing + wait + + echo "done hisat" + + echo "tarring up the outputs" # tar up the bam files and stats files tar -zcvf ~{plate_id}.hisat3n_paired_end_bam_files.tar.gz *.bam tar -zcvf ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz *.hisat3n_dna_summary.txt - mv ~{plate_id}.hisat3n_paired_end_bam_files.tar.gz ../ - mv ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz ../ - >>> runtime { docker: docker @@ -455,17 +429,17 @@ task Hisat_3n_pair_end_mapping_dna_mode{ } } -task Separate_unmapped_reads { +task Separate_and_split_unmapped_reads { input { File hisat3n_bam_tar Int min_read_length String plate_id - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 50 + String docker + Int disk_size = 1000 Int mem_size = 10 Int preemptible_tries = 3 - Int cpu = 1 + Int cpu = 8 } command <<< @@ -484,7 +458,7 @@ task Separate_unmapped_reads { pattern = "*.hisat3n_dna.unsort.bam" bam_files = glob.glob(os.path.join('/cromwell_root/', pattern)) - + for file in bam_files: full_filename = os.path.basename(file) @@ -509,46 +483,11 @@ task Separate_unmapped_reads { # tar up the uniqe bams tar -zcvf ~{plate_id}.hisat3n_paired_end_unique_bam_files.tar.gz *.hisat3n_dna.unique_aligned.bam - # tar up the multi bams - tar -zcvf ~{plate_id}.hisat3n_paired_end_multi_bam_files.tar.gz *.hisat3n_dna.multi_aligned.bam - # tar up the unmapped fastq files tar -zcvf ~{plate_id}.hisat3n_paired_end_unmapped_fastq_files.tar.gz *.hisat3n_dna.unmapped.fastq - >>> - runtime { - docker: docker - disks: "local-disk ${disk_size} HDD" - cpu: cpu - memory: "${mem_size} GiB" - preemptible: preemptible_tries - } - output { - File unique_bam_tar = "~{plate_id}.hisat3n_paired_end_unique_bam_files.tar.gz" - File multi_bam_tar = "~{plate_id}.hisat3n_paired_end_multi_bam_files.tar.gz" - File unmapped_fastq_tar = 
"~{plate_id}.hisat3n_paired_end_unmapped_fastq_files.tar.gz" - } -} - -task Split_unmapped_reads { - input { - File unmapped_fastq_tar - Int min_read_length - String plate_id - - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 50 - Int mem_size = 10 - Int preemptible_tries = 3 - Int cpu = 1 - } - command <<< - - set -euo pipefail - # untar the unmapped fastq files - tar -xf ~{unmapped_fastq_tar} - rm ~{unmapped_fastq_tar} + tar -xf ~{plate_id}.hisat3n_paired_end_unmapped_fastq_files.tar.gz python3 <>> runtime { docker: docker @@ -589,474 +527,411 @@ task Split_unmapped_reads { preemptible: preemptible_tries } output { + File unique_bam_tar = "~{plate_id}.hisat3n_paired_end_unique_bam_files.tar.gz" File split_fq_tar = "~{plate_id}.hisat3n_paired_end_split_fastq_files.tar.gz" } } -task Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name { +task hisat_single_end { input { File split_fq_tar File genome_fa File tarred_index_files String plate_id - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 500 - Int mem_size = 64 - Int preemptible_tries = 3 - Int cpu = 16 + String single_end_hisat_cpu_platform + Int disk_size = 1000 + Int mem_size = 128 + Int cpu = 32 + Int preemptible_tries = 2 + String docker } + command <<< set -euo pipefail - - mkdir reference/ - - cp ~{tarred_index_files} reference/ - cp ~{genome_fa} reference/ - + set -x + lscpu + # untar the tarred index files - cd reference/ - tar -xvf ~{tarred_index_files} + echo "Untar tarred_index_files" + start=$(date +%s) + pigz -dc ~{tarred_index_files} | tar -xf - rm ~{tarred_index_files} + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to untar tarred_index_files: $elapsed seconds" + + cp ~{genome_fa} . 
#get the basename of the genome_fa file + echo "samtools faidx" + start=$(date +%s) genome_fa_basename=$(basename ~{genome_fa} .fa) samtools faidx $genome_fa_basename.fa - + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to samtools faidx: $elapsed seconds" + # untar the unmapped fastq files - tar -xvf ~{split_fq_tar} + echo "Untar split_fq_tar" + start=$(date +%s) + pigz -dc ~{split_fq_tar} | tar -xf - rm ~{split_fq_tar} - + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to untar split_fq_tar: $elapsed seconds" + + # make directories + mkdir -p /cromwell_root/merged_sort_bams + mkdir -p /cromwell_root/read_overlap + # define lists of r1 and r2 fq files R1_files=($(ls | grep "\.hisat3n_dna.split_reads.R1.fastq")) R2_files=($(ls | grep "\.hisat3n_dna.split_reads.R2.fastq")) - for file in "${R1_files[@]}"; do - sample_id=$(basename "$file" ".hisat3n_dna.split_reads.R1.fastq") - hisat-3n /cromwell_root/reference/$genome_fa_basename \ + task() { + BASE=$(basename "$file" ".hisat3n_dna.split_reads.R1.fastq") + echo $BASE + echo "Running hisat on sample_id_R1" $BASE + + echo "Hisat 3n R1" + start=$(date +%s) + + # hisat on R1 single end + hisat-3n /cromwell_root/$genome_fa_basename \ -q \ - -U ${sample_id}.hisat3n_dna.split_reads.R1.fastq \ - --directional-mapping-reverse \ - --base-change C,T \ + -U ${BASE}.hisat3n_dna.split_reads.R1.fastq \ + -S ${BASE}.hisat3n_dna.split_reads.R1.sam --directional-mapping-reverse --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t \ --new-summary \ - --summary-file ${sample_id}.hisat3n_dna_split_reads_summary.R1.txt \ - --threads 11 | samtools view -b -q 10 -o "${sample_id}.hisat3n_dna.split_reads.R1.bam" - done - - for file in "${R2_files[@]}"; do - sample_id=$(basename "$file" ".hisat3n_dna.split_reads.R2.fastq") - hisat-3n /cromwell_root/reference/$genome_fa_basename \ + --summary-file ${BASE}.hisat3n_dna_split_reads_summary.R1.txt \ + --threads 8 + + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run $elapsed seconds" + echo "Finish running hisat on sample_id_R1" $BASE + + echo "Hisat 3n R2" + start=$(date +%s) + echo "Running hisat on sample_id_R2" $BASE + + # hisat on R2 single end + hisat-3n /cromwell_root/$genome_fa_basename \ -q \ - -U ${sample_id}.hisat3n_dna.split_reads.R2.fastq \ - --directional-mapping \ - --base-change C,T \ + -U ${BASE}.hisat3n_dna.split_reads.R2.fastq \ + -S ${BASE}.hisat3n_dna.split_reads.R2.sam --directional-mapping --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t --new-summary \ - --summary-file ${sample_id}.hisat3n_dna_split_reads_summary.R2.txt \ - --threads 11 | samtools view -b -q 10 -o "${sample_id}.hisat3n_dna.split_reads.R2.bam" - done + --summary-file ${BASE}.hisat3n_dna_split_reads_summary.R2.txt \ + --threads 8 - # tar up the r1 and r2 stats files - tar -zcvf ../~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz *.hisat3n_dna_split_reads_summary.R1.txt - tar -zcvf ../~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz *.hisat3n_dna_split_reads_summary.R2.txt - - - # define lists of r1 and r2 bam files - R1_bams=($(ls | grep "\.hisat3n_dna.split_reads.R1.bam")) - R2_bams=($(ls | grep "\.hisat3n_dna.split_reads.R2.bam")) - - # Loop through the R1 BAM files - for r1_bam in "${R1_bams[@]}"; do - # Extract the corresponding R2 BAM file - r2_bam="${r1_bam/.hisat3n_dna.split_reads.R1.bam/.hisat3n_dna.split_reads.R2.bam}" - - # Define the output BAM file name - output_bam="$(basename 
${r1_bam/.hisat3n_dna.split_reads.R1.bam/.hisat3n_dna.split_reads.name_sort.bam})" - - # Perform the samtools merge and sort commands - samtools merge -o - "$r1_bam" "$r2_bam" | samtools sort -n -o "$output_bam" - - done + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run $elapsed seconds" + echo "Finish running hisat on sample_id_R2" $BASE + + # samtools merge + echo "samtools merge R1 and R2" + start=$(date +%s) + samtools merge -o ${BASE}.name_merged.sam ${BASE}.hisat3n_dna.split_reads.R1.sam ${BASE}.hisat3n_dna.split_reads.R2.sam -@8 + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run samtools merge $elapsed seconds" + + # samtools sort + echo "samtools sort R1 and R2" + start=$(date +%s) + samtools sort -n -@8 -m1g ${BASE}.name_merged.sam -o ${BASE}.name_sorted.bam + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run samtools sort $elapsed seconds" + + # samtools filter bam + echo "samtools -q 10" + start=$(date +%s) + samtools view -q 10 ${BASE}.name_sorted.bam -o ${BASE}.name_sorted.filtered.bam + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run samtools -q 10 $elapsed seconds" + + # remove_overlap_read_parts + echo "call remove_overlap_read_parts" + start=$(date +%s) + python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,"cromwell_root","'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,"cromwell_root","'"$BASE"'.read_overlap.bam"))' + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run remove overlap $elapsed seconds" + + } - #tar up the merged bam files - tar -zcvf ../~{plate_id}.hisat3n_dna.split_reads.name_sort.bam.tar.gz *.hisat3n_dna.split_reads.name_sort.bam + # run 4 instances in parallel each with 8 threads + for file in "${R1_files[@]}"; do + ( + echo "starting task $file.." + task "$file" + sleep $(( (RANDOM % 3) + 1)) + ) & + if [[ $(jobs -r -p | wc -l) -ge 4 ]]; then + wait -n + fi + done + wait + echo "All done running tasks." 
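+      # The loop above keeps at most 4 `task` invocations running at once:
+      # `jobs -r -p | wc -l` counts live background jobs and `wait -n` (bash 4.3+)
+      # blocks until one exits, so 4 tasks x 8 hisat-3n/samtools threads roughly
+      # match the 32 requested CPUs.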
+ ls + + echo "Tar up summary text files" + start=$(date +%s) + # tar up the r1 and r2 stats files -p to set number of threads + tar -cf - *.hisat3n_dna_split_reads_summary.R1.txt | pigz > ~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz + tar -cf - *.hisat3n_dna_split_reads_summary.R2.txt | pigz > ~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run tar summary text files $elapsed seconds" + + # tar up read overlap files + echo "Tar up read_overlap bams" + start=$(date +%s) + tar -zcvf ~{plate_id}.remove_overlap_read_parts.tar.gz *read_overlap.bam + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to tar read_overlap bams $elapsed seconds" >>> - runtime { - docker: docker - disks: "local-disk ${disk_size} HDD" - cpu: cpu - memory: "${mem_size} GiB" - preemptible: preemptible_tries - } - output { - File merge_sorted_bam_tar = "~{plate_id}.hisat3n_dna.split_reads.name_sort.bam.tar.gz" - File hisat3n_dna_split_reads_summary_R1_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz" - File hisat3n_dna_split_reads_summary_R2_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz" - } -} - -task remove_overlap_read_parts { - input { - File bam - String plate_id - - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 80 - Int mem_size = 20 - Int preemptible_tries = 3 - Int cpu = 1 - } - - command <<< - set -euo pipefail - # unzip bam file - tar -xf ~{bam} - rm ~{bam} - - # create output dir - mkdir /cromwell_root/output_bams - - # get bams - bams=($(ls | grep "sort.bam$")) - - # loop through bams and run python script on each bam - # scatter instead of for loop to optimize - python3 <>> runtime { docker: docker - disks: "local-disk ${disk_size} HDD" + disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" + cpuPlatform: single_end_hisat_cpu_platform preemptible: preemptible_tries } - output { - File output_bam_tar = "~{plate_id}.remove_overlap_read_parts.tar.gz" - } -} - -task merge_original_and_split_bam_and_sort_all_reads_by_name_and_position { - input { - File bam - File split_bam - String plate_id - - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 80 - Int mem_size = 20 - Int preemptible_tries = 3 - Int cpu = 1 - } - command <<< - set -euo pipefail - #unzip bam file - tar -xf ~{bam} - tar -xf ~{split_bam} - rm ~{bam} - rm ~{split_bam} - - echo "samtools merge and sort" - # define lists of r1 and r2 fq files - UNIQUE_BAMS=($(ls | grep "\.hisat3n_dna.unique_aligned.bam")) - SPLIT_BAMS=($(ls | grep "\.hisat3n_dna.split_reads.read_overlap.bam")) - for file in "${UNIQUE_BAMS[@]}"; do - sample_id=$(basename "$file" ".hisat3n_dna.unique_aligned.bam") - samtools merge -f "${sample_id}.hisat3n_dna.all_reads.bam" "${sample_id}.hisat3n_dna.unique_aligned.bam" "${sample_id}.hisat3n_dna.split_reads.read_overlap.bam" - samtools sort -n -o "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" "${sample_id}.hisat3n_dna.all_reads.bam" - samtools sort -O BAM -o "${sample_id}.hisat3n_dna.all_reads.pos_sort.bam" "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" - done - - echo "Zip files" - #tar up the merged bam files - tar -zcvf ~{plate_id}.hisat3n_dna.all_reads.pos_sort.tar.gz *.hisat3n_dna.all_reads.pos_sort.bam - tar -zcvf ~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz *.hisat3n_dna.all_reads.name_sort.bam - >>> - runtime { - docker: docker - disks: "local-disk ${disk_size} HDD" - cpu: cpu - memory: 
"${mem_size} GiB" - preemptible: preemptible_tries - } output { - File name_sorted_bam = "~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz" - File position_sorted_bam = "~{plate_id}.hisat3n_dna.all_reads.pos_sort.tar.gz" + File hisat3n_dna_split_reads_summary_R1_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz" + File hisat3n_dna_split_reads_summary_R2_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz" + File remove_overlaps_output_bam_tar = "~{plate_id}.remove_overlap_read_parts.tar.gz" + } } - -task call_chromatin_contacts { + +task merge_sort_analyze { input { - File name_sorted_bam String plate_id + File paired_end_unique_tar + File read_overlap_tar - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 80 - Int mem_size = 20 - Int preemptible_tries = 3 - Int cpu = 1 - } - command <<< - set -euo pipefail - - # untar the name sorted bam files - tar -xf ~{name_sorted_bam} - rm ~{name_sorted_bam} - - python3 <>> - runtime { - docker: docker - disks: "local-disk ${disk_size} HDD" - cpu: cpu - memory: "${mem_size} GiB" - preemptible: preemptible_tries - } - output { - File chromatin_contact_stats = "~{plate_id}.chromatin_contact_stats.tar.gz" - } -} - -task dedup_unique_bam_and_index_unique_bam { - input { - File bam - String plate_id - - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 80 - Int mem_size = 20 - Int preemptible_tries = 3 - Int cpu = 1 - } - - command <<< - set -euo pipefail - - # unzip files - tar -xf ~{bam} - rm ~{bam} - - # create output dir - mkdir /cromwell_root/output_bams - mkdir /cromwell_root/temp - - # name : AD3C_BA17_2027_P1-1-B11-G13.hisat3n_dna.all_reads.pos_sort.bam - for file in *.bam - do - name=`echo $file | cut -d. -f1` - name=$name.hisat3n_dna.all_reads.deduped - echo $name - echo "Call Picard" - picard MarkDuplicates I=$file O=/cromwell_root/output_bams/$name.bam \ - M=/cromwell_root/output_bams/$name.matrix.txt \ - REMOVE_DUPLICATES=true TMP_DIR=/cromwell_root/temp - echo "Call samtools index" - samtools index /cromwell_root/output_bams/$name.bam - done - - cd /cromwell_root - - #tar up the output files - tar -zcvf ~{plate_id}.dedup_unique_bam_and_index_unique_bam.tar.gz output_bams - - #tar up the stats files - tar -zcvf ~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz output_bams/*.matrix.txt - - >>> - runtime { - docker: docker - disks: "local-disk ${disk_size} HDD" - cpu: cpu - memory: "${mem_size} GiB" - preemptible: preemptible_tries - } - output { - File output_tar = "~{plate_id}.dedup_unique_bam_and_index_unique_bam.tar.gz" - File dedup_stats_tar = "~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz" - } -} - -task unique_reads_allc { - input { - File bam_and_index_tar + #input for allcools bam-to-allc File genome_fa - String plate_id + String genome_base = basename(genome_fa) Int num_upstr_bases Int num_downstr_bases Int compress_level + File chromosome_sizes - Int disk_size = 80 - Int mem_size = 20 - String genome_base = basename(genome_fa) - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" + String merge_sort_analyze_cpu_platform + String docker + Int disk_size = 1000 + Int mem_size = 64 + Int cpu = 16 Int preemptible_tries = 3 - Int cpu = 1 } - command <<< - set -euo pipefail - # unzip files - tar -xf ~{bam_and_index_tar} - rm ~{bam_and_index_tar} - - mkdir reference - cp ~{genome_fa} reference - cd reference - - # index the fasta - echo "Indexing FASTA" - samtools faidx *.fa - cd ../output_bams - - 
echo "Starting allcools" - bam_files=($(ls | grep "\.hisat3n_dna.all_reads.deduped.bam$")) - echo ${bam_files[@]} - for file in "${bam_files[@]}"; do - sample_id=$(basename "$file" ".hisat3n_dna.all_reads.deduped.bam") - /opt/conda/bin/allcools bam-to-allc \ - --bam_path "$file" \ - --reference_fasta /cromwell_root/reference/~{genome_base} \ - --output_path "${sample_id}.allc.tsv.gz" \ - --num_upstr_bases ~{num_upstr_bases} \ - --num_downstr_bases ~{num_downstr_bases} \ - --compress_level ~{compress_level} \ - --save_count_df \ - --convert_bam_strandness - done - echo "Zipping files" + command <<< + set -euo pipefail + set -x + lscpu + + # unzip tars + echo "Untar paired_end_unique_tar" + start=$(date +%s) + pigz -dc ~{paired_end_unique_tar} | tar -xf - + rm ~{paired_end_unique_tar} + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to untar paired_end_unique_tar: $elapsed seconds" + + echo "Untar read_overlap_tar" + start=$(date +%s) + pigz -dc ~{read_overlap_tar} | tar -xf - + rm ~{read_overlap_tar} + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to untar read_overlap_tar: $elapsed seconds" + + # reference and index + start=$(date +%s) + echo "Reference and index fasta" + mkdir reference + cp ~{genome_fa} reference + ls reference + samtools faidx reference/*.fa + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to index fasta $elapsed seconds" - tar -zcvf ../~{plate_id}.allc.tsv.tar.gz *.allc.tsv.gz - tar -zcvf ../~{plate_id}.allc.tbi.tar.gz *.allc.tsv.gz.tbi - tar -zcvf ../~{plate_id}.allc.count.tar.gz *.allc.tsv.gz.count.csv + # define lists of r1 and r2 fq files + UNIQUE_BAMS=($(ls | grep "\.hisat3n_dna.unique_aligned.bam")) + SPLIT_BAMS=($(ls | grep "\.read_overlap.bam")) + + # for allcools bam-to-allc + if [ ~{num_upstr_bases} -eq 0 ]; then + mcg_context=CGN + else + mcg_context=HCGN + fi + + # make directories + mkdir /cromwell_root/output_bams + mkdir /cromwell_root/temp + mkdir /cromwell_root/allc-${mcg_context} + + task() { + local file=$1 + sample_id=$(basename "$file" ".hisat3n_dna.unique_aligned.bam") + echo $sample_id + + start=$(date +%s) + echo "Merge all unique_aligned and read_overlap" + samtools merge -f "${sample_id}.hisat3n_dna.all_reads.bam" "${sample_id}.hisat3n_dna.unique_aligned.bam" "${sample_id}.read_overlap.bam" -@4 + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run merge $elapsed seconds" + + start=$(date +%s) + echo "Sort all reads by name" + samtools sort -n -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" "${sample_id}.hisat3n_dna.all_reads.bam" + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run sort by name $elapsed seconds" + + start=$(date +%s) + echo "Sort all reads by name" + samtools sort -O BAM -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.pos_sort.bam" "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run sort by pos $elapsed seconds" + + start=$(date +%s) + echo "Call Picard remove duplicates" + name=${sample_id}.hisat3n_dna.all_reads.deduped + picard MarkDuplicates I=${sample_id}.hisat3n_dna.all_reads.pos_sort.bam O=/cromwell_root/output_bams/${name}.bam \ + M=/cromwell_root/output_bams/${name}.matrix.txt \ + REMOVE_DUPLICATES=true TMP_DIR=/cromwell_root/temp + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to run picard $elapsed seconds" + + start=$(date +%s) + echo "Call samtools index" + samtools index /cromwell_root/output_bams/${name}.bam + end=$(date 
+%s) + elapsed=$((end - start)) + echo "Elapsed time to samtools index $elapsed seconds" + + start=$(date +%s) + echo "Call chromatin contacts from name sorted bams" + python3 -c 'from cemba_data.hisat3n import *;import os;import glob;call_chromatin_contacts(bam_path="'"$sample_id"'.hisat3n_dna.all_reads.name_sort.bam",contact_prefix="'"$sample_id"'.hisat3n_dna.all_reads",save_raw=False,save_hic_format=True)' + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to chromatin contacts $elapsed seconds" + + start=$(date +%s) + echo "Call allcools bam-to-allc from deduped.bams" + /opt/conda/bin/allcools bam-to-allc \ + --bam_path /cromwell_root/output_bams/${name}.bam \ + --reference_fasta /cromwell_root/reference/~{genome_base} \ + --output_path "${sample_id}.allc.tsv.gz" \ + --num_upstr_bases ~{num_upstr_bases} \ + --num_downstr_bases ~{num_downstr_bases} \ + --compress_level ~{compress_level} \ + --save_count_df \ + --convert_bam_strandness + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to allcools bam-to-allc $elapsed seconds" + + start=$(date +%s) + echo "Call allcools extract-all" + allcools extract-allc --strandness merge \ + --allc_path ${sample_id}.allc.tsv.gz \ + --output_prefix /cromwell_root/allc-${mcg_context}/${sample_id} \ + --mc_contexts ${mcg_context} \ + --chrom_size_path ~{chromosome_sizes} + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to allcools extract-all $elapsed seconds" + + echo "Remove some bams" + rm ${sample_id}.hisat3n_dna.all_reads.bam + rm ${sample_id}.hisat3n_dna.all_reads.pos_sort.bam + rm /cromwell_root/${sample_id}.read_overlap.bam + rm /cromwell_root/${sample_id}.hisat3n_dna.unique_aligned.bam + } + + # run 4 instances of task in parallel + for file in "${UNIQUE_BAMS[@]}"; do + ( + echo "starting task $file.." + task "$file" + sleep $(( (RANDOM % 3) + 1)) + ) & + # allow to execute up to 4 jobs in parallel + if [[ $(jobs -r -p | wc -l) -ge 4 ]]; then + wait -n + fi + done + wait + echo "Tasks all done." + du -h * + echo "Tar files." 
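+      # Bundle the per-cell outputs into plate-level tarballs so a handful of
+      # archives, rather than thousands of small files, are delocalized as outputs.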
+ tar -zcvf ~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz output_bams/*.matrix.txt + tar -zcvf ~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz *.hisat3n_dna.all_reads.name_sort.bam + # tar outputs of call_chromatin_contacts + tar -zcvf ~{plate_id}.hisat3n_dna.all_reads.3C.contact.tar.gz *.hisat3n_dna.all_reads.3C.contact.tsv.gz + tar -zcvf ~{plate_id}.hisat3n_dna.all_reads.dedup_contacts.tar.gz *.hisat3n_dna.all_reads.dedup_contacts.tsv.gz + tar -zcvf ~{plate_id}.chromatin_contact_stats.tar.gz *.hisat3n_dna.all_reads.contact_stats.csv + # tar outputs of allcools + tar -zcvf ~{plate_id}.allc.tsv.tar.gz *.allc.tsv.gz + tar -zcvf ~{plate_id}.allc.tbi.tar.gz *.allc.tsv.gz.tbi + tar -zcvf ~{plate_id}.allc.count.tar.gz *.allc.tsv.gz.count.csv + tar -zcvf ~{plate_id}.extract-allc_tbi.tar.gz *.tbi + tar -zcvf ~{plate_id}.extract-allc.tar.gz /cromwell_root/allc-${mcg_context}/*.gz + tar -zcvf ~{plate_id}.extract-allc_tbi.tar.gz /cromwell_root/allc-${mcg_context}/*.tbi >>> + runtime { docker: docker - disks: "local-disk ${disk_size} HDD" + disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" + cpuPlatform: merge_sort_analyze_cpu_platform preemptible: preemptible_tries } - output { + + output { File allc = "~{plate_id}.allc.tsv.tar.gz" File tbi = "~{plate_id}.allc.tbi.tar.gz" + File all_reads_dedup_contacts = "~{plate_id}.hisat3n_dna.all_reads.dedup_contacts.tar.gz" + File all_reads_3C_contacts = "~{plate_id}.hisat3n_dna.all_reads.3C.contact.tar.gz" + File name_sorted_bam = "~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz" + File dedup_stats_tar = "~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz" + File chromatin_contact_stats = "~{plate_id}.chromatin_contact_stats.tar.gz" File allc_uniq_reads_stats = "~{plate_id}.allc.count.tar.gz" - } -} - - -task unique_reads_cgn_extraction { - input { - File allc_tar - File tbi_tar - File chrom_size_path - String plate_id - - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" - Int disk_size = 80 - Int mem_size = 20 - Int num_upstr_bases = 0 - Int preemptible_tries = 3 - Int cpu = 1 - } - - command <<< - set -euo pipefail - - tar -xf ~{allc_tar} - rm ~{allc_tar} - - tar -xf ~{tbi_tar} - rm ~{tbi_tar} - - # prefix="allc-{mcg_context}/{cell_id}" - if [ ~{num_upstr_bases} -eq 0 ]; then - mcg_context=CGN - else - mcg_context=HCGN - fi - - # create output dir - mkdir /cromwell_root/allc-${mcg_context} - outputdir=/cromwell_root/allc-${mcg_context} - - for gzfile in *.gz - do - name=`echo $gzfile | cut -d. 
-f1` - echo $name - allcools extract-allc --strandness merge --allc_path $gzfile \ - --output_prefix $outputdir/$name \ - --mc_contexts ${mcg_context} \ - --chrom_size_path ~{chrom_size_path} - done - - cd /cromwell_root - - tar -zcvf ~{plate_id}.output_allc_tar.tar.gz $outputdir/*.gz - tar -zcvf ~{plate_id}.output_tbi_tar.tar.gz $outputdir/*.tbi - - >>> - - runtime { - docker: docker - disks: "local-disk ${disk_size} HDD" - cpu: cpu - memory: "${mem_size} GiB" - preemptible: preemptible_tries - } - - output { - File output_allc_tar = "~{plate_id}.output_allc_tar.tar.gz" - File output_tbi_tar = "~{plate_id}.output_tbi_tar.tar.gz" - } + File extract_allc_output_tbi_tar = "~{plate_id}.extract-allc_tbi.tar.gz" File extract_allc_output_allc_tar = "~{plate_id}.extract-allc.tar.gz" + } } - task summary { input { Array[File] trimmed_stats @@ -1069,11 +944,11 @@ task summary { Array[File] unique_reads_cgn_extraction_tbi String plate_id - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1" + String docker Int disk_size = 80 - Int mem_size = 20 + Int mem_size = 5 Int preemptible_tries = 3 - Int cpu = 1 + Int cpu = 4 } command <<< set -euo pipefail @@ -1129,4 +1004,4 @@ task summary { output { File mapping_summary = "~{plate_id}_MappingSummary.csv.gz" } -} \ No newline at end of file +} diff --git a/pipelines/skylab/snM3C/test_inputs/Plumbing/miseq_M16_G13.json b/pipelines/skylab/snM3C/test_inputs/Plumbing/miseq_M16_G13.json index 8df63dba8b..e7d1cfe078 100644 --- a/pipelines/skylab/snM3C/test_inputs/Plumbing/miseq_M16_G13.json +++ b/pipelines/skylab/snM3C/test_inputs/Plumbing/miseq_M16_G13.json @@ -16,5 +16,7 @@ "snM3C.tarred_index_files":"gs://broad-gotc-test-storage/methylome/input/plumbing/index_files/hg38_index_files.tar.gz", "snM3C.chromosome_sizes": "gs://broad-gotc-test-storage/methylome/input/plumbing/index_files/hg38.chrom.sizes", "snM3C.genome_fa": "gs://broad-gotc-test-storage/methylome/input/plumbing/index_files/hg38.fa", - "snM3C.batch_number": 2 + "snM3C.batch_number": 2, + "snM3C.single_end_hisat_cpu_platform": "Intel Cascade Lake", + "snM3C.merge_sort_analyze_cpu_platform": "Intel Cascade Lake" } diff --git a/scripts/BuildAFComparisonTable.wdl b/scripts/BuildAFComparisonTable.wdl index 066d5691f8..43e331df4e 100644 --- a/scripts/BuildAFComparisonTable.wdl +++ b/scripts/BuildAFComparisonTable.wdl @@ -117,7 +117,7 @@ task AnnotateWithAF_t { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" # some of the gnomad vcfs are like 38 gigs so maybe need more ?
} @@ -145,7 +145,7 @@ task GatherVCFsCloud { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk " + disk_size + " HDD" memory: "16 GB" } @@ -169,7 +169,7 @@ task MakeSitesOnlyVcf { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk " + disk_size + " HDD" memory: "16 GB" } @@ -224,7 +224,7 @@ task VariantsToTable { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk " + disk_size + " HDD" memory: "16 GB" } @@ -280,7 +280,7 @@ task RemoveSymbolicAlleles { File output_vcf_index = "~{output_basename}.vcf.gz.tbi" } runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk " + disk_size + " HDD" memory: "4 GB" } diff --git a/scripts/RemoveBadSitesByID.wdl b/scripts/RemoveBadSitesByID.wdl index 700ee05a4a..bb1bcefd8f 100644 --- a/scripts/RemoveBadSitesByID.wdl +++ b/scripts/RemoveBadSitesByID.wdl @@ -129,7 +129,7 @@ task SplitX { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk " + disk_size + " HDD" memory: "16 GB" } @@ -215,7 +215,7 @@ task RemoveBadSitesFromVcf { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" disks: "local-disk 100 HDD" memory: "16 GB" } diff --git a/tasks/broad/DragenTasks.wdl b/tasks/broad/DragenTasks.wdl index ebc4146a7e..149eb5fd12 100644 --- a/tasks/broad/DragenTasks.wdl +++ b/tasks/broad/DragenTasks.wdl @@ -24,7 +24,7 @@ task CalibrateDragstrModel { File str_table_file File alignment ## can handle cram or bam. File alignment_index - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int preemptible_tries = 3 Int threads = 4 Int? memory_mb diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index 0e446a1993..0e3c8f2e6e 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -96,7 +96,7 @@ task HaplotypeCaller_GATK4_VCF { Boolean use_dragen_hard_filtering = false Boolean use_spanning_event_genotyping = true File? 
dragstr_model - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int memory_multiplier = 1 } @@ -256,7 +256,7 @@ task HardFilterVcf { String vcf_basename File interval_list Int preemptible_tries - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 @@ -292,7 +292,7 @@ task DragenHardFilterVcf { Boolean make_gvcf String vcf_basename Int preemptible_tries - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 @@ -332,7 +332,7 @@ task CNNScoreVariants { File ref_fasta_index File ref_dict Int preemptible_tries - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } Int disk_size = ceil(size(bamout, "GiB") + size(ref_fasta, "GiB") + (size(input_vcf, "GiB") * 2)) @@ -389,7 +389,7 @@ task FilterVariantTranches { File dbsnp_resource_vcf_index String info_key Int preemptible_tries - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } Int disk_size = ceil(size(hapmap_resource_vcf, "GiB") + diff --git a/tasks/broad/IlluminaGenotypingArrayTasks.wdl b/tasks/broad/IlluminaGenotypingArrayTasks.wdl index dff9fdee6e..2598bed60b 100644 --- a/tasks/broad/IlluminaGenotypingArrayTasks.wdl +++ b/tasks/broad/IlluminaGenotypingArrayTasks.wdl @@ -404,7 +404,7 @@ task SelectVariants { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" bootDiskSizeGb: 15 disks: "local-disk " + disk_size + " HDD" memory: "3500 MiB" @@ -441,7 +441,7 @@ task SelectIndels { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" bootDiskSizeGb: 15 disks: "local-disk " + disk_size + " HDD" memory: "3500 MiB" @@ -577,7 +577,7 @@ task SubsetArrayVCF { } runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" bootDiskSizeGb: 15 disks: "local-disk " + disk_size + " HDD" memory: "3500 MiB" @@ -676,7 +676,7 @@ task ValidateVariants { >>> runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" bootDiskSizeGb: 15 disks: "local-disk " + disk_size + " HDD" memory: "3500 MiB" diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 533cdb6dfc..5e575c002a 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -65,7 +65,7 @@ task GenerateChunk { Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here Int cpu = 1 Int memory_mb = 8000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } Int command_mem = memory_mb - 1000 Int max_heap = memory_mb - 500 @@ -112,7 +112,7 @@ task CountVariantsInChunks { File panel_vcf File panel_vcf_index - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 4000 Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB")) + 20 @@ -266,7 +266,7 @@ task GatherVcfs { Array[File] input_vcfs String output_vcf_basename - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + 
String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 16000 Int disk_size_gb = ceil(3*size(input_vcfs, "GiB")) @@ -336,7 +336,7 @@ task UpdateHeader { String basename Int disk_size_gb = ceil(4*(size(vcf, "GiB") + size(vcf_index, "GiB"))) + 20 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 8000 } @@ -372,7 +372,7 @@ task RemoveSymbolicAlleles { String output_basename Int disk_size_gb = ceil(3*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB"))) - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 4000 } @@ -659,7 +659,7 @@ task SubsetVcfToRegion { Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here Int cpu = 1 Int memory_mb = 8000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } Int command_mem = memory_mb - 1000 Int max_heap = memory_mb - 500 @@ -754,7 +754,7 @@ task SelectVariantsByIds { File ids String basename - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 16000 Int disk_size_gb = ceil(1.2*size(vcf, "GiB")) + 100 @@ -820,7 +820,7 @@ task InterleaveVariants { Array[File] vcfs String basename - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 16000 Int disk_size_gb = ceil(3.2*size(vcfs, "GiB")) + 100 diff --git a/tasks/broad/JointGenotypingTasks.wdl b/tasks/broad/JointGenotypingTasks.wdl index 41918ed50b..65386b0f06 100644 --- a/tasks/broad/JointGenotypingTasks.wdl +++ b/tasks/broad/JointGenotypingTasks.wdl @@ -51,7 +51,7 @@ task SplitIntervalList { Int machine_mem_mb = 3750 String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" String? 
extra_args - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } parameter_meta { @@ -94,7 +94,7 @@ task ImportGVCFs { Int machine_mem_mb = 30000 Int batch_size - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -159,7 +159,7 @@ task GenotypeGVCFs { Int machine_mem_mb = 26000 # This is needed for gVCFs generated with GATK3 HaplotypeCaller Boolean allow_old_rms_mapping_quality_annotation_data = false - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } parameter_meta { @@ -216,7 +216,7 @@ task GnarlyGenotyper { String dbsnp_vcf Boolean make_annotation_db = false - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int machine_mem_mb = 26000 Int disk_size_gb = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3) } @@ -277,7 +277,7 @@ task HardFilterAndMakeSitesOnlyVcf { Int disk_size_gb Int machine_mem_mb = 3750 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -339,7 +339,7 @@ task IndelsVariantRecalibrator { Int disk_size_gb Int machine_mem_mb = 26000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -404,7 +404,7 @@ task SNPsVariantRecalibratorCreateModel { Int disk_size_gb Int machine_mem_mb = 104000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -468,7 +468,7 @@ task SNPsVariantRecalibrator { Int max_gaussians = 6 Int disk_size_gb - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int? 
machine_mem_mb } @@ -533,7 +533,7 @@ task GatherTranches { String mode Int disk_size_gb Int machine_mem_mb = 7500 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } parameter_meta { @@ -607,7 +607,7 @@ task ApplyRecalibration { Boolean use_allele_specific_annotations Int disk_size_gb Int machine_mem_mb = 7000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -658,7 +658,7 @@ task GatherVcfs { String output_vcf_name Int disk_size_gb Int machine_mem_mb = 7000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } parameter_meta { @@ -706,7 +706,7 @@ task SelectFingerprintSiteVariants { String base_output_name Int disk_size_gb Int machine_mem_mb = 7500 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } parameter_meta { @@ -759,7 +759,7 @@ task CollectVariantCallingMetrics { File ref_dict Int disk_size_gb Int machine_mem_mb = 7500 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -798,7 +798,7 @@ task GatherVariantCallingMetrics { String output_prefix Int disk_size_gb Int machine_mem_mb = 3000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } parameter_meta { @@ -879,7 +879,7 @@ task CrossCheckFingerprint { String output_base_name Boolean scattered = false Array[String] expected_inconclusive_samples = [] - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int? machine_mem_mb Int disk = 100 } @@ -1000,7 +1000,7 @@ task GetFingerprintingIntervalIndices { input { Array[File] unpadded_intervals File haplotype_database - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = 10 Int machine_mem_mb = 3750 } @@ -1114,7 +1114,7 @@ task CalculateAverageAnnotations { File vcf Array[String] annotations_to_divide = ["ASSEMBLED_HAPS", "FILTERED_HAPS", "TREE_SCORE"] - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = ceil(size(vcf, "GB") + 50) Int memory_mb = 12000 Int preemptible = 3 diff --git a/tasks/broad/Qc.wdl b/tasks/broad/Qc.wdl index 3dadae1b72..0ff525b571 100644 --- a/tasks/broad/Qc.wdl +++ b/tasks/broad/Qc.wdl @@ -621,7 +621,7 @@ task ValidateVCF { Int preemptible_tries = 3 Boolean is_gvcf = true String? 
extra_args - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int machine_mem_mb = 7000 } @@ -629,7 +629,7 @@ task ValidateVCF { String calling_interval_list_basename = basename(calling_interval_list) String calling_interval_list_index_basename = if calling_intervals_is_vcf then basename(select_first([calling_interval_list_index])) else "" - Int command_mem_mb = machine_mem_mb - 1000 + Int command_mem_mb = machine_mem_mb - 2000 Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB") + ref_size) + 20 diff --git a/tasks/broad/RNAWithUMIsTasks.wdl b/tasks/broad/RNAWithUMIsTasks.wdl index 9e5b459f28..0c5ee13362 100644 --- a/tasks/broad/RNAWithUMIsTasks.wdl +++ b/tasks/broad/RNAWithUMIsTasks.wdl @@ -278,7 +278,7 @@ task GetSampleName { input { File bam - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 1000 Int disk_size_gb = ceil(2.0 * size(bam, "GiB")) + 10 @@ -852,7 +852,7 @@ task CalculateContamination { File population_vcf File population_vcf_index # runtime - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 8192 Int disk_size_gb = 256 diff --git a/tasks/broad/UltimaGenomicsGermlineFilteringThreshold.wdl b/tasks/broad/UltimaGenomicsGermlineFilteringThreshold.wdl index e293defa2e..6d1e5999bb 100644 --- a/tasks/broad/UltimaGenomicsGermlineFilteringThreshold.wdl +++ b/tasks/broad/UltimaGenomicsGermlineFilteringThreshold.wdl @@ -326,7 +326,7 @@ task HardThresholdVCF { String output_basename String score_key Int disk_size - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } command <<< @@ -384,7 +384,7 @@ task AnnotateSampleVCF { File input_vcf_index String output_basename Int disk_size = ceil(size(input_vcf, "GB") * 2) + 50 - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" File ref_fasta File ref_fasta_index File ref_dict diff --git a/tasks/broad/UltimaGenomicsWholeGenomeGermlineTasks.wdl b/tasks/broad/UltimaGenomicsWholeGenomeGermlineTasks.wdl index d1761ba041..4942f79280 100644 --- a/tasks/broad/UltimaGenomicsWholeGenomeGermlineTasks.wdl +++ b/tasks/broad/UltimaGenomicsWholeGenomeGermlineTasks.wdl @@ -56,7 +56,7 @@ task SplitCram { String base_file_name Int reads_per_file - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = ceil(3 * size(input_cram_bam, "GiB") + 20) Int cpu = 1 Int memory_gb = 10 @@ -269,7 +269,7 @@ task MarkDuplicatesSpark { Array[File] input_bams String output_bam_basename Boolean save_bam_file - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb Int cpu = 32 Int memory_mb = if 4 * ceil(size(input_bams, "MB")) / 4000 > 600000 then 300000 else 208000 @@ -319,7 +319,7 @@ task ExtractSampleNameFlowOrder{ File input_bam References references - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = ceil(size(input_bam, "GB") + size(references.ref_fasta, "GB") + 20) Int cpu = 1 Int memory_mb = 2000 @@ -480,7 +480,7 @@ task HaplotypeCaller { Boolean native_sw = false String? 
contamination_extra_args - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = ceil((size(input_bam_list, "GB")) + size(references.ref_fasta, "GB") + size(references.ref_fasta_index, "GB") + size(references.ref_dict, "GB") + 60) Int cpu = 2 Int memory_mb = 12000 @@ -591,7 +591,7 @@ task ConvertGVCFtoVCF { String output_vcf_name References references - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = ceil(2 * size(input_gvcf, "GB") + size(references.ref_fasta, "GB") + size(input_gvcf_index, "GB") + 20) Int cpu = 1 Int memory_mb = 12000 @@ -947,7 +947,7 @@ task AnnotateVCF { String flow_order String final_vcf_base_name - String docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int disk_size_gb = ceil(2 * size(input_vcf, "GB") + size(references.ref_fasta, "GB") + size(reference_dbsnp, "GB") + 20) Int cpu = 1 Int memory_mb = 15000 diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 7adbfef0a0..6952a21588 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -11,7 +11,7 @@ task FastqProcessing { String read_struct #using the latest build of warp-tools in GCR - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" #runtime values Int machine_mem_mb = 40000 Int cpu = 16 @@ -246,7 +246,7 @@ task FastqProcessATAC { # [?] copied from corresponding optimus wdl for fastqprocessing # using the latest build of warp-tools in GCR - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" # Runtime attributes [?] Int mem_size = 5 diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 4279f6ff6c..18fed45fc1 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -6,8 +6,7 @@ task OptimusH5adGeneration { input { #runtime values - #String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.6-1692962087" - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" # name of the sample String input_id # user provided id @@ -106,8 +105,7 @@ task SingleNucleusOptimusH5adOutput { input { #runtime values - #String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.6-1692962087" - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" # name of the sample String input_id # user provided id @@ -195,14 +193,13 @@ task JoinMultiomeBarcodes { Int nthreads = 1 String cpuPlatform = "Intel Cascade Lake" + Int machine_mem_mb = ceil((size(atac_h5ad, "MiB") + size(gex_h5ad, "MiB") + size(atac_fragment, "MiB")) * 3) + 10000 + Int disk = ceil((size(atac_h5ad, "GiB") + size(gex_h5ad, "GiB") + size(atac_fragment, "GiB")) * 5) + 10 } String gex_base_name = basename(gex_h5ad, ".h5ad") String atac_base_name = basename(atac_h5ad, ".h5ad") String atac_fragment_base = basename(atac_fragment, ".tsv") - Int machine_mem_mb = ceil((size(atac_h5ad, "MiB") + size(gex_h5ad, "MiB") + size(atac_fragment, "MiB")) * 3) + 10000 - Int disk = ceil((size(atac_h5ad, "GiB") + size(gex_h5ad, "GiB") + size(atac_fragment, "GiB")) * 5) + 10 - parameter_meta { atac_h5ad: "The resulting h5ad from the ATAC workflow." atac_fragment: "The resulting fragment TSV from the ATAC workflow." 
diff --git a/tasks/skylab/Metrics.wdl b/tasks/skylab/Metrics.wdl index a1b3f0c74f..fb91283d71 100644 --- a/tasks/skylab/Metrics.wdl +++ b/tasks/skylab/Metrics.wdl @@ -8,12 +8,11 @@ task CalculateCellMetrics { String input_id # runtime values - #String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.9-1700252065" - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" Int machine_mem_mb = 8000 Int cpu = 4 Int disk = ceil(size(bam_input, "Gi") * 4) + ceil((size(original_gtf, "Gi") * 3)) - Int preemptible = 3 + Int preemptible = 1 } meta { @@ -81,16 +80,16 @@ task CalculateCellMetrics { task CalculateGeneMetrics { input { File bam_input + File original_gtf File? mt_genes String input_id # runtime values - #String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.9-1700252065" - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" Int machine_mem_mb = 32000 Int cpu = 4 - Int disk = ceil(size(bam_input, "Gi") * 4) - Int preemptible = 3 + Int disk = ceil(size(bam_input, "Gi") * 4) + ceil((size(original_gtf, "Gi") * 3)) + Int preemptible = 1 } @@ -109,9 +108,21 @@ task CalculateGeneMetrics { command { set -e + + # create the tmp folder mkdir temp + + # if the GTF file is compressed, uncompress it + if [[ ~{original_gtf} =~ \.gz$ ]] + then + gunzip -c ~{original_gtf} > annotation.gtf + else + mv ~{original_gtf} annotation.gtf + fi + # call TagSort with gene as metric type TagSort --bam-input ~{bam_input} \ + --gtf-file annotation.gtf \ --metric-output "~{input_id}.gene-metrics.csv" \ --compute-metric \ --metric-type gene \ @@ -149,11 +160,13 @@ task CalculateGeneMetrics { task CalculateUMIsMetrics { input { File bam_input + File original_gtf File? mt_genes String input_id + # runtime values # Did not update docker image as this task uses loom which does not play nice with the changes - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.9-1700252065" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" Int machine_mem_mb = 16000 Int cpu = 8 Int disk = ceil(size(bam_input, "Gi") * 4) @@ -179,7 +192,16 @@ task CalculateUMIsMetrics { set -e mkdir temp + # if the GTF file is compressed, uncompress it + if [[ ~{original_gtf} =~ \.gz$ ]] + then + gunzip -c ~{original_gtf} > annotation.gtf + else + mv ~{original_gtf} annotation.gtf + fi + TagSort --bam-input ~{bam_input} \ + --gtf-file annotation.gtf \ --metric-output "~{input_id}.umi-metrics.csv" \ --compute-metric \ --metric-type umi \ @@ -219,8 +241,7 @@ task FastqMetricsSlideSeq { # Runtime attributes - #String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.9-1700252065" - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" Int cpu = 16 Int machine_mb = 40000 Int disk = ceil(size(r1_fastq, "GiB")*3) + 50 diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index b61030f56c..477d5d4779 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -443,11 +443,12 @@ task MergeStarOutput { Array[File]? summary Array[File]? align_features Array[File]? umipercell + String?
counting_mode String input_id #runtime values - String docker = "us.gcr.io/broad-gotc-prod/pytools:1.0.0-1661263730" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.2-1709308985" Int machine_mem_gb = 20 Int cpu = 1 Int disk = ceil(size(matrix, "Gi") * 2) + 10 @@ -475,45 +476,75 @@ task MergeStarOutput { declare -a align_features_files=(~{sep=' ' align_features}) declare -a umipercell_files=(~{sep=' ' umipercell}) - for cell_read in "${cell_reads_files[@]}"; do - if [ -f "$cell_read" ]; then - cat "$cell_read" >> "~{input_id}_cell_reads.txt" - fi - done + if [ -f "${cell_reads_files[0]}" ]; then + + # Destination file for cell reads + dest="~{input_id}_cell_reads.txt" + # first create the header from the first file in the list, and add a column header for the shard id + head -n 1 "${cell_reads_files[0]}" | awk '{print $0 "\tshard_number"}' > "$dest" + + # Loop through the array and add the second row with shard number to a temp file notinpasslist.txt + for index in "${!cell_reads_files[@]}"; do + secondLine=$(sed -n '2p' "${cell_reads_files[$index]}") + echo -e "$secondLine\t$index" >> "notinpasslist.txt" + done + + # add notinpasslist.txt to the destination file and delete the notinpasslist.txt + cat "notinpasslist.txt" >> "$dest" + rm notinpasslist.txt + + # now add the shard id to the matrix in a temporary matrix file, and skip the first two lines + counter=0 + for cell_read in "${cell_reads_files[@]}"; do + if [ -f "$cell_read" ]; then + awk -v var="$counter" 'NR>2 {print $0 "\t" var}' "$cell_read" >> "matrix.txt" + let counter=counter+1 + fi + done + + # add the matrix to the destination file, then delete the matrix file + cat "matrix.txt" >> "$dest" + rm "matrix.txt" + fi + + counter=0 for summary in "${summary_files[@]}"; do if [ -f "$summary" ]; then - cat "$summary" >> "~{input_id}_summary.txt" + awk -v var=",$counter" '{print $0 var}' "$summary" >> "~{input_id}_summary.txt" + let counter=counter+1 fi done + counter=0 for align_feature in "${align_features_files[@]}"; do if [ -f "$align_feature" ]; then - cat "$align_feature" >> "~{input_id}_align_features.txt" - fi - done - - for umipercell in "${umipercell_files[@]}"; do - if [ -f "$umipercell" ]; then - cat "$umipercell" >> "~{input_id}_umipercell.txt" + awk -v var="$counter" '{print $0 " " var}' "$align_feature" >> "~{input_id}_align_features.txt" + let counter=counter+1 fi done + # note that the counter might not correspond to the shard number, it is just the order of files in bash (e.g. 10 before 2) + counter=0 for umipercell in "${umipercell_files[@]}"; do if [ -f "$umipercell" ]; then - cat "$umipercell" >> "~{input_id}_umipercell.txt" + awk -v var="$counter" '{print $0, var}' "$umipercell" >> "~{input_id}_umipercell.txt" + let counter=counter+1 fi done - # If text files are present, create a tar archive with them + # If text files are present, create a tar archive with them and run python script to combine shard metrics if ls *.txt 1> /dev/null 2>&1; then + echo "listing files" + ls + python3 /warptools/scripts/combine_shard_metrics.py ~{input_id}_summary.txt ~{input_id}_align_features.txt ~{input_id}_cell_reads.txt ~{counting_mode} ~{input_id} tar -zcvf ~{input_id}.star_metrics.tar *.txt else echo "No text files found in the folder." 
fi # create the compressed raw count matrix with the counts, gene names and the barcodes - python3 /usr/gitc/create-merged-npz-output.py \ + python3 /warptools/scripts/create-merged-npz-output.py \ --barcodes ${barcodes_files[@]} \ --features ${features_files[@]} \ --matrix ${matrix_files[@]} \ @@ -534,6 +565,7 @@ task MergeStarOutput { File col_index = "~{input_id}_sparse_counts_col_index.npy" File sparse_counts = "~{input_id}_sparse_counts.npz" File? cell_reads_out = "~{input_id}.star_metrics.tar" + File? library_metrics="~{input_id}_library_metrics.csv" } } diff --git a/verification/VerifyNA12878.wdl b/verification/VerifyNA12878.wdl index 5bee339931..0a8e699ffa 100644 --- a/verification/VerifyNA12878.wdl +++ b/verification/VerifyNA12878.wdl @@ -80,7 +80,7 @@ task RunValidation { } runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0" memory: machine_mem + " MiB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" diff --git a/verification/VerifySlideSeq.wdl b/verification/VerifySlideSeq.wdl index e3a8e7e8e4..d20f991d08 100644 --- a/verification/VerifySlideSeq.wdl +++ b/verification/VerifySlideSeq.wdl @@ -5,8 +5,8 @@ import "../verification/VerifyTasks.wdl" as VerifyTasks workflow VerifySlideSeq { input { - File test_loom - File truth_loom + File test_h5ad + File truth_h5ad File test_bam File truth_bam @@ -48,10 +48,10 @@ workflow VerifySlideSeq { truth_zip = truth_umi_metrics } - call VerifyTasks.CompareLooms as CompareLooms{ + call VerifyTasks.CompareH5adFilesGEX as CompareH5adFilesOptimus { input: - test_loom = test_loom, - truth_loom = truth_loom + test_h5ad = test_h5ad, + truth_h5ad = truth_h5ad } } \ No newline at end of file diff --git a/verification/VerifyTasks.wdl b/verification/VerifyTasks.wdl index 355ed42cc4..2772b41aec 100644 --- a/verification/VerifyTasks.wdl +++ b/verification/VerifyTasks.wdl @@ -20,7 +20,7 @@ task CompareVcfs { runtime { docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4:latest" - disks: "local-disk 50 HDD" + disks: "local-disk 70 HDD" memory: "32 GiB" preemptible: 3 } @@ -115,7 +115,7 @@ task CompareTabix { exit_code=0 a=$(md5sum "~{test_fragment_file}" | awk '{ print $1 }') b=$(md5sum ~{truth_fragment_file} | awk '{ print $1 }') - if [[ $a = $b ]]; then + if [[ $a = $b ]]; then echo equal else echo different diff --git a/verification/test-wdls/TestMultiome.wdl b/verification/test-wdls/TestMultiome.wdl index bb9aff4018..9a4a0ec83a 100644 --- a/verification/test-wdls/TestMultiome.wdl +++ b/verification/test-wdls/TestMultiome.wdl @@ -18,7 +18,6 @@ workflow TestMultiome { Array[File]? gex_i1_fastq File tar_star_reference File annotations_gtf - File ref_genome_fasta File? mt_genes Int tenx_chemistry_version = 3 Int emptydrops_lower = 100 @@ -69,7 +68,6 @@ workflow TestMultiome { input_id = input_id, tar_star_reference = tar_star_reference, annotations_gtf = annotations_gtf, - ref_genome_fasta = ref_genome_fasta, mt_genes = mt_genes, tenx_chemistry_version = tenx_chemistry_version, emptydrops_lower = emptydrops_lower, diff --git a/verification/test-wdls/TestOptimus.wdl b/verification/test-wdls/TestOptimus.wdl index 535eb8d530..82bdf03adc 100644 --- a/verification/test-wdls/TestOptimus.wdl +++ b/verification/test-wdls/TestOptimus.wdl @@ -24,7 +24,6 @@ workflow TestOptimus { # organism reference parameters File tar_star_reference File annotations_gtf - File ref_genome_fasta File? mt_genes String? 
soloMultiMappers @@ -79,7 +78,6 @@ workflow TestOptimus { input_name_metadata_field = input_name_metadata_field, tar_star_reference = tar_star_reference, annotations_gtf = annotations_gtf, - ref_genome_fasta = ref_genome_fasta, tenx_chemistry_version = tenx_chemistry_version, emptydrops_lower = emptydrops_lower, force_no_check = force_no_check, diff --git a/verification/test-wdls/TestSlideSeq.wdl b/verification/test-wdls/TestSlideSeq.wdl index bb92bb610f..b63cd87099 100644 --- a/verification/test-wdls/TestSlideSeq.wdl +++ b/verification/test-wdls/TestSlideSeq.wdl @@ -57,7 +57,7 @@ workflow TestSlideSeq { SlideSeq.bam, ], # File? outputs - select_all([SlideSeq.loom_output_file]), + select_all([SlideSeq.h5ad_output_file]), ]) @@ -94,9 +94,9 @@ workflow TestSlideSeq { # This is achieved by passing each desired file/array[files] to GetValidationInputs if (!update_truth){ - call Utilities.GetValidationInputs as GetLoom { + call Utilities.GetValidationInputs as GetH5adInputs { input: - input_file = SlideSeq.loom_output_file, + input_file = SlideSeq.h5ad_output_file, results_path = results_path, truth_path = truth_path } @@ -127,8 +127,8 @@ workflow TestSlideSeq { call VerifySlideSeq.VerifySlideSeq as Verify { input: - truth_loom = GetLoom.truth_file, - test_loom = GetLoom.results_file, + truth_h5ad = GetH5adInputs.truth_file, + test_h5ad = GetH5adInputs.results_file, truth_bam = GetBam.truth_file, test_bam = GetBam.results_file, truth_gene_metrics = GetGeneMetrics.truth_file, diff --git a/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl b/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl index c3138ddb19..de9899439b 100644 --- a/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl +++ b/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl @@ -33,9 +33,8 @@ workflow TestUltimaGenomicsJointGenotyping { File runs_file Array[File] annotation_intervals String flow_order - String snp_annotations - String indel_annotations - Boolean use_allele_specific_annotations + Array[String] snp_annotations + Array[String] indel_annotations String model_backend Int? top_level_scatter_count Boolean? 
gather_vcfs @@ -83,7 +82,6 @@ workflow TestUltimaGenomicsJointGenotyping { flow_order = flow_order, snp_annotations = snp_annotations, indel_annotations = indel_annotations, - use_allele_specific_annotations = use_allele_specific_annotations, model_backend = model_backend, top_level_scatter_count = top_level_scatter_count, gather_vcfs = gather_vcfs, diff --git a/verification/test-wdls/TestsnM3C.wdl b/verification/test-wdls/TestsnM3C.wdl index 3ca01baf74..de2d5dab4b 100644 --- a/verification/test-wdls/TestsnM3C.wdl +++ b/verification/test-wdls/TestsnM3C.wdl @@ -35,6 +35,10 @@ workflow TestsnM3C { Boolean update_truth String vault_token_path String google_account_vault_path + + String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.3" + String single_end_hisat_cpu_platform = "Intel Ice Lake" + String merge_sort_analyze_cpu_platform = "Intel Ice Lake" } meta { @@ -60,7 +64,10 @@ workflow TestsnM3C { num_upstr_bases = num_upstr_bases, num_downstr_bases = num_downstr_bases, compress_level = compress_level, - batch_number = batch_number + batch_number = batch_number, + docker = docker, + single_end_hisat_cpu_platform = single_end_hisat_cpu_platform, + merge_sort_analyze_cpu_platform = merge_sort_analyze_cpu_platform } @@ -72,24 +79,13 @@ workflow TestsnM3C { ], # Array[File] outputs snM3C.reference_version, - snM3C.chromatin_contact_stats, - snM3C.unique_reads_cgn_extraction_tbi, snM3C.unique_reads_cgn_extraction_allc, - snM3C.dedup_unique_bam_and_index_unique_bam_tar, - snM3C.remove_overlap_read_parts_bam_tar, - snM3C.pos_sorted_bams, + snM3C.unique_reads_cgn_extraction_tbi, + snM3C.unique_reads_cgn_extraction_allc_extract, + snM3C.unique_reads_cgn_extraction_tbi_extract, snM3C.name_sorted_bams, - snM3C.merge_sorted_bam_tar, - snM3C.split_fq_tar, - snM3C.unmapped_fastq_tar, - snM3C.multi_bam_tar, - snM3C.unique_bam_tar, - snM3C.hisat3n_bam_tar, - snM3C.hisat3n_stats_tar, - snM3C.r2_trimmed_fq, - snM3C.r1_trimmed_fq, - snM3C.trimmed_stats, - + snM3C.all_reads_dedup_contacts, + snM3C.all_reads_3C_contacts, ]) diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index c5357613c2..4f0750f35d 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -8,7 +8,8 @@ slug: /Pipelines/ATAC/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [1.1.6](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [1.1.8](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | + ## Introduction to the ATAC workflow ATAC is an open-source, cloud-optimized pipeline developed in collaboration with members of the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and [BICAN](https://brainblog.nih.gov/brain-blog/brain-issues-suite-funding-opportunities-advance-brain-cell-atlases-through-centers) Sequencing Working Group) and [SCORCH](https://nida.nih.gov/about-nida/organization/divisions/division-neuroscience-behavior-dnb/basic-research-hiv-substance-use-disorder/scorch-program) (see [Acknowledgements](#acknowledgements) below). 
It supports the processing of 10x single-nucleus data generated with 10x Multiome [ATAC-seq (Assay for Transposase-Accessible Chromatin)](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression), a technique used in molecular biology to assess genome-wide chromatin accessibility. @@ -95,9 +96,16 @@ To see specific tool parameters, select the task WDL link in the table; then vie All ATAC pipeline releases are documented in the [ATAC changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/multiome/atac.changelog.md) and tested using [plumbing and scientific test data](https://github.com/broadinstitute/warp/tree/master/pipelines/skylab/multiome/test_inputs). To learn more about WARP pipeline testing, see [Testing Pipelines](https://broadinstitute.github.io/warp/docs/About_WARP/TestingPipelines). ## Citing the ATAC Pipeline -Please identify the pipeline in your methods section using the ATAC Pipeline's [SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_024656/resolver?q=SCR_024656%2A&l=SCR_024656%2A&i=rrid:scr_024656). + +If you use the ATAC Pipeline in your research, please identify the pipeline in your methods section using the [ATAC SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_024656/resolver?q=SCR_024656%2A&l=SCR_024656%2A&i=rrid:scr_024656). + * Ex: *ATAC Pipeline (RRID:SCR_024656)* +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + + ## Acknowledgements We are immensely grateful to the members of the BRAIN Initiative (BICAN Sequencing Working Group) and SCORCH for their invaluable and exceptional contributions to this pipeline. Our heartfelt appreciation goes to Alex Dobin, Aparna Bhaduri, Alec Wysoker, Anish Chakka, Brian Herb, Daofeng Li, Fenna Krienen, Guo-Long Zuo, Jeff Goldy, Kai Zhang, Khalid Shakir, Bo Li, Mariano Gabitto, Michael DeBerardine, Mengyi Song, Melissa Goldman, Nelson Johansen, James Nemesh, and Theresa Hodges for their unwavering dedication and remarkable efforts. diff --git a/website/docs/Pipelines/BuildIndices_Pipeline/README.md b/website/docs/Pipelines/BuildIndices_Pipeline/README.md index fc328379aa..0d0431edc4 100644 --- a/website/docs/Pipelines/BuildIndices_Pipeline/README.md +++ b/website/docs/Pipelines/BuildIndices_Pipeline/README.md @@ -112,6 +112,12 @@ The following table lists the output variables and files produced by the pipelin All BuildIndices pipeline releases are documented in the [BuildIndices changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/build_indices/BuildIndices.changelog.md) and tested manually using [reference JSON files](https://github.com/broadinstitute/warp/tree/master/pipelines/skylab/build_indices). +## Citing the BuildIndices Pipeline + +If you use the BuildIndices Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. 
WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Consortia support This pipeline is supported by the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and BICAN). diff --git a/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/CEMBA.methods.md b/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/CEMBA.methods.md index 13ede6b7c9..97c7ef4c94 100644 --- a/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/CEMBA.methods.md +++ b/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/CEMBA.methods.md @@ -2,7 +2,7 @@ sidebar_position: 2 --- -# CEMBA_v1.1.5 Publication Methods +# CEMBA_v1.1.6 Publication Methods Below we provide a sample methods section for a publication. For the complete pipeline documentation, see the [CEMBA README](./README.md). @@ -20,7 +20,7 @@ The trimmed R1 and R2 reads were then aligned to mouse (mm10) or human (hg19) ge After alignment, the output R1 and R2 BAMs were sorted in coordinate order and duplicates removed using the Picard MarkDuplicates REMOVE_DUPLICATE option. Samtools 1.9 was used to further filter BAMs with a minimum map quality of 30 using the parameter `-bhq 30`. -Methylation reports were produced for the filtered BAMs using Bismark. The barcodes from the R1 uBAM were then attached to the aligned, filtered R1 BAM with Picard. The R1 and R2 BAMs were merged with Samtools. Readnames were added to the merged BAM and a methylated VCF created using MethylationTypeCaller in GATK 4.3.0.0. The VCF was then converted to an additional ALLC file using a custom python script. +Methylation reports were produced for the filtered BAMs using Bismark. The barcodes from the R1 uBAM were then attached to the aligned, filtered R1 BAM with Picard. The R1 and R2 BAMs were merged with Samtools. Readnames were added to the merged BAM and a methylated VCF created using MethylationTypeCaller in GATK 4.5.0.0. The VCF was then converted to an additional ALLC file using a custom python script. Samtools was then used to calculate coverage depth for sites with coverage greater than 1 and to create BAM index files. The final outputs included the barcoded aligned BAM, BAM index, a VCF with locus-specific methylation information, VCF index, ALLC file, and methylation reports. diff --git a/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/README.md b/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/README.md index d38b188f5c..5dac529f8d 100644 --- a/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/README.md +++ b/website/docs/Pipelines/CEMBA_MethylC_Seq_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/CEMBA_MethylC_Seq_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [CEMBA_v1.1.5](https://github.com/broadinstitute/warp/releases) | December, 2023 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [CEMBA_v1.1.6](https://github.com/broadinstitute/warp/releases) | December, 2023 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![CEMBA](./CEMBA.png) @@ -28,7 +28,7 @@ Interested in using the pipeline for your publication? 
See the [“CEMBA publica | Workflow Language | WDL 1.0 | [openWDL](https://github.com/openwdl/wdl) | | Genomic Reference Sequence| GRCH38 and GRCM38 | [GENCODE](https://www.gencodegenes.org/) | | Aligner | BISMARK v0.21.0 with --bowtie2 | [Bismark](https://www.bioinformatics.babraham.ac.uk/projects/bismark/) | -| Variant Caller | GATK 4.3.0.0 | [GATK 4.3.0.0](https://gatk.broadinstitute.org/hc/en-us) +| Variant Caller | GATK 4.5.0.0 | [GATK 4.5.0.0](https://gatk.broadinstitute.org/hc/en-us) | Data Input File Format | File format in which sequencing data is provided | [Zipped FASTQs (.fastq.gz)](https://support.illumina.com/bulletins/2016/04/fastq-files-explained.html) | | Data Output File Format | File formats in which CEMBA output is provided | [BAM](http://samtools.github.io/hts-specs/), [VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf), [ALLC](https://github.com/yupenghe/methylpy#output-format) | @@ -105,10 +105,10 @@ The table and summary sections below detail the tasks and tools of the CEMBA pip | GetMethylationReport | [Bismark v0.21.0](https://www.bioinformatics.babraham.ac.uk/projects/bismark/) | Produce methylation report for reads above map quality and below map quality | quay.io/broadinstitute/bismark:0.21.0 | | AttachBarcodes | [Picard v2.26.10](https://broadinstitute.github.io/picard/) | Add barcodes from the tagged uBAM to the aligned BAM | us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10 | | MergeBams | [Samtools v.19](http://www.htslib.org/) | Merge R1 and R2 BAM files into single BAM | quay.io/broadinstitute/samtools:1.9 | -| AddReadGroup | [GATK v4.3.0.0](https://gatk.broadinstitute.org/hc/en-us) | Add read groups to the merged BAM | us.gcr.io/broad-gatk/gatk:4.3.0.0 | +| AddReadGroup | [GATK v4.5.0.0](https://gatk.broadinstitute.org/hc/en-us) | Add read groups to the merged BAM | us.gcr.io/broad-gatk/gatk:4.5.0.0 | | Sort | [Picard v2.26.10](https://broadinstitute.github.io/picard/) | Sort in coordinate order after adding read group | us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10 | | IndexBam | [Samtools v1.9](http://www.htslib.org/) | Index the output BAM | quay.io/broadinstitute/samtools:1.9 | -| MethylationTypeCaller | [GATK v4.3.0.0](https://gatk.broadinstitute.org/hc/en-us) | Produce a VCF with locus-specific methylation information | us.gcr.io/broad-gatk/gatk:4.3.0.0 | +| MethylationTypeCaller | [GATK v4.5.0.0](https://gatk.broadinstitute.org/hc/en-us) | Produce a VCF with locus-specific methylation information | us.gcr.io/broad-gatk/gatk:4.5.0.0 | | VCFtoALLC | Python | Creates an [ALLC](https://github.com/yupenghe/methylpy#output-format) file from the VCF produced with MethylationTypeCaller | quay.io/cemba/vcftoallc:v0.0.1 | | ComputeCoverageDepth | [Samtools v1.9](http://www.htslib.org/) | Compute number of sites with coverage greater than 1 | quay.io/broadinstitute/samtools:1.9 | @@ -178,9 +178,15 @@ The table below details the pipeline outputs. **If using multiplexed samples, th All CEMBA pipeline releases are documented in the [CEMBA changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/cemba/cemba_methylcseq/CEMBA.changelog.md). ## Citing the CEMBA Pipeline -Please identify the pipeline in your methods section using the CEMBA Pipeline's [SciCrunch resource identifier](https://scicrunch.org/scicrunch/Resources/record/nlx_144509-1/SCR_021219/resolver?q=CEMBA&l=CEMBA). 
+ +If you use the CEMBA Pipeline in your research, please identify the pipeline in your methods section using the [CEMBA SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_021219/resolver?q=SCR_021219&l=SCR_021219&i=rrid:scr_021219). + * Ex: *CEMBA MethylC Seq Pipeline (RRID:SCR_021219)* +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Consortia Support This pipeline is supported and used by the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN). diff --git a/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/README.md b/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/README.md index 2165bd249d..10582bdb6d 100644 --- a/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Exome_Germline_Single_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [ExomeGermlineSingleSample_v3.1.16](https://github.com/broadinstitute/warp/releases?q=ExomeGermlineSingleSample_v3.0.0&expanded=true) | December, 2023 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [ExomeGermlineSingleSample_v3.1.18](https://github.com/broadinstitute/warp/releases?q=ExomeGermlineSingleSample_v3.0.0&expanded=true) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | The Exome Germline Single Sample pipeline implements data pre-processing and initial variant calling according to the GATK Best Practices for germline SNP and Indel discovery in human exome sequencing data. @@ -27,7 +27,7 @@ The Exome Germline Single Sample workflow is written in the Workflow Description ### Software Version Requirements -* [GATK 4.3.0.0](https://github.com/broadinstitute/gatk/releases/tag/4.3.0.0) +* [GATK 4.5.0.0](https://github.com/broadinstitute/gatk/releases/tag/4.5.0.0) * Picard 2.26.10 * Samtools 1.11 * Python 3.0 @@ -130,6 +130,12 @@ view the following tutorial [(How to) Execute Workflows from the gatk-workflows - Please visit the [GATK Technical Documentation](https://gatk.broadinstitute.org/hc/en-us/categories/360002310591) site for further documentation on our workflows and tools. - You can access relevant reference and resource bundles in the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811). +## Citing the Exome Germline Single Sample Pipeline + +If you use the Exome Germline Single Sample Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. 
https://doi.org/10.20944/preprints202401.2131.v1 + ## Contact Us This material is provided by the Data Science Platform group at the Broad Institute. Please direct any questions or concerns to one of our forum sites : [GATK](https://gatk.broadinstitute.org/hc/en-us/community/topics) or [Terra](https://support.terra.bio/hc/en-us/community/topics/360000500432). diff --git a/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/exome.methods.md b/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/exome.methods.md index a66740f22a..28b0ee4fcd 100644 --- a/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/exome.methods.md +++ b/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/exome.methods.md @@ -2,13 +2,13 @@ sidebar_position: 2 --- -# Exome Germline Single Sample v3.0.0 Methods +# Exome Germline Single Sample v3.1.18 Methods The following contains a detailed methods description outlining the pipeline’s process, software, and tools that can be modified for a publication methods section. ## Detailed Methods -Preprocessing and variant calling was performed using the ExomeGermlineSingleSample 3.0.0 pipeline using Picard 2.23.8, GATK 4.2.2.0, and Samtools 1.11 with default tool parameters unless otherwise specified. All reference files are available in the public [Broad References Google Bucket](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0). The pipeline follows GATK Best Practices as previously described ([Van der Auwera & O'Connor, 2020](https://www.oreilly.com/library/view/genomics-in-the/9781491975183/)) as well as the Functional Equivalence specification ([Regier et al., 2018](https://www.nature.com/articles/s41467-018-06159-4)). +Preprocessing and variant calling were performed using the ExomeGermlineSingleSample 3.1.18 pipeline with Picard 2.26.10, GATK 4.5.0.0, and Samtools 1.11, applying default tool parameters unless otherwise specified. All reference files are available in the public [Broad References Google Bucket](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0). The pipeline follows GATK Best Practices as previously described ([Van der Auwera & O'Connor, 2020](https://www.oreilly.com/library/view/genomics-in-the/9781491975183/)) as well as the Functional Equivalence specification ([Regier et al., 2018](https://www.nature.com/articles/s41467-018-06159-4)). ### Pre-processing and QC @@ -31,4 +31,5 @@ Prior to variant calling, the variant calling interval list was split to enable The pipeline’s final outputs included metrics, the ValidateSamFile validation reports, an aligned CRAM with index, and a reblocked GVCF containing variant calls with an accompanying index.
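For readers adapting this methods text, the scattered GVCF-mode HaplotypeCaller step it summarizes can be sketched as a WDL task. This is a minimal illustration under stated assumptions — the task name, input names, memory figure, and disk heuristic are hypothetical — with only the GATK flags and the pinned 4.5.0.0 image taken from the document:

```wdl
version 1.0

# Minimal sketch of one shard of GVCF-mode variant calling, as described in the
# methods above. Names and resource figures are illustrative, not the
# pipeline's actual task.
task HaplotypeCallerGvcfSketch {
  input {
    File input_bam
    File input_bam_index
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File interval_list   # one shard of the split calling interval list
    String vcf_basename
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
  }

  Int disk_size = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB")) + 20

  command <<<
    set -euo pipefail
    gatk --java-options "-Xmx4g" HaplotypeCaller \
      -R ~{ref_fasta} \
      -I ~{input_bam} \
      -L ~{interval_list} \
      -O ~{vcf_basename}.g.vcf.gz \
      -ERC GVCF
  >>>

  runtime {
    docker: gatk_docker
    memory: "6 GiB"
    disks: "local-disk " + disk_size + " HDD"
  }

  output {
    File output_gvcf = "~{vcf_basename}.g.vcf.gz"
    File output_gvcf_index = "~{vcf_basename}.g.vcf.gz.tbi"
  }
}
```

The per-shard GVCFs would then be gathered and reblocked to yield the final GVCF and index listed above.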
## Previous methods documents +- [ExomeGermlineSingleSample_v3.0.0](https://github.com/broadinstitute/warp/blob/ExomeGermlineSingleSample_v3.0.0/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/exome.methods.md) - [ExomeGermlineSingleSample_v2.4.4](https://github.com/broadinstitute/warp/blob/ExomeGermlineSingleSample_v2.6.0/website/docs/Pipelines/Exome_Germline_Single_Sample_Pipeline/exome.methods.md) \ No newline at end of file diff --git a/website/docs/Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/README.md b/website/docs/Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/README.md index cef49ec424..d3151c4060 100644 --- a/website/docs/Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/README.md +++ b/website/docs/Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [GDCWholeGenomeSomaticSingleSample_v1.0.1](https://github.com/broadinstitute/warp/releases) | January, 2021 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [GDCWholeGenomeSomaticSingleSample_v1.3.1](https://github.com/broadinstitute/warp/releases) | January, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the GDC Whole Genome Somatic Single Sample pipeline @@ -29,8 +29,8 @@ For the latest workflow version and release notes, please see the [changelog](ht ### Software version requirements -* GATK 4.0.7 -* Picard 2.18.11 (Custom Docker is used to run software on Cromwell 52) +* GATK 4.5.0.0 +* Picard 2.26.10 * Samtools 1.11 * Python 3.0 * Cromwell version support @@ -114,6 +114,12 @@ Alternatively, Cromwell allows you to specify an output directory using an optio - Runtime parameters are optimized for Broad's Google Cloud Platform implementation. - Please visit the [GATK Technical Documentation](https://gatk.broadinstitute.org/hc/en-us/categories/360002310591) site for further documentation on GATK-related workflows and tools. +## Citing the GDC Pipeline + +If you use the GDC Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Contact us Please help us make our tools better by contacting [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) for pipeline-related suggestions or questions. 
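The GATK bumps in the task files above all follow one convention: the image is not hard-coded in the runtime section but exposed as a String input whose default is the pinned release, so an inputs JSON can override it without touching the WDL. A minimal sketch of that convention — the task name and the CountVariants stand-in are hypothetical; only the image string and runtime shape mirror the document:

```wdl
version 1.0

# Sketch of the docker-pinning convention used throughout these tasks: the
# image is an overridable input defaulting to the pinned GATK release. The
# task body (a simple variant count) is a hypothetical stand-in.
task CountVariantsSketch {
  input {
    File input_vcf
    File input_vcf_index
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"  # override per run if needed
    Int disk_size_gb = ceil(2 * size(input_vcf, "GiB")) + 20
  }

  command <<<
    set -euo pipefail
    gatk CountVariants -V ~{input_vcf} > variant_count.txt
  >>>

  runtime {
    docker: gatk_docker
    memory: "3500 MiB"
    disks: "local-disk " + disk_size_gb + " HDD"
  }

  output {
    File variant_count = "variant_count.txt"
  }
}
```

A single inputs-JSON entry (for example, a hypothetical `"MyWorkflow.MyTask.gatk_docker"` key) can then re-pin the image per run, which is exactly how the test-input files in this diff set it.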
diff --git a/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/Illumina_genotyping_array_spec.md b/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/Illumina_genotyping_array_spec.md index 6d105fe77c..566e3722b7 100644 --- a/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/Illumina_genotyping_array_spec.md +++ b/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/Illumina_genotyping_array_spec.md @@ -4,9 +4,9 @@ sidebar_position: 2 # VCF Overview: Illumina Genotyping Array -The [Illumina Genotyping Array Pipeline](https://github.com/broadinstitute/warp/blob/develop/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl) v1.11.0 pipeline produces a VCF (Variant Call Format) output with data processing and sample-specific genotype information. The VCF follows the format listed in the [VCF 4.2 specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf), but additionally contains fields and attributes that are unique to the Arrays pipeline. +The [Illumina Genotyping Array Pipeline](https://github.com/broadinstitute/warp/blob/develop/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl) v1.12.16 pipeline produces a VCF (Variant Call Format) output with data processing and sample-specific genotype information. The VCF follows the format listed in the [VCF 4.2 specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf), but additionally contains fields and attributes that are unique to the Arrays pipeline. -This document describes the Array pipeline’s unique VCF fields and attributes that are not listed in the standard VCF specification. To learn more about the pipeline, see the [Illumina Genotyping Array Pipeline Overview](./IlluminaGenotypingArray.documentation.md). +This document describes the Array pipeline’s unique VCF fields and attributes that are not listed in the standard VCF specification. To learn more about the pipeline, see the [Illumina Genotyping Array Pipeline Overview](./README.md). :::tip How do I view a VCF file? @@ -26,7 +26,7 @@ Each VCF has meta information fields with attributes that generally describe the - extendedIlluminaManifestVersion - Version of the ‘extended Illumina manifest’ used by the VCF - generation software. 
- extendedManifestFile - File name of the ‘extended Illumina manifest’ used by the VCF generation software - fingerprintGender - Gender (sex) determined using an orthogonal fingerprinting technology, populated by an optional parameter used by the VCF generation software -- gtcCallRate - GTC call rate of the sample processed that is generated by the autocall/gencall software and represents the fraction of callable loci that had valid calls +- gtcCallRate - GTC call rate of the sample processed that is generated by the autocall/gencall software and represents the fraction of callable loci that had valid calls; ignores zeroed-out SNPs - imagingDate - Creation date for the chip well barcode IDATs (raw image scans) - manifestFile - Name of the Illumina manifest (.bpm) file used by the VCF generation software - sampleAlias - Sample name @@ -112,4 +112,4 @@ The remaining attributes describe the cluster definitions provided in the cluste - meanX_BB - Mean of normalized X for BB cluster - meanY_AA - Mean of normalized Y for AA cluster - meanY_AB - Mean of normalized Y for AB cluster -- meanY_BB - Mean of normalized Y for BB cluster +- meanY_BB - Mean of normalized Y for BB cluster \ No newline at end of file diff --git a/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/README.md b/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/README.md index 0cebb97fea..c5127827d0 100644 --- a/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/README.md +++ b/website/docs/Pipelines/Illumina_Genotyping_Arrays_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Illumina_Genotyping_Arrays_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Version 1.11.6](https://github.com/broadinstitute/warp/releases) | October, 2021 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [Version 1.12.16](https://github.com/broadinstitute/warp/releases) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![The Illumina Genotyping Array Pipeline](./IlluminaGenotyping.png) @@ -121,7 +121,7 @@ The following table provides a summary of the WDL tasks and software tools calle | SubsetArrayVCF | [SubsetArrayVCF](https://gatk.broadinstitute.org/hc/en-us/articles/360036362532) | GATK | | CollectArraysVariantCallingMetrics | [CollectArraysVariantCallingMetrics](https://gatk.broadinstitute.org/hc/en-us/articles/360037593871) | Picard | | SelectVariants | [SelectVariants](https://gatk.broadinstitute.org/hc/en-us/articles/360036362532) | GATK | -| CheckFingerprint | [CheckFingerprint](https://gatk.broadinstitute.org/hc/en-us/articles/360036358752) | Picard | +| CheckFingerprintTask | [CheckFingerprint](https://gatk.broadinstitute.org/hc/en-us/articles/360036358752) | Picard | | VcfToIntervalList | [VcfToIntervalList](https://gatk.broadinstitute.org/hc/en-us/articles/360036897672) | Picard | | GenotypeConcordance | [GenotypeConcordance](https://gatk.broadinstitute.org/hc/en-us/articles/360036348932) | Picard | @@ -176,7 +176,7 @@ DNA fingerprinting helps maintain sample identity and avoid sample swaps. The Il #### 6. 
Evaluating an existing fingerprint (optional) -If the genotyping sample already has a corresponding fingerprint VCF file, the workflow can also optionally check the existing fingerprint to confirm sample identity. It uses the CheckFingerPrints task to calculate genotype concordance between the workflow’s genotyping output VCF (final_output_vcf) and the known genotype specified in a fingerprint_genotypes_vcf_file. The workflow returns a boolean for if the sample genotype failed concordance, as well as a Logarithm of Odds (LOD) score for concordance. +If the genotyping sample already has a corresponding fingerprint VCF file, the workflow can also optionally check the existing fingerprint to confirm sample identity. It uses the CheckFingerprintTask task to calculate genotype concordance between the workflow’s genotyping output VCF (final_output_vcf) and the known genotype specified in a fingerprint_genotypes_vcf_file. The workflow returns a boolean indicating whether the sample genotype failed concordance, as well as a Logarithm of Odds (LOD) score for concordance. #### 7. Genotype concordance (optional) @@ -237,6 +237,12 @@ All Illumina Genotyping Array workflow releases are documented in the [workflow The Illumina Genotyping Array Pipeline is available on the cloud-based platform [Terra](https://app.terra.bio). If you have a Terra account, you can access the Featured Workspace using this address: `https://app.terra.bio/#workspaces/warp-pipelines/Illumina-Genotyping-Array`. The workspace is preloaded with instructions and sample data. For more information on using the Terra platform, please view the [Support Center](https://support.terra.bio/hc/en-us). +## Citing the Illumina Genotyping Array Pipeline + +If you use the Illumina Genotyping Array Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Feedback and questions Please help us make our tools better by contacting [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) for pipeline-related suggestions or questions.
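As background for the fingerprinting outputs described in step 6 above: the LOD reported by Picard-style fingerprint checks is a log10 likelihood ratio. The formulation below is the standard one for this family of tools and is included here for orientation only; it is not taken from the pipeline source.

```latex
% Log-odds that two genotype sets derive from the same individual
% (standard fingerprinting formulation; illustrative, not from the pipeline source)
\mathrm{LOD} = \log_{10}
  \frac{P(\text{observed genotypes} \mid \text{same individual})}
       {P(\text{observed genotypes} \mid \text{different individuals})}
```

A strongly positive LOD supports a sample match, a strongly negative LOD suggests a swap, and values near zero are inconclusive.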
diff --git a/website/docs/Pipelines/Imputation_Pipeline/README.md b/website/docs/Pipelines/Imputation_Pipeline/README.md index 8d82efbc58..4c8faa68cc 100644 --- a/website/docs/Pipelines/Imputation_Pipeline/README.md +++ b/website/docs/Pipelines/Imputation_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Imputation_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Imputation_v1.0.0](https://github.com/broadinstitute/warp/releases?q=Imputation_v1.0.0&expanded=true) | August, 2021 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [Imputation_v1.1.12](https://github.com/broadinstitute/warp/releases?q=Imputation_v1.0.0&expanded=true) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the Imputation pipeline The Imputation pipeline imputes missing genotypes from either a multi-sample VCF or an array of single sample VCFs using a large genomic reference panel. It is based on the [Michigan Imputation Server pipeline](https://imputationserver.readthedocs.io/en/latest/pipeline/). Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. It outputs the imputed VCF along with key imputation metrics. @@ -54,7 +54,7 @@ For examples of how to specify each input in a configuration file, as well as cl | genetics_maps_eagle | Genetic map file for phasing.| File | | output_callset_name | Output callset name. | String | | split_output_to_single_sample | Boolean to split out the final combined VCF to individual sample VCFs; set to false by default. | Boolean | -| merge_ssvcf_mem_gb | Memory allocation for MergeSingleSampleVcfs (in GB). | Int | +| merge_ssvcf_mem_mb | Optional integer specifying memory allocation for MergeSingleSampleVcfs (in MB); default is 3000. | Int | | frac_well_imputed_threshold | Threshold for the fraction of well-imputed sites; default set to 0.9. | Float | | chunks_fail_threshold | Maximum threshold for the number of chunks allowed to fail; default set to 1. | Float | | vcf_suffix | File extension used for the VCF in the reference panel. | String | @@ -138,6 +138,12 @@ The pipeline is cost-optimized for between 100 and 1,000 samples, where the cost | 100 | 0.11 | | 1000 | 0.024 | | 13.5 K | 0.025 | + +## Citing the Imputation Pipeline + +If you use the Imputation Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. 
https://doi.org/10.20944/preprints202401.2131.v1 ## Contact us diff --git a/website/docs/Pipelines/JointGenotyping/README.md b/website/docs/Pipelines/JointGenotyping/README.md new file mode 100644 index 0000000000..6b8aa181ed --- /dev/null +++ b/website/docs/Pipelines/JointGenotyping/README.md @@ -0,0 +1,249 @@ +--- +sidebar_position: 1 +slug: /Pipelines/JointGenotyping_Pipeline/README +--- + +# JointGenotyping Overview + +| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | +| :----: | :---: | :----: | :--------------: | +| [JointGenotyping_v1.6.10](https://github.com/broadinstitute/warp/releases) | February, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please file GitHub issues in WARP or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | + +## Introduction to the JointGenotyping workflow + +The [JointGenotyping workflow](https://github.com/broadinstitute/warp/blob/master/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl) is an open-source, cloud-optimized pipeline that implements joint variant calling, filtering, and (optional) fingerprinting. + +The pipeline can be configured to run using one of the following GATK joint genotyping methods: + +* **[GenotypeGVCFs](https://gatk.broadinstitute.org/hc/en-us/articles/21905118377755)** (default method) performs joint genotyping on GVCF files stored in GenomicsDB and pre-called with HaplotypeCaller. +* **[GnarlyGenotyper](https://gatk.broadinstitute.org/hc/en-us/articles/21904951112091)** performs scalable, “quick and dirty” joint genotyping on a set of GVCF files stored in GenomicsDB and pre-called with HaplotypeCaller. + +The pipeline can be configured to run using one of the following GATK variant filtering techniques: + +* **[Variant Quality Score Recalibration (VQSR)](https://gatk.broadinstitute.org/hc/en-us/articles/360035531612)** (default method) uses the VariantRecalibrator and ApplyVQSR tools to filter variants according to [GATK Best Practices](https://gatk.broadinstitute.org/hc/en-us/articles/360035535932). +* **Variant Extract-Train-Score (VETS)** uses the ExtractVariantAnnotations, TrainVariantAnnotationsModel, and ScoreVariantAnnotations tools called in the [VETS subworkflow](https://github.com/broadinstitute/gatk/blob/master/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl) to score variant annotations. + +The pipeline takes in a sample map file listing GVCF files produced by HaplotypeCaller in GVCF mode and produces a filtered VCF file (with index) containing genotypes for all samples present in the input VCF files. All sites that are present in the input VCF file are retained. Filtered sites are annotated as such in the FILTER field. If you are new to VCF files, see the [file type specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf). + +The JointGenotyping pipeline can be adapted to run on Microsoft Azure instead of Google Cloud. For more information, see the [azure-warp-joint-calling GitHub repository](https://github.com/broadinstitute/azure-warp-joint-calling). + +## Set-up + +### JointGenotyping Installation and Requirements + +To download the latest JointGenotyping release, see the release tags prefixed with "JointGenotyping" on the WARP [releases page](https://github.com/broadinstitute/warp/releases). All JointGenotyping pipeline releases are documented in the [JointGenotyping changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md). 
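As a quick orientation before the input tables below: the workflow is configured through a JSON file of `JointGenotyping.<input_name>` keys (the usual Cromwell convention), and the sample map it consumes is a file mapping each sample name to the cloud location of its GVCF (typically tab-separated, following GATK GenomicsDBImport's sample-name-map format). The fragment below is a hypothetical sketch showing only a few of the required inputs; the bucket paths and callset name are placeholders, so see the [test_inputs](https://github.com/broadinstitute/warp/tree/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs) folder for complete, working configurations.

```json
{
  "JointGenotyping.callset_name": "example_cohort",
  "JointGenotyping.sample_name_map": "gs://example-bucket/example_cohort.sample_map",
  "JointGenotyping.unpadded_intervals_file": "gs://example-bucket/intervals/calling_regions.interval_list",
  "JointGenotyping.ref_fasta": "gs://example-bucket/references/Homo_sapiens_assembly38.fasta",
  "JointGenotyping.ref_fasta_index": "gs://example-bucket/references/Homo_sapiens_assembly38.fasta.fai",
  "JointGenotyping.ref_dict": "gs://example-bucket/references/Homo_sapiens_assembly38.dict"
}
```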
+ +To search releases of this and other pipelines, use the WARP command-line tool [Wreleaser](https://github.com/broadinstitute/warp/tree/master/wreleaser). + +If you’re running a JointGenotyping workflow version prior to the latest release, the accompanying documentation for that release may be downloaded with the source code on the WARP [releases page](https://github.com/broadinstitute/warp/releases) (see the folder `website/docs/Pipelines/JointGenotyping`). + +The JointGenotyping pipeline can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/), a GA4GH-compliant, flexible workflow management system that supports multiple computing platforms. The workflow can also be run in [Terra](https://app.terra.bio), a cloud-based analysis platform. The Terra [Whole-Genome-Analysis-Pipeline](https://app.terra.bio/#workspaces/warp-pipelines/Whole-Genome-Analysis-Pipeline) and [Exome-Analysis-Pipeline](https://app.terra.bio/#workspaces/warp-pipelines/Exome-Analysis-Pipeline) workspaces contain the JointGenotyping pipeline; workflows for preprocessing, initial variant calling, and sample map generation; workflow configurations; required reference data and other inputs; and example testing data. + +### Inputs + +The JointGenotyping workflow inputs are specified in JSON configuration files. Example configuration files can be found in the [test_inputs](https://github.com/broadinstitute/warp/tree/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs) folder in the WARP repository. + +#### Default joint calling input descriptions + +The table below describes the pipeline inputs that apply when the pipeline is run with default parameters and uses GenotypeGVCFs for joint calling and VQSR for variant filtering: + +| Parameter name | Description | Type | +| --- | --- | --- | +| unpadded_intervals_file | Describes the intervals for which VCF output will be written; exome data will have different captures/targets. | File | +| callset_name | Identifier for the group of VCF files used for joint calling. | String | +| sample_name_map | Path to file containing the sample names and the cloud location of the individual GVCF files. | String | +| ref_fasta | Reference FASTA file used for joint calling; must agree with reference for `unpadded_intervals_file`. | File | +| ref_fasta_index | Index for reference FASTA file used for joint calling; must agree with reference for `unpadded_intervals_file`. | File | +| ref_dict | Reference dictionary file used for joint calling; must agree with reference for `unpadded_intervals_file`. | File | +| dbsnp_vcf | Resource VCF file containing common SNPs and indels used for annotating the VCF file after joint calling. | File | +| dbsnp_vcf_index | Index for `dbsnp_vcf`. | File | +| snp_recalibration_tranche_values | Set of sensitivity levels used when running the pipeline using VQSR; value should match estimated sensitivity of truth resource passed as `hapmap_resource_vcf` to the [SNPsVariantRecalibratorCreateModel](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) and [SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) tasks; filter cutoff based on sensitivity to common variants (more sensitivity = more false positives); required when `run_vets` is “false”.
| Array[String] | +| snp_recalibration_annotation_values | Features used for filtering model (annotations in VCF file); all allele-specific versions. | Array[String] | +| indel_recalibration_tranche_values | Set of sensitivity levels used when running the pipeline using VQSR; value should match estimated sensitivity of truth resource passed as `mills_resource_vcf` to the [IndelsVariantRecalibrator](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task; filter cutoff based on sensitivity to common variants (more sensitivity = more false positives); required when `run_vets` is “false”. | Array[String] | +| indel_recalibration_annotation_values | Features used for filtering model when running the pipeline using VQSR; required when `run_vets` is “false”. | Array[String] | +| eval_interval_list | Subset of the unpadded intervals file used for metrics. | File | +| hapmap_resource_vcf | Used for SNP variant recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| hapmap_resource_vcf_index | Used for SNP variant recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| omni_resource_vcf | Used for SNP recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| omni_resource_vcf_index | Used for SNP recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| one_thousand_genomes_resource_vcf | Used for SNP recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| one_thousand_genomes_resource_vcf_index | Used for SNP recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| mills_resource_vcf | Used for indel variant recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| mills_resource_vcf_index | Used for indel variant recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| axiomPoly_resource_vcf | Used for indel variant recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| axiomPoly_resource_vcf_index | Used for indel variant recalibration; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| dbsnp_resource_vcf | Optional file used for SNP/indel variant recalibration; set to `dbsnp_vcf` by default; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| dbsnp_resource_vcf_index | Optional file used for SNP/indel variant recalibration; set to `dbsnp_vcf_index` by default; see the [GATK Resource Bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811) for more information. | File | +| excess_het_threshold | Optional float used for hard filtering joint calls; phred-scaled p-value; set to `54.69` by default to cut off quality scores greater than a z-score of -4.5 (p-value of 3.4e-06). 
| Float | +| vqsr_snp_filter_level | Used for applying the recalibration model when running the pipeline using VQSR; required when `run_vets` is “false”. | Float | +| vqsr_indel_filter_level | Used for applying the recalibration model when running the pipeline using VQSR; required when `run_vets` is “false”. | Float | +| snp_vqsr_downsampleFactor | The downsample factor used for SNP variant recalibration if the number of GVCF files is greater than the `snps_variant_recalibration_threshold` when running the pipeline using VQSR; required when `run_vets` is “false”. | Int | +| top_level_scatter_count | Optional integer used to determine how many files the input interval list should be split into; default will split the interval list into 2 files. | Int | +| gather_vcfs | Optional boolean; “true” is used for small callsets containing fewer than 100,000 GVCF files. | Boolean | +| snps_variant_recalibration_threshold | Optional integer that sets the threshold for the number of callset VCF files used to perform recalibration on a single file; if the number of VCF files exceeds the threshold, variants will be downsampled to enable parallelization; default is “500000”. | Int | +| rename_gvcf_samples | Optional boolean describing whether GVCF samples should be renamed; default is “true”. | Boolean | +| unbounded_scatter_count_scale_factor | Optional float used to scale the scatter count when `top_level_scatter_count` is not provided as input; default is “0.15”. | Float | +| use_allele_specific_annotations | Optional boolean used for SNP and indel variant recalibration when running the pipeline using VQSR; set to “true” by default. | Boolean | + + +#### GnarlyGenotyper joint calling input descriptions + +The table below describes the additional pipeline inputs that apply when the pipeline is run with GnarlyGenotyper for joint calling: + +| Parameter name | Description | Type | +| --- | --- | --- | +| gnarly_scatter_count | Optional integer used to determine how many files to split the interval list into when using GnarlyGenotyper; default is “10”. | Int | +| use_gnarly_genotyper | Optional boolean describing whether GnarlyGenotyper should be used; default is “false”. | Boolean | + + +#### VETS variant filtering input descriptions + +The table below describes the additional pipeline inputs that apply when the pipeline is run with VETS for variant filtering: + +| Parameter name | Description | Type | +| --- | --- | --- | +| targets_interval_list | Describes the intervals for which the filtering model will be trained when running the pipeline using VETS; for more details, see the associated [README](https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list.README.md); required when `run_vets` is “true”. | File | +| run_vets | Optional boolean used to describe whether the pipeline will use VQSR (`run_vets = false`) or VETS (`run_vets = true`) to create the filtering model; default is “false”. | Boolean | + + +#### Fingerprinting input descriptions + +The table below describes the pipeline inputs that apply to fingerprinting: + +| Parameter name | Description | Type | +| --- | --- | --- | +| haplotype_database | Haplotype reference used for fingerprinting (see the CrosscheckFingerprints task). | File | +| cross_check_fingerprints | Optional boolean describing whether the pipeline should check fingerprints; default is “true”.
| Boolean | +| scatter_cross_check_fingerprints | Optional boolean describing whether `CrossCheckFingerprintsScattered` or `CrossCheckFingerprintsSolo` should be run; default is “false” and `CrossCheckFingerprintsSolo` will be run. | Boolean | + +#### Runtime parameter input descriptions + +The table below describes the pipeline inputs used for setting runtime parameters of tasks: + +| Parameter name | Description | Type | +| --- | --- | --- | +| small_disk | Disk size; dependent on cohort size; requires user input; see example JSON configuration files found in the WARP [test_inputs](https://github.com/broadinstitute/warp/tree/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs) folder for recommendations. | Int | +| medium_disk | Disk size; dependent on cohort size; requires user input; see example JSON configuration files found in the WARP [test_inputs](https://github.com/broadinstitute/warp/tree/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs) folder for recommendations. | Int | +| large_disk | Disk size; dependent on cohort size; requires user input; see example JSON configuration files found in the WARP [test_inputs](https://github.com/broadinstitute/warp/tree/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs) folder for recommendations. | Int | +| huge_disk | Disk size; dependent on cohort size; requires user input; see example JSON configuration files found in the WARP [test_inputs](https://github.com/broadinstitute/warp/tree/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs) folder for recommendations. | Int | + + +## JointGenotyping tasks and tools + +The [JointGenotyping workflow](https://github.com/broadinstitute/warp/blob/master/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl) imports individual "tasks," also written in WDL script, from the WARP [tasks folder](https://github.com/broadinstitute/warp/tree/master/tasks/broad). + +Overall, the JointGenotyping workflow: + +1. Splits the input interval list and imports GVCF files. +1. Performs joint genotyping using GATK GenotypeGVCFs (default) or GnarlyGenotyper. +1. Creates single site-specific VCF and index files. +1. Creates and applies a variant filtering model using GATK VQSR (default) or VETS. +1. Collects variant calling metrics. +1. Checks fingerprints (optional). + +The tasks and tools used in the JointGenotyping workflow are detailed in the table below. + +To see specific tool parameters, select the task WDL link in the table; then find the task and view the `command {}` section of the task in the WDL script. To view or use the exact tool software, see the task's Docker image which is specified in the task WDL `# runtime values` section as `String docker =`. + +| Task | Tool | Software | Description | +| --- | --- | --- | --- | +| [CheckSamplesUnique](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | bash | bash | Checks that there are more than 50 unique samples in `sample_name_map`. | +| [SplitIntervalList](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | SplitIntervals | [GATK](https://gatk.broadinstitute.org/hc/en-us) | Splits the unpadded interval list for scattering. | +| [ImportGVCFs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GenomicsDBImport | [GATK](https://gatk.broadinstitute.org/hc/en-us) | Imports single-sample GVCF files into GenomicsDB before joint genotyping. 
| +| [SplitIntervalList as GnarlyIntervalScatterDude](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | SplitIntervals | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `use_gnarly_genotyper` is “true” (default is “false”), splits the unpadded interval list for scattering; otherwise, this task is skipped. | +| [GnarlyGenotyper](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GnarlyGenotyper | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `use_gnarly_genotyper` is “true” (default is “false”), performs scalable, “quick and dirty” joint genotyping on a set of GVCF files stored in GenomicsDB; otherwise, this task is skipped. | +| [GatherVcfs as TotallyRadicalGatherVcfs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GatherVcfsCloud | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `use_gnarly_genotyper` is “true” (default is “false”), compiles the site-specific VCF files generated for each interval into one VCF output and index; otherwise, this task is skipped. | +| [GenotypeGVCFs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GenotypeGVCFs | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `use_gnarly_genotyper` is “false” (default is “false”), performs joint genotyping on GVCF files stored in GenomicsDB; otherwise this task is skipped. | +| [HardFilterAndMakeSitesOnlyVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | VariantFiltration, MakeSitesOnlyVcf | [GATK](https://gatk.broadinstitute.org/hc/en-us) | Uses the VCF files to hard filter the variant calls; outputs a VCF file with the site-specific (but not genotype) information. | +| [GatherVcfs as SitesOnlyGatherVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GatherVcfsCloud | [GATK](https://gatk.broadinstitute.org/hc/en-us) | Compiles the site-specific VCF files generated for each interval into one VCF output file and index. | +| [JointVcfFiltering as TrainAndApplyVETS](https://github.com/broadinstitute/gatk/blob/master/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl) | ExtractVariantAnnotations, TrainVariantAnnotationsModel, ScoreVariantAnnotations | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “true” (default is “false”), calls the `JointVcfFiltering.wdl` subworkflow to extract variant-level annotations, trains a model for variant scoring, and scores variants; otherwise, this task is skipped. | +| [IndelsVariantRecalibrator](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | VariantRecalibrator | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “false” (default is “false”), uses the compiled VCF file to build a recalibration model to score indel variant quality; produces a recalibration table. | +| [SNPsVariantRecalibratorCreateModel](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | VariantRecalibrator | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “false” (default is “false”) and the number of input GVCF files is greater than `snps_variant_recalibration_threshold`, builds a recalibration model to score variant quality; otherwise this task is skipped. 
| +| [SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | VariantRecalibrator | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “false” (default is “false”) and the number of input GVCF files is greater than `snps_variant_recalibration_threshold`, builds a scattered recalibration model to score variant quality; otherwise this task is skipped. | +| [Tasks.GatherTranches as SNPGatherTranches](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GatherTranches | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “false” (default is “false”) and the number of input GVCF files is greater than `snps_variant_recalibration_threshold`, gathers tranches into a single file; otherwise this task is skipped. | +| [SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | VariantRecalibrator | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “false” (default is “false”) and the number of input GVCF files is not greater than `snps_variant_recalibration_threshold`, builds a recalibration model to score variant quality; otherwise this task is skipped. | +| [ApplyRecalibration](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | ApplyVQSR | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `run_vets` is “false” (default is “false”), scatters the site-specific VCF file and applies a filtering threshold. | +| [CollectVariantCallingMetrics as CollectMetricsSharded](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | CollectVariantCallingMetrics | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If the callset has at least 1000 GVCF files, returns detail and summary metrics for each of the scattered VCF files. If the number is small, will return metrics for a merged VCF file produced in the `GatherVcfs as FinalGatherVcf` task (listed below). | +| [GatherVcfs as FinalGatherVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GatherVcfsCloud | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If the callset has fewer than 1000 GVCF files, compiles the VCF files prior to collecting metrics in the `CollectVariantCallingMetrics as CollectMetricsOnFullVcf` task (listed below). | +| [CollectVariantCallingMetrics as CollectMetricsOnFullVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | CollectVariantCallingMetrics | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If the callset has fewer than 1000 GVCF files, returns metrics for the merged VCF file produced in the `GatherVcfs as FinalGatherVcf` task. | +| [GatherVariantCallingMetrics](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | AccumulateVariantCallingMetrics | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If the callset has at least 1000 GVCF files, gathers metrics produced for each VCF file. 
| +| [GetFingerprintingIntervalIndices](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | IntervalListTools | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “true” (default is “false”), gets and sorts indices for fingerprint intervals; otherwise the task is skipped. | +| [GatherVcfs as GatherFingerprintingVcfs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | GatherVcfsCloud | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “true” (default is “false”), compiles the fingerprint VCF files; otherwise the task is skipped. | +| [SelectFingerprintSiteVariants](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | SelectVariants | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “true” (default is “false”), selects variants from the fingerprint VCF file; otherwise the task is skipped. | +| [PartitionSampleNameMap](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | bash | bash | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “true” (default is “false”), partitions the sample name map so that files can be scattered by partition; otherwise the task is skipped. | +| [CrossCheckFingerprint as CrossCheckFingerprintsScattered](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | CrosscheckFingerprints | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “true” (default is “false”), checks fingerprints for the VCFs in the scattered partitions and produces a metrics file; otherwise the task is skipped. | +| [GatherPicardMetrics as GatherFingerprintingMetrics](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | bash | bash | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “true” (default is “false”), combines the fingerprint metrics files into a single metrics file; otherwise the task is skipped. | +| [CrossCheckFingerprint as CrossCheckFingerprintSolo](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) | CrosscheckFingerprints | [GATK](https://gatk.broadinstitute.org/hc/en-us) | If `cross_check_fingerprints` is “true” (default is “true”) and `scatter_cross_check_fingerprints` is “false” (default is “false”), checks fingerprints for the single VCF file and produces a metrics file; otherwise the task is skipped. | + +#### 1. Splits the input interval list and imports GVCF files + +The [SplitIntervalList](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task uses GATK’s SplitIntervals tool to split the input interval list into two or more interval files. The number of output interval files can be specified using the `top_level_scatter_count` input parameter or by specifying `unbounded_scatter_count_scale_factor`, which will scale the number of output files based on the number of input GVCF files.
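To make the scatter sizing concrete, here is a rough worked example. It assumes the documented default `unbounded_scatter_count_scale_factor` of 0.15 and a hypothetical callset of 200 GVCF files; the exact rounding behavior and the lower bound on the number of output files are implementation details of the WDL, so treat this as an approximation.

```latex
% Approximate interval-file count when top_level_scatter_count is not set
% (0.15 is the documented default scale factor; 200 GVCFs is a hypothetical callset size)
\text{scatter count} \approx \text{scale factor} \times N_{\text{GVCF}} = 0.15 \times 200 = 30
```

Passing `top_level_scatter_count` overrides this scaling entirely.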
+ +The [ImportGVCFs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task uses GATK’s GenomicsDBImport tool and the input sample map file to import single-sample GVCF files into GenomicsDB before joint genotyping. + +#### 2. Performs joint genotyping using GATK GenotypeGVCFs (default) or GnarlyGenotyper + +**GenotypeGVCFs (default)** + +When `use_gnarly_genotyper` is “false”, the [GenotypeGVCFs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task uses GATK’s GenotypeGVCFs tool to perform joint genotyping on GVCF files stored in GenomicsDB that have been pre-called with HaplotypeCaller. + +**GnarlyGenotyper** + +When `use_gnarly_genotyper` is “true”, the [SplitIntervalList as GnarlyIntervalScatterDude](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task splits the unpadded interval list for scattering using GATK’s SplitIntervals tool. The output is used as input for the [GnarlyGenotyper](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task which performs joint genotyping on the set of GVCF files and outputs an array of VCF and index files using the GnarlyGenotyper tool. Those VCF and index files are gathered in the next task, [GatherVcfs as TotallyRadicalGatherVcfs](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl), which uses the GatherVcfsCloud tool. + +#### 3. Creates single site-specific VCF and index files + +The [HardFilterAndMakeSitesOnlyVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task takes in the output VCF and index files produced by either GnarlyGenotyper or GenotypeGVCFs. The task uses the `excess_het_threshold` input value to hard filter the variant calls using GATK’s VariantFiltration tool. After filtering, the site-specific VCF files are generated from the filtered VCF files by removing all sample-specific genotype information, leaving only the site-level summary information at each site. + +Next, the site-specific VCF and index files for each interval are gathered into a single site-specific VCF and index file by the [GatherVcfs as SitesOnlyGatherVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task, which uses the GatherVcfsCloud tool. + +#### 4. Creates and applies a variant filtering model using GATK VQSR (default) or VETS + +**VQSR (default)** + +If `run_vets` is “false”, the [IndelsVariantRecalibrator](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task takes in the site-specific VCF and index files generated in [Step 3](#3-creates-single-site-specific-vcf-and-index-files) and uses GATK’s VariantRecalibrator tool to perform the first step of the Variant Quality Score Recalibration (VQSR) technique of filtering variants. The tool builds a model to be used to score and filter indels and produces a recalibration table as output. + +After building the indel filtering model, the workflow uses the VariantRecalibrator tool to build a model to be used to score and filter SNPs. 
If the number of input GVCF files is greater than `snps_variant_recalibration_threshold`, the [SNPsVariantRecalibratorCreateModel](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl), [SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl), and [Tasks.GatherTranches as SNPGatherTranches](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) tasks are called to scatter the site-specific VCF and index files, build the SNP model, and gather scattered tranches into a single file. If the number of input GVCF files does not exceed `snps_variant_recalibration_threshold`, the [SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task is called to build the SNP model. + +The [ApplyRecalibration](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task uses GATK’s ApplyVQSR tool to scatter the site-specific VCF file, apply the indel and SNP filtering models, and output a recalibrated VCF and index file. + +**VETS** + +If `run_vets` is “true”, the [JointVcfFiltering as TrainAndApplyVETS](https://github.com/broadinstitute/gatk/blob/master/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl) task takes in the hard filtered and site-specific VCF and index files generated in [Step 3](#3-creates-single-site-specific-vcf-and-index-files) and calls the `JointVcfFiltering.wdl` subworkflow. This workflow uses the Variant Extract-Train-Score (VETS) algorithm to extract variant-level annotations, train a filtering model, and score variants based on the model. The subworkflow uses the GATK ExtractVariantAnnotations, TrainVariantAnnotationsModel, and ScoreVariantAnnotations tools to create extracted and scored VCF and index files. The output VCF and index files are not filtered by the score assigned by the model. The score is included in the output VCF files in the INFO field as an annotation called “SCORE”. + +The VETS algorithm trains the model only over target regions, rather than including exon tails, which can lead to poor-quality data. However, the model is applied everywhere, including the exon tails. + +#### 5. Collects variant calling metrics + +Summary and per-sample metrics are collected using Picard’s CollectVariantCallingMetrics tool. For large callsets (at least 1000 GVCF files), the workflow calls the [CollectVariantCallingMetrics as CollectMetricsSharded](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task followed by the [GatherVariantCallingMetrics](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task to compute and gather the variant calling metrics into single output files. For small callsets (fewer than 1000 GVCF files), the workflow calls the [GatherVcfs as FinalGatherVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task followed by the [CollectVariantCallingMetrics as CollectMetricsOnFullVcf](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task to first compile the VCF files and then compute the variant calling metrics. Detail and summary metrics files are produced as outputs of these tasks. + +#### 6.
Checks fingerprints (optional) + +If `cross_check_fingerprints` is “true”, the workflow will use Picard to determine the likelihood that the input and output data were generated from the same individual to verify that the pipeline didn’t swap any of the samples during processing. The [SelectFingerprintSiteVariants](https://github.com/broadinstitute/warp/blob/develop/tasks/broad/JointGenotypingTasks.wdl) task uses GATK’s SelectVariants tool to select variants in the site-specific VCF file based on the variants present in the `haplotype_database` and outputs a fingerprint VCF and index file. Next, the workflow cross-checks the fingerprints and creates an output metrics file using the CrosscheckFingerprints tool. + +## Outputs + +The following table lists the output variables and files produced by the pipeline. + +| Output name | Filename, if applicable | Output format and description | +| ------ | ------ | ------ | +| detail_metrics_file | `.variant_calling_detail_metrics` | Detail metrics file produced using Picard. | +| summary_metrics_file | `.variant_calling_summary_metrics` | Summary metrics file produced using Picard. | +| output_vcfs | `.vcf.gz` or `.filtered..vcf.gz` | Array of all site-specific output VCF files. | +| output_vcf_indices | `.vcf.gz.tbi` or `.filtered..vcf.gz.tbi` | Array of all output VCF index files. | +| output_intervals | `scatterDir/` | Interval list file produced by the workflow. | +| crosscheck_fingerprint_check | `.fingerprintcheck` | Optional output file containing fingerprint metrics. | + +## Versioning and testing + +All JointGenotyping pipeline releases are documented in the [JointGenotyping changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md) and tested using [plumbing and scientific test data](https://github.com/broadinstitute/warp/blob/master/pipelines/broad/dna_seq/germline/joint_genotyping/test_data_overview.md). To learn more about WARP pipeline testing, see [Testing Pipelines](https://broadinstitute.github.io/warp/docs/About_WARP/TestingPipelines). + +## Citing the JointGenotyping Pipeline + +If you use the JointGenotyping Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + +## Feedback + +Please help us make our tools better by contacting the [WARP Pipelines Team](mailto:warp-pipelines-help@broadinstitute.org) for pipeline-related suggestions or questions.
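For reference, toggling the optional fingerprint check described in step 6 from an input JSON might look like the sketch below. The parameter names come from the fingerprinting input table above; the haplotype database path is a hypothetical placeholder.

```json
{
  "JointGenotyping.cross_check_fingerprints": true,
  "JointGenotyping.scatter_cross_check_fingerprints": false,
  "JointGenotyping.haplotype_database": "gs://example-bucket/references/Homo_sapiens_assembly38.haplotype_database.txt"
}
```

With `scatter_cross_check_fingerprints` left at its default of “false”, the single-file `CrossCheckFingerprintSolo` path runs, as described in the task table.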
\ No newline at end of file diff --git a/website/docs/Pipelines/JointGenotyping/_category_.json b/website/docs/Pipelines/JointGenotyping/_category_.json new file mode 100644 index 0000000000..8088ecaa3b --- /dev/null +++ b/website/docs/Pipelines/JointGenotyping/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "JointGenotyping", + "position": 8 +} diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 97815b03c8..fbd2802544 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -7,9 +7,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | - -| [Multiome v3.1.1](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org) | - +| [Multiome v3.3.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org) | ![Multiome_diagram](./multiome_diagram.png) @@ -62,7 +60,6 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | | gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] | | tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File | -| ref_genome_fasta | Genome FASTA file used for building the indices. | File | | mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File | | counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String | | tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer | @@ -122,6 +119,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | | multimappers_PropUnique_matrix | `UniqueAndMult-PropUnique.mtx` | Optional output produced when `soloMultiMappers` is "PropUnique"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | gex_aligner_metrics | `.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. | +| library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. 
| | cell_barcodes_csv | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information.| | checkpoint_file | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | | h5_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | @@ -138,9 +136,15 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt All Multiome pipeline releases are documented in the [Multiome changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/multiome/Multiome.changelog.md) and tested using [plumbing and scientific test data](https://github.com/broadinstitute/warp/tree/master/pipelines/skylab/multiome/test_inputs). To learn more about WARP pipeline testing, see [Testing Pipelines](https://broadinstitute.github.io/warp/docs/About_WARP/TestingPipelines). ## Citing the Multiome Pipeline -Please identify the pipeline in your methods section using the Multiome Pipeline's [SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_024217/resolver?q=SCR_024217&l=SCR_024217&i=rrid:scr_024217). + +If you use the Multiome Pipeline in your research, please identify the pipeline in your methods section using the [Multiome SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_024217/resolver?q=SCR_024217&l=SCR_024217&i=rrid:scr_024217). + * Ex: *Multiome Pipeline (RRID:SCR_024217)* +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Consortia support This pipeline is supported by the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and BICAN). diff --git a/website/docs/Pipelines/Multiome_Pipeline/_category_.json b/website/docs/Pipelines/Multiome_Pipeline/_category_.json index 1ec6f2bad8..fddd703eab 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/_category_.json +++ b/website/docs/Pipelines/Multiome_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Multiome scATAC and GEX", - "position": 8 + "position": 9 } diff --git a/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md b/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md index 7a76ffe328..8bf61109e8 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md +++ b/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md @@ -14,7 +14,7 @@ It contains the raw, but UMI-corrected cell by gene counts, which vary depending You can determine which type of counts are in the h5ad file by looking at the unstructured metadata (the `anndata.uns` property of the matrix) `expression_data_type` key (see [Table 1](#table-1-global-attributes) below). 
-The matrix also contains multiple metrics for both individual cells (the `anndata.obs` property of the matrix; [Table 2](#table-2-cell-metrics) and individual genes (the `anndata.var` property of the matrix; [Table 3](#table-3-gene-metrics)). +The matrix also contains multiple metrics for both individual cells (the `anndata.obs` property of the matrix; [Table 2](#table-2-cell-metrics)) and individual genes (the `anndata.var` property of the matrix; [Table 3](#table-3-gene-metrics)). :::tip Additional Matrix Processing for Consortia Previous Loom files generated by Optimus for consortia, such as the Human Cell Atlas (HCA) or the BRAIN Initiative Cell Census Network (BICCN), may have additional processing steps. Read the [Consortia Processing Overview](consortia-processing.md#hca-data-coordination-platform-matrix-processing) for details on consortia-specific matrix changes. @@ -43,16 +43,16 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file |`n_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads associated with the cell. Like all metrics, `n_reads` is calculated from the Optimus output BAM file. Prior to alignment, reads are checked against the whitelist and any within one edit distance (Hamming distance) are corrected. These CB-corrected reads are aligned using STARsolo, where they get further CB correction. For this reason, most reads in the aligned BAM file have both `CB` and `UB` tags. Therefore, `n_reads` represents CB-corrected reads, rather than all reads in the input FASTQ files. | |`noise_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| Number of reads that are categorized by 10x Genomics Cell Ranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. | |`perfect_molecule_barcodes`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads with molecule barcodes (sequences used to identify unique transcripts) that have no errors. Learn more about UMIs in the [Definitions](#definitions) section below. | -| `reads_mapped_exonic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as exon; counted when BAM file's `sF` tag is assigned to `1` or `3` and the `NH:i` tag is `1`. | -| `reads_mapped_exonic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as exon in the antisense direction; counted when the BAM's `sF` is assigned to a `2` or `4` and the `NH:i` tag is `1`. | -| `reads_mapped_intronic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as intron; counted when the BAM files's `sF` tag is assigned to a `5` and the `NH:i` tag is `1`. | -| `reads_mapped_intronic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as intron in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `6` and the `NH:i` tas is `1`. | +| `reads_mapped_exonic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as exon; counted when the BAM file's `sF` tag is assigned to `1` or `3` and the `NH:i` tag is `1`; mitochondrial reads are excluded.
| +| `reads_mapped_exonic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as exon in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `2` or `4` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as intron; counted when the BAM file's `sF` tag is assigned to a `5` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as intron in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `6` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | | `duplicate_reads` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | Not currently calculated for Optimus output; number of duplicate reads. | |`n_mitochondrial_genes`| [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of mitochondrial genes detected by this cell. | |`n_mitochondrial_molecules`| [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of molecules from mitochondrial genes detected for this cell. | |`pct_mitochondrial_molecules`| [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The percentage of molecules from mitochondrial genes detected for this cell. | -|`reads_mapped_uniquely`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads mapped to a single unambiguous location in the genome. | -|`reads_mapped_multiple`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)|The number of reads mapped to multiple genomic positions with equal confidence. | +|`reads_mapped_uniquely`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads mapped to a single unambiguous location in the genome; mitochondrial reads are excluded. | +|`reads_mapped_multiple`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)|The number of reads mapped to multiple genomic positions with equal confidence; mitochondrial reads are excluded. | |`spliced_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads that overlap splicing junctions. | |`antisense_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| Not calculated for Optimus outputs; see `reads_mapped_exonic_as` or `reads_mapped_intronic_as` for antisense counts. | |`molecule_barcode_fraction_bases_above_30_mean`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The average fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of the cell. | @@ -80,7 +80,7 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file | `emptydrops_PValue` | [dropletUtils](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html) | The Monte Carlo p-value against the null model; single-cell data will read `NA` if task is unable to detect knee point inflection.
Column is not included for data run in the `sn_rna` mode | | `emptydrops_Total` | [dropletUtils](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html) | The total read counts for each barcode; single-cell data will read `NA` if task is unable to detect knee point inflection. Column is not included for data run in the `sn_rna` mode. | | `reads_mapped_intergenic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as intergenic; counted when the BAM file's `sF` tag is assigned to a `7` and the `NH:i` tag is `1`. | -| `reads_unmapped` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The total number of reads that are unmapped; counted when the BAM file's`sF` tag is `0`. | +| `reads_unmapped` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The total number of reads that are unmapped; counted when the BAM file's `sF` tag is `0`. | |`reads_per_molecule`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The average number of reads associated with each molecule in the cell. | ## Table 3. Gene metrics @@ -93,12 +93,12 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file |`n_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads associated with this gene. | |`noise_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| Not currently calculated for Optimus output; number of reads that are categorized by 10x Genomics Cell Ranger as "noise"; refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. | |`perfect_molecule_barcodes`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads with molecule barcodes (sequences used to identify unique transcripts) that have no errors. Learn more about UMIs in the [Definitions](#definitions) section below. | -| `reads_mapped_exonic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as exon; counted when BAM file's `sF` tag is assigned to `1` or `3` and the `NH:i` tag is `1`. | -| `reads_mapped_exonic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as exon in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `2` or `4` and the `NH:i` tag is `1`. | -| `reads_mapped_intronic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as intron; counted when the BAM file's `sF` tag is assigned to a `5` and the `NH:i` tag is `1`. | -| `reads_mapped_intronic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as intron in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `6` and the `NH:i` tag is `1`. | -|`reads_mapped_uniquely`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads mapped to a single unambiguous location in the genome. | -|`reads_mapped_multiple`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)|The number of reads mapped to multiple genomic positions with equal confidence. 
| +| `reads_mapped_exonic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of unique reads counted as exon; counted when BAM file's `sF` tag is assigned to `1` or `3` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_exonic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as exon in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `2` or `4` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as intron; counted when the BAM file's `sF` tag is assigned to a `5` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic_as` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as intron in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `6` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +|`reads_mapped_uniquely`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads mapped to a single unambiguous location in the genome; mitochondrial reads are excluded. | +|`reads_mapped_multiple`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)|The number of reads mapped to multiple genomic positions with equal confidence; mitochondrial reads are excluded. | |`spliced_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads that overlap splicing junctions. | |`antisense_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads that are mapped to the antisense strand instead of the transcribed strand. | | `duplicate_reads` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of duplicate reads. 
| diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index ffe147e4ea..fa91c7fea9 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -7,7 +7,8 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v6.3.5](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | January, 2024 | Elizabeth Kiernan | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [optimus_v6.5.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | February, 2024 | Elizabeth Kiernan | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | + ![Optimus_diagram](Optimus_diagram.png) @@ -33,8 +34,8 @@ The following table provides a quick glance at the Optimus pipeline features: | Assay type | 10x single cell or single nucleus expression (v2 and v3) | [10x Genomics](https://www.10xgenomics.com) | Overall workflow | Quality control module and transcriptome quantification module | Code available from [GitHub](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/optimus/Optimus.wdl) | | Workflow language | WDL 1.0 | [openWDL](https://github.com/openwdl/wdl) | -| Genomic Reference Sequence | GRCh38 human genome primary sequence and M21 (GRCm38.p6) mouse genome primary sequence | GENCODE [human reference files](https://www.gencodegenes.org/human/release_27.html) and [mouse reference files](https://www.gencodegenes.org/mouse/release_M21.html) -| Transcriptomic reference annotation | V27 GENCODE human transcriptome and M21 mouse transcriptome | GENCODE [human GTF](ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_27/gencode.v27.annotation.gtf.gz) and [mouse GTF](ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gff3.gz) | +| Genomic Reference Sequence | GRCh38.p13 (v43) human genome primary sequence and GRCm39 (M32) mouse genome primary sequence | GENCODE [human reference files](https://www.gencodegenes.org/human/release_43.html) and [mouse reference files](https://www.gencodegenes.org/mouse/release_M32.html) +| Transcriptomic reference annotation | V43 GENCODE human transcriptome and M32 mouse transcriptome | GENCODE [human GTF](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.annotation.gtf.gz) and [mouse GTF](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M32/gencode.vM32.annotation.gtf.gz) | | Aligner and transcript quantification | STARsolo | [Dobin, et al.,2021](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1) | | Data input file format | File format in which sequencing data is provided | [FASTQ](https://academic.oup.com/nar/article/38/6/1767/3112533) | | Data output file format | File formats in which Optimus output is provided | [BAM](http://samtools.github.io/hts-specs/), Python numpy arrays (internal), h5ad | @@ -49,12 +50,13 @@ To discover and search releases, use the WARP command-line tool [Wreleaser](http If you’re running an Optimus workflow version prior to the latest release, the accompanying documentation for that release may be downloaded with the source code on the WARP [releases page](https://github.com/broadinstitute/warp/releases) (see the source code folder 
`website/docs/Pipelines/Optimus_Pipeline`). -Optimus can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/), a GA4GH compliant, flexible workflow management system that supports multiple computing platforms. The workflow can also be run in [Terra](https://app.terra.bio), a cloud-based analysis platform. The Terra [Optimus Featured Workspace](https://app.terra.bio/#workspaces/featured-workspaces-hca/HCA_Optimus_Pipeline) contains the Optimus workflow, workflow configurations, required reference data and other inputs, and example testing data. +Optimus can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/), a GA4GH-compliant, flexible workflow management system that supports multiple computing platforms. The workflow can also be run in [Terra](https://app.terra.bio), a cloud-based analysis platform. The Terra [Optimus Featured Workspace](https://app.terra.bio/#workspaces/featured-workspaces-hca/HCA_Optimus_Pipeline) contains the Optimus workflow, workflow configurations, required reference data and other inputs, and example testing data. ### Inputs Optimus pipeline inputs are detailed in JSON format configuration files. There are five downsampled example configuration files available for running the pipeline: + * [human_v2_example](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/optimus/example_inputs/human_v2_example.json): An example human 10x v2 single-cell dataset * [human_v3_example](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/optimus/example_inputs/human_v3_example.json): An example human 10x v3 single-cell dataset * [mouse_v2_example](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/optimus/example_inputs/mouse_v2_example.json): An example mouse 10x v2 single-cell dataset @@ -115,7 +117,7 @@ The Optimus pipeline is currently available on the cloud-based platform Terra. A The [Optimus workflow](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/optimus/Optimus.wdl) imports individual "tasks," also written in WDL script, from the WARP [tasks folder](https://github.com/broadinstitute/warp/blob/master/tasks/skylab). Overall, the Optimus workflow: -1. Checks inputs +1. Checks inputs. 1. Partitions FASTQs by CB. 1. Corrects CBs, aligns reads, corrects UMIs, and counts genes with STAR. 1. Merges the Star outputs into NPY and NPZ arrays. @@ -252,7 +254,8 @@ The following table lists the output files produced from the pipeline. For sampl | matrix_col_index | `_sparse_counts_col_index.npy` | Index of genes in count matrix. | NPY | | cell_metrics | `.cell-metrics.csv.gz` | Matrix of metrics by cells. | Compressed CSV | | gene_metrics | `.gene-metrics.csv.gz` | Matrix of metrics by genes. | Compressed CSV | -| aligner_metrics | `.cell_reads.txt` | Per barcode metrics (CellReads.stats) produced by the STARsolo aligner. | TXT | +| aligner_metrics | `.star_metrics.tar` | Tarred metrics files produced by the STARsolo aligner; contains align features, cell reads, summary, and UMI per cell metrics files. | TXT | +| library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | CSV | | multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. 
| MTX | | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX | | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX | @@ -282,11 +285,16 @@ Optimus has been validated for processing both human and mouse single-cell and s All Optimus pipeline releases are documented in the [Optimus changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/optimus/Optimus.changelog.md). +## Citing the Optimus Pipeline + +If you use the Optimus Pipeline in your research, please identify the pipeline in your methods section using the [Optimus SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_018908/resolver?q=SCR_018908&l=SCR_018908&i=rrid:scr_018908). -## Citing the Optimus pipeline -Please identify the pipeline in your methods section using the Optimus Pipeline's [SciCrunch resource identifier](https://scicrunch.org/scicrunch/Resources/record/nlx_144509-1/SCR_018908/resolver?q=SCR_018908&l=SCR_018908). * Ex: *Optimus Pipeline (RRID:SCR_018908)* +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Consortia support This pipeline is supported and used by the [Human Cell Atlas](https://www.humancellatlas.org/) (HCA) project and the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN). diff --git a/website/docs/Pipelines/Optimus_Pipeline/_category_.json b/website/docs/Pipelines/Optimus_Pipeline/_category_.json index ebfd0a5ec3..5fa50a9742 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/_category_.json +++ b/website/docs/Pipelines/Optimus_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Optimus", - "position": 9 + "position": 10 } diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index 81c7506f30..67a76f522a 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -7,7 +7,8 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [PairedTag_v0.0.5](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [PairedTag_v0.3.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | + ## Introduction to the Paired-Tag workflow @@ -68,7 +69,6 @@ The Paired-Tag workflow inputs are specified in JSON configuration files. 
Exampl
| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] |
| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File |
| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File |
-| ref_genome_fasta | Genome FASTA file used for building the indices. | File |
| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File |
| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer |
| emptydrops_lower | **Not used for single-nucleus data.** Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer |
@@ -95,8 +95,9 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which
| Subworkflow/Task | Software | Description |
| ----------- | -------- | ----------- |
| Optimus ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/optimus/Optimus.wdl) and [documentation](../Optimus_Pipeline/README)) | fastqprocess, STARsolo, Emptydrops | Workflow used to analyze 10x single-cell GEX data. |
-| PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to check the length of the read2 FASTQ (should be either 27 or 24 bp). If `preindex` is set to true, the task will perform demultiplexing of the 3-bp sample barcode from the read2 ATAC fastq files and stores it in the readname. It will then perform barcode orientation checking. The ATAC workflow will then add a combined 3 bp sample barcode and cellular barcode to the BB tag of the BAM. If `preindex` is false and then length is 27 bp, the task will perform trimming and subsequent barcode orientation checking. |
-ATAC ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/multiome/atac.wdl) and [documentation](../ATAC/README)) | fastqprocess, bwa-mem, SnapATAC2 | Workflow used to analyze single-nucleus paired-tag DNA (histone modifications) data. |
+| PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to check the length of the read2 FASTQ (should be either 27 or 24 bp). If `preindex` is set to true, the task demultiplexes the 3-bp sample barcode from the read2 ATAC FASTQ files and stores it in the read name; it then performs barcode orientation checking. The ATAC workflow then adds a combined 3-bp sample barcode and cellular barcode to the BB tag of the BAM. If `preindex` is false and the length is 27 bp, the task performs trimming and subsequent barcode orientation checking. |
+| ATAC ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/multiome/atac.wdl) and [documentation](../ATAC/README)) | fastqprocess, bwa-mem, SnapATAC2 | Workflow used to analyze single-nucleus paired-tag DNA (histone modifications) data. |
+| ParseBarcodes as ParseBarcodes ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | python3 | Task used to parse and split the cell barcodes and sample barcodes from the combined index in the h5ad and fragment files when `preindex` is set to true. |

## Outputs

@@ -105,8 +106,8 @@
| Output variable name | Filename, if applicable | Output format and description |
|--- | --- | --- |
| pairedtag_pipeline_version_out | N.A. | String describing the version of the Paired-Tag pipeline used. |
| bam_aligned_output_atac | `_atac.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. |
-| fragment_file_atac | `_atac.fragments.tsv` or if preindexing = true, `_atac.fragments.BB.tsv | TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". |
-| snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. |
+| fragment_file_atac | `_atac.fragments.tsv` or if preindexing = true, `_atac.fragments.BB.tsv` | TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When preindexing is used, additional columns include "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). |
+| snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. If the preindex option is used, the h5ad.obs will contain 3 extra columns: preindex (the sample barcode), CB (cell barcodes), and duplicates (indicates with a 1 if the cell barcode matches more than one preindex; otherwise it is 0). |
| genomic_reference_version_gex | `.txt` | File containing the Genome build, source and GTF annotation version. |
| bam_gex | `_gex.bam` | BAM file containing aligned reads from Optimus workflow. |
| matrix_gex | `_gex_sparse_counts.npz` | NPZ file containing raw gene by cell counts. |
@@ -116,12 +117,23 @@
| gene_metrics_gex | `_gex.gene_metrics.csv.gz` | CSV file containing the per-gene metrics. |
| cell_calls_gex | `_gex.emptyDrops` | TSV file containing the EmptyDrops results when the Optimus workflow is run in sc_rna mode. |
| h5ad_output_file_gex | `_gex.h5ad` | h5ad (Anndata) file containing the raw cell-by-gene count matrix, gene metrics, cell metrics, and global attributes. See the [Optimus Count Matrix Overview](../Optimus_Pipeline/Loom_schema.md) for more details. |
+| library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data.
|

## Versioning and testing

All Paired-Tag pipeline releases are documented in the [Paired-Tag changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/paired_tag/PairedTag.changelog.md) and tested using [plumbing and scientific test data](https://github.com/broadinstitute/warp/tree/develop/pipelines/skylab/paired_tag/test_inputs). To learn more about WARP pipeline testing, see [Testing Pipelines](https://broadinstitute.github.io/warp/docs/About_WARP/TestingPipelines). Note that Paired-Tag tests are still in development.

+## Citing the Paired-Tag Pipeline
+
+If you use the Paired-Tag Pipeline in your research, please identify the pipeline in your methods section using the [Paired-Tag SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_025042/resolver?q=paired_tag&l=paired_tag&i=rrid:scr_025042).
+
+* Ex: *Paired-Tag Pipeline (RRID:SCR_025042)*
+
+Please also consider citing our preprint:
+
+Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1
+
## Consortia support

This pipeline is supported by the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and BICAN).

diff --git a/website/docs/Pipelines/PairedTag_Pipeline/_category_.json b/website/docs/Pipelines/PairedTag_Pipeline/_category_.json
index d7305fba0f..94672fe8d3 100644
--- a/website/docs/Pipelines/PairedTag_Pipeline/_category_.json
+++ b/website/docs/Pipelines/PairedTag_Pipeline/_category_.json
@@ -1,4 +1,4 @@
{
"label": "Paired-Tag",
- "position": 10
+ "position": 11
}
diff --git a/website/docs/Pipelines/RNA_with_UMIs_Pipeline/README.md b/website/docs/Pipelines/RNA_with_UMIs_Pipeline/README.md
index c407efd7f4..2c7b1f08ca 100644
--- a/website/docs/Pipelines/RNA_with_UMIs_Pipeline/README.md
+++ b/website/docs/Pipelines/RNA_with_UMIs_Pipeline/README.md
@@ -7,7 +7,7 @@ slug: /Pipelines/RNA_with_UMIs_Pipeline/README
| Pipeline Version | Date Updated | Documentation Authors | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
-| [RNAWithUMIsPipeline_v1.0.15](https://github.com/broadinstitute/warp/releases?q=RNAwithUMIs&expanded=true) | December, 2023 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) & [Kaylee Mathews](mailto:kmathews@broadinstitute.org)| Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) |
+| [RNAWithUMIsPipeline_v1.0.16](https://github.com/broadinstitute/warp/releases?q=RNAwithUMIs&expanded=true) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) & [Kaylee Mathews](mailto:kmathews@broadinstitute.org)| Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) |

![RNAWithUMIs_diagram](rna-with-umis_diagram.png)

@@ -266,7 +266,11 @@ Workflow outputs are described in the table below.

All RNA with UMIs pipeline releases are documented in the [pipeline changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md).
-
+## Citing the RNA with UMIs Pipeline
+
+If you use the RNA with UMIs Pipeline in your research, please consider citing our preprint:
+
+Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1

## Feedback

diff --git a/website/docs/Pipelines/RNA_with_UMIs_Pipeline/_category_.json b/website/docs/Pipelines/RNA_with_UMIs_Pipeline/_category_.json
index d8cf127ef8..d17a4bd158 100644
--- a/website/docs/Pipelines/RNA_with_UMIs_Pipeline/_category_.json
+++ b/website/docs/Pipelines/RNA_with_UMIs_Pipeline/_category_.json
@@ -1,4 +1,4 @@
{
"label": "RNA with UMIs",
- "position": 11
+ "position": 12
}
diff --git a/website/docs/Pipelines/RNA_with_UMIs_Pipeline/rna-with-umis.methods.md b/website/docs/Pipelines/RNA_with_UMIs_Pipeline/rna-with-umis.methods.md
index 7d29774ea1..856690ea74 100644
--- a/website/docs/Pipelines/RNA_with_UMIs_Pipeline/rna-with-umis.methods.md
+++ b/website/docs/Pipelines/RNA_with_UMIs_Pipeline/rna-with-umis.methods.md
@@ -2,13 +2,13 @@
sidebar_position: 2
---

-# RNA with UMIs v1.0.15 Methods
+# RNA with UMIs v1.0.16 Methods

Below we provide an example methods section for publications using the RNA with UMIs pipeline. For the complete pipeline documentation, see the [RNA with UMIs Overview](./README.md).

## Methods

-Data preprocessing, gene counting, and metric calculation were performed using the RNA with UMIs v1.0.6 pipeline, which uses Picard, fgbio v1.4.0, fastp v0.20.1, FastQC v0.11.9, STAR v2.7.10a, Samtools v1.11, UMI-tools v1.1.1, GATK, and RNA-SeQC v2.4.2 with default tool parameters unless otherwise specified. Reference files are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references;tab=objects?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in [example configuration files](https://github.com/broadinstitute/warp/tree/develop/pipelines/broad/rna_seq/test_inputs) in the in the WARP repository.
+Data preprocessing, gene counting, and metric calculation were performed using the RNA with UMIs v1.0.16 pipeline, which uses Picard, fgbio v1.4.0, fastp v0.20.1, FastQC v0.11.9, STAR v2.7.10a, Samtools v1.11, UMI-tools v1.1.1, GATK 4.5.0.0, and RNA-SeQC v2.4.2 with default tool parameters unless otherwise specified. Reference files are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references;tab=objects?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in [example configuration files](https://github.com/broadinstitute/warp/tree/develop/pipelines/broad/rna_seq/test_inputs) in the WARP repository.

Paired-end FASTQ files were first converted to an unmapped BAM (uBAM) using Picard's (v3.0.0) FastqToSam tool with SORT_ORDER = unsorted. (If a read group unmapped BAM file is used as input for the pipeline, this step is skipped.) Unique molecular identifiers (UMIs) were extracted from the uBAM using fgbio's ExtractUmisFromBam and stored in the RX read tag.
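+
+As an aside for readers validating this step (an illustrative sketch, not part of the published methods; the filename is hypothetical), the extracted UMIs can be confirmed by reading the RX tag from the uBAM with pysam:
+
+```python
+import pysam
+
+# check_sq=False lets pysam open an unmapped BAM that has no sequence
+# dictionary in its header.
+with pysam.AlignmentFile("umi_extracted.bam", "rb", check_sq=False) as ubam:
+    for read in ubam:
+        if read.has_tag("RX"):
+            # RX holds the UMI written by fgbio's ExtractUmisFromBam.
+            print(read.query_name, read.get_tag("RX"))
+```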
@@ -16,9 +16,9 @@ After the extraction of UMIs, reads that failed quality control checks performed

Reads were aligned using STAR to the GRCh38 (hg38) reference with HLA, ALT, and decoy contigs removed with gene annotations from GENCODE v34 (or GRCh37 [hg19] with gene annotations from GENCODE v19). The --readFilesType and --readFilesCommand parameters were set to "SAM PE" and "samtools view -h", respectively, to indicate that the input was a BAM file. To specify that the output was an unsorted BAM that included unmapped reads, --outSAMtype was set to "BAM Unsorted" and --outSAMunmapped was set to "Within". A transcriptome-aligned BAM was also output with --quantMode = TranscriptomeSAM. To match [ENCODE bulk RNA-seq data standards](https://www.encodeproject.org/data-standards/rna-seq/long-rnas/), the alignment was performed with parameters --outFilterType = BySJout, --outFilterMultimapNmax = 20, --outFilterMismatchNmax = 999, --alignIntronMin = 20, --alignIntronMax = 1000000, --alignMatesGapMax = 1000000, --alignSJoverhangMin = 8, and --alignSJDBoverhangMin = 1. The fraction of reads required to match the reference was set with --outFilterMatchNminOverLread = 0.33 and the fraction of allowable mismatches to read length was set with --outFilterMismatchNoverLmax = 0.1. Chimeric alignments were included with --chimSegmentMin = 15, where 15 was the minimum length of each segment, and --chimMainSegmentMultNmax = 1 to prevent main chimeric segments from mapping to multiple sites. To output chimeric segments with soft-clipping in the aligned BAM, --chimOutType was set to "WithinBAM SoftClip". A maximum of 20 protruding bases at the ends of alignments was allowed with --alignEndsProtrude set to "20 ConcordantPair" to prevent reads from small cDNA fragments that were sequenced into adapters from being dropped.

-Following alignment, both BAM files were sorted by coordinate with Picard's (v2.6.11) SortSam tool. UMI-tools was then used to further divide putative duplicates into subgroups based on UMI and sequencing errors in UMIs were corrected. To specify the tag where the UMIs were stored, --extract-umi-method was set to "tag" and --umi-tag was set to "RX". Unmapped reads were included in the output file with --unmapped-reads = use. Tagged BAM files were output using the option --output-bam. SortSam was used again to sort the BAM files by queryname for Picard's (v2.26.11) MarkDuplicates tool. MarkDuplicates was used to mark PCR duplicates and calculate duplicate metrics. After duplicate marking, BAM files were sorted by coordiante using SortSam to facilitate downstream analysis. The transcriptome-aligned, duplicate-marked BAM was sorted and postprocessed using GATK's (v4.2.6.0) PostProcessReadsForRSEM tool for compatability with RSEM.
+Following alignment, both BAM files were sorted by coordinate with Picard's (v2.6.11) SortSam tool. UMI-tools was then used to further divide putative duplicates into subgroups based on UMI, and sequencing errors in UMIs were corrected. To specify the tag where the UMIs were stored, --extract-umi-method was set to "tag" and --umi-tag was set to "RX". Unmapped reads were included in the output file with --unmapped-reads = use. Tagged BAM files were output using the option --output-bam. SortSam was used again to sort the BAM files by queryname for Picard's (v2.26.11) MarkDuplicates tool. MarkDuplicates was used to mark PCR duplicates and calculate duplicate metrics. After duplicate marking, BAM files were sorted by coordinate using SortSam to facilitate downstream analysis. The transcriptome-aligned, duplicate-marked BAM was sorted and postprocessed using GATK's PostProcessReadsForRSEM tool for compatibility with RSEM.

-The genome-aligned, duplicate-marked BAM file was then used to calculate summary metrics using RNASeQC, Picard's (v2.26.11) CollectRNASeqMetrics and (v3.0.0) CollectMultipleMetrics tools, and GATK's (v4.3.0.0) GetPileupSummaries and CalculateContamination tools. CollectMultipleMetrics was used with the programs “CollectInsertSizeMetrics” and “CollectAlignmentSummaryMetrics”. GetPileupSummaries was run with the read filters, "WellformedReadFilter" and "MappingQualityAvailableReadFilter" disabled.
+The genome-aligned, duplicate-marked BAM file was then used to calculate summary metrics using RNA-SeQC, Picard's (v2.26.11) CollectRNASeqMetrics and (v3.0.0) CollectMultipleMetrics tools, and GATK's GetPileupSummaries and CalculateContamination tools. CollectMultipleMetrics was used with the programs “CollectInsertSizeMetrics” and “CollectAlignmentSummaryMetrics”. GetPileupSummaries was run with the read filters "WellformedReadFilter" and "MappingQualityAvailableReadFilter" disabled.

The final outputs of the RNA with UMIs pipeline included metrics generated before alignment with FastQC, a transcriptome-aligned, duplicate-marked BAM file with duplication metrics, and a genome-aligned, duplicate-marked BAM file with corresponding index, duplication metrics, and metrics generated with RNA-SeQC, Picard, and GATK tools.

diff --git a/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/README.md b/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/README.md
index d61e98fcb6..038463eb60 100644
--- a/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/README.md
+++ b/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/README.md
@@ -157,9 +157,15 @@ The following table details the metrics available in the output_snap_qc file.

All scATAC workflow releases are documented in the [scATAC changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/scATAC/scATAC.changelog.md).

## Citing the scATAC Pipeline
-Please identify the pipeline in your methods section using the scATAC Pipeline's [SciCrunch resource identifier](https://scicrunch.org/scicrunch/Resources/record/nlx_144509-1/SCR_018919/resolver?q=SCR_018919&l=SCR_018919).
+
+If you use the scATAC Pipeline in your research, please identify the pipeline in your methods section using the [scATAC SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_018919/resolver?q=SCR_018919&l=SCR_018919&i=rrid:scr_018919).
+
* Ex: *scATAC Pipeline (RRID:SCR_018919)*

+Please also consider citing our preprint:
+
+Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1
+
## Consortia Support

This pipeline is supported and used by the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN).
diff --git a/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/_category_.json b/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/_category_.json index 2145d730e7..e6581e6a3b 100644 --- a/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/_category_.json +++ b/website/docs/Pipelines/Single_Cell_ATAC_Seq_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Single Cell ATAC", - "position": 12 + "position": 13 } diff --git a/website/docs/Pipelines/SlideSeq_Pipeline/README.md b/website/docs/Pipelines/SlideSeq_Pipeline/README.md index b4ce9af4e0..ffccb6c445 100644 --- a/website/docs/Pipelines/SlideSeq_Pipeline/README.md +++ b/website/docs/Pipelines/SlideSeq_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/SlideSeq_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [SlideSeq v1.0.1](https://github.com/broadinstitute/warp/releases) | March, 2023 | Elizabeth Kiernan & Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [SlideSeq v3.1.2](https://github.com/broadinstitute/warp/releases) | February, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | ![SlideSeq_diagram](./slide-seq_diagram.png) @@ -15,7 +15,7 @@ slug: /Pipelines/SlideSeq_Pipeline/README The [Slide-seq workflow](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/slideseq/SlideSeq.wdl) is an open-source, cloud-optimized pipeline developed in collaboration with the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN) and the BRAIN Initiative Cell Atlas Network (BICAN). It supports the processing of spatial transcriptomic data generated with the [Slide-seq](https://www.science.org/doi/10.1126/science.aaw1219) (commercialized as [Curio Seeker](https://curiobioscience.com/product/)) assay. -Overall, the workflow corrects bead barcodes, aligns reads to the genome, generates a count matrix, calculates summary metrics for genes, barcodes, and UMIs, and returns read outputs in BAM format. +Overall, the workflow corrects bead barcodes, aligns reads to the genome, generates a count matrix, calculates summary metrics for genes, barcodes, and UMIs, returns read outputs in BAM format, and returns counts in numpy matrix and h5ad file formats. Slide-seq has been validated for analyzing mouse datasets generated with the Slide-seq assay. Learn more in the [validation section](#validation-against-on-prem-pipeline). @@ -37,7 +37,7 @@ The following table provides a quick glance at the Slide-seq pipeline features: | Transcriptomic reference annotation | M23 mouse transcriptome built with the [BuildIndices workflow](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/build_indices/BuildIndices.wdl) | GENCODE [mouse GTF](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.annotation.gff3.gz); [modified version](https://console.cloud.google.com/storage/browser/_details/gcp-public-data--broad-references/mm10/v0/single_nucleus/star/modified_star_2.7.9a_primary_gencode_mouse_vM23.tar;tab=live_object) available in Broad’s public reference bucket | | Aligner and transcript quantification | STARsolo | [Kaminow et al. 
2021](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1) |
| Data input file format | File format in which sequencing data is provided | [FASTQ](https://academic.oup.com/nar/article/38/6/1767/3112533) |
-| Data output file format | File formats in which Slide-seq output is provided | [BAM](http://samtools.github.io/hts-specs/), Python NumPy arrays, and Loom (generated with [Loompy)](http://loompy.org/) |
+| Data output file format | File formats in which Slide-seq output is provided | [BAM](http://samtools.github.io/hts-specs/), Python NumPy arrays, and h5ad |

## Set-up

@@ -67,7 +67,7 @@ The Slide-seq workflow inputs are specified in JSON configuration files. Example
| tar_star_reference | TAR file containing a species-specific reference genome and GTF; generated using the [BuildIndices workflow](https://github.com/broadinstitute/warp/tree/master/pipelines/skylab/build_indices/BuildIndices.wdl). | File |
| annotations_gtf | GTF containing gene annotations used for gene tagging (must match GTF in STAR reference). | File |
| output_bam_basename | Optional string used for the output BAM file basename. | String |
-| count_exons | Optional boolean indicating if the workflow should calculate exon counts; default is set to “true” and produces a Loom file containing both whole-gene counts and exon counts in an additional layer; when set to “false”, a Loom file containing only whole-gene counts is produced. | Boolean |
+| count_exons | Optional boolean indicating if the workflow should calculate exon counts; default is set to “true” and produces an h5ad file containing both whole-gene counts and exon counts in an additional layer; when set to “false”, an h5ad file containing only whole-gene counts is produced. | Boolean |
| bead_locations | Whitelist TSV file containing bead barcodes and XY coordinates on a single line for each bead; determined by sequencing prior to mRNA transfer and library preparation. | File |

#### Pseudogene handling

@@ -84,11 +84,11 @@ The [Slide-seq workflow](https://github.com/broadinstitute/warp/blob/master/pipe

Overall, the Slide-seq workflow:
1. Calculates prealignment metrics.
-1. Uses sctools to filter, trim, and split reads into < 30 GB FASTQs.
+1. Filters, trims, and splits reads into < 30 GB FASTQs.
1. Uses STARsolo to correct bead barcodes, align reads, and count genes.
1. Calculates metrics.
1. Merges the STAR outputs into NPY and NPZ arrays.
-1. Merges gene counts and metrics into a Loom-formatted matrix.
+1. Merges gene counts and metrics into an h5ad-formatted matrix.

The tools each Slide-seq task employs are detailed in the table below.

@@ -104,12 +104,12 @@ To see specific tool parameters, select the task WDL link in the table; then fin
| [Metrics.CalculateUMIsMetrics (alias = UMIsMetrics)](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Metrics.wdl) | TagSort | [warp-tools](https://github.com/broadinstitute/warp-tools) | Sorts the BAM file by gene using the bead barcode (CB), molecule barcode (UB), and gene ID (GX) tags and computes gene metrics. |
| [Metrics.CalculateCellMetrics (alias = CellMetrics)](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Metrics.wdl) | TagSort | [warp-tools](https://github.com/broadinstitute/warp-tools) | Sorts the BAM file by bead barcode (CB), molecule barcode (UB), and gene ID (GX) tags and computes bead barcode metrics.
|
| [StarAlign.MergeStarOutput (alias = MergeStarOutputsExons)](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/StarAlign.wdl) | create-npz-output.py | [Python 3](https://www.python.org/) | Creates a compressed raw NPY or NPZ file containing the STARsolo output features (NPY), barcodes (NPZ), and counts (NPZ). By default, `count_exons` is true and exon counts are included in output files. When `count_exons` is false, exon counts are excluded. |
-| [LoomUtils.SingleNucleusOptimusLoomOutput (alias = SlideseqLoomGenerationWithExons)](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/LoomUtils.wdl) | create_loom_slide_seq.py | [Python 3](https://www.python.org/) | Merges the gene counts, bead barcode metrics, and gene metrics data into a Loom formatted bead-by-gene matrix. By default, the Loom file contains whole-gene counts with exon counts in an additional layer. When `count_exons` is false, the task is run as `SlideseqLoomGeneration` and exon counts are excluded. |
+| [H5adUtils.SingleNucleusOptimusH5adOutput (alias = OptimusH5adGenerationWithExons)](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/H5adUtils.wdl) | create_h5ad_optimus.py | [Python 3](https://www.python.org/) | Merges the gene counts, bead barcode metrics, and gene metrics data into an h5ad-formatted bead-by-gene matrix. By default, the h5ad file contains whole-gene counts with exon counts in an additional layer. When `count_exons` is false, the task is run as `SlideseqH5adGeneration` and exon counts are excluded. |

#### 1. Calculating prealignment metrics

The [FastqMetricsSlideSeq](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/FastqProcessing.wdl) task calculates prealignment metrics used for assessing data quality from the input FASTQ files. These metrics include the bead barcode distribution, UMI distribution, number of reads per cell, and number of UMIs per cell. These metrics are included in the final outputs of the workflow.

-#### 2. Filtering reads, trimming barcodes, and splitting FASTQs with sctools
+#### 2. Filtering reads, trimming barcodes, and splitting FASTQs

**Read filtering**

@@ -121,7 +121,7 @@ Barcodes that are more than one edit distance ([Hamming distance](https://www.nc

**Barcode trimming**

-The task uses sctools to trim spacer sequences from bead barcodes and UMIs for use by STARsolo, which requires continuous sample barcodes without spacer sequences between them. The input `read_structure` is used to parse the barcodes and remove any bases with tags other than C or M, which represent the bead barcode and UMI, respectively. For example, with a `read_structure` of 8C18X6C9M1X, bases represented by 18X and 1X are removed from the reads and the string of bases is rewritten with the structure 14C9M. Bases represented by tags other than X will also be removed during this step, so long as they are not C or M.
+The task uses warp-tools to trim spacer sequences from bead barcodes and UMIs for use by STARsolo, which requires continuous sample barcodes without spacer sequences between them. The input `read_structure` is used to parse the barcodes and remove any bases with tags other than C or M, which represent the bead barcode and UMI, respectively. For example, with a `read_structure` of 8C18X6C9M1X, bases represented by 18X and 1X are removed from the reads and the string of bases is rewritten with the structure 14C9M. Bases represented by tags other than X will also be removed during this step, so long as they are not C or M.
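+
+To make the trimming rule concrete, the following is a minimal Python sketch of the parsing logic described above (an illustration only, not the warp-tools implementation; the function name is hypothetical):
+
+```python
+import re
+
+def keep_barcode_and_umi_bases(read_structure: str, sequence: str) -> str:
+    """Keep only the bases whose read-structure tag is C (bead barcode) or M (UMI)."""
+    kept = []
+    offset = 0
+    # A read structure is a series of <length><tag> tokens, e.g. "8C18X6C9M1X".
+    for length, tag in re.findall(r"(\d+)([A-Z])", read_structure):
+        length = int(length)
+        segment = sequence[offset:offset + length]
+        offset += length
+        if tag in ("C", "M"):  # drop spacer (X) and any other non-C/M bases
+            kept.append(segment)
+    return "".join(kept)
+
+# Toy 42-base read laid out as 8C + 18X + 6C + 9M + 1X:
+read = "AAAAAAAA" + "N" * 18 + "CCCCCC" + "GGGGGGGGG" + "T"
+# The 23 retained bases follow the 14C9M structure described above.
+print(keep_barcode_and_umi_bases("8C18X6C9M1X", read))
+```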
**FASTQ splitting**

@@ -151,40 +151,40 @@ The resulting BAM files are merged together into a single BAM using the [MergeSo

**STARsolo outputs**

-The task’s output includes a coordinate-sorted BAM file containing the bead barcode-corrected reads and SAM attributes UB UR UY CR CB CY NH GX GN. Additionally, after counting, the task outputs three intermediate TSV files (features, barcodes, and matrix) used for downstream Loom matrix generation.
+The task’s output includes a coordinate-sorted BAM file containing the bead barcode-corrected reads and SAM attributes UB UR UY CR CB CY NH GX GN. Additionally, after counting, the task outputs three intermediate TSV files (features, barcodes, and matrix) used for downstream h5ad matrix generation.

#### 4. Calculating metrics

The [CalculateGeneMetrics](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Metrics.wdl), [CalculateUMIsMetrics](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Metrics.wdl), and [CalculateCellMetrics](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Metrics.wdl) tasks use [warp-tools](https://github.com/broadinstitute/warp-tools) to calculate summary metrics that help assess the per-bead and per-UMI quality of the data output each time this pipeline is run.

-These metrics output from both tasks are included in the output Loom matrix. A detailed list of these metrics is found in the [Slide-seq Count Matrix Overview](./count-matrix-overview.md).
+These metrics output from these tasks are included in the output h5ad matrix. A detailed list of these metrics is found in the [Slide-seq Count Matrix Overview](./count-matrix-overview.md).

#### 5. Merging the STAR outputs into NPY and NPZ arrays

The STARsolo output includes a features, barcodes, and matrix TSV for each of the partitioned FASTQ input files. The [MergeStarOutput task](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/StarAlign.wdl) merges each respective TSV. It uses a custom Python script to convert the merged matrix, features, and barcodes output from STARsolo into an NPY (features and barcodes)- and NPZ (the matrix)-formatted file.

-#### 6. Merging counts and metrics data into Loom-formatted matrix
+#### 6. Merging counts and metrics data into an h5ad-formatted matrix

-The [SingleNucleusOptimusLoomOutput](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/LoomUtils.wdl) task uses a custom python script to merge the converted STARsolo count matrix and the cell (bead) and gene metrics into a Loom-formatted bead-by-gene matrix. **These counts are raw and unfiltered.**
+The [SingleNucleusOptimusH5adOutput](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/H5adUtils.wdl) task uses a custom Python script to merge the converted STARsolo count matrix and the cell (bead) and gene metrics into an h5ad-formatted bead-by-gene matrix. **These counts are raw and unfiltered.**

Read full details for all the metrics in the [Slide-seq Count Matrix Overview](./count-matrix-overview.md).

**Gene counts**

-The type of gene counts in the Loom will vary depending on the value of the Slide-seq workflow input, `count_exons`. By default, `count_exons` is set to true and the output Loom will contain whole-gene counts with exon counts in an additional layer.
+The type of gene counts in the h5ad file will vary depending on the value of the Slide-seq workflow input, `count_exons`. By default, `count_exons` is set to true and the output h5ad file will contain whole-gene counts with exon counts in an additional layer.
-If the workflow is run with `count_exons` set to false, the output Loom file will contain whole-gene counts. Running the workflow in this configuration will cause the Loom matrix to have fewer columns (bead barcodes) due to the difference in STARsolo counting mode.
+If the workflow is run with `count_exons` set to false, the output h5ad file will contain whole-gene counts. Running the workflow in this configuration will cause the h5ad matrix to have fewer columns (bead barcodes) due to the difference in STARsolo counting mode.

-You can determine which type of counts are in the Loom by looking at the global attribute `expression_data_type`.
+You can determine which type of counts are in the h5ad by looking at the global attribute `expression_data_type`.

-After running the pipeline with `count_exons` set to true, you can access whole-gene and exonic counts using Loompy's `layers()` method. For example, `loompy.connect.layers[“”]` will return the whole-gene counts from the output Loom file. Similarly, `loompy.connect.layers[“exon_counts”]` will return the exonic counts from the output Loom.
+After running the pipeline with `count_exons` set to true, you can access whole-gene and exonic counts using the AnnData `layers` attribute. For example, `adata.layers["exon_counts"]` will return the exonic counts from the output h5ad; a short sketch at the end of this section shows this in context.

#### 7. Outputs

Output files of the pipeline include:

-1. Bead x Gene unnormalized count matrices in Loom format.
+1. Bead x Gene unnormalized count matrices in h5ad format.
2. Unfiltered, sorted BAM file with barcode and downstream analysis tags.
3. Bead metadata, including bead metrics.
4. Gene metadata, including gene metrics.

@@ -206,11 +206,9 @@ The following table lists the output files produced from the pipeline. For sampl
| fastq_umi_distribution | `.barcode_distribution_XM.txt` | Metric file containing the distribution of reads per UMI that were calculated prior to alignment. | TXT |
| fastq_reads_per_cell | `.numReads_perCell_XC.txt` | Metric file containing the number of reads per barcode that were calculated prior to alignment. | TXT |
| fastq_reads_per_umi | `.numReads_perCell_XM.txt` | Metric file containing the number of reads per UMI that were calculated prior to alignment. | TXT |
-| loom_output_file | `.loom` | Loom file containing count data and metadata. | Loom |
+| h5ad_output_file | `.h5ad` | h5ad file containing count data and metadata. | H5AD |

-The Loom matrix is the default output. See the [create_loom_slide_seq.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/create_loom_optimus.py) script for the detailed code. This matrix contains the unnormalized (unfiltered) count matrices, as well as the gene and bead barcode metrics detailed in the [Slide-seq Count Matrix Overview](./count-matrix-overview.md).
-
-The output Loom matrix can be converted to an H5AD file for downstream processing using a [custom script](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/loom_to_h5ad.py) available in the [warp-tools GitHub repository](https://github.com/broadinstitute/warp-tools).
+The h5ad matrix is the default output. This matrix contains the unnormalized (unfiltered) count matrices, as well as the gene and bead barcode metrics detailed in the [Slide-seq Count Matrix Overview](./count-matrix-overview.md).
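+
+As an illustration (a hedged sketch, not pipeline code; the filename is hypothetical), the h5ad output can be inspected with the [anndata](https://anndata.readthedocs.io/en/latest/) Python package:
+
+```python
+import anndata
+
+# Load the bead-by-gene matrix produced by the pipeline (hypothetical filename).
+adata = anndata.read_h5ad("slideseq_output.h5ad")
+
+# Whole-gene counts are stored in the main matrix; when count_exons is true,
+# exon counts are stored in an additional layer named "exon_counts".
+whole_gene_counts = adata.X
+exon_counts = adata.layers["exon_counts"]
+
+# Global attributes, such as expression_data_type, are kept in .uns;
+# bead metrics are in .obs and gene metrics are in .var.
+print(adata.uns["expression_data_type"])
+print(adata.obs.head())
+print(adata.var.head())
+```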
## Validation against on-prem pipeline

@@ -223,9 +221,15 @@ All Slide-seq pipeline releases are documented in the [Slide-seq changelog](http

## Citing the Slide-seq Pipeline
-Please identify the pipeline in your methods section using the Slide-seq Pipeline's [SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_023379/resolver?q=%22Slide-seq%22&l=%22Slide-seq%22&i=rrid:scr_023379).
+
+If you use the Slide-seq Pipeline in your research, please identify the pipeline in your methods section using the [Slide-seq SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_023379/resolver?q=%22Slide-seq%22&l=%22Slide-seq%22&i=rrid:scr_023379).
+
* Ex: *Slide-seq Pipeline (RRID:SCR_023379)*

+Please also consider citing our preprint:
+
+Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1
+
## Consortia support

This pipeline is supported by the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN) and BRAIN Initiative Cell Atlas Network (BICAN).

diff --git a/website/docs/Pipelines/SlideSeq_Pipeline/_category_.json b/website/docs/Pipelines/SlideSeq_Pipeline/_category_.json
index a658fab6e4..74b2f466a6 100644
--- a/website/docs/Pipelines/SlideSeq_Pipeline/_category_.json
+++ b/website/docs/Pipelines/SlideSeq_Pipeline/_category_.json
@@ -1,4 +1,4 @@
{
"label": "Slide-seq",
- "position": 13
+ "position": 14
}
\ No newline at end of file
diff --git a/website/docs/Pipelines/SlideSeq_Pipeline/count-matrix-overview.md b/website/docs/Pipelines/SlideSeq_Pipeline/count-matrix-overview.md
index 751ac6f605..8ca9dd3aac 100644
--- a/website/docs/Pipelines/SlideSeq_Pipeline/count-matrix-overview.md
+++ b/website/docs/Pipelines/SlideSeq_Pipeline/count-matrix-overview.md
@@ -4,28 +4,30 @@
sidebar_position: 2
---

# Slide-seq Count Matrix Overview

-The Slide-seq pipeline's default count matrix output is a Loom file generated using [Loompy v.3.0.6](http://loompy.org/).
+:::warning
+The Loom matrix is deprecated and the default matrix is now h5ad.
+:::

-It contains the raw bead-by-gene counts, which vary depending on the workflow's `count_exons` parameter. By default, `count_exons` is set to `true` and the output Loom will contain whole-gene counts with exon counts in an additional layer.
+The Slide-seq pipeline's default count matrix output is an h5ad file generated using [AnnData](https://anndata.readthedocs.io/en/latest/index.html).

-If the workflow is run with `count_exons` set to `false`, the output Loom file will contain whole-gene counts. Running the workflow in this configuration will cause the Loom matrix to have fewer columns (bead barcodes) due to the difference in STARsolo counting mode.
+It contains the raw bead-by-gene counts, which vary depending on the workflow's `count_exons` parameter. By default, `count_exons` is set to `true` and the output h5ad file will contain whole-gene counts with exon counts in an additional layer.

-You can determine which type of counts are in the Loom by looking at the global attribute `expression_data_type` (see [Table 1](#table-1-global-attributes) below).
+If the workflow is run with `count_exons` set to `false`, the output h5ad file will contain whole-gene counts.
Running the workflow in this configuration will cause the h5ad matrix to have fewer columns (bead barcodes) due to the difference in STARsolo counting mode.

-The matrix also contains multiple metrics for both individual bead barcodes (the columns of the matrix; [Table 2](#table-2-column-attributes-bead-barcode-metrics)) and individual genes (the rows of the matrix; [Table 3](#table-3-row-attributes-gene-metrics)).
+You can determine which type of counts are in the h5ad file by looking at the `expression_data_type` key in the unstructured metadata (the `anndata.uns` property of the matrix; see [Table 1](#table-1-global-attributes) below).
+
+The matrix also contains multiple metrics for both individual bead barcodes (the `anndata.obs` property of the matrix; [Table 2](#table-2-column-attributes-bead-barcode-metrics)) and individual genes (the `anndata.var` property of the matrix; [Table 3](#table-3-row-attributes-gene-metrics)).

## Table 1. Global attributes

-The global attributes in the Loom apply to the whole file, not any specific part.
+The global attributes (unstructured metadata) in the h5ad apply to the whole file, not any specific part.

| Attribute | Details |
| :-------- | :------ |
-| `CreationDate` | Date the Loom file was created. |
-| `LOOM_SPEC_VERSION` | Loom file spec version used during creation of the Loom file. |
| `expression_data_type` | String describing if the pipeline counted whole transcript (exonic and intronic) or only exonic reads determined by the value of the `count_exons` parameter. By default, `count_exons` is `true` and `expression_data_type` is `whole_transcript`; if `count_exons` is `false` then `expression_data_type` is `exonic`. |
| `input_id` | The `input_id` provided to the pipeline as input and listed in the pipeline configuration file. This can be any string, but it's recommended for this to be consistent with any sample metadata. |
-| `optimus_output_schema_version` | Loom file spec version used during creation of the Loom file. |
-| `pipeline_version` | Version of the Slide-seq pipeline used to generate the Loom file. |
+| `optimus_output_schema_version` | h5ad file spec version used during creation of the h5ad file. |
+| `pipeline_version` | Version of the Slide-seq pipeline used to generate the h5ad file. |

## Table 2. Column attributes (bead barcode metrics)

@@ -33,37 +35,45 @@ The bead barcode metrics below are computed using [TagSort](https://github.com/b

| Bead Barcode Metrics | Details |
| :------------------- | :------ |
+|`cell_names` | The unique identifier for each bead based on bead barcodes; identical to `CellID`. |
| `CellID` | The unique identifier for each bead based on bead barcodes; identical to `cell_names`. |
+|`n_reads`| The number of reads associated with this entity. n_reads, like all metrics, are calculated from the Slide-seq output BAM. Prior to alignment with STARsolo, reads are checked against the whitelist (1 Hamming distance). These CB-corrected reads are the input to the STAR aligner. Then, the reads also get CB correction during STAR. For this reason, almost all reads in the aligned BAM have a CB tag and UB tag. Therefore, n_reads represents CB-corrected reads, not all reads in the input FASTQ files. |
+|`noise_reads`| Number of reads that are categorized by 10x Genomics Cell Ranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. |
+|`perfect_molecule_barcodes`| The number of reads whose molecule barcodes contain no errors.
| +| `reads_mapped_exonic` | The number of unique reads counted as exon; counted when BAM file's `sF` tag is assigned to `1` or `3` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_exonic_as` | The number of reads counted as exon in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `2` or `4` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic` | The number of reads counted as intron; counted when the BAM file's `sF` tag is assigned to a `5` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic_as` | The number of reads counted as intron in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `6` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +|`reads_mapped_uniquely`| The number of reads mapped to a single unambiguous location in the genome; mitochondrial reads are excluded. | +|`reads_mapped_multiple`| The number of reads mapped to multiple genomic positions with equal confidence; mitochondrial reads are excluded. | +| `duplicate_reads` | The number of duplicate reads. | +|`spliced_reads`| The number of reads that overlap splicing junctions. | |`antisense_reads`| The number of reads that are mapped to the antisense strand instead of the transcribed strand. | -|`cell_barcode_fraction_bases_above_30_mean`| The average fraction of base calls for the bead barcode sequences that are greater than 30, across molecules. | -|`cell_barcode_fraction_bases_above_30_variance`| The variance of the fraction of base calls for the bead barcode sequences that are greater than 30, across molecules. | -|`cell_names` | The unique identifier for each bead based on bead barcodes; identical to `CellID`. | -|`fragments_per_molecule`| The average number of fragments associated with each molecule in this entity. | +|`n_molecules`| Number of molecules corresponding to this entity (only reflects reads with CB and UB tags). | +|`n_fragments`| Number of fragments corresponding to this entity. | |`fragments_with_single_read_evidence`| The number of fragments associated with this entity that are observed by only one read. | +|`molecules_with_single_read_evidence`| The number of molecules associated with this entity that are observed by only one read. | +|`perfect_cell_barcodes`| The number of reads whose bead barcodes contain no errors. | +| `reads_mapped_intergenic` | The number of reads counted as intergenic; counted when the BAM file's `sF` tag is assigned to a `7` and the `NH:i` tag is `1`. | +| `reads_unmapped` | The total number of reads that are unmapped; counted when the BAM file's `sF` tag is `0`. | +|`reads_mapped_too_many_loci`| The number of reads that were mapped to too many loci across the genome and as a consequence, are reported unmapped by the aligner. | +| `n_genes` | The number of genes detected by this bead. | | `genes_detected_multiple_observations` | The number of genes that are observed by more than one read in this entity. | -| `genomic_read_quality_mean` | Average quality of base calls in the genomic reads corresponding to this entity. | -| `genomic_read_quality_variance` | Variance in quality of base calls in the genomic reads corresponding to this entity. | -| `genomic_reads_fraction_bases_quality_above_30_mean` | The average fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. 
| -| `genomic_reads_fraction_bases_quality_above_30_variance` | The variance in the fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | -| `input_id` | The `input_id` provided to the pipeline as input and listed in the pipeline configuration file. This can be any string, but it's recommended for this to be consistent with any sample metadata. | | `molecule_barcode_fraction_bases_above_30_mean` | The average fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of this entity. | | `molecule_barcode_fraction_bases_above_30_variance` | The variance in the fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of this entity.| -|`molecules_with_single_read_evidence`| The number of molecules associated with this entity that are observed by only one read. | -|`n_fragments`| Number of fragments corresponding to this entity. | -| `n_genes` | The number of genes detected by this bead. | +| `genomic_reads_fraction_bases_quality_above_30_mean` | The average fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | +| `genomic_reads_fraction_bases_quality_above_30_variance` | The variance in the fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | +| `genomic_read_quality_mean` | Average quality of base calls in the genomic reads corresponding to this entity. | +| `genomic_read_quality_variance` | Variance in quality of base calls in the genomic reads corresponding to this entity. | +|`reads_per_molecule`| The average number of reads associated with each molecule in this entity. | +|`reads_per_fragment`| The average number of reads associated with each fragment in this entity. | +|`fragments_per_molecule`| The average number of fragments associated with each molecule in this entity. | +|`cell_barcode_fraction_bases_above_30_mean`| The average fraction of base calls for the bead barcode sequences that are greater than 30, across molecules. | +|`cell_barcode_fraction_bases_above_30_variance`| The variance of the fraction of base calls for the bead barcode sequences that are greater than 30, across molecules. | |`n_mitochondrial_genes`| The number of mitochondrial genes detected by this bead. | |`n_mitochondrial_molecules`| The number of molecules from mitochondrial genes detected for this bead. | -|`n_molecules`| Number of molecules corresponding to this entity (only reflects reads with CB and UB tags). | -|`n_reads`| The number of reads associated with this entity. n_reads, like all metrics, are calculated from the Optimus output BAM. Prior to alignment with STARsolo, reads are checked against the whitelist (1 hamming distance). These CB-corrected reads are the input to the STAR aligner. Then, the reads also get CB correction during STAR. For this reason, almost all reads in the aligned BAM have a CB tag and UB tag. Therefore, n_reads represents CB corrected reads, not all reads in the input FASTQ files. | -|`noise_reads`| Number of reads that are categorized by 10x Genomics Cell Ranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. | |`pct_mitochondrial_molecules`| The percentage of molecules from mitochondrial genes detected for this bead. | -|`perfect_cell_barcodes`| The number of reads whose bead barcodes contain no errors. 
| -|`perfect_molecule_barcodes`| The number of reads whose molecule barcodes contain no errors. | -|`reads_mapped_multiple`| The number of reads mapped to multiple genomic positions with equal confidence. | -|`reads_mapped_too_many_loci`| The number of reads that were mapped to too many loci across the genome and as a consequence, are reported unmapped by the aligner. | -|`reads_mapped_uniquely`| The number of reads mapped to a single unambiguous location in the genome. | -|`reads_per_fragment`| The average number of reads associated with each fragment in this entity. | -|`spliced_reads`| The number of reads that overlap splicing junctions. | +| `input_id` | The `input_id` provided to the pipeline as input and listed in the pipeline configuration file. This can be any string, but it's recommended for this to be consistent with any sample metadata. | ## Table 3. Row attributes (gene metrics) @@ -72,28 +82,36 @@ The gene metrics below are computed using [TagSort](https://github.com/broadinst | Gene Metrics | Details | | ------------ | ------- | +|`gene_names` | The unique `gene_name` provided in the [GENCODE GTF](https://www.gencodegenes.org/); identical to the `Gene` attribute. | +|`ensembl_ids` | The `gene_id` provided in the [GENCODE GTF](https://www.gencodegenes.org/). | | `Gene` | The unique `gene_name` provided in the [GENCODE GTF](https://www.gencodegenes.org/); identical to the `gene_names` attribute. | +|`n_reads`| The number of reads associated with this entity. n_reads, like all metrics, are calculated from the Slide-Seq output BAM. Prior to alignment with STARsolo, reads are checked against the whitelist (1 hamming distance). These CB-corrected reads are the input to the STAR aligner. Then, the reads also get CB correction during STAR. For this reason, almost all reads in the aligned BAM have a CB tag and UB tag. Therefore, n_reads represents CB corrected reads, not all reads in the input FASTQ files. | +|`noise_reads`| The number of reads that are categorized by 10x Genomics Cell Ranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. | +|`perfect_molecule_barcodes`| The number of reads with molecule barcodes that have no errors. | +| `reads_mapped_exonic` | The number of unique reads counted as exon; counted when BAM file's `sF` tag is assigned to `1` or `3` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_exonic_as` | The number of reads counted as exon in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `2` or `4` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic` | The number of reads counted as intron; counted when the BAM file's `sF` tag is assigned to a `5` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +| `reads_mapped_intronic_as` | The number of reads counted as intron in the antisense direction; counted when the BAM file's `sF` tag is assigned to a `6` and the `NH:i` tag is `1`; mitochondrial reads are excluded. | +|`reads_mapped_uniquely`| The number of reads mapped to a single unambiguous location in the genome; mitochondrial reads are excluded. | +|`reads_mapped_multiple`| The number of reads mapped to multiple genomic positions with equal confidence; mitochondrial reads are excluded. | +| `duplicate_reads` | The number of duplicate reads. | +|`spliced_reads`| The number of reads that overlap splicing junctions. 
| |`antisense_reads`| The number of reads that are mapped to the antisense strand instead of the transcribed strand. | -|`ensembl_ids` | The `gene_id` provided in the [GENCODE GTF](https://www.gencodegenes.org/). | -|`fragments_per_molecule`| The average number of fragments associated with each molecule in this entity. | -|`fragments_with_single_read_evidence`| The number of fragments associated with this entity that are observed by only one read. | -|`gene_names` | The unique `gene_name` provided in the [GENCODE GTF](https://www.gencodegenes.org/); identical to the `Gene` attribute. | -|`genomic_read_quality_mean`| Average quality of base calls in the genomic reads corresponding to this entity. | -|`genomic_read_quality_variance`| Variance in quality of base calls in the genomic reads corresponding to this entity. | -|`genomic_reads_fraction_bases_quality_above_30_mean`| The average fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | -|`genomic_reads_fraction_bases_quality_above_30_variance`| The variance in the fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | |`molecule_barcode_fraction_bases_above_30_mean`| The average fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of this entity. | |`molecule_barcode_fraction_bases_above_30_variance`| The variance in the fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of this entity. | -|`molecules_with_single_read_evidence`| The number of molecules associated with this entity that are observed by only one read. | -|`n_fragments`| Number of fragments corresponding to this entity. | +|`genomic_reads_fraction_bases_quality_above_30_mean`| The average fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | +|`genomic_reads_fraction_bases_quality_above_30_variance`| The variance in the fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity. | +|`genomic_read_quality_mean`| Average quality of base calls in the genomic reads corresponding to this entity. | +|`genomic_read_quality_variance`| Variance in quality of base calls in the genomic reads corresponding to this entity. | |`n_molecules`| Number of molecules corresponding to this entity (only reflects reads with CB and UB tags). | -|`n_reads`| The number of reads associated with this entity. n_reads, like all metrics, are calculated from the Optimus output BAM. Prior to alignment with STARsolo, reads are checked against the whitelist (1 hamming distance). These CB-corrected reads are the input to the STAR aligner. Then, the reads also get CB correction during STAR. For this reason, almost all reads in the aligned BAM have a CB tag and UB tag. Therefore, n_reads represents CB corrected reads, not all reads in the input FASTQ files. | -|`noise_reads`| The number of reads that are categorized by 10x Genomics Cell Ranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. | +|`n_fragments`| Number of fragments corresponding to this entity. | +|`reads_per_molecule`| The average number of reads associated with each molecule in this entity. | +|`reads_per_fragment`|The average number of reads associated with each fragment in this entity. 
| +|`fragments_per_molecule`| The average number of fragments associated with each molecule in this entity. | +|`fragments_with_single_read_evidence`| The number of fragments associated with this entity that are observed by only one read. | +|`molecules_with_single_read_evidence`| The number of molecules associated with this entity that are observed by only one read. | |`number_cells_detected_multiple`| The number of bead barcodes which observe more than one read of this gene. | |`number_cells_expressing`| The number of bead barcodes that detect this gene. | -|`perfect_molecule_barcodes`| The number of reads with molecule barcodes that have no errors. | -|`reads_mapped_multiple`| The number of reads mapped to multiple genomic positions with equal confidence. | -|`reads_mapped_uniquely`| The number of reads mapped to a single unambiguous location in the genome. | -|`reads_per_fragment`|The average number of reads associated with each fragment in this entity. | -|`reads_per_molecule`| The average number of reads associated with each molecule in this entity. | -|`spliced_reads`| The number of reads that overlap splicing junctions. | + +## Definitions +* Bead Barcode: Short nucleotide sequence used to label and distinguish which reads come from each unique bead, allowing for tracking of many beads simultaneously. diff --git a/website/docs/Pipelines/SlideSeq_Pipeline/slide-seq_diagram.png b/website/docs/Pipelines/SlideSeq_Pipeline/slide-seq_diagram.png index ce70363d23..385c653c20 100644 Binary files a/website/docs/Pipelines/SlideSeq_Pipeline/slide-seq_diagram.png and b/website/docs/Pipelines/SlideSeq_Pipeline/slide-seq_diagram.png differ diff --git a/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/README.md index 1f069a419d..1a6368c014 100644 --- a/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/README.md @@ -98,9 +98,15 @@ The Multi-SS2 Pipeline has been validated for processing human and mouse, strand Release information for the Multi-SS2 Pipeline can be found in the [changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md). Please note that any major changes to the Smart-seq2 pipeline will be documented in the [Smart-seq2 Single Sample changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md). ## Citing the Smart-seq2 Multi-Sample Pipeline -Please identify the pipeline in your methods section using the Smart-seq2 Multi-Sample Pipeline's [SciCrunch resource identifier](https://scicrunch.org/scicrunch/Resources/record/nlx_144509-1/SCR_018920/resolver?q=Smart-seq2&l=Smart-seq2). + +If you use the Smart-seq2 Multi-Sample Pipeline in your research, please identify the pipeline in your methods section using the [Smart-seq2 Multi-Sample SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_018920/resolver?q=SCR_018920&l=SCR_018920&i=rrid:scr_018920). + * Ex: *Smart-seq2 Multi-Sample Pipeline (RRID:SCR_018920)* +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. 
Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Consortia Support This pipeline is supported and used by the [Human Cell Atlas](https://www.humancellatlas.org/) (HCA) project. diff --git a/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/_category_.json b/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/_category_.json index a14bbb52df..c6c709a762 100644 --- a/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/_category_.json +++ b/website/docs/Pipelines/Smart-seq2_Multi_Sample_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Smart-seq2 Multi-Sample", - "position": 15 + "position": 16 } diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md index e21fe808ee..0bc4dcf415 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [MultiSampleSmartSeq2SingleNuclei_v1.2.28](https://github.com/broadinstitute/warp/releases) | January, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [MultiSampleSmartSeq2SingleNuclei_v1.3.1](https://github.com/broadinstitute/warp/releases) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![](./snSS2.png) @@ -177,10 +177,14 @@ The Multi-snSS2 pipeline was scientifically validated by the BRAIN Initiatives C All Multi-snSS2 release notes are documented in the [Multi-snSS2 changelog](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md). ## Citing the Multi-snSS2 Pipeline -To cite the Multi-snSS2 pipeline, use the [SciCrunch resource identifier](https://scicrunch.org/scicrunch/Resources/record/nlx_144509-1/SCR_021312/resolver). + +If you use the Multi-snSS2 Pipeline in your research, please identify the pipeline in your methods section using the [Multi-snSS2 SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_021312/resolver?q=SCR_021312&l=SCR_021312&i=rrid:scr_021312). + * Ex: *Smart-seq2 Single Nucleus Multi-Sample Pipeline (RRID:SCR_021312)* -To view an example of this citation as well as a publication-style methods section, see the Multi-snSS2 [Example Methods](./multi_snss2.methods.md). +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 ## Consortia Support This pipeline is supported and used by the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN). 
diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/_category_.json b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/_category_.json index 19995a09ed..7b7a9bf0ed 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/_category_.json +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Smart-seq2 Single Nucleus Multi-Sample", - "position": 14 + "position": 15 } diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md index 77dedddb0e..a758e085cb 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md @@ -2,13 +2,13 @@ sidebar_position: 2 --- -# Smart-seq2 Single Nucleus Multi-Sample v1.2.26 Publication Methods +# Smart-seq2 Single Nucleus Multi-Sample v1.3.1 Publication Methods Below we provide an example methods section for a publication. For the complete pipeline documentation, see the [Smart-seq2 Single Nucleus Multi-Sample Overview](./README.md). ## Methods -Data preprocessing and count matrix construction for a batch (or plate) were performed using the Smart-seq2 Single Nucleus Multi-Sample v1.2.26 Pipeline (RRID:SCR_021312) as well as Picard v.2.26.10 with default tool parameters unless otherwise specified. Genomic references are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/mm10/v0/single_nucleus?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in the [example workflow configuration](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/mouse_example.json) in GitHub. +Data preprocessing and count matrix construction for a batch (or plate) were performed using the Smart-seq2 Single Nucleus Multi-Sample v1.3.1 Pipeline (RRID:SCR_021312) as well as Picard v.2.26.10 with default tool parameters unless otherwise specified. Genomic references are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/mm10/v0/single_nucleus?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in the [example workflow configuration](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/mouse_example.json) in GitHub. For each nucleus in the batch, paired-end FASTQ files were first trimmed to remove adapters using the fastq-mcf tool with a subsampling parameter of 200,000 reads. The trimmed FASTQ files were then aligned to the GENCODE GRCm38 mouse genome using STAR v.2.7.10a. To count the number of reads per gene, but not isoforms, the quantMode parameter was set to GeneCounts. Multi-mapped reads, and optical and PCR duplicates, were removed from the resulting aligned BAM using the Picard MarkDuplicates tool with REMOVE_DUPLICATES = true. Metrics were collected on the deduplicated BAM using Picard CollectMultipleMetrics with VALIDATION_STRINGENCY =SILENT. 
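For readers who want to approximate the duplicate-removal and metrics steps described above outside of the WDL tasks, the following is a minimal Python sketch. It is illustrative only, not the pipeline's own code: the `picard.jar` path and BAM file names are hypothetical, and the workflow itself runs these tools inside its task containers.

```python
import subprocess

aligned_bam = "nucleus1.aligned.bam"  # hypothetical input BAM
dedup_bam = "nucleus1.dedup.bam"      # hypothetical output BAM

# Remove duplicates, mirroring the MarkDuplicates setting described above
# (REMOVE_DUPLICATES=true discards duplicate reads instead of only flagging them).
subprocess.run(
    ["java", "-jar", "picard.jar", "MarkDuplicates",
     f"I={aligned_bam}", f"O={dedup_bam}",
     "M=nucleus1.duplicate_metrics.txt", "REMOVE_DUPLICATES=true"],
    check=True,
)

# Collect metrics on the deduplicated BAM with relaxed validation, mirroring
# the VALIDATION_STRINGENCY=SILENT setting described above.
subprocess.run(
    ["java", "-jar", "picard.jar", "CollectMultipleMetrics",
     f"I={dedup_bam}", "O=nucleus1.multiple_metrics",
     "VALIDATION_STRINGENCY=SILENT"],
    check=True,
)
```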
diff --git a/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/README.md index 080b79071f..214484949e 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Smart-seq2_Single_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [smartseq2_v5.1.1](https://github.com/broadinstitute/warp/releases) | December, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [smartseq2_v5.1.20](https://github.com/broadinstitute/warp/releases) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![](./smartseq_image.png) @@ -33,7 +33,7 @@ Check out the [Smart-seq2 Publication Methods](../Smart-seq2_Multi_Sample_Pipeli | Genomic Reference Sequence (for validation)| GRCh38 human genome primary sequence and M21 (GRCm38.p6) mouse genome primary sequence | GENCODE [human reference files](https://www.gencodegenes.org/human/release_27.html) and [mouse reference files](https://www.gencodegenes.org/mouse/release_M21.html) | Transcriptomic Reference Annotation (for validation) | V27 GENCODE human transcriptome and M21 mouse transcriptome | GENCODE [human GTF](ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_27/gencode.v27.annotation.gtf.gz) and [mouse GTF](ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gff3.gz) | | Aligner | HISAT2 (v.2.1.0) | [Kim, et al.,2019](https://www.nature.com/articles/s41587-019-0201-4) | -| QC Metrics | Picard (v.2.10.10) | [Broad Institute](https://broadinstitute.github.io/picard/) | +| QC Metrics | Picard (v.2.26.10) | [Broad Institute](https://broadinstitute.github.io/picard/) | | Transcript Quantification | Utilities for processing large-scale single cell datasets | [RSEM v.1.3.0](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-323) | Data Input File Format | File format in which sequencing data is provided | [FASTQ](https://academic.oup.com/nar/article/38/6/1767/3112533) | | Data Output File Formats | File formats in which Smart-seq2 output is provided | [BAM](http://samtools.github.io/hts-specs/), Loom (generated with [Loompy v.3.0.6)](http://loompy.org/), CSV (QC metrics and counts) | @@ -99,7 +99,7 @@ Overall, the workflow is divided into two parts that are completed after an init **Part 1: Quality Control Tasks** 1. Aligns reads to the genome with HISAT2 v.2.1.0 - 2. Calculates summary metrics from an aligned BAM using Picard v.2.10.10 + 2. Calculates summary metrics from an aligned BAM using Picard v.2.26.10 **Part 2: Transcriptome Quantification Tasks** 1. 
Aligns reads to the transcriptome with HISAT v.2.1.0 @@ -133,11 +133,11 @@ HISAT2 is a fast, cost-efficient alignment tool that can determine the presence The [Picard task](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Picard.wdl) generates QC metrics by using three sub-tasks: -* CollectMultipleMetrics: calls the [CollectMultipleMetrics](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.0.0.0/picard_analysis_CollectMultipleMetrics.php) tool which uses the aligned BAM file and reference genome fasta to collect metrics on [alignment](http://broadinstitute.github.io/picard/picard-metric-definitions.html#AlignmentSummaryMetrics), [insert size](http://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics), [GC bias](https://broadinstitute.github.io/picard/command-line-overview.html#CollectGcBiasMetrics), [base distribution by cycle](http://broadinstitute.github.io/picard/picard-metric-definitions.html#BaseDistributionByCycleMetrics), [quality score distribution](https://broadinstitute.github.io/picard/command-line-overview.html#QualityScoreDistribution), [quality distribution by cycle](https://broadinstitute.github.io/picard/command-line-overview.html#MeanQualityByCycle), [sequencing artifacts](http://broadinstitute.github.io/picard/picard-metric-definitions.html#ErrorSummaryMetrics), and [quality yield](http://broadinstitute.github.io/picard/picard-metric-definitions.html#CollectQualityYieldMetrics.QualityYieldMetrics). +* CollectMultipleMetrics: calls the [CollectMultipleMetrics](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.2.6.1/picard_analysis_CollectMultipleMetrics.php) tool which uses the aligned BAM file and reference genome fasta to collect metrics on [alignment](http://broadinstitute.github.io/picard/picard-metric-definitions.html#AlignmentSummaryMetrics), [insert size](http://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics), [GC bias](https://broadinstitute.github.io/picard/command-line-overview.html#CollectGcBiasMetrics), [base distribution by cycle](http://broadinstitute.github.io/picard/picard-metric-definitions.html#BaseDistributionByCycleMetrics), [quality score distribution](https://broadinstitute.github.io/picard/command-line-overview.html#QualityScoreDistribution), [quality distribution by cycle](https://broadinstitute.github.io/picard/command-line-overview.html#MeanQualityByCycle), [sequencing artifacts](http://broadinstitute.github.io/picard/picard-metric-definitions.html#ErrorSummaryMetrics), and [quality yield](http://broadinstitute.github.io/picard/picard-metric-definitions.html#CollectQualityYieldMetrics.QualityYieldMetrics). -* CollectRnaMetrics: calls the [CollectRnaSeqMetrics](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.0.0.0/picard_analysis_CollectRnaSeqMetrics.php) tool which uses the aligned BAM, a RefFlat genome annotation file, and a ribosomal intervals file to produce RNA alignment metrics (metric descriptions are found in the [Picard Metrics Dictionary](http://broadinstitute.github.io/picard/picard-metric-definitions.html#RnaSeqMetrics)). 
+* CollectRnaMetrics: calls the [CollectRnaSeqMetrics](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.2.6.1/picard_analysis_CollectRnaSeqMetrics.php) tool which uses the aligned BAM, a RefFlat genome annotation file, and a ribosomal intervals file to produce RNA alignment metrics (metric descriptions are found in the [Picard Metrics Dictionary](http://broadinstitute.github.io/picard/picard-metric-definitions.html#RnaSeqMetrics)). -* CollectDuplicationMetrics: calls the [MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.0.4.0/picard_sam_markduplicates_MarkDuplicates.php) tool which uses the aligned BAM to identify duplicate reads (output metrics are listed in the [Picard Metrics Dictionary](http://broadinstitute.github.io/picard/picard-metric-definitions.html#DuplicationMetrics)). +* CollectDuplicationMetrics: calls the [MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.2.6.1/picard_sam_markduplicates_MarkDuplicates.php) tool which uses the aligned BAM to identify duplicate reads (output metrics are listed in the [Picard Metrics Dictionary](http://broadinstitute.github.io/picard/picard-metric-definitions.html#DuplicationMetrics)). #### Part 2: Transcriptome Quantification Tasks @@ -211,9 +211,15 @@ The SS2 pipeline has been validated for processing human and mouse, stranded or All SS2 release notes are documented in the [Smartseq2 Single Sample changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md). ## Citing the Smart-seq2 Single Sample Pipeline -Please identify the SS2 pipeline in your methods section using the Smart-seq2 Single Sample Pipeline's [SciCrunch resource identifier](https://scicrunch.org/browse/resourcedashboard). + +If you use the Smart-seq2 Single Sample Pipeline in your research, please identify the pipeline in your methods section using the [Smart-seq2 Single Sample SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_021228/resolver?q=SCR_021228&l=SCR_021228&i=rrid:scr_021228). + * Ex: *Smart-seq2 Single Sample Pipeline (RRID:SCR_021228)* +Please also consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Consortia Support This pipeline is supported and used by the [Human Cell Atlas](https://www.humancellatlas.org/) (HCA) project. 
diff --git a/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/_category_.json b/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/_category_.json index 75e33d384f..5c2d6a9b2a 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/_category_.json +++ b/website/docs/Pipelines/Smart-seq2_Single_Sample_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Smart-seq2 Single Sample", - "position": 16 + "position": 17 } diff --git a/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/README.md b/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/README.md index bd97ab96fa..923b419675 100644 --- a/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/README.md +++ b/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/README | Pipeline Version | Date Updated | Documentation Authors | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [UltimaGenomicsWholeGenomeGermline_v1.0.13](https://github.com/broadinstitute/warp/releases) | December, 2023 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) & [Kaylee Mathews](mailto:kmathews@broadinstitute.org)| Please file GitHub issues in warp or contact [the wARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [UltimaGenomicsWholeGenomeGermline_v1.0.15](https://github.com/broadinstitute/warp/releases) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) & [Kaylee Mathews](mailto:kmathews@broadinstitute.org)| Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![UG_diagram](ug_diagram.png) @@ -272,7 +272,11 @@ The outputs of the UG_WGS workflow are not yet compatible with the WARP [Ultimat All UG_WGS pipeline releases are documented in the [pipeline changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md). - +## Citing the UG_WGS Pipeline + +If you use the UG_WGS Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131.
https://doi.org/10.20944/preprints202401.2131.v1 ## Feedback diff --git a/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/_category_.json b/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/_category_.json index 010f0be5a3..edca41ff15 100644 --- a/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/_category_.json +++ b/website/docs/Pipelines/Ultima_Genomics_Whole_Genome_Germline_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Ultima Genomics Whole Genome Germline", - "position": 18 + "position": 19 } diff --git a/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README.md b/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README.md index 27ad3c9355..bec572c824 100644 --- a/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| WholeGenomeGermlineSingleSample_v3.1.17 (see [releases page](https://github.com/broadinstitute/warp/releases)) | December, 2023 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| WholeGenomeGermlineSingleSample_v3.1.19 (see [releases page](https://github.com/broadinstitute/warp/releases)) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the Whole Genome Germline Single Sample Pipeline The Whole Genome Germline Single Sample (WGS) pipeline implements data pre-processing and initial variant calling according to the GATK Best Practices for germline SNP and Indel discovery in human whole-genome sequencing data. It includes the DRAGEN-GATK mode, which makes the pipeline functionally equivalent to DRAGEN’s analysis pipeline (read more in this [DRAGEN-GATK blog](https://gatk.broadinstitute.org/hc/en-us/articles/360039984151)). @@ -369,6 +369,12 @@ The final CRAM files have base quality scores binned according to the [Functiona - When the pipeline runs in the **dragen_functional_equivalence_mode**, it produces functionally equivalent outputs to the DRAGEN pipeline. - Additional information about the GATK tool parameters and the DRAGEN-GATK best practices pipeline can be found on the [GATK support site](https://gatk.broadinstitute.org/hc/en-us). +## Citing the WGS Pipeline + +If you use the WGS Pipeline in your research, please consider citing our preprint: + +Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 + ## Contact us Please help us make our tools better by contacting [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) for pipeline-related suggestions or questions. 
diff --git a/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/_category_.json b/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/_category_.json index 7fd28c7d80..d44ed244cb 100644 --- a/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/_category_.json +++ b/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/_category_.json @@ -1,4 +1,4 @@ { "label": "Whole Genome Germline Single Sample", - "position": 19 + "position": 20 } diff --git a/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/wgs.methods.md b/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/wgs.methods.md index 842dd9ec59..01bd7457d0 100644 --- a/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/wgs.methods.md +++ b/website/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/wgs.methods.md @@ -2,13 +2,13 @@ sidebar_position: 2 --- -# Whole Genome Germline Single Sample v3.1.17 Methods (Default workflow) +# Whole Genome Germline Single Sample v3.1.19 Methods (Default workflow) The following contains a detailed methods description outlining the pipeline’s process, software, and tools that can be modified for a publication methods section. ## Detailed methods for the default Whole Genome Germline Single Sample workflow -Preprocessing and variant calling was performed using the WholeGenomeGermlineSingleSample v3.1.17 pipeline using Picard v2.26.10, GATK v4.3.0.0, and Samtools v1.11 with default tool parameters unless otherwise specified. All reference files are available in the public [Broad References Google Bucket](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0). The pipeline follows GATK Best Practices as previously described ([Van der Auwera & O'Connor, 2020](https://www.oreilly.com/library/view/genomics-in-the/9781491975183/)) as well as the Functional Equivalence specification ([Regier et al., 2018](https://www.nature.com/articles/s41467-018-06159-4)). +Preprocessing and variant calling was performed using the WholeGenomeGermlineSingleSample v3.1.19 pipeline using Picard v2.26.10, GATK v4.5.0.0, and Samtools v1.11 with default tool parameters unless otherwise specified. All reference files are available in the public [Broad References Google Bucket](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0). The pipeline follows GATK Best Practices as previously described ([Van der Auwera & O'Connor, 2020](https://www.oreilly.com/library/view/genomics-in-the/9781491975183/)) as well as the Functional Equivalence specification ([Regier et al., 2018](https://www.nature.com/articles/s41467-018-06159-4)). ### Pre-processing and quality control metrics @@ -34,7 +34,7 @@ The pipeline’s final outputs included metrics, validation reports, an aligned ## Detailed methods for the Functional Equivalence mode of the Whole Genome Germline Single Sample workflow -Preprocessing and variant calling was performed using the WholeGenomeGermlineSingleSample v3.1.17 pipeline using v2.26.10, GATK v4.3.0.0, and Samtools v1.11 with default tool parameters unless otherwise specified. All reference files are available in the public [Broad References Google Bucket](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0). The pipeline is functionally equivalent (as described in GATK Support: https://gatk.broadinstitute.org/hc/en-us/articles/4410456501915) to DRAGEN v3.4.12. 
+Preprocessing and variant calling was performed using the WholeGenomeGermlineSingleSample v3.1.19 pipeline using Picard v2.26.10, GATK v4.5.0.0, and Samtools v1.11 with default tool parameters unless otherwise specified. All reference files are available in the public [Broad References Google Bucket](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0). The pipeline is functionally equivalent (as described in GATK Support: https://gatk.broadinstitute.org/hc/en-us/articles/4410456501915) to DRAGEN v3.4.12. ### Pre-processing and quality control metrics diff --git a/website/docs/Pipelines/snM3C/README.md b/website/docs/Pipelines/snM3C/README.md index 397cada01b..d0606addb9 100644 --- a/website/docs/Pipelines/snM3C/README.md +++ b/website/docs/Pipelines/snM3C/README.md @@ -6,14 +6,16 @@ slug: /Pipelines/snM3C/README | Pipeline Version | Date Updated | Documentation Authors | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [snM3C_v1.0.1](https://github.com/broadinstitute/warp/releases) | February, 2024 | [Kaylee Mathews](mailto:warp-pipelines-help@broadinsitute.org) | Please file GitHub issues in the [WARP repository](https://github.com/broadinstitute/warp/issues) | +| [snM3C_v3.0.0](https://github.com/broadinstitute/warp/releases) | March, 2024 | [Kaylee Mathews](mailto:warp-pipelines-help@broadinstitute.org) | Please file GitHub issues in the [WARP repository](https://github.com/broadinstitute/warp/issues) | ## Introduction to snM3C The Single Nucleus Methyl-Seq and Chromatin Capture (snM3C) workflow is an open-source, cloud-optimized computational workflow for processing single-nucleus methylome and chromatin contact (snM3C) sequencing data. The workflow is designed to demultiplex and align raw sequencing reads, call chromatin contacts, and generate summary metrics. -The workflow is developed in collaboration with Hanqing Liu and the laboratory of Joseph Ecker. For more information about the snM3C tools and analysis, please see the [YAP documentation](https://hq-1.gitbook.io/mc/) or the [cemba_data](https://github.com/lhqing/cemba_data) GitHub repository created by Hanqing Liu. +The workflow is developed in collaboration with Hanqing Liu, Wei Tian, Wubin Ding, Huaming Chen, Chongyuan Luo, and the entire laboratory of Joseph Ecker. + +For more information about the snM3C tools and analysis, please see the [YAP documentation](https://hq-1.gitbook.io/mc/) or the [cemba_data](https://github.com/lhqing/cemba_data) GitHub repository created by Hanqing Liu. ## Quickstart table The following table provides a quick glance at the snM3C pipeline features: @@ -76,15 +78,10 @@ Overall, the snM3C workflow: 1. Demultiplexes, sorts, and trims reads. 2. Aligns paired-end reads. -3. Separates unmapped, uniquely aligned, multi-aligned reads. -4. Splits unmapped reads by enzyme cut sites. -5. Aligns unmapped, single-end reads. -6. Removes overlapping reads. -7. Merges mapped reads from single- and paired-end alignments. -8. Calls chromatin contacts. -9. Removes duplicate reads. -10. Creates ALLC file. -11. Creates summary output file. +3. Separates unmapped, uniquely aligned, and multi-aligned reads and splits unmapped reads by enzyme cut site. +4. Aligns unmapped, single-end reads and removes overlapping reads. +5. Merges mapped reads, calls chromatin contacts, and creates ALLC files. +6. Creates summary output file. The tools each snM3C task employs are detailed in the table below.
@@ -95,17 +92,12 @@ To see specific tool parameters, select the [workflow WDL link](https://github.c | Demultiplexing | Cutadapt | [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) | Performs demultiplexing to cell-level FASTQ files based on random primer indices. | | Sort_and_trim_r1_and_r2 | Cutadapt | [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) | Sorts, filters, and trims reads using the `r1_adapter`, `r2_adapter`, `r1_left_cut`, `r1_right_cut`, `r2_left_cut`, and `r2_right_cut` input parameters. | | Hisat_3n_pair_end_mapping_dna_mode | HISAT-3N | [HISAT-3N](https://daehwankimlab.github.io/hisat2/hisat-3n/) | Performs paired-end read alignment. | -| Separate_unmapped_reads | [hisat3n_general.py](https://github.com/lhqing/cemba_data/blob/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/hisat3n_general.py) | python3 | Imports a custom python3 script developed by Hanqing Liu and calls the `separate_unique_and_multi_align_reads()` function to separate unmapped, uniquely aligned, multi-aligned reads from HISAT-3N BAM file; unmapped reads are stored in an unmapped FASTQ file and uniquely and multi-aligned reads are stored in separate BAM files. | -| Split_unmapped_reads | [hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py) | python3 | Imports a custom python3 script developed by Hanqing Liu and calls the `split_hisat3n_unmapped_reads()` function to split the unmapped reads FASTQ file by all possible enzyme cut sites and output new R1 and R2 FASTQ files. | -| Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name | HISAT-3N | [HISAT-3N](https://daehwankimlab.github.io/hisat2/hisat-3n/) | Performs single-end alignment of unmapped reads to maximize read mapping. | -| remove_overlap_read_parts | [hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py) | python3 | Imports a custom python3 script developed by Hanqing Liu and calls the `remove_overlap_read_parts()` function to remove overlapping reads from the split alignment BAM file produced during single-end alignment. | -| merge_original_and_split_bam_and_sort_all_reads_by_name_and_position | merge, sort | [samtools](https://www.htslib.org/) | Merges and sorts all mapped reads from the paired-end and single-end alignments; creates a position-sorted BAM file and a name-sorted BAM file. | -| call_chromatin_contacts | [hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py) | python3 | Imports a custom python3 script developed by Hanqing Liu and calls the `call_chromatin_contacts()` function to call chromatin contacts from the name-sorted, merged BAM file; reads are considered chromatin contacts if they are greater than 2,500 base pairs apart. | -| dedup_unique_bam_and_index_unique_bam | MarkDuplicates | [Picard](https://broadinstitute.github.io/picard/) | Removes duplicate reads from the position-sorted, merged BAM file. | -| unique_reads_allc | bam-to-allc | [ALLCools](https://lhqing.github.io/ALLCools/intro.html) | Creates an ALLC file with a list of methylation points. | -| unique_reads_cgn_extraction | extract-allc | [ALLCools](https://lhqing.github.io/ALLCools/intro.html) | Creates an ALLC file containing methylation contexts. 
| +| Separate_and_split_unmapped_reads | [hisat3n_general.py](https://github.com/lhqing/cemba_data/blob/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/hisat3n_general.py), [hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py) | python3 | Imports 2 custom python3 scripts developed by Hanqing Liu and calls the `separate_unique_and_multi_align_reads()` and `split_hisat3n_unmapped_reads()` functions to separate unmapped, uniquely aligned, multi-aligned reads from HISAT-3N BAM file, then splits the unmapped reads FASTQ file by all possible enzyme cut sites and output new R1 and R2 FASTQ files; unmapped reads are stored in unmapped FASTQ files and uniquely and multi-aligned reads are stored in separate BAM files. | +| hisat_single_end | HISAT-3N, [hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py) | [HISAT-3N](https://daehwankimlab.github.io/hisat2/hisat-3n/), python3 | Performs single-end alignment of unmapped reads to maximize read mapping, imports a custom python3 script developed by Hanqing Liu, and calls the `remove_overlap_read_parts()` function to remove overlapping reads from the split alignment BAM file produced during single-end alignment. | +| merge_sort_analyze | merge, sort, MarkDuplicates, [hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py), bam-to-allc, extract-allc | [samtools](https://www.htslib.org/), [Picard](https://broadinstitute.github.io/picard/), python3, [ALLCools](https://lhqing.github.io/ALLCools/intro.html) | Merges and sorts all mapped reads from the paired-end and single-end alignments; creates a position-sorted BAM file and a name-sorted BAM file; removes duplicate reads from the position-sorted, merged BAM file; imports a custom python3 script developed by Hanqing Liu and calls the `call_chromatin_contacts()` function to call chromatin contacts from the name-sorted, merged BAM file; reads are considered chromatin contacts if they are greater than 2,500 base pairs apart; creates a first ALLC file with a list of methylation points and a second ALLC file containing methylation contexts. | | summary | [summary.py](https://github.com/lhqing/cemba_data/blob/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/summary.py) | python3 | Imports a custom python3 script developed by Hanqing Liu and calls the `snm3c_summary()` function to generate a single, summary file for the pipeline in TSV format; contains trimming, mapping, deduplication, chromatin contact, and AllC site statistics. | + #### 1. Demultiplexes, sorts, and trims reads In the first step of the pipeline (`Demultiplexing`), raw sequencing reads are demultiplexed by random primer index into cell-level FASTQ files using [Cutadapt](https://cutadapt.readthedocs.io/en/stable/). For more information on barcoding, see the [YAP documentation](https://hq-1.gitbook.io/mc/tech-background/barcoding#two-round-of-barcoding). @@ -114,37 +106,37 @@ After demultiplexing, the pipeline uses [Cutadapt](https://cutadapt.readthedocs. #### 2. Aligns paired-end reads In the next step of the pipeline, the `Hisat_3n_pair_end_mapping_dna_mode` task uses [HISAT-3N](https://daehwankimlab.github.io/hisat2/hisat-3n/) to perform paired-end read alignment to a reference genome FASTA file (`genome_fa`) and outputs an aligned BAM file. 
Additionally, the task outputs a stats file and a text file containing the genomic reference version used. -#### 3. Separates unmapped, uniquely aligned, multi-aligned reads -After paired-end alignment, the pipeline calls the `Separate_unmapped_reads` task, which imports a custom python3 script ([hisat3n_general.py](https://github.com/lhqing/cemba_data/blob/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/hisat3n_general.py)) developed by Hanqing Liu. The task calls the script's `separate_unique_and_multi_align_reads()` function to separate unmapped, uniquely aligned, and multi-aligned reads from the HISAT-3N BAM file. Three new files are output from this step of the pipeline: +#### 3. Separates unmapped, uniquely aligned, and multi-aligned reads and splits unmapped reads by enzyme cut site + +After paired-end alignment, the pipeline calls the `Separate_and_split_unmapped_reads` task, which imports a custom python3 script ([hisat3n_general.py](https://github.com/lhqing/cemba_data/blob/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/hisat3n_general.py)) developed by Hanqing Liu. The task calls the script's `separate_unique_and_multi_align_reads()` function to separate unmapped, uniquely aligned, and multi-aligned reads from the HISAT-3N BAM file. Three new files are output from this step of the pipeline: 1. A FASTQ file that contains the unmapped reads (`unmapped_fastq_tar`) 2. A BAM file that contains the uniquely aligned reads (`unique_bam_tar`) 3. A BAM file that contains the multi-aligned reads (`multi_bam_tar`) -#### 4. Splits unmapped reads by enzyme cut sites -The `Split_unmapped_reads` task imports a custom python3 script ([hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py)) developed by Hanqing Liu and calls the script's `split_hisat3n_unmapped_reads()` function. This splits the FASTQ file containing the unmapped reads by all possible enzyme cut sites and outputs new R1 and R2 files. +After separating reads, the task imports a custom python3 script ([hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py)) developed by Hanqing Liu and calls the script's `split_hisat3n_unmapped_reads()` function. This splits the FASTQ file containing the unmapped reads by all possible enzyme cut sites and outputs new R1 and R2 files. + +#### 4. Aligns unmapped, single-end reads and removes overlapping reads +In the next step of the pipeline, the `hisat_single_end` task uses [HISAT-3N](https://daehwankimlab.github.io/hisat2/hisat-3n/) to perform single-end read alignment of the previously unmapped reads to maximize read mapping and outputs a single, aligned BAM file. -#### 5. Aligns unmapped, single-end reads -In the next step of the pipeline, the `Hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name ` task uses [HISAT-3N](https://daehwankimlab.github.io/hisat2/hisat-3n/) to perform single-end read alignment of the previously unmapped reads to maximize read mapping and outputs a single, aligned BAM file. +After the second alignment step, the task imports a custom python3 script ([hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py)) developed by Hanqing Liu.
The task calls the script's `remove_overlap_read_parts()` function to remove overlapping reads from the BAM file produced during single-end alignment and output another BAM file. -#### 6. Removes overlapping reads -After the second alignment step, the pipeline calls the `remove_overlap_read_parts ` task, which imports a custom python3 script ([hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py)) developed by Hanqing Liu. The task calls the script's `remove_overlap_read_parts()` function to remove overlapping reads from the BAM file produced during single-end alignment and output another BAM file. +#### 5. Merges mapped reads, calls chromatin contacts, and creates ALLC files -#### 7. Merges mapped reads from single- and paired-end alignments -The `merge_original_and_split_bam_and_sort_all_reads_by_name_and_position` task uses [samtools](https://www.htslib.org/) to merge and sort all of the mapped reads from the paired-end and single-end alignments into a single BAM file. The BAM file is output as both a position-sorted and a name-sorted BAM file. +**Merges mapped reads** +The `merge_sort_analyze` task uses [samtools](https://www.htslib.org/) to merge and sort all of the mapped reads from the paired-end and single-end alignments into a single BAM file. The BAM file is output as both a position-sorted and a name-sorted BAM file. -#### 8. Calls chromatin contacts -In the `call_chromatin_contacts` task, the pipeline imports a custom python3 script ([hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py)) developed by Hanqing Liu. The task calls the script's `call_chromatin_contacts()` function to call chromatin contacts from the name-sorted, merged BAM file. If reads are greater than 2,500 base pairs apart, they are considered chromatin contacts. If reads are less than 2,500 base pairs apart, they are considered the same fragment. +After merging, the task uses Picard's MarkDuplicates tool to remove duplicate reads from the position-sorted, merged BAM file and output a deduplicated BAM file. -#### 9. Removes duplicate reads -After calling chromatin contacts, the `dedup_unique_bam_and_index_unique_bam` task uses Picard's MarkDuplicates tool to remove duplicate reads from the position-sorted, merged BAM file and output a deduplicated BAM file. +**Calls chromatin contacts** +Next, the pipeline imports a custom python3 script ([hisat3n_m3c.py](https://github.com/lhqing/cemba_data/blob/bf6248239074d0423d45a67d83da99250a43e50c/cemba_data/hisat3n/hisat3n_m3c.py)) developed by Hanqing Liu. The task calls the script's `call_chromatin_contacts()` function to call chromatin contacts from the name-sorted, merged BAM file. If reads are greater than 2,500 base pairs apart, they are considered chromatin contacts. If reads are less than 2,500 base pairs apart, they are considered the same fragment.
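To make the 2,500 base pair rule concrete, here is a minimal, illustrative sketch. It is not the pipeline's code (the real logic lives in the `call_chromatin_contacts()` function of hisat3n_m3c.py); treating mates on different chromosomes as contacts is an assumption here, and the coordinates are made up:

```python
CONTACT_MIN_DISTANCE = 2500  # base pairs, per the rule described above

def is_chromatin_contact(chrom1, pos1, chrom2, pos2):
    # Assumption: mates on different chromosomes count as contacts.
    if chrom1 != chrom2:
        return True
    # Same chromosome: pairs more than 2,500 bp apart are contacts;
    # anything closer is treated as the same fragment.
    return abs(pos2 - pos1) > CONTACT_MIN_DISTANCE

print(is_chromatin_contact("chr1", 10_000, "chr1", 11_200))   # False: same fragment
print(is_chromatin_contact("chr1", 10_000, "chr1", 500_000))  # True: contact
```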

-#### 10. Creates ALLC file
-The `unique_reads_allc` task uses the [ALLCools](https://lhqing.github.io/ALLCools/intro.html) `bam-to-allc` function to create an ALLC file from the deduplicated BAM file that contains a list of methylation points. The `num_upstr_bases` and `num_downstr_bases` input parameters are used to define the number of bases upstream and downstream of the C base to include in the ALLC context column.
+**Creates ALLC files**
+After calling chromatin contacts, the task uses the [ALLCools](https://lhqing.github.io/ALLCools/intro.html) `bam-to-allc` function to create an ALLC file from the deduplicated BAM file that contains a list of methylation points. The `num_upstr_bases` and `num_downstr_bases` input parameters are used to define the number of bases upstream and downstream of the C base to include in the ALLC context column.

-Next, the `unique_reads_cgn_extraction` task uses the [ALLCools](https://lhqing.github.io/ALLCools/intro.html) `extract-allc` function to extract methylation contexts from the input ALLC file and output a second ALLC file that can be used to generate an [MCDS file](https://github.com/lhqing/allcools_doc/blob/master/tech-background/file-formats.md#mcds-file).
+Next, the task uses the [ALLCools](https://lhqing.github.io/ALLCools/intro.html) `extract-allc` function to extract methylation contexts from the input ALLC file and output a second ALLC file that can be used to generate an [MCDS file](https://github.com/lhqing/allcools_doc/blob/master/tech-background/file-formats.md#mcds-file).
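+
+The two ALLCools calls might look roughly like the sketch below; the file names are placeholders, and the flag names should be verified against the ALLCools CLI help and the WDL task before use:
+
+```python
+import subprocess
+
+# Convert the deduplicated BAM into an ALLC table of methylation points.
+# num_upstr_bases/num_downstr_bases control the context column described above.
+subprocess.run([
+    "allcools", "bam-to-allc",
+    "--bam_path", "all_reads.pos_sort.dedup.bam",
+    "--reference_fasta", "genome.fa",
+    "--output_path", "cell.allc.tsv.gz",
+    "--num_upstr_bases", "0",
+    "--num_downstr_bases", "2",
+], check=True)
+
+# Extract CGN-context sites into a second ALLC file for MCDS generation
+# (the chrom-size input is an assumed flag; check the task's actual inputs).
+subprocess.run([
+    "allcools", "extract-allc",
+    "--allc_path", "cell.allc.tsv.gz",
+    "--output_prefix", "cell.extract",
+    "--mc_contexts", "CGN",
+    "--chrom_size_path", "genome.chrom.sizes",
+], check=True)
+```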

-#### 11. Creates summary output file
+#### 6. Creates summary output file
In the last step of the pipeline, the `summary` task imports a custom python3 script ([summary.py](https://github.com/lhqing/cemba_data/blob/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/summary.py)) developed by Hanqing Liu. The task calls the script's `snm3c_summary()` function to generate a single summary file for the pipeline in TSV format that contains trimming, mapping, deduplication, chromatin contact, and ALLC site statistics. This is the main output of the pipeline.

## Outputs

@@ -154,34 +146,44 @@ The following table lists the output variables and files produced by the pipelin
| Output name | Filename, if applicable | Output format and description |
| ------ | ------ | ------ |
| MappingSummary | `_MappingSummary.csv.gz` | Mapping summary file in CSV format. |
-| trimmed_stats | `.trimmed_stats_files.tar.gz` | Array of tarred files containing trimming stats files; for more information, see the [Cutadapt documentation](https://cutadapt.readthedocs.io/en/stable/guide.html#reporting). |
-| r1_trimmed_fq | `.R1_trimmed_files.tar.gz` | Array of tarred files containing trimmed R1 FASTQ files. |
-| r2_trimmed_fq | `.R2_trimmed_files.tar.gz` | Array of tarred files containing trimmed R2 FASTQ files. |
-| hisat3n_stats_tar | `.hisat3n_paired_end_stats_files.tar.gz` | Array of tarred files containing paired-end alignment summary files; see the [HISAT2 alignment summary documentation](https://daehwankimlab.github.io/hisat2/manual/) for more information. |
-| hisat3n_bam_tar | `.hisat3n_paired_end_bam_files.tar.gz` | Array of tarred files containing BAM files from paired-end alignment. |
-| unique_bam_tar | `.hisat3n_paired_end_unique_bam_files.tar.gz` | Array of tarred files containing BAM files with uniquely aligned reads from paired-end alignment. |
-| multi_bam_tar | `.hisat3n_paired_end_multi_bam_files.tar.gz` | Array of tarred files containing BAM files with multi-aligned reads from paired-end alignment. |
-| unmapped_fastq_tar | `.hisat3n_paired_end_unmapped_fastq_files.tar.gz` | Array of tarred files containing FASTQ files with unmapped reads from paired-end alignment. |
-| split_fq_tar | `.hisat3n_paired_end_split_fastq_files.tar.gz` | Array of tarred files containing FASTQ files with unmapped reads split by possible enzyme cut sites. |
-| merge_sorted_bam_tar | `.hisat3n_dna.split_reads.name_sort.bam.tar.gz` | Array of tarred files containing BAM files from single-end alignment. |
| name_sorted_bams | `.hisat3n_dna.all_reads.name_sort.tar.gz` | Array of tarred files containing name-sorted, merged BAM files. |
-| pos_sorted_bams | `.hisat3n_dna.all_reads.pos_sort.tar.gz` | Array of tarred files containing position-sorted, merged BAM files. |
-| remove_overlap_read_parts_bam_tar | `.remove_overlap_read_parts.tar.gz` | Array of tarred files containing BAM files from single-end alignment with overlapping reads removed. |
-| dedup_unique_bam_and_index_unique_bam_tar | `.dedup_unique_bam_and_index_unique_bam.tar.gz` | Array of tarred files containing deduplicated, position-sorted BAM files. |
-| unique_reads_cgn_extraction_allc | `.output_allc_tar.tar.gz` | Array of tarred files containing CGN context-specific ALLC files that can be used to generate an [MCDS file](https://github.com/lhqing/allcools_doc/blob/master/tech-background/file-formats.md#mcds-file). |
-| unique_reads_cgn_extraction_tbi | `.output_tbi_tar.tar.gz` | Array of tarred files containing ALLC index files. |
-| chromatin_contact_stats | `.chromatin_contact_stats.tar.gz` | Array of tarred files containing chromatin contact files. |
+| unique_reads_cgn_extraction_allc | `.allc.tsv.tar.gz` | Array of tarred files containing a list of methylation points. |
+| unique_reads_cgn_extraction_tbi | `.allc.tbi.tar.gz` | Array of tarred files containing ALLC index files. |
| reference_version | `.reference_version.txt` | Array of tarred files containing the genomic reference version used. |
+| all_reads_dedup_contacts | `.hisat3n_dna.all_reads.dedup_contacts.tar.gz` | Array of tarred TSV files containing deduplicated chromatin contacts. |
+| all_reads_3C_contacts | `.hisat3n_dna.all_reads.3C.contact.tar.gz` | Array of tarred TSV files containing chromatin contacts in Hi-C format. |
+| chromatin_contact_stats | `.chromatin_contact_stats.tar.gz` | Array of tarred files containing chromatin contact statistics. |
+| unique_reads_cgn_extraction_allc_extract | `.extract-allc.tar.gz` | Array of tarred files containing CGN context-specific ALLC files that can be used to generate an [MCDS file](https://github.com/lhqing/allcools_doc/blob/master/tech-background/file-formats.md#mcds-file). |
+| unique_reads_cgn_extraction_tbi_extract | `.extract-allc_tbi.tar.gz` | Array of tarred files containing ALLC index files. |
+
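+
+Because every output in this table is an array of tarred files, it can be handy to unpack one locally for inspection; a quick sketch with a placeholder file name:
+
+```python
+import tarfile
+
+# List and unpack one tarred pipeline output for inspection.
+with tarfile.open("cell.hisat3n_dna.all_reads.name_sort.tar.gz", "r:gz") as tar:
+    tar.list(verbose=False)       # print member names
+    tar.extractall("extracted/")  # unpack for downstream tools
+```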

## Versioning

All snM3C pipeline releases are documented in the [pipeline changelog](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/snM3C/snM3C.changelog.md).

+## Citing the snM3C Pipeline
+
+If you use the snM3C Pipeline in your research, please identify the pipeline in your methods section using the [snM3C SciCrunch resource identifier](https://scicrunch.org/resources/data/record/nlx_144509-1/SCR_025041/resolver?q=SCR_025041&l=SCR_025041&i=rrid:scr_025041).
+
+* Ex: *snM3C Pipeline (RRID:SCR_025041)*
+
+Please cite the following publication for the snM3C pipeline:
+
+Lee, DS., Luo, C., Zhou, J. et al. Simultaneous profiling of 3D genome structure and DNA methylation in single human cells. Nat Methods 16, 999–1006 (2019). https://doi.org/10.1038/s41592-019-0547-z
+
+Please also consider citing our preprint:
+
+Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1
+
## Consortia support

This pipeline is supported by the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and BICAN).

If your organization also uses this pipeline, we would like to list you! Please reach out to us by contacting the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org).

+## Acknowledgements
+We are immensely grateful to the members of the BRAIN Initiative ([BICAN](https://brainblog.nih.gov/brain-blog/brain-issues-suite-funding-opportunities-advance-brain-cell-atlases-through-centers) Sequencing Working Group) and [SCORCH](https://nida.nih.gov/about-nida/organization/divisions/division-neuroscience-behavior-dnb/basic-research-hiv-substance-use-disorder/scorch-program) for their invaluable and exceptional contributions to this pipeline. Our heartfelt appreciation goes to our collaborators and the developers of these tools, Hanqing Liu, Wei Tian, Wubin Ding, Huaming Chen, Chongyuan Luo, and the entire laboratory of Joseph Ecker.
+
## Feedback

For questions, suggestions, or feedback related to the snM3C pipeline, please contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org). Your feedback is valuable for improving the pipeline and addressing any issues that may arise during its usage.
\ No newline at end of file
diff --git a/website/docs/Pipelines/snM3C/_category_.json b/website/docs/Pipelines/snM3C/_category_.json
index 0aed70ddcf..e646087b04 100644
--- a/website/docs/Pipelines/snM3C/_category_.json
+++ b/website/docs/Pipelines/snM3C/_category_.json
@@ -1,4 +1,4 @@
 {
   "label": "Single Nucleus Methyl-Seq and Chromatin Capture",
-  "position": 17
+  "position": 18
 }
diff --git a/website/docs/get-started.md b/website/docs/get-started.md
index c42a4a06ae..0c6a77bc70 100755
--- a/website/docs/get-started.md
+++ b/website/docs/get-started.md
@@ -101,7 +101,7 @@ Our planned upcoming improvements include:
 
 ## Citing WARP
 When citing WARP, please use the following:
-Degatano K, Grant G, Khajouei F et al. Introducing WARP: A collection of cloud-optimized workflows for biological data processing and reproducible analysis [version 1; not peer reviewed]. F1000Research 2021, 10(ISCB Comm J):705 (slides) (doi: 10.7490/f1000research.1118678.1)
+Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1
 
 ## Acknowledgements