Merge pull request wtsi-npg#799 from wtsi-npg/devel
pull from devel to master to create release 67.0.0
jmtcsngr authored Sep 19, 2023
2 parents 2f3d471 + 79202f8 commit 9588ebb
Showing 8 changed files with 306 additions and 172 deletions.
1 change: 1 addition & 0 deletions .github/workflows/perlbrew.sha256
@@ -0,0 +1 @@
c3996e4fae37a0ae01839cdd73752fb7b17e81bac2a8b39712463a7d518c4945 perlbrew.sh
123 changes: 123 additions & 0 deletions .github/workflows/run-tests.yml
@@ -0,0 +1,123 @@
name: "Unit tests"

on: [push, pull_request]

jobs:
build:
runs-on: ubuntu-latest

defaults:
run:
shell: bash -l -e -o pipefail {0}

env:
PERL_CACHE: ~/perl5 # Perlbrew and CPAN modules installed here, cached
NPG_LIB: ~/perl5npg # NPG modules installed here, not cached
WSI_CONDA_CHANNEL: "https://dnap.cog.sanger.ac.uk/npg/conda/devel/generic"
CONDA_TEST_ENV: test-environment
WTSI_NPG_GITHUB_URL: https://github.com/wtsi-npg
WTSI_NPG_BUILD_BRANCH: ${{ github.base_ref || github.ref }}

strategy:
matrix:
perl: ["5.26.3", "5.34.1"]

steps:
- uses: actions/checkout@v3

- name: "Install OS dependencies"
run: |
sudo apt-get update
# https://github.com/actions/runner-images/issues/2139
sudo apt-get remove -y nginx libgd3
sudo apt-get install -y libgd-dev uuid-dev libgd-text-perl
- name: "Initialize Miniconda"
run: |
echo 'source $CONDA/etc/profile.d/conda.sh' >> "$HOME/.bash_profile"
- name: "Install Conda packages"
run: |
conda config --prepend pkgs_dirs ~/conda/pkgs
conda config --prepend envs_dirs ~/conda/envs
conda config --set auto_update_conda False
conda config --prepend channels "$WSI_CONDA_CHANNEL"
conda info
conda create -y -n "$CONDA_TEST_ENV"
conda install -y -n "$CONDA_TEST_ENV" baton
conda install -y -n "$CONDA_TEST_ENV" samtools
- name: "Cache Perl"
id: cache-perl
uses: actions/cache@v3
with:
path: ${{ env.PERL_CACHE }}
key: ${{ runner.os }}-${{ matrix.perl }}-perl

- name: "Install Perlbrew"
if: steps.cache-perl.outputs.cache-hit != 'true'
run: |
curl -sSL https://install.perlbrew.pl -o perlbrew.sh
sha256sum -c .github/workflows/perlbrew.sha256
export PERLBREW_ROOT=${{ env.PERL_CACHE }}
sh perlbrew.sh
source ${{ env.PERL_CACHE }}/etc/bashrc
perlbrew available
perlbrew install --notest perl-${{ matrix.perl }}
perlbrew use perl-${{ matrix.perl }}
perlbrew install-cpanm
- name: "Initialize Perlbrew"
run: |
echo "source ${{ env.PERL_CACHE }}/etc/bashrc" >> "$HOME/.bash_profile"
- name: "Install Perl dependencies"
run: |
cpanm --local-lib=${{ env.PERL_CACHE }} local::lib
eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib="$NPG_LIB")
eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib)
cpanm --quiet --notest Module::Build
cpanm --quiet --notest Alien::Tidyp
cpanm --quiet --notest Net::SSLeay
cpanm --quiet --notest https://github.com/chapmanb/vcftools-cpan/archive/v0.953.tar.gz
./scripts/install_wsi_dependencies.sh "$NPG_LIB" \
perl-dnap-utilities \
perl-irods-wrap \
ml_warehouse \
npg_tracking \
npg_seq_common \
npg_qc \
npg_irods
cpanm --installdeps --notest .
- name: "Log install failure"
if: ${{ failure() }}
run: |
find ~/.cpanm/work -cmin -1 -name '*.log' -exec tail -n20 {} \;
- name: "Archive CPAN logs on failure"
if: ${{ failure() }}
uses: actions/upload-artifact@v2
with:
name: cpan_log
path: ~/.cpanm/work/*/build.log
retention-days: 5

- name: "Run tests"
run: |
conda activate "$CONDA_TEST_ENV"
conda info --envs
eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib)
eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib="$NPG_LIB")
export TEST_AUTHOR=1
perl Build.PL
./Build test --verbose
./Build install
75 changes: 0 additions & 75 deletions .github/workflows/testing_and_building_repo.yml

This file was deleted.

9 changes: 9 additions & 0 deletions Changes
@@ -1,6 +1,15 @@
LIST OF CHANGES
---------------

release 67.0.0
- Turn off spatial filter QC check for NovaSeqX
- Switch to Perlbrew to obtain multiple Perl versions
- Remove npg_ml_warehouse dependency
- Enhance README with more context
- Improve Markdown format consistency
- Add images of DAGs, add links, fix a typo
- Add info on data intensive P4 based steps

release 66.0.0
- small tweak to seq_alignment so GbS samples with no study ref do not fail
- switch off spatial filter for NovaSeqX
154 changes: 118 additions & 36 deletions README.md
@@ -1,55 +1,137 @@
# NPG Pipelines for Processing Illumina Sequencing Data

This software provides the Sanger NPG team's automation for analysing and
internally archiving Illumina sequencing data on behalf of DNA Pipelines for
their customers.

There are two main pipelines:

* data product and QC metric creation: `central`
* internal archival of data products, metadata, QC metrics and logs:
`post_qc_review`

and the daemons which automatically start these pipelines.

Processing is performed as appropriate for the entire run, for each lane in the
sequencing flowcell, or each tagged library (within a pool on the flowcell).

## Batch Processing and Dependency Tracking with LSF or wr

With this system, all of a pipeline's jobs for its steps are submitted for
execution to the LSF, or wr, batch/job processing system as the pipeline is
initialised. As such, a _submitted_ pipeline does not have an orchestration
script or daemon running: managing the runtime dependencies of jobs within an
instance of a pipeline is delegated to the batch/job processing system.

How is this done? The job representing the start point of a graph is submitted
to LSF, or wr, in a suspended state and is resumed once all other jobs have
been submitted, thus ensuring that execution starts only if all steps were
successfully submitted to LSF, or wr. If an error occurs at any point during
job submission, all submitted jobs, apart from the start job, are killed.
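
As an illustration only (this is not the pipeline's actual submission code,
which is Perl), the hold-and-resume pattern can be sketched with plain LSF
commands; the job names and step scripts below are hypothetical:

```bash
#!/bin/bash
set -euo pipefail

# Submit the start job in a held (suspended) state; nothing runs yet.
# bsub prints e.g. "Job <12345> is submitted to default queue <normal>".
start_id=$(bsub -H -J start_job 'true' | grep -oE '[0-9]+' | head -n1)

# Submit the remaining steps, each depending on its predecessor(s).
step_a_id=$(bsub -J step_A -w "done(${start_id})" ./run_step_A.sh | grep -oE '[0-9]+' | head -n1)
bsub -J step_B -w "done(${step_a_id})" ./run_step_B.sh

# Only once every submission has succeeded is the start job resumed,
# releasing the whole graph. On a submission error the already-submitted
# jobs (apart from the start job) would instead be killed with bkill.
bresume "${start_id}"
```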

## Pipeline Creation

Steps of each of the pipelines and dependencies between the steps are defined
in JSON input files located in the data/config_files directory. The files
follow the [JSON Graph Format](https://github.com/jsongraph/json-graph-specification)
syntax. Individual pipeline steps are defined as graph nodes, dependencies
between them as directed graph edges. If step B should be executed after step A
finishes, step B is considered to be dependent on step A.

The graph represented by the input file should be a directed acyclic graph
(DAG). Each graph node should have an id, which should be unique, and a label,
which is the name of the pipeline step.
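
For illustration, a minimal two-step definition in this format might look like
the file written out below (the ids, labels and file name are invented for the
example and are not taken from data/config_files):

```bash
# Write a minimal JSON Graph Format (JGF) pipeline definition: two nodes
# (pipeline steps) and one directed edge meaning "step_two depends on step_one".
cat > example_pipeline.json <<'EOF'
{
  "graph": {
    "directed": true,
    "nodes": [
      { "id": "1", "label": "step_one" },
      { "id": "2", "label": "step_two" }
    ],
    "edges": [
      { "source": "1", "target": "2" }
    ]
  }
}
EOF
```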

Parallelisation of processing may be performed at different levels within the
DAG: some steps are appropriate for

* per run
* per lane
* per lane and tagged library, or per tagged library
* per tagged library

parallelisation.

#### Visualizing Input Graphs

JSON Graph Format (JGF) is relatively new, with little support for
visualization. Convert JGF to the
[GML (Graph Modeling Language)](http://www.fim.uni-passau.de/fileadmin/files/lehrstuhl/brandenburg/projekte/gml/gml-technical-report.pdf)
format using a simple script supplied with this package, `scripts/jgf2gml`.
Many graph visualization tools, for example
[Cytoscape](http://www.cytoscape.org/), support the GML format.
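
A sketch of the conversion step, assuming `scripts/jgf2gml` takes the JGF file
as its only argument and writes GML to standard output (check the script itself
for its actual interface):

```bash
# Convert the illustrative JGF definition above to GML for visualization.
# The exact jgf2gml invocation is an assumption, not taken from its docs.
scripts/jgf2gml example_pipeline.json > example_pipeline.gml
# example_pipeline.gml can then be opened in a GML-aware tool such as Cytoscape.
```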

## Per Sequencing-Run Pipelines

The processing is performed per sequencing run. Many different studies and
sequencing assays for different "customers" may be performed on a single run.
Unlike contemporary (2020s) shareable bioinformatics pipelines, the informatics
logic is tied closely to the business logic: e.g. which aligner is required
with which reference, and whether human read separation is required, is
determined per indexed library within a lane of sequencing and scheduled for
work in parallel.

The information required for the logic is obtained from the upstream "LIMS" via
an MLWH (Multi-LIMS warehouse) database and from the run folder output by the
sequencing instrument.

### Analysis Pipeline

Processes data coming from Illumina sequencing instruments. It is labeled the
"central" pipeline.

The input for an instance of the pipeline is the instrument output run folder
(BCL and associated files) and LIMS information which drives appropriate
processing.

The key data products are aligned CRAM files and indexes, or unaligned CRAM
files. However, per-study (a LIMS datum) pipeline configuration allows for the
creation of GATK gVCF files, or the running of an external tool/pipeline, e.g.
ncov2019-artic-nf.

!["central" pipeline](data/config_files/function_list_central.json.png)

Within this DAG there are two steps which are key to producing the main data products:

* `p4_stage1_analysis` processes data at the lane level within a flowcell/run.
  It includes conversion of instrument output (BCL files) to BAM format,
  demultiplexing of data within a lane to tagged libraries, alignment with any
  spiked phiX, (for some instrument types) detection of indel-inducing fluidics
  bubbles and marking of the affected reads with a fail bit, and (for some
  instrument types) detection and marking of sequencing adapter.
* `seq_alignment` processes data at the tagged library, or lane and tagged
  library, level. It includes alignment to the target genome (or not), a naive
  human read filtering capability, splitting of human target data by
  autosome/allosome, (for some instrument types) removal of marked adapter
  pre-alignment and pasting it back post-alignment (so there is no loss of
  instrument basecalls or quality data), duplicate marking, and creation of
  standard sequencing metrics files.

### Archival Pipeline

Archives sequencing data (CRAM files) and other related artifacts, e.g. index
files and QC metrics. It is labeled the "post_qc_review" pipeline.

!["post_qc_review" pipeline](data/config_files/function_list_post_qc_review.json.png)

### Pipeline Script Outputs

Log file - in the run folder (as in the current pipeline). Example:
`/nfs/sf55/IL_seq_data/outgoing/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log`

File with JSON serialization of definition objects - in the analysis
directory. Example:
`/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.json`

File with saved commands hashed by function name, LSF job id and array index -
in the analysis directory. Example:
`/path_to_runfolder/Data/Intensities/BAM_basecalls_20180321-075511/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.commands4jobs.json`

## Dependencies

This software relies heavily on the
[npg_tracking](https://github.com/wtsi-npg/npg_tracking) software to abstract
information from the MLWH and the instrument runfolder, and to coordinate the
state of the run.

This software integrates heavily with the
[npg_qc](https://github.com/wtsi-npg/npg_qc) system for calculating and
recording QC metrics, which are displayed internally for operational teams to
assess the sequencing and upstream processes.

For the data-processing-intensive steps, `p4_stage1_analysis` and
`seq_alignment`, the [p4](https://github.com/wtsi-npg/p4) software is used to
provide disk-I/O-minimised processing of many informatics tools in streaming
data-flow DAGs.

Also, the [npg_irods](https://github.com/wtsi-npg/npg_irods) system is essential
for the internal archival of data products.