From 4da333a87281d0148acc8d9a04e7a691caf139a3 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Sun, 20 Aug 2023 19:58:15 +0100 Subject: [PATCH 1/9] No spatial filter QC check for NovaSeqX --- lib/npg_pipeline/function/autoqc.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/npg_pipeline/function/autoqc.pm b/lib/npg_pipeline/function/autoqc.pm index 6a8c451a..1f51c0a7 100644 --- a/lib/npg_pipeline/function/autoqc.pm +++ b/lib/npg_pipeline/function/autoqc.pm @@ -291,7 +291,7 @@ sub _should_run { my $is_pool = $product->lims->is_pool; my $is_tag_zero = $product->is_tag_zero_product; - if($self->qc_to_run() eq 'spatial_filter' and $self->platform_NovaSeq) { + if($self->qc_to_run() eq 'spatial_filter' and ($self->platform_NovaSeq or $self->platform_NovaSeqX)) { return 0; } From 3bb46a37c6e79b3a812e6df7ed66a9e8446d81a8 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Mon, 21 Aug 2023 21:55:17 +0100 Subject: [PATCH 2/9] update Changes --- Changes | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Changes b/Changes index 8fc90506..c637fbab 100644 --- a/Changes +++ b/Changes @@ -1,6 +1,8 @@ LIST OF CHANGES --------------- + - turn off spatial filter QC check for NovaSeqX + release 66.0.0 - small tweak to seq_alignment so GbS samples with no study ref do not fail - switch off spatial filter for NovaSeqX From 31e034a065dfe34078ce5457416b92fd26c64f5f Mon Sep 17 00:00:00 2001 From: Keith James Date: Tue, 29 Aug 2023 11:28:18 +0100 Subject: [PATCH 3/9] Switch to Perlbrew to obtain multiple Perl versions Change some file names and labels to be consistent with other Perl CI --- .github/workflows/perlbrew.sha256 | 1 + .github/workflows/run-tests.yml | 124 ++++++++++++++++++ .../workflows/testing_and_building_repo.yml | 75 ----------- scripts/install_npg_perl_dependencies.sh | 60 --------- scripts/install_wsi_dependencies.sh | 54 ++++++++ 5 files changed, 179 insertions(+), 135 deletions(-) create mode 100644 .github/workflows/perlbrew.sha256 create mode 100644 .github/workflows/run-tests.yml delete mode 100644 .github/workflows/testing_and_building_repo.yml delete mode 100755 scripts/install_npg_perl_dependencies.sh create mode 100755 scripts/install_wsi_dependencies.sh diff --git a/.github/workflows/perlbrew.sha256 b/.github/workflows/perlbrew.sha256 new file mode 100644 index 00000000..d9992912 --- /dev/null +++ b/.github/workflows/perlbrew.sha256 @@ -0,0 +1 @@ +c3996e4fae37a0ae01839cdd73752fb7b17e81bac2a8b39712463a7d518c4945 perlbrew.sh diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml new file mode 100644 index 00000000..1a9d50c2 --- /dev/null +++ b/.github/workflows/run-tests.yml @@ -0,0 +1,124 @@ +name: "Unit tests" + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l -e -o pipefail {0} + + env: + PERL_CACHE: ~/perl5 # Perlbrew and CPAN modules installed here, cached + NPG_LIB: ~/perl5npg # NPG modules installed here, not cached + WSI_CONDA_CHANNEL: "https://dnap.cog.sanger.ac.uk/npg/conda/devel/generic" + CONDA_TEST_ENV: test-environment + WTSI_NPG_GITHUB_URL: https://github.com/wtsi-npg + WTSI_NPG_BUILD_BRANCH: ${{ github.base_ref || github.ref }} + + strategy: + matrix: + perl: ["5.26.3", "5.34.1"] + + steps: + - uses: actions/checkout@v3 + + - name: "Install OS dependencies" + run: | + sudo apt-get update + # https://github.com/actions/runner-images/issues/2139 + sudo apt-get remove -y nginx libgd3 + sudo apt-get install -y libgd-dev uuid-dev libgd-text-perl + + - name: "Initialize 
Miniconda" + run: | + echo 'source $CONDA/etc/profile.d/conda.sh' >> "$HOME/.bash_profile" + + - name: "Install Conda packages" + run: | + conda config --prepend pkgs_dirs ~/conda/pkgs + conda config --prepend envs_dirs ~/conda/envs + + conda config --set auto_update_conda False + conda config --prepend channels "$WSI_CONDA_CHANNEL" + conda info + + conda create -y -n "$CONDA_TEST_ENV" + conda install -y -n "$CONDA_TEST_ENV" baton + conda install -y -n "$CONDA_TEST_ENV" samtools + + - name: "Cache Perl" + id: cache-perl + uses: actions/cache@v3 + with: + path: ${{ env.PERL_CACHE }} + key: ${{ runner.os }}-${{ matrix.perl }}-perl + + - name: "Install Perlbrew" + if: steps.cache-perl.outputs.cache-hit != 'true' + run: | + curl -sSL https://install.perlbrew.pl -o perlbrew.sh + sha256sum -c .github/workflows/perlbrew.sha256 + export PERLBREW_ROOT=${{ env.PERL_CACHE }} + sh perlbrew.sh + + source ${{ env.PERL_CACHE }}/etc/bashrc + perlbrew available + perlbrew install --notest perl-${{ matrix.perl }} + perlbrew use perl-${{ matrix.perl }} + perlbrew install-cpanm + + - name: "Initialize Perlbrew" + run: | + echo "source ${{ env.PERL_CACHE }}/etc/bashrc" >> "$HOME/.bash_profile" + + - name: "Install Perl dependencies" + run: | + cpanm --local-lib=${{ env.PERL_CACHE }} local::lib + eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib="$NPG_LIB") + eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib) + + cpanm --quiet --notest Module::Build + cpanm --quiet --notest Alien::Tidyp + cpanm --quiet --notest Net::SSLeay + cpanm --quiet --notest https://github.com/chapmanb/vcftools-cpan/archive/v0.953.tar.gz + + ./scripts/install_wsi_dependencies.sh "$NPG_LIB" \ + perl-dnap-utilities \ + perl-irods-wrap \ + ml_warehouse \ + npg_ml_warehouse \ + npg_tracking \ + npg_seq_common \ + npg_qc \ + npg_irods + + cpanm --installdeps --notest . 
+ + - name: "Log install failure" + if: ${{ failure() }} + run: | + find ~/.cpanm/work -cmin -1 -name '*.log' -exec tail -n20 {} \; + + - name: "Archive CPAN logs on failure" + if: ${{ failure() }} + uses: actions/upload-artifact@v2 + with: + name: cpan_log + path: ~/.cpanm/work/*/build.log + retention-days: 5 + + - name: "Run tests" + run: | + conda activate "$CONDA_TEST_ENV" + conda info --envs + + eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib) + eval $(perl -I ${{ env.PERL_CACHE }}/lib/perl5/ -Mlocal::lib="$NPG_LIB") + + export TEST_AUTHOR=1 + perl Build.PL + ./Build test --verbose + ./Build install diff --git a/.github/workflows/testing_and_building_repo.yml b/.github/workflows/testing_and_building_repo.yml deleted file mode 100644 index 7e26496b..00000000 --- a/.github/workflows/testing_and_building_repo.yml +++ /dev/null @@ -1,75 +0,0 @@ -name: testing_and_building_repo -on: [push, pull_request] -env: - WTSI_NPG_BUILD_BRANCH: ${{ github.base_ref || github.ref }} -jobs: - build: - runs-on: ubuntu-latest - name: Distribution Perl - steps: - - uses: actions/checkout@v3 - - - name: Cache conda - id: minicondaCache - uses: actions/cache@v3 - with: - path: $HOME/miniconda - key: ${{ runner.os}}-build-miniconda - - - name: Cache cpanm external modules - id: cpanmCache - uses: actions/cache@v3 - with: - path: ~/perl5ext - key: ${{ runner.os}}-build-cpanm-external - - - name: Install libgd-dev and uuid-dev - run: | - sudo apt-get update - # https://github.com/actions/runner-images/issues/2139 - sudo apt-get remove -y nginx libgd3 - sudo apt-get install -y libgd-dev uuid-dev libgd-text-perl - - - name: Install baton and samtools - run: | - # Install baton from our conda channel - conda install --yes --channel ${WTSI_NPG_CONDA_REPO} --channel default --mkdir --prefix $HOME/miniconda/miniconda/baton baton; - - # Install samtools from our conda channel - # This is needed for our basic IRODS Perl wrapper to work - conda install --yes --channel ${WTSI_NPG_CONDA_REPO} --channel default --mkdir --prefix $HOME/miniconda/miniconda/samtools samtools - env: - WTSI_NPG_CONDA_REPO: https://dnap.cog.sanger.ac.uk/npg/conda/prod/generic - - - name: Install cpanm - run: | - wget -qO - https://cpanmin.us | /usr/bin/perl - --sudo App::cpanminus - - - name: Install NPG Perl dependencies, and their CPAN dependencies - run: | - echo "$HOME/miniconda/samtools/bin" >> $GITHUB_PATH - cpanm --local-lib=~/perl5ext local::lib && eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib) - ${GITHUB_WORKSPACE}/scripts/install_npg_perl_dependencies.sh $WTSI_NPG_GITHUB_URL $WTSI_NPG_BUILD_BRANCH - env: - WTSI_NPG_GITHUB_URL: https://github.com/wtsi-npg - - - name: Install cpanm dependencies - run: | - eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib=~/perl5npg) - eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib=~/perl5ext) - cpanm --installdeps . 
- - - name: Run Build.PL and ./Build - run: | - eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib=~/perl5ext) - eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib=~/perl5npg) - export TEST_AUTHOR=1 - perl Build.PL && ./Build test --verbose && ./Build install - - - name: Archive CPAN logs on failure - if: ${{ failure() }} - uses: actions/upload-artifact@v3 - with: - name: cpan_log - path: /home/runner/.cpanm/work/*/build.log - retention-days: 5 diff --git a/scripts/install_npg_perl_dependencies.sh b/scripts/install_npg_perl_dependencies.sh deleted file mode 100755 index ad20f28a..00000000 --- a/scripts/install_npg_perl_dependencies.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -set -e -x - -# iRODS test server is not set up, so tests that require it will -# be skipped - -#setting environment variables -WTSI_NPG_GITHUB_URL=$1 -WTSI_NPG_BUILD_BRANCH=$2 - -eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib=~/perl5ext) -cpanm --quiet --notest Alien::Tidyp # For npg_tracking -cpanm --quiet --notest Module::Build -cpanm --quiet --notest Net::SSLeay -cpanm --quiet --notest https://github.com/chapmanb/vcftools-cpan/archive/v0.953.tar.gz # for npg_qc - -# WTSI NPG Perl repo dependencies -repos="" -for repo in perl-dnap-utilities perl-irods-wrap ml_warehouse npg_ml_warehouse npg_tracking npg_seq_common npg_qc npg_irods; do - cd /tmp - # Always clone master when using depth 1 to get current tag - git clone --branch master --depth 1 ${WTSI_NPG_GITHUB_URL}/${repo}.git ${repo}.git - cd /tmp/${repo}.git - # Shift off master to appropriate branch (if possible) - git ls-remote --heads --exit-code origin ${WTSI_NPG_BUILD_BRANCH} && git pull origin ${WTSI_NPG_BUILD_BRANCH} && echo "Switched to branch ${WTSI_NPG_BUILD_BRANCH}" - repos=$repos" /tmp/${repo}.git" -done - -# Install CPAN dependencies. The src libs are on PERL5LIB because of -# circular dependencies. The blibs are on PERL5LIB because the package -# version, which cpanm requires, is inserted at build time. They must -# be before the libs for cpanm to pick them up in preference. - -for repo in $repos -do - export PERL5LIB=$repo:$repo/blib/lib:$PERL5LIB:$repo/lib -done - -for repo in $repos -do - cd $repo - cpanm --quiet --notest --installdeps . - perl Build.PL - ./Build -done - -# Finally, bring any common dependencies up to the latest version and -# install - -# to set liblocal for perl5_npg -eval $(perl -I ~/perl5ext/lib/perl5/ -Mlocal::lib=~/perl5npg) - -for repo in $repos -do - cd $repo - cpanm --quiet --notest --installdeps . - ./Build install -done -cd diff --git a/scripts/install_wsi_dependencies.sh b/scripts/install_wsi_dependencies.sh new file mode 100755 index 00000000..9d1e506f --- /dev/null +++ b/scripts/install_wsi_dependencies.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -e -u -x + +WSI_NPG_GITHUB_URL=${WSI_NPG_GITHUB_URL:=https://github.com/wtsi-npg} +WSI_NPG_BUILD_BRANCH=${WSI_NPG_BUILD_BRANCH:=devel} + +# The first argument is the install base for NPG modules, enabling them to be +# installed independently of CPAN dependencies. E.g. for cases where we want +# different caching behaviour. 
+NPG_ROOT="$1" +shift + +repos="" +for repo in "$@" ; do + cd /tmp + + # Clone deeper than depth 1 to get the tag even if something has been already + # committed over the tag + git clone --branch master --depth 3 "$WSI_NPG_GITHUB_URL/${repo}.git" "${repo}.git" + cd "/tmp/${repo}.git" + + # Shift off master to appropriate branch (if possible) + git ls-remote --heads --exit-code origin "$WSI_NPG_BUILD_BRANCH" && \ + git pull origin "$WSI_NPG_BUILD_BRANCH" && \ + echo "Switched to branch $WSI_NPG_BUILD_BRANCH" + repos="$repos /tmp/${repo}.git" +done + +# Install CPAN dependencies. The src libs are on PERL5LIB because of +# circular dependencies. The blibs are on PERL5LIB because the package +# version, which cpanm requires, is inserted at build time. They must +# be before the libs for cpanm to pick them up in preference. +for repo in $repos +do + export PERL5LIB="$PERL5LIB:$repo/blib/lib:$repo/lib" +done + +for repo in $repos +do + cd "$repo" + cpanm --quiet --notest --installdeps . + perl Build.PL + ./Build +done + +# Finally, bring any common dependencies up to the latest version and +# install +for repo in $repos +do + cd "$repo" + cpanm --quiet --notest --installdeps . + ./Build install --install-base "$NPG_ROOT" +done From 77b216c31c61f71d5743f6fc8d731a900b172729 Mon Sep 17 00:00:00 2001 From: Keith James Date: Tue, 29 Aug 2023 15:12:58 +0100 Subject: [PATCH 4/9] Remove npg_ml_warehouse dependency --- .github/workflows/run-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 1a9d50c2..e3394045 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -89,7 +89,6 @@ jobs: perl-dnap-utilities \ perl-irods-wrap \ ml_warehouse \ - npg_ml_warehouse \ npg_tracking \ npg_seq_common \ npg_qc \ From 40830a3d505a02f6ea1ad868f215a533fa56cbf0 Mon Sep 17 00:00:00 2001 From: David K Jackson Date: Mon, 11 Sep 2023 15:00:41 +0100 Subject: [PATCH 5/9] Enhance README with more context --- README.md | 79 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index aded10af..ac0e8517 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,35 @@ -## Pipelines for Processing Sequencing Data +# NPG Pipelines for Processing Illumina Sequencing Data -### Analysis Pipeline +This software provides the Sanger NPG team's automation for analysing and internally archiving Illumina sequencing on behalf of DNA Pipelines for their customers. -Processes data coming from Illumina sequencing instruments. -Input data - bcl files, output - CRAM files. In most cases CRAM files are aligned. +There are two main pipelines: -### Archival Pipeline +- data product and QC metric creation: "central" +- internal archival of data products, metadata, QC metrics and logs: "post_qc" + +and the daemons which automatically start these pipelines. -Archives sequencing data (CRAM files) and other related artefacts. +Processing is performed as a appropriate for the entire run, for each lane in the sequencing flowcell, or each tagged library (within a pool on the flowcell). -### Configuring Pipeline's Steps +## Batch Processing and Dependency Tracking with LSF or wr + +With this system, all of a pipeline's steps are submitted for execution to the +LSF, or wr, batch/job processing system as the pipeline is initialised. 
As such, a submitted pipeline does not have a +orchestration script of daemon running: managing the runtime dependencies of jobs within +an instance of a pipeline is delegated to the batch/job processing system. + +How is this done? The job representing the start point of a graph +is submitted to LSF, or wr, in a suspended state and is resumed once all other jobs +have been submitted thus ensuring that the execution starts only if all steps +are successfully submitted to LSF, or wr. If an error occurs at any point, all submitted +jobs, apart from the start job, are killed. + +## Pipeline Creation Steps of each of the pipelines and dependencies between the steps are defined in JSON input files located in data/config_files directory. The files follow [JSON Graph Format](https://github.com/jsongraph/json-graph-specification) -systax. Individual pipeline steps are defined as graph nodes, dependencies +syntax. Individual pipeline steps are defined as graph nodes, dependencies between them as directed graph edges. If step B should be executed after step A finishes, step B is is considered to be dependant on step A. @@ -22,34 +37,56 @@ The graph represented by the input file should be a directed acyclic graph (DAG). Each graph node should have an id, which should be unique, and a label, which is the name of the pipeline step. +Parallelisation of processing may be performed at different levels within the DAG: some steps are appropriate for + +- per run +- per lane +- per lane and tagged library, or per tagged library +- per tagged library + +parallelisation. + ### Visualizing Input Graphs JSON Graph Format (JGF) is relatively new, with little support for visualization. Convert JGF to GML [Graph Modeling Language](http://www.fim.uni-passau.de/fileadmin/files/lehrstuhl/brandenburg/projekte/gml/gml-technical-report.pdf) -format using a simple script supplied with this package, scripts/jgf2gml. +format using a simple script supplied with this package, `scripts/jgf2gml`. Many graph visualization tools, for example [Cytoscape](http://www.cytoscape.org/), support the GML format. +## Per Sequencing Run Pipelines + +The processing is performed per sequencing run. Many different studies and sequencing assays for different "customers" may be performed on a single run. Unlike contemporary (2020s) sharable bioinformatics pipelines, the logic for informatics is tied closely to the business logic e.g. what aligner is required with what reference, whether human read separation is required, is determined per indexed library within a lane of sequencing and scheduled for work in parallel. + +The information required for the logic is obtained from the upstream "LIMS" via a MLWH (Multi-LIMS warehouse) database and the run folder output by the sequencing instrument. + +### Analysis Pipeline + +Processes data coming from Illumina sequencing instruments. + +The input for an instance of the pipeline is the instrument output run folder (BCL and associated files) and LIMS information which drives appropriate processing. + +The key data products are aligned CRAM files and indexes, or unaligned CRAM files. However (per study) configuration allows for the creation of GATK gVCF files, or the running for external tool/pipeline e.g. ncov2012-artic-nf + +### Archival Pipeline + +Archives sequencing data (CRAM files) and other related artifacts e.g. index files. QC metrics. + ### Pipeline Script Outputs Log file - in the run folder (as in the current pipeline). 
-Example: /nfs/sf55/IL_seq_data/outgoing/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log +Example: `/nfs/sf55/IL_seq_data/outgoing/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log` File with JSON serialization of definition objects - in the analysis directory directory. -Example: /path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.json +Example: `/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.json` File with saved commands hashed by function name, LSF job id and array index - in the analysis directory. -Example: /path_to_runfolder/Data/Intensities/BAM_basecalls_20180321-075511/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.commands4jobs.json +Example: `/path_to_runfolder/Data/Intensities/BAM_basecalls_20180321-075511/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.commands4jobs.json` -### Batch Processing and Dependencies Tracking with LSF +## Dependencies -In this package the pipeline steps are submitted for execution to the -LSF batch processing system. The LSF job representing the start point of a graph -is submitted to LSF in a suspended state and is resumed once all other LSF jobs -have been submitted thus ensuring that the execution starts only if all steps -are successfully submitted to LSF. If an error occurs at any point, all submitted -jobs, apart from the start job, are killed. +This software relies heavily on the npg_tracking software to abstract information from the MLWH and instrument runfolder, and coordination of the state of the run. - - +This software integrates heavily with the npg_qc system for calculating and recording for internal display QC metrics for operational teams to assess the sequencing and upstream processes. +Also, the npg_irods system is essential for the internal archival of data products. From b97e27a1770c9fbc5e67ca0164d5be2850fec30c Mon Sep 17 00:00:00 2001 From: David K Jackson Date: Mon, 11 Sep 2023 15:01:17 +0100 Subject: [PATCH 6/9] Improve Markdown format consistency --- README.md | 114 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index ac0e8517..9dcb7188 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,118 @@ # NPG Pipelines for Processing Illumina Sequencing Data -This software provides the Sanger NPG team's automation for analysing and internally archiving Illumina sequencing on behalf of DNA Pipelines for their customers. +This software provides the Sanger NPG team's automation for analysing and +internally archiving Illumina sequencing on behalf of DNA Pipelines for their +customers. There are two main pipelines: -- data product and QC metric creation: "central" -- internal archival of data products, metadata, QC metrics and logs: "post_qc" +* data product and QC metric creation: "central" +* internal archival of data products, metadata, QC metrics and logs: "post_qc" and the daemons which automatically start these pipelines. -Processing is performed as a appropriate for the entire run, for each lane in the sequencing flowcell, or each tagged library (within a pool on the flowcell). +Processing is performed as a appropriate for the entire run, for each lane in +the sequencing flowcell, or each tagged library (within a pool on the flowcell). 
## Batch Processing and Dependency Tracking with LSF or wr With this system, all of a pipeline's steps are submitted for execution to the -LSF, or wr, batch/job processing system as the pipeline is initialised. As such, a submitted pipeline does not have a -orchestration script of daemon running: managing the runtime dependencies of jobs within -an instance of a pipeline is delegated to the batch/job processing system. +LSF, or wr, batch/job processing system as the pipeline is initialised. As such, +a submitted pipeline does not have a orchestration script of daemon running: +managing the runtime dependencies of jobs within an instance of a pipeline is +delegated to the batch/job processing system. -How is this done? The job representing the start point of a graph -is submitted to LSF, or wr, in a suspended state and is resumed once all other jobs -have been submitted thus ensuring that the execution starts only if all steps -are successfully submitted to LSF, or wr. If an error occurs at any point, all submitted -jobs, apart from the start job, are killed. +How is this done? The job representing the start point of a graph is submitted +to LSF, or wr, in a suspended state and is resumed once all other jobs have been +submitted thus ensuring that the execution starts only if all steps are +successfully submitted to LSF, or wr. If an error occurs at any point, all +submitted jobs, apart from the start job, are killed. ## Pipeline Creation -Steps of each of the pipelines and dependencies between the steps are -defined in JSON input files located in data/config_files directory. -The files follow [JSON Graph Format](https://github.com/jsongraph/json-graph-specification) +Steps of each of the pipelines and dependencies between the steps are defined in +JSON input files located in data/config_files directory. The files follow +[JSON Graph Format](https://github.com/jsongraph/json-graph-specification) syntax. Individual pipeline steps are defined as graph nodes, dependencies -between them as directed graph edges. If step B should be executed -after step A finishes, step B is is considered to be dependant on step A. +between them as directed graph edges. If step B should be executed after step A +finishes, step B is is considered to be dependant on step A. -The graph represented by the input file should be a directed acyclic -graph (DAG). Each graph node should have an id, which should be unique, -and a label, which is the name of the pipeline step. +The graph represented by the input file should be a directed acyclic graph +(DAG). Each graph node should have an id, which should be unique, and a label, +which is the name of the pipeline step. -Parallelisation of processing may be performed at different levels within the DAG: some steps are appropriate for +Parallelisation of processing may be performed at different levels within the +DAG: some steps are appropriate for -- per run -- per lane -- per lane and tagged library, or per tagged library -- per tagged library +* per run +* per lane +* per lane and tagged library, or per tagged library +* per tagged library parallelisation. ### Visualizing Input Graphs -JSON Graph Format (JGF) is relatively new, with little support for visualization. -Convert JGF to GML [Graph Modeling Language](http://www.fim.uni-passau.de/fileadmin/files/lehrstuhl/brandenburg/projekte/gml/gml-technical-report.pdf) -format using a simple script supplied with this package, `scripts/jgf2gml`. 
-Many graph visualization tools, for example [Cytoscape](http://www.cytoscape.org/), -support the GML format. +JSON Graph Format (JGF) is relatively new, with little support for +visualization. Convert JGF to GML +[Graph Modeling Language](http://www.fim.uni-passau.de/fileadmin/files/lehrstuhl/brandenburg/projekte/gml/gml-technical-report.pdf) +format using a simple script supplied with this package, `scripts/jgf2gml` . +Many graph visualization tools, for example +[Cytoscape](http://www.cytoscape.org/), support the GML format. ## Per Sequencing Run Pipelines -The processing is performed per sequencing run. Many different studies and sequencing assays for different "customers" may be performed on a single run. Unlike contemporary (2020s) sharable bioinformatics pipelines, the logic for informatics is tied closely to the business logic e.g. what aligner is required with what reference, whether human read separation is required, is determined per indexed library within a lane of sequencing and scheduled for work in parallel. +The processing is performed per sequencing run. Many different studies and +sequencing assays for different "customers" may be performed on a single run. +Unlike contemporary (2020s) sharable bioinformatics pipelines, the logic for +informatics is tied closely to the business logic e.g. what aligner is required +with what reference, whether human read separation is required, is determined +per indexed library within a lane of sequencing and scheduled for work in +parallel. -The information required for the logic is obtained from the upstream "LIMS" via a MLWH (Multi-LIMS warehouse) database and the run folder output by the sequencing instrument. +The information required for the logic is obtained from the upstream "LIMS" via +a MLWH (Multi-LIMS warehouse) database and the run folder output by the +sequencing instrument. ### Analysis Pipeline Processes data coming from Illumina sequencing instruments. -The input for an instance of the pipeline is the instrument output run folder (BCL and associated files) and LIMS information which drives appropriate processing. +The input for an instance of the pipeline is the instrument output run folder +(BCL and associated files) and LIMS information which drives appropriate +processing. -The key data products are aligned CRAM files and indexes, or unaligned CRAM files. However (per study) configuration allows for the creation of GATK gVCF files, or the running for external tool/pipeline e.g. ncov2012-artic-nf +The key data products are aligned CRAM files and indexes, or unaligned CRAM +files. However (per study) configuration allows for the creation of GATK gVCF +files, or the running for external tool/pipeline e.g. ncov2012-artic-nf ### Archival Pipeline -Archives sequencing data (CRAM files) and other related artifacts e.g. index files. QC metrics. +Archives sequencing data (CRAM files) and other related artifacts e.g. index +files. QC metrics. ### Pipeline Script Outputs -Log file - in the run folder (as in the current pipeline). -Example: `/nfs/sf55/IL_seq_data/outgoing/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log` +Log file - in the run folder (as in the current pipeline). Example: +`/nfs/sf55/IL_seq_data/outgoing/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log` -File with JSON serialization of definition objects - in the analysis directory directory. 
-Example: `/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.json` +File with JSON serialization of definition objects - in the analysis directory +directory. Example: +`/path_to_runfolder/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.json` -File with saved commands hashed by function name, LSF job id and array index - in the analysis directory. -Example: `/path_to_runfolder/Data/Intensities/BAM_basecalls_20180321-075511/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.commands4jobs.json` +File with saved commands hashed by function name, LSF job id and array index - +in the analysis directory. Example: +`/path_to_runfolder/Data/Intensities/BAM_basecalls_20180321-075511/bin_npg_pipeline_central_25438_20180321-080455-2214166102.log.commands4jobs.json` ## Dependencies -This software relies heavily on the npg_tracking software to abstract information from the MLWH and instrument runfolder, and coordination of the state of the run. +This software relies heavily on the npg_tracking software to abstract +information from the MLWH and instrument runfolder, and coordination of the +state of the run. -This software integrates heavily with the npg_qc system for calculating and recording for internal display QC metrics for operational teams to assess the sequencing and upstream processes. +This software integrates heavily with the npg_qc system for calculating and +recording for internal display QC metrics for operational teams to assess the +sequencing and upstream processes. -Also, the npg_irods system is essential for the internal archival of data products. +Also, the npg_irods system is essential for the internal archival of data +products. From 7f7e22cbee17c2840ed9c2007642c33b0b0155bd Mon Sep 17 00:00:00 2001 From: David K Jackson Date: Mon, 11 Sep 2023 15:15:53 +0100 Subject: [PATCH 7/9] Add images of DAGs, add links, fix a typo --- README.md | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 9dcb7188..c18e3583 100644 --- a/README.md +++ b/README.md @@ -6,27 +6,28 @@ customers. There are two main pipelines: -* data product and QC metric creation: "central" -* internal archival of data products, metadata, QC metrics and logs: "post_qc" +* data product and QC metric creation: `central` +* internal archival of data products, metadata, QC metrics and logs: + `post_qc_review` and the daemons which automatically start these pipelines. -Processing is performed as a appropriate for the entire run, for each lane in -the sequencing flowcell, or each tagged library (within a pool on the flowcell). +Processing is performed as appropriate for the entire run, for each lane in the +sequencing flowcell, or each tagged library (within a pool on the flowcell). ## Batch Processing and Dependency Tracking with LSF or wr -With this system, all of a pipeline's steps are submitted for execution to the -LSF, or wr, batch/job processing system as the pipeline is initialised. As such, -a submitted pipeline does not have a orchestration script of daemon running: -managing the runtime dependencies of jobs within an instance of a pipeline is -delegated to the batch/job processing system. +With this system, all of a pipeline's jobs for its steps are submitted for +execution to the LSF, or wr, batch/job processing system as the pipeline is +initialised. 
As such, a _submitted_ pipeline does not have an orchestration +script or daemon running: managing the runtime dependencies of jobs within an +instance of a pipeline is delegated to the batch/job processing system. How is this done? The job representing the start point of a graph is submitted to LSF, or wr, in a suspended state and is resumed once all other jobs have been submitted thus ensuring that the execution starts only if all steps are -successfully submitted to LSF, or wr. If an error occurs at any point, all -submitted jobs, apart from the start job, are killed. +successfully submitted to LSF, or wr. If an error occurs at any point during job +submissions, all submitted jobs, apart from the start job, are killed. ## Pipeline Creation @@ -51,7 +52,7 @@ DAG: some steps are appropriate for parallelisation. -### Visualizing Input Graphs +#### Visualizing Input Graphs JSON Graph Format (JGF) is relatively new, with little support for visualization. Convert JGF to GML @@ -60,7 +61,7 @@ format using a simple script supplied with this package, `scripts/jgf2gml` . Many graph visualization tools, for example [Cytoscape](http://www.cytoscape.org/), support the GML format. -## Per Sequencing Run Pipelines +## Per Sequencing-Run Pipelines The processing is performed per sequencing run. Many different studies and sequencing assays for different "customers" may be performed on a single run. @@ -76,20 +77,26 @@ sequencing instrument. ### Analysis Pipeline -Processes data coming from Illumina sequencing instruments. +Processes data coming from Illumina sequencing instruments. It is labeled the +"central" pipeline. The input for an instance of the pipeline is the instrument output run folder (BCL and associated files) and LIMS information which drives appropriate processing. The key data products are aligned CRAM files and indexes, or unaligned CRAM -files. However (per study) configuration allows for the creation of GATK gVCF -files, or the running for external tool/pipeline e.g. ncov2012-artic-nf +files. However per study (a LIMS datum) pipeline configuration allows for the +creation of GATK gVCF files, or the running for external tool/pipeline e.g. +ncov2012-artic-nf + +!["central" pipeline](data/config_files/function_list_central.json.png) ### Archival Pipeline Archives sequencing data (CRAM files) and other related artifacts e.g. index -files. QC metrics. +files. QC metrics. It is labeled the "post_qc_review" pipeline. + +!["post_qc_review" pipeline](data/config_files/function_list_post_qc_review.json.png) ### Pipeline Script Outputs @@ -106,13 +113,15 @@ in the analysis directory. Example: ## Dependencies -This software relies heavily on the npg_tracking software to abstract +This software relies heavily on the +[npg_tracking](https://github.com/wtsi-npg/npg_tracking) software to abstract information from the MLWH and instrument runfolder, and coordination of the state of the run. -This software integrates heavily with the npg_qc system for calculating and +This software integrates heavily with the +[npg_qc](https://github.com/wtsi-npg/npg_qc) system for calculating and recording for internal display QC metrics for operational teams to assess the sequencing and upstream processes. -Also, the npg_irods system is essential for the internal archival of data -products. +Also, the [npg_irods](https://github.com/wtsi-npg/npg_irods) system is essential +for the internal archival of data products. 
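As an illustration of the pipeline definition format described in the README changes above — unique node ids, labels naming pipeline steps, and directed edges expressing dependencies — a minimal graph in JSON Graph Format style could look like the sketch below. This example is hypothetical and is not copied from the repository's `data/config_files`; the step names are invented.

```json
{
  "graph": {
    "directed": true,
    "nodes": [
      { "id": "step_A", "label": "step_A" },
      { "id": "step_B", "label": "step_B" }
    ],
    "edges": [
      { "source": "step_A", "target": "step_B" }
    ]
  }
}
```

Here `step_B` depends on `step_A`, so the batch/job processing system will only start `step_B` once `step_A` has finished.
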
From 0383a991075509f44a25d777963bf405e79a27fc Mon Sep 17 00:00:00 2001 From: David K Jackson Date: Tue, 12 Sep 2023 09:12:25 +0100 Subject: [PATCH 8/9] Add info on data intensive P4 based steps --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c18e3583..bb29f5db 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ JSON input files located in data/config_files directory. The files follow [JSON Graph Format](https://github.com/jsongraph/json-graph-specification) syntax. Individual pipeline steps are defined as graph nodes, dependencies between them as directed graph edges. If step B should be executed after step A -finishes, step B is is considered to be dependant on step A. +finishes, step B is considered to be dependant on step A. The graph represented by the input file should be a directed acyclic graph (DAG). Each graph node should have an id, which should be unique, and a label, @@ -91,6 +91,11 @@ ncov2012-artic-nf !["central" pipeline](data/config_files/function_list_central.json.png) +Within this DAG there are two step which are key in producing the main data products: + +* `p4_stage1_analysis` processes data at the lane level within a flowcell/run: includes conversion of instrument output (BCL files) to BAM format, demultiplexing of data within a lane to tagged libraries, alignment with any spiked phiX, (for some instrument types) detection of indel inducing fluidics bubbles and marking reads with fail bit, and (for some instrument types) detection and marking of sequencing adapter. +* `seq_alignment` processes data at tagged library, or lane and tagged library, level: includes alignment to the target genome (or not), a naive human read filtering capability, splitting of human target data by autosome/allosome capability, (for some instrument types) removal of marked adapter pre-alignment and pasting post-alignment (so there is no loss of instrument basecalls or quality data), duplicate marking, and creation of standard sequencing metrics files. + ### Archival Pipeline Archives sequencing data (CRAM files) and other related artifacts e.g. index @@ -123,5 +128,10 @@ This software integrates heavily with the recording for internal display QC metrics for operational teams to assess the sequencing and upstream processes. +For the data processing intensive steps, `p4_stage1_analysis` and +`seq_alignment`, the [p4](https://github.com/wtsi-npg/p4) software is used to +provide disk IO minimised processing of many informatics tools in streaming data +flow DAGs. + Also, the [npg_irods](https://github.com/wtsi-npg/npg_irods) system is essential for the internal archival of data products. From f0d0d46c54b300b35dad2549bf1d589225e00f6f Mon Sep 17 00:00:00 2001 From: jmtcsngr Date: Tue, 19 Sep 2023 09:46:44 +0100 Subject: [PATCH 9/9] prep release 67.0.0 --- Changes | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Changes b/Changes index c637fbab..12485fd4 100644 --- a/Changes +++ b/Changes @@ -1,7 +1,14 @@ LIST OF CHANGES --------------- - - turn off spatial filter QC check for NovaSeqX +release 67.0.0 + - Turn off spatial filter QC check for NovaSeqX + - Switch to Perlbrew to obtain multiple Perl versions + - Remove npg_ml_warehouse dependency + - Enhance README with more context + - Improve Markdown format consistency + - Add images of DAGs, add links, fix a typo + - Add info on data intensive P4 based steps release 66.0.0 - small tweak to seq_alignment so GbS samples with no study ref do not fail
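
To make the suspended start-job mechanism described in the README concrete, the pattern can be sketched with plain LSF commands. This is an illustration only, under the assumption of a working LSF cluster: the job names, step scripts and id-parsing helper are invented and are not the pipeline's actual submission code (wr is handled analogously via its own dependency mechanism).

```bash
#!/bin/bash
# Sketch of the suspended start-job pattern (hypothetical job names and scripts).
set -e -u -o pipefail

extract_job_id() {
  # bsub prints e.g. 'Job <12345> is submitted to default queue <normal>.'
  sed -E 's/^Job <([0-9]+)>.*/\1/'
}

# 1. Submit the start job held in a suspended (PSUSP) state with -H.
start_id=$(bsub -H -J pipeline_start -o pipeline_start.%J.out true | extract_job_id)

# 2. Submit every other step with a -w dependency expression on its predecessors.
step_a_id=$(bsub -J step_A -w "done(${start_id})" -o step_A.%J.out ./step_A.sh | extract_job_id)
bsub -J step_B -w "done(${step_a_id})" -o step_B.%J.out ./step_B.sh

# 3. Only once every submission has succeeded, release the start job so the
#    whole graph can begin; on any submission error the pipeline would instead
#    bkill the jobs submitted so far, leaving nothing running.
bresume "${start_id}"
```

The same pattern gives the property the README describes: execution starts only if every step was successfully submitted, because nothing downstream can run while the start job remains suspended.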