diff --git a/README b/README index 50be57fd3..ddae98791 100644 --- a/README +++ b/README @@ -7,6 +7,12 @@ genotyping microarrays, supplemented with Sequenom genotyping. Change log is located in: doc/CHANGELOG.md +To install, set the $INSTALL_ROOT environment variable to the path of the +desired installation directory, and run src/bash/install.sh. This installs +the Perl and Ruby components of the pipeline. It recursively installs +non-core Perl dependencies from CPAN, and requires approximately 120 MB of +disk space. + The gender check used as a QC metric is also available as a standalone application; see below. @@ -22,9 +28,13 @@ Pipeline: Bcftools https://github.com/samtools/bcftools +Install script: + cpanm >= 1.7042 http://search.cpan.org/~miyagawa/Menlo-1.9003/script/cpanm-menlo + + QC scripts: - Simtools >= 1.5 https://github.com/wtsi-npg/simtools + Simtools >= 2.2 https://github.com/wtsi-npg/simtools R >= 2.11.1 R mixtools library http://cran.r-project.org/web/packages/mixtools/index.html diff --git a/doc/CHANGELOG.md b/doc/CHANGELOG.md index c32663f5b..9258db98f 100644 --- a/doc/CHANGELOG.md +++ b/doc/CHANGELOG.md @@ -12,6 +12,32 @@ Added: and plots +Release 1.13.0: 2016-06-20 +-------------------------- + +Added: +- install.sh script to install pipeline and its Perl dependencies +- Documentation for Bayesian identity check + +Changed: +- Modified ready_workflow.pl to better align with user SOP +- Use try/catch to handle unexpected errors in retrieving QC plex results +from iRODS +- Updated reference genome for Sequenom iRODS query +- Update perl-irods-wrap dependency to 2.4.0; removes unhelpful warning +messages to STDERR. This in turn requires baton version >= 0.16.4. 
+ +Removed: +- Script publish_infinium_file_list.pl; superseded by other publish scripts + + +Release 1.12.1: 2016-05-13 +-------------------------- + +Fixed: +- Support repeat scans from Infinium database + + Release 1.12.0: 2016-04-07 -------------------------- diff --git a/doc/bayesian_identity/.gitignore b/doc/bayesian_identity/.gitignore new file mode 100644 index 000000000..51e0bdd41 --- /dev/null +++ b/doc/bayesian_identity/.gitignore @@ -0,0 +1,7 @@ +# patterns to ignore for git updates +\.* +*~ +*.aux +*.dvi +*.ps +*.log diff --git a/doc/bayesian_identity/bayesian_identity.pdf b/doc/bayesian_identity/bayesian_identity.pdf new file mode 100644 index 000000000..cd81d877f Binary files /dev/null and b/doc/bayesian_identity/bayesian_identity.pdf differ diff --git a/doc/bayesian_identity/bayesian_identity.tex b/doc/bayesian_identity/bayesian_identity.tex new file mode 100644 index 000000000..5b0746e06 --- /dev/null +++ b/doc/bayesian_identity/bayesian_identity.tex @@ -0,0 +1,188 @@ +\documentclass{article} +\usepackage{graphicx} +\usepackage{mathtools} +\DeclareGraphicsExtensions{.pdf,.png,.jpg} + +\begin{document} + +\title{Bayesian Identity Metric for Genotyping} +\author{Iain Bancarz, ib5@sanger.ac.uk } +\date{May 2016} +\maketitle + + +\section{Introduction} + +This document describes the theoretical background to the Bayesian identity check, which is a QC metric introduced in pipeline release 1.11.5. + +An identity check compares calls on a set of QC plex SNPs, between Infinium and another genotyping platform, such as Sequenom and/or Fluidigm. We observe \emph{concordance} between Infinium and the alternate platform on the QC plex, and use it to evaluate \emph{identity}. If concordance is too low, we are not confident that the production and QC calls derive from the same sample; this may indicate a sample swap or poor data quality. + +Key questions include: + +\begin{itemize} +\item How do we handle no-calls? 
How many non-null calls are required to evaluate identity? +\item What is an appropriate concordance threshold for sample identity? +\item How do we handle multiple QC plex runs, potentially on different platforms? +\end{itemize} + +Previous versions of the identity check set an \textit{ad hoc} threshold on concordance, which was used to mark samples as passing or failing QC. The Bayesian approach seeks to place the identity check on a sound probabilistic footing, in which the underlying assumptions are made explicit. + +\section{Bayesian Inference} + +Bayesian inference is a widely used statistical method in which we specify \emph{prior} assumptions, and update the probability of an event based on observed data, finding what is known as the \emph{posterior} probability. + +This is done using \emph{Bayes' rule}: + +\begin{displaymath} +\Pr(H|E) = \frac{\Pr(E|H)\Pr(H)}{\Pr(E)} +\end{displaymath} + +where: + +\begin{itemize} +\item $H$ is a \emph{hypothesis}; ``samples are identical'' or ``samples are not identical'' +\item $E$ is \emph{evidence} which has been observed; in this case, the production and QC calls +\item $\Pr(H|E)$ is probability of hypothesis given the evidence; this is the \emph{posterior probability} which interests us. +\item $\Pr(H)$ is probability of the hypothesis when evidence is not known; this is the \emph{prior probability} of the hypothesis. +\item $\Pr(E|H)$ is known as the \emph{likelihood}. This is the probability of evidence given the hypothesis; if the samples are identical, what is the probability of observing these calls? +\item $\Pr(E)$ is prior probability of the evidence, independent of the hypothesis. Serves as a normalising constant. 
+\end{itemize} + + +\section{Framework for sample identity} + +We define the terms as follows: + + +\subsection*{Hypotheses} + +\begin{itemize} +\item $H_1$: Samples are identical +\item $H_2$: Samples are not identical +\end{itemize} + + +\subsection*{Evidence} + +\begin{itemize} +\item Production calls: $p_1, p_2, \dots , p_m$ where $p_i$ is the production call on the $i$th of $m$ SNPs in the QC plex, on the Infinium platform. +\item QC calls: $q_{11}, q_{12}, \dots , q_{mn},$ where $q_{ij}$ is the call on the $i$th SNP in the $j$th of $n$ QC runs, using Fluidigm, Sequenom, or any other platform. +\end{itemize} + +\subsection*{Likelihood} + +\subsubsection*{Defining events} + +Suppose the evidence $E$ is made up of multiple, independent events $E_i$. In that case: + +\begin{displaymath} +\Pr(E|H) = \prod_{i} \Pr(E_{i}|H) +\end{displaymath} + +Let $E_i$ consist of a production call and $n$ QC calls on the $i$th SNP: + +\begin{displaymath} +E_i = \{ p_i, q_{i1}, q_{i2}, \dots , q_{in} \} +\end{displaymath} + +where as before, $p_i$ is the production call on Infinium, and $q_{ij}$ are the QC calls on Fluidigm, Sequenom, or both. Each QC call $q_{ij}$ is either equivalent to the production call $p_i$, or not equivalent. Our model will use the probability of equivalent calls. + +\subsubsection*{Binomial distribution} + +Let us consider whether each of the QC calls $q_{ij}$ is equivalent to the production call $p_i$. We have $n_i$ calls on the $i$th SNP, excluding no-calls so that $n_i \le n$. Suppose that out of these $n_i$ calls, we have $k_i$ matches. Let $u_{i}$ be the probability that production and QC calls are identical on the $i$th SNP, given that the samples are equivalent; in other words, given $H_{1}$. Let $v_{i}$ be the probability of identical calls, given that the samples are not equivalent, ie.\ $H_{2}$ holds. 
+
+So, we are interested in the likelihood of observing $k_i$ equivalent calls out of $n_i$ trials, with probability of equivalence $u_{i}$ or $v_{i}$. This is simply the binomial distribution. Omitting the subscripts for $i$ to simplify notation, the binomial distribution is:
+
+\begin{displaymath}
+\Pr(k) = \binom{n}{k} u^{k} (1 - u)^{(n - k)}
+\end{displaymath}
+
+and similarly for $v$, where:
+
+\begin{displaymath}
+\binom{n}{k} = \frac{n!}{k!(n-k)!}
+\end{displaymath}
+
+\subsubsection*{Probability of equivalence}
+
+Now, for our two hypotheses $H_1$ and $H_2$, we have probabilities $u$ and $v$ as follows:
+
+\begin{itemize}
+\item $u_{i}$ is the probability of equivalent calls, given that the samples are identical. If the samples are identical, any mismatches must be the result of a genotyping error. We then have $u_{i} = 1 - r$, where $r$ is the expected error rate.
+\item $v_{i}$ is the probability of equivalent calls, given that the samples are \emph{not} identical. So we have $v_{i} = \hat{v}_{i} + \theta$, where $\hat{v}_{i}$ is the ``true'' probability of equivalent calls on non-identical samples for the $i$th SNP, and $\theta$ is a correction factor to account for experimental error.
+\end{itemize}
+
+In general, $v_i$ will depend on the expected degree of relatedness between samples, as well as the heterozygosity and minor allele frequency (MAF) of the $i$th SNP.
+
+\subsection*{Normalising constant}
+
+We can now find the normalising constant $\Pr(E)$ as follows:
+
+\begin{displaymath}
+\Pr(E) = \Pr(E|H_{1})\Pr(H_{1}) + \Pr(E|H_{2})\Pr(H_{2})
+\end{displaymath}
+
+\subsection*{Computing the identity metric}
+
+Our desired metric is the probability that the production and QC samples are identical. This happens exactly when hypothesis $H_1$ holds. 
We can compute the probability of $H_1$ given the evidence $E$ as follows: + +\begin{displaymath} +\Pr(H_1|E) = \frac{\Pr(E|H_1)\Pr(H_1)}{\Pr(E)} += \frac{\Pr(E|H_1)\Pr(H_1)}{\Pr(E|H_{1})\Pr(H_{1}) + \Pr(E|H_{2})\Pr(H_{2})} +\end{displaymath} + +where: + +\begin{displaymath} +\Pr(E|H_1) = \prod_{i} \Pr(E_{i}|H_1) += \prod_{i} b(k_i; n_i,u_i) +\end{displaymath} + +where $b(k_i; n_i, u_i)$ is the binomial distribution for $k_i$ successes in $n_i$ trials with probability $u_i$, and similarly for $H_2$ and $v_i$. + +\section{Required parameters} + +\subsection*{Model inputs} + +In order to compute the identity metric as above, the following parameters must be supplied or estimated from data: + +\begin{itemize} +\item Prior probability of sample mismatch, $\Pr(H_2)$. We also have $\Pr(H_1) = 1 - \Pr(H_2)$. +\item Error rate of genotype calls, $r$. Probability of equivalent calls on identical samples, $u = 1 - r$. +\item Probability of identical calls on non-identical samples for the $i$th SNP, $v_i$. +\end{itemize} + +The attached graphs simulate the effects of varying the above parameters while keeping the others at constant default values. The defaults used were: + +\begin{itemize} +\item $\Pr(H_2) = 0.01$ +\item $r = 0.01$ +\item $v_i = 0.40625$ for all SNPs +\end{itemize} + +The default $v_i$ assumes heterozygosity 0.5 and minor allele frequency 0.25. Unless otherwise stated, the number of SNPs in the QC plex was 24. + +\subsection*{Pass/fail threshold} + +To apply the metric in practice, we need a threshold on the posterior probability $\Pr(H_1|E)$. Samples pass if they are above the threshold, and fail otherwise. + +Here are two plausible ways of choosing a threshold: + +\begin{itemize} +\item \textbf{Optimistic:} Require convincing evidence that a sample swap \emph{has} occurred. Set threshold to a ``significant'' probability of non-identical samples, for example 0.95. +\item \textbf{Pessimistic:} Require convincing evidence that a swap \emph{has not} occurred. 
Specifically, insist that posterior probability of identity is higher than prior probability; that is, the QC plex calls have made us \emph{more certain} that the samples are identical. Set threshold to $1 - \Pr(H_2)$, where as above $H_2$ is the sample mismatch hypothesis. With default parameters, this gives us a threshold of 0.99.
+\end{itemize}
+
+In the pipeline implementation of the Bayesian identity check, the second, ``pessimistic'' scenario has been adopted.
+
+\section{Graphs for simulated data}
+
+\includegraphics[scale=0.5]{identity_by_plex_size}
+
+\includegraphics[scale=0.5]{identity_by_smp}
+
+\includegraphics[scale=0.5]{identity_by_xer}
+
+\includegraphics[scale=0.5]{identity_by_eq_prob}
+
+\end{document}
diff --git a/doc/bayesian_identity/identity_by_eq_prob.png b/doc/bayesian_identity/identity_by_eq_prob.png
new file mode 100644
index 000000000..abeaf01a7
Binary files /dev/null and b/doc/bayesian_identity/identity_by_eq_prob.png differ
diff --git a/doc/bayesian_identity/identity_by_plex_size.png b/doc/bayesian_identity/identity_by_plex_size.png
new file mode 100644
index 000000000..e2820b9ce
Binary files /dev/null and b/doc/bayesian_identity/identity_by_plex_size.png differ
diff --git a/doc/bayesian_identity/identity_by_smp.png b/doc/bayesian_identity/identity_by_smp.png
new file mode 100644
index 000000000..2867f0335
Binary files /dev/null and b/doc/bayesian_identity/identity_by_smp.png differ
diff --git a/doc/bayesian_identity/identity_by_xer.png b/doc/bayesian_identity/identity_by_xer.png
new file mode 100644
index 000000000..b314dab5b
Binary files /dev/null and b/doc/bayesian_identity/identity_by_xer.png differ
diff --git a/src/bash/install.sh b/src/bash/install.sh
new file mode 100755
index 000000000..9d7a05d35
--- /dev/null
+++ b/src/bash/install.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+# Author: Iain Bancarz , June 2016
+
+# script to install Perl dependencies of the genotyping pipeline using cpanm
+# also installs Perl and Ruby 
components of the pipeline +# does *not* install baton and other non-Perl dependencies; these are specified in the genotyping modulefile + +# similar in purpose to npg_irods/scripts/travis_install.sh + +if [ -z $INSTALL_ROOT ]; then + echo "INSTALL_ROOT environment variable must specify the path to a directory for installation; exiting." >&2 + exit 1 +elif [ ! -e $INSTALL_ROOT ]; then + echo "INSTALL_ROOT environment variable path $INSTALL_ROOT does not exist; exiting." >&2 + exit 1 +elif [ ! -d $INSTALL_ROOT ]; then + echo "INSTALL_ROOT environment variable path $INSTALL_ROOT is not a directory; exiting." >&2 + exit 1 +else + echo "Installing pipeline to root directory $INSTALL_ROOT" +fi + +WTSI_DNAP_UTILITIES_VERSION="0.5.1" +ML_WAREHOUSE_VERSION="2.1" +ALIEN_TIDYP_VERSION="1.4.7" +NPG_TRACKING_VERSION="85.3" +NPG_IRODS_VERSION="2.4.0" +RUBY_VERSION="1.8.7-p330" +LIB_RUBY_VERSION="0.3.0" + +TEMP_NAME=`mktemp -d genotyping_temp.XXXXXXXX` +TEMP=`readlink -f $TEMP_NAME` # full path to temp directory +export PATH=$TEMP:$PATH # ensure cpanm download is on PATH +START_DIR=$PWD +cd $TEMP + +# use wget to download tarballs to install +URLS=(https://github.com/wtsi-npg/perl-dnap-utilities/releases/download/$WTSI_DNAP_UTILITIES_VERSION/WTSI-DNAP-Utilities-$WTSI_DNAP_UTILITIES_VERSION.tar.gz \ +https://github.com/wtsi-npg/ml_warehouse/releases/download/$ML_WAREHOUSE_VERSION/ml_warehouse-$ML_WAREHOUSE_VERSION.tar.gz \ +http://search.cpan.org/CPAN/authors/id/K/KM/KMX/Alien-Tidyp-v$ALIEN_TIDYP_VERSION.tar.gz \ +https://github.com/wtsi-npg/npg_tracking/releases/download/$NPG_TRACKING_VERSION/npg-tracking-$NPG_TRACKING_VERSION.tar.gz \ +https://github.com/wtsi-npg/perl-irods-wrap/releases/download/$NPG_IRODS_VERSION/WTSI-NPG-iRODS-$NPG_IRODS_VERSION.tar.gz ) + +for URL in ${URLS[@]}; do + wget $URL + if [ $? 
-ne 0 ]; then + echo "Failed to download $URL; non-zero exit status from wget; install halted" >&2 + exit 1 + fi +done + +# clone genotyping to access src/perl +git clone https://github.com/wtsi-npg/genotyping.git +if [ $? -ne 0 ]; then + echo "Failed to clone genotyping repository; install halted" >&2 + exit 1 +fi + +eval $(perl -Mlocal::lib=$INSTALL_ROOT) # set environment variables + +# use local installation of cpanm +module load cpanm/1.7042 + +# install prerequisites from tarfiles +TARFILES=(WTSI-DNAP-Utilities-$WTSI_DNAP_UTILITIES_VERSION.tar.gz \ +ml_warehouse-$ML_WAREHOUSE_VERSION.tar.gz \ +Alien-Tidyp-v$ALIEN_TIDYP_VERSION.tar.gz \ +npg-tracking-$NPG_TRACKING_VERSION.tar.gz \ +WTSI-NPG-iRODS-$NPG_IRODS_VERSION.tar.gz) + +for FILE in ${TARFILES[@]}; do + cpanm --installdeps $FILE --self-contained --notest + if [ $? -ne 0 ]; then + echo "cpanm --installdeps failed for $FILE; install halted" >&2 + exit 1 + fi + cpanm --install $FILE --notest + if [ $? -ne 0 ]; then + echo "cpanm --install failed for $FILE; install halted" >&2 + exit 1 + fi +done + +# install pipeline Perl +# first ensure we are on 'master' branch and find pipeline version +cd genotyping/src/perl +git checkout master +cpanm --installdeps . --self-contained --notest +if [ $? -ne 0 ]; then + echo "cpanm --installdeps failed for genotyping; install halted" >&2 + exit 1 +fi +cpanm --install . --notest +if [ $? -ne 0 ]; then + echo "cpanm --install failed for genotyping; install halted" >&2 + exit 1 +fi + +# now set Ruby environment variables and install +export RUBY_HOME=/software/gapi/pkg/ruby/$RUBY_VERSION +export PATH=$RUBY_HOME/bin:$PATH +export MANPATH=$RUBY_HOME/share/man:$MANPATH +export GEM_HOME=$INSTALL_ROOT +export GEM_PATH=/software/gapi/pkg/lib-ruby/$LIB_RUBY_VERSION +export GEM_PATH=$INSTALL_ROOT:$GEM_PATH +export PATH=/software/gapi/pkg/lib-ruby/$LIB_RUBY_VERSION/bin:$PATH + +cd ../ruby/genotyping-workflows/ +rake gem +if [ $? 
-ne 0 ]; then + echo "'rake gem' failed for genotyping; install halted" >&2 + exit 1 +fi +GENOTYPING_VERSION=`git describe --dirty --always` +GENOTYPING_GEM="pkg/genotyping-workflows-$GENOTYPING_VERSION.gem" +if [ ! -f $GENOTYPING_GEM ]; then + echo "Expected Ruby gem $GENOTYPING_GEM not found; install halted" >&2 + exit 1 +fi +gem install $GENOTYPING_GEM +if [ $? -ne 0 ]; then + echo "'gem install' failed for genotyping; install halted" >&2 + exit 1 +fi + +# clean up temporary directory +cd $START_DIR +rm -Rf $TEMP + +exit 0 diff --git a/src/perl/Build.PL b/src/perl/Build.PL index 3bdc00b4a..3c2f5b3d9 100644 --- a/src/perl/Build.PL +++ b/src/perl/Build.PL @@ -45,7 +45,7 @@ my $build = Build->new 'Try::Tiny' => '>= 0.22', 'URI' => '>= 1.67', 'WTSI::DNAP::Warehouse::Schema' => '>= 1.1', - 'WTSI::NPG::iRODS' => '>= 0.15.0' + 'WTSI::NPG::iRODS' => '>= 2.1.0' }, recommends => { 'UUID' => '>= 0.24', diff --git a/src/perl/MANIFEST b/src/perl/MANIFEST index 27cd29a16..d04269e99 100644 --- a/src/perl/MANIFEST +++ b/src/perl/MANIFEST @@ -19,7 +19,6 @@ bin/plot_metric_scatter.pl bin/publish_expression_analysis.pl bin/publish_fluidigm_genotypes.pl bin/publish_infinium_analysis.pl -bin/publish_infinium_file_list.pl bin/publish_infinium_genotypes.pl bin/publish_sequenom_genotypes.pl bin/publish_snpset.pl diff --git a/src/perl/bin/publish_infinium_file_list.pl b/src/perl/bin/publish_infinium_file_list.pl deleted file mode 100755 index e5cc129ad..000000000 --- a/src/perl/bin/publish_infinium_file_list.pl +++ /dev/null @@ -1,185 +0,0 @@ -#!/software/bin/perl - -use utf8; - -package main; - -use strict; -use warnings; -use Cwd qw(abs_path); -use DateTime; -use File::Basename; -use Getopt::Long; -use List::AllUtils qw(uniq); -use Log::Log4perl; -use Log::Log4perl::Level; -use Pod::Usage; - -use WTSI::NPG::Database::Warehouse; -use WTSI::NPG::Genotyping::Database::Infinium; -use WTSI::NPG::Genotyping::Infinium::Publisher; - -my $embedded_conf = q( - log4perl.logger.npg.irods.publish 
= ERROR, A1 - - log4perl.appender.A1 = Log::Log4perl::Appender::Screen - log4perl.appender.A1.utf8 = 1 - log4perl.appender.A1.layout = Log::Log4perl::Layout::PatternLayout - log4perl.appender.A1.layout.ConversionPattern = %d %p %m %n -); - -our $VERSION = ''; -our $DEFAULT_INI = $ENV{HOME} . "/.npg/genotyping.ini"; -our $DEFAULT_DAYS = 7; - -run() unless caller(); - -sub run { - my $config; - my $debug; - my $dry_run; - my $log4perl_config; - my $output; - my $publish_dest; - my $validate, - my $verbose; - - GetOptions('config=s' => \$config, - 'debug' => \$debug, - 'dest=s' => \$publish_dest, - 'dry-run' => \$dry_run, - 'help' => sub { pod2usage(-verbose => 2, -exitval => 0) }, - 'logconf=s' => \$log4perl_config, - 'output=s' => \$output, - 'validate' => \$validate, - 'verbose' => \$verbose); - - unless ($publish_dest) { - pod2usage(-msg => "A --dest argument is required\n", - -exitval => 2); - } - - $config ||= $DEFAULT_INI; - if ($output && $output ne '-') { $output = abs_path($output); } - - my $log; - - if ($log4perl_config) { - Log::Log4perl::init($log4perl_config); - $log = Log::Log4perl->get_logger('npg.irods.publish'); - } - else { - Log::Log4perl::init(\$embedded_conf); - $log = Log::Log4perl->get_logger('npg.irods.publish'); - - if ($verbose) { - $log->level($INFO); - } - elsif ($debug) { - $log->level($DEBUG); - } - } - - my $now = DateTime->now; - - my $ifdb = WTSI::NPG::Genotyping::Database::Infinium->new - (name => 'infinium', - inifile => $config, - logger => $log)->connect(RaiseError => 1); - - my $ssdb = WTSI::NPG::Database::Warehouse->new - (name => 'sequencescape_warehouse', - inifile => $config, - logger => $log)->connect(RaiseError => 1, - mysql_enable_utf8 => 1, - mysql_auto_reconnect => 1); - my @files = <>; - foreach my $file (@files) { - chomp($file); - my ($filename, $directories, $suffix) = fileparse($file); - } - - @files = uniq(@files); - $log->debug("Found ", scalar @files, " unique files"); - - my $publisher = 
WTSI::NPG::Genotyping::Infinium::Publisher->new - (publication_time => $now, - data_files => \@files, - infinium_db => $ifdb, - logger => $log); - - if ($dry_run && $validate) { - $log->logcroak("Can specify at most one of --dry-run and --validate"); - } elsif ($dry_run) { - $log->debug("Starting dry run"); - $publisher->dry_run($publish_dest, $output); - } elsif ($validate) { - $publisher->validate($publish_dest, $output); - } else { - $log->debug("Starting publication"); - $publisher->publish($publish_dest); - } - - return 0; -} - -__END__ - -=head1 NAME - -publish_infinium_file_list - -=head1 SYNOPSIS - -publish_infinium_file_list [--config ] \ - --dest [--dry-run] [--help] [--logconf ] - [--output ] [--validate] [--verbose] < - -Options: - - --config Load database configuration from a user-defined .ini file. - Optional, defaults to $HOME/.npg/genotyping.ini - --dest The data destination root collection in iRODS. - --dry-run Attempt to determine if inputs are valid. Does *not* actually - publish any files. Not compatible with --validate. Output is - a list of publishable files, one per line. - --help Display help. - --logconf A log4perl configuration file. Optional. - --output In --dry-run or --validate mode, write output to the - given file, or '-' for STDOUT. If this option is omitted, - output is not written. - --validate Validate upload of files which have already been published - to iRODS. Not compatible with --dry-run. Output is tab - delimited text, giving source, destination, status code, and - status description for each input. - --verbose Print messages while processing. Optional. - -=head1 DESCRIPTION - -Default behaviour is to publishe files named on STDIN to iRODS with metadata -obtained from LIMS. Can also perform a dry run to check if metadata exists -in the LIMS, or validate that files have been uploaded successfully. 
- -=head1 METHODS - -None - -=head1 AUTHOR - -Keith James , Iain Bancarz - -=head1 COPYRIGHT AND DISCLAIMER - -Copyright (c) 2012-2014 Genome Research Limited. All Rights Reserved. - -This program is free software: you can redistribute it and/or modify -it under the terms of the Perl Artistic License or the GNU General -Public License as published by the Free Software Foundation, either -version 3 of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -=cut diff --git a/src/perl/bin/ready_qc_calls.pl b/src/perl/bin/ready_qc_calls.pl index b0720060f..d05b46555 100755 --- a/src/perl/bin/ready_qc_calls.pl +++ b/src/perl/bin/ready_qc_calls.pl @@ -14,6 +14,7 @@ package main; use Log::Log4perl; use Log::Log4perl::Level; use Pod::Usage; +use Try::Tiny; use WTSI::NPG::Genotyping::Database::Pipeline; use WTSI::NPG::Genotyping::VCF::PlexResultFinder; @@ -71,6 +72,7 @@ sub run { my $debug; my $inifile; my $log4perl_config; + my $manifest_dir; my $output_dir; my $verbose; @@ -82,6 +84,7 @@ sub run { -exitval => 0) }, 'inifile=s' => \$inifile, 'logconf=s' => \$log4perl_config, + 'manifest_dir=s' => \$manifest_dir, 'out=s' => \$output_dir, 'verbose' => \$verbose); @@ -125,6 +128,12 @@ sub run { $log->logcroak("--out argument '", $output_dir, "' is not a directory"); } + if (defined($manifest_dir) && !(-d $manifest_dir)) { + $log->logcroak("--manifest_dir argument '", $manifest_dir, + "' is not a directory"); + } + $manifest_dir ||= $output_dir; + if (!$dbfile) { $log->logcroak("--dbfile argument is required"); } elsif (! 
-e $dbfile) { @@ -144,16 +153,23 @@ sub run { my @sample_ids = uniq map { $_->sanger_sample_id } @samples; ### create PlexResultFinder and write VCF ### - my $finder = WTSI::NPG::Genotyping::VCF::PlexResultFinder->new( - sample_ids => \@sample_ids, - subscriber_config => \@config, - logger => $log, - ); - my $vcf_paths = $finder->write_vcf($output_dir); + try { + my $finder = WTSI::NPG::Genotyping::VCF::PlexResultFinder->new( + sample_ids => \@sample_ids, + subscriber_config => \@config, + logger => $log, + ); + my $plex_manifests = $finder->write_manifests($manifest_dir); + $log->info("Wrote plex manifests: ", join(', ', @{$plex_manifests})); + my $vcf_paths = $finder->write_vcf($output_dir); + $log->info("Wrote VCF: ", join(', ', @{$vcf_paths})); + } catch { + $log->logwarn("Unexpected error finding QC plex data in ", + "iRODS; run with --verbose for details"); + $log->info("Caught PlexResultFinder error: $_"); + } } -## TODO Retrieve results for multiple plex types / experiments and record in the same VCF file - __END__ @@ -163,7 +179,7 @@ =head1 NAME =head1 SYNOPSIS -ready_qc_calls --dbfile --vcf +ready_qc_calls --dbfile --out Options: @@ -180,7 +196,9 @@ =head1 SYNOPSIS --help Display help. --inifile Path to .ini file to configure pipeline SQLite database connection. Optional. Only relevant if --dbfile is given. - --out Path to directory for VCF output. Required. + --manifest_dir Directory for output of QC plex manifests retrieved from + iRODS. Optional, defaults to the --out argument. + --out Directory for VCF output. Required. 
=head1 DESCRIPTION diff --git a/src/perl/bin/ready_workflow.pl b/src/perl/bin/ready_workflow.pl index 521daec0a..d77989b3e 100755 --- a/src/perl/bin/ready_workflow.pl +++ b/src/perl/bin/ready_workflow.pl @@ -19,6 +19,7 @@ package main; use Log::Log4perl; use Log::Log4perl::Level; use Pod::Usage; +use Try::Tiny; use YAML qw /DumpFile/; use WTSI::NPG::Genotyping::Database::Pipeline; @@ -70,42 +71,52 @@ package main; run() unless caller(); sub run { - my $workdir; - my $manifest; - my $smaller; - my $debug; + my $chunk_size; + my $config_out; my $dbfile; - my $run; + my $debug; my $egt; + my $host; my $inifile; + my $local; my $log4perl_config; + my $manifest; + my $memory; + my $no_filter; + my $no_plex; + my $queue; + my $run; + my $smaller; my $verbose; - my $host; + my $workdir; my $workflow; - my $chunk_size; - my $memory; my $zstart; my $ztotal; my @plex_config; - GetOptions('workdir=s' => \$workdir, - 'manifest=s' => \$manifest, - 'plex_config=s' => \@plex_config, - 'host=s' => \$host, + GetOptions('help' => sub { pod2usage(-verbose => 2, -exitval => 0) }, + 'chunk_size=i' => \$chunk_size, + 'config_out=s' => \$config_out, 'dbfile=s' => \$dbfile, - 'run=s' => \$run, + 'debug' => \$debug, 'egt=s' => \$egt, + 'host=s' => \$host, 'inifile=s' => \$inifile, + 'local' => \$local, + 'logconf=s' => \$log4perl_config, + 'manifest=s' => \$manifest, + 'memory=i' => \$memory, + 'no_filter' => \$no_filter, + 'no_plex' => \$no_plex, + 'plex_config=s' => \@plex_config, + 'queue=s' => \$queue, + 'run=s' => \$run, + 'smaller' => \$smaller, 'verbose' => \$verbose, + 'workdir=s' => \$workdir, 'workflow=s' => \$workflow, - 'chunk_size=i' => \$chunk_size, - 'smaller' => \$smaller, - 'memory=i' => \$memory, 'zstart=i' => \$zstart, 'ztotal=i' => \$ztotal, - 'logconf=s' => \$log4perl_config, - 'debug' => \$debug, - 'help' => sub { pod2usage(-verbose => 2, -exitval => 0) }, ); ### set up logging ### @@ -124,7 +135,7 @@ sub run { } } - ### validate command-line arguments ### + ### 
process command-line arguments ### $inifile ||= $DEFAULT_INI; if (! -e $inifile) { $log->logcroak("--inifile argument '", $inifile, "' does not exist"); @@ -149,6 +160,9 @@ sub run { $log->logcroak("--manifest argument '", $manifest, "' does not exist"); } + if (defined($no_filter)) { $no_filter = 'true'; } # Boolean value for Ruby + else { $no_filter = 'false'; } + if (!defined($queue)) { $queue = 'normal'; } if (!$workflow) { $log->logcroak("--workflow argument is required"); } elsif (!($workflow eq $ILLUMINUS || $workflow eq $ZCALL)) { @@ -170,7 +184,6 @@ sub run { "' does not exist"); } } - $host ||= $DEFAULT_HOST; # illuminus paralellizes by SNP, other callers by sample if ($workflow eq 'illuminus') { $chunk_size ||= $DEFAULT_CHUNK_SIZE_SNP; } @@ -184,31 +197,12 @@ sub run { if ($ztotal <=0) { $log->logcroak("ztotal must be > 0"); } ### create and populate the working directory ### - make_working_directory($workdir); - write_config_yml($workdir, $host); - - ### read sample identifiers from pipeline DB & create PlexResultFinder ### - my @initargs = (name => 'pipeline', - inifile => $inifile, - dbfile => $dbfile); - my $pipedb = WTSI::NPG::Genotyping::Database::Pipeline->new - (@initargs)->connect - (RaiseError => 1, - sqlite_unicode => 1, - on_connect_do => 'PRAGMA foreign_keys = ON'); - my @samples = $pipedb->sample->all; - my @sample_ids = uniq map { $_->sanger_sample_id } @samples; - my $finder = WTSI::NPG::Genotyping::VCF::PlexResultFinder->new( - sample_ids => \@sample_ids, - logger => $log, - subscriber_config => \@plex_config, - ); + make_working_directory($workdir, $local); + if ($local) { write_config_yml($workdir, $host); } - ### write plex manifests and VCF to working directory ### - my $manifest_dir = catfile($workdir, $PLEX_MANIFEST_SUBDIRECTORY); - my $plex_manifests = $finder->write_manifests($manifest_dir); - my $vcf_dir = catfile($workdir, $VCF_SUBDIRECTORY); - my $vcf = $finder->write_vcf($vcf_dir); + ### find QC plex VCF and manifests (if 
required) ### + my ($plex_manifests, $vcf) = write_plex_results( + $inifile, $dbfile, \@plex_config, $workdir, $log, $no_plex); ### if required, copy manifest, database and EGT to working directory ### unless ($smaller) { @@ -218,11 +212,46 @@ sub run { $egt = copy_file_to_directory($egt, $workdir); } } - write_workflow_yml($workdir, $workflow, $dbfile, $run, $manifest, - $chunk_size, $memory, $vcf, $plex_manifests, - $egt, $zstart, $ztotal); + ### generate the workflow config and write as YML ### + unless (defined($config_out)) { + $config_out = workflow_config_path($workdir, $workflow, $local); + } + my %workflow_args = ( + 'manifest' => $manifest, + 'chunk_size' => $chunk_size, + 'memory' => $memory, + 'nofilter' => $no_filter, + 'queue' => $queue, + 'vcf' => $vcf, + 'plex_manifest' => $plex_manifests, + ); + my $workflow_module; + if ($workflow eq $ILLUMINUS) { + $workflow_args{'gender_method'} = 'Supplied'; + $workflow_module = $MODULE_ILLUMINUS; + } elsif ($workflow eq $ZCALL) { + $workflow_module = $MODULE_ZCALL; + if (!($egt && $zstart && $ztotal)) { + $log->logcroak("Must specify EGT, zstart, and ztotal for ", + "zcall workflow"); + } + $workflow_args{'egt'} = $egt; + $workflow_args{'zstart'} = $zstart; + $workflow_args{'ztotal'} = $ztotal; + } else { + $log->logcroak("Invalid workflow argument '", $workflow, + "'; must be one of $ILLUMINUS, $ZCALL"); + } + my @args = ($dbfile, $run, $workdir, \%workflow_args); + my %params = ( + 'library' => 'genotyping', + 'workflow' => $workflow_module, + 'arguments' => \@args, + ); + $log->info("Wrote workflow YML to '", $config_out, "'"); + DumpFile($config_out, (\%params)); $log->info("Finished; genotyping pipeline directory '", $workdir, - "' is ready to run Percolate."); + "' was created successfully."); } sub copy_file_to_directory { @@ -238,7 +267,7 @@ sub copy_file_to_directory { sub make_working_directory { # make in, pass, fail if needed; copy dbfile to working directory # if $include_qc_plex, also create vcf 
and plex_manifest subdirs - my ($workdir) = @_; + my ($workdir, $local) = @_; if (-e $workdir) { if (-d $workdir) { $log->info("Working directory '", $workdir, "' already exists"); @@ -252,8 +281,10 @@ sub make_working_directory { $log->info("Created working directory '", $workdir, "'"); } # create subdirectories - my @names = ('in', 'pass', 'fail', $VCF_SUBDIRECTORY, - $PLEX_MANIFEST_SUBDIRECTORY); + my @names = ($VCF_SUBDIRECTORY, $PLEX_MANIFEST_SUBDIRECTORY); + if ($local) { + push @names, qw/in pass fail/; + } foreach my $name (@names) { my $subdir = catfile($workdir, $name); if (-e $subdir) { @@ -271,6 +302,15 @@ sub make_working_directory { } } +sub workflow_config_path { + my ($workdir, $workflow, $local) = @_; + my $config_dir; + if ($local) { $config_dir = catfile($workdir, 'in'); } + else { $config_dir = $workdir; } + my $config_path = catfile($config_dir, 'genotype_'.$workflow.'.yml'); + return $config_path; +} + sub write_config_yml { my ($workdir, $host) = @_; my %config = ( @@ -288,45 +328,42 @@ sub write_config_yml { return $config_path; } -sub write_workflow_yml { - my ($workdir, $workflow, $dbpath, $run, $manifest, $chunk_size, - $memory, $vcf, $plex_manifests, $egt, $zstart, $ztotal) = @_; - my %workflow_args = ( - 'chunk_size' => $chunk_size, - 'memory' => $memory, - 'manifest' => $manifest, - 'plex_manifest' => $plex_manifests, - 'vcf' => $vcf, - ); - my $workflow_module; - if ($workflow eq $ILLUMINUS) { - $workflow_args{'gender_method'} = 'Supplied'; - $workflow_module = $MODULE_ILLUMINUS; - } elsif ($workflow eq $ZCALL) { - if (!($egt && $zstart && $ztotal)) { - $log->logcroak("Must specify EGT, zstart, and ztotal for ", - "zcall workflow"); - } elsif (! 
-e $egt) { - $log->logcroak("EGT file '", $egt, "' does not exist."); - } else { - $workflow_args{'egt'} = $egt; - $workflow_args{'zstart'} = $zstart; - $workflow_args{'ztotal'} = $ztotal; - $workflow_module = $MODULE_ZCALL; +sub write_plex_results { + my ($inifile, $dbfile, $plex_config, $workdir, $log, $no_plex) = @_; + my $plex_manifests= []; + my $vcf = []; + if (!($no_plex)) { + my @initargs = (name => 'pipeline', + inifile => $inifile, + dbfile => $dbfile); + my $pipedb = WTSI::NPG::Genotyping::Database::Pipeline->new + (@initargs)->connect + (RaiseError => 1, + sqlite_unicode => 1, + on_connect_do => 'PRAGMA foreign_keys = ON'); + my @samples = $pipedb->sample->all; + my @sample_ids = uniq map { $_->sanger_sample_id } @samples; + try { + my $finder = WTSI::NPG::Genotyping::VCF::PlexResultFinder->new( + sample_ids => \@sample_ids, + logger => $log, + subscriber_config => $plex_config, + ); + my $manifest_dir = catfile($workdir, $PLEX_MANIFEST_SUBDIRECTORY); + $plex_manifests = $finder->write_manifests($manifest_dir); + $log->info("Wrote plex manifests: ", + join(', ', @{$plex_manifests})); + my $vcf_dir = catfile($workdir, $VCF_SUBDIRECTORY); + $vcf = $finder->write_vcf($vcf_dir); + $log->info("Wrote VCF: ", join(', ', @{$vcf})); + } catch { + $log->logwarn("Unexpected error finding QC plex data in ", + "iRODS; VCF and plex manifests not written; ", + "run with --verbose for details"); + $log->info("Caught PlexResultFinder error: $_"); } - } else { - $log->logcroak("Invalid workflow argument '", $workflow, - "'; must be one of $ILLUMINUS, $ZCALL"); } - my @args = ($dbpath, $run, $workdir, \%workflow_args); - my %params = ( - 'library' => 'genotyping', - 'workflow' => $workflow_module, - 'arguments' => \@args, - ); - my $out = catfile($workdir, "in", "genotype_".$workflow.".yml"); - $log->info("Wrote workflow YML to '", $out, "'"); - DumpFile($out, (\%params)); + return ($plex_manifests, $vcf); } @@ -348,17 +385,30 @@ =head1 SYNOPSIS --chunk_size Chunk size 
for parallelization. Optional, defaults to 4000 (SNPs) for Illuminus or 40 (samples) for zCall. + --config_out Output path for .yml workflow config file. Optional, + defaults to 'genotype_${WORKFLOW_NAME}.yml' in the + workflow directory (or the 'in' subdirectory, if --local + is in effect). --dbfile Path to an SQLite pipeline database file. Required. --egt Path to an Illumina .egt cluster file. Required for zcall. --help Display help. - --host Name of host machine for the beanstalk message queue. + --host Name of host machine for the beanstalk message queue in + Percolate config. Relevant only if --local is in effect. Optional, defaults to farm3-head2. + --local Create in/pass/fail subdirectories, to run Percolate + locally in the pipeline working directory. Write workflow + config to the 'in' subdirectory. Write a config.yml file + for Percolate to the working directory. --manifest Path to the .bpm.csv manifest file. Required. --memory Memory limit hint for LSF, in MB. Default = 2048. + --no_filter Enable the 'nofilter' option in workflow config. + --no_plex Do not query iRODS for QC plex results. --plex_config Path to a JSON file with parameters to query iRODS and write QC plex data as VCF. May be supplied more than once to specify multiple files. Optional, defaults to a standard set of config files. + --queue LSF queue hint for workflow config YML. Optional, defaults + to 'normal'. --run The pipeline run name in the database. Required. --smaller Do not copy the .egt, manifest, and plex manifest files to the workflow directory. 
Uses less space, but makes the diff --git a/src/perl/bin/write_snp_metadata.pl b/src/perl/bin/write_snp_metadata.pl index a5f3408c3..b62d6a8d4 100755 --- a/src/perl/bin/write_snp_metadata.pl +++ b/src/perl/bin/write_snp_metadata.pl @@ -120,7 +120,7 @@ sub splitManifest { $i++; if ($verbose && $i % 100_000 == 0) { print "$i lines read.\n"; } if ($i == 1) { next; } # first line is header - $_ =~ s/\s+$//msxg; # remove whitespace (including \r) from end of line + s/\s+$//msxg; # remove whitespace (including \r) from end of line my @fields = split /,/msx; my $fields_found = @fields; if ($fields_found != $EXPECTED_FIELDS) { diff --git a/src/perl/etc/ready_qc_sequenom.json b/src/perl/etc/ready_qc_sequenom.json index ee0f0ebaf..11e2bdd34 100644 --- a/src/perl/etc/ready_qc_sequenom.json +++ b/src/perl/etc/ready_qc_sequenom.json @@ -1,7 +1,7 @@ { "data_path": "/seq/sequenom", "platform": "sequenom", - "reference_name": "Homo_sapiens (GRCh37_53)", + "reference_name": "Homo_sapiens (1000Genomes)", "reference_path": "/seq/sequenom/multiplexes", "snpset_name": "W30467", "read_snpset_version": "1.0", diff --git a/src/perl/etc/snpsets.ini b/src/perl/etc/snpsets.ini index 0539ba05f..535c5eb36 100644 --- a/src/perl/etc/snpsets.ini +++ b/src/perl/etc/snpsets.ini @@ -19,8 +19,6 @@ name=HumanOmni25-8v1-2 name=HumanOmniExpress-12v1 name=HumanOmniExpress-12v1-1 name=HumanOmniExpress-24v1-1 -name=Immuno_BeadChip -name=Immuno_BeadChip_11419691 name=SangerDDD_OmniExPlusv1 name=Sanger_3K_ExomePlus name=HumanOmni25Exome-8v1 @@ -37,6 +35,7 @@ name=HumanOmni25Exome-8v1-1 name=MEGA_Consortium name=MEGA_Consortium_v2 name=Infinium-MethylationEPIC +name=Multi-EthnicGlobal [sequenom] name=W30467 diff --git a/src/perl/lib/WTSI/NPG/Database/DBIx.pm b/src/perl/lib/WTSI/NPG/Database/DBIx.pm index 5d801cc4e..30b8ff931 100644 --- a/src/perl/lib/WTSI/NPG/Database/DBIx.pm +++ b/src/perl/lib/WTSI/NPG/Database/DBIx.pm @@ -91,7 +91,7 @@ sub in_transaction { $self->logcroak("Attempted to use a closed 
connection") } } catch { - if ($_ =~ m{Rollback failed}msx) { + if (m{Rollback failed}msx) { $self->logconfess("$_. Rollback failed! ", "WARNING: data may be inconsistent."); } else { diff --git a/src/perl/lib/WTSI/NPG/Genotyping/Database/Pipeline/Schema/Result/Sample.pm b/src/perl/lib/WTSI/NPG/Genotyping/Database/Pipeline/Schema/Result/Sample.pm index 9b1d89678..cdd6b746b 100644 --- a/src/perl/lib/WTSI/NPG/Genotyping/Database/Pipeline/Schema/Result/Sample.pm +++ b/src/perl/lib/WTSI/NPG/Genotyping/Database/Pipeline/Schema/Result/Sample.pm @@ -226,9 +226,9 @@ sub idat { my @files; if ($channel eq 'red') { - @files = grep { defined $_ and m{Red}msx } @values; + @files = grep { defined and m{Red}msx } @values; } else { - @files = grep { defined $_ and m{Grn}msx } @values; + @files = grep { defined and m{Grn}msx } @values; } return shift @files; diff --git a/src/perl/lib/WTSI/NPG/Genotyping/Types.pm b/src/perl/lib/WTSI/NPG/Genotyping/Types.pm index 76739a77f..328d865a9 100644 --- a/src/perl/lib/WTSI/NPG/Genotyping/Types.pm +++ b/src/perl/lib/WTSI/NPG/Genotyping/Types.pm @@ -48,32 +48,32 @@ our $VERSION = ''; subtype HsapiensChromosome, as Str, - where { $_ =~ m{(^[Cc]hr)?[\d+|MT|X|Y]$}msx }, + where { m{(^[Cc]hr)?[\d+|MT|X|Y]$}msx }, message { "'$_' is not a valid H. sapiens chromosome name" }; subtype HsapiensX, as Str, - where { $_ =~ m{(^[Cc]hr)?X$}msx }, + where { m{(^[Cc]hr)?X$}msx }, message { "'$_' is not a valid H. sapiens X chromosome" }; subtype HsapiensY, as Str, - where { $_ =~ m{(^[Cc]hr)?Y$}msx }, + where { m{(^[Cc]hr)?Y$}msx }, message { "'$_' is not a valid H. 
sapiens Y chromosome" }; subtype InfiniumBeadchipBarcode, as Str, - where { $_ =~ m{^\d{10,12}$}msx }, + where { m{^\d{10,12}$}msx }, message { "'$_' is not a valid Infinium beadchip barcode" }; subtype InfiniumBeadchipSection, as Str, - where { $_ =~ m{^R\d+C\d+$}msx }, + where { m{^R\d+C\d+$}msx }, message { "'$_' is not a valid Infinium beadchip section" }; subtype DNABase, as Str, - where { $_ =~ m{^[ACGTNacgtn]$}msx }, + where { m{^[ACGTNacgtn]$}msx }, message { "'$_' is not a valid DNA base" }; subtype DNAStrand, @@ -84,7 +84,7 @@ subtype DNAStrand, # A genotype call represented as a 2 allele string. subtype SNPGenotype, as Str, - where { length($_) == 2 && + where { length == 2 && is_DNABase(substr($_, 0, 1)) && is_DNABase(substr($_, 1, 1)) }, message { "'$_' is not a valid SNP call"}; diff --git a/src/perl/t/WTSI/NPG/Genotyping/Database/InfiniumTest.pm b/src/perl/t/WTSI/NPG/Genotyping/Database/InfiniumTest.pm index 51f5db5a2..64553ce39 100644 --- a/src/perl/t/WTSI/NPG/Genotyping/Database/InfiniumTest.pm +++ b/src/perl/t/WTSI/NPG/Genotyping/Database/InfiniumTest.pm @@ -141,4 +141,5 @@ sub repeat_scans : Test(1) { image_iso_date => '2016-10-03'}], 'Repeat scans resolved to latest'); } + 1; diff --git a/src/perl/t/WTSI/NPG/Genotyping/Sequenom/AssayResultSetTest.pm b/src/perl/t/WTSI/NPG/Genotyping/Sequenom/AssayResultSetTest.pm index 00fb926f7..a25bab0a2 100644 --- a/src/perl/t/WTSI/NPG/Genotyping/Sequenom/AssayResultSetTest.pm +++ b/src/perl/t/WTSI/NPG/Genotyping/Sequenom/AssayResultSetTest.pm @@ -22,8 +22,6 @@ my $data_path = './t/sequenom_assay_data_object'; my $data_file = 'plate1_A01.csv'; my $irods_tmp_coll; -my $resultset; - my $pid = $$; sub make_fixture : Test(setup) { @@ -41,16 +39,9 @@ sub make_fixture : Test(setup) { $irods->add_object_avu($irods_path, 'study_id', '10'); $irods->add_object_avu($irods_path, 'sample_consent', '1'); $irods->add_object_avu($irods_path, 'sample_supplier_name', 'zzzzzzzzzz'); - - my $data_object = 
WTSI::NPG::Genotyping::Sequenom::AssayDataObject->new - ($irods, "$irods_tmp_coll/$data_file"); - $resultset = WTSI::NPG::Genotyping::Sequenom::AssayResultSet->new - ($data_object); } sub teardown : Test(teardown) { - undef $resultset; - my $irods = WTSI::NPG::iRODS->new; $irods->remove_collection($irods_tmp_coll); } @@ -84,23 +75,38 @@ sub constructor : Test(5) { } sub size : Test(1) { + my $resultset = _get_resultset(WTSI::NPG::iRODS->new); cmp_ok($resultset->size, '==', 1, 'Expected size'); } sub assay_results : Test(1) { + my $resultset = _get_resultset(WTSI::NPG::iRODS->new); cmp_ok(scalar @{$resultset->assay_results}, '==', 1, 'Contains expected number of assay results'); } sub snpset_name : Test(2) { my $irods = WTSI::NPG::iRODS->new; + my $resultset = _get_resultset($irods); is($resultset->snpset_name, 'qc', 'Correct SNP set name'); - $irods->add_object_avu($resultset->data_object->str, 'sequenom_plex', 'test'); $resultset->data_object->clear_metadata; # Clear metadata cache - dies_ok { $resultset->snpset_name }, 'Cannot have multiple names'; } +sub _get_resultset { + # Each resultset contains a data object, which in turn has an iRODS object. + # Need to be careful about working with multiple iRODS objects, because + # out-of-date caches in different objects can lead to unexpected + # results/errors. This is why $irods is supplied as an argument instead + # of created on the fly. 
+ my ($irods, ) = @_; + my $data_object = WTSI::NPG::Genotyping::Sequenom::AssayDataObject->new + ($irods, "$irods_tmp_coll/$data_file"); + my $resultset = WTSI::NPG::Genotyping::Sequenom::AssayResultSet->new + ($data_object); + return $resultset; +} + 1; diff --git a/src/perl/t/WTSI/NPG/Genotyping/VCF/ReadyWorkflowTest.pm b/src/perl/t/WTSI/NPG/Genotyping/VCF/ReadyWorkflowTest.pm index 4eb56bc4d..24a633492 100644 --- a/src/perl/t/WTSI/NPG/Genotyping/VCF/ReadyWorkflowTest.pm +++ b/src/perl/t/WTSI/NPG/Genotyping/VCF/ReadyWorkflowTest.pm @@ -7,7 +7,7 @@ use warnings; use base qw(WTSI::NPG::Test); use Cwd qw/abs_path/; -use Test::More tests => 66; +use Test::More tests => 82; use Test::Exception; use File::Basename qw(fileparse); use File::Path qw/make_path/; @@ -240,77 +240,80 @@ sub teardown : Test(teardown) { $irods->remove_collection($irods_tmp_coll); } -sub test_ready_calls_fluidigm : Test(2) { +sub test_ready_calls_fluidigm : Test(4) { setup_chromosome_json(); my $fluidigm_params = setup_fluidigm(); my $vcf_out = "$tmp/fluidigm_qc.vcf"; + my $manifest_out = "$tmp/fluidigm_qc.tsv"; my $cmd = join q{ }, "$READY_QC_CALLS", "--config $fluidigm_params", "--dbfile $dbfile", "--logconf $LOG_TEST_CONF", "--verbose", "--out $tmp"; - ok(system($cmd) == 0, 'Wrote Fluidigm calls to VCF'); + ok(system($cmd) == 0, 'Ready calls script exit status OK'); + ok(-e $vcf_out, 'VCF output found'); + ok(-e $manifest_out, 'Plex manifest output found'); my @got_lines = read_file($vcf_out); @got_lines = grep !/^[#]{2}(fileDate|reference)=/, @got_lines; my @expected_lines = read_file($f_expected_vcf); @expected_lines = grep !/^[#]{2}(fileDate|reference)=/, @expected_lines; is_deeply(\@got_lines, \@expected_lines, "Fluidigm VCF output matches expected values"); - } -sub test_ready_calls_fluidigm_bad_reference : Test(2) { - # test +sub test_ready_calls_fluidigm_bad_reference : Test(3) { setup_chromosome_json(); - my $fluidigm_params = setup_fluidigm(); + my $fluidigm_params = 
setup_fluidigm('jabberwocky'); my $vcf_out = "$tmp/fluidigm_qc.vcf"; + my $manifest_out = "$tmp/fluidigm_qc.tsv"; my $cmd = join q{ }, "$READY_QC_CALLS", "--config $fluidigm_params", "--dbfile $dbfile", "--logconf $LOG_TEST_CONF", "--verbose", - "--out $tmp"; - ok(system($cmd) == 0, 'Wrote Fluidigm calls to VCF'); - my @got_lines = read_file($vcf_out); - @got_lines = grep !/^[#]{2}(fileDate|reference)=/, @got_lines; - my @expected_lines = read_file($f_expected_vcf); - @expected_lines = grep !/^[#]{2}(fileDate|reference)=/, @expected_lines; - is_deeply(\@got_lines, \@expected_lines, - "Fluidigm VCF output matches expected values"); - + "--out $tmp", + "2> /dev/null"; # suppress chatter to stderr + ok(system($cmd) == 0, 'Ready calls script exit status OK'); + ok(!(-e $vcf_out), 'VCF output not written'); + ok(!(-e $manifest_out), 'Manifest not written'); } -sub test_ready_calls_sequenom : Test(2) { +sub test_ready_calls_sequenom : Test(4) { setup_chromosome_json(); my $sequenom_params = setup_sequenom_default(); my $vcf_out = "$tmp/sequenom_W30467.vcf"; + my $manifest_out = "$tmp/sequenom_W30467.tsv"; my $cmd = join q{ }, "$READY_QC_CALLS", "--config $sequenom_params", "--dbfile $dbfile", "--logconf $LOG_TEST_CONF", "--out $tmp"; - ok(system($cmd) == 0, 'Wrote Sequenom calls to VCF'); + ok(system($cmd) == 0, 'Ready calls script exit status OK'); + ok(-e $vcf_out, 'VCF output found'); + ok(-e $manifest_out, 'Plex manifest output found'); my @got_lines = read_file($vcf_out); @got_lines = grep !/^[#]{2}(fileDate|reference)=/, @got_lines; my @expected_lines = read_file($s_expected_vcf); @expected_lines = grep !/^[#]{2}(fileDate|reference)=/, @expected_lines; is_deeply(\@got_lines, \@expected_lines, "Sequenom VCF output matches expected values"); - } -sub test_ready_calls_sequenom_alternate_snp : Test(2) { +sub test_ready_calls_sequenom_alternate_snp : Test(4) { # tests handling of renamed SNP in different manifest versions setup_chromosome_json(); my $sequenom_params = 
setup_sequenom_alternate(); my $vcf_out = "$tmp/sequenom_W30467.vcf"; + my $manifest_out = "$tmp/sequenom_W30467.tsv"; my $cmd = join q{ }, "$READY_QC_CALLS", "--config $sequenom_params", "--dbfile $dbfile", "--logconf $LOG_TEST_CONF", "--out $tmp"; - ok(system($cmd) == 0, 'Wrote Sequenom calls to VCF'); + ok(system($cmd) == 0, 'Ready calls script exit status OK'); + ok(-e $vcf_out, 'VCF output found'); + ok(-e $manifest_out, 'Plex manifest output found'); my @got_lines = read_file($vcf_out); @got_lines = grep !/^[#]{2}(fileDate|reference)=/, @got_lines; my @expected_lines = read_file($s_expected_vcf); @@ -402,10 +405,13 @@ sub test_workflow_script_illuminus: Test(16) { my $workdir = abs_path(catfile($tmp, "genotype_workdir_illuminus")); my $config_path = catfile($workdir, "config.yml"); my $working_db = catfile($workdir, $db_file_name); + my $queue = 'yesterday'; # non-default queue for testing my $cmd = join q{ }, "$READY_WORKFLOW", "--logconf $LOG_TEST_CONF", "--dbfile $dbfile", + "--local", "--manifest $manifest", + "--queue $queue", "--run run1", "--verbose", "--plex_config $f_config", @@ -442,13 +448,13 @@ sub test_workflow_script_illuminus: Test(16) { my $config = LoadFile($config_path); ok($config, "Config data structure loaded from YML"); my $expected_config = { - 'msg_port' => '11300', + 'msg_port' => '11300', 'max_processes' => '250', - 'root_dir' => $workdir, - 'log_level' => 'DEBUG', - 'async' => 'lsf', - 'msg_host' => 'farm3-head2', - 'log' => catfile($workdir, 'percolate.log') + 'root_dir' => $workdir, + 'log_level' => 'DEBUG', + 'async' => 'lsf', + 'msg_host' => 'farm3-head2', + 'log' => catfile($workdir, 'percolate.log') }; is_deeply($config, $expected_config, "YML Illuminus config matches expected values"); @@ -469,6 +475,8 @@ sub test_workflow_script_illuminus: Test(16) { 'memory' => '2048', 'manifest' => catfile($workdir, $manifest_name), 'chunk_size' => '4000', + 'nofilter' => 'false', + 'queue' => $queue, 'plex_manifest' => [ catfile($workdir, 
'plex_manifests', $fluidigm_manifest_name), @@ -500,6 +508,7 @@ sub test_workflow_script_illuminus_bad_plex_reference: Test(12) { my $cmd = join q{ }, "$READY_WORKFLOW", "--logconf $LOG_TEST_CONF", "--dbfile $dbfile", + "--local", "--manifest $manifest", "--run run1", "--verbose", @@ -547,6 +556,8 @@ sub test_workflow_script_illuminus_bad_plex_reference: Test(12) { 'memory' => '2048', 'manifest' => catfile($workdir, $manifest_name), 'chunk_size' => '4000', + 'nofilter' => 'false', + 'queue' => 'normal', 'plex_manifest' => [], 'vcf' => [], 'gender_method' => 'Supplied' @@ -558,6 +569,74 @@ sub test_workflow_script_illuminus_bad_plex_reference: Test(12) { } +sub test_workflow_script_illuminus_nonlocal: Test(9) { + # test for 'nonlocal' percolate mode & nonstandard config path + setup_chromosome_json(); + my $f_config = setup_fluidigm(); + my $s_config = setup_sequenom_default(); + my $workdir = abs_path(catfile($tmp, "genotype_workdir_illuminus")); + my $workflow_config = catfile($workdir, "genotype_TEST.yml"); + my $working_db = catfile($workdir, $db_file_name); + my $cmd = join q{ }, "$READY_WORKFLOW", + "--config_out $workflow_config", + "--logconf $LOG_TEST_CONF", + "--dbfile $dbfile", + "--manifest $manifest", + "--run run1", + "--verbose", + "--plex_config $f_config", + "--plex_config $s_config", + "--workdir $workdir", + "--workflow illuminus"; + is(0, system($cmd), "illuminus setup exit status is zero"); + # check presence of required files and subfolders for workflow + ok(-e $workdir, "Workflow directory found"); + ok(!(-e catfile($workdir, "config.yml")), "Percolate config not written"); + foreach my $name (qw/in pass fail/) { + my $subdir = catfile($workdir, $name); + ok(!(-e $subdir), "Subdirectory '$name' not found"); + } + ok(-e $workflow_config, "Workflow config YML found"); + my $params = LoadFile($workflow_config); + ok($params, "Workflow parameter data structure loaded from YML"); + my $manifest_name = fileparse($manifest); + my 
$fluidigm_manifest_name = 'fluidigm_qc.tsv'; + my $sequenom_manifest_name = 'sequenom_W30467.tsv'; + my $vcf_path_fluidigm = catfile($workdir, 'vcf', 'fluidigm_qc.vcf'); + my $vcf_path_sequenom = catfile($workdir, 'vcf', 'sequenom_W30467.vcf'); + my $expected_params = { + 'workflow' => 'Genotyping::Workflows::GenotypeIlluminus', + 'library' => 'genotyping', + 'arguments' => [ + $working_db, + 'run1', + $workdir, + { + 'memory' => '2048', + 'manifest' => catfile($workdir, $manifest_name), + 'chunk_size' => '4000', + 'nofilter' => 'false', + 'queue' => 'normal', # default value + 'plex_manifest' => [ + catfile($workdir, 'plex_manifests', + $fluidigm_manifest_name), + catfile($workdir, 'plex_manifests', + $sequenom_manifest_name), + ], + 'vcf' => [ + $vcf_path_fluidigm, + $vcf_path_sequenom, + ], + 'gender_method' => 'Supplied' + } + ] + }; + is_deeply($params, $expected_params, + "YML Illuminus workflow params match expected values"); + + +} + sub test_workflow_script_zcall: Test(16) { setup_chromosome_json(); my $f_config = setup_fluidigm(); @@ -568,6 +647,7 @@ sub test_workflow_script_zcall: Test(16) { my $cmd = join q{ }, "$READY_WORKFLOW", "--logconf $LOG_TEST_CONF", "--dbfile $dbfile", + "--local", "--manifest $manifest", "--run run1", "--verbose", @@ -632,6 +712,8 @@ sub test_workflow_script_zcall: Test(16) { { 'zstart' => '6', 'chunk_size' => '40', + 'nofilter' => 'false', + 'queue' => 'normal', 'egt' => catfile($workdir, $egt_name), 'vcf' => [ $vcf_path_fluidigm,