diff --git a/Cargo.toml b/Cargo.toml index cf03780..6c14ac1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mgikit" -version = "0.1.5" +version = "0.1.6" edition = "2021" authors = ["Ziad Al Bkhetan "] repository = "https://github.com/sagc-bioinformatics/mgikit" diff --git a/bins/mgikit-V0.1.5.zip b/bins/mgikit-V0.1.5.zip deleted file mode 100644 index f6943ce..0000000 Binary files a/bins/mgikit-V0.1.5.zip and /dev/null differ diff --git a/docs/pages/demultiplex.md b/docs/pages/demultiplex.md index be20bb1..ce2cebf 100755 --- a/docs/pages/demultiplex.md +++ b/docs/pages/demultiplex.md @@ -7,9 +7,11 @@ type: guides --- ## Demultiplexing functionality + This command is used to demultiplex fastq files and assign the sequencing reads to their associated samples. The tool requires the following mandatory input files to perform the demultiplexing: + 1. Fastq files (single/paired-end). 2. Sample sheet which contains sample indexes and their templates (will be explained in detail). @@ -20,129 +22,118 @@ as well as some summary reports that can be visualised through the MultiQC tool ### Mandatory input files -+ **`-h` or `--help`**: Print help - -+ **`-V` or `--version`**: Print version +- **`-h` or `--help`**: Print help +- **`-V` or `--version`**: Print version **Fastq input file** -+ **`-f or --read1`**: the path to the forward reads fastq file for both paired-end and single-end input data. +- **`-f or --read1`**: the path to the forward reads fastq file for both paired-end and single-end input data. -+ **`-r or --read2`**: the path to the reverse reads fastq file. +- **`-r or --read2`**: the path to the reverse reads fastq file. -+ **`-i or --input`**: the path to the directory that contains the input fastq files. +- **`-i or --input`**: the path to the directory that contains the input fastq files. -Either `-i` or `-f/-r`, `-f` should be provided for a run. 
+{% include callout.html type="note" content="Either `-i`, `-f` together with `-r`, or `-f` alone should be provided for a run." %} **Input sample sheet** -+ **`-s or --sample-sheet`**: the path to the sample sheet file. - -More details are available below on the sample sheet format and preparation. +- **`-s or --sample-sheet`**: the path to the sample sheet file. It can be tab or comma separated. The tool detects the presence of either delimiter in the header. + More details are available below on the sample sheet format and preparation. ### Other Parameters -+ **`-o or --output`**: The path the output directory. - -The tool will create the directory if it does not exist -or overwrite the content if the directory exists and the parameter `--force` is used. The tool will exit -with an error if the directory exists, and `--force` is not used. If this parameter is not provided, the tools -will create a directory (in the working directory) with a name based on the date and time -of the run as follows `mgiKit_Y-m-dTHMS`. where `Y`, `m`, `d`, `H`, `M`, and `S` are the date and time format. - -+ **`--reports`**: The path of the output reports directory. - -By default, the tool writes the files of the run reports in the same output directory as the -demultiplexed fastq files (`-o` or `--output` parameter). This parameter is used to write the reports in -a different folder as specified with this parameter. +- **`-o or --output`**: The path to the output directory. + The tool will create the directory if it does not exist + or overwrite the content if the directory exists and the parameter `--force` is used. The tool will exit + with an error if the directory exists, and `--force` is not used. If this parameter is not provided, the tool + will create a directory (in the working directory) with a name based on the date and time + of the run as follows `mgiKit_Y-m-dTHMS`, where `Y`, `m`, `d`, `H`, `M`, and `S` are the date and time format. -+ **`-m or --mismatches`**: The default value is 1. 
The number of mismatches allowed when -matching reads’ barcode with sample indexes. +- **`--reports`**: The path of the output reports directory. -This number should be less than the minimal Hamming -distance between any barcodes of two samples. In the case of dual index demultiplexing (i7 and i5), a -read will be assigned to a sample if the sum of the mismatches between the read barcode and both -indexes is less than or equal to the value of this parameter. In the case of a single index, the -mismatches with the single index should be less or equal to this parameter. + By default, the tool writes the files of the run reports in the same output directory as the + demultiplexed fastq files (`-o` or `--output` parameter). This parameter is used to write the reports in + a different folder as specified with this parameter. -+ **`--disable-illumina`**: Output reads' header in MGI format. +- **`-m or --mismatches`**: The default value is 1. The number of mismatches allowed when + matching reads’ barcode with sample indexes. -This option is to disable the default behaviour of the tool that outputs read files using Illumine format (for read headers and file naming). More details are below. + This number should be less than the minimal Hamming + distance between any barcodes of two samples. In the case of dual index demultiplexing (i7 and i5), a + read will be assigned to a sample if the sum of the mismatches between the read barcode and both + indexes is less than or equal to the value of this parameter. In the case of a single index, the + mismatches with the single index should be less than or equal to this parameter. -+ **`--keep-barcode`**: keep the barcode at the end of the demultiplexed read. +- **`--disable-illumina`**: Output reads' header in MGI format. + This option is to disable the default behaviour of the tool that outputs read files using Illumina format (for read headers and file naming). More details are below. 
-By default, the tool trims the barcode sequence at the end of the read sequence. This can be disabled using this flag and the demultiplexed reads will contain the barcode at the tail of the read2 for paired-read sequencing or the tail of read1 for single-read sequencing. +- **`--keep-barcode`**: keep the barcode at the end of the demultiplexed read. + By default, the tool trims the barcode sequence at the end of the read sequence. This can be disabled using this flag and the demultiplexed reads will contain the barcode at the tail of the read2 for paired-read sequencing or the tail of read1 for single-read sequencing. -+ **`--template`**: The general template of the index locations in the read barcode. details are in the sample sheet preparation. if all samples in the sample sheet use the same template, the general template can be passed as a parameter instead of having it in the sample sheet for each sample. The general template will be used for all samples with the combination of `--i7-rc` and `--i5-rc`. +- **`--template`**: The general template of the index locations in the read barcode. details are in the sample sheet preparation. if all samples in the sample sheet use the same template, the general template can be passed as a parameter instead of having it in the sample sheet for each sample. The general template will be used for all samples with the combination of `--i7-rc` and `--i5-rc`. -+ **`--i7-rc`**: Use the reverse complementary form of i7 when matching with the barcode. +- **`--i7-rc`**: Use the reverse complementary form of i7 when matching with the barcode. + This option should be used together with the general template and will be applied to all samples. -This option should be used together with the general template and will be applied to all samples. +- **`--i5-rc`**: Use the reverse complementary form of i5 when matching with the barcode. + This option should be used together with the general template and will be applied to all samples. 
-+ **`--i5-rc`**: Use the reverse complementary form of i5 when matching with the barcode. +- **`--lane`**: Lane number such as `L01`. + This parameter is used to provide the lane number when the parameter `-i` or `--input` is not + provided. The lane number is used for QC reports and it is mandatory when Illumina format is + requested for file naming. -This option should be used together with the general template and will be applied to all samples. +- **`--instrument`**: The id of the sequencing machine. + This parameter is used to provide the instrument id when the parameter `-i` or `--input` + is not provided. The parameter is mandatory when Illumina format is requested for read header and + file naming. -+ **`--lane`**: Lane number such as `L01`. +- **`--run`**: The run id. It is taken from BioInfo.csv as the date and time of starting the run. + This parameter is used to provide the run id when the parameter `-i` or `--input` is not provided. The parameter is mandatory when Illumina format is requested for read header and file naming. -This parameter is used to provide the lane number when the parameter `-i` or `--input` is not -provided. The lane number is used for QC reports and it is mandatory when Illumina format is -requested for file naming. +- **`--writing-buffer-size`**: The default value is `67108864`. The size of the buffer for each sample to be filled with data and then written once to the disk. Smaller buffers will need less memory but make the tool slower. Larger buffers need more memory. -+ **`--instrument`**: The id of the sequncing machine. +- **`--comprehensive-scan`**: Enable comprehensive scan. -This parameter is used to provide the instrument id when the parameter `-i` or `--input` -is not provided. The parameter is mandatory when Illumina format is requested for read header and -file naming. + This parameter is only needed when having a mixed library dataset (different locations for the indexes in the read barcode for some samples). 
+ The default behaviour of the tool is to stop comparing the barcodes with + samples’ indexes when it finds a match. This flag will force the tool to keep comparing with all other + samples to make sure that the read matches with only one sample. In a normal scenario, the read + should match with only one sample, however, there is a chance that the read matches with multiple + samples if the allowed number of mismatches is greater than the minimum Hamming distance + between the indexes, or the samples have different templates. -+ **`--run`**: The run id. It is taken from Bioinf.csv as the date and time of starting the run. +- **`--undetermined-label`**: The default value is `Undetermined`. The label of the file that contains the + undetermined reads which could not be assigned to any samples. -This parameter is used to provide the run id when the parameter `-i` or `--input` is not provided. The parameter is mandatory when Illumina format is requested for read header and file naming. +- **`--ambiguous-label`**: The default value is `ambiguous`. The label of the file that contains the ambiguous reads. + The ambiguous reads are the reads that can be assigned to multiple samples. This can happen when + the number of allowed mismatches is high. -+ **`--writing-buffer-size`**: The default value is `67108864`. The size of the buffer for each sample to be filled with data and then written once to the disk. Smaller buffers will need less memory but make the tool slower. Larger buffers need more memory. +- **`--report-limit`**: The number of barcodes to be reported in the list of undetermined and ambiguous barcodes for short/multiqc report. [default: 20] -+ **`--comprehensive-scan`**: Enable comperhansive scan. +- **`--r1-file-suf`**: The suffix of the read1 file name. When using the --input parameter, the tool looks for the file that ends with this suffix and uses it as the read1 file. There should be one file with this suffix in the input directory. 
[default: _read_1.fq.gz] -This parameter is only needed when having a mixed library dataset (different locations for the indexes in the read barcode for some samples). -The default behaviour of the tool is to stop comparing the barcodes with -samples’ indexes when it finds a match. This flag will force the tool to keep comparing with all other -samples to make sure that the read matches with only one sample. In a normal scenario, the read -should match with only one sample, however, there is a chance that the read matches with multiple -samples if the allowed number of mismatches is greater than the minimum hamming distance -between the indexes, or the samples have different templates. +- **`--r2-file-suf`**: The suffix to read2 file name. When using the --input parameter, the tool looks for the file that ends with this suffix and use it as read2 file. There should be one file with this suffix in the input directory. [default: _read_2.fq.gz] -+ **`--undetermined-label`**: The default value is `Undetermined`. The label of the file that contains the -undermined reads which could not be assigned to any samples. +- **`--info-file`**: The name of the info file that contains the run information. Only needed when using the `--input` parameter. [default: BioInfo.csv] -+ **`--ambiguous-label`**: The default value is `ambiguous`. The label of the file that contains the ambiguous reads. -The ambiguous reads are the reads that can be assigned to multiple samples. This can happen when -the number of allowed mismatches is high. +- **`--report-level`**: The level of reporting. 0 no reports will be generated!, 1 data quality and demultiplexing reports. 2: all reports (reports on data quality, demultiplexing, undetermined and ambiguous barcodes).[default: 2] -+ **`--report-limit`**: The number of barcodes to be reported in the list of undetermined and ambiguous barcodes for short/multiqc report. 
[default: 20] +- **`--compression-level`**: The level of compression (between 0 and 12). 0 is fast but no compression, 12 is slow but high compression. [default: 1] -+ **`--r1-file-suf`**: The suffix to read1 file name. When using the --input parameter, the tool looks for the file that ends with this suffix and use it as read1 file. There should be one file with this suffix in the input directory. [default: _read_1.fq.gz] +- **`--force`**: This flag forces the run and overwrites the existing output directory if it exists. -+ **`--r2-file-suf`**: The suffix to read2 file name. When using the --input parameter, the tool looks for the file that ends with this suffix and use it as read2 file. There should be one file with this suffix in the input directory. [default: _read_2.fq.gz] +- **`--flexible`**: By default, the tool will calculate the length of the first read and all of its parts and use this information in the analysis for a quicker determination of the read boundaries. The `--flexible` option makes the tool determine the read boundaries based on the `new line` character (`\n`). -+ **`--info-file`**: The name of the info file that contains the run information. Only needed when using the `--input` parameter. [default: BioInfo.csv] +- **`--ignore-undetermined`**: By default, the tool will stop if many reads were undetermined. Using this parameter makes the tool give a warning about this issue but keep demultiplexing. -+ **`--report-level`**: The level of reporting. 0 no reports will be generated!, 1 data quality and demultiplexing reports. 2: all reports (reports on data quality, demultiplexing, undetermined and ambiguous barcodes).[default: 2] +- **`--all-index-error`**: By default, the allowed mismatches `-m or --mismatches` are considered to be per index. This flag will make it for the total mismatches across all indices. -+ **`--compression-level`**: The level of compression (between 0 and 12). 0 is fast but no compression, 12 is slow but high compression. 
[default: 1] - -+ **`--force`**: this flag is to force the run and overwrite the existing output directory if exists. - -+ **`--flexible`**: By default, the tool will calulate the length of the first read and its all parts and use this information in the analysis for a quicker determination of the read boundaries. `--flexible` option, will make the tool determine the read boundaries based on the `new line` character (`\n`). - -+ **`--ignore-undetermined`**: By default, the tool will stop if many reads were undetermined. using this parameter, will make the tool give a warning one this issue but keep demultiplexing. - -+ **`--all-index-error`**: By default, the allowed mismatches `-m or --mismatches` are considered to be per index. This flag will make it for the total mismatches across all indices. - -+ **`--memory`**: The requested maximum memory to be used (in giga byte). Check the documentation for memory optimisation options. Default is 0 then the tool will use the available memory on the machine. - -+ **`--not-mgi`**: This flag needs to be enabled if the input fastq files don't have MGI format. +- **`--memory`**: The requested maximum memory to be used (in giga byte). Check the documentation for memory optimisation options. Default is 0 then the tool will use the available memory on the machine. +- **`--not-mgi`**: This flag needs to be enabled if the input fastq files don't have MGI format. ### Understanding input files @@ -151,112 +142,53 @@ The input fastq files can be provided to the tool in two ways: 1. Using `-f` and `-r` parameters which will be referring to the path to `R1` and `R2` respectively for paired-end or `-f` for single end fastq. -2. Using `-i` or `--input` parameter which refers to the path to the lane subdirectory in the sequencing output directory (or the directory that contains the fastq files if the data is obtained from somewhere else). 
In this case, the tool will search for the file that ends with `_read_1.fq.gz` and `_read_2.fq.gz` as forward and reverse reads respectively and if no reverse read file is found, the tool considers the run as a single end run. These suffixes can be also customised using the parameters (`--r1-file-suf` and `--r2-file-suf`). - -### Output fastq format - -**File naming** - -1. Illumina format (default format) - - The fils will have the following pattern `SAMPLEID_S{1-n}_L0{1,2,3,4}_R{1,2}_001.fastq.gz`. - - For example, `21-10233_S1_L01_R1_001.fastq.gz` and `21-10233_S1_L01_R2_001.fastq.gz`. - - - ![read-headers-figure](assets/file-naming.png) - - - Where: - - 1. Sample ULN. - - 2. Sample order in the sample sheet. - - 3. Lane number. - - 4. R1 and R2 for paired end sequencing, or R1 for single end sequencing. - - 5. The remaining part is fixed for all files. - -2. MGI format - -The files will have the following pattern `SAMPLEID_L0{1,2,3,4}_R{1,2}.fastq.gz`. - -For example, `21-10233_L01_R1.fastq.gz` and `21-10233_L01_R2.fastq.gz`. - - - -The lane number is an input parameter and `R1` and `R2` are for forward and reverse read. - -**Read header** - -1. Illumina format (default format) - -2. MGI format - -Please see the details of both headers and the conversion in the figure below: - -![read-headers-figure](assets/read-header.png) - -Illumina formatting requires Lane number, instrument id and run id. The three requirements can be provided using their parameters `--lane`, `--instrument`, and `--run` as described above. - -However, if the `--input` was used, the tools will look for `BioInfo.csv` file that is generated by MGI sequencers in the input directory and extract the instrument id and run id from it. -The run id will be the date and time of the run start ("YMDHmS" format). 
It will also check the second element in the name of the input fastq file (after splitting it by `_`) if it contains the substring `L0*`, if yes, the lane number will be taken from the fastq file name. - -If the input reads are passed using `-f` and `-r` parameters, mgikit will look for the file `BioInfo.csv` under the same directory as the read with barcodes (R1 for SE or R2 for PE). If found it will be used. - -The user can also pass the path of a file formatted in the same way as `BioInfo.csv` file using the parameter `--info-file`. if this path is passed, `instrument` and `run` will be extracted from this file. - -`--lane`, `--instrument`, and `--run` will be prioritised over the information in the `BioInfo.csv` file if these parameters were provided. +2. Using `-i` or `--input` parameter which refers to the path to the lane subdirectory in the sequencing output directory (or the directory that contains the fastq files if the data is obtained from somewhere else). In this case, the tool will search for the file that ends with `_read_1.fq.gz` and `_read_2.fq.gz` as forward and reverse reads respectively and if no reverse read file is found, the tool considers the run as a single end run. These suffixes can be also customised using the parameters (`--r1-file-suf` and `--r2-file-suf`). ### Sample sheet format and preparation. For the tool to perform demultiplexing, it needs to know the indexes of each sample to match them with the barcodes at the end of the read sequence as well as where to look for each index in the barcode. We refer to the location of the indexes within the barcode by the barcode template. For example This information can be provided to the tool in two different ways: -1. Sample sheet with all information about the indexes and their template. -This is the general way of passing this information to the tool and it works for all scenarios. -2. 
Sample sheet with information about sample indexes and a general template passed via a separate parameter. -This approach can be used if all samples have the same index template, which is the common scenario. +1. Sample sheet with all information about the indexes and their template. + This is the general way of passing this information to the tool and it works for all scenarios. +2. Sample sheet with information about sample indexes and a general template passed via a separate parameter. + This approach can be used if all samples have the same index template, which is the common scenario. -The sample sheet is a tab-delimited file that contains sample information. +The sample sheet is a tab-delimited or comma-delimited file that contains sample information. The tool will check if tab exists in the header, if not found, will check for comma and uses the first delimiter found. This file may contain all or some of these columns: -+ **`sample_id`** (**Mandatory**): a unique identifier that will be used to refer to the specific sample. - -+ **`i7`** (**Mandatory**): The nucleotide sequence for the i7 index for the associated sample. This will be used to demultiplex the reads by comparing it to the index found in the read barcode. +- **`sample_id`** (**Mandatory**): a unique identifier that will be used to refer to the specific sample. -+ **`i5`** (**Optional**): The nucleotide sequence for the i5 index for the associated sample. this will be used to demultiplex the reads by comparing it to the index found in the read barcode when using dual indexes. +- **`i7`** (**Mandatory**): The nucleotide sequence for the i7 index for the associated sample. This will be used to demultiplex the reads by comparing it to the index found in the read barcode. -+ **`template`** (**Optional**): This column should contain the template of the barcode for the specific sample. This allows doing demultiplexing for samples from different libraries where the templates are different. 
If all samples have the same template, this column can be ignored, and a general template should be passed in a separate parameter. See `--template` parameter. More details are below. +- **`i5`** (**Optional**): The nucleotide sequence for the i5 index for the associated sample. this will be used to demultiplex the reads by comparing it to the index found in the read barcode when using dual indexes. -+ **`i7_rc`** (**Optional**): Takes values from 0 or 1. If the value is 0, the i7 in the sample sheet will be used as is to compare with the read barcode. If the value is 1, the tool will compare the index found in the read barcode to the reverse complementary of i7 in the sample sheet. If the template was not provided in the sample sheet (general template is used), this parameter will be ignored, and the user has to provide this parameter (`--i7-rc`) separately. +- **`template`** (**Optional**): This column should contain the template of the barcode for the specific sample. This allows doing demultiplexing for samples from different libraries where the templates are different. If all samples have the same template, this column can be ignored, and a general template should be passed in a separate parameter. See `--template` parameter. More details are below. -+ **`i5_rc`** (**Optional**): The same as i7_rc but applies to the i5 index. +- **`i7_rc`** (**Optional**): Takes values from 0 or 1. If the value is 0, the i7 in the sample sheet will be used as is to compare with the read barcode. If the value is 1, the tool will compare the index found in the read barcode to the reverse complementary of i7 in the sample sheet. If the template was not provided in the sample sheet (general template is used), this parameter will be ignored, and the user has to provide this parameter (`--i7-rc`) separately. -+ **`job_number`** (**Optional**): It is an id to group the samples that are from the same project for the cases when a run contains samples from multiple projects. 
The demultiplexer will generate demultiplexing and quality reports for each project and the whole run. It can be ignored if the run has samples for the same project or if the project-based reports are not needed. +- **`i5_rc`** (**Optional**): The same as i7_rc but applies to the i5 index. +- **`job_number`** (**Optional**): It is an id to group the samples that are from the same project for the cases when a run contains samples from multiple projects. The demultiplexer will generate demultiplexing and quality reports for each project and the whole run. It can be ignored if the run has samples for the same project or if the project-based reports are not needed. **Barcode template** To understand how to use the demultiplexing tool, it is important to understand the structure of the input data and how to provide the correct parameters for the analysis. -The sequenced reads obtained by the MGI sequencing machine contain a string of nucleotides at the tail of read2 for paired-end sequencing or the tail of read1 for single-end sequencing. This substring is referred to as the read barcode which contains the indexes of the samples, single (i7) or dual (i7 and i5) indexes. It also includes the Unique Molecular Identifier (UMI) in some cases. +The sequenced reads obtained by the MGI sequencing machine contain a string of nucleotides at the tail of read2 for paired-end sequencing or the tail of read1 for single-end sequencing. This substring is referred to as the read barcode which contains the indexes of the samples, single (i7) or dual (i7 and i5) indexes. It also includes the Unique Molecular Identifier (UMI) in some cases. The demultiplexer tool looks at this read barcode and tries to match the indexes to a subsequence within the barcode to assign the read to a specific individual. In order to accomplish that, the tool needs to know where to look at the barcode to match with the index and from where to extract the UMI. 
This information is provided to the tool through the template parameter. The template parameter is a combination of four possible components of (i7*, i5*, um*, --*) separated by a colon `:`. Where: -+ `i7`: the region where to expect the index i7 within the barcode. -+ `i5`: the region where to expect the index i5 within the barcode -+ `um`: the region where to expect the UMI within the barcode -+ `--`: a discarded region that is not used. -+ `*` : should be replaced by a number representing the length of the relevant index or UMI. +- `i7`: the region where to expect the index i7 within the barcode. +- `i5`: the region where to expect the index i5 within the barcode +- `um`: the region where to expect the UMI within the barcode +- `--`: a discarded region that is not used. +- `*` : should be replaced by a number representing the length of the relevant index or UMI. - -*Examples of templates* +_Examples of templates_ **Example 1:** The template i58:um8:i78 means: @@ -270,8 +202,6 @@ The template parameter is a combination of four possible components of (i7*, i5* ![template-example-1](assets/template-example-1.png) - - **Example 2:** The template --2:i58:--2:i78 means: 1. The length of the barcode at the end of read2 (paired-end) or at the end of read1 (single-end) is 20 bp (2 + 8 + 2 + 8). @@ -286,7 +216,6 @@ The template parameter is a combination of four possible components of (i7*, i5* ![template-example-2](assets/template-example-2.png) - Note that in this example the direction of the arrows is to show that these indexes are reverse complementary in the reads, therefore, this should be accounted for when demultiplexing using other parameters as explained in this documentation later. **Example 3:** The template --2:i78 means: @@ -299,25 +228,82 @@ Note that in this example the direction of the arrows is to show that these inde 4. There is no i5 in this template as it is a single index run. 
- After demultiplexing, the barcode will be trimmed by default including all parts mentioned in the template. This can be disabled using the parameter `--keep-barcode`. -Templates and indexes forms can be provided by the user, however, the command `template` can detect the barcode template and the form of the indexes for the run. +Templates and indexes forms can be provided by the user, however, the command `template` can detect the barcode template and the form of the indexes for the run. + +### Understanding output files + +#### Samples fastq + +The output fastq files can be either paired-end or single-end files for each sample in the run if they have matching reads. Reads in the input fastq file that don't match with any sample in the sample sheet will be written to the `Undetermined` files. Reads that match multiple samples with the same mismatch rate will be written in the `Ambiguous` files. The undetermined and ambiguous files are kept in MGI format, while the sample files can be in either MGI or Illumina format (explained below) depending on the demultiplexing command. + +**Output fastq format** + +**File naming** + +1. Illumina format (default format) + + The files will have the following pattern `SAMPLEID_S{1-n}_L0{1,2,3,4}_R{1,2}_001.fastq.gz`. + + For example, `21-10233_S1_L01_R1_001.fastq.gz` and `21-10233_S1_L01_R2_001.fastq.gz`. + + ![read-headers-figure](assets/file-naming.png) + + Where: + 1. Sample ULN. -### Reports {#demultipexing-reports-section} + 2. Sample order in the sample sheet. + + 3. Lane number. + + 4. R1 and R2 for paired end sequencing, or R1 for single end sequencing. + + 5. The remaining part is fixed for all files. + +2. MGI format + +The files will have the following pattern `SAMPLEID_L0{1,2,3,4}_R{1,2}.fastq.gz`. + +For example, `21-10233_L01_R1.fastq.gz` and `21-10233_L01_R2.fastq.gz`. + +The lane number is an input parameter and `R1` and `R2` are for forward and reverse read. + +**Read header** + +1. 
Illumina format (default format) + +2. MGI format + +Please see the details of both headers and the conversion in the figure below: + +![read-headers-figure](assets/read-header.png) + +Illumina formatting requires Lane number, instrument id and run id. The three requirements can be provided using their parameters `--lane`, `--instrument`, and `--run` as described above. + +However, if the `--input` was used, the tools will look for `BioInfo.csv` file that is generated by MGI sequencers in the input directory and extract the instrument id and run id from it. +The run id will be the date and time of the run start ("YMDHmS" format). It will also check the second element in the name of the input fastq file (after splitting it by `_`) if it contains the substring `L0*`, if yes, the lane number will be taken from the fastq file name. + +If the input reads are passed using `-f` and `-r` parameters, mgikit will look for the file `BioInfo.csv` under the same directory as the read with barcodes (R1 for SE or R2 for PE). If found it will be used. + +The user can also pass the path of a file formatted in the same way as `BioInfo.csv` file using the parameter `--info-file`. if this path is passed, `instrument` and `run` will be extracted from this file. + +`--lane`, `--instrument`, and `--run` will be prioritised over the information in the `BioInfo.csv` file if these parameters were provided. + +#### Reports {#demultipexing-reports-section} The demultiplex command generates multiple reports with file names that start with the flowcell and lane being demultiplexed. a MultiQC hitm report can be generated from these reports using [mgikit-multiqc](https://github.com/sagc-bioinformatics/mgikit-multiqc) plugin as described at the plugin [repository](https://github.com/sagc-bioinformatics/mgikit-multiqc). 1. `flowcell.L0*.mgikit.info` -This report contains the number of reads per sample respectively to each possible mismatch. 
It has (2 + allowed mismatches during demultiplexing) columns. +This report contains the number of reads per sample respectively to each possible mismatch. For example: | **sample** | **0-mismatches** | **1-mismatches** | -|:------------:|:------------------:|:------------------:| -| S01 | 3404 | 5655 | +| :--------: | :--------------: | :--------------: | +| S01 | 3404 | 5655 | This means that there was only one mismatch allowed during this execution and the sample S01 has 3404 reads with indexes matching perfectly and 5655 reads with indexes that differ by 1 base compared to the indexes provided in the sample sheet. @@ -327,8 +313,31 @@ This file is used for the mgikit plugin to visualise quality control reports thr This file contains summary information related to the cluster count and quality scores, summarised for each sample as well as at the whole lane scale. This file is used for the mgikit plugin to visualise quality control reports through MultiQC. +**Report content:** + +- **Lane statistics columns** + + 1. `Run ID-Lane`: Run ID and lane number. + 2. `Mb Total Yield`: total number of bases in a million. + 3. `M Total Clusters`: total number of reads in million. + 4. `% bases ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases. + 5. `Mean Quality`: The average quality score for the bases. + 6. `% Perfect Index`: The percentage of reads with perfectly matching indices of all reads. + +- **Sample general info** + + 1. `Sample ID`: sample ID taken from the sample sheet. + 2. `M Clusters`: total number of reads in million. + 3. `Mb Yield ≥ Q30`: total number of bases with a quality score greater than 30 in million. + 4. `% R1 Yield ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases calculated only for forward reads. + 5. `% R2 Yield ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases calculated only for reverse reads. + 6. 
`% R3 Yield ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases calculated only for indices. + 7. `% Perfect Index`: The percentage of reads with perfectly matching indices of all reads. + 3. `flowcell.L0*.mgikit.sample_stats` +This file contains the information in the above-mentioned reports but in a simple format. This is used to merge the reports from multiple lanes into one report for the whole run. + 4. `flowcell.L0*.mgikit.undetermined_barcode.complete` This report contains the undetermined barcodes including their frequency. @@ -347,7 +356,7 @@ This report contains the top 50 frequent barcodes from the above report (6). Thi The first three reports must be generated for each run. It is unlikely that the fourth and fifth reports will not be generated as usually there should be some undetermined reads in the run. It is highly likely that the sixth and seventh reports will not be generated. If they are generated, it is recommended to make sure that the input sample sheet does not have issues and that the allowed mismatches are less than the minimal Hamming distance between samples. -#### Generat MultiQC report from mgikit reports +### Generate MultiQC report from mgikit reports In order to generate a [multiqc](https://multiqc.info/) report from mgikit reports, multiqc needs to be installed. @@ -367,7 +376,7 @@ multiqc mgikit-examples/test/ ### Performance evaluation -Performance time (in minutes) evaluation and comparison on different datasets. +Performance time (in minutes) evaluation and comparison on different datasets. DS01 and DS04 are 10 bp dual index, DS02 and DS3 are 8 bp dual index and DS05 is 8 bp single index. In the case of single-end, the R2 file of the dataset is used alone for demultiplexing. @@ -464,28 +473,27 @@ In the case of single-end, the R2 file of the dataset is used alone for demultip The default parameters of the tool are optimised to achieve high performance. 
The majority of the memory needed is allocated for output buffering to reduce writing-to-disk operations. -The expected memory usage is influenced by three main factors, +The expected memory usage is influenced by four main factors: 1. Number of samples in the sample sheet. 2. Writing buffer size (`--writing-buffer-size` parameter, default is `67108864`). 3. Compression buffer size (`--compression-buffer-size` parameter, default is `131072`). 4. Single-end or paired-end input data. -The expected allocated memory is +The expected allocated memory is: -+ **Single-end input**: `number of samples * (writing buffer size + 2 * compression buffer size)`. +- **Single-end input**: `number of samples * (writing buffer size + 2 * compression buffer size)`. -+ **Paired-end input**: `2 * number of samples * (writing buffer size + 2 * compression buffer size)`. +- **Paired-end input**: `2 * number of samples * (writing buffer size + 2 * compression buffer size)`. When using the default parameters: -+ **Single-end input**: `number of smaples * 64.25 MB`. +- **Single-end input**: `number of samples * 64.25 MB`. -+ **Paired-end input**: `2 * number of smaples 64.25 MB`. +- **Paired-end input**: `2 * number of samples * 64.25 MB`. Reducing the writing buffer size will reduce the required memory but also affect the performance time. - ### Testing datasets We have attached a simple python script to generate paired-end fastq files. The script is available under [`mgikit/testing_data/generate_fastq/`](https://github.com/sagc-bioinformatics/mgikit/tree/main/testing_data/generate_fastq). You can use this script to generate large fastq files with random content for testing as described in the readme file under the directory. @@ -496,7 +504,6 @@ You can use the datasets at `testing_data` to perform these tests. **1. 
Demultiplexing a run with dual indexes (i7 and i5)** - ```bash target/release/mgikit demultiplex \ -i testing_data/input/ds01/L01/ \ @@ -575,4 +582,3 @@ target/release/mgikit demultiplex \ **6. demultiplexing a run with samples from multiple projects** The main difference here is that the sample sheet contains a column that links each sample to a specific project. The main difference in the output is that the tool will generate reports for the samples of each project. - diff --git a/docs/pages/mgikit-multiqc.md b/docs/pages/mgikit-multiqc.md index aa616bf..6a6f088 100755 --- a/docs/pages/mgikit-multiqc.md +++ b/docs/pages/mgikit-multiqc.md @@ -7,17 +7,18 @@ type: guides --- ## mgikit Reports + The demultiplex command generates multiple reports with file names that start with the flowcell and lane being demultiplexed. a MultiQC hitm report can be generated from these reports using [mgikit-multiqc](https://github.com/sagc-bioinformatics/mgikit-multiqc) plugin as described at the plugin [repository](https://github.com/sagc-bioinformatics/mgikit-multiqc). 1. `flowcell.L0*.mgikit.info` -This report contains the number of reads per sample respectively to each possible mismatch. It has (2 + allowed mismatches during demultiplexing) columns. +This report contains the number of reads per sample respectively to each possible mismatch. For example: | **sample** | **0-mismatches** | **1-mismatches** | -|:------------:|:------------------:|:------------------:| -| S01 | 3404 | 5655 | +| :--------: | :--------------: | :--------------: | +| S01 | 3404 | 5655 | This means that there was only one mismatch allowed during this execution and the sample S01 has 3404 reads with indexes matching perfectly and 5655 reads with indexes that differ by 1 base compared to the indexes provided in the sample sheet. 
@@ -27,8 +28,31 @@ This file is used for the mgikit plugin to visualise quality control reports thr This file contains summary information related to the cluster count and quality scores, summarised for each sample as well as at the whole lane scale. This file is used for the mgikit plugin to visualise quality control reports through MultiQC. +**Report content:** + +- **Lane statistics columns** + + 1. `Run ID-Lane`: Run ID and lane number. + 2. `Mb Total Yield`: total number of bases in a million. + 3. `M Total Clusters`: total number of reads in million. + 4. `% bases ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases. + 5. `Mean Quality`: The average quality score for the bases. + 6. `% Perfect Index`: The percentage of reads with perfectly matching indices of all reads. + +- **Sample general info** + + 1. `Sample ID`: sample ID taken from the sample sheet. + 2. `M Clusters`: total number of reads in million. + 3. `Mb Yield ≥ Q30`: total number of bases with a quality score greater than 30 in million. + 4. `% R1 Yield ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases calculated only for forward reads. + 5. `% R2 Yield ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases calculated only for reverse reads. + 6. `% R3 Yield ≥ Q30`: percentage of bases with a quality score greater than 30 of all bases calculated only for indices. + 7. `% Perfect Index`: The percentage of reads with perfectly matching indices of all reads. + 3. `flowcell.L0*.mgikit.sample_stats` +This file contains the information in the above-mentioned reports but in a simple format. This is used to merge the reports from multiple lanes into one report for the whole run. + 4. `flowcell.L0*.mgikit.undetermined_barcode.complete` This report contains the undetermined barcodes including their frequency. 
diff --git a/docs/pages/reformat.md b/docs/pages/reformat.md index e1221e6..c47bf58 100755 --- a/docs/pages/reformat.md +++ b/docs/pages/reformat.md @@ -14,69 +14,67 @@ This command should be used for each sample separately (either paired-end or sin ## Command arguments -+ **`-f or --read1`**: the path to the forward reads fastq file for both paired-end and single-end input data. +- **`-f or --read1`**: the path to the forward reads fastq file for both paired-end and single-end input data. -+ **`-r or --read2`**: the path to the reverse reads fastq file. +- **`-r or --read2`**: the path to the reverse reads fastq file. -+ **`-i or --input`**: the path to the directory that contains the input fastq files. +- **`-i or --input`**: the path to the directory that contains the input fastq files. -Either `-i` or `-f/-r`, `-f` should be provided for a run. +{% include callout.html type="note" content="Either `-i` or `-f/-r`, `-f` should be provided for a run." %} -+ **`-o or --output`**: The path the output directory. +- **`-o or --output`**: The path the output directory. -The tool will create the directory if it does not exist -or overwrite the content if the directory exists and the parameter `--force` is used. The tool will exit -with an error if the directory exists, and `--force` is not used. If this parameter is not provided, the tools -will create a directory (in the working directory) with a name based on the date and time -of the run as follows `mgiKit_Y-m-dTHMS`. where `Y`, `m`, `d`, `H`, `M`, and `S` are the date and time format. + The tool will create the directory if it does not exist + or overwrite the content if the directory exists and the parameter `--force` is used. The tool will exit + with an error if the directory exists, and `--force` is not used. If this parameter is not provided, the tools + will create a directory (in the working directory) with a name based on the date and time + of the run as follows `mgiKit_Y-m-dTHMS`. 
where `Y`, `m`, `d`, `H`, `M`, and `S` are the date and time format. -+ **`--reports`**: The path of the output reports directory. +- **`--reports`**: The path of the output reports directory. -By default, the tool writes the files of the run reports in the same output directory as the -demultiplexed fastq files (`-o` or `--output` parameter). This parameter is used to write the reports in -a different folder as specified with this parameter. + By default, the tool writes the files of the run reports in the same output directory as the -+ **`--lane`**: Lane number such as `L01`. + demultiplexed fastq files (`-o` or `--output` parameter). This parameter is used to write the reports in + a different folder as specified with this parameter. -This parameter is used to provide the lane number when the parameter `-i` or `--input` is not -provided. The lane number is used for QC reports and it is mandatory when Illumina format is -requested for file naming. +- **`--lane`**: Lane number such as `L01`. -+ **`--instrument`**: The id of the sequncing machine. + This parameter is used to provide the lane number when the parameter `-i` or `--input` is not -This parameter is used to provide the instrument id when the parameter `-i` or `--input` -is not provided. The parameter is mandatory when Illumina format is requested for read header and -file naming. + provided. The lane number is used for QC reports and it is mandatory when Illumina format is requested for file naming. -+ **`--run`**: The run id. It is taken from Bioinf.csv as the date and time of starting the run. +- **`--instrument`**: The id of the sequencing machine. -This parameter is used to provide the run id when the parameter `-i` or `--input` is not provided. The parameter is mandatory when Illumina format is requested for read header and file naming. + This parameter is used to provide the instrument id when the parameter `-i` or `--input` is not provided. 
The parameter is mandatory when Illumina format is requested for read header and file naming. -+ **`--writing-buffer-size`**: The default value is `67108864`. The size of the buffer for each sample to be filled with data then written once to the disk. Smaller buffers will need less memory but makes the tool slower. Largeer buffers need more memory. +- **`--run`**: The run id. It is taken from BioInfo.csv as the date and time of starting the run. -+ **`--compression-level`**: The level of compression (between 0 and 12). 0 is fast but no compression, 12 is slow but high compression. [default: 1] + This parameter is used to provide the run id when the parameter `-i` or `--input` is not provided. The parameter is mandatory when Illumina format is requested for read header and file naming. -+ **`--force`**: this flag is to force the run and overwrite the existing output directory if exists. +- **`--writing-buffer-size`**: The default value is `67108864`. The size of the buffer for each sample to be filled with data then written once to the disk. Smaller buffers will need less memory but make the tool slower. Larger buffers need more memory. -+ **`--flexible`**: By default, the tool will calculate the length of the first read and its all parts and use this information in the analysis for a quicker determination of the read boundaries. `--flexible` option, will make the tool determine the read boundaries based on the `new line` character (`\n`). +- **`--compression-level`**: The level of compression (between 0 and 12). 0 is fast but no compression, 12 is slow but high compression. [default: 1] -+ **`--info-file`**: The name of the info file that contains the run information. Only needed when using the `--input` parameter. [default: BioInfo.csv] +- **`--force`**: this flag is to force the run and overwrite the existing output directory if exists. -+ **`--disable-illumina`**: reads will be left as is and only quality reports will be generated. 
+- **`--flexible`**: By default, the tool will calculate the length of the first read and its all parts and use this information in the analysis for a quicker determination of the read boundaries. `--flexible` option, will make the tool determine the read boundaries based on the `new line` character (`\n`). -+ **`--umi-length`**: The length of UMI expected at the end of the read (r1 for single-end, or r2 for paired-end) [Default: 0]. +- **`--info-file`**: The name of the info file that contains the run information. Only needed when using the `--input` parameter. [default: BioInfo.csv] -+ **`--report-level`**: The level of reporting. 0 no reports will be generated, 1 data quality and demultiplexing reports. 2: all reports (reports on data quality, demultiplexing, undetermined and ambiguous barcodes).[default: 2] +- **`--disable-illumina`**: reads will be left as is and only quality reports will be generated. -+ **`--sample-index`**: The index of the sample in the sample sheet. It is required for file naming. [default: 1] +- **`--umi-length`**: The length of UMI expected at the end of the read (r1 for single-end, or r2 for paired-end) [Default: 0]. + +- **`--report-level`**: The level of reporting. 0 no reports will be generated, 1 data quality and demultiplexing reports. 2: all reports (reports on data quality, demultiplexing, undetermined and ambiguous barcodes).[default: 2] + +- **`--sample-index`**: The index of the sample in the sample sheet. It is required for file naming. [default: 1] + +- **`--barcode`**: The barcode of the specific sample to calculate the mismatches for the reports. If not provided, no mismatches will be calculated. -+ **`--barcode`**: The barcode of the specific sample to calculate the mismatches for the reports. If not provided, no mismatches will be calculated. - ## Usage Examples **1. 
Demultiplexing a run with dual indexes (i7 and i5)** - ```bash target/release/mgikit reformat \ -f testing_data/input/extras_test/FC01_L01_sample1_1.fq.gz \ @@ -85,4 +83,3 @@ target/release/mgikit reformat \ --sample-index 1 \ --info-file testing_data/input/extras_test/BioInfo.csv ``` - diff --git a/docs/pages/report.md b/docs/pages/report.md index 292cfc4..af9966d 100755 --- a/docs/pages/report.md +++ b/docs/pages/report.md @@ -15,11 +15,8 @@ if the run has multiple lanes, there will be lane-specific reports. The reports ## Command arguments -+ **`--qc-report`**: The path to the QC report, you can add multiple paths by reusing the same parameter. For example, `--qc-report file1 --qc-report file2`. This argument takes multiple values and is mandatory. The tool expects here the reports generated for each lane in the run and you also can combine the reports generated from multiple runs for the same samples. - +- **`--qc-report`**: The path to the QC report, you can add multiple paths by reusing the same parameter. For example, `--qc-report file1 --qc-report file2`. This argument takes multiple values and is mandatory. The tool expects here the reports generated for each lane in the run and you also can combine the reports generated from multiple runs for the same samples. -+ **`-o or --output`**: The path and prefix of output files. The tools will create two files at the same path with the same prefix and end with `.info` and `.general`. +- **`-o or --output`**: The path and prefix of output files. The tools will create two files at the same path with the same prefix and end with `.info` and `.general`. ## Usage Examples - - diff --git a/docs/pages/template.md b/docs/pages/template.md index e57b5f8..07de215 100755 --- a/docs/pages/template.md +++ b/docs/pages/template.md @@ -6,33 +6,32 @@ toc: true type: guides --- -This command is used to detect the location and form of the indexes within the read barcode. 
It simply goes through a small number of the reads and investigates the number of matches with the indexes in the sample sheet within each possible location in the read barcode and considers the indexes as is and their reverse complementary. +This command is used to detect the location and form of the indexes within the read barcode. It simply goes through a small number of the reads and investigates the number of matches with the indexes in the sample sheet within each possible location in the read barcode and considers the indexes as is and their reverse complementary. -It reports matches for all possible combinations and uses the read template that had the maximum number of matches. This process happens for each sample individually and therefore, the best matching template for each sample will be reported. +It reports matches for all possible combinations and uses the read template that had the maximum number of matches. This process happens for each sample individually and therefore, the best matching template for each sample will be reported. -Using this comprehensive scan, the tool can detect the templates for mixed libraries. +Using this comprehensive scan, the tool can detect the templates for mixed libraries. ## Parameters **Fastq input file** -+ **`-f or --read1`**: the path to the forward reads fastq file for both paired-end and single-end input data. +- **`-f or --read1`**: the path to the forward reads fastq file for both paired-end and single-end input data. -+ **`-r or --read2`**: the path to the reverse reads fastq file. +- **`-r or --read2`**: the path to the reverse reads fastq file. -+ **`-s or --sample-sheet`**: the path to the sample sheet file. +- **`-s or --sample-sheet`**: the path to the sample sheet file. -This is the same format as above, but only sample_id and i7 are required. i5 is required for dual indexes data. + This is the same format as above, but only sample_id and i7 are required. i5 is required for dual indexes data. 
-+ **`-o or --output`**: The path and prefix of output files. The tools will create two files at the same path with the same prefix and end with `_template.tsv` and `_details.tsv`. +- **`-o or --output`**: The path and prefix of output files. The tools will create two files at the same path with the same prefix and end with `_template.tsv` and `_details.tsv`. -+ **`--testing-reads`**: The number of reads to be investigated to check and detect the templates. The default is 5,000 reads. A Larger number increases the performance time. +- **`--testing-reads`**: The number of reads to be investigated to check and detect the templates. The default is 5,000 reads. A Larger number increases the performance time. -+ **`--barcode-length`**: The length of the read barcode at the end of the read2 in paired-end or read1 in single end to be investigated. By default, the barcode length is set to be the length difference between read2 and read1. +- **`--barcode-length`**: The length of the read barcode at the end of the read2 in paired-end or read1 in single end to be investigated. By default, the barcode length is set to be the length difference between read2 and read1. -+ **`--no-umi`**: If the barcode contains extra base pairs other than the indexes, the tool considers the longest as an umi. If this parameter is enabled, the tool will ignore all extra base pairs in the barcode and trim them from the read. +- **`--no-umi`**: If the barcode contains extra base pairs other than the indexes, the tool considers the longest as an umi. If this parameter is enabled, the tool will ignore all extra base pairs in the barcode and trim them from the read. -+ **`--popular-template`**: by default, the tool reports the template that matches the maximum number of reads to each corresponding sample. If this option is enabled, the tool will use the most frequent template across all samples as the final template for all samples. 
- -+ **`--max-umi-length`**: if barcode length is not provided, the tool will set the barcode length to the length difference between read2 and read1. If the barcode length is greater than the sum of indexes lengths and this parameter, the tool will stop. The default is 10 bp. You can disable this parameter by either providing a large number or providing the barcode length (`--barcode-length`) parameter manually. +- **`--popular-template`**: by default, the tool reports the template that matches the maximum number of reads to each corresponding sample. If this option is enabled, the tool will use the most frequent template across all samples as the final template for all samples. +- **`--max-umi-length`**: if barcode length is not provided, the tool will set the barcode length to the length difference between read2 and read1. If the barcode length is greater than the sum of indexes lengths and this parameter, the tool will stop. The default is 10 bp. You can disable this parameter by either providing a large number or providing the barcode length (`--barcode-length`) parameter manually. 
diff --git a/src/main.rs b/src/main.rs index 72591aa..af38c4d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -528,6 +528,7 @@ fn main() { ) ) + .arg_required_else_help(true) .get_matches(); diff --git a/src/samplesheet_utils.rs b/src/samplesheet_utils.rs index 90ac869..8eae99c 100755 --- a/src/samplesheet_utils.rs +++ b/src/samplesheet_utils.rs @@ -1,6 +1,6 @@ - use crate::variables::*; use crate::sequence_utils::*; +use core::panic; use std::io; use std::fs; use std::collections::{HashMap, HashSet}; @@ -107,14 +107,21 @@ pub fn parse_sample_index( let mut unique_sample_id: Vec = Vec::new(); let mut dup_ids = 0; let mut curr_unique_id; + let mut delimiter = '\t'; for line in lines { //println!("{}", line); if line.len() < 5 { continue; } - if header.len() == 0{ header = line.to_lowercase().split('\t').map(|x| x.trim().to_string()).collect(); + if header.len() < 2{ + header = line.to_lowercase().split(',').map(|x| x.trim().to_string()).collect(); + delimiter = ','; + if header.len() < 2{ + panic!("Sample sheet columns should be separated by ',' or '\t'!"); + } + } if template == "" && ! 
header.contains(&String::from("template")) { panic!("Template should be provided either as a general template or within the index/sample file!"); } @@ -136,7 +143,7 @@ pub fn parse_sample_index( } } }else{ - let vals: Vec = line.split('\t').map(|x| x.trim().to_string()).collect(); + let vals: Vec = line.split(delimiter).map(|x| x.trim().to_string()).collect(); curr_sample_info = Vec::with_capacity(7) ; if curr_sample_id == usize::MAX { @@ -420,7 +427,7 @@ pub fn read_sample_sheet_into_dic( let mut curr_i7: usize = usize::MAX; let mut curr_i5: usize = usize::MAX; let mut i5_val: String; - + let mut delimiter = '\t'; for line in lines { if line.len() < 5 { continue; @@ -428,6 +435,13 @@ pub fn read_sample_sheet_into_dic( if header.len() == 0{ header = line.to_lowercase().split('\t').map(|x| x.trim().to_string()).collect(); + if header.len() < 2{ + header = line.to_lowercase().split(',').map(|x| x.trim().to_string()).collect(); + delimiter = ','; + if header.len() < 2{ + panic!("Sample sheet columns should be separated by ',' or '\t'!"); + } + } for header_itr in 0..header.len(){ if header[header_itr] == "sample_id"{ curr_sample_id = header_itr; @@ -447,7 +461,7 @@ pub fn read_sample_sheet_into_dic( } }else{ - let vals: Vec = line.split('\t').map(|x| x.trim().to_string()).collect(); + let vals: Vec = line.split(delimiter).map(|x| x.trim().to_string()).collect(); if vals[curr_i7] == "." 
|| vals[curr_i7].len() < 3 { panic!("i7 ({}) should be longer than 3 chars!", vals[curr_i7]); } diff --git a/testing_data/expected/ds01/sample_sheet_expected.csv b/testing_data/expected/ds01/sample_sheet_expected.csv new file mode 100644 index 0000000..781f286 --- /dev/null +++ b/testing_data/expected/ds01/sample_sheet_expected.csv @@ -0,0 +1,5 @@ +sample_id,i7,i5,job_number,template,i7_rc,i5_rc +Sample01,ACGGGCAT,GGCTAGAG,.,i78:i58,0,0 +Sample02,ACGCTAAT,GGCTAGAG,.,i78:i58,0,0 +Sample03,ACGCCCAT,GGCCAGAG,.,i78:i58,0,0 +Sample04,GGGTCGAT,GGGGACTA,.,i78:i58,0,0 diff --git a/tests/integration_test.rs b/tests/integration_test.rs index c08f911..778756b 100755 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -196,7 +196,8 @@ fn testing_demultiplex() { let read1_file_path : String = String::from(format!("testing_data/input/ds0{}/L01/FC0{}_L01_read_1.fq.gz", ds_itr_in, ds_itr_fc)); let read2_file_path : String = String::from(format!("testing_data/input/ds0{}/L01/FC0{}_L01_read_2.fq.gz", ds_itr_in, ds_itr_fc)); - let sample_sheet_file_path : String = String::from(format!("testing_data/expected/ds0{}/sample_sheet_expected.tsv", ds_itr_ex)); + let ext = if ds_itr_tmp == 1 {"csv"} else {"tsv"}; + let sample_sheet_file_path : String = String::from(format!("testing_data/expected/ds0{}/sample_sheet_expected.{ext}", ds_itr_ex)); let lane = String::from("L01"); let mut instrument = String::from("instrument_1"); let mut run = String::from("20231212");