Merge pull request #170 from timothymillar/sample-pools

Version 0.9.1
PlantandFoodResearch · Dec 12, 2023 · 92cb214 · 92cb214
2 parents 749dd03 + 99af0bc
commit 92cb214
Show file tree

Hide file tree

Showing 51 changed files with 444 additions and 64 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@
 ## Unreleased 
 
 
+## Beta v0.9.1
+
+New Features:
+- Allow complex sample pooling via a tabular file
+
+
 ## Beta v0.9.0
 
 New Features:

diff --git a/Dockerfile b/Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get update \
 
 RUN git clone  https://github.com/PlantandFoodResearch/MCHap.git   \
     && cd MCHap \
-    && git checkout v0.9.0 \
+    && git checkout v0.9.1 \
     && pip install -r requirements.txt \
     && python3 setup.py sdist \
     && python3 -m pip install dist/mchap-*tar.gz

diff --git a/cli-assemble-help.txt b/cli-assemble-help.txt
@@ -72,8 +72,20 @@ options:
                         identifier and the inbreeding coefficient of that
                         sample separated by a tab.
   --sample-pool SAMPLE_POOL
-                        A name used to pool all sample reads into a single
-                        sample. WARNING: this is an experimental feature.
+                        WARNING: this is an experimental feature!!! Pool
+                        samples together into a single genotype. This may be
+                        (1) the name of a single pool for all samples or (2) a
+                        file containing a list of all samples and their
+                        assigned pool. If option (2) is used then each line of
+                        the plaintext file must contain a single sample
+                        identifier and the name of a pool separated by a
+                        tab.Samples may be assigned to multiple pools by using
+                        the same sample name on multiple lines.Each pool will
+                        treated as a single genotype by combining all reads
+                        from its constituent samples. Note that the pool names
+                        should be used in place of the samples names when
+                        assigning other per-sample parameters such as ploidy
+                        or inbreeding coefficients.
   --reference REFERENCE
                         Indexed fasta file containing the reference genome.
   --base-error-rate BASE_ERROR_RATE

diff --git a/cli-call-exact-help.txt b/cli-call-exact-help.txt
@@ -59,8 +59,20 @@ options:
                         identifier and the inbreeding coefficient of that
                         sample separated by a tab.
   --sample-pool SAMPLE_POOL
-                        A name used to pool all sample reads into a single
-                        sample. WARNING: this is an experimental feature.
+                        WARNING: this is an experimental feature!!! Pool
+                        samples together into a single genotype. This may be
+                        (1) the name of a single pool for all samples or (2) a
+                        file containing a list of all samples and their
+                        assigned pool. If option (2) is used then each line of
+                        the plaintext file must contain a single sample
+                        identifier and the name of a pool separated by a
+                        tab.Samples may be assigned to multiple pools by using
+                        the same sample name on multiple lines.Each pool will
+                        treated as a single genotype by combining all reads
+                        from its constituent samples. Note that the pool names
+                        should be used in place of the samples names when
+                        assigning other per-sample parameters such as ploidy
+                        or inbreeding coefficients.
   --reference REFERENCE
                         Indexed fasta file containing the reference genome.
   --base-error-rate BASE_ERROR_RATE

diff --git a/cli-call-help.txt b/cli-call-help.txt
@@ -63,8 +63,20 @@ options:
                         identifier and the inbreeding coefficient of that
                         sample separated by a tab.
   --sample-pool SAMPLE_POOL
-                        A name used to pool all sample reads into a single
-                        sample. WARNING: this is an experimental feature.
+                        WARNING: this is an experimental feature!!! Pool
+                        samples together into a single genotype. This may be
+                        (1) the name of a single pool for all samples or (2) a
+                        file containing a list of all samples and their
+                        assigned pool. If option (2) is used then each line of
+                        the plaintext file must contain a single sample
+                        identifier and the name of a pool separated by a
+                        tab.Samples may be assigned to multiple pools by using
+                        the same sample name on multiple lines.Each pool will
+                        treated as a single genotype by combining all reads
+                        from its constituent samples. Note that the pool names
+                        should be used in place of the samples names when
+                        assigning other per-sample parameters such as ploidy
+                        or inbreeding coefficients.
   --reference REFERENCE
                         Indexed fasta file containing the reference genome.
   --base-error-rate BASE_ERROR_RATE

diff --git a/docs/assemble.rst b/docs/assemble.rst
@@ -3,7 +3,7 @@ MCHap assemble
 
 De novo assembly of micro-haplotypes.
 
-*(Last updated for MCHap version 0.9.0)*
+*(Last updated for MCHap version 0.9.1)*
 
 Background
 ----------
@@ -229,6 +229,26 @@ effects on the results.
   Each line of this file must contain the identifier of a sample and its inbreeding 
   coefficient separated by a tab.
 
+Sample pooling
+~~~~~~~~~~~~~~
+
+MCHap allows you to define 'pools' of samples using the ``--sample-pools`` parameter.
+A sample pool will combine the reads of its constituent samples but otherwise is treated
+identically to a regular sample. Sample parameters relating to the sample pool including
+``--ploidy`` and ``--inbreeding`` must be set using the name of the sample pool rather
+than its constituent samples. Uses for sample pools include:
+
+- Combining the replicates into a single sample
+- Renaming samples (using a pool per sample)
+- Combining a set samples that are expected to contain a known number of haplotypes (e.g.,
+  the progeny of a bi-parental cross)
+
+The ``--sample-pools`` parameter can specify either the name of a single 'pool' containing
+all of the samples, or a tabular file assigning samples to pools. If a tabular fle is used,
+each line must contain the name of a sample followed by the name of the pool that sample is
+assigned to. All samples must be specified in this file but they can be assigned to a pool
+of the same name (i.e., a pool per sample). Samples may be assigned to more than one pool. 
+
 Output parameters
 ~~~~~~~~~~~~~~~~~
 

diff --git a/docs/call.rst b/docs/call.rst
@@ -3,7 +3,7 @@ MCHap call
 
 Calling genotypes from known haplotypes.
 
-*(Last updated for MCHap version 0.9.0)*
+*(Last updated for MCHap version 0.9.1)*
 
 Background
 ----------
@@ -195,6 +195,26 @@ effects on the results.
   Each line of this file must contain the identifier of a sample and its inbreeding 
   coefficient separated by a tab.
 
+Sample pooling
+~~~~~~~~~~~~~~
+
+MCHap allows you to define 'pools' of samples using the ``--sample-pools`` parameter.
+A sample pool will combine the reads of its constituent samples but otherwise is treated
+identically to a regular sample. Sample parameters relating to the sample pool including
+``--ploidy`` and ``--inbreeding`` must be set using the name of the sample pool rather
+than its constituent samples. Uses for sample pools include:
+
+- Combining the replicates into a single sample
+- Renaming samples (using a pool per sample)
+- Combining a set samples that are expected to contain a known number of haplotypes (e.g.,
+  the progeny of a bi-parental cross)
+
+The ``--sample-pools`` parameter can specify either the name of a single 'pool' containing
+all of the samples, or a tabular file assigning samples to pools. If a tabular fle is used,
+each line must contain the name of a sample followed by the name of the pool that sample is
+assigned to. All samples must be specified in this file but they can be assigned to a pool
+of the same name (i.e., a pool per sample). Samples may be assigned to more than one pool. 
+
 Output parameters
 ~~~~~~~~~~~~~~~~~
 

diff --git a/docs/example/bi-parental.ipynb b/docs/example/bi-parental.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Micro-haplotype assembly in an autopolyploid bi-parental cross\n",
     "\n",
-    "*(Last updated for MCHap version 0.9.0)*\n",
+    "*(Last updated for MCHap version 0.9.1)*\n",
     "\n",
     "This notebook demonstrates micro-haplotype assembly and genotype re-calling in a small bi-parental cross.\n",
     "It introduces the following topics:\n",
@@ -286,8 +286,8 @@
      "output_type": "stream",
      "text": [
       "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#fileformat=VCFv4.3\n",
-      "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#fileDate=20231017\n",
-      "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#source=mchap v0.9.0\n",
+      "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#fileDate=20231212\n",
+      "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#source=mchap v0.9.1\n",
       "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#phasing=None\n",
       "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#commandline=\"/home/cfltxm/Software/miniconda3/envs/mchap/bin/mchap assemble --bam input/bam/parent1.loci.bam input/bam/parent2.loci.bam input/bam/progeny001.loci.bam input/bam/progeny002.loci.bam input/bam/progeny003.loci.bam input/bam/progeny004.loci.bam input/bam/progeny005.loci.bam input/bam/progeny006.loci.bam input/bam/progeny007.loci.bam input/bam/progeny008.loci.bam input/bam/progeny009.loci.bam input/bam/progeny010.loci.bam input/bam/progeny011.loci.bam input/bam/progeny012.loci.bam input/bam/progeny013.loci.bam input/bam/progeny014.loci.bam input/bam/progeny015.loci.bam input/bam/progeny016.loci.bam input/bam/progeny017.loci.bam input/bam/progeny018.loci.bam input/bam/progeny019.loci.bam input/bam/progeny020.loci.bam --targets input/bed/targets4.bed --variants input/vcf/snvs.vcf.gz --reference input/fasta/chr1.fa.gz --ploidy 4 --inbreeding 0.01\"\n",
       "\u001b[01;31m\u001b[K#\u001b[m\u001b[K#randomseed=42\n",
@@ -741,7 +741,7 @@
     "        --haplotypes assemble.AFP.vcf.gz \\\n",
     "        --ploidy 4 \\\n",
     "        --inbreeding 0.01 \\\n",
-    "        --prior-frequencies \\\n",
+    "        --prior-frequencies AFP \\\n",
     "        | bgzip > assemble.AFP.prior.recall.vcf.gz\n",
     "tabix -p vcf assemble.AFP.prior.recall.vcf.gz"
    ]
@@ -837,7 +837,7 @@
    "source": [
     "Notes:\n",
     "\n",
-    "- The `--sample-pool` requires a name for our pooled \"sample\" in the output VCF file\n",
+    "- The `--sample-pool` parameter takes a name for our pooled \"sample\" in the output VCF file\n",
     "- We set `--ploidy` to `8` to treat this pooled sample as an octoploid\n",
     "- We have omitted  the `--inbreeding` argument, we could still include this but it would be interpreted as the *population* inbreeding coefficient"
    ]
@@ -862,7 +862,7 @@
     "        --haplotypes pooled.AFP.vcf.gz \\\n",
     "        --ploidy 4 \\\n",
     "        --inbreeding 0.01 \\\n",
-    "        --prior-frequencies \\\n",
+    "        --prior-frequencies AFP \\\n",
     "        | bgzip > pooled.AFP.prior.recall.vcf.gz\n",
     "tabix -p vcf pooled.AFP.prior.recall.vcf.gz"
    ]

diff --git a/mchap/application/arguments.py b/mchap/application/arguments.py
@@ -1,5 +1,6 @@
 import copy
 import pysam
+import os
 from dataclasses import dataclass
 
 from mchap.constant import PFEIFFER_ERROR
@@ -190,8 +191,18 @@ def add_to(self, parser):
         nargs=1,
         default=[None],
         help=(
-            "A name used to pool all sample reads into a single sample. "
-            "WARNING: this is an experimental feature."
+            "WARNING: this is an experimental feature!!! "
+            "Pool samples together into a single genotype. "
+            "This may be (1) the name of a single pool for all samples or "
+            "(2) a file containing a list of all samples and their assigned pool. "
+            "If option (2) is used then each line of the plaintext file must "
+            "contain a single sample identifier and the name of a pool separated by a tab."
+            "Samples may be assigned to multiple pools by using the same sample name on "
+            "multiple lines."
+            "Each pool will treated as a single genotype by combining "
+            "all reads from its constituent samples. "
+            "Note that the pool names should be used in place of the samples names when "
+            "assigning other per-sample parameters such as ploidy or inbreeding coefficients."
         ),
     ),
 )
@@ -664,6 +675,48 @@ def add_to(self, parser):
 )
 
 
+def parse_sample_pools(samples, sample_bams, sample_pool_argument):
+    if sample_pool_argument is None:
+        # pools of single samples
+        sample_bams = {k: [(k, v)] for k, v in sample_bams.items()}
+        return samples, sample_bams
+    if not os.path.isfile(sample_pool_argument):
+        # pool of all samples
+        samples = [sample_pool_argument]
+        sample_bams = {sample_pool_argument: [(k, v) for k, v in sample_bams.items()]}
+        return samples, sample_bams
+    else:
+        # custom pools
+        with open(sample_pool_argument) as f:
+            lines = [line.strip().split("\t") for line in f.readlines()]
+        pools = list()
+        pool_bams = dict()
+        samples_in_pools = set()
+        for sample, pool in lines:
+            samples_in_pools.add(sample)
+            bam = sample_bams[sample]
+            if pool not in pools:
+                # start new pool
+                pools.append(pool)
+                pool_bams[pool] = [(sample, bam)]
+            else:
+                # add sample to pool
+                pool_bams[pool].append((sample, bam))
+        # validation
+        sample_with_bams = set(samples)
+        diff = sample_with_bams - samples_in_pools
+        if diff:
+            raise ValueError(
+                f"The following samples have not been assigned to a pool: {diff}"
+            )
+        diff = samples_in_pools - sample_with_bams
+        if diff:
+            raise ValueError(
+                f"The following names in the sample-pool file do not match a known sample : {diff}"
+            )
+        return pools, pool_bams
+
+
 def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field):
     """Combine arguments relating to sample bam file specification.
 
@@ -718,14 +771,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
             raise ValueError("Too many fields")
 
     # handle sample pooling
-    if sample_pool_argument is None:
-        # pools of 1
-        sample_bams = {k: [(k, v)] for k, v in sample_bams.items()}
-    else:
-        # pool all samples
-        samples = [sample_pool_argument]
-        sample_bams = {sample_pool_argument: [(k, v) for k, v in sample_bams.items()]}
-    # TODO: multiple pools
+    samples, sample_bams = parse_sample_pools(
+        samples, sample_bams, sample_pool_argument
+    )
 
     return samples, sample_bams
 

diff --git a/mchap/tests/test_application_arguments.py b/mchap/tests/test_application_arguments.py
@@ -0,0 +1,76 @@
+import pathlib
+import pytest
+
+from mchap.application.arguments import parse_sample_pools
+
+
+def local_file_path(name):
+    path = pathlib.Path(__file__).parent.absolute()
+    path = path / ("test_io/data/" + name)
+    return str(path)
+
+
+def test_parse_sample_pools__none():
+    samples = ["SAMPLE1", "SAMPLE2", "SAMPLE3"]
+    sample_bams = {"SAMPLE1": "BAM1", "SAMPLE2": "BAM2", "SAMPLE3": "BAM3"}
+    pools, pool_bams = parse_sample_pools(
+        samples, sample_bams, sample_pool_argument=None
+    )
+    assert pools == samples
+    assert pool_bams == {
+        "SAMPLE1": [("SAMPLE1", "BAM1")],
+        "SAMPLE2": [("SAMPLE2", "BAM2")],
+        "SAMPLE3": [("SAMPLE3", "BAM3")],
+    }
+
+
+def test_parse_sample_pools__single():
+    samples = ["SAMPLE1", "SAMPLE2", "SAMPLE3"]
+    sample_bams = {"SAMPLE1": "BAM1", "SAMPLE2": "BAM2", "SAMPLE3": "BAM3"}
+    pools, pool_bams = parse_sample_pools(
+        samples, sample_bams, sample_pool_argument="POOL"
+    )
+    assert pools == ["POOL"]
+    assert pool_bams == {
+        "POOL": [("SAMPLE1", "BAM1"), ("SAMPLE2", "BAM2"), ("SAMPLE3", "BAM3")]
+    }
+
+
+def test_parse_sample_pools__file():
+    samples = ["SAMPLE1", "SAMPLE2", "SAMPLE3"]
+    sample_bams = {"SAMPLE1": "BAM1", "SAMPLE2": "BAM2", "SAMPLE3": "BAM3"}
+    pools, pool_bams = parse_sample_pools(
+        samples, sample_bams, sample_pool_argument=local_file_path("simple.pools")
+    )
+    assert pools == ["POOL1", "POOL2", "POOL3", "POOL13", "POOL123"]
+    assert pool_bams == {
+        "POOL1": [("SAMPLE1", "BAM1")],
+        "POOL2": [("SAMPLE2", "BAM2")],
+        "POOL3": [("SAMPLE3", "BAM3")],
+        "POOL13": [("SAMPLE1", "BAM1"), ("SAMPLE3", "BAM3")],
+        "POOL123": [("SAMPLE1", "BAM1"), ("SAMPLE2", "BAM2"), ("SAMPLE3", "BAM3")],
+    }
+
+
+def test_parse_sample_pools__raise_on_missing_sample():
+    samples = ["SAMPLE1", "SAMPLE2", "SAMPLE3", "SAMPLE4"]
+    sample_bams = {"SAMPLE1": "BAM1", "SAMPLE2": "BAM2", "SAMPLE3": "BAM3"}
+    with pytest.raises(
+        ValueError,
+        match="The following samples have not been assigned to a pool: {'SAMPLE4'}",
+    ):
+        parse_sample_pools(
+            samples, sample_bams, sample_pool_argument=local_file_path("simple.pools")
+        )
+
+
+def test_parse_sample_pools__raise_on_unknown_sample():
+    samples = ["SAMPLE1", "SAMPLE2"]
+    sample_bams = {"SAMPLE1": "BAM1", "SAMPLE2": "BAM2", "SAMPLE3": "BAM3"}
+    with pytest.raises(
+        ValueError,
+        match="The following names in the sample-pool file do not match a known sample : {'SAMPLE3'}",
+    ):
+        parse_sample_pools(
+            samples, sample_bams, sample_pool_argument=local_file_path("simple.pools")
+        )