Skip to content

Commit

Permalink
Merge pull request #2 from invitae/fix-encoding-error
Browse files Browse the repository at this point in the history
fix: Fixes issue with unsupported sequences from reference during encoding
  • Loading branch information
kazmiekr authored May 17, 2023
2 parents a97f163 + cfcef07 commit b0aa388
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 3 deletions.
2 changes: 2 additions & 0 deletions examples/error_w_encoding_example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
CHROM,POS,REF,ALT
chr2,233137476,TG,T
23 changes: 21 additions & 2 deletions pangolin/utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import logging
import re
import time
from typing import Tuple

import numpy as np
from pyfaidx import Fasta
import torch

from pangolin.batch import Variant, PreppedVariant
from pangolin.data_models import VariantEncodings, AppConfig, TimingDetails
from pangolin.data_models import (
Variant,
PreppedVariant,
VariantEncodings,
AppConfig,
TimingDetails,
)
from pangolin.genes import GeneAnnotator

logger = logging.getLogger(__name__)
Expand All @@ -17,6 +23,8 @@
[[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
)

SEQ_PATTERN = re.compile("^[ACTGN]+$")


def compute_score(ref_seq, alt_seq, strand, d, models):
ref_seq = one_hot_encode(ref_seq, strand).T
Expand Down Expand Up @@ -219,6 +227,17 @@ def prepare_variant(
empty_timing,
)

# This check ensures that only ACTGN characters are in the padded sequence from the FASTA file. If there
# are any other characters, the downstream encoding will fail in one_hot_encode
if re.search(SEQ_PATTERN, seq.upper()) is None:
skip_message = f"Unsupported sequences in ref seq from fasta, found bases: {set(seq).difference(set('ACTGN'))}"
return (
PreppedVariant.with_skip_message(
variant=variant, skip_message=skip_message
),
empty_timing,
)

if seq[5000 + distance : 5000 + distance + len(ref)].upper() != ref:
ref_base = seq[5000 + distance : 5000 + distance + len(ref)]
skip_message = f"Mismatch between FASTA (ref base: {ref_base}) and variant file (ref base: {ref})."
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pangolin"
version = "1.3.11"
version = "1.3.12"
description = ""
authors = ["Tony Zeng <[email protected]>", "Kevin Kazmierczak <[email protected]>"]
readme = "README.md"
Expand Down
32 changes: 32 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from collections import namedtuple
from unittest.mock import MagicMock

import pytest

from pangolin.data_models import Variant
from pangolin.utils import prepare_variant


@pytest.mark.parametrize(
"seq, expected",
[
("ACTGV", True),
("ACTG", False),
("AAAA", False),
("AAUAA", True),
("U", True),
],
)
def test_unsupported_ref_seq(seq, expected):
variant = Variant(lnum=1, chr="chr1", pos=100, ref="C", alt="T")
# Mock out the fasta seq so it returns the passed in sequence
fasta = MagicMock()
fasta.keys = MagicMock(return_value=[variant.chr])
seq_obj = namedtuple("RefSeq", "seq")
fasta[variant.chr].__getitem__ = MagicMock(return_value=seq_obj(seq))

prepped_variant, _ = prepare_variant(variant, None, fasta, 500)
skip_message_contains_text = (
"Unsupported sequences in ref seq from fasta" in prepped_variant.skip_message
)
assert skip_message_contains_text == expected

0 comments on commit b0aa388

Please sign in to comment.