Skip to content

Commit

Permalink
feat(repeats): adding examples from HGVS repeats recommendations page…
Browse files Browse the repository at this point in the history
…. Improved repeat analyzer, that now supports 5' and 3' shuffling.
  • Loading branch information
andreasprlic committed Sep 16, 2024
1 parent d216bba commit 9d15a22
Show file tree
Hide file tree
Showing 4 changed files with 417 additions and 94 deletions.
5 changes: 4 additions & 1 deletion src/hgvs/pretty/console/tx_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ def display(self, data: VariantData) -> str:
"""show the position of the transcript seq"""
var_str = ""

rna_coding = data.var_c_or_n.ac.startswith("NR_")
if data.var_c_or_n:
rna_coding = data.var_c_or_n.ac.startswith("NR_")
else:
rna_coding = False

count = -1
for pdata in data.position_details:
Expand Down
4 changes: 3 additions & 1 deletion src/hgvs/pretty/datacompiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ def data(
# for p in position_details:
# print(f"{p}\t{p.protein_data}\t")

strand = mapper.strand if mapper else 1

vd = VariantData(
seq_start,
seq_end,
Expand All @@ -355,7 +357,7 @@ def data(
tx_seq,
mapper,
var_g,
mapper.strand,
strand,
var_c_or_n,
var_p,
position_details,
Expand Down
255 changes: 209 additions & 46 deletions src/hgvs/repeats.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,236 @@
# -*- coding: utf-8 -*-
""" A class to manage conversion of SequenceVariants to repeat representation"""

from dataclasses import dataclass
import re
from typing import Optional

from hgvs.pretty.models import VariantCoords


def count_pattern_occurence(pattern, string):
"""Counts how often a pattern can be found in the string."""
matches = re.finditer(f'(?={pattern})', string)
return sum(1 for _ in matches)


def count_repetitive_units(s):
length = len(s)

for i in range(1, length + 1):
unit = s[:i]
if length % i == 0:
if unit * (length // i) == s:
return length // i, unit

return 1, s
@dataclass(eq=True, repr=True, frozen=True, order=True)
class RepeatUnit:
repeat_count: int
repeat_unit:str
block_size:int
block: str


class RepeatAnalyser:

def __init__(self, fs: VariantCoords) -> None:
def __init__(self, fs: VariantCoords, reverse:bool=False) -> None:

self.is_repeat = False
self.ref_count = 0
self.alt_count = 0
self.repeat_units_ref = None
self.repeat_units_alt = None
self.ref_str = fs.ref
self.alt_str = fs.alt
self.reverse = reverse

self.repeat_unit = self._get_repeat_unit(fs)
if self.repeat_unit is None:
self.repeat_units_ref = detect_repetitive_block_lengths(self.ref_str, self.reverse)
self.repeat_units_alt = detect_repetitive_block_lengths(self.alt_str, self.reverse)

if len(self.repeat_units_ref) == 0 and len(self.repeat_units_alt) ==0 :
return

# check longest repeat blocks:
# we only look at ref to determine if there are repeats
# If ref has no repeat, we don't call this a repeat variant, even if alt would have a repetitive unit
longest_r_unit = self._get_longest_repeat_unit(self.repeat_units_ref)
if longest_r_unit is None:
return

# filter our too fragmented results
expected_size = len(self.ref_str) / 3
if longest_r_unit.block_size < expected_size:
return

if longest_r_unit.repeat_unit not in self.alt_str:
return


self.is_repeat = True

self.ref_count = count_pattern_occurence(self.repeat_unit, fs.ref)
self.alt_count = count_pattern_occurence(self.repeat_unit, fs.alt)
self.ref_str = f"{self.repeat_unit}[{self.ref_count}]"
self.alt_str = f"{self.repeat_unit}[{self.alt_count}]"
ref_repeat = get_repeat_str(self.ref_str, self.alt_str, self.repeat_units_ref, self.repeat_units_alt, self.reverse)
alt_repeat = get_repeat_str(self.alt_str, self.ref_str, self.repeat_units_alt, self.repeat_units_ref, self.reverse)

self.ref_str = ref_repeat
self.alt_str = alt_repeat

def __repr__(self):
return f"{self.ref_str}>{self.alt_str}"

def _get_repeat_unit(self, fs: VariantCoords) -> Optional[str]:
"""Takes fully justified coordiantes and tries to detect a repeat in them."""
# analyze for repeat:
if len(fs.ref) == len(fs.alt) > 0:
# seems we cant shuffle. is an SVN or delins
return None

if len(fs.alt) > 0:
if fs.alt in fs.ref:
if fs.ref.startswith(fs.alt):
d = fs.ref[len(fs.alt) :]
if count_pattern_occurence(d, fs.ref) > 1:
c, u = count_repetitive_units(d)
return u
elif fs.ref in fs.alt:
if fs.alt.startswith(fs.ref):
d = fs.alt[len(fs.ref) :]
if count_pattern_occurence(d, fs.ref) > 1:
c, u = count_repetitive_units(d)
return u
def _get_longest_repeat_unit(self, repeat_units:list[RepeatUnit])->RepeatUnit:
lru = None
for ru in repeat_units:
if not lru:
lru = ru
continue

if ru.block_size > len(lru.block):
lru = ru

return lru


def detect_repetitive_block_lengths(sequence: str, reverse: bool = False) -> list[RepeatUnit]:
"""Detects the length of repetitive blocks in a string, with an option to search from left to right or reverse.
In reverse mode, it creates the largest possible blocks of the smallest possible units.
"""
result: list[RepeatUnit] = []
seq_len = len(sequence)

if reverse:
i = seq_len # Start from the end of the sequence
while i > 0:
matched = False
# Iterate over block sizes from smallest to largest
for block_size in range(1, seq_len // 2 + 1):
if i - block_size < 0:
continue # Not enough characters to form a repeat unit

# Extract the potential repeat unit ending at position i
repeat_unit = sequence[i - block_size:i]

# Calculate the maximum possible number of repeats for this block size
max_possible_repeats = i // block_size

# Initialize repeat_count to the maximum possible and decrease
for repeat_count in range(max_possible_repeats, 1, -1):
start_index = i - repeat_count * block_size
if start_index < 0:
continue # Not enough characters to form the repetitive block

# Extract the substring that could be the repetitive block
substr = sequence[start_index:i]

# Build the regex pattern for the current repeat unit and count
pattern = rf'({re.escape(repeat_unit)})' + r'{' + f'{repeat_count}' + r'}'

# Check if the substring matches the pattern
if re.fullmatch(pattern, substr):
repetitive_block = substr
ru = RepeatUnit(
block=repetitive_block,
block_size=len(repetitive_block),
repeat_unit=repeat_unit,
repeat_count=repeat_count
)
result.append(ru)
# Move the index `i` backward by the length of the repetitive block
i -= len(repetitive_block)
matched = True
break # Found the largest block for this unit size

if matched:
break # Proceed to the next position `i`

if not matched:
# No repeat found, remove one character
ru = RepeatUnit(
block=sequence[i - 1],
block_size=1,
repeat_unit=sequence[i - 1],
repeat_count=1
)
result.append(ru)
i -= 1 # Move back by one character if no match is found

else:
i = 0 # Start from the beginning of the sequence
while i < seq_len:
matched = False
for block_size in range(1, seq_len // 2 + 1):
# Build the regex pattern for the current block size
pattern = rf'(.{{{block_size}}})\1+'
match = re.match(pattern, sequence[i:])

if match:
repetitive_block = match.group() # The full repeating pattern
repeated_unit = match.group(1) # The repeating unit
repetition_count = len(repetitive_block) // len(repeated_unit)

# Add the repetitive block and its details to the result
ru = RepeatUnit(
block=repetitive_block,
block_size=len(repetitive_block),
repeat_unit=repeated_unit,
repeat_count=repetition_count
)
result.append(ru)

# Move the index `i` forward by the length of the repetitive block
i += len(repetitive_block)
matched = True
break # Found a match, break the loop for current block size

if not matched:
i += 1 # Move forward by one character if no match is found

return result


def get_repeat_str(
sequence: str,
other_seq: str,
primary_repeat_unit: list[RepeatUnit],
other_repeat_unit: list[RepeatUnit],
reverse: bool = False
) -> str:
if len(primary_repeat_unit) == 0 and len(other_repeat_unit) == 0:
return None

if len(primary_repeat_unit) == 0 and len(other_repeat_unit) == 1 and sequence == other_repeat_unit[0].repeat_unit:
return f"{sequence}[1]"
elif len(primary_repeat_unit) == 0 and len(other_repeat_unit) == 1 and sequence != other_repeat_unit[0].repeat_unit:
return None

if len(primary_repeat_unit) > 0 and len(other_repeat_unit) > 0:
return_str = assemble_repeat_string(sequence, primary_repeat_unit, reverse=reverse)
return return_str

if len(other_repeat_unit) == 0 and len(other_seq) > 0:
return_str = assemble_repeat_string(sequence, primary_repeat_unit, reverse=reverse)
if len(return_str) > 0:
return return_str

return None

def assemble_repeat_string(sequence: str, repeat_units: list[RepeatUnit], reverse: bool = False) -> str:
return_str = ""
primary_repeat_unit = repeat_units.copy()
seq = sequence

if reverse:
while len(seq) > 0:
found_unit = None
for ru in primary_repeat_unit:
if seq.endswith(ru.block):
return_str = f"{ru.repeat_unit}[{ru.repeat_count}]" + return_str
seq = seq[:-len(ru.block)]
found_unit = ru
break
if not found_unit:
# remove one character if no matching repeat unit is found
seq_char = seq[0]
seq = seq[:-1]
return_str = f"{seq_char}[1]" + return_str

else: # forward direction
while len(seq) > 0:
found_unit = None
for ru in primary_repeat_unit:
if seq.startswith(ru.block):
return_str += f"{ru.repeat_unit}[{ru.repeat_count}]"
seq = seq[len(ru.block):]
found_unit = ru
break
if not found_unit:
# remove one character if no matching repeat unit is found
seq_char = seq[0]
seq = seq[1:]
return_str += f"{seq_char}[1]"


return return_str

Loading

0 comments on commit 9d15a22

Please sign in to comment.