diff --git a/src/pydna/common_sub_strings.py b/src/pydna/common_sub_strings.py index a5d1c7aa..94617cf7 100644 --- a/src/pydna/common_sub_strings.py +++ b/src/pydna/common_sub_strings.py @@ -16,7 +16,9 @@ # from array import array as _array # import itertools as _itertools from operator import itemgetter as _itemgetter +from typing import List as _List, Tuple as _Tuple +Match = _Tuple[int, int, int] # def _kark_sort(s, SA, n, K): # def radixpass(a, b, r, s, n, k): @@ -312,16 +314,37 @@ # return match -def common_sub_strings(stringx: str, stringy: str, limit=25): +def common_sub_strings(stringx: str, stringy: str, limit=25) -> _List[Match]: + """ + Finds all common substrings between stringx and stringy, and returns + them sorted by length. + + This function is case sensitive. + + Parameters + ---------- + stringx : str + stringy : str + limit : int, optional + + Returns + ------- + list of tuple + [(startx1, starty1, length1),(startx2, starty2, length2), ...] + + startx1 = start position in x, where substring 1 starts + starty1 = position in y where substring 1 starts + length1 = length of substring + """ from pydivsufsort import common_substrings - match = common_substrings(stringx, stringy, limit=limit) - match.sort() - match.sort(key=_itemgetter(2), reverse=True) - return match + matches = common_substrings(stringx, stringy, limit=limit) + matches.sort() + matches.sort(key=_itemgetter(2), reverse=True) + return matches -def terminal_overlap(stringx: str, stringy: str, limit=15): +def terminal_overlap(stringx: str, stringy: str, limit=15) -> _List[Match]: """Finds the the flanking common substrings between stringx and stringy longer than limit. This means that the results only contains substrings that starts or ends at the the ends of stringx and stringy. 
diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 195db968..0052120b 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -22,7 +22,7 @@ import math as _math from pydna.seq import Seq as _Seq -from Bio.Seq import _translate_str +from Bio.Seq import _translate_str, _SeqAbstractBaseClass from pydna._pretty import pretty_str as _pretty_str from seguid import ldseguid as _ldseguid @@ -36,7 +36,23 @@ from Bio.Restriction import RestrictionBatch as _RestrictionBatch from Bio.Restriction import CommOnly -from typing import Tuple +from typing import ( + TYPE_CHECKING, + List as _List, + Tuple as _Tuple, + Union as _Union, + TypeVar as _TypeVar, + Iterable as _Iterable, +) + +if TYPE_CHECKING: + from Bio.Restriction import AbstractCut as _AbstractCut + + +# To represent any subclass of Dseq +DseqType = _TypeVar("DseqType", bound="Dseq") +EnzymesType = _TypeVar("EnzymesType", _RestrictionBatch, _Iterable["_AbstractCut"], "_AbstractCut") +CutSiteType = _Tuple[_Tuple[int, int], _Union["_AbstractCut", None]] class Dseq(_Seq): @@ -296,24 +312,24 @@ class Dseq(_Seq): def __init__( self, - watson, - crick=None, + watson: _Union[str, bytes], + crick: _Union[str, bytes, None] = None, ovhg=None, circular=False, pos=0, ): + if isinstance(watson, bytes): + watson = watson.decode("ASCII") + if isinstance(crick, bytes): + crick = crick.decode("ASCII") + if crick is None: - if ovhg is None: - crick = _rc(watson) - ovhg = 0 - try: - self._data = bytes(watson, encoding="ASCII") - except TypeError: - self._data = watson - watson = watson.decode("ASCII") - crick = crick.decode("ASCII") - else: # ovhg given, but no crick strand + if ovhg is not None: raise ValueError("ovhg defined without crick strand!") + crick = _rc(watson) + ovhg = 0 + self._data = bytes(watson, encoding="ASCII") + else: # crick strand given if ovhg is None: # ovhg not given olaps = _common_sub_strings( @@ -321,14 +337,18 @@ def __init__( str(_rc(crick).lower()), int(_math.log(len(watson)) / _math.log(4)), ) - 
try: - F, T, L = olaps[0] - except IndexError: + if len(olaps) == 0: raise ValueError("Could not anneal the two strands." " Please provide ovhg value") - ovhgs = [ol[1] - ol[0] for ol in olaps if ol[2] == L] - if len(ovhgs) > 1: + + # We extract the positions and length of the first (longest) overlap, since + # common_sub_strings sorts the overlaps by length. + pos_watson, pos_crick, longest_olap_length = olaps[0] + + # We see if there is another overlap of the same length + if any(olap[2] >= longest_olap_length for olap in olaps[1:]): raise ValueError("More than one way of annealing the" " strands. Please provide ovhg value") - ovhg = T - F + + ovhg = pos_crick - pos_watson sns = (ovhg * " ") + _pretty_str(watson) asn = (-ovhg * " ") + _pretty_str(_rc(crick)) @@ -510,7 +530,7 @@ def from_full_sequence_and_overhangs(cls, full_sequence: str, crick_ovhg: int, w # Use :meth:`looped` to create a circular Dseq object""" # return self._circular - def mw(self): + def mw(self) -> float: """This method returns the molecular weight of the DNA molecule in g/mol. The following formula is used:: @@ -529,7 +549,7 @@ def mw(self): + 79.0 ) - def upper(self): + def upper(self: DseqType) -> DseqType: """Return an upper case copy of the sequence. >>> from pydna.dseq import Dseq @@ -562,7 +582,7 @@ def upper(self): pos=self.pos, ) - def lower(self): + def lower(self: DseqType) -> DseqType: """Return a lower case copy of the sequence. >>> from pydna.dseq import Dseq @@ -594,7 +614,7 @@ def lower(self): pos=self.pos, ) - def find(self, sub, start=0, end=_sys.maxsize): + def find(self, sub: _Union[_SeqAbstractBaseClass, str, bytes], start=0, end=_sys.maxsize) -> int: """This method behaves like the python string method of the same name. 
Returns an integer, the index of the first occurrence of substring @@ -638,7 +658,7 @@ def find(self, sub, start=0, end=_sys.maxsize): return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end) - def __getitem__(self, sl): + def __getitem__(self, sl: slice) -> "Dseq": """Returns a subsequence. This method is used by the slice notation""" if not self.circular: @@ -679,7 +699,7 @@ def __getitem__(self, sl): return Dseq(w, c, ovhg=0) # , linear=True) - def __eq__(self, other): + def __eq__(self, other: DseqType) -> bool: """Compare to another Dseq object OR an object that implements watson, crick and ovhg properties. This comparison is case insensitive. @@ -775,7 +795,7 @@ def __repr__(self): ) ) - def reverse_complement(self, inplace=False): + def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. This represents the same double stranded sequence. @@ -805,7 +825,7 @@ def reverse_complement(self, inplace=False): rc = reverse_complement # alias for reverse_complement - def shifted(self, shift): + def shifted(self: DseqType, shift: int) -> DseqType: """Shifted version of a circular Dseq object.""" if not self.circular: raise TypeError("DNA is not circular.") @@ -815,7 +835,7 @@ def shifted(self, shift): else: return (self[shift:] + self[:shift]).looped() - def looped(self): + def looped(self: DseqType) -> DseqType: """Circularized Dseq object. 
This can only be done if the two ends are compatible, @@ -856,11 +876,11 @@ def looped(self): """ if self.circular: - return self + return _copy.deepcopy(self) type5, sticky5 = self.five_prime_end() type3, sticky3 = self.three_prime_end() if type5 == type3 and str(sticky5) == str(_rc(sticky3)): - nseq = Dseq.quick( + nseq = self.__class__.quick( self.watson, self.crick[-self.ovhg :] + self.crick[: -self.ovhg], ovhg=0, @@ -872,7 +892,7 @@ def looped(self): else: raise TypeError("DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!") - def tolinear(self): # pragma: no cover + def tolinear(self: DseqType) -> DseqType: # pragma: no cover """Returns a blunt, linear copy of a circular Dseq object. This can only be done if the Dseq object is circular, otherwise a TypeError is raised. @@ -904,11 +924,11 @@ def tolinear(self): # pragma: no cover ) if not self.circular: raise TypeError("DNA is not circular.\n") - selfcopy = _copy.copy(self) + selfcopy = _copy.deepcopy(self) selfcopy.circular = False return selfcopy # self.__class__(self.watson, linear=True) - def five_prime_end(self): + def five_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of the DNA fragment @@ -958,7 +978,7 @@ def five_prime_end(self): type_ = "blunt" return type_, sticky - def three_prime_end(self): + def three_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of the DNA fragment @@ -1005,11 +1025,11 @@ def three_prime_end(self): type_ = "blunt" return type_, sticky - def watson_ovhg(self): + def watson_ovhg(self) -> int: """Returns the overhang of the watson strand at the three prime.""" return len(self.watson) - len(self.crick) + self.ovhg - def __add__(self, other): + def __add__(self: DseqType, other: DseqType) -> DseqType: """Simulates ligation between two DNA fragments. Add other Dseq object at the end of the sequence. 
@@ -1042,24 +1062,24 @@ def __add__(self, other): if self_type == other_type and str(self_tail) == str(_rc(other_tail)): answer = Dseq.quick(self.watson + other.watson, other.crick + self.crick, self.ovhg) elif not self: - answer = _copy.copy(other) + answer = _copy.deepcopy(other) elif not other: - answer = _copy.copy(self) + answer = _copy.deepcopy(self) else: raise TypeError("sticky ends not compatible!") return answer - def __mul__(self, number): + def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): raise TypeError("TypeError: can't multiply Dseq by non-int of type {}".format(type(number))) if number <= 0: return self.__class__("") - new = _copy.copy(self) + new = _copy.deepcopy(self) for i in range(number - 1): new += self return new - def _fill_in_five_prime(self, nucleotides): + def _fill_in_five_prime(self: DseqType, nucleotides: str) -> str: stuffer = "" type, se = self.five_prime_end() if type == "5'": @@ -1070,7 +1090,7 @@ def _fill_in_five_prime(self, nucleotides): break return self.crick + stuffer, self.ovhg + len(stuffer) - def _fill_in_three_prime(self, nucleotides): + def _fill_in_three_prime(self: DseqType, nucleotides: str) -> str: stuffer = "" type, se = self.three_prime_end() if type == "5'": @@ -1081,7 +1101,7 @@ def _fill_in_three_prime(self, nucleotides): break return self.watson + stuffer - def fill_in(self, nucleotides=None): + def fill_in(self, nucleotides: _Union[None, str] = None) -> "Dseq": """Fill in of five prime protruding end with a DNA polymerase that has only DNA polymerase activity (such as exo-klenow [#]_) and any combination of A, G, C or T. Default are all four @@ -1138,20 +1158,21 @@ def fill_in(self, nucleotides=None): .. 
[#] http://en.wikipedia.org/wiki/Klenow_fragment#The_exo-_Klenow_fragment """ - if not nucleotides: + if nucleotides is None: nucleotides = "GATCRYWSMKHBVDN" + nucleotides = set(nucleotides.lower() + nucleotides.upper()) crick, ovhg = self._fill_in_five_prime(nucleotides) watson = self._fill_in_three_prime(nucleotides) return Dseq(watson, crick, ovhg) - def transcribe(self): + def transcribe(self) -> _Seq: return _Seq(self.watson).transcribe() - def translate(self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"): + def translate(self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-") -> _Seq: return _Seq(_translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap)) - def mung(self): + def mung(self) -> "Dseq": """ Simulates treatment a nuclease with 5'-3' and 3'-5' single strand specific exonuclease activity (such as mung bean nuclease [#]_) @@ -1194,7 +1215,7 @@ def mung(self): """ return Dseq(self.watson[max(0, -self.ovhg) : min(len(self.watson), len(self.crick) - self.ovhg)]) - def T4(self, nucleotides=None): + def T4(self, nucleotides=None) -> "Dseq": """Fill in five prime protruding ends and chewing back three prime protruding ends by a DNA polymerase providing both 5'-3' DNA polymerase activity and 3'-5' nuclease acitivty @@ -1275,58 +1296,58 @@ def T4(self, nucleotides=None): t4 = T4 # alias for the T4 method. 
- def exo1_front(self, n=1): + def exo1_front(self: DseqType, n=1) -> DseqType: """5'-3' resection at the start (left side) of the molecule.""" d = _copy.deepcopy(self) d.ovhg += n d.watson = d.watson[n:] return d - def exo1_end(self, n=1): + def exo1_end(self: DseqType, n=1) -> DseqType: """5'-3' resection at the end (right side) of the molecule.""" d = _copy.deepcopy(self) d.crick = d.crick[n:] return d - def no_cutters(self, batch: _RestrictionBatch = None): + def no_cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch not cutting sequence.""" - if not batch: + if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if not sitelist} return _RestrictionBatch(ncut) - def unique_cutters(self, batch: _RestrictionBatch = None): + def unique_cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence once.""" - if not batch: + if batch is None: batch = CommOnly return self.n_cutters(n=1, batch=batch) once_cutters = unique_cutters # alias for unique_cutters - def twice_cutters(self, batch: _RestrictionBatch = None): + def twice_cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence twice.""" - if not batch: + if batch is None: batch = CommOnly return self.n_cutters(n=2, batch=batch) - def n_cutters(self, n=3, batch: _RestrictionBatch = None): + def n_cutters(self, n=3, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting n times.""" - if not batch: + if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if len(sitelist) == n} return _RestrictionBatch(ncut) - def cutters(self, batch: _RestrictionBatch = None): + def cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> 
_RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence at least once.""" - if not batch: + if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if sitelist} return _RestrictionBatch(ncut) - def seguid(self): + def seguid(self) -> str: """SEGUID checksum for the sequence.""" if self.circular: cs = _cdseguid(self.watson.upper(), self.crick.upper(), alphabet="{DNA-extended}") @@ -1337,7 +1358,7 @@ def seguid(self): cs = _ldseguid(w, c, alphabet="{DNA-extended}") return cs - def isblunt(self): + def isblunt(self) -> bool: """isblunt. Return True if Dseq is linear and blunt and @@ -1377,7 +1398,7 @@ def isblunt(self): """ return self.ovhg == 0 and len(self.watson) == len(self.crick) and not self.circular - def cas9(self, RNA: str): + def cas9(self, RNA: str) -> _Tuple[slice, ...]: """docstring.""" bRNA = bytes(RNA, "ASCII") slices = [] @@ -1388,14 +1409,14 @@ def cas9(self, RNA: str): slices = tuple(slice(x, y, 1) for x, y in zip(cuts, cuts[1:])) return slices - def terminal_transferase(self, nucleotides="a"): + def terminal_transferase(self, nucleotides="a") -> "Dseq": """docstring.""" ovhg = self.ovhg if self.ovhg >= 0: ovhg += len(nucleotides) return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg) - def cut(self, *enzymes): + def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]: """Returns a list of linear Dseq fragments produced in the digestion. If there are no cuts, an empty list is returned. 
@@ -1445,7 +1466,7 @@ def cut(self, *enzymes): cutsite_pairs = self.get_cutsite_pairs(cutsites) return tuple(self.apply_cut(*cs) for cs in cutsite_pairs) - def cutsite_is_valid(self, cutsite): + def cutsite_is_valid(self, cutsite: CutSiteType) -> bool: """Returns False if: - Cut positions fall outside the sequence (could be moved to Biopython) - Overhang is not double stranded @@ -1477,7 +1498,7 @@ def cutsite_is_valid(self, cutsite): end_of_recognition_site %= len(self) recognition_site = self[start_of_recognition_site:end_of_recognition_site] if len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0: - if enz.scd5 is None: + if enz is None or enz.scd5 is None: return False else: # For enzymes that cut twice, this might be referring to the second one @@ -1494,7 +1515,7 @@ def cutsite_is_valid(self, cutsite): return True - def get_cutsites(self, *enzymes): + def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> _List[CutSiteType]: """Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`: - `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence @@ -1511,11 +1532,11 @@ def get_cutsites(self, *enzymes): Parameters ---------- - enzymes : Union[_RestrictionBatch,list[_RestrictionType]] + enzymes : Union[_RestrictionBatch,list[_AbstractCut]] Returns ------- - list[tuple[tuple[int,int], _RestrictionType]] + list[tuple[tuple[int,int], _AbstractCut]] Examples -------- @@ -1562,7 +1583,7 @@ def get_cutsites(self, *enzymes): return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)]) - def left_end_position(self) -> Tuple[int, int]: + def left_end_position(self) -> _Tuple[int, int]: """ The index in the full sequence of the watson and crick start positions. 
@@ -1579,7 +1600,7 @@ def left_end_position(self) -> Tuple[int, int]: return self.ovhg, 0 return 0, -self.ovhg - def right_end_position(self) -> Tuple[int, int]: + def right_end_position(self) -> _Tuple[int, int]: """The index in the full sequence of the watson and crick end positions. full sequence (str(self)) for all three cases is AAA @@ -1595,7 +1616,7 @@ def right_end_position(self) -> Tuple[int, int]: return len(self) + self.watson_ovhg(), len(self) return len(self), len(self) - self.watson_ovhg() - def get_cut_parameters(self, cut: tuple, is_left: bool): + def get_cut_parameters(self, cut: _Union[CutSiteType, None], is_left: bool) -> _Tuple[int, int, int]: """For a given cut expressed as ((cut_watson, ovhg), enz), returns a tuple (cut_watson, cut_crick, ovhg). @@ -1622,15 +1643,15 @@ def get_cut_parameters(self, cut: tuple, is_left: bool): # In the right end, the overhang does not matter return *self.right_end_position(), self.watson_ovhg() - def apply_cut(self, left_cut, right_cut): + def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": """Extracts a subfragment of the sequence between two cuts. For more detail see the documentation of get_cutsite_pairs. Parameters ---------- - left_cut : Union[tuple[tuple[int,int], _RestrictionType], None] - right_cut: Union[tuple[tuple[int,int], _RestrictionType], None] + left_cut : Union[tuple[tuple[int,int], _AbstractCut], None] + right_cut: Union[tuple[tuple[int,int], _AbstractCut], None] Returns ------- @@ -1689,7 +1710,9 @@ def apply_cut(self, left_cut, right_cut): ovhg=ovhg_left, ) - def get_cutsite_pairs(self, cutsites): + def get_cutsite_pairs( + self, cutsites: _List[CutSiteType] + ) -> _List[_Tuple[_Union[None, CutSiteType], _Union[None, CutSiteType]]]: """Returns pairs of cutsites that render the edges of the resulting fragments. 
A fragment produced by restriction is represented by a tuple of length 2 that @@ -1705,11 +1728,11 @@ def get_cutsite_pairs(self, cutsites): Parameters ---------- - cutsites : list[tuple[tuple[int,int], _RestrictionType]] + cutsites : list[tuple[tuple[int,int], _AbstractCut]] Returns ------- - list[tuple[tuple[tuple[int,int], _RestrictionType]|None],tuple[tuple[int,int], _RestrictionType]|None] + list[tuple[tuple[tuple[int,int], _AbstractCut]|None],tuple[tuple[int,int], _AbstractCut]|None] Examples -------- diff --git a/src/pydna/utils.py b/src/pydna/utils.py index a007e5a8..f7e0ce36 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -19,11 +19,8 @@ import collections as _collections import itertools as _itertools from copy import deepcopy as _deepcopy -from typing import Union as _Union import sys as _sys -import re -import itertools import random import subprocess as _subprocess from bisect import bisect as _bisect @@ -35,6 +32,11 @@ from Bio.SeqFeature import SimpleLocation as _sl from Bio.SeqFeature import CompoundLocation as _cl +from typing import Union as _Union, TypeVar as _TypeVar, List as _List + +# For functions that take str or bytes as input and return str or bytes as output, matching the input type +StrOrBytes = _TypeVar("StrOrBytes", str, bytes) + _module_logger = _logging.getLogger("pydna." + __name__) _ambiguous_dna_complement.update({"U": "A"}) _complement_table = _maketrans(_ambiguous_dna_complement) @@ -256,7 +258,7 @@ def open_folder(pth): return "no cache to open." -def rc(sequence: str): +def rc(sequence: StrOrBytes) -> StrOrBytes: """Reverse complement. accepts mixed DNA/RNA @@ -332,7 +334,7 @@ def identifier_from_string(s: str) -> str: return s -def flatten(*args): +def flatten(*args) -> _List: """Flattens an iterable of iterables. 
Down to str, bytes, bytearray or any of the pydna or Biopython seq objects diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index 60f55ef5..aaf8e2a2 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -719,6 +719,19 @@ def test_shifted(): assert a.shifted(0) is not a +def test_looped(): + + # Looping a circular sequence should return a copy of the sequence + # not the same sequence + + from pydna.dseq import Dseq + + a = Dseq("gatc", circular=True) + + assert a.looped() == a + assert a.looped() is not a + + def test_misc(): from pydna.dseq import Dseq @@ -829,7 +842,7 @@ def test_apply_cut(): from pydna.dseq import Dseq from Bio.Restriction import EcoRI, BamHI - seq = Dseq('aaGAATTCaa', circular=False) + seq = Dseq("aaGAATTCaa", circular=False) # A cut where both sides are None returns the same sequence assert seq.apply_cut(None, None) == seq @@ -838,44 +851,44 @@ def test_apply_cut(): EcoRI_cut = ((3, -4), None) assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs( - 'aaGAATT', watson_ovhg=-4, crick_ovhg=0 + "aaGAATT", watson_ovhg=-4, crick_ovhg=0 ) assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs( - 'AATTCaa', watson_ovhg=0, crick_ovhg=-4 + "AATTCaa", watson_ovhg=0, crick_ovhg=-4 ) # It respects the original overhang - seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=1, crick_ovhg=1) + seq = Dseq.from_full_sequence_and_overhangs("aaGAATTCaa", watson_ovhg=1, crick_ovhg=1) assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs( - 'aaGAATT', watson_ovhg=-4, crick_ovhg=1 + "aaGAATT", watson_ovhg=-4, crick_ovhg=1 ) assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs( - 'AATTCaa', watson_ovhg=1, crick_ovhg=-4 + "AATTCaa", watson_ovhg=1, crick_ovhg=-4 ) - seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=-1, crick_ovhg=-1) + seq = Dseq.from_full_sequence_and_overhangs("aaGAATTCaa", 
watson_ovhg=-1, crick_ovhg=-1) assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs( - 'aaGAATT', watson_ovhg=-4, crick_ovhg=-1 + "aaGAATT", watson_ovhg=-4, crick_ovhg=-1 ) assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs( - 'AATTCaa', watson_ovhg=-1, crick_ovhg=-4 + "AATTCaa", watson_ovhg=-1, crick_ovhg=-4 ) # A repeated cut in a circular molecule opens it up - seq = Dseq('aaGAATTCaa', circular=True) + seq = Dseq("aaGAATTCaa", circular=True) assert seq.apply_cut(EcoRI_cut, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs( - 'AATTCaaaaGAATT', watson_ovhg=-4, crick_ovhg=-4 + "AATTCaaaaGAATT", watson_ovhg=-4, crick_ovhg=-4 ) # Two cuts extract a subsequence - seq = Dseq('aaGAATTCaaGAATTCaa', circular=True) + seq = Dseq("aaGAATTCaaGAATTCaa", circular=True) EcoRI_cut_2 = ((11, -4), None) assert seq.apply_cut(EcoRI_cut, EcoRI_cut_2) == Dseq.from_full_sequence_and_overhangs( - 'AATTCaaGAATT', watson_ovhg=-4, crick_ovhg=-4 + "AATTCaaGAATT", watson_ovhg=-4, crick_ovhg=-4 ) # Overlapping cuts should return an error - seq = Dseq('aaGAATTCaa', circular=True) + seq = Dseq("aaGAATTCaa", circular=True) first_cuts = [ ((3, -4), BamHI), ((7, 4), BamHI), @@ -901,13 +914,13 @@ def test_apply_cut(): try: seq.apply_cut(first_cut, second_cut) except ValueError as e: - assert e.args[0] == 'Cuts by BamHI EcoRI overlap.' + assert e.args[0] == "Cuts by BamHI EcoRI overlap." 
else: print(first_cut, second_cut) - assert False, 'Expected ValueError' + assert False, "Expected ValueError" # Rotating the sequence, apply the same cut - seq = Dseq('acgtATGaatt', circular=True) + seq = Dseq("acgtATGaatt", circular=True) for shift in range(len(seq)): seq_shifted = seq.shifted(shift) start = 4 - shift @@ -916,19 +929,19 @@ def test_apply_cut(): # Cut with negative ovhg new_cut = ((start, -3), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == 'ATGaattacgtATG' + assert str(out) == "ATGaattacgtATG" # Cut with positive ovhg start = (start + 3) % len(seq) new_cut = ((start, 3), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == 'ATGaattacgtATG' + assert str(out) == "ATGaattacgtATG" # A blunt cut start = 4 - shift new_cut = ((start, 0), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == 'ATGaattacgt' + assert str(out) == "ATGaattacgt" def test_cutsite_is_valid(): @@ -964,7 +977,7 @@ def test_cutsite_is_valid(): assert len(dseq.get_cutsites([enz])) == 1 # Special cases: - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, 0) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 0, 0) assert len(dseq.get_cutsites([NmeDI])) == 2 # Remove left cutting place assert len(dseq[2:].get_cutsites([NmeDI])) == 1 @@ -974,27 +987,27 @@ def test_cutsite_is_valid(): assert len(dseq[2:-2].get_cutsites([NmeDI])) == 0 # overhang left side - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', -2, 0) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", -2, 0) assert len(dseq.get_cutsites([NmeDI])) == 1 - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 2, 0) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 2, 0) assert len(dseq.get_cutsites([NmeDI])) == 1 # overhang right side - dseq = 
Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, 2) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 0, 2) assert len(dseq.get_cutsites([NmeDI])) == 1 - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, -2) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 0, -2) assert len(dseq.get_cutsites([NmeDI])) == 1 # overhang both sides - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 2, 2) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 2, 2) assert len(dseq.get_cutsites([NmeDI])) == 0 - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', -2, -2) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", -2, -2) assert len(dseq.get_cutsites([NmeDI])) == 0 # overhang on recognition site removes both cutting places - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 16, 0) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 16, 0) assert len(dseq.get_cutsites([NmeDI])) == 0 - dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, 16) + dseq = Dseq.from_full_sequence_and_overhangs("AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA", 0, 16) assert len(dseq.get_cutsites([NmeDI])) == 0 @@ -1003,7 +1016,7 @@ def test_get_cutsite_pairs(): # in the test, we replace cuts by integers for clarity. 
- dseq = Dseq('A') + dseq = Dseq("A") # Empty returns empty list assert dseq.get_cutsite_pairs([]) == [] @@ -1014,7 +1027,7 @@ def test_get_cutsite_pairs(): # Two cuts on linear seq return three fragments assert dseq.get_cutsite_pairs([1, 2]) == [(None, 1), (1, 2), (2, None)] - dseq = Dseq('A', circular=True) + dseq = Dseq("A", circular=True) # Empty returns empty list assert dseq.get_cutsite_pairs([]) == [] @@ -1030,7 +1043,7 @@ def test_get_cut_parameters(): from pydna.dseq import Dseq - dseq = Dseq.from_full_sequence_and_overhangs('aaaACGTaaa', 3, 3) + dseq = Dseq.from_full_sequence_and_overhangs("aaaACGTaaa", 3, 3) assert dseq.get_cut_parameters(None, True) == (*dseq.left_end_position(), dseq.ovhg) assert dseq.get_cut_parameters(None, False) == (*dseq.right_end_position(), dseq.watson_ovhg()) @@ -1039,22 +1052,22 @@ def test_get_cut_parameters(): assert dseq.get_cut_parameters(((6, 2), None), True) == (6, 4, 2) assert dseq.get_cut_parameters(((6, 2), None), False) == (6, 4, 2) - dseq = Dseq('aaaACGTaaa', circular=True) + dseq = Dseq("aaaACGTaaa", circular=True) # None cannot be used on circular molecules try: assert dseq.get_cut_parameters(None, True) == (*dseq.left_end_position(), dseq.ovhg) except AssertionError as e: - assert e.args[0] == 'Circular sequences should not have None cuts' + assert e.args[0] == "Circular sequences should not have None cuts" else: - assert False, 'Expected AssertionError' + assert False, "Expected AssertionError" try: assert dseq.get_cut_parameters(None, False) == (*dseq.right_end_position(), dseq.watson_ovhg()) except AssertionError as e: - assert e.args[0] == 'Circular sequences should not have None cuts' + assert e.args[0] == "Circular sequences should not have None cuts" else: - assert False, 'Expected AssertionError' + assert False, "Expected AssertionError" # "Normal" cuts assert dseq.get_cut_parameters(((4, -2), None), True) == (4, 6, -2)