Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

modify blastn parameters to match those in Shmakov et al NAR 2023 #1

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions srufinder/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,12 @@ def run(self):

# BLASTn
subprocess.run(['blastn',
'-task', 'blastn-short',
'-reward', '3',
'-penalty', '-2',
'-gapopen', '5',
'-gapextend', '5',
'-dust', 'no',
'-evalue', '100',
'-word_size', str(self.master.word_size),
'-query', self.master.repeatdb,
'-db', self.master.out+'masked',
Expand All @@ -56,7 +61,11 @@ def run_spacer(self):

# BLASTn
subprocess.run(['blastn',
'-task', 'blastn-short',
'-reward', '3',
'-penalty', '-2',
'-gapopen', '5',
'-gapextend', '5',
'-dust', 'no',
'-word_size', str(self.master.word_size),
'-query', self.master.out+'spacers.fa',
'-db', self.master.out+'genome',
Expand All @@ -65,3 +74,5 @@ def run_spacer(self):
'-qcov_hsp_perc', str(self.master.spacer_coverage),
'-out', self.master.out+'blast_spacers.tab',
'-num_threads', str(self.master.threads)])


89 changes: 54 additions & 35 deletions srufinder/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

import pandas as pd

from Bio import pairwise2
from Bio import Align
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import pairwise2

class Cluster(object):

Expand Down Expand Up @@ -52,7 +53,7 @@ def run(self):
# Cluster matches in arrays
self.cluster_adj()

# Append partial matches
# Append par#
self.append_partial()

# Round
Expand Down Expand Up @@ -116,8 +117,29 @@ def identity(self,x,y):
'''
Calculate identity between two sequences
'''
align = pairwise2.align.globalxs(x, y, -1, -1, penalize_end_gaps=False)
return(align[0][2]/min(len(x), len(y))*100)
aligner = Align.PairwiseAligner()

aligner.match_score = 1.0
aligner.mismatch_score = -2.0
aligner.gap_score = -10
aligner.extend_gap_score = -10

alignments = aligner.align(x, y)

best_alignment = alignments[0]

# Extract aligned sequences from the best alignment
aligned_seq1, aligned_seq2 = best_alignment

matches = sum(1 for a, b in zip(aligned_seq1, aligned_seq2) if a == b)
#compute lengths of aligned sequences without gaps
len1 = len(aligned_seq1.replace('-', ''))
len2 = len(aligned_seq2.replace('-', ''))
alignment_length = min(len1, len2)

# Calculate pidentity
pidentity = (matches / alignment_length) * 100 if alignment_length > 0 else 0
return pidentity

def identity_all(self,x,ll):
'''
Expand Down Expand Up @@ -168,8 +190,10 @@ def cluster_adj(self):
'''
Cluster adjacent matches into arrays
'''
pd.set_option('display.max_rows', None) # None means unlimited
pd.set_option('display.max_columns', None) # Adjust as per your DataFrame's number of columns

logging.info('Clustering matches')
logging.info('Clustering repeats')

# Sort by position
self.df_overlap = self.df_overlap.sort_values('Min')
Expand All @@ -183,40 +207,35 @@ def cluster_adj(self):
self.master.clean()
sys.exit()

cluster_df_lst = []
cluster = 0
# For each contig
for i in set(self.df_overlap_compl['Acc']):
tmp = self.df_overlap_compl[self.df_overlap_compl['Acc'] == i]
self.df_overlap_compl["Repeat_type"] = self.df_overlap_compl["Repeat"].str.split(':').str[0]
#cluster all repeats with the same type, and then group those from the same type less than 100bp apart
self.df_overlap_compl = self.df_overlap_compl.sort_values(['Acc', 'Repeat_type', 'Min'])
#cluster those that are less than 100bp apart from the previous one (that is, min(n+1) - max(n) < 100)
#create a column with the difference between the Min value of the current row and the Max value of the previous row
self.df_overlap_compl['Min_diff'] = self.df_overlap_compl['Min'] - self.df_overlap_compl['Max'].shift(1)
#cluster those contiguous repeats that are less than 100bp apart (absolute value)
self.df_overlap_compl['Cluster'] = (self.df_overlap_compl['Min_diff'].abs() > self.master.max_dist).cumsum()

# iterate over the clusters, and compute similarity between the sequences of the cluster. remove rows with similarity <40% to any other sequence in the cluster
to_remove = []
clusters = self.df_overlap_compl.groupby('Cluster')
for name, group in clusters:
if len(group) > 1:
sequences = group["Sequence"].tolist()

for i in range(len(group)):

pos = tmp[['Min','Max']].values
cluster_list = []
# Loop over complete matches
for ind, k in enumerate(pos):
# Keep first match
if ind == 0:
cluster_list.append(cluster)
arrays_cluster = [k]
else:
# If match within Xbp of any previous, add match to current cluster
if min(self.dist_all(k, arrays_cluster)) <= self.master.max_dist:
cluster_list.append(cluster)
arrays_cluster.append(k)
# If match > Xbp from previous, initiate new cluster
else:
cluster += 1
arrays_cluster = [k]
cluster_list.append(cluster)

tmp.insert(len(tmp.columns), 'Cluster', cluster_list)
cluster_df_lst.append(tmp)

# Increment cluster ID for next acc
cluster += 1
similarities = self.identity_all(sequences[i], [x for j, x in enumerate(sequences) if j != i])
if not any([similarity >= 40 for similarity in similarities]):
to_remove.append(group.index[i])



self.df_overlap_compl = self.df_overlap_compl.drop(to_remove)
self.df_overlap_compl = self.df_overlap_compl.sort_values(['Acc','Min'])
self.df_overlap_compl = self.df_overlap_compl.reset_index(drop=True)
# If several contigs, concatenate
self.df_cluster = pd.concat(cluster_df_lst)
self.df_cluster = self.df_overlap_compl

def append_partial(self):
'''
Expand Down
2 changes: 2 additions & 0 deletions srufinder/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import pandas as pd

from Bio import SeqIO
from Bio import Align


class Controller(object):

Expand Down