Skip to content

Commit

Permalink
Fix important bug in expand function. I also move the function, so we…
Browse files Browse the repository at this point in the history
… cannot see the change, See below for bug.

Add variable for files, and when re-running check if file exist and do not re-run hmmer or infernal if file already exists.

Bug was in search_left:

            window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON
	    window_end = (window_beg + 200) % SIZE_REPLICON

where in the second line window_beg does not have the expected value anymore as we change it the line before.

I change this to:

            window_end = (window_beg + 200) % SIZE_REPLICON
            window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON

And any occurrences of similar problem.

I change some file name
  • Loading branch information
jeanrjc committed Jul 30, 2015
1 parent fe8a627 commit 8b99b21
Showing 1 changed file with 92 additions and 72 deletions.
164 changes: 92 additions & 72 deletions integron_finder
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def search_attc(attc_df, keep_palindromes):
return attc_array


def find_integron(replicon_name, attc_file, intI_file, phage_int_file):
def find_integron(replicon_name, attc_file, intI_file, phageI_file):
""" Function that looks for integrons given rules :
- presence of intI
- presence of attC
Expand All @@ -527,7 +527,7 @@ def find_integron(replicon_name, attc_file, intI_file, phage_int_file):
intI = read_hmm(replicon_name, intI_file)
intI.sort(["Accession_number", "pos_beg", "evalue"], inplace=True)

phageI = read_hmm(replicon_name, phage_int_file)
phageI = read_hmm(replicon_name, phageI_file)
phageI.sort(["Accession_number", "pos_beg", "evalue"], inplace=True)

tmp = intI[intI.ID_prot.isin(phageI.ID_prot)].copy()
Expand Down Expand Up @@ -627,7 +627,7 @@ def find_integron(replicon_name, attc_file, intI_file, phage_int_file):
intI_ac.pos_end.values[i],
id_int,
int(intI_ac.strand.values[i]),
intI_ac.evalue.values[i], intI_ac.query_name.values[i])
intI_ac.evalue.values[i], intI_ac.query_replicon_name.values[i])

if n_attc_array > 0: # after the integrase loop (<=> no more integrases)
for attc_array in attc_ac:
Expand Down Expand Up @@ -671,65 +671,6 @@ def find_integron(replicon_name, attc_file, intI_file, phage_int_file):

return integrons

def expand(window_beg, window_end, max_elt, df_max, search_left=False, search_right=False):
# for a given element, we can search on the left hand side (if integrase is on the right for instance)
# or right hand side (opposite situation) or both side (only integrase or only attC sites)
wb = window_beg
we = window_end
if search_right:

if circular:
window_beg = (window_end - 200) % SIZE_REPLICON # 200 to allow the detection of sites that would overlap 2 consecutive windows
window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON
else:
window_beg = max(0, window_end - 200) # 200 to allow the detection of sites that would overlap 2 consecutive windows
window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD)

searched_strand = "both" if search_left else "top" # search on both strands if search in both directions

while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON:

df_max = local_max(replicon_name, window_beg, window_end, searched_strand)
max_elt = pd.concat([max_elt, df_max])

if circular:
window_beg = (window_end - 200) % SIZE_REPLICON
window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON
else:
window_beg = max(0, window_end - 200)
window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD)

# re-initialize in case we enter search left too.
df_max = max_elt.copy()
window_beg = wb
window_end = we

if search_left:
if circular:
window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON
window_end = (window_beg + 200) % SIZE_REPLICON
else:
window_beg = max(0, window_beg - DISTANCE_THRESHOLD)
window_end = min(SIZE_REPLICON, window_beg + 200)

searched_strand = "both" if search_right else "bottom"

while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON:

df_max = local_max(replicon_name, window_beg, window_end, searched_strand)
max_elt = pd.concat([max_elt, df_max]) # update of attC list of hits.

if circular:
window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON
window_end = (window_beg + 200) % SIZE_REPLICON
else:
window_beg = max(0, window_beg - DISTANCE_THRESHOLD)
window_end = min(SIZE_REPLICON, window_beg + 200)

max_elt.drop_duplicates(inplace=True)
max_elt.index = range(len(max_elt))
return max_elt

def find_attc_max(integrons, outfile="attC_max_1.res"):
"""
Look for attC site with cmsearch --max option wich remove all heuristic filters.
Expand Down Expand Up @@ -864,6 +805,69 @@ def find_attc_max(integrons, outfile="attC_max_1.res"):

return max_final

def expand(window_beg, window_end, max_elt, df_max, search_left=False, search_right=False):
# for a given element, we can search on the left hand side (if integrase is on the right for instance)
# or right hand side (opposite situation) or both side (only integrase or only attC sites)
wb = window_beg
we = window_end

if search_right:

if circular:
window_beg = (window_end - 200) % SIZE_REPLICON # 200 to allow the detection of sites that would overlap 2 consecutive windows
window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON
else:
window_beg = max(0, window_end - 200) # 200 to allow the detection of sites that would overlap 2 consecutive windows
window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD)

searched_strand = "both" if search_left else "top" # search on both strands if search in both directions

while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON:

df_max = local_max(replicon_name, window_beg, window_end, searched_strand)
max_elt = pd.concat([max_elt, df_max])

if circular:
window_beg = (window_end - 200) % SIZE_REPLICON
window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON
else:
window_beg = max(0, window_end - 200)
window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD)

# re-initialize in case we enter search left too.
df_max = max_elt.copy()
window_beg = wb
window_end = we

if search_left:

if circular:
window_end = (window_beg + 200) % SIZE_REPLICON
window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON

else:
window_beg = max(0, window_beg - DISTANCE_THRESHOLD)
window_end = min(SIZE_REPLICON, window_beg + 200)

searched_strand = "both" if search_right else "bottom"

while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON:

df_max = local_max(replicon_name, window_beg, window_end, searched_strand)
max_elt = pd.concat([max_elt, df_max]) # update of attC list of hits.

if circular:
window_end = (window_beg + 200) % SIZE_REPLICON
window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON

else:
window_end = min(SIZE_REPLICON, window_beg + 200)
window_beg = max(0, window_beg - DISTANCE_THRESHOLD)

max_elt.drop_duplicates(inplace=True)
max_elt.index = range(len(max_elt))
return max_elt


def local_max(replicon_name, window_beg, window_end, strand_search="both"):

Expand Down Expand Up @@ -1504,31 +1508,47 @@ Please install prodigal package or setup 'prodigal' binary path with --prodigal

############### Default search ###############

intI_file = os.path.join(out_dir, replicon_name + "_intI.res")
phageI_file = os.path.join(out_dir, replicon_name + "_phage_int.res")
attC_default_file = os.path.join(out_dir, replicon_name + "_attc_table.res")

if args.no_proteins == False:
find_integrase(replicon_path, replicon_name, out_dir)
if (os.path.isfile(intI_file) == 0 or
os.path.isfile(phageI_file) == 0):

find_integrase(replicon_path, replicon_name, out_dir)

print "\n>>> Starting Default search ... :"

find_attc(replicon_path, replicon_name, out_dir)
print "\n>>> Starting Default search ... :"
if os.path.isfile(attC_default_file) == 0:
find_attc(replicon_path, replicon_name, out_dir)

print ">>> Default search done... : \n"
integrons = find_integron(replicon_name,
os.path.join(out_dir, replicon_name + "_attc_table.res"),
os.path.join(out_dir, replicon_name + "_intI.res"),
os.path.join(out_dir, replicon_name + "_phage_int.res"))
attC_default_file,
intI_file,
phageI_file)

############### Search with Eagle Eyes ###############
if args.eagle_eyes:

print "\n>>>>>> Starting search with Eagle's eyes...:"
integron_max = find_attc_max(integrons)
if os.path.isfile(os.path.join(out_dir, "integron_max.pickle")) == 0:


print ">>>>>> Search with Eagle's eyes done... : \n"
integron_max = find_attc_max(integrons)
integron_max.to_pickle(os.path.join(out_dir, "integron_max.pickle"))
print ">>>>>> Search with Eagle's eyes done... : \n"

else:
integron_max = pd.read_pickle(os.path.join(out_dir,
"integron_max.pickle"))
print ">>>>>> Search with Eagle's eyes was already done, continue... : \n"

integrons = find_integron(replicon_name,
integron_max,
os.path.join(out_dir, replicon_name + "_intI.res"),
os.path.join(out_dir, replicon_name + "_phage_int.res"))
intI_file,
phageI_file)


############### Add promoters and attI ###############
Expand Down

0 comments on commit 8b99b21

Please sign in to comment.