Allow other queries #1

Open
wants to merge 2 commits into master
29 changes: 29 additions & 0 deletions graphStats.jl
@@ -0,0 +1,29 @@


# Parse a space-separated path of oriented node ids (e.g. "12+ 7- 3+"),
# dropping the trailing orientation character of each node.
function parse_numbers(path::AbstractString)
numbers = Vector{Int32}()
for node in split(path, " ")
number = parse(Int32, view(node, 1:length(node)-1))
# number = node[end] == '-' ? flipnode(number) : noflip(number)
push!(numbers, number)
end
return numbers
end

function get_stats_gfa2(f::String)
h = open(f, "r")
unique_nodes = Set{Int}()
edges = 0
for line in eachline(h)
identifier, path = split(line, "\t")
node_ids = parse_numbers(path)
for node in node_ids
push!(unique_nodes, node)
end
# A path over n nodes contributes n-1 edges
edges += length(node_ids)-1
end
close(h)
println("Nodes: ", length(unique_nodes))
println("Edges: ", edges)
end

get_stats_gfa2("/net/phage/linuxhome/mgx/people/rickb/Campy/complete_genomees/graph/complete_campy_graph_gfa.gfa2")
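A quick note on the input this script assumes: each line is a tab-separated record identifier followed by a space-separated list of oriented node ids. A minimal, hypothetical illustration (the identifier and node ids below are made up):

line = "genome_1\t12+ 7- 3+"
identifier, path = split(line, "\t")
parse_numbers(path)  # Int32[12, 7, 3]; the trailing +/- orientation characters are dropped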
10 changes: 7 additions & 3 deletions graph_io.jl
@@ -11,15 +11,19 @@ end
function read_queries(f::String, query_ids::OrderedSet{String})
queries = Vector{Vector{Int32}}()
query_ids_file_order = OrderedSet{String}() # Queries may appear in the file in a different order than in query_ids
query_ids_numerical = Vector{Int32}()
for (i, line) in enumerate(eachline(f))
identifier, path = split(line, "\t")
if identifier in query_ids
path_numbers = parse_numbers(path)
push!(query_ids_file_order, identifier)
push!(queries, path_numbers)
push!(query_ids_numerical, i)
length(query_ids) == length(query_ids_file_order) && break
end
end
return queries, query_ids_file_order
println("Read: ", length(query_ids_file_order), " queries")
return queries, query_ids_file_order, query_ids_numerical
end

function read_node_sizes(f::String)
@@ -44,8 +48,8 @@ end
function processGFA(gfa::String, query_file::String)
# Read the query ids from the file
query_ids = read_ids_from_file(query_file)
queries, query_ids = read_queries(gfa, query_ids)
return queries, query_ids
queries, query_ids, query_ids_numerical = read_queries(gfa, query_ids)
return queries, query_ids, query_ids_numerical
end
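For context, a hedged sketch of how processGFA is meant to be called; the file names are hypothetical, and the query-file format (read_ids_from_file is not shown in this diff) is assumed to be one identifier per line:

queries, query_ids, query_ids_numerical = processGFA("graph.gfa", "queries.txt")
# queries: Vector{Vector{Int32}} of node-id paths, in the order found in the GFA
# query_ids: OrderedSet{String} of query identifiers in file order
# query_ids_numerical: GFA line numbers of the queries, used later to look up their offsets in the concatenated array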

function writeResults(ca::Vector{Int32}, color::Color, query_ids::OrderedSet{String}, out_file::String, size_map::Dict{Int32, Int32})
83 changes: 60 additions & 23 deletions graphite.jl
@@ -1,3 +1,6 @@
using ProgressMeter
using Dates

const MASK = Int32(1<<30)

flipnode(n::Int32) = n ⊻ MASK
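As a brief aside, a node's orientation is encoded by toggling bit 30 of its id; a minimal illustration (the node id is arbitrary):

n = Int32(5)
r = flipnode(n)      # 5 ⊻ 2^30 == 1073741829, i.e. node 5 in reverse orientation
flipnode(r) == n     # true: XOR with MASK is its own inverse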
@@ -18,9 +21,23 @@ struct Color
k_size::Int32
end

function update_color!(color::Color, ref_id::Int32, match_start::Int32, match_size::Int32, ca::Vector{Int32})
function update_color!(color::Color, ref_id::Int32, match_start::Int32, match_size::Int32, ca::Vector{Int32}, ca_coords::UnitRange{Int32})

match_end = match_start+match_size-1
# Skip matches that fall entirely inside this query's own region of the
# concatenated array; ca_coords of 0:0 means "not a query"
if ca_coords[1] != Int32(0) && match_start >= ca_coords.start && match_end <= ca_coords.stop
return
end

# Get info from previous matches
at_start = color.origin[match_start]
@@ -91,7 +108,7 @@ function check_this_point(ca::Vector{Int32}, sa::Vector{Int32}, ref::AbstractVec
return match_size
end

function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, lcp::Vector{Int32}, point::Int32, forward::Bool, ref_start::Int32, match_size::Int32, ref_id::Int32, color::Color)
function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, lcp::Vector{Int32}, point::Int32, forward::Bool, ref_start::Int32, match_size::Int32, ref_id::Int32, color::Color, ca_coords::UnitRange{Int32})
move_dir = forward ? 1 : -1
lcp_dir = forward ? 0 : 1
# println("Extending this point")
@@ -102,12 +119,12 @@ function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{In
start_check_from = Int32(min(lcp[i + lcp_dir], match_size))
# Check the size of this match, starting the comparison from the LCP value
match_size = check_this_point(ca, sa, ref, ref_start, Int32(i), start_check_from )
update_color!(color, ref_id, sa[i], Int32(match_size), ca)
update_color!(color, ref_id, sa[i], Int32(match_size), ca, ca_coords)
i += move_dir
end
end

function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32})
function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}, ca_coords::UnitRange{Int32})
# println(ref_id, " at: ", ref[307])
max_match_index = Int32(0)
max_match_size = Int32(0)
@@ -121,16 +138,16 @@ function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}
# If we have a match keep using the linked list to extend
if max_match_size > 0
while ref_start <= length(ref)
# Check the match size at this point
max_match_size = check_this_point(ca, sa, ref, ref_start, max_match_index, Int32(max_match_size-1)) # skip k-1

# If we don't have any match we don't have to check the flanks
max_match_size == 0 && break
update_color!(color, ref_id, sa[max_match_index], Int32(max_match_size), ca)
update_color!(color, ref_id, sa[max_match_index], Int32(max_match_size), ca, ca_coords)

# Check up and down in suffix array for other matches
extend_from_point!(ca, sa, ref, lcp, max_match_index, false, ref_start, Int32(max_match_size), ref_id, color)
extend_from_point!(ca, sa, ref, lcp, max_match_index, true, ref_start, Int32(max_match_size), ref_id, color)
extend_from_point!(ca, sa, ref, lcp, max_match_index, false, ref_start, Int32(max_match_size), ref_id, color, ca_coords)
extend_from_point!(ca, sa, ref, lcp, max_match_index, true, ref_start, Int32(max_match_size), ref_id, color, ca_coords)

# # Move to next location in suffix array for the second around
max_match_index = inv_perm_sa[sa[max_match_index]+1]
@@ -143,40 +160,60 @@ end
end
end

function align_forward_and_reverse(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32})
function align_forward_and_reverse(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}, ca_coords::UnitRange{Int32})
# First do the forward align
align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp)
align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp, ca_coords)
# Flip the nodes and reverse to do the reverse alignment
reverse_complement_ref!(ref)
align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp)
align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp, ca_coords)
end

function run(gfa::String, seq_file::String, query_file::String, k_size::Int32, out_file::String; blacklist::String = "")

start_time = now()
blacklist_ids = !isempty(blacklist) ? read_ids_from_file(blacklist) : OrderedSet{String}()

queries, query_ids = processGFA(gfa, query_file)

ca, sa = create_k_suffix_array(queries, Int32(0))
println("Reading GFA...")
queries, query_ids, query_ids_numerical = processGFA(gfa, query_file)

println("Building data structures (SA, LCP, and ISA")
ca, query_offsets, sa = create_k_suffix_array(queries, query_ids_numerical, Int32(0))
inv_sa_perm = inverse_perm_sa(sa)
lcp = build_lcp(sa, ca)


println("Reading node sizes...")
size_map = read_node_sizes(seq_file)
len = zeros(Int32, length(ca))
ori = [Origin(-1,-1) for i in 1:length(ca)] # Vector{Origin}(undef, length(ca))
color = Color(len, ori, size_map, k_size)
println("Aligning references to the suffix array")

prog = ProgressUnknown("References read: ")

for (ref_id, line) in enumerate(eachline(gfa))
identifier, path = split(line, "\t")
if !(identifier in query_ids) && !(identifier in blacklist_ids)
path_numbers = parse_numbers(path)
align_forward_and_reverse(Int32(ref_id), color, ca, sa, path_numbers, inv_sa_perm, lcp)
end
identifier, path = split(line, "\t")
# Skip if blacklisted
identifier in blacklist_ids && continue
# Get the mask coordinates in the CA array if this is a query (0:0 means not a query)
ca_coords = identifier in query_ids ? query_offsets[ref_id] : Int32(0):Int32(0)

path_numbers = parse_numbers(path)
align_forward_and_reverse(Int32(ref_id), color, ca, sa, path_numbers, inv_sa_perm, lcp, ca_coords)

ProgressMeter.next!(prog)
end

ProgressMeter.finish!(prog)
println("Writing output")
writeResults(ca, color, query_ids, out_file, size_map)

println("Elapsed: ", now() - start_time)
end

# run("test", "test", "test", Int32(31), "test")
1 change: 1 addition & 0 deletions main.jl
@@ -1,5 +1,6 @@
using OrderedCollections
using ArgParse
using IntervalTrees

include("./graphite.jl")
include("./graph_io.jl")
32 changes: 24 additions & 8 deletions suffixArray.jl
@@ -2,24 +2,40 @@

const LIBSAIS = "libsais-2.7.1/libsais.so.2"

function concat_with_seperator(vectors::Vector{Vector{Int32}})

function concat_with_seperator(vectors::Vector{Vector{Int32}},identifiers::Vector{Int32})
length(vectors) == length(identifiers) || error("Vectors != identifiers")
# Probably a cleaner way to do this :) Like using map
# to flip the node ids in slices and copying the slices to the output
total_size = sum(map(length, vectors))
concat_arr = zeros(Int32, total_size + length(vectors))
vect_id = -1 * length(vectors) # to have it descending for sorting
# Concat with separator + store sign in most significant bit
i = 1
identifier_i = 1
# Sometimes we might want to align a query back to the other queries, just
# not to itself. However, the relationship between a sequence and its suffixes
# is not directly evident, as there are multiple suffixes along the SA that
# belong to the same query, likely scattered throughout.
query_offsets = Dict{Int32, UnitRange{Int32}}()
last_vect_id = Int32(1)
last_start = Int32(1)

@inbounds for v in vectors
for node_id in v
concat_arr[i] = node_id
i +=1
i += 1
end
# Get the vector id offsets and store them in the query offsets dict
query_offsets[identifiers[identifier_i]] = last_start:i-1
concat_arr[i] = vect_id
vect_id +=1
i +=1
vect_id += 1
i += 1
last_vect_id = vect_id
last_start = i
identifier_i += 1
end
return concat_arr
return concat_arr, query_offsets
end

function create_suffix_array(in_vector::Vector{Int32}, free_space::Int32)
@@ -33,10 +49,10 @@ function create_suffix_array(in_vector::Vector{Int32}, free_space::Int32)
return out_vector
end

function create_k_suffix_array(vectors::Vector{Vector{Int32}}, free_space::Int32)
concat_array = concat_with_seperator(vectors)
function create_k_suffix_array(vectors::Vector{Vector{Int32}}, query_identifiers::Vector{Int32}, free_space::Int32)
concat_array, query_offsets = concat_with_seperator(vectors, query_identifiers)
suffix_array = create_suffix_array(concat_array, free_space)
return concat_array, suffix_array
return concat_array, query_offsets, suffix_array
end

function build_lcp(sa::Vector{Int32}, V::Vector{Int32}, base::Integer=1)
36 changes: 36 additions & 0 deletions test.jl
@@ -0,0 +1,36 @@
function concat_with_seperator(vectors::Vector{Vector{Int32}},identifiers::Vector{Int32})
length(vectors) == length(identifiers) || error("Vectors != identifiers")
# Probably a cleaner way to do this :) Like using map
# to flip the node ids in slices and copying the slices to the output
total_size = sum(map(length, vectors))
concat_arr = zeros(Int32, total_size + length(vectors))
vect_id = -1 * length(vectors) # to have it descending for sorting
# Concat with separator + store sign in most significant bit
i = 1
identifier_i = 1
# Sometimes we might want to align a query back to the other queries, just
# not to itself. However, the relationship between a sequence and its suffixes
# is not directly evident, as there are multiple suffixes along the SA that
# belong to the same query, likely scattered throughout.
query_offsets = Dict{Int32, UnitRange{Int32}}()
last_vect_id = Int32(1)
last_start = Int32(1)

@inbounds for v in vectors
for node_id in v
concat_arr[i] = node_id
i += 1
end
# Get the vector id offsets and store them in the query offsets dict
query_offsets[identifiers[identifier_i]] = last_start:i-1
concat_arr[i] = vect_id
vect_id += 1
i += 1
last_vect_id = vect_id
last_start = i
identifier_i += 1
end
return concat_arr, query_offsets
end

concat_with_seperator( [Int32[1,2,3,4,5], Int32[100,200,100], Int32[10000,10000] ], [Int32(1), Int32(5), Int32(9)])
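Working the loop through by hand, the call above should yield (assuming the element types are preserved as written):

# concat_arr    == Int32[1, 2, 3, 4, 5, -3, 100, 200, 100, -2, 10000, 10000, -1]
# query_offsets == Dict{Int32, UnitRange{Int32}}(1 => 1:5, 5 => 7:9, 9 => 11:12)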