From 8a74fcf5452f17f789b8461d0a0716b0bb7737b1 Mon Sep 17 00:00:00 2001 From: codegodz Date: Sat, 15 Jul 2023 14:01:46 +0200 Subject: [PATCH 1/2] update -x --- graphStats.jl | 29 ++++++++++++++++++ graphite.jl | 83 ++++++++++++++++++++++++++++++++++++-------------- main.jl | 1 + suffixArray.jl | 32 ++++++++++++++----- test.jl | 36 ++++++++++++++++++++++ 5 files changed, 150 insertions(+), 31 deletions(-) create mode 100644 graphStats.jl create mode 100644 test.jl diff --git a/graphStats.jl b/graphStats.jl new file mode 100644 index 0000000..880e1c7 --- /dev/null +++ b/graphStats.jl @@ -0,0 +1,29 @@ + + +function parse_numbers(path::AbstractString) + numbers = Vector{Int32}() + for node in split(path, " ") + number = parse(Int32, view(node, 1:length(node)-1)) + # number = node[end] == "-" ? flipnode(number) : noflip(number) + push!(numbers, number) + end + return numbers +end + +function get_stats_gfa2(f::String) + h = open(f, "r") + unique_nodes = Set{Int}() + edges = 0 + for line in eachline(h) + identifier, path = split(line, "\t") + node_ids = parse_numbers(path) + for node in node_ids + push!(unique_nodes, node) + end + edges += length(node_ids)-1 + end + println("Nodes: ", length(unique_nodes)) + println("Edges: ", edges) +end + +get_stats_gfa2("/net/phage/linuxhome/mgx/people/rickb/Campy/complete_genomees/graph/complete_campy_graph_gfa.gfa2") \ No newline at end of file diff --git a/graphite.jl b/graphite.jl index d3b0ab3..db5e2e2 100644 --- a/graphite.jl +++ b/graphite.jl @@ -1,3 +1,6 @@ +using ProgressMeter +using Dates + const MASK = Int32(1<<30) flipnode(n::Int32) = n ⊻ MASK @@ -18,9 +21,23 @@ struct Color k_size::Int32 end -function update_color!(color::Color, ref_id::Int32, match_start::Int32, match_size::Int32, ca::Vector{Int32}) +# 175378 +# 175378 +function update_color!(color::Color, ref_id::Int32, match_start::Int32, match_size::Int32, ca::Vector{Int32}, ca_coords::UnitRange{Int32}) match_end = match_start+match_size-1 + println("Got match: ", match_start, " -> ", 
match_end) + # Check if this is masked as query + # println( ca_coords[1] != Int32(0)) + # println(match_start >= ca_coords[1]) + # println(match_end <= ca_coords[2], " ", match_end, " and ", ca_coords[2] ) + # println(ca_coords) + if ca_coords[1] != Int32(0) && match_start >= ca_coords.start && match_end <= ca_coords.stop + # println("Same as query skip") + return + end + + # println("Passed as non query") # Get info from pervious matches at_start = color.origin[match_start] @@ -91,7 +108,7 @@ function check_this_point(ca::Vector{Int32}, sa::Vector{Int32}, ref::AbstractVec return match_size end -function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, lcp::Vector{Int32}, point::Int32, forward::Bool, ref_start::Int32, match_size::Int32, ref_id::Int32, color::Color) +function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, lcp::Vector{Int32}, point::Int32, forward::Bool, ref_start::Int32, match_size::Int32, ref_id::Int32, color::Color, ca_coords::UnitRange{Int32}) move_dir = forward ? 1 : -1 lcp_dir = forward ? 
0 : 1 # println("Extending this point") @@ -102,12 +119,12 @@ function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{In start_check_from = Int32(min(lcp[i + lcp_dir], match_size)) # Check the size of this match starting from +1 of the LCP value) match_size = check_this_point(ca, sa, ref, ref_start, Int32(i), start_check_from ) - update_color!(color, ref_id, sa[i], Int32(match_size), ca) + update_color!(color, ref_id, sa[i], Int32(match_size), ca, ca_coords) i += move_dir end end -function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}) +function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}, ca_coords::UnitRange{Int32}) # println(ref_id, " at: ", ref[307]) max_match_index = Int32(0) max_match_size = Int32(0) @@ -121,16 +138,16 @@ function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32} # If we have a match keep using the linked list to extend if max_match_size > 0 while ref_start <= length(ref) - # Check the match size at this point + # Check the match size at this point max_match_size = check_this_point(ca, sa, ref, ref_start, max_match_index, Int32(max_match_size-1)) # skip k-1 # If we don't have any match we don't have to check the flanks max_match_size == 0 && break - update_color!(color, ref_id, sa[max_match_index], Int32(max_match_size), ca) + update_color!(color, ref_id, sa[max_match_index], Int32(max_match_size), ca, ca_coords) # Check up and down in suffix array for other matches - extend_from_point!(ca, sa, ref, lcp, max_match_index, false, ref_start, Int32(max_match_size), ref_id, color) - extend_from_point!(ca, sa, ref, lcp, max_match_index, true, ref_start, Int32(max_match_size), ref_id, color) + extend_from_point!(ca, sa, ref, lcp, max_match_index, false, ref_start, Int32(max_match_size), ref_id, color, ca_coords) + 
extend_from_point!(ca, sa, ref, lcp, max_match_index, true, ref_start, Int32(max_match_size), ref_id, color, ca_coords) # # Move to next location in suffix array for the second around max_match_index = inv_perm_sa[sa[max_match_index]+1] @@ -143,40 +160,60 @@ function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32} end end -function align_forward_and_reverse(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}) +function align_forward_and_reverse(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}, ca_coords::UnitRange{Int32}) # First do the forward align - align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp) + align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp, ca_coords) # Flip the nodes and reverse to do the reverse alignment reverse_complement_ref!(ref) - align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp) + align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp, ca_coords) end function run(gfa::String, seq_file::String, query_file::String, k_size::Int32, out_file::String; blacklist::String = "") - + start_time = now() blacklist_ids = !isempty(blacklist) ? 
read_ids_from_file(blacklist) : OrderedSet{String}() - queries, query_ids = processGFA(gfa, query_file) - - ca, sa = create_k_suffix_array(queries, Int32(0)) + println("Reading GFA...") + queries, query_ids, query_ids_numerical = processGFA(gfa, query_file) + + println("Building data structures (SA, LCP, and ISA") + ca, query_offsets, sa = create_k_suffix_array(queries, query_ids_numerical, Int32(0)) inv_sa_perm = inverse_perm_sa(sa) lcp = build_lcp(sa, ca) - + println("Reading node sizes...") size_map = read_node_sizes(seq_file) len = zeros(Int32, length(ca)) ori = [Origin(-1,-1) for i in 1:length(ca)] # Vector{Origin}(undef, length(ca)) color = Color(len, ori, size_map, k_size) + println("Aligning references to the suffix array") + + prog = ProgressUnknown("References read: ") for (ref_id, line) in enumerate(eachline(gfa)) - identifier, path = split(line, "\t") - if !(identifier in query_ids) && !(identifier in blacklist_ids) - path_numbers = parse_numbers(path) - align_forward_and_reverse(Int32(ref_id), color, ca, sa, path_numbers, inv_sa_perm, lcp) - end + identifier, path = split(line, "\t") + # Skip if blacklisted + identifier in blacklist_ids && continue + # Get the mask coordinates in the CA array if this is a query + ca_coords = identifier in query_ids ? 
query_offsets[ref_id] : Int32(0):Int32(0) + #if ca_coords[1] != Int32(0) + #println("Got coords: ", ca_coords, "id: ", identifier, " ref id: ", ref_id) + + path_numbers = parse_numbers(path) + align_forward_and_reverse(Int32(ref_id), color, ca, sa, path_numbers, inv_sa_perm, lcp, ca_coords) + println(ca_coords) + + + # end + + + + ProgressMeter.next!(prog) + end - + ProgressMeter.finish!(prog) + println("Writing output") writeResults(ca, color, query_ids, out_file, size_map) - + println("Elapsed: ", now() - start_time) end # run("test", "test", "test", Int32(31), "test") diff --git a/main.jl b/main.jl index acd37a7..fa2867d 100644 --- a/main.jl +++ b/main.jl @@ -1,5 +1,6 @@ using OrderedCollections using ArgParse +using IntervalTrees include("./graphite.jl") include("./graph_io.jl") diff --git a/suffixArray.jl b/suffixArray.jl index 4894f10..707ba9a 100644 --- a/suffixArray.jl +++ b/suffixArray.jl @@ -2,7 +2,9 @@ const LIBSAIS = "libsais-2.7.1/libsais.so.2" -function concat_with_seperator(vectors::Vector{Vector{Int32}}) + +function concat_with_seperator(vectors::Vector{Vector{Int32}},identifiers::Vector{Int32}) + length(vectors) == length(identifiers) || error("Vectors != identifiers") # Probably a cleaner way to do this :) Like using map # to flip the node ids in slices and copying the slices to the output total_size = sum(map(length, vectors)) @@ -10,16 +12,30 @@ function concat_with_seperator(vectors::Vector{Vector{Int32}}) vect_id = -1 * length(vectors) # to have it decending for sorting # Concat with seperator + store sign in most significant bit i = 1 + identifier_i = 1 + # Sometimes we might want to align back to the other queries just + # not to itself. However the relationship between a sequence and it's suffixes + # is not directly evident as there are multiple suffixes along the SA that + # belong to the same query, likely all scattered. 
+ query_offsets = Dict{Int32, UnitRange{Int32}}() + last_vect_id = Int32(1) + last_start = Int32(1) + @inbounds for v in vectors for node_id in v concat_arr[i] = node_id - i +=1 + i += 1 end + # Get the vector id offsets and store them in the query offsets dict + query_offsets[identifiers[identifier_i]] = last_start:i-1 concat_arr[i] = vect_id - vect_id +=1 - i +=1 + vect_id += 1 + i += 1 + last_vect_id = vect_id + last_start = i + identifier_i += 1 end - return concat_arr + return concat_arr, query_offsets end function create_suffix_array(in_vector::Vector{Int32}, free_space::Int32) @@ -33,10 +49,10 @@ function create_suffix_array(in_vector::Vector{Int32}, free_space::Int32) return out_vector end -function create_k_suffix_array(vectors::Vector{Vector{Int32}}, free_space::Int32) - concat_array = concat_with_seperator(vectors) +function create_k_suffix_array(vectors::Vector{Vector{Int32}}, query_identifiers::Vector{Int32}, free_space::Int32) + concat_array, query_offsets = concat_with_seperator(vectors, query_identifiers) suffix_array = create_suffix_array(concat_array, free_space) - return concat_array, suffix_array + return concat_array, query_offsets, suffix_array end function build_lcp(sa::Vector{Int32}, V::Vector{Int32}, base::Integer=1) diff --git a/test.jl b/test.jl new file mode 100644 index 0000000..838d689 --- /dev/null +++ b/test.jl @@ -0,0 +1,36 @@ +function concat_with_seperator(vectors::Vector{Vector{Int32}},identifiers::Vector{Int32}) + length(vectors) == length(identifiers) || error("Vectors != identifiers") + # Probably a cleaner way to do this :) Like using map + # to flip the node ids in slices and copying the slices to the output + total_size = sum(map(length, vectors)) + concat_arr = zeros(Int32, total_size + length(vectors)) + vect_id = -1 * length(vectors) # to have it decending for sorting + # Concat with seperator + store sign in most significant bit + i = 1 + identifier_i = 1 + # Sometimes we might want to align back to the other queries 
just + # not to itself. However the relationship between a sequence and it's suffixes + # is not directly evident as there are multiple suffixes along the SA that + # belong to the same query, likely all scattered. + query_offsets = Dict{Int32, UnitRange{Int32}}() + last_vect_id = Int32(1) + last_start = Int32(1) + + @inbounds for v in vectors + for node_id in v + concat_arr[i] = node_id + i += 1 + end + # Get the vector id offsets and store them in the query offsets dict + query_offsets[identifiers[identifier_i]] = last_start:i-1 + concat_arr[i] = vect_id + vect_id += 1 + i += 1 + last_vect_id = vect_id + last_start = i + identifier_i += 1 + end + return concat_arr, query_offsets +end + +concat_with_seperator( [Int32[1,2,3,4,5], Int32[100,200,100], Int32[10000,10000] ], [Int32(1), Int32(5), Int32(9)]) \ No newline at end of file From ae3510704d884fae0cd97d9d5c091963b60a113c Mon Sep 17 00:00:00 2001 From: codegodz Date: Sat, 15 Jul 2023 14:02:06 +0200 Subject: [PATCH 2/2] graph io update --- graph_io.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/graph_io.jl b/graph_io.jl index ccc616d..d5fd710 100644 --- a/graph_io.jl +++ b/graph_io.jl @@ -11,15 +11,19 @@ end function read_queries(f::String, query_ids::OrderedSet{String}) queries = Vector{Vector{Int32}}() query_ids_file_order = OrderedSet{String}() # Might find in different order in the file than query_ids + query_ids_numerical = Vector{Int32}() for (i, line) in enumerate(eachline(f)) identifier, path = split(line, "\t") if identifier in query_ids path_numbers = parse_numbers(path) push!(query_ids_file_order, identifier) push!(queries, path_numbers) + push!(query_ids_numerical, i) + length(query_ids) == length(query_ids_file_order) && break end end - return queries, query_ids_file_order + println("Read: ", length(query_ids_file_order), " queries") + return queries, query_ids_file_order, query_ids_numerical end function read_node_sizes(f::String) @@ -44,8 +48,8 @@ end function 
processGFA(gfa::String, query_file::String) # Read the query ids from the file query_ids = read_ids_from_file(query_file) - queries, query_ids = read_queries(gfa, query_ids) - return queries, query_ids + queries, query_ids, query_ids_numerical = read_queries(gfa, query_ids) + return queries, query_ids, query_ids_numerical end function writeResults(ca::Vector{Int32}, color::Color, query_ids::OrderedSet{String}, out_file::String, size_map::Dict{Int32, Int32})