diff --git a/Project.toml b/Project.toml index b7e8d3c..eed5f6d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,21 +1,21 @@ name = "ProteinChains" uuid = "b8e8f2a5-48d3-44f1-ba0d-c71cb7726ff8" authors = ["Anton Oresten and contributors"] -version = "0.2.0" +version = "0.3.0" [deps] AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7" Backboner = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7" BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" -DynamicStructs = "e139c391-eeee-4818-b359-c8725224fb1f" +HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] -AssigningSecondaryStructure = "0.5" Backboner = "0.12" BioStructures = "4" -DynamicStructs = "0.1" +HDF5 = "0.17" LinearAlgebra = "1" PeriodicTable = "1" julia = "1" diff --git a/README.md b/README.md index c4249cf..f5fbc3d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Build Status](https://github.com/MurrellGroup/ProteinChains.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/ProteinChains.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/MurrellGroup/ProteinChains.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/MurrellGroup/ProteinChains.jl) -This Julia package provides implements the `ProteinChain` type: a GPU-friendly structure-of-arrays representation of protein chains. +This Julia package provides implements the `ProteinChain` type: a chain-level structure-of-arrays type representation of proteins, with support for indexing by residue index. ## Installation @@ -16,7 +16,7 @@ Pkg.add("ProteinChains") ## Examples -The `ProteinChain` type is meant to only store a set of quintessential fields, from which most other properties can be derived. +The `ProteinChain` type is meant to only store a basic set of fields, from which some other properties might be derived. ```julia julia> using ProteinChains @@ -24,43 +24,29 @@ julia> using ProteinChains julia> structure = pdb"1EYE" # string macro to fetch proteins from the PDB [ Info: Downloading file from PDB: 1EYE 1-chain ProteinStructure "1EYE.cif" - 256-residue ProteinChain{Float64} (A) + 256-residue ProteinChain{Float64, @NamedTuple{}} (A) julia> propertynames(chain) -(:id, :sequence, :backbone, :numbering, :atoms) +(:id, :atoms, :sequence, :numbering) ``` -To store additional properties, `AnnotatedProteinChain` can be used to add dynamic properties to the chain: +To store additional properties, `annotate` can be used to attach persistent chain-level properties or indexable residue-level properties: ```julia -julia> annotated_chain = annotate(chain; model=1) -256-residue AnnotatedProteinChain{Float64} (A): - 6 fields: - id::String = "A" - sequence::String = - backbone::Array{Float64,3} = - numbering::Vector{Int64} = - atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = - indexable_properties::Vector{Symbol} = Symbol[] - 1 property: - model::Int64 = 1 -``` +julia> chain = structure["A"] +256-residue ProteinChain{Float64, @NamedTuple{}} (A) -For properties of type `<:AbstractArray` that represent residue-level information, `annotate_indexable!` will index the last dimension of the property when the chain is indexed: +julia> annotated_chain = annotate(chain; taxid=ChainProperty(83332)) +256-residue ProteinChain{Float64, @NamedTuple{taxid::ChainProperty{Int64}}} (A) -```julia -julia> annotate_indexable!(annotated_chain; secondary_structure=assign_secondary_structure(annotated_chain) -256-residue AnnotatedProteinChain{Float64} (A): - 6 fields: - id::String = "A" - sequence::String = - backbone::Array{Float64,3} = - numbering::Vector{Int64} = - atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = - indexable_properties::Vector{Symbol} = [:secondary_structure] - 2 properties: - model::Int64 = 1 - secondary_structure::Vector{Int64} = [1, 1, 3, 3, 3, 3, 3, 3, 3, 1 … 2, 2, 2, 2, 2, 2, 2, 1, 1, 1] +julia> annotated_chain = annotate(annotated_chain; some_residue_property=ResidueProperty(rand(3,256))) # last dimension gets indexed +256-residue ProteinChain{Float64, @NamedTuple{ORGANISM_TAXID::ChainProperty{Int64}, some_residue_property::ResidueProperty{Matrix{Float64}}}} (A) + +julia> annotated_chain[1:100].some_residue_property +3×100 Matrix{Float64}: + 0.273545 0.639173 0.92708 … 0.459441 0.196407 0.880034 + 0.981498 0.70263 0.279264 0.552049 0.89274 0.0328866 + 0.169268 0.117848 0.732741 0.301921 0.187094 0.281187 ``` ## See also diff --git a/src/ProteinChains.jl b/src/ProteinChains.jl index 0fd42d2..07768ae 100644 --- a/src/ProteinChains.jl +++ b/src/ProteinChains.jl @@ -1,12 +1,8 @@ module ProteinChains -using DynamicStructs +using StaticArrays: SVector using Backboner -export Backbone -export ChainedBonds -export get_bond_lengths, get_bond_angles, get_torsion_angles -export Frames include("ideal.jl") export BackboneGeometry @@ -16,23 +12,25 @@ export append_residue export prepend_residue include("atom.jl") +export Atom +export encode_atom_name, decode_atom_name +export coords + +include("properties.jl") +export ChainProperty, ResidueProperty include("chain.jl") -export AbstractProteinChain export ProteinChain -export length +export annotate +export map_atoms! export psi_angles, omega_angles, phi_angles - -include("annotated-chain.jl") -export AnnotatedProteinChain -export annotate!, annotate -export annotate_indexable!, annotate_indexable +export get_atoms, get_backbone include("structure.jl") export ProteinStructure -include("secondary-structure.jl") -export assign_secondary_structure, assign_secondary_structure! +include("dataset.jl") +export ProteinDataset include("io/io.jl") export readcif, readpdb diff --git a/src/annotated-chain.jl b/src/annotated-chain.jl deleted file mode 100644 index 9454f14..0000000 --- a/src/annotated-chain.jl +++ /dev/null @@ -1,52 +0,0 @@ -@dynamic mutable struct AnnotatedProteinChain{T} <: AbstractProteinChain{T} - id::String - sequence::String - backbone::Array{T,3} - numbering::Vector{Int64} - atoms::Vector{Vector{Atom{T}}} - indexable_properties::Vector{Symbol} - - function AnnotatedProteinChain( - id::AbstractString, sequence::AbstractString, backbone::AbstractArray{T,3}, - numbering::AbstractVector{<:Integer}, atoms::Vector{Vector{Atom{T}}}, - indexable_properties=Symbol[]; kwargs... - ) where T - @assert length(sequence) == size(backbone, 3) == length(atoms) == length(numbering) - AnnotatedProteinChain{T}(id, sequence, backbone, atoms, numbering, indexable_properties; kwargs...) - end -end - -ProteinChain(chain::AbstractProteinChain) = ProteinChain(chain.id, chain.sequence, chain.backbone, chain.numbering, chain.atoms) -AnnotatedProteinChain(chain::AbstractProteinChain) = AnnotatedProteinChain(chain.id, chain.sequence, chain.backbone, chain.numbering, chain.atoms) - -function Base.getindex(chain::AnnotatedProteinChain, i::AbstractVector{<:Integer}) - properties = Dict{Symbol,Any}() - for name in getproperties(chain; fields=false) - value = getproperty(chain, name) - properties[name] = name in chain.indexable_properties ? collect(selectdim(value, ndims(value), i)) : value - end - return annotate(ProteinChain(chain)[i]; properties...) -end - -function annotate_indexable!(chain::AnnotatedProteinChain; annotations...) - for (name, value) in annotations - @assert value isa AbstractArray - @assert size(value)[end] == length(chain) - push!(chain.indexable_properties, name) - setproperty!(chain, name, value) - end - return chain -end - -annotate_indexable(chain::AnnotatedProteinChain; annotations...) = annotate_indexable!(deepcopy(chain); annotations...) -annotate_indexable(chain::ProteinChain; annotations...) = annotate_indexable!(AnnotatedProteinChain(chain); annotations...) - -function annotate!(chain::AnnotatedProteinChain; annotations...) - for (name, value) in annotations - setproperty!(chain, name, value) - end - return chain -end - -annotate(chain::AnnotatedProteinChain; annotations...) = annotate(deepcopy(chain); annotations...) -annotate(chain::ProteinChain; annotations...) = annotate!(AnnotatedProteinChain(chain); annotations...) diff --git a/src/atom.jl b/src/atom.jl index d893879..37f3b96 100644 --- a/src/atom.jl +++ b/src/atom.jl @@ -1,36 +1,35 @@ using PeriodicTable: elements -const ELEMENT_SYMBOL_TO_ATOMIC_NUMBER = Dict(uppercase(elements[atomic_number].symbol) => atomic_number for atomic_number in 1:118) -const ATOMIC_NUMBER_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_ATOMIC_NUMBER) +const ELEMENT_SYMBOL_TO_NUMBER = Dict(uppercase(elements[number].symbol) => number for number in 1:118) +const number_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_NUMBER) -element_symbol_to_atomic_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_ATOMIC_NUMBER[uppercase(strip(element_symbol))] -atomic_number_to_element_symbol(atomic_number::Integer) = ATOMIC_NUMBER_TO_ELEMENT_SYMBOL[atomic_number] +element_symbol_to_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_NUMBER[uppercase(strip(element_symbol))] +number_to_element_symbol(number::Integer) = number_TO_ELEMENT_SYMBOL[number] pad_atom_name(name::AbstractString, element_symbol::AbstractString) = rpad(" "^(2-length(strip(element_symbol)))*strip(name), 4) encode_atom_name(name::AbstractString, element_symbol::AbstractString) = reinterpret(UInt32, codeunits(pad_atom_name(name, element_symbol)))[1] decode_atom_name(name::UInt32) = String(reinterpret(UInt8, [name])) -mutable struct Atom{T<:AbstractFloat} +struct Atom{T<:AbstractFloat} name::UInt32 - atomic_number::Int8 + number::Int8 x::T y::T z::T end -Atom(name::UInt32, atomic_number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, atomic_number, x, y, z) -Atom(name::AbstractString, element_symbol::AbstractString, coords::AbstractVector{<:AbstractFloat}) = - Atom(encode_atom_name(name, element_symbol), element_symbol_to_atomic_number(element_symbol), coords...) +Atom{T}(atom::Atom) where T = Atom{T}(atom.name, atom.number, atom.x, atom.y, atom.z) -coords(atom::Atom) = [atom.x, atom.y, atom.z] +@inline Atom(name::UInt32, number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, number, x, y, z) +@inline Atom(name::AbstractString, element_symbol::AbstractString, x::T, y::T, z::T) where T = + Atom(encode_atom_name(name, element_symbol), element_symbol_to_number(element_symbol), x, y, z) -Base.summary(atom::Atom) = "$(elements[atom.atomic_number].name) atom at [$(atom.x), $(atom.y), $(atom.z)])" +@inline Atom(name, number, coords::AbstractVector{T}) where T = Atom(name, number, coords...) -function offset!(atom::Atom, coords::Vector{<:Real}) - @assert length(coords) == 3 - atom.x += coords[1] - atom.y += coords[2] - atom.z += coords[3] - return atom -end +coords(atom::Atom) = SVector(atom.x, atom.y, atom.z) + +Base.summary(atom::Atom) = "$(elements[atom.number].name) atom at [$(atom.x), $(atom.y), $(atom.z)])" + +Base.show(io::IO, atom::Atom{T}) where T = print(io, + "Atom(\"$(decode_atom_name(atom.name))\", \"$(number_to_element_symbol(atom.number))\", $(coords(atom)))") \ No newline at end of file diff --git a/src/chain.jl b/src/chain.jl index f2cfc18..38a69d7 100644 --- a/src/chain.jl +++ b/src/chain.jl @@ -1,58 +1,119 @@ """ - AbstractProteinChain{T<:AbstractFloat} + ProteinChain{T<:Real,Ps<:NamedProperties} +""" +struct ProteinChain{T<:Real,Ps<:NamedProperties} + id::String + atoms::Vector{Vector{Atom{T}}} + sequence::String + numbering::Vector{Int32} + properties::Ps +end -An abstract type representing a protein chain, with a type parameter -`T` that specifies the floating-point type of the coordinates. +function ProteinChain( + id, atoms::Vector{Vector{Atom{T}}}, sequence::String, numbering::Vector{<:Integer}, properties::Ps, +) where {T,Ps<:NamedProperties} + len = length(atoms) + @assert sizeof(sequence) == len + @assert length(numbering) == len + for property in properties + property isa ResidueProperty && @assert size(property[], ndims(property[])) == len + end + ProteinChain{T,Ps}(id, atoms, sequence, Int32.(numbering), properties) +end -## Subtypes -- `ProteinChain{T}`: A concrete subtype of `AbstractProteinChain` that represents a protein chain with a spe. -""" -abstract type AbstractProteinChain{T<:AbstractFloat} end +ProteinChain(id, atoms, sequence, numbering) = ProteinChain(id, atoms, sequence, numbering, (;)) -Base.summary(chain::AbstractProteinChain) = "$(length(chain))-residue $(typeof(chain)) ($(chain.id))" -Base.length(chain::AbstractProteinChain) = size(chain.backbone, 3) +Base.:(==)(chain1::ProteinChain, chain2::ProteinChain) = propertynames(chain1) == propertynames(chain2) && + !any(getproperty(chain1, name) != getproperty(chain2, name) for name in propertynames(chain1, false)) -mutable struct ProteinChain{T} <: AbstractProteinChain{T} - id::String - sequence::String - backbone::Array{T,3} - numbering::Vector{Int} - atoms::Vector{Vector{Atom{T}}} +Base.length(chain::ProteinChain) = length(chain.atoms) + +Base.getproperty(chain::ProteinChain, name::Symbol) = + name in fieldnames(ProteinChain) ? getfield(chain, name) : getfield(getfield(chain, :properties), name)[] + +Base.propertynames(chain::ProteinChain, private::Bool=false) = (setdiff(fieldnames(ProteinChain), private ? () : (:properties,))..., propertynames(chain.properties)...) + +function Base.getindex(chain::ProteinChain, i::AbstractVector) + properties = map(p -> p[i], chain.properties) + ProteinChain(chain.id, chain.atoms[i], chain.sequence[i], chain.numbering[i], properties) +end + +annotate(chain::ProteinChain; annotations...) = + ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, merge(chain.properties, annotations)) - function ProteinChain( - id::AbstractString, sequence::AbstractString, backbone::AbstractArray{T,3}, - numbering::AbstractVector{<:Integer} = collect(1:length(sequence)), - atoms::Vector{Vector{Atom{T}}} = [Atom{T}[] for _ in sequence], - ) where T - @assert length(sequence) == size(backbone, 3) == length(numbering) == length(atoms) - new{T}(id, sequence, backbone, numbering, atoms) +Base.summary(chain::ProteinChain) = "$(length(chain))-residue $(typeof(chain)) ($(chain.id))" + +# wrap io with IOContext(io, :compact=>false) to make parseable +function Base.show(io::IO, chain::ProteinChain) + print(io, "ProteinChain(") + for fieldname in fieldnames(ProteinChain) + show(io, getproperty(chain, fieldname)) + fieldname == :properties || print(io, ", ") end + print(io, ")") end -""" - ProteinChain{T<:AbstractFloat} -""" -ProteinChain +function Base.show(io::IO, ::MIME"text/plain", chain::ProteinChain) + print(io, summary(chain)) +end + +function get_atoms(backbone_coords::Array{T,3}) where T + @assert size(backbone_coords)[1:2] == (3,3) + atoms = [Vector{Atom{T}}(undef, 3) for _ in 1:size(backbone_coords, 3)] + for (i, slice) in enumerate(eachslice(backbone_coords, dims=3)) + for (j, pos) in enumerate(eachcol(slice)) + atoms[i][j] = Atom(BACKBONE_ATOM_NAMES[j], BACKBONE_ATOM_SYMBOLS[j], pos) + end + end + return atoms +end -Base.getindex(chain::ProteinChain, i::AbstractVector{<:Integer}) = - ProteinChain(chain.id, chain.sequence[i], chain.backbone[:,:,i], chain.numbering[i], chain.atoms[i]) +get_atoms(backbone::Backbone) = get_atoms(reshape(backbone.coords, 3, 3, :)) +get_atoms(chain::ProteinChain) = chain.atoms -import Backboner +function get_backbone(atoms::Vector{Vector{Atom{T}}}) where T + backbone_coords = Array{T,3}(undef, 3, 3, length(atoms)) + encoded_names = encode_atom_name.(BACKBONE_ATOM_NAMES, BACKBONE_ATOM_SYMBOLS) + for (i, residue_atoms) in enumerate(atoms) + for (j, name) in enumerate(encoded_names) + backbone_coords[:,j,i] = coords(argmax(atom -> atom.name == name, residue_atoms)) + end + end + return backbone_coords +end + +get_backbone(chain::ProteinChain) = hasproperty(chain, :backbone) ? chain.backbone : get_backbone(chain.atoms) -Backboner.Backbone(chain::AbstractProteinChain) = Backbone(chain.backbone) -Backboner.ChainedBonds(chain::AbstractProteinChain) = ChainedBonds(Backbone(chain)) -Backboner.Frames(chain::AbstractProteinChain, ideal_residue=STANDARD_RESIDUE) = Frames(Backbone(chain), ideal_residue) +Backboner.Backbone(chain::ProteinChain) = Backbone(get_backbone(chain)) +Backboner.ChainedBonds(chain::ProteinChain) = ChainedBonds(Backbone(chain)) +Backboner.Frames(chain::ProteinChain, ideal_residue=STANDARD_RESIDUE) = Frames(Backbone(chain), ideal_residue) -psi_angles(chain::AbstractProteinChain) = get_torsion_angles(Backbone(chain))[1:3:end] -omega_angles(chain::AbstractProteinChain) = get_torsion_angles(Backbone(chain))[2:3:end] -phi_angles(chain::AbstractProteinChain) = get_torsion_angles(Backbone(chain))[3:3:end] +psi_angles(chain::ProteinChain) = get_torsion_angles(Backbone(chain))[1:3:end] +omega_angles(chain::ProteinChain) = get_torsion_angles(Backbone(chain))[2:3:end] +phi_angles(chain::ProteinChain) = get_torsion_angles(Backbone(chain))[3:3:end] -function offset!(chain::AbstractProteinChain, coords::Vector{<:Real}) - chain.backbone .+= coords +function map_atoms!(f::Function, chain::ProteinChain, args...) for residue_atoms in chain.atoms - for atom in residue_atoms - offset!(atom, coords) + for i in eachindex(residue_atoms) + residue_atoms[i] = f(residue_atoms[i], args...) end end return chain end + +using AssigningSecondaryStructure: assign_secondary_structure + +calculate_property(x, name::Symbol, args...) = calculate_property(x, Val(name), args...) +annotate(chain::ProteinChain, names::Vararg{Symbol}) = annotate(chain; NamedTuple{names}(calculate_property(chain, name) for name in names)...) + +calculate_property(chain::ProteinChain, ::Val{:ideal_residue}) = collect(STANDARD_RESIDUE) |> ChainProperty +calculate_property(chain::ProteinChain, ::Val{:bond_lengths}) = Backboner.get_bond_lengths(Backboner.Backbone(get_backbone(chain))) |> ChainProperty +calculate_property(chain::ProteinChain, ::Val{:bond_angles}) = Backboner.get_bond_angles(Backboner.Backbone(get_backbone(chain))) |> ChainProperty +calculate_property(chain::ProteinChain, ::Val{:torsion_angles}) = Backboner.get_torsion_angles(Backboner.Backbone(get_backbone(chain))) |> ChainProperty +calculate_property(chain::ProteinChain, ::Val{:is_knotted}) = Backboner.is_knotted(Backboner.Backbone(get_backbone(chain)[:,2,:])) |> ChainProperty + +calculate_property(chain::ProteinChain, ::Val{:backbone}) = get_backbone(chain) |> ResidueProperty +calculate_property(chain::ProteinChain, ::Val{:secondary_structure}) = Int8.(assign_secondary_structure(get_backbone(chain))) |> ResidueProperty +calculate_property(chain::ProteinChain, ::Val{:residue_rotations}) = Backboner.Frames(Backbone(get_backbone(chain)), hasproperty(chain, :ideal_residue) ? chain.ideal_residue : STANDARD_RESIDUE).rotations |> ResidueProperty +calculate_property(chain::ProteinChain, ::Val{:residue_translations}) = dropdims(Backboner.centroid(get_backbone(chain); dims=2); dims=2) |> ResidueProperty +calculate_property(chain::ProteinChain{T}, ::Val{:residue_torsion_angles}) where T = [reshape(calculate_property(chain, :torsion_angles)[], 3, :) fill(T(NaN), 3)] |> ResidueProperty diff --git a/src/dataset.jl b/src/dataset.jl new file mode 100644 index 0000000..e67dfc3 --- /dev/null +++ b/src/dataset.jl @@ -0,0 +1,13 @@ +""" + ProteinDataset <: AbstractDict{String,ProteinStructure} +""" +struct ProteinDataset <: AbstractDict{String,ProteinStructure} + dict::Dict{String,ProteinStructure} +end + +ProteinDataset(structures::AbstractVector{<:ProteinStructure}) = + ProteinDataset(Dict(structure.name => structure for structure in structures)) + +Base.length(d::ProteinDataset) = length(d.dict) +Base.iterate(d::ProteinDataset, args...) = iterate(d.dict, args...) +Base.:(==)(d1::ProteinDataset, d2::ProteinDataset) = d1.dict == d2.dict diff --git a/src/ideal.jl b/src/ideal.jl index 1af4c6a..e51109b 100644 --- a/src/ideal.jl +++ b/src/ideal.jl @@ -34,21 +34,21 @@ the N, Ca, and C atom positions of a residue positioned at the origin. struct IdealResidue{T<:AbstractFloat} <: AbstractMatrix{T} backbone_geometry::BackboneGeometry N_Ca_C_coords::Matrix{T} +end - function IdealResidue{T}(backbone_geometry::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY; template=nothing) where T - N_Ca_C_coords = Matrix{T}(undef, 3, 3) - N_pos, Ca_pos, C_pos = eachcol(N_Ca_C_coords) - Θ = backbone_geometry.N_Ca_C_angle - π/2 - N_pos .= [0, 0, 0] - Ca_pos .= [backbone_geometry.N_Ca_length, 0, 0] - C_pos .= Ca_pos + backbone_geometry.Ca_C_length * [sin(Θ), cos(Θ), 0] - N_Ca_C_coords .-= Backboner.centroid(N_Ca_C_coords; dims=2) - if !isnothing(template) - wanted_orientation, current_offset, wanted_offset = Backboner.kabsch_algorithm(N_Ca_C_coords, template) - N_Ca_C_coords .= wanted_orientation * (N_Ca_C_coords .- current_offset) .+ wanted_offset - end - new{T}(backbone_geometry, N_Ca_C_coords) +function IdealResidue{T}(backbone_geometry::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY; template=nothing) where T + N_Ca_C_coords = Matrix{T}(undef, 3, 3) + N_pos, Ca_pos, C_pos = eachcol(N_Ca_C_coords) + Θ = backbone_geometry.N_Ca_C_angle - π/2 + N_pos .= [0, 0, 0] + Ca_pos .= [backbone_geometry.N_Ca_length, 0, 0] + C_pos .= Ca_pos + backbone_geometry.Ca_C_length * [sin(Θ), cos(Θ), 0] + N_Ca_C_coords .-= Backboner.centroid(N_Ca_C_coords; dims=2) + if !isnothing(template) + wanted_orientation, current_offset, wanted_offset = Backboner.kabsch_algorithm(N_Ca_C_coords, template) + N_Ca_C_coords .= wanted_orientation * (N_Ca_C_coords .- current_offset) .+ wanted_offset end + IdealResidue{T}(backbone_geometry, N_Ca_C_coords) end Base.size(::IdealResidue) = (3,3) @@ -65,7 +65,7 @@ Thus, we can use this residue as a template for aligning other residues with ver geometry to it. ```jldocotest -julia> IdealResidue{Float64}(BackboneGeometry(N_Ca_C_angle = 1.93); template=ProteinChains.STANDARD_RESIDUE) +julia> IdealResidue{Float64}(BackboneGeometry(N_Ca_C_angle = 1.93); template=ProteinChains.STANDARD_RESIDUE_TEMPLATE) 3×3 IdealResidue{Float64}: -1.06447 -0.199174 1.26364 0.646303 -0.529648 -0.116655 diff --git a/src/io/io.jl b/src/io/io.jl index 3c53971..458c21f 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -5,7 +5,7 @@ const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY") const BACKBONE_ATOM_NAMES = ["N", "CA", "C"] const BACKBONE_ATOM_SYMBOLS = ["N", "C", "C"] -const aa_to_threeletter = Dict{Char, String}([Char(v) => k for (k, v) in BioStructures.threeletter_to_aa]) +const aa_to_threeletter = Dict{Char,String}([Char(v) => k for (k, v) in BioStructures.threeletter_to_aa]) threeletter_resname(aa::Char) = get(aa_to_threeletter, aa, "XXX") oneletter_resname(threeletter::AbstractString) = Char(get(BioStructures.threeletter_to_aa, threeletter, 'X')) oneletter_resname(residue::BioStructures.AbstractResidue) = oneletter_resname(BioStructures.resname(residue)) @@ -18,3 +18,5 @@ include("renumber.jl") include("read.jl") include("write.jl") include("download.jl") + +include("serialization.jl") \ No newline at end of file diff --git a/src/io/read.jl b/src/io/read.jl index 6b3eb91..5718d29 100644 --- a/src/io/read.jl +++ b/src/io/read.jl @@ -10,20 +10,14 @@ get_sequence(residues::Vector{BioStructures.AbstractResidue}) = join(map(onelett get_atom(residue::BioStructures.AbstractResidue, name::AbstractString) = BioStructures.collectatoms(residue, a -> BioStructures.atomnameselector(a, [name])) |> only -get_backbone(residue::BioStructures.AbstractResidue) = - stack(name -> BioStructures.coords(get_atom(residue, name)), BACKBONE_ATOM_NAMES) - -get_backbone(residues::Vector{BioStructures.AbstractResidue}) = - mapreduce(get_backbone, hcat, residues; init=Matrix{Float64}(undef, 3, 0)) - Atom(atom::BioStructures.Atom) = Atom(atom.name, atom.element, atom.coords) Atom(atom::BioStructures.DisorderedAtom) = Atom(BioStructures.defaultatom(atom)) -function get_nonbackbone_atoms(residues::Vector{BioStructures.AbstractResidue}) +function get_atoms(residues::Vector{BioStructures.AbstractResidue}) atoms = Vector{Atom{Float64}}[] for residue in residues residue = residue isa BioStructures.DisorderedResidue ? BioStructures.defaultresidue(residue) : residue - residue_atoms = map(Atom, BioStructures.collectatoms(residue, !backbone_atom_selector)) + residue_atoms = map(Atom, BioStructures.collectatoms(residue)) push!(atoms, residue_atoms) end return atoms @@ -31,30 +25,31 @@ end function ProteinChain(residues::Vector{BioStructures.AbstractResidue}) id = only(unique(map(BioStructures.chainid, residues))) + atoms = get_atoms(residues) sequence = get_sequence(residues) - backbone = reshape(get_backbone(residues), 3, 3, :) - atoms = get_nonbackbone_atoms(residues) numbering = map(BioStructures.resnumber, residues) - return ProteinChain(id, sequence, backbone, numbering, atoms) + return ProteinChain(id, atoms, sequence, numbering) end -function ProteinChain(chain::BioStructures.Chain, selector=backbone_residue_selector) - residues = BioStructures.collectresidues(chain, selector) - isempty(residues) && return ProteinChain(BioStructures.chainid(chain), "", zeros(3, 3, 0), Int[], Vector{Atom{Float64}}[]) +function ProteinChain(chain::BioStructures.Chain, backbone_residue_selector=backbone_residue_selector) + residues = BioStructures.collectresidues(chain, backbone_residue_selector) + isempty(residues) && return ProteinChain(BioStructures.chainid(chain), Vector{Atom{Float64}}[], "", Int[]) return ProteinChain(residues) end -function ProteinStructure(struc::BioStructures.MolecularStructure, selector=backbone_residue_selector; path=nothing) - chains = ProteinChain[] +function ProteinStructure(struc::BioStructures.MolecularStructure, backbone_residue_selector=backbone_residue_selector; mmcifdict=nothing) + proteinchains = ProteinChain{Float64}[] + atoms = Atom.(BioStructures.collectatoms(BioStructures.collectresidues(struc, !backbone_residue_selector))) for model in struc, chain in model - push!(chains, ProteinChain(chain, selector)) + push!(proteinchains, ProteinChain(chain, backbone_residue_selector)) end - structure = ProteinStructure(struc.name, chains) - !isnothing(path) && endswith(path, ".cif") && renumber!(structure, BioStructures.MMCIFDict(path)) - return structure + proteinstructure = ProteinStructure(struc.name, atoms, proteinchains) + mmcifdict isa BioStructures.MMCIFDict && renumber!(proteinstructure, mmcifdict) + return proteinstructure end -Base.read(path::AbstractString, ::Type{ProteinStructure}, format::Type{<:ProteinFileFormat}) = ProteinStructure(read(path, format); path) +Base.read(path::AbstractString, ::Type{ProteinStructure}, format::Type{MMCIFFormat}) = ProteinStructure(read(path, format); mmcifdict=BioStructures.MMCIFDict(path)) +Base.read(path::AbstractString, ::Type{ProteinStructure}, format::Type{<:ProteinFileFormat}) = ProteinStructure(read(path, format)) Base.read(path::AbstractString, T::Type{ProteinStructure}) = read(path, T, get_format(path)) readcif(path::AbstractString) = read(path, ProteinStructure, MMCIFFormat) diff --git a/src/io/renumber.jl b/src/io/renumber.jl index 33d389b..a4f3029 100644 --- a/src/io/renumber.jl +++ b/src/io/renumber.jl @@ -7,7 +7,6 @@ function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFD label_seq_ids = mmcif_dict["_atom_site.label_seq_id"] auth_seq_ids = mmcif_dict["_atom_site.auth_seq_id"] auth_asym_ids = mmcif_dict["_atom_site.auth_asym_id"] - reduced = Int[] current_auth_seq_id = "" for (i, auth_seq_id) in enumerate(auth_seq_ids) @@ -31,7 +30,7 @@ function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFD renum_dict = chainwise_auth_to_label_seq_ids[chain.id] numbering_str = map(n -> get(renum_dict, n, "?"), string.(chain.numbering)) "?" in numbering_str && break - chain.numbering = parse.(Int, numbering_str) + chain.numbering .= parse.(Int, numbering_str) end return structure diff --git a/src/io/serialization.jl b/src/io/serialization.jl new file mode 100644 index 0000000..a0fcfa3 --- /dev/null +++ b/src/io/serialization.jl @@ -0,0 +1,98 @@ +import HDF5 + +function Base.write(parent::Union{HDF5.File,HDF5.Group}, chain::ProteinChain{T}, path::AbstractString=chain.id) where T + chain_group = HDF5.create_group(parent, path) + HDF5.attributes(chain_group)["T"] = string(T) + + chain_group["id"] = chain.id + chain_group["atom_chunk_sizes"] = map(UInt32∘length, chain.atoms) + chain_group["atoms_flattened"] = reduce(vcat, chain.atoms) + chain_group["sequence"] = chain.sequence + chain_group["numbering"] = map(UInt32, chain.numbering) + + indexable = HDF5.create_group(chain_group, "indexable") + persistent = HDF5.create_group(chain_group, "persistent") + for (name, property) in pairs(chain.properties) + g = property isa ResidueProperty ? indexable : persistent + g[string(name)] = property[] + end + + return parent +end + +function Base.read(group::Union{Union{HDF5.File,HDF5.Group}}, ::Type{ProteinChain}) + T = eval(Symbol(read(HDF5.attributes(group)["T"]))) + id = read(group["id"]) + atom_chunk_sizes = read(group["atom_chunk_sizes"]) + atoms_flattened = [Atom(nt.name, nt.number, nt.x, nt.y, nt.z) for nt in read(group["atoms_flattened"])] + sequence = read(group["sequence"]) + numbering = read(group["numbering"]) + + atoms = Vector{Atom{T}}[] + for (i, k) in zip(Iterators.accumulate(+, atom_chunk_sizes), atom_chunk_sizes) + residue_atoms = atoms_flattened[i-k+1:i] + push!(atoms, residue_atoms) + end + + indexable = group["indexable"] + persistent = group["persistent"] + properties = merge( + NamedTuple((Symbol(key) => ResidueProperty(read(indexable[key])) for key in keys(group["indexable"]))), + NamedTuple((Symbol(key) => ChainProperty(read(persistent[key])) for key in keys(group["persistent"]))) + ) + + ProteinChain(id, atoms, sequence, numbering, properties) +end + +function Base.write(parent::Union{HDF5.File,HDF5.Group}, structure::ProteinStructure{T}, path::AbstractString=structure.name) where T + structure_group = HDF5.create_group(parent, path) + HDF5.attributes(structure_group)["T"] = string(T) + + structure_group["name"] = structure.name + structure_group["atoms"] = structure.atoms + structure_group["numbering"] = structure.numbering + + chains_group = HDF5.create_group(structure_group, "chains") + + for (chain, number) in zip(structure.chains, structure.numbering) + write(chains_group, chain, string(number)) + end + + return parent +end + +function Base.read(group::Union{Union{HDF5.File,HDF5.Group}}, ::Type{ProteinStructure}) + T = eval(Symbol(read(HDF5.attributes(group)["T"]))) + name = read(group["name"]) + atoms = [Atom(nt.name, nt.number, nt.x, nt.y, nt.z) for nt in read(group["atoms"])] + numbering = read(group["numbering"]) + + chains_group = group["chains"] + chains = ProteinChain{T}[] + for i in numbering + chain = read(chains_group[string(i)], ProteinChain) + push!(chains, chain) + end + + return ProteinStructure(name, atoms, chains, numbering) +end + +function serialize(path::AbstractString, dataset::ProteinDataset; mode="w") + HDF5.h5open(path, mode) do f + foreach(((key, structure),) -> write(f, structure, key), dataset) + end + return path +end + +serialize(path::AbstractString, structures::AbstractVector{<:ProteinStructure}; kwargs...) = + serialize(path, ProteinDataset(structures), kwargs...) + +function deserialize(path::AbstractString) + structures = ProteinStructure[] + HDF5.h5open(path, "r") do f + for key in keys(f) + push!(structures, read(f[key], ProteinStructure)) + end + end + return ProteinDataset(structures) +end diff --git a/src/io/write.jl b/src/io/write.jl index 3e198ab..ea2c413 100644 --- a/src/io/write.jl +++ b/src/io/write.jl @@ -1,6 +1,6 @@ using LinearAlgebra: cross, normalize -using AssigningSecondaryStructure: get_oxygen_positions +_normalize(A::AbstractArray; dims=1) = mapslices(normalize, A; dims) function estimate_last_oxygen_position(backbone::Array{T,3}) where T<:Real N_pos, Ca_pos, C_pos = eachcol(backbone[:,:,end]) @@ -13,16 +13,26 @@ function estimate_last_oxygen_position(backbone::Array{T,3}) where T<:Real return O_pos end -function BioStructures.Chain(proteinchain::AbstractProteinChain, model::BioStructures.Model) +function get_oxygen_positions(backbone::Array{T,3}) where T<:Real + Ca_pos, C_pos, N_pos = backbone[:, 2, 1:end-1], backbone[:, 3, 1:end-1], backbone[:, 1, 2:end] + CaC_vec = C_pos - Ca_pos + NC_vec = C_pos - N_pos + CO_vec = T(1.23) * _normalize(_normalize(CaC_vec) + _normalize(NC_vec)) + O_pos = C_pos + CO_vec + return [O_pos estimate_last_oxygen_position(backbone)] +end + +function BioStructures.Chain(proteinchain::ProteinChain, model::BioStructures.Model) numbering = proteinchain.numbering residue_list = Vector{String}() residues = Dict{String, BioStructures.AbstractResidue}() chain = BioStructures.Chain(proteinchain.id, residue_list, residues, model) - # some visualization tools require oxygen atoms to be present, so these get used if a residue is missing the oxygen atom + # some secondary structure visualization tools require oxygen atoms to be present, + # so these get used if a residue is missing the oxygen atom oxygen_name = encode_atom_name("O", "O") - oxygen_coords = get_oxygen_positions(proteinchain.backbone) - oxygen_coords[:,end] = estimate_last_oxygen_position(proteinchain.backbone) + backbone_coords = get_backbone(proteinchain) + oxygen_coords = get_oxygen_positions(backbone_coords) atom_serial = 0 for residue_index in 1:length(proteinchain) @@ -32,9 +42,12 @@ function BioStructures.Chain(proteinchain::AbstractProteinChain, model::BioStruc number = numbering[residue_index] residue = BioStructures.Residue(resname, number, ' ', false, atom_list, atoms, chain, '-') # TODO: secondary structure - for (i, (name, element)) in enumerate(zip(BACKBONE_ATOM_NAMES, BACKBONE_ATOM_SYMBOLS)) + # each residue has at least N, CA, C atoms + for atom in proteinchain.atoms[residue_index] atom_serial += 1 - coords = proteinchain.backbone[:, i, residue_index] + name = decode_atom_name(atom.name) + coords = [atom.x, atom.y, atom.z] + element = number_to_element_symbol(atom.number) atom = BioStructures.Atom(atom_serial, name, ' ', coords, 1.0, 0.0, element, " ", residue) push!(atom_list, atom.name) atoms[atom.name] = atom @@ -44,19 +57,10 @@ function BioStructures.Chain(proteinchain::AbstractProteinChain, model::BioStruc atom_serial += 1 coords = oxygen_coords[:, residue_index] atom = BioStructures.Atom(atom_serial, "O", ' ', coords, 1.0, 0.0, "O", " ", residue) - push!(atom_list, atom.name) + insert!(atom_list, 4, atom.name) atoms[atom.name] = atom end - for atom in proteinchain.atoms[residue_index] - atom_serial += 1 - name = decode_atom_name(atom.name) - coords = [atom.x, atom.y, atom.z] - element = atomic_number_to_element_symbol(atom.atomic_number) - atom = BioStructures.Atom(atom_serial, name, ' ', coords, 1.0, 0.0, element, " ", residue) - push!(atom_list, atom.name) - atoms[atom.name] = atom - end key = string(residue.number) push!(residue_list, key) residues[key] = residue @@ -85,15 +89,14 @@ function Base.write(path::AbstractString, proteinstruc::ProteinStructure, format else error("Unsupported format: $format") end - return nothing end -function Base.write(path::AbstractString, proteinchains::AbstractVector{<:AbstractProteinChain}, format=get_format(path)) +function Base.write(path::AbstractString, proteinchains::AbstractVector{<:ProteinChain}, format=get_format(path)) proteinstruc = ProteinStructure(basename(path), proteinchains) write(path, proteinstruc, format) end -Base.write(path::AbstractString, proteinchain::AbstractProteinChain, format=get_format(path)) = write(path, [proteinchain], format) +Base.write(path::AbstractString, proteinchain::ProteinChain, format=get_format(path)) = write(path, [proteinchain], format) writecif(path::AbstractString, x) = write(path, x, MMCIFFormat) writepdb(path::AbstractString, x) = write(path, x, PDBFormat) diff --git a/src/properties.jl b/src/properties.jl new file mode 100644 index 0000000..b2d326e --- /dev/null +++ b/src/properties.jl @@ -0,0 +1,17 @@ +abstract type AbstractProperty end + +Base.getindex(p::AbstractProperty) = p.value + +struct ChainProperty{T} <: AbstractProperty + value::T +end + +Base.getindex(prop::ChainProperty, ::AbstractVector) = prop + +struct ResidueProperty{T<:AbstractArray} <: AbstractProperty + value::T +end + +Base.getindex(prop::ResidueProperty, i::AbstractVector) = selectdim(prop.value, ndims(prop.value), i) |> ResidueProperty + +const NamedProperties{names} = NamedTuple{names,<:Tuple{Vararg{AbstractProperty}}} diff --git a/src/secondary-structure.jl b/src/secondary-structure.jl deleted file mode 100644 index afdff1c..0000000 --- a/src/secondary-structure.jl +++ /dev/null @@ -1,13 +0,0 @@ -import AssigningSecondaryStructure: assign_secondary_structure!, assign_secondary_structure - -const SS_CODES = ['-', 'H', 'E'] - -number_vector_to_codes(v::AbstractVector{<:Integer}) = SS_CODES[v] - -assign_secondary_structure(chain::AbstractProteinChain) = assign_secondary_structure(chain.backbone) - -function assign_secondary_structure!(chain::AnnotatedProteinChain) - number_vector = assign_secondary_structure(chain) - annotate_indexable!(chain; secondary_structure=number_vector_to_codes(number_vector)) - return chain -end diff --git a/src/structure.jl b/src/structure.jl index a4ee53d..0cd564b 100644 --- a/src/structure.jl +++ b/src/structure.jl @@ -1,20 +1,22 @@ """ - ProteinStructure <: AbstractVector{AbstractProteinChain} - -```jldoctest -julia> pdb"1EYE" -[ Info: Downloading file from PDB: 1EYE -1-chain ProteinStructure "1EYE.cif" - 253-residue AnnotatedProteinChain{Float64} (A) -``` + ProteinStructure{T} <: AbstractVector{ProteinChain{T}} """ -mutable struct ProteinStructure <: AbstractVector{AbstractProteinChain} +struct ProteinStructure{T} <: AbstractVector{ProteinChain{T}} name::String - chains::Vector{AbstractProteinChain} + atoms::Vector{Atom{T}} + chains::Vector{<:ProteinChain{T}} + numbering::Vector{Int} end +ProteinStructure(name, atoms, chains) = ProteinStructure(name, atoms, chains, collect(1:length(chains))) + Base.size(structure::ProteinStructure) = (length(structure.chains),) -Base.getindex(structure::ProteinStructure, i) = structure.chains[i] + +Base.getindex(structure::ProteinStructure, i::Integer) = structure.chains[i] + +function Base.getindex(structure::ProteinStructure, i::AbstractVector) + ProteinStructure(structure.name, structure.atoms, structure.chains[i], structure.numbering[i]) +end Base.getindex(structure::ProteinStructure, id::AbstractString) = structure[findfirst(c -> c.id == id, structure.chains)] @@ -27,9 +29,15 @@ function Base.show(io::IO, ::MIME"text/plain", structure::ProteinStructure) end end -function offset!(structure::ProteinStructure, coords::Vector{<:Real}) +function map_atoms!(f::Function, structure::ProteinStructure, args...) for chain in structure - offset!(chain, coords) + map_atoms!(f, chain, args...) + end + for i in eachindex(structure.atoms) + structure.atoms[i] = f(structure.atoms[i], args...) end return structure -end \ No newline at end of file +end + +annotate(structure::ProteinStructure, names::Vararg{Symbol}) = + ProteinStructure(structure.name, structure.atoms, annotate.(structure.chains, names...), structure.numbering) \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 33017ec..6da8ae4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,38 +4,24 @@ using Test @testset "ProteinChains.jl" begin @testset "ProteinChain" begin - chain = ProteinChain("A", "AMINO", rand(3, 3, 5), collect(1:5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) + chain = ProteinChain("A", get_atoms(ProteinChains.Backbone(rand(3, 3, 5))), "AMINO", collect(1:5)) @test length(chain) == 5 - @test summary(chain) == "5-residue ProteinChain{Float64} (A)" - - i = 5:-1:1 - @test chain[i].backbone == chain.backbone[:,:,i] end @testset "ProteinStructure" begin - chain = ProteinChain("A", "AMINO", rand(3, 3, 5), collect(1:5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) - structure = ProteinStructure("1CHN", [chain]) + chain = ProteinChain("A", get_atoms(ProteinChains.Backbone(rand(3, 3, 5))), "AMINO", collect(1:5)) + structure = ProteinStructure("1CHN", Atom{Float64}[], [chain]) @test structure[1] === chain @test structure["A"] === chain end - @testset "AnnotatedProteinChain" begin - chain = ProteinChain("A", "AMINO", rand(3, 3, 5), collect(1:5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) - annotated_chain = annotate(chain; modelnum=1) - @test annotated_chain isa AnnotatedProteinChain - @test annotated_chain.modelnum == 1 - annotate_indexable!(annotated_chain; secondary_structure=assign_secondary_structure(annotated_chain)) - @test length(annotated_chain.secondary_structure) == 5 - @test length(annotated_chain[1:3].secondary_structure) == 3 - end - @testset "read/write" begin @testset "read" begin chains_pdb = pdbentry("1ASS"; format=PDBFormat) @test length.(collect(chains_pdb)) == [152] chains_cif = pdbentry("1ASS"; format=MMCIFFormat) - @test chains_pdb[1].backbone == chains_cif[1].backbone + @test chains_pdb[1].sequence == chains_cif[1].sequence end @testset "write" begin @@ -45,9 +31,21 @@ using Test write(temp_path, chains) read(temp_path, ProteinStructure) end - @test chains[1].backbone == new_chains[1].backbone + @test chains[1].sequence == new_chains[1].sequence end end + @testset "serialization" begin + mktempdir() do dir + filename = "dataset.h5" + path = joinpath(dir, filename) + dataset1 = ProteinDataset([annotate(pdb"1EYE", :backbone)]) + ProteinChains.serialize(path, dataset1) + dataset2 = ProteinChains.deserialize(path) + @test dataset1 == dataset2 + @test hasproperty(first(values(dataset1))["A"], :backbone) + end + end + end