diff --git a/Project.toml b/Project.toml index d94ba80..3ae9204 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,8 @@ version = "0.0.1" AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7" Backboner = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7" BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884" [weakdeps] Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/src/ProteinChains.jl b/src/ProteinChains.jl index 97e631d..c28ed31 100644 --- a/src/ProteinChains.jl +++ b/src/ProteinChains.jl @@ -4,6 +4,8 @@ using Backboner include("properties.jl") +include("atom.jl") + include("chain.jl") export ProteinChain export countresidues @@ -13,6 +15,8 @@ export ProteinStructure include("idealize.jl") export BackboneGeometry +export append_residue +export prepend_residue include("secondary-structure.jl") @@ -21,13 +25,13 @@ export assign_bond_lengths! export assign_bond_angles! export assign_torsion_angles! export assign_secondary_structure! -export assign_standard_residue! +export assign_ideal_residue! export assign_residue_rotations! export assign_residue_rotations_quat! export assign_residue_translations! export assign_is_knotted! -include("io.jl") +include("io/io.jl") export readchains export readpdb export readcif diff --git a/src/atom.jl b/src/atom.jl new file mode 100644 index 0000000..6cc30f7 --- /dev/null +++ b/src/atom.jl @@ -0,0 +1,27 @@ +using PeriodicTable: elements + +const ELEMENT_SYMBOL_TO_ATOMIC_NUMBER = Dict(uppercase(elements[atomic_number].symbol) => atomic_number for atomic_number in 1:118) +const ATOMIC_NUMBER_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_ATOMIC_NUMBER) + +element_symbol_to_atomic_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_ATOMIC_NUMBER[uppercase(element_symbol)] +atomic_number_to_element_symbol(atomic_number::Integer) = ATOMIC_NUMBER_TO_ELEMENT_SYMBOL[atomic_number] + +encode_atom_name(atom_name::AbstractString) = reinterpret(UInt32, codeunits(lpad(atom_name, 4)))[1] +decode_atom_name(atom_name::UInt32) = String(reinterpret(UInt8, [atom_name])) + +mutable struct Atom{T<:AbstractFloat} + atom_name::UInt32 + atomic_number::UInt8 + x::T + y::T + z::T +end + +coords(atom::Atom) = [atom.x, atom.y, atom.z] + +Base.summary(atom::Atom) = "$(elements[atom.atomic_number].name) atom at [$(atom.x), $(atom.y), $(atom.z)])" + +Atom(atom_name::UInt32, atomic_number::Integer, x::T, y::T, z::T) where T = Atom{T}(atom_name, atomic_number, x, y, z) +Atom(atom_name::UInt32, element_symbol::AbstractString, args...) = Atom(atom_name, element_symbol_to_atomic_number(element_symbol), args...) +Atom(atom_name::AbstractString, args...) = Atom(encode_atom_name(atom_name), args...) +Atom(atom_name::AbstractString, element_symbol, coords::AbstractVector{<:AbstractFloat}) = Atom(atom_name, element_symbol, coords...) diff --git a/src/chain-properties.jl b/src/chain-properties.jl index 0160991..8d164f7 100644 --- a/src/chain-properties.jl +++ b/src/chain-properties.jl @@ -1,40 +1,79 @@ # TODO: store non-backbone atoms -function assign_bond_lengths!(chain::ProteinChain) - chain.bond_lengths = Backboner.get_bond_lengths(Backboner.Backbone(chain.backbone)) -end +export assign_property! -function assign_bond_angles!(chain::ProteinChain) - chain.bond_angles = Backboner.get_bond_angles(Backboner.Backbone(chain.backbone)) -end +export residue_indexing_function +export ResidueIndexingUndefined -function assign_torsion_angles!(chain::ProteinChain) - chain.torsion_angles = Backboner.get_torsional_angles(Backboner.Backbone(chain.backbone)) -end +assign_property!(x, name::Symbol, args...) = assign_property!(x, Val(name), args...) -function assign_secondary_structure!(chain::ProteinChain, dict::Union{Dict{ProteinChain,Any},Nothing}=nothing) +function assign_property!(chain::ProteinChain, ::Val{:secondary_structure}, dict::Union{Dict{ProteinChain,Any},Nothing}=nothing) number_vector = isnothing(dict) ? ASS.assign_secondary_structure(chain) : dict[chain] chain.secondary_structure = number_vector_to_code_string(number_vector) end -function assign_standard_residue!(chain::ProteinChain, standard_residue::Matrix{<:Real}=STANDARD_RESIDUE_ANGSTROM) - chain.standard_residue = standard_residue +function assign_property!(chain::ProteinChain, ::Val{:ideal_residue}, ideal_residue::AbstractMatrix{<:Real}=DEFAULT_IDEAL_RESIDUE) + chain.ideal_residue = collect(ideal_residue) end -function assign_residue_rotations!(chain::ProteinChain) - frames = Backboner.Frames(Backboner.Backbone(chain.backbone), chain.standard_residue) +function assign_property!(chain::ProteinChain, ::Val{:residue_rotations}) + frames = Backboner.Frames(Backboner.Backbone(chain.backbone), chain.ideal_residue) chain.residue_rotations = frames.rotations end -function assign_residue_rotations_quat!(chain::ProteinChain) - frames = Backboner.Frames(Backboner.Backbone(chain.backbone), chain.standard_residue) +function assign_property!(chain::ProteinChain, ::Val{:residue_rotations_quat}) + frames = Backboner.Frames(Backboner.Backbone(chain.backbone), chain.ideal_residue) chain.residue_rotations_quat = rotation_matrices_to_quaternions(frames.rotations) end -function assign_residue_translations!(chain::ProteinChain) +function assign_property!(chain::ProteinChain, ::Val{:residue_translations}) chain.residue_translations = Backboner.centroid(chain.backbone; dims=2) end -function assign_is_knotted!(chain::ProteinChain) +function assign_property!(chain::ProteinChain, ::Val{:bond_lengths}) + chain.bond_lengths = Backboner.get_bond_lengths(Backboner.Backbone(chain.backbone)) +end + +function assign_property!(chain::ProteinChain, ::Val{:bond_angles}) + chain.bond_angles = Backboner.get_bond_angles(Backboner.Backbone(chain.backbone)) +end + +function assign_property!(chain::ProteinChain, ::Val{:torsion_angles}) + chain.torsion_angles = Backboner.get_torsional_angles(Backboner.Backbone(chain.backbone)) +end + +function assign_property!(chain::ProteinChain, ::Val{:is_knotted}) chain.is_knotted = Backboner.is_knotted(Backboner.Backbone(chain.backbone)[2:3:end]) end + + +struct ResidueIndexingUndefined <: Exception + name::Symbol +end + +ResidueIndexingUndefined(val::Val) = typeof(val).type_parameters[1] + +Base.showerror(io::IO, err::ResidueIndexingUndefined) = print(io, "$(typeof(err)): property `:$(err.name)` does not have a defined reorder function. Property needs reassignment.") + +residue_indexing_function(name::Symbol) = residue_indexing_function(Val(name)) + +# move non-properties to getindex(::ProteinChain) +residue_indexing_function(::Val{:id}) = (x, i) -> x[i] +residue_indexing_function(::Val{:sequence}) = (x, i) -> x[i] +residue_indexing_function(::Val{:backbone}) = (x, i) -> x[:, :, i] +residue_indexing_function(::Val{:atoms}) = (x, i) -> x[i] + +residue_indexing_function(::Val{:secondary_structure}) = (x, i) -> x[i] +residue_indexing_function(::Val{:residue_rotations}) = (x, i) -> x[:, :, i] +residue_indexing_function(::Val{:residue_rotations_quat}) = (x, i) -> x[:, i] +residue_indexing_function(::Val{:residue_translations}) = (x, i) -> x[:, i] + +residue_indexing_function(name::Val{:bond_lengths}) = throw(ResidueIndexingUndefined(name)) +residue_indexing_function(name::Val{:bond_angles}) = throw(ResidueIndexingUndefined(name)) +residue_indexing_function(name::Val{:torsion_angles}) = throw(ResidueIndexingUndefined(name)) +residue_indexing_function(name::Val{:is_knotted}) = throw(ResidueIndexingUndefined(name)) + +flatten_property(chains::AbstractVector{<:ProteinChain}, ::Val{:sequence}) = join(chain -> chain.sequence, chains) + + +function extend_property! end \ No newline at end of file diff --git a/src/chain.jl b/src/chain.jl index 2df4355..b6bc1a0 100644 --- a/src/chain.jl +++ b/src/chain.jl @@ -1,64 +1,54 @@ """ - ProteinChain + ProteinChain{T<:AbstractFloat} ## Examples + ```jldoctest -julia> structure = pdb"1EYE" +julia> structure = pdb"1EYE"; [ Info: Downloading file from PDB: 1EYE -1-chain ProteinStructure "1EYE.cif" with 0 properties: - 256-residue ProteinChain "A" with 0 properties julia> structure[1] -256-residue ProteinChain "A" with 0 properties: +253-residue ProteinChain "A" with 2 properties: fields: id::String = "A" - aminoacids::String = "PVQVMGVLNVTDDSFSDGGCYLDLDDAVKHGLAMAAAGAGIVDVGGETSRVIPVVKELAAQGITVSIDTMRADVARAALQNGAQMVNDVSGGRADPAM… - backbone::Array{Float64, 3} = [45.592 44.171 43.719; -10.864 -10.936 -9.688; 30.192 30.504 31.278;;; 42.568 42.02 40.707; -9.163 … - numbers::Vector{Int64} = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14 … 265, 266, 267, 268, 269, 270, 271, 272, 273, 274] - properties: (none) + sequence::String = "PVQVMGVLNVTDDSFSDGGCYLDLDDAVKHGLAMAAAGAGIVDVGGETSRVIPVVKELAAQGITVIDTMRADVARAALQNGAQMVNDVSGGRADPAMG… + backbone::Array{Float64,3} = [45.592 44.171 43.719; -10.864 -10.936 -9.688; 30.192 30.504 31.278;;; 42.568 42.02 40.707; -9.163 … + atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = Vector{Atom{Float64}}[[Atom{Float64}(0x4f202020, 0x08, 44.41, -9.176, 32.157), Atom{Float64}(0x4243… + properties: + numbering::Vector{Int64} = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14 … 265, 266, 267, 268, 269, 270, 271, 272, 273, 274] + modelnum::Int64 = 1 ``` """ -mutable struct ProteinChain +mutable struct ProteinChain{T<:AbstractFloat} id::String - aminoacids::String - backbone::Array{Float64,3} - numbers::Vector{Int} - properties::Dict{Symbol,Any} - - function ProteinChain(id::String, aminoacids::String, backbone::Array{Float64,3}, numbers::Vector{Int}, properties::Dict{Symbol,Any}) - chain = new(id, aminoacids, backbone, numbers, properties) - countresidues(chain) - @assert all(!in(fieldnames(ProteinChain)), keys(properties)) - return chain - end + sequence::String + backbone::Array{T,3} + atoms::Vector{Vector{Atom{T}}} + properties::Properties end -Base.getproperty(chain::ProteinChain, property::Symbol) = _getproperty(chain, property) -Base.setproperty!(chain::ProteinChain, property::Symbol, value) = _setproperty!(chain, property, value) - function countresidues(chain::ProteinChain) - @assert length(chain.aminoacids) == size(chain.backbone, 3) == length(chain.numbers) - return length(chain.aminoacids) + @assert length(chain.sequence) == size(chain.backbone, 3) + return length(chain.sequence) end -function ProteinChain(id::String, aminoacids::String, backbone::Array{Float64,3}, numbers::Vector{Int}; kwargs...) - chain = ProteinChain(id, aminoacids, backbone, numbers, Dict{Symbol,Any}()) - !isempty(kwargs) && push!(chain.properties, kwargs...) +function ProteinChain(id::String, sequence::String, backbone::Array{T,3}, atoms::Vector{Vector{Atom{T}}}, properties::Properties) where T + chain = ProteinChain{T}(id, sequence, backbone, atoms, properties) + countresidues(chain) + @assert size(backbone)[1:2] == (3, 3) return chain end +function ProteinChain(id::String, sequence::String, backbone::Array{T,3}, atoms::Vector{Vector{Atom{T}}}=Vector{Atom{T}}[]; kwargs...) where T + return ProteinChain(id, sequence, backbone, atoms, Properties(; kwargs...)) +end + +Base.hasproperty(chain::ProteinChain, name::Symbol) = hasproperty(HasProperties(chain), name) +Base.getproperty(chain::ProteinChain, name::Symbol) = getproperty(HasProperties(chain), name) +Base.setproperty!(chain::ProteinChain, name::Symbol, value) = setproperty!(HasProperties(chain), name, value) +Base.propertynames(chain::ProteinChain) = propertynames(HasProperties(chain)) + Base.summary(chain::ProteinChain) = "$(countresidues(chain))-residue ProteinChain \"$(chain.id)\" with $(length(chain.properties)) properties" +# where T <: Union{ProteinChain,ProteinStructure} -function Base.show(io::IO, ::MIME"text/plain", chain::ProteinChain) - context = IOContext(io, :compact => true, :limit => true) - print(context, summary(chain), ":") - printstyled(context, "\n fields:", color=:yellow) - for fieldname in fieldnames(ProteinChain)[1:end-1] - printfield(context, chain, fieldname) - end - printstyled(context, "\n properties:", color=:yellow) - isempty(chain.properties) && print(io, " (none)") - for property in keys(chain.properties) - printfield(context, chain, property) - end -end +Base.show(io::IO, ::MIME"text/plain", chain::ProteinChain) = showproperties(io, chain) \ No newline at end of file diff --git a/src/flatten.jl b/src/flatten.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/idealize.jl b/src/idealize.jl index 5471db0..d79c008 100644 --- a/src/idealize.jl +++ b/src/idealize.jl @@ -1,5 +1,18 @@ +""" + BackboneGeometry(; + N_Ca_length = 1.46, + Ca_C_length = 1.52, + C_N_length = 1.33, + + N_Ca_C_angle = 1.94, + Ca_C_N_angle = 2.03, + C_N_Ca_angle = 2.13, + ) + +Define the idealized bond lengths and bond angles of a protein backbone. +""" Base.@kwdef struct BackboneGeometry - N_Ca_length = 1.46 + N_Ca_length = 1.46 Ca_C_length = 1.52 C_N_length = 1.33 @@ -8,25 +21,66 @@ Base.@kwdef struct BackboneGeometry C_N_Ca_angle = 2.13 end -# TODO: make this align with default BackboneGeometry -const STANDARD_RESIDUE_ANGSTROM = [ +const DEFAULT_BACKBONE_GEOMETRY = BackboneGeometry() + +""" + IdealResidue{T<:AbstractFloat} <: AbstractMatrix{T} + + IdealResidue{T}(backbone_geometry::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY) +""" +struct IdealResidue{T<:AbstractFloat} <: AbstractMatrix{T} + backbone_geometry::BackboneGeometry + N_Ca_C_coords::Matrix{T} + + function IdealResidue{T}(backbone_geometry::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY) where T + N_Ca_C_coords = Matrix{T}(undef, 3, 3) + N_pos, Ca_pos, C_pos = eachcol(N_Ca_C_coords) + Θ = backbone_geometry.N_Ca_C_angle - π/2 + N_pos .= [0, 0, 0] + Ca_pos .= [backbone_geometry.N_Ca_length, 0, 0] + C_pos .= Ca_pos + backbone_geometry.Ca_C_length * [sin(Θ), cos(Θ), 0] + centroid = (N_pos + Ca_pos + C_pos) / 3 + N_Ca_C_coords .-= centroid + new{T}(backbone_geometry, N_Ca_C_coords) + end +end + +Base.size(::IdealResidue) = (3,3) +Base.getindex(ideal_residue::IdealResidue, args...) = ideal_residue.N_Ca_C_coords[args...] + +const DEFAULT_IDEAL_RESIDUE = IdealResidue{Float64}() + +const OLD_STANDARD_RESIDUE_ANGSTROM = [ -1.066 -0.200 1.266; 0.645 -0.527 -0.118; 0.000 0.000 0.000; ] # N Ca Cs -function append_residue(backbone::Backbone, dihedrals::Vector{<:Real}; ideal::BackboneGeometry=BackboneGeometry()) - length(dihedrals) == 3 || throw(ArgumentError("length of `dihedrals` must be 3, as only 3 backbone atoms are introduced.")) - lengths = [ideal.C_N_length, ideal.N_Ca_length, ideal.Ca_C_length] - angles = [ideal.Ca_C_N_angle, ideal.C_N_Ca_angle, ideal.N_Ca_C_angle] - append_bonds(backbone, Float64.(lengths), Float64.(angles), Float64.(dihedrals)) +""" + append_residue(Backbone::Backbone, torsion_angles::Vector{<:Real}; ideal::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY) + +Create a new backbone by appending 3 torsion angles (ψ, ω, ϕ) at the end, using bond lengths and bond angles specified in BackboneGeometry. +""" +function append_residue(backbone::Backbone, torsion_angles::Vector{<:Real}; ideal::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY) + length(torsion_angles) == 3 || throw(ArgumentError("length of `torsion_angles` must be 3, as only 3 backbone atoms are introduced.")) + bond_lengths = [ideal.C_N_length, ideal.N_Ca_length, ideal.Ca_C_length] + bond_angles = [ideal.Ca_C_N_angle, ideal.C_N_Ca_angle, ideal.N_Ca_C_angle] + append_bonds(backbone, Float64.(bond_lengths), Float64.(bond_angles), Float64.(torsion_angles)) end -function prepend_residue(backbone::Backbone, dihedrals::Vector{<:Real}; ideal::BackboneGeometry=BackboneGeometry()) - length(dihedrals) == 3 || throw(ArgumentError("length of `dihedrals` must be 3, as only 3 backbone atoms are introduced.")) - lengths = [ideal.N_Ca_length, ideal.Ca_C_length, ideal.C_N_length] - angles = [ideal.N_Ca_C_angle, ideal.Ca_C_N_angle, ideal.C_N_Ca_angle] - return prepend_bonds(backbone, Float64.(lengths), Float64.(angles), Float64.(dihedrals)) +""" + append_residue(Backbone::Backbone, torsion_angles::Vector{<:Real}; ideal::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY) + +Create a new backbone by prepending 3 torsion angles (ψ, ω, ϕ) at the end, using bond lengths and bond angles specified in BackboneGeometry. + +!!! note + The torsion angle order is the same as it would be when appending. The order is *not* reversed. +""" +function prepend_residue(backbone::Backbone, torsion_angles::Vector{<:Real}; ideal::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY) + length(torsion_angles) == 3 || throw(ArgumentError("length of `torsion_angles` must be 3, as only 3 backbone atoms are introduced.")) + bond_lengths = [ideal.N_Ca_length, ideal.Ca_C_length, ideal.C_N_length] + bond_angles = [ideal.N_Ca_C_angle, ideal.Ca_C_N_angle, ideal.C_N_Ca_angle] + return prepend_bonds(backbone, Float64.(bond_lengths), Float64.(bond_angles), Float64.(torsion_angles)) end # TODO: use cooldown (and Float64?) to increase precision and accuracy diff --git a/src/io.jl b/src/io.jl deleted file mode 100644 index cf36d7d..0000000 --- a/src/io.jl +++ /dev/null @@ -1,205 +0,0 @@ -using BioStructures: BioStructures, PDBFormat, MMCIFFormat - -const ProteinFileFormat = Union{PDBFormat, MMCIFFormat} -const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY") -const BACKBONE_ATOM_NAMES = ["N", "CA", "C"] - -threeletter_to_oneletter(threeletter::AbstractString) = Char(get(BioStructures.threeletter_to_aa, threeletter, 'X')) -oneletter_resname(residue::BioStructures.AbstractResidue) = threeletter_to_oneletter(BioStructures.resname(residue)) - -backbone_atom_selector(atom::BioStructures.AbstractAtom) = BioStructures.atomnameselector(atom, BACKBONE_ATOM_NAMES) -backbone_residue_selector(residue::BioStructures.AbstractResidue) = - oneletter_resname(residue) in AMINOACIDS && - BioStructures.countatoms(residue, backbone_atom_selector) == 3 && - BioStructures.standardselector(residue) && - !BioStructures.disorderselector(residue) - -function get_atom(residue::BioStructures.AbstractResidue, atom_name::AbstractString) - selector = atom -> BioStructures.atomnameselector(atom, [atom_name]) - residue_atoms = BioStructures.collectatoms(residue, selector) - return only(residue_atoms) -end - -function get_backbone(residue::BioStructures.AbstractResidue) - atom_name_to_atom_coords = atom_name -> BioStructures.coords(get_atom(residue, atom_name)) - residue_coords = stack(atom_name_to_atom_coords, BACKBONE_ATOM_NAMES) - return Backbone(residue_coords) -end - -function get_backbone(residues::Vector{BioStructures.AbstractResidue}) - chain_coords = mapreduce(residue -> get_backbone(residue).coords, hcat, residues; init=Matrix{Float64}(undef, 3, 0)) - return Backbone(chain_coords) -end - -get_backbone(chain::BioStructures.Chain, selector) = get_backbone(BioStructures.collectresidues(chain, selector)) - -aminoacid_sequence(residues::Vector{BioStructures.AbstractResidue}) = join(oneletter_resname.(residues)) -aminoacid_sequence(chain::BioStructures.Chain, selector) = aminoacid_sequence(BioStructures.collectresidues(chain, selector)) - -#=function get_residue_atoms(residues::Vector{BioStructures.AbstractResidue}) - atom_names = Vector{Vector{String}}[] - atom_coords = Vector{Vector{Float64}}[] - for residue in residues - residue_atoms = BioStructures.collectatoms(residue, a -> !backbone_atom_selector(a) && BioStructures.standardselector(a) && !BioStructures.disorderselector(a)) - push!(atom_names, map(a -> a.name), residue_atoms) - push!(atom_coords, map(a -> a.coords), residue_atoms) - end - return atom_names, atom_coords -end=# - -function ProteinChain(residues::Vector{BioStructures.AbstractResidue}) - id = only(unique(BioStructures.chainid.(residues))) - aminoacids = aminoacid_sequence(residues) - backbone = reshape(Backbone(residues).coords, 3, 3, :) - numbers = BioStructures.resnumber.(residues) - #ins_codes = BioStructures.inscode.(residues) - #modelnum = only(unique(BioStructures.modelnumber.(residues))) - #residue_atoms = get_residue_atoms(residues) - return ProteinChain(id, aminoacids, backbone, numbers) -end - -function ProteinChain(chain::BioStructures.Chain, selector=backbone_residue_selector) - residues = BioStructures.collectresidues(chain, selector) - isempty(residues) && return ProteinChain(BioStructures.chainid(chain), "", zeros(3, 3, 0), Int[]) - return ProteinChain(residues) -end - -function collectchains(struc::BioStructures.MolecularStructure, selector=backbone_residue_selector) - chains = ProteinChain[] - for model in struc, chain in model - if !isempty(chain) - pc = ProteinChain(chain, selector) - countresidues(pc) > 0 && push!(chains, pc) - end - end - return chains -end - -function ProteinStructure(struc::BioStructures.MolecularStructure) - name = struc.name - chains = collectchains(struc) - ProteinStructure(name, chains, Dict()) -end - -const pdbextension_to_format = Dict(ext => format for (format, ext) in BioStructures.pdbextension) - -get_format(path::AbstractString) = get(pdbextension_to_format, lowercase(last(splitext(path))[2:end]), PDBFormat) - -""" - readchains(path, format) -> chains::ProteinStructure - -Loads a protein structure from a PDB file. - -Exported formats: `PDBFormat`, `MMCIFFormat` - -## Examples - -```julia -readchains("example.pdb") # detects PDB format from extension - -readchains("example.cif") # detects mmCIF format from extension - -readchains("example.abc", PDBFormat) # force PDB format - -readchains("example.xyz", MMCIFFormat) # force mmCIF format -``` -""" -readchains(path::AbstractString, format::Type{<:ProteinFileFormat}) = ProteinStructure(read(path, format)) -readchains(path::AbstractString) = readchains(path, get_format(path)) - -""" - readpdb(path) -> chains::Vector{ProteinChain} -""" -readpdb(path::AbstractString) = readchains(path, PDBFormat) - -""" - readcif(path) -> chains::Vector{ProteinChain} -""" -readcif(path::AbstractString) = readchains(path, MMCIFFormat) - -""" - writepdb(path, chains::AbstractVector{ProteinChain}) - writepdb(path, chain::ProteinChain) -""" -function writepdb(path::AbstractString, chains::AbstractVector{ProteinChain}) - atom_records = BioStructures.AtomRecord[] - index = 0 - residue_index = 0 - for chain in chains - residue_backbone_coords = reshape(chain.backbone.coords, 3, 3, :) - assign_missing_oxygens!(chain) - for i in 1:countresidues(chain) - resname = get(threeletter_aa_names, chain.aminoacids[i], "XXX") # threletter_aa_names in residue.jl - residue_index += 1 - residue_atoms = [ - [(; name, coords) for (name, coords) in zip(BACKBONE_ATOM_NAMES, eachcol(view(residue_backbone_coords, :, :, i)))]; - #filter(a -> !(a.name in BACKBONE_ATOM_NAMES), residue.atoms) - ] - for atom in residue_atoms - index += 1 - number = chain.numbers[i] - push!(atom_records, BioStructures.AtomRecord(false, index, atom.name, ' ', resname, chain.id, - number, ' ', atom.coords, 1.0, 0.0, strip(atom.name)[1:1], "")) - end - end - end - pdblines = BioStructures.pdbline.(atom_records) - open(path, "w") do io - for line in pdblines - println(io, line) - end - end - return nothing -end - -writepdb(path::AbstractString, chain::ProteinChain) = writepdb(path, [chain]) - -# compat -writepdb(chains::Union{ProteinChain, AbstractVector{ProteinChain}}, path::AbstractString) = writepdb(path, chains) - -""" - writechains(path, chains::AbstractVector{ProteinChain}, format) - writechains(path, chain::ProteinChain, format) - -Write a protein structure (represented as a `Vector{ProteinChain}`s) to file with the specified format. - -Exported formats: `PDBFormat`, `MMCIFFormat` - -## Examples - -```julia -writechains("example.pdb", chains) # detects PDB format from extension - -writechains("example.cif", chains) # detects mmCIF format from extension - -writechains("example.abc", chains, PDBFormat) # force PDB format - -writechains("example.xyz", chains, MMCIFFormat) # force mmCIF format -``` -""" -writechains(path::AbstractString, chains::AbstractVector{ProteinChain}, ::Type{PDBFormat}) = writepdb(path, chains) - -function writechains(path::AbstractString, chains::AbstractVector{ProteinChain}, format::Type{<:ProteinFileFormat}) - struc = mktempdir() do temp_dir - temp_path = joinpath(temp_dir, "temp.pdb") - writechains(temp_path, chains, PDBFormat) - read(temp_path, PDBFormat) # loads BioStructures.MolecularStructure - end - write_function = format == PDBFormat ? BioStructures.writepdb : BioStructures.writemmcif - write_function(path, struc) -end - -writechains(path::AbstractString, chains::AbstractVector{ProteinChain}) = writechains(path, chains, get_format(path)) - -writechains(path, chain::ProteinChain, args...) = writechains(path, [chain], args...) - -pdbentry(pdbid::AbstractString; format=BioStructures.MMCIFFormat, kwargs...) = mktempdir() do dir - path = BioStructures.downloadpdb(pdbid; dir, format, kwargs...) - readchains(path, format) -end - -macro pdb_str(pdbid) - quote - pdbentry($(esc(pdbid))) - end -end \ No newline at end of file diff --git a/src/io/download.jl b/src/io/download.jl new file mode 100644 index 0000000..fa42dc0 --- /dev/null +++ b/src/io/download.jl @@ -0,0 +1,10 @@ +pdbentry(pdbid::AbstractString; format=BioStructures.MMCIFFormat, kwargs...) = mktempdir() do dir + path = BioStructures.downloadpdb(pdbid; dir, format, kwargs...) + readchains(path, format) +end + +macro pdb_str(pdbid) + quote + pdbentry($(esc(pdbid))) + end +end diff --git a/src/io/io.jl b/src/io/io.jl new file mode 100644 index 0000000..7970410 --- /dev/null +++ b/src/io/io.jl @@ -0,0 +1,18 @@ +using BioStructures: BioStructures, PDBFormat, MMCIFFormat + +const ProteinFileFormat = Union{PDBFormat, MMCIFFormat} +const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY") +const BACKBONE_ATOM_NAMES = ["N", "CA", "C"] +const BACKBONE_ATOM_SYMBOLS = ["N", "C", "C"] + +const threeletter_aa_names = Dict{Char, String}([Char(v) => k for (k, v) in BioStructures.threeletter_to_aa]) +threeletter_to_oneletter(threeletter::AbstractString) = Char(get(BioStructures.threeletter_to_aa, threeletter, 'X')) +oneletter_resname(residue::BioStructures.AbstractResidue) = threeletter_to_oneletter(BioStructures.resname(residue)) + +const pdbextension_to_format = Dict(ext => format for (format, ext) in BioStructures.pdbextension) + +get_format(path::AbstractString) = get(pdbextension_to_format, lowercase(last(splitext(path))[2:end]), PDBFormat) + +include("read.jl") +include("write.jl") +include("download.jl") \ No newline at end of file diff --git a/src/io/read.jl b/src/io/read.jl new file mode 100644 index 0000000..1d5059b --- /dev/null +++ b/src/io/read.jl @@ -0,0 +1,68 @@ +backbone_atom_selector(atom::BioStructures.AbstractAtom) = BioStructures.atomnameselector(atom, BACKBONE_ATOM_NAMES) +backbone_residue_selector(residue::BioStructures.AbstractResidue) = + oneletter_resname(residue) in AMINOACIDS && + BioStructures.countatoms(residue, backbone_atom_selector) == 3 && + BioStructures.standardselector(residue) && + !BioStructures.disorderselector(residue) && + !any(atom -> atom isa BioStructures.DisorderedAtom, BioStructures.collectatoms(residue)) + +function get_atom(residue::BioStructures.AbstractResidue, atom_name::AbstractString) + selector = atom -> BioStructures.atomnameselector(atom, [atom_name]) + residue_atoms = BioStructures.collectatoms(residue, selector) + return only(residue_atoms) +end + +function get_backbone(residue::BioStructures.AbstractResidue) + atom_name_to_atom_coords = atom_name -> BioStructures.coords(get_atom(residue, atom_name)) + residue_coords = stack(atom_name_to_atom_coords, BACKBONE_ATOM_NAMES) + return Backbone(residue_coords) +end + +function get_backbone(residues::Vector{BioStructures.AbstractResidue}) + chain_coords = mapreduce(residue -> get_backbone(residue).coords, hcat, residues; init=Matrix{Float64}(undef, 3, 0)) + return Backbone(chain_coords) +end + +get_backbone(chain::BioStructures.Chain, selector) = get_backbone(BioStructures.collectresidues(chain, selector)) + +aminoacid_sequence(residues::Vector{BioStructures.AbstractResidue}) = join(oneletter_resname.(residues)) +aminoacid_sequence(chain::BioStructures.Chain, selector) = aminoacid_sequence(BioStructures.collectresidues(chain, selector)) + +function get_nonbackbone_atoms(residues::Vector{BioStructures.AbstractResidue}) + atoms = Vector{Atom{Float64}}[] + for residue in residues + residue_atoms_bs = BioStructures.collectatoms(residue, a -> !backbone_atom_selector(a) && BioStructures.standardselector(a) && !BioStructures.disorderselector(a)) + @assert !isempty(residue_atoms_bs) + push!(atoms, map(atom_bs -> Atom(atom_bs.name, atom_bs.element, atom_bs.coords), residue_atoms_bs)) + end + return atoms +end + +function ProteinChain(residues::Vector{BioStructures.AbstractResidue}) + id = only(unique(BioStructures.chainid.(residues))) + sequence = aminoacid_sequence(residues) + backbone = reshape(get_backbone(residues).coords, 3, 3, :) + atoms = get_nonbackbone_atoms(residues) + numbering = BioStructures.resnumber.(residues) + modelnum = only(unique(BioStructures.modelnumber.(residues))) + #ins_codes = BioStructures.inscode.(residues) + return ProteinChain(id, sequence, backbone, atoms; numbering, modelnum) +end + +function ProteinChain(chain::BioStructures.Chain, selector=backbone_residue_selector) + residues = BioStructures.collectresidues(chain, selector) + isempty(residues) && return ProteinChain(BioStructures.chainid(chain), "", zeros(3, 3, 0), Int[]) + return ProteinChain(residues) +end + +function ProteinStructure(struc::BioStructures.MolecularStructure, selector=backbone_residue_selector) + name = struc.name + chains = ProteinChain{Float64}[] + for model in struc, chain in model + if !isempty(chain) + pc = ProteinChain(chain, selector) + countresidues(pc) > 0 && push!(chains, pc) + end + end + ProteinStructure(name, chains) +end diff --git a/src/io/write.jl b/src/io/write.jl new file mode 100644 index 0000000..601d436 --- /dev/null +++ b/src/io/write.jl @@ -0,0 +1,52 @@ +readchains(path::AbstractString, format::Type{<:ProteinFileFormat}) = ProteinStructure(read(path, format)) +readchains(path::AbstractString) = readchains(path, get_format(path)) + +readpdb(path::AbstractString) = readchains(path, PDBFormat) +readcif(path::AbstractString) = readchains(path, MMCIFFormat) + +function writepdb(path::AbstractString, chains::AbstractVector{ProteinChain{T}}) where T + atom_records = BioStructures.AtomRecord[] + index = 0 + residue_index_global = 0 + for chain in chains + numbering = hasproperty(chain, :numbering) ? chain.numbering : collect(1:countresidues(chain)) + for residue_index in 1:countresidues(chain) + resname = get(threeletter_aa_names, chain.sequence[residue_index], "XXX") # threletter_aa_names in residue.jl + residue_index_global += 1 + residue_atoms = [ + [Atom(atom_name, atom_symbol, coords) for (atom_name, atom_symbol, coords) in zip(BACKBONE_ATOM_NAMES, BACKBONE_ATOM_SYMBOLS, eachcol(view(chain.backbone, :, :, residue_index)))]; + chain.atoms[residue_index] + ] + for atom in residue_atoms + index += 1 + push!(atom_records, BioStructures.AtomRecord(false, index, decode_atom_name(atom.atom_name), ' ', resname, chain.id, + numbering[residue_index], ' ', [atom.x, atom.y, atom.z], 1.0, 0.0, atomic_number_to_element_symbol(atom.atomic_number), "")) + end + end + end + pdblines = BioStructures.pdbline.(atom_records) + open(path, "w") do io + for line in pdblines + println(io, line) + end + end + return nothing +end + +writepdb(path::AbstractString, chain::ProteinChain) = writepdb(path, [chain]) + +writechains(path::AbstractString, chains::AbstractVector{<:ProteinChain}, ::Type{PDBFormat}) = writepdb(path, chains) + +function writechains(path::AbstractString, chains::AbstractVector{<:ProteinChain}, format::Type{<:ProteinFileFormat}) + struc = mktempdir() do temp_dir + temp_path = joinpath(temp_dir, "temp.pdb") + writechains(temp_path, chains, PDBFormat) + read(temp_path, PDBFormat) # loads BioStructures.MolecularStructure + end + write_function = format == PDBFormat ? BioStructures.writepdb : BioStructures.writemmcif + write_function(path, struc) +end + +writechains(path::AbstractString, chains::AbstractVector{<:ProteinChain}) = writechains(path, chains, get_format(path)) + +writechains(path, chain::ProteinChain, args...) = writechains(path, [chain], args...) diff --git a/src/properties.jl b/src/properties.jl index aa306ed..7a63a4d 100644 --- a/src/properties.jl +++ b/src/properties.jl @@ -1,33 +1,77 @@ -function _getproperty(pc::T, property::Symbol) where T - property in fieldnames(T) && return getfield(pc, property) - property in keys(pc.properties) && return pc.properties[property] - throw(ErrorException("$(T) instance has no field or property $property")) +using OrderedCollections: OrderedDict + +export Properties + +struct Properties <: AbstractDict{Symbol,Any} + dict::OrderedDict{Symbol,Any} end -function _setproperty!(pc::T, property::Symbol, x) where T - property in fieldnames(T) && return setfield!(pc, property, x) - return pc.properties[property] = x +Properties() = Properties(OrderedDict{Symbol,Any}()) +Properties(pairs::Vararg{<:Pair{Symbol}}) = Properties(OrderedDict(name => value for (name, value) in pairs)) +Properties(args...; kwargs...) = Properties(args..., kwargs...) + +Base.length(x::Properties) = length(x.dict) +Base.iterate(x::Properties, args...) = iterate(x.dict, args...) +Base.keys(x::Properties) = keys(x.dict) +Base.values(x::Properties) = values(x.dict) + +Base.getindex(x::Properties, name::Symbol) = getindex(x.dict, name) +Base.setindex!(x::Properties, value, name::Symbol) = setindex!(x.dict, value, name) + +Base.hasproperty(x::Properties, name::Symbol) = hasfield(Properties, name) || name in keys(x) +Base.getproperty(x::Properties, name::Symbol) = hasfield(Properties, name) ? getfield(x, name) : getindex(x, name) +Base.setproperty!(x::Properties, name::Symbol, value) = hasfield(Properties, name) ? setfield!(x, name, value) : setindex!(x, value, name) + +Base.propertynames(x::Properties, private::Bool=false) = ((private ? (:dict,) : ())..., collect(keys(x.dict))...,) + +struct HasProperties{T} + parent::T + + function HasProperties(parent::T) where T + hasfield(T, :properties) && getfield(parent, :properties) isa Properties || throw(ArgumentError("input type `$T` needs to have a properties::$(Properties) field")) + new{T}(parent) + end end +Base.hasproperty(x::HasProperties, name::Symbol) = hasfield(T, name) || hasproperty(x, name) + +function Base.getproperty(x::HasProperties{T}, name::Symbol) where T + hasfield(HasProperties, name) && return getfield(x, name) + hasfield(T, name) && return getfield(x.parent, name) + name in keys(x.parent.properties) && return getindex(x.parent.properties, name) + throw(ErrorException("$(T) instance has no field or property $name")) +end + +function Base.setproperty!(x::HasProperties{T}, name::Symbol, value) where T + hasfield(HasProperties, name) && return setfield(x, name, value) + hasfield(T, name) && return setfield!(x.parent, name, value) + setindex!(x.parent.properties, value, name) +end + +Base.propertynames(x::HasProperties) = (fieldnames(typeof(getfield(x, :parent)))..., propertynames(x.parent.properties, false)...) + truncate(s::AbstractString, len::Integer) = length(s) > len ? String(first(s, len-1) * '…') : s -function printfield(io::IO, x, field::Symbol; indent=4) - value = getproperty(x, field) +function printfield(io::IO, x, name::Symbol; indent=4) + value = getproperty(x, name) print(io, "\n"*" "^indent) - printstyled(io, string(field); color=:white) + printstyled(io, string(name); color=:white) printstyled(io, "::"; color=:red) - printstyled(io, string(typeof(value)); color=:blue) + printstyled(io, replace(string(typeof(value)), " " => ""); color=:blue) printstyled(io, " = "; color=:red) printstyled(io, truncate(repr(value; context=io), 100)) end -function printproperty(io::IO, x, property::Symbol; indent=4) - value = getproperty(x, property) - print(io, "\n"*" "^indent) - printstyled(io, ":", string(property); color=:magenta) - printstyled(io, " => "; color=:red) - printstyled(io, truncate(repr(value; context=io), 100)) +function showproperties(io::IO, x::T) where T + context = IOContext(io, :compact => true, :limit => true) + print(context, summary(x), ":") + printstyled(context, "\n fields:", color=:yellow) + for fieldname in fieldnames(T)[1:end-1] + printfield(context, x, fieldname) + end + printstyled(context, "\n properties:", color=:yellow) + isempty(x.properties) && print(io, " (none)") + for name in keys(x.properties) + printfield(context, x, name) + end end - -# TODO: `assign_property!(chain, property)` vs `assign_!(chain)` -#assignproperty!(x, property::Symbol, args...) = assignproperty!(x, Val(property), args...) \ No newline at end of file diff --git a/src/secondary-structure.jl b/src/secondary-structure.jl index 3acaf1e..4afe394 100644 --- a/src/secondary-structure.jl +++ b/src/secondary-structure.jl @@ -5,7 +5,7 @@ const SS_CODES = ['-', 'H', 'E'] number_vector_to_codes(v::AbstractVector{<:Integer}) = SS_CODES[v] number_vector_to_code_string(v::AbstractVector{<:Integer}) = join(SS_CODES)[v] -function ASS.assign_secondary_structure(structure::AbstractVector{ProteinChain}) +function ASS.assign_secondary_structure(structure::AbstractVector{<:ProteinChain}) chains = collect(structure) backbones = map(chain -> chain.backbone, chains) return Dict(zip(chains, ASS.assign_secondary_structure(backbones))) diff --git a/src/structure.jl b/src/structure.jl index 4a014e2..bddda9c 100644 --- a/src/structure.jl +++ b/src/structure.jl @@ -1,20 +1,23 @@ """ - ProteinStructure <: AbstractVector{ProteinChain} + ProteinStructure{T<:AbstractFloat} <: AbstractVector{ProteinChain{T}} """ -mutable struct ProteinStructure <: AbstractVector{ProteinChain} +mutable struct ProteinStructure{T<:AbstractFloat} <: AbstractVector{ProteinChain{T}} name::String - chains::Vector{ProteinChain} - properties::Dict{Symbol,Any} + chains::Vector{ProteinChain{T}} + properties::Properties end -function ProteinStructure(name::String, chains::Vector{ProteinChain}; kwargs...) - ps = ProteinStructure(name, chains, Dict{Symbol,Any}()) - !isempty(kwargs) && push!(pc.properties, kwargs...) - return ps +function ProteinStructure(name::String, chains::Vector{ProteinChain{T}}; kwargs...) where T + return ProteinStructure{T}(name, chains, Properties(; + ids=map(chain -> chain.id, chains), + lengths=map(countresidues, chains), + kwargs...)) end -Base.getproperty(structure::ProteinStructure, property::Symbol) = _getproperty(structure, property) -Base.setproperty!(structure::ProteinStructure, property::Symbol, value) = _setproperty!(structure, property, value) +Base.hasproperty(structure::ProteinStructure, name::Symbol) = hasproperty(HasProperties(structure), name) +Base.getproperty(structure::ProteinStructure, name::Symbol) = getproperty(HasProperties(structure), name) +Base.setproperty!(structure::ProteinStructure, name::Symbol, value) = setproperty!(HasProperties(structure), name, value) +Base.propertynames(structure::ProteinStructure) = propertynames(HasProperties(structure)) Base.size(structure::ProteinStructure) = (length(structure.chains),) Base.getindex(structure::ProteinStructure, i) = structure.chains[i] @@ -23,12 +26,4 @@ Base.getindex(structure::ProteinStructure, id::AbstractString) = structure[findf Base.summary(structure::ProteinStructure) = "$(length(structure))-chain ProteinStructure \"$(structure.name)\" with $(length(structure.properties)) properties" -function Base.show(io::IO, ::MIME"text/plain", structure::ProteinStructure) - print(io, summary(structure), ":") - n, i = 10, 0 - for chain in structure - (i += 1) <= n || break - print(io, "\n ", summary(chain)) - end - i < length(structure) && print("\n …") -end +Base.show(io::IO, ::MIME"text/plain", structure::ProteinStructure) = showproperties(io, structure) \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index e456dd8..1e3269e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,5 +2,40 @@ using ProteinChains using Test @testset "ProteinChains.jl" begin - # Write your tests here. + + @testset "ProteinChain" begin + chain = ProteinChain("A", "AMINO", rand(3, 3, 5); modelnum=1) + @test countresidues(chain) == 5 + @test summary(chain) == "5-residue ProteinChain \"A\" with 1 properties" + end + + @testset "ProteinStructure" begin + chain = ProteinChain("A", "AMINO", rand(3, 3, 5)) + structure = ProteinStructure("1CHN", [chain]) + @test structure[1] === chain + @test structure["A"] === chain + + end + + @testset "PDB" begin + + @testset "read" begin + structure = pdb"1ASS" + @test countresidues.(chains_pdb) == [152] + chains_cif = readchains("data/1ASS.cif", MMCIFFormat) + @test chains_pdb[1].backbone.coords == chains_cif[1].backbone.coords + end + + @testset "write" begin + chains = pdb"1ASS" + new_chains = mktempdir() do temp_dir + temp_path = joinpath(temp_dir, "temp.pdb") + writepdb(temp_path, chains) + readpdb(temp_path) + end + @test chains[1].backbone.coords == new_chains[1].backbone.coords + end + + end + end