From 51c3c32ea162b83913f36a144b42e84dd373bd10 Mon Sep 17 00:00:00 2001 From: anton083 Date: Wed, 11 Sep 2024 16:42:00 +0200 Subject: [PATCH] Add AnnotatedProteinChain, cache PDB downloads, include disordered atoms/residues --- Project.toml | 2 +- README.md | 63 +++++++++++++++++++-------------- src/ProteinChains.jl | 15 +++++--- src/annotated-chain.jl | 52 ++++++++++++++++++++++++++++ src/atom.jl | 16 ++++----- src/chain.jl | 71 +++++++++++++++++++------------------- src/io/download.jl | 39 +++++++++++++++++++++ src/io/io.jl | 10 +----- src/io/read.jl | 68 +++++++++++++++--------------------- src/io/write.jl | 27 ++++++++------- src/secondary-structure.jl | 16 +++------ src/structure.jl | 34 +++++++++--------- test/runtests.jl | 34 ++++++++++-------- 13 files changed, 266 insertions(+), 181 deletions(-) create mode 100644 src/annotated-chain.jl create mode 100644 src/io/download.jl diff --git a/Project.toml b/Project.toml index 9f8cc59..b7e8d3c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ProteinChains" uuid = "b8e8f2a5-48d3-44f1-ba0d-c71cb7726ff8" authors = ["Anton Oresten and contributors"] -version = "0.1.1" +version = "0.2.0" [deps] AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7" diff --git a/README.md b/README.md index 43ea5ea..c4249cf 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Build Status](https://github.com/MurrellGroup/ProteinChains.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/ProteinChains.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/MurrellGroup/ProteinChains.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/MurrellGroup/ProteinChains.jl) -This Julia package provides implements the `ProteinChain` type: an efficient and GPU-friendly representation of protein chains. +This Julia package provides implements the `ProteinChain` type: a GPU-friendly structure-of-arrays representation of protein chains. ## Installation @@ -16,44 +16,55 @@ Pkg.add("ProteinChains") ## Examples +The `ProteinChain` type is meant to only store a set of quintessential fields, from which most other properties can be derived. + ```julia julia> using ProteinChains -julia> structure = pdb"1EYE" # convenient macro to download proteins from the PDB +julia> structure = pdb"1EYE" # string macro to fetch proteins from the PDB [ Info: Downloading file from PDB: 1EYE -1-chain ProteinStructure "1EYE.cif": - 2 fields: - name::String = "1EYE.cif" - chains::Vector{ProteinChain{Float64}} = - 2 properties: - ids::Vector{String} = ["A"] - lengths::Vector{Int64} = [253] +1-chain ProteinStructure "1EYE.cif" + 256-residue ProteinChain{Float64} (A) + +julia> propertynames(chain) +(:id, :sequence, :backbone, :numbering, :atoms) +``` + +To store additional properties, `AnnotatedProteinChain` can be used to add dynamic properties to the chain: -julia> chain = structure["A"] -253-residue ProteinChain "A": - 4 fields: +```julia +julia> annotated_chain = annotate(chain; model=1) +256-residue AnnotatedProteinChain{Float64} (A): + 6 fields: id::String = "A" sequence::String = backbone::Array{Float64,3} = + numbering::Vector{Int64} = atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = - 2 properties: + indexable_properties::Vector{Symbol} = Symbol[] + 1 property: + model::Int64 = 1 +``` + +For properties of type `<:AbstractArray` that represent residue-level information, `annotate_indexable!` will index the last dimension of the property when the chain is indexed: + +```julia +julia> annotate_indexable!(annotated_chain; secondary_structure=assign_secondary_structure(annotated_chain) +256-residue AnnotatedProteinChain{Float64} (A): + 6 fields: + id::String = "A" + sequence::String = + backbone::Array{Float64,3} = numbering::Vector{Int64} = - modelnum::Int64 = 1 - -julia> chain.numbering -253-element Vector{Int64}: - 5 - 6 - 7 - 8 - ⋮ - 271 - 272 - 273 - 274 + atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = + indexable_properties::Vector{Symbol} = [:secondary_structure] + 2 properties: + model::Int64 = 1 + secondary_structure::Vector{Int64} = [1, 1, 3, 3, 3, 3, 3, 3, 3, 1 … 2, 2, 2, 2, 2, 2, 2, 1, 1, 1] ``` ## See also + - [Backboner.jl](https://github.com/MurrellGroup/Backboner.jl) - [BioStructures.jl](https://github.com/BioJulia/BioStructures.jl) - [PDBTools.jl](https://github.com/m3g/PDBTools.jl) diff --git a/src/ProteinChains.jl b/src/ProteinChains.jl index 589bee8..0fd42d2 100644 --- a/src/ProteinChains.jl +++ b/src/ProteinChains.jl @@ -18,20 +18,25 @@ export prepend_residue include("atom.jl") include("chain.jl") +export AbstractProteinChain export ProteinChain -export countresidues +export length export psi_angles, omega_angles, phi_angles +include("annotated-chain.jl") +export AnnotatedProteinChain +export annotate!, annotate +export annotate_indexable!, annotate_indexable + include("structure.jl") export ProteinStructure include("secondary-structure.jl") +export assign_secondary_structure, assign_secondary_structure! include("io/io.jl") -export readchains -export readpdb -export readcif -export writechains +export readcif, readpdb +export writecif, writepdb export PDBFormat, MMCIFFormat export pdbentry, @pdb_str diff --git a/src/annotated-chain.jl b/src/annotated-chain.jl new file mode 100644 index 0000000..9454f14 --- /dev/null +++ b/src/annotated-chain.jl @@ -0,0 +1,52 @@ +@dynamic mutable struct AnnotatedProteinChain{T} <: AbstractProteinChain{T} + id::String + sequence::String + backbone::Array{T,3} + numbering::Vector{Int64} + atoms::Vector{Vector{Atom{T}}} + indexable_properties::Vector{Symbol} + + function AnnotatedProteinChain( + id::AbstractString, sequence::AbstractString, backbone::AbstractArray{T,3}, + numbering::AbstractVector{<:Integer}, atoms::Vector{Vector{Atom{T}}}, + indexable_properties=Symbol[]; kwargs... + ) where T + @assert length(sequence) == size(backbone, 3) == length(atoms) == length(numbering) + AnnotatedProteinChain{T}(id, sequence, backbone, atoms, numbering, indexable_properties; kwargs...) + end +end + +ProteinChain(chain::AbstractProteinChain) = ProteinChain(chain.id, chain.sequence, chain.backbone, chain.numbering, chain.atoms) +AnnotatedProteinChain(chain::AbstractProteinChain) = AnnotatedProteinChain(chain.id, chain.sequence, chain.backbone, chain.numbering, chain.atoms) + +function Base.getindex(chain::AnnotatedProteinChain, i::AbstractVector{<:Integer}) + properties = Dict{Symbol,Any}() + for name in getproperties(chain; fields=false) + value = getproperty(chain, name) + properties[name] = name in chain.indexable_properties ? collect(selectdim(value, ndims(value), i)) : value + end + return annotate(ProteinChain(chain)[i]; properties...) +end + +function annotate_indexable!(chain::AnnotatedProteinChain; annotations...) + for (name, value) in annotations + @assert value isa AbstractArray + @assert size(value)[end] == length(chain) + push!(chain.indexable_properties, name) + setproperty!(chain, name, value) + end + return chain +end + +annotate_indexable(chain::AnnotatedProteinChain; annotations...) = annotate_indexable!(deepcopy(chain); annotations...) +annotate_indexable(chain::ProteinChain; annotations...) = annotate_indexable!(AnnotatedProteinChain(chain); annotations...) + +function annotate!(chain::AnnotatedProteinChain; annotations...) + for (name, value) in annotations + setproperty!(chain, name, value) + end + return chain +end + +annotate(chain::AnnotatedProteinChain; annotations...) = annotate(deepcopy(chain); annotations...) +annotate(chain::ProteinChain; annotations...) = annotate!(AnnotatedProteinChain(chain); annotations...) diff --git a/src/atom.jl b/src/atom.jl index 518e936..d893879 100644 --- a/src/atom.jl +++ b/src/atom.jl @@ -6,22 +6,22 @@ const ATOMIC_NUMBER_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL element_symbol_to_atomic_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_ATOMIC_NUMBER[uppercase(strip(element_symbol))] atomic_number_to_element_symbol(atomic_number::Integer) = ATOMIC_NUMBER_TO_ELEMENT_SYMBOL[atomic_number] -pad_atom_name(atom_name::AbstractString, element_symbol::AbstractString) = rpad(" "^(2-length(strip(element_symbol)))*strip(atom_name), 4) +pad_atom_name(name::AbstractString, element_symbol::AbstractString) = rpad(" "^(2-length(strip(element_symbol)))*strip(name), 4) -encode_atom_name(atom_name::AbstractString, element_symbol::AbstractString) = reinterpret(UInt32, codeunits(pad_atom_name(atom_name, element_symbol)))[1] -decode_atom_name(atom_name::UInt32) = String(reinterpret(UInt8, [atom_name])) +encode_atom_name(name::AbstractString, element_symbol::AbstractString) = reinterpret(UInt32, codeunits(pad_atom_name(name, element_symbol)))[1] +decode_atom_name(name::UInt32) = String(reinterpret(UInt8, [name])) mutable struct Atom{T<:AbstractFloat} - atom_name::UInt32 + name::UInt32 atomic_number::Int8 x::T y::T z::T end -Atom(atom_name::UInt32, atomic_number::Integer, x::T, y::T, z::T) where T = Atom{T}(atom_name, atomic_number, x, y, z) -Atom(atom_name::AbstractString, element_symbol::AbstractString, coords::AbstractVector{<:AbstractFloat}) = - Atom(encode_atom_name(atom_name, element_symbol), element_symbol_to_atomic_number(element_symbol), coords...) +Atom(name::UInt32, atomic_number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, atomic_number, x, y, z) +Atom(name::AbstractString, element_symbol::AbstractString, coords::AbstractVector{<:AbstractFloat}) = + Atom(encode_atom_name(name, element_symbol), element_symbol_to_atomic_number(element_symbol), coords...) coords(atom::Atom) = [atom.x, atom.y, atom.z] @@ -33,4 +33,4 @@ function offset!(atom::Atom, coords::Vector{<:Real}) atom.y += coords[2] atom.z += coords[3] return atom -end \ No newline at end of file +end diff --git a/src/chain.jl b/src/chain.jl index dbe4644..f2cfc18 100644 --- a/src/chain.jl +++ b/src/chain.jl @@ -1,41 +1,53 @@ -@dynamic mutable struct ProteinChain{T<:AbstractFloat} +""" + AbstractProteinChain{T<:AbstractFloat} + +An abstract type representing a protein chain, with a type parameter +`T` that specifies the floating-point type of the coordinates. + +## Subtypes +- `ProteinChain{T}`: A concrete subtype of `AbstractProteinChain` that represents a protein chain with a spe. +""" +abstract type AbstractProteinChain{T<:AbstractFloat} end + +Base.summary(chain::AbstractProteinChain) = "$(length(chain))-residue $(typeof(chain)) ($(chain.id))" +Base.length(chain::AbstractProteinChain) = size(chain.backbone, 3) + +mutable struct ProteinChain{T} <: AbstractProteinChain{T} id::String sequence::String backbone::Array{T,3} + numbering::Vector{Int} atoms::Vector{Vector{Atom{T}}} + + function ProteinChain( + id::AbstractString, sequence::AbstractString, backbone::AbstractArray{T,3}, + numbering::AbstractVector{<:Integer} = collect(1:length(sequence)), + atoms::Vector{Vector{Atom{T}}} = [Atom{T}[] for _ in sequence], + ) where T + @assert length(sequence) == size(backbone, 3) == length(numbering) == length(atoms) + new{T}(id, sequence, backbone, numbering, atoms) + end end """ ProteinChain{T<:AbstractFloat} - -## Examples - -```jldoctest -julia> structure = pdb"1EYE"; -[ Info: Downloading file from PDB: 1EYE - -julia> structure[1] -253-residue ProteinChain "A": - 4 fields: - id::String = "A" - sequence::String = - backbone::Array{Float64,3} = - atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = - 2 properties: - numbering::Vector{Int64} = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14 … 265, 266, 267, 268, 269, 270, 271, 272, 273, 274] - modelnum::Int64 = 1 -``` """ ProteinChain -countresidues(chain::ProteinChain) = length(chain.sequence) +Base.getindex(chain::ProteinChain, i::AbstractVector{<:Integer}) = + ProteinChain(chain.id, chain.sequence[i], chain.backbone[:,:,i], chain.numbering[i], chain.atoms[i]) + +import Backboner -Base.getindex(chain::ProteinChain, i::AbstractVector{<:Integer}) = ProteinChain(chain.id, chain.sequence[i], chain.backbone[:,:,i], chain.atoms[i]) +Backboner.Backbone(chain::AbstractProteinChain) = Backbone(chain.backbone) +Backboner.ChainedBonds(chain::AbstractProteinChain) = ChainedBonds(Backbone(chain)) +Backboner.Frames(chain::AbstractProteinChain, ideal_residue=STANDARD_RESIDUE) = Frames(Backbone(chain), ideal_residue) -Base.summary(chain::ProteinChain) = "$(countresidues(chain))-residue ProteinChain \"$(chain.id)\"" +psi_angles(chain::AbstractProteinChain) = get_torsion_angles(Backbone(chain))[1:3:end] +omega_angles(chain::AbstractProteinChain) = get_torsion_angles(Backbone(chain))[2:3:end] +phi_angles(chain::AbstractProteinChain) = get_torsion_angles(Backbone(chain))[3:3:end] -function offset!(chain::ProteinChain, coords::Vector{<:Real}) - @assert length(coords) == 3 +function offset!(chain::AbstractProteinChain, coords::Vector{<:Real}) chain.backbone .+= coords for residue_atoms in chain.atoms for atom in residue_atoms @@ -44,14 +56,3 @@ function offset!(chain::ProteinChain, coords::Vector{<:Real}) end return chain end - -import Backboner - -Backboner.Backbone(chain::ProteinChain) = Backbone(chain.backbone) -Backboner.ChainedBonds(chain::ProteinChain) = ChainedBonds(Backbone(chain)) -Backboner.Frames(chain::ProteinChain, ideal_residue=STANDARD_RESIDUE) = Frames(Backbone(chain), ideal_residue) - -# TODO: "contiguity" field/property to mask outputs with NaN where residues are not contiguous -psi_angles(chain::ProteinChain) = get_torsion_angles(Backbone(chain))[1:3:end] -omega_angles(chain::ProteinChain) = get_torsion_angles(Backbone(chain))[2:3:end] -phi_angles(chain::ProteinChain) = get_torsion_angles(Backbone(chain))[3:3:end] \ No newline at end of file diff --git a/src/io/download.jl b/src/io/download.jl new file mode 100644 index 0000000..075662d --- /dev/null +++ b/src/io/download.jl @@ -0,0 +1,39 @@ +using Base.Filesystem + +const MAX_CACHE_ENTRIES = Ref{Int}(4) +const CACHE_DIR = Ref{Union{String, Nothing}}(nothing) + +function initialize_cache_dir() + if CACHE_DIR[] === nothing + CACHE_DIR[] = mktempdir(prefix="pdb_cache_") + atexit(() -> rm(CACHE_DIR[], recursive=true)) + end +end + +function manage_cache() + entries = readdir(CACHE_DIR[]) + while length(entries) > MAX_CACHE_ENTRIES[] + oldest_entry = entries[argmin([mtime(joinpath(CACHE_DIR[], entry)) for entry in entries])] + rm(joinpath(CACHE_DIR[], oldest_entry)) + entries = readdir(CACHE_DIR[]) + end +end + +function pdbentry(pdbid::AbstractString; format=BioStructures.MMCIFFormat, kwargs...) + initialize_cache_dir() + path = BioStructures.downloadpdb(pdbid; dir=CACHE_DIR[], format=format, kwargs...) + manage_cache() + return read(path, ProteinStructure, format) +end + +macro pdb_str(pdbid::AbstractString) + :(pdbentry($(esc(pdbid)))) +end + +macro pdb_str(pdbid::AbstractString, chain::AbstractString) + :(pdbentry($(esc(pdbid)))[$chain]) +end + +macro pdb_str(pdbid::AbstractString, ba_number::Integer) + :(pdbentry($(esc(pdbid)), ba_number=$ba_number)) +end diff --git a/src/io/io.jl b/src/io/io.jl index a234029..3c53971 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -17,12 +17,4 @@ get_format(path::AbstractString) = get(pdbextension_to_format, lowercase(last(sp include("renumber.jl") include("read.jl") include("write.jl") - -pdbentry(pdbid::AbstractString; format=BioStructures.MMCIFFormat, kwargs...) = mktempdir() do dir - path = BioStructures.downloadpdb(pdbid; dir, format, kwargs...) - readchains(path, format) -end - -macro pdb_str(pdbid::AbstractString) - :(pdbentry($(esc(pdbid)))) -end +include("download.jl") diff --git a/src/io/read.jl b/src/io/read.jl index 35820f0..6b3eb91 100644 --- a/src/io/read.jl +++ b/src/io/read.jl @@ -1,75 +1,61 @@ backbone_atom_selector(atom::BioStructures.AbstractAtom) = BioStructures.atomnameselector(atom, BACKBONE_ATOM_NAMES) + backbone_residue_selector(residue::BioStructures.AbstractResidue) = oneletter_resname(residue) in AMINOACIDS && BioStructures.countatoms(residue, backbone_atom_selector) == 3 && - BioStructures.standardselector(residue) && - !BioStructures.disorderselector(residue) && - !any(atom -> atom isa BioStructures.DisorderedAtom, BioStructures.collectatoms(residue)) + BioStructures.standardselector(residue) -function get_atom(residue::BioStructures.AbstractResidue, atom_name::AbstractString) - selector = atom -> BioStructures.atomnameselector(atom, [atom_name]) - residue_atoms = BioStructures.collectatoms(residue, selector) - return only(residue_atoms) -end +get_sequence(residues::Vector{BioStructures.AbstractResidue}) = join(map(oneletter_resname, residues)) -function get_backbone(residue::BioStructures.AbstractResidue) - atom_name_to_atom_coords = atom_name -> BioStructures.coords(get_atom(residue, atom_name)) - residue_coords = stack(atom_name_to_atom_coords, BACKBONE_ATOM_NAMES) - return Backbone(residue_coords) -end +get_atom(residue::BioStructures.AbstractResidue, name::AbstractString) = + BioStructures.collectatoms(residue, a -> BioStructures.atomnameselector(a, [name])) |> only -function get_backbone(residues::Vector{BioStructures.AbstractResidue}) - chain_coords = mapreduce(residue -> get_backbone(residue).coords, hcat, residues; init=Matrix{Float64}(undef, 3, 0)) - return Backbone(chain_coords) -end +get_backbone(residue::BioStructures.AbstractResidue) = + stack(name -> BioStructures.coords(get_atom(residue, name)), BACKBONE_ATOM_NAMES) -get_backbone(chain::BioStructures.Chain, selector) = get_backbone(BioStructures.collectresidues(chain, selector)) +get_backbone(residues::Vector{BioStructures.AbstractResidue}) = + mapreduce(get_backbone, hcat, residues; init=Matrix{Float64}(undef, 3, 0)) -aminoacid_sequence(residues::Vector{BioStructures.AbstractResidue}) = join(oneletter_resname.(residues)) -aminoacid_sequence(chain::BioStructures.Chain, selector) = aminoacid_sequence(BioStructures.collectresidues(chain, selector)) +Atom(atom::BioStructures.Atom) = Atom(atom.name, atom.element, atom.coords) +Atom(atom::BioStructures.DisorderedAtom) = Atom(BioStructures.defaultatom(atom)) function get_nonbackbone_atoms(residues::Vector{BioStructures.AbstractResidue}) atoms = Vector{Atom{Float64}}[] for residue in residues - residue_atoms_bs = BioStructures.collectatoms(residue, a -> !backbone_atom_selector(a) && BioStructures.standardselector(a) && !BioStructures.disorderselector(a)) - @assert !isempty(residue_atoms_bs) - push!(atoms, map(atom_bs -> Atom(atom_bs.name, atom_bs.element, atom_bs.coords), residue_atoms_bs)) + residue = residue isa BioStructures.DisorderedResidue ? BioStructures.defaultresidue(residue) : residue + residue_atoms = map(Atom, BioStructures.collectatoms(residue, !backbone_atom_selector)) + push!(atoms, residue_atoms) end return atoms end function ProteinChain(residues::Vector{BioStructures.AbstractResidue}) - id = only(unique(BioStructures.chainid.(residues))) - sequence = aminoacid_sequence(residues) - backbone = reshape(get_backbone(residues).coords, 3, 3, :) + id = only(unique(map(BioStructures.chainid, residues))) + sequence = get_sequence(residues) + backbone = reshape(get_backbone(residues), 3, 3, :) atoms = get_nonbackbone_atoms(residues) - numbering = BioStructures.resnumber.(residues) - modelnum = only(unique(BioStructures.modelnumber.(residues))) - return ProteinChain(id, sequence, backbone, atoms; numbering, modelnum) + numbering = map(BioStructures.resnumber, residues) + return ProteinChain(id, sequence, backbone, numbering, atoms) end function ProteinChain(chain::BioStructures.Chain, selector=backbone_residue_selector) residues = BioStructures.collectresidues(chain, selector) - isempty(residues) && return ProteinChain(BioStructures.chainid(chain), "", zeros(3, 3, 0), Int[]) + isempty(residues) && return ProteinChain(BioStructures.chainid(chain), "", zeros(3, 3, 0), Int[], Vector{Atom{Float64}}[]) return ProteinChain(residues) end function ProteinStructure(struc::BioStructures.MolecularStructure, selector=backbone_residue_selector; path=nothing) - name = struc.name - chains = ProteinChain{Float64}[] + chains = ProteinChain[] for model in struc, chain in model - if !isempty(chain) - pc = ProteinChain(chain, selector) - countresidues(pc) > 0 && push!(chains, pc) - end + push!(chains, ProteinChain(chain, selector)) end - structure = ProteinStructure(name, chains; ids=map(chain -> chain.id, chains), lengths=map(countresidues, chains)) + structure = ProteinStructure(struc.name, chains) !isnothing(path) && endswith(path, ".cif") && renumber!(structure, BioStructures.MMCIFDict(path)) return structure end -readchains(path::AbstractString, format::Type{<:ProteinFileFormat}) = ProteinStructure(read(path, format); path) -readchains(path::AbstractString) = readchains(path, get_format(path)) +Base.read(path::AbstractString, ::Type{ProteinStructure}, format::Type{<:ProteinFileFormat}) = ProteinStructure(read(path, format); path) +Base.read(path::AbstractString, T::Type{ProteinStructure}) = read(path, T, get_format(path)) -readpdb(path::AbstractString) = readchains(path, PDBFormat) -readcif(path::AbstractString) = readchains(path, MMCIFFormat) \ No newline at end of file +readcif(path::AbstractString) = read(path, ProteinStructure, MMCIFFormat) +readpdb(path::AbstractString) = read(path, ProteinStructure, PDBFormat) diff --git a/src/io/write.jl b/src/io/write.jl index 8f7d7ad..3e198ab 100644 --- a/src/io/write.jl +++ b/src/io/write.jl @@ -13,8 +13,8 @@ function estimate_last_oxygen_position(backbone::Array{T,3}) where T<:Real return O_pos end -function BioStructures.Chain(proteinchain::ProteinChain, model::BioStructures.Model) - numbering = hasproperty(proteinchain, :numbering) ? proteinchain.numbering : collect(1:countresidues(proteinchain)) +function BioStructures.Chain(proteinchain::AbstractProteinChain, model::BioStructures.Model) + numbering = proteinchain.numbering residue_list = Vector{String}() residues = Dict{String, BioStructures.AbstractResidue}() chain = BioStructures.Chain(proteinchain.id, residue_list, residues, model) @@ -25,22 +25,22 @@ function BioStructures.Chain(proteinchain::ProteinChain, model::BioStructures.Mo oxygen_coords[:,end] = estimate_last_oxygen_position(proteinchain.backbone) atom_serial = 0 - for residue_index in 1:countresidues(proteinchain) + for residue_index in 1:length(proteinchain) atom_list = Vector{String}() atoms = Dict{String, BioStructures.AbstractAtom}() resname = threeletter_resname(proteinchain.sequence[residue_index]) number = numbering[residue_index] residue = BioStructures.Residue(resname, number, ' ', false, atom_list, atoms, chain, '-') # TODO: secondary structure - for (i, (atom_name, element)) in enumerate(zip(BACKBONE_ATOM_NAMES, BACKBONE_ATOM_SYMBOLS)) + for (i, (name, element)) in enumerate(zip(BACKBONE_ATOM_NAMES, BACKBONE_ATOM_SYMBOLS)) atom_serial += 1 coords = proteinchain.backbone[:, i, residue_index] - atom = BioStructures.Atom(atom_serial, atom_name, ' ', coords, 1.0, 0.0, element, " ", residue) + atom = BioStructures.Atom(atom_serial, name, ' ', coords, 1.0, 0.0, element, " ", residue) push!(atom_list, atom.name) atoms[atom.name] = atom end - if !any(atom -> atom.atom_name == oxygen_name, proteinchain.atoms[residue_index]) + if !any(atom -> atom.name == oxygen_name, proteinchain.atoms[residue_index]) atom_serial += 1 coords = oxygen_coords[:, residue_index] atom = BioStructures.Atom(atom_serial, "O", ' ', coords, 1.0, 0.0, "O", " ", residue) @@ -50,10 +50,10 @@ function BioStructures.Chain(proteinchain::ProteinChain, model::BioStructures.Mo for atom in proteinchain.atoms[residue_index] atom_serial += 1 - atom_name = decode_atom_name(atom.atom_name) + name = decode_atom_name(atom.name) coords = [atom.x, atom.y, atom.z] element = atomic_number_to_element_symbol(atom.atomic_number) - atom = BioStructures.Atom(atom_serial, atom_name, ' ', coords, 1.0, 0.0, element, " ", residue) + atom = BioStructures.Atom(atom_serial, name, ' ', coords, 1.0, 0.0, element, " ", residue) push!(atom_list, atom.name) atoms[atom.name] = atom end @@ -76,7 +76,7 @@ function BioStructures.MolecularStructure(proteinstruc::ProteinStructure) return struc end -function writechains(path::AbstractString, proteinstruc::ProteinStructure, format::Type{<:ProteinFileFormat}) +function Base.write(path::AbstractString, proteinstruc::ProteinStructure, format::Type{<:ProteinFileFormat}) struc = BioStructures.MolecularStructure(proteinstruc) if format == MMCIFFormat BioStructures.writemmcif(path, struc) @@ -88,11 +88,12 @@ function writechains(path::AbstractString, proteinstruc::ProteinStructure, forma return nothing end -function writechains(path::AbstractString, proteinchains::AbstractVector{<:ProteinChain}, format) +function Base.write(path::AbstractString, proteinchains::AbstractVector{<:AbstractProteinChain}, format=get_format(path)) proteinstruc = ProteinStructure(basename(path), proteinchains) - writechains(path, proteinstruc, format) + write(path, proteinstruc, format) end -writechains(path::AbstractString, proteinchain::ProteinChain, format) = writechains(path, [proteinchain], format) +Base.write(path::AbstractString, proteinchain::AbstractProteinChain, format=get_format(path)) = write(path, [proteinchain], format) -writechains(path::AbstractString, arg) = writechains(path, arg, get_format(path)) +writecif(path::AbstractString, x) = write(path, x, MMCIFFormat) +writepdb(path::AbstractString, x) = write(path, x, PDBFormat) diff --git a/src/secondary-structure.jl b/src/secondary-structure.jl index 4af836e..afdff1c 100644 --- a/src/secondary-structure.jl +++ b/src/secondary-structure.jl @@ -1,19 +1,13 @@ import AssigningSecondaryStructure: assign_secondary_structure!, assign_secondary_structure -export assign_secondary_structure! - const SS_CODES = ['-', 'H', 'E'] number_vector_to_codes(v::AbstractVector{<:Integer}) = SS_CODES[v] -number_vector_to_code_string(v::AbstractVector{<:Integer}) = join(SS_CODES)[v] -function assign_secondary_structure!(chain::ProteinChain) - number_vector = assign_secondary_structure(chain.backbone) - chain.secondary_structure = number_vector_to_code_string(number_vector) - return chain -end +assign_secondary_structure(chain::AbstractProteinChain) = assign_secondary_structure(chain.backbone) -function assign_secondary_structure!(structure::AbstractVector{<:ProteinChain}) - assign_secondary_structure!.(structure) - return structure +function assign_secondary_structure!(chain::AnnotatedProteinChain) + number_vector = assign_secondary_structure(chain) + annotate_indexable!(chain; secondary_structure=number_vector_to_codes(number_vector)) + return chain end diff --git a/src/structure.jl b/src/structure.jl index 1021afd..a4ee53d 100644 --- a/src/structure.jl +++ b/src/structure.jl @@ -1,35 +1,33 @@ -@dynamic mutable struct ProteinStructure{T<:AbstractFloat} <: AbstractVector{ProteinChain{T}} - name::String - chains::Vector{ProteinChain{T}} -end - """ - ProteinStructure{T<:AbstractFloat} <: AbstractVector{ProteinChain{T}} - -## Examples + ProteinStructure <: AbstractVector{AbstractProteinChain} ```jldoctest julia> pdb"1EYE" -1-chain ProteinStructure "1EYE.cif" with 2 dynamic properties: - 2 fields: - name::String = "1EYE.cif" - chains::Vector{ProteinChain{Float64}} = - 2 properties: - ids::Vector{String} = ["A"] - lengths::Vector{Int64} = [253] +[ Info: Downloading file from PDB: 1EYE +1-chain ProteinStructure "1EYE.cif" + 253-residue AnnotatedProteinChain{Float64} (A) ``` """ -ProteinStructure +mutable struct ProteinStructure <: AbstractVector{AbstractProteinChain} + name::String + chains::Vector{AbstractProteinChain} +end Base.size(structure::ProteinStructure) = (length(structure.chains),) Base.getindex(structure::ProteinStructure, i) = structure.chains[i] Base.getindex(structure::ProteinStructure, id::AbstractString) = structure[findfirst(c -> c.id == id, structure.chains)] -Base.summary(structure::ProteinStructure) = "$(length(structure))-chain ProteinStructure \"$(structure.name)\" with $(length(getproperties(structure; fields=false))) dynamic properties" +Base.summary(structure::ProteinStructure) = "$(length(structure))-chain ProteinStructure \"$(structure.name)\"" + +function Base.show(io::IO, ::MIME"text/plain", structure::ProteinStructure) + print(io, summary(structure)) + for chain in structure + print(io, "\n ", summary(chain)) + end +end function offset!(structure::ProteinStructure, coords::Vector{<:Real}) - @assert length(coords) == 3 for chain in structure offset!(chain, coords) end diff --git a/test/runtests.jl b/test/runtests.jl index 80f35e5..33017ec 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,24 +4,36 @@ using Test @testset "ProteinChains.jl" begin @testset "ProteinChain" begin - chain = ProteinChain("A", "AMINO", rand(3, 3, 5), [ProteinChains.Atom{Float64}[] for _ in 1:5]; modelnum=1) - @test countresidues(chain) == 5 - @test summary(chain) == "5-residue ProteinChain \"A\"" - @test chain[5:-1:1] == ProteinChain("A", "ONIMA", chain.backbone[:,:,5:-1:1], chain.atoms[5:-1:1]) + chain = ProteinChain("A", "AMINO", rand(3, 3, 5), collect(1:5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) + @test length(chain) == 5 + @test summary(chain) == "5-residue ProteinChain{Float64} (A)" + + i = 5:-1:1 + @test chain[i].backbone == chain.backbone[:,:,i] end @testset "ProteinStructure" begin - chain = ProteinChain("A", "AMINO", rand(3, 3, 5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) + chain = ProteinChain("A", "AMINO", rand(3, 3, 5), collect(1:5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) structure = ProteinStructure("1CHN", [chain]) @test structure[1] === chain @test structure["A"] === chain end + @testset "AnnotatedProteinChain" begin + chain = ProteinChain("A", "AMINO", rand(3, 3, 5), collect(1:5), [ProteinChains.Atom{Float64}[] for _ in 1:5]) + annotated_chain = annotate(chain; modelnum=1) + @test annotated_chain isa AnnotatedProteinChain + @test annotated_chain.modelnum == 1 + annotate_indexable!(annotated_chain; secondary_structure=assign_secondary_structure(annotated_chain)) + @test length(annotated_chain.secondary_structure) == 5 + @test length(annotated_chain[1:3].secondary_structure) == 3 + end + @testset "read/write" begin @testset "read" begin chains_pdb = pdbentry("1ASS"; format=PDBFormat) - @test countresidues.(collect(chains_pdb)) == [152] + @test length.(collect(chains_pdb)) == [152] chains_cif = pdbentry("1ASS"; format=MMCIFFormat) @test chains_pdb[1].backbone == chains_cif[1].backbone end @@ -30,18 +42,12 @@ using Test chains = pdb"1ASS" new_chains = mktempdir() do temp_dir temp_path = joinpath(temp_dir, "temp.pdb") - writechains(temp_path, chains) - readchains(temp_path) + write(temp_path, chains) + read(temp_path, ProteinStructure) end @test chains[1].backbone == new_chains[1].backbone end end - @testset "secondary structure" begin - ss_composition(chain::ProteinChain) = [count(==(ss), chain.secondary_structure) for ss in ['-', 'H', 'E']] - structure = assign_secondary_structure!(pdb"1ZAK") - @test ss_composition.(collect(structure)) == [[72, 116, 32], [72, 116, 32]] - end - end