diff --git a/README.md b/README.md index 87d19d8..8a20e59 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,42 @@ # AssigningSecondaryStructure +[![Latest Release](https://img.shields.io/github/release/MurrellGroup/AssigningSecondaryStructure.jl.svg)](https://github.com/MurrellGroup/AssigningSecondaryStructure.jl/releases/latest) +[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/license/MIT) [![Build Status](https://github.com/MurrellGroup/AssigningSecondaryStructure.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/AssigningSecondaryStructure.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/MurrellGroup/AssigningSecondaryStructure.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/MurrellGroup/AssigningSecondaryStructure.jl) -This package provides a quick way to assign secondary structure using a simplified version of the [DSSP](https://swift.cmbi.umcn.nl/gv/dssp/) algorithm. The code was ported from the [PyDSSP](https://github.com/ShintaroMinami/PyDSSP) package. +This package provides an easy way to assign secondary structure to proteins using a simplified version of the [DSSP](https://swift.cmbi.umcn.nl/gv/dssp/) algorithm. The code was ported from the [PyDSSP](https://github.com/ShintaroMinami/PyDSSP) package. See the original Python package for more information on the differences between this implementation and the original DSSP algorithm. -This is not a complete implementation of DSSP, as it only assigns loops (1), helices (2), and strands (3). It is not as accurate as the original, but is significantly faster. 
For the full DSSP algorithm, check out [BioStructures.jl](https://github.com/BioJulia/BioStructures.jl) or [ProteinSecondaryStructures.jl](https://github.com/m3g/ProteinSecondaryStructures.jl), which both use the [DSSP_jll.jl](https://docs.juliahub.com/General/DSSP_jll/stable/) package that was auto-generated using [BinaryBuilder.jl](https://github.com/JuliaPackaging/BinaryBuilder.jl). +This is not a complete implementation of DSSP, as it only assigns coils/loops ('-'), helices ('H'), and strands ('E').[^1] It is not as accurate as the original, but is significantly faster. For the full DSSP algorithm, check out [BioStructures.jl](https://github.com/BioJulia/BioStructures.jl) or [ProteinSecondaryStructures.jl](https://github.com/m3g/ProteinSecondaryStructures.jl), which both use the [DSSP_jll.jl](https://docs.juliahub.com/General/DSSP_jll/stable/) package that was auto-generated using [BinaryBuilder.jl](https://github.com/JuliaPackaging/BinaryBuilder.jl). + +## Installation + +The package can be installed using the Julia package manager: + +```julia +using Pkg; +Pkg.add("AssigningSecondaryStructure") +``` + +## Usage ```julia +julia> using AssigningSecondaryStructure + julia> assign_secondary_structure("test/data/1ASS.pdb") # 1 chain -1-element Vector{Vector{Int64}}: - [1, 1, 1, 3, 3, 3, 1, 1, 1, 1 … 3, 3, 3, 3, 3, 3, 3, 1, 1, 1] +1-element Vector{Vector{Char}}: + ['-', '-', '-', 'E', 'E', 'E', '-', '-' … 'E', 'E', 'E', 'E', 'E', '-', '-', '-'] julia> assign_secondary_structure("test/data/1ZAK.pdb") # 2 chains -2-element Vector{Vector{Int64}}: - [1, 1, 1, 1, 3, 3, 3, 3, 3, 3 … 2, 2, 2, 2, 2, 2, 2, 1, 1, 1] - [1, 1, 1, 1, 3, 3, 3, 3, 3, 3 … 2, 2, 2, 2, 2, 2, 2, 1, 1, 1] -``` \ No newline at end of file +2-element Vector{Vector{Char}}: + ['-', '-', '-', '-', 'E', 'E', 'E', 'E' … 'H', 'H', 'H', 'H', 'H', '-', '-', '-'] + ['-', '-', '-', '-', 'E', 'E', 'E', 'E' … 'H', 'H', 'H', 'H', 'H', '-', '-', '-'] +``` + +Note: The `assign_secondary_structure` function can also 
take a vector of atom coordinate arrays of size (3, 4, L) to avoid the overhead of I/O, in cases where the atom coordinates are already loaded. The first dimension holds the x, y, and z coordinates, the second dimension indexes the backbone atoms (N, CA, C, O), and the third dimension indexes the residues. + +## Acknowledgements + +This package was ported from the [PyDSSP](https://github.com/ShintaroMinami/PyDSSP) package, created by Shintaro Minami. The creation of this package would have been much more difficult without the original Python code as a reference. + +[^1]: Read about DSSP codes on Wikipedia: [Protein secondary structure](https://en.wikipedia.org/wiki/Protein_secondary_structure) \ No newline at end of file diff --git a/src/assign.jl b/src/assign.jl index b7bfb53..5273c36 100644 --- a/src/assign.jl +++ b/src/assign.jl @@ -15,11 +15,14 @@ function assign_secondary_structure(coords_chains::Vector{<:AbstractArray{T, 3}} coords = cat(coords_chains..., dims=3) num_vector = dssp(coords) + code_vector = [NUM_TO_SS_CODE[num] for num in num_vector] cum_indices = cumsum(lengths) - num_vectors_by_chain = [num_vector[get(cum_indices, n-1, 0)+1:cum_indices[n]] for n in 1:length(lengths)] + code_vectors_by_chain = [code_vector[get(cum_indices, n-1, 0)+1:cum_indices[n]] for n in 1:length(lengths)] - return num_vectors_by_chain + clean_secondary_structure!.(code_vectors_by_chain) + + return code_vectors_by_chain end """ @@ -27,11 +30,11 @@ end Returns a vector of vectors of integers, each of which is the secondary structure assignment for the corresponding chain and their respective residues. - + The integers are assigned as follows: -- 1: loop -- 2: helix -- 3: strand +- '-': coil/loop. Neither helix nor strand. +- 'H': 4-turn helix (α helix). Minimum length 4 residues. +- 'E': extended strand in parallel and/or anti-parallel β-sheet conformation. Minimum length 2 residues. 
""" function assign_secondary_structure(filename::String) chains = load_pdb_backbone_coords(filename) diff --git a/src/utils.jl b/src/utils.jl index 04ff522..6041ea1 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,5 +1,41 @@ # These functions come from numpy and were used to port the code from python to julia. +const NUM_TO_SS_CODE = Dict( + 1 => '-', + 2 => 'H', + 3 => 'E', +) + +const MIN_HELIX_LENGTH = 4 +const MIN_STRAND_LENGTH = 2 + +function clean_secondary_structure!(ss_vector::Vector{Char}) + n = length(ss_vector) + i = 1 + + while i <= n + current_structure = ss_vector[i] + start = i + + while i <= n && ss_vector[i] == current_structure + i += 1 + end + segment_end = i - 1 + segment_length = segment_end - start + 1 + + for (code, max_len) in [('H', MIN_HELIX_LENGTH), ('E', MIN_STRAND_LENGTH)] + if current_structure == code && segment_length < max_len + for j in start:segment_end + ss_vector[j] = '-' + end + end + end + end + + return ss_vector +end + + function _pad(x::T, arr::AbstractArray{T, N}, paddings::Vararg{Tuple{Int, Int}, N}) where {T, N} @assert ndims(arr) == length(paddings) new_size = Int[] diff --git a/test/runtests.jl b/test/runtests.jl index c1303a9..b0b12a3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ using AssigningSecondaryStructure using Test -ss_composition(ss::Vector{Int}) = [count(==(i), ss) for i in 1:3] +ss_composition(ss::Vector{Char}) = [count(==(code), ss) for code in ['-', 'H', 'E']] @testset "AssigningSecondaryStructure.jl" begin @@ -26,7 +26,7 @@ ss_composition(ss::Vector{Int}) = [count(==(i), ss) for i in 1:3] @testset "1ASS" begin ss = assign_secondary_structure("data/1ASS.pdb") @test length(ss) == 1 - @test ss_composition.(ss) == [[60, 53, 39]] + @test ss_composition.(ss) == [[63, 53, 36]] end @testset "1ZAK" begin