diff --git a/README.md b/README.md index 26a449b..e43bdd1 100644 --- a/README.md +++ b/README.md @@ -58,9 +58,9 @@ Profiling Protein Structures from Protein Data Bank and integrate various resour * * NOTE: currently only support minimum use * Download data from PDB Archive against unexpected needs - * wwwPDB&RCSB: + * wwPDB&RCSB: * EBI: - * wwwPDB Versioned: + * wwPDB Versioned: ## Install diff --git a/docs/figs/ToUniProt_ali.svg b/docs/figs/ToUniProt_ali.svg new file mode 100644 index 0000000..6873d05 --- /dev/null +++ b/docs/figs/ToUniProt_ali.svg @@ -0,0 +1,72986 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/ToUniProt.ipynb b/examples/ToUniProt.ipynb new file mode 100644 index 0000000..f59615b --- /dev/null +++ b/examples/ToUniProt.ipynb @@ -0,0 +1,247 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "Python 3.7.1 64-bit ('base': conda)", + "display_name": "Python 3.7.1 64-bit ('base': conda)", + "metadata": { + "interpreter": { + "hash": "2266c607543d224cb119288ea55888d6fda87cc9a4c78c02ed099d39082a76ce" + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Wall time: 4.99 ms\nWall time: 2.47 s\nWall time: 0 ns\nWall time: 82.1 ms\n" + } + ], + "source": [ + "%time from pdb_profiling import default_config\n", + "%time from pdb_profiling.processors import Identifier, UniProtFASTA\n", + "%time from pdb_profiling.utils import a_seq_reader\n", + "\n", + "%time default_config('C:/GitWorks/pdb-profiling/test/demo')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "" + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "demo = Identifier('NP_001291289.1')\n", + "demo" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "('Q9C0B2', None)" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "entry, isoform = demo.map2unp().result()\n", + "entry, isoform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "{'protein': 'NP_001291289.1', 'transcript': 'NM_001304360.1', 'gene': None}" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "demo.get_all_level_identifiers().result()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "('>NP_001291289.1 cilia- and flagella-associated protein 74 [Homo sapiens]',\n 'MEDDGSLLPEDELLADALLLEDERDELEDPEFDIKCLLQEAEDDVDPGHSSSVKELDTDADKLKKKTAEDRTQAFHLRQNLSALDKMHEEQELFTEKMRGELRACRQRRDLIDKQQEAVAAEIATEEEAGNMAAVGRLQAVSRRLFAELENERDLQSRTEAVLKESENTMWHIEIQEGRLEAFRTADREEVEATGRRLQVRAAEQLCREQEALGKVERNRLLRIRKSLNTQKELGLRHQKLLEDARKNHKVAVRFLKASLGRIREQEKKEEMECHEYMRRRMDAVVALKGSISANRDTLRKFQAWDRAKAELAEQRVQAEKKAILAQGRDAFRHLVHQRRRQELEAQKRAFEEEQKLRKQEIISRILKEEAEEEKRKKQHPPTSARHRLTLRDKTWNYISDFCKKTTVPTNTYTLDYEAAAGPGPSRLLEVVSSELIQGDPGASSEEETLAEPEISGLWNEDYKPYQVPKEDVDRKPVGGTKMDKDILERTVERLRSRVVHKQVVWGREFQGRPFNSKPELLHFQDFDIGKVYKKKITLVNTTYTINYCKLVGVEEHLRDFIHVDFDPPGPLSAGMSCEVLVTFKPMINKDLEGNISFLAQTGEFSVPLKCSTKKCSLSLDKELIDFGSYVVGETTSRTITLTNVGGLGTTFKFLPASEPCEMDDSQSALKLSSLLTYEDKSLYDKAATSFSEQQLEGTESSQADMQSRKELEKLDKEQEEEQPAEPERLTTVIPPSEEQTEITLGEVTEGEIGPFSSIKVPIVFTPVVPGDVQARFKVTFKNPQCPTLHFRVVGVAIDVPVWVPKPSVDLKICMYDRLYQDSVLVHTRSKAALRLKFEVCKELRAHLELLPKTGYIQAQSSYSVQLKFLPRHSLPEDAGRYFDKETRVLEAPMTIWVADQNKPVGFTVHAIVTTSDLELSPSEVDFGYCTIYEAIRTEISLHNHSLLPQEFGFVRLPKFVDVQPNDGFGTILPLETLQFCVIFQPTKAEEHRFQLTCKSEINRCFKLSCRAVGVHPPLELSHYQIKFAATALYDTSVATVYVINSHLSMSSPTHSKPRIGSEDASPMGPTSFEFLLPPDSPITISPSVGTVWPGKRCLVQVAFRPVLPEKLIRQEALPLLNKEMETKSFRKNMAPQRKDLHGLSFSVLRAQNRDKLFKVSVPHVLEMRKRELRPSSDEYQAARATLLRAFQAKFDTFVVPCVVASGDIKDRKGSEPLSFSPHNTLYLELWCPTVAPSVVVTSHKGKTIFNFGDVAVGHRSIKKISIQNVSPEDLALDFSLLNPNGPFVLLNHSSLLRAGGTQVLVLSFSPHESILAQETLDIITKRGTLTLTLMGTGVASMITCSIEGSVLNMGYVIAGESVSSGFKLQNNSLLPIKFSMHLDSLSSTRGRGQQQLPQFLSSPSQRTEVVGTQNLNGQSVFSVAPVKGVMDPGKTQDFTVTFSPDHESLYFSDKLQVVLFEKKISHQILLKGAACQHMMFVEGGDPLDVPVESLTAIPVFDPRHREASSRPGPLSPEAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKKTVEFSIDSVASLQHKGFSIEPSRGSVERGQTKTISISWVPPADFDPDHPLMVSALLQLRGDVKETYKVIFVAQVLTGP')" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "'''\n", + "Identifier(\n", + " demo.get_all_level_identifiers().result()['protein']\n", + ").fetch_sequence().result()\n", + "'''\n", + "np_header, np_seq = demo.fetch_sequence().result()\n", + "np_header, np_seq" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "True" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "demo.status" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "('>sp|Q9C0B2|CFA74_HUMAN Cilia- and flagella-associated protein 74 OS=Homo sapiens OX=9606 GN=CFAP74 PE=2 SV=3',\n 'MEDDGSLLPEDELLADALLLEDERDELEDPEFDIKCLLQEAEDDVDPGHSSSVKELDTDADKLKKKTAEDRTQAFHLRQNLSALDKMHEEQELFTEKMRGELRACRQRRDLIDKQQEAVAAEIATEEEAGNMAAVGRLQAVSRRLFAELENERDLQSRTEAVLKESENTMWHIEIQEGRLEAFRTADREEVEATGRRLQVRAAEQLCREQEALGKVERNRLLRIRKSLNTQKELGLRHQKLLEDARKNHKVAVRFLKASLGRIREQEKKEEMECHEYMRRRMDAVVALKGSISANRDTLRKFQAWDRAKAELAEQRVQAEKKAILAQGRDAFRHLVHQRRRQELEAQKRAFEEEQKLRKQEIISRILKEEAEEEKRKKQHPPTSARHRLTLRDKTWNYISDFCKKTTVPTNTYTLDYEAAAGPGPSRLLEVVSSELIQGDPGASSEEETLAEPEISGLWNEDYKPYQVPKEDVDRKPVGGTKMDKDILERTVERLRSRVVHKQVVWGREFQGRPFNSKPELLHFQDFDIGKVYKKKITLVNTTYTINYCKLVGVEEHLRDFIHVDFDPPGPLSAGMSCEVLVTFKPMINKDLEGNISFLAQTGEFSVPLKCSTKKCSLSLDKELIDFGSYVVGETTSRTITLTNVGGLGTTFKFLPASEPCEMDDSQSALKLSSLLTYEDKSLYDKAATSFSEQQLEGTESSQADMQSRKELEKLDKEQEEEQPAEPERLTTVIPPSEEQTEITLGEVTEGEIGPFSSIKVPIVFTPVVPGDVQARFKVTFKNPQCPTLHFRVVGVAIDVPVWVPKPSVDLKICMYDRLYQDSVLVHTRSKAALRLKFEVCKELRAHLELLPKTGYIQAQSSYSVQLKFLPRHSLPEDAGRYFDKETRVLEAPMTIWVADQNKPVGFTVHAIVTTSDLELSPSEVDFGYCTIYEAIRTEISLHNHSLLPQEFGFVRLPKFVDVQPNDGFGTILPLETLQFCVIFQPTKAEEHRFQLTCKSEINRCFKLSCRAVGVHPPLELSHYQIKFAATALYDTSVATVYVINSHLSMSSPTHSKPRIGSEDASPMGPTSFEFLLPPDSPITISPSVGTVWPGKRCLVQVAFRPVLPEKLIRQEALPLLNKEMETKSFRKNMAPQRKDLHGLSFSVLRAQNRDKLFKVSVPHVLEMRKRELRPSSDEYQAARATLLRAFQAKFDTFVVPCVVASGDIKDRKGSEPLSFSPHNTLYLELWCPTVAPSVVVTSHKGKTIFNFGDVAVGHRSIKKISIQNVSPEDLALDFSLLNPNGPFVLLNHSSLLRAGGTQVLVLSFSPHESILAQETLDIITKRGTLTLTLMGTGVASMITCSIEGSVLNMGYVIAGESVSSGFKLQNNSLLPIKFSMHLDSLSSTRGRGQQQLPQFLSSPSQRTEVVGTQNLNGQSVFSVAPVKGVMDPGKTQDFTVTFSPDHESLYFSDKLQVVLFEKKISHQILLKGAACQHMMFVEGGDPLDVPVESLTAIPVFDPRHREEAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKKPDHPLMVSALLQLRGDVKETYKVIFVAQVLTGP')" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "unp_header, unp_seq = UniProtFASTA.single_retrieve(entry, UniProtFASTA.folder, UniProtFASTA.web_semaphore).then(a_seq_reader).result()\n", + "unp_header, unp_seq" + ] + }, + { + "source": [ + "```py\n", + "from dtaidistance import alignment\n", + "%time value, matrix = alignment.needleman_wunsch(unp_seq, np_seq)\n", + "algn, s1a, s2a = alignment.best_alignment(matrix, unp_seq, np_seq, gap='-')\n", + "print(''.join(s1a[1500:]))\n", + "print(''.join(s2a[1500:]))\n", + "\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "plt.style.use('ggplot')\n", + "plt.figure(figsize=(10,8))\n", + "sns.heatmap(matrix[1500:,1500:],cmap='icefire')\n", + "plt.show()\n", + "```\n", + "\n", + "" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Wall time: 0 ns\nWall time: 20.9 ms\nWall time: 0 ns\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": "(((0, 1507), (1507, 1551), (1551, 1552), (1552, 1553), (1553, 1584)),\n ((0, 1507), (1517, 1561), (1581, 1582), (1603, 1604), (1608, 1639)))" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "from Bio import Align\n", + "%time aligner = Align.PairwiseAligner()\n", + "%time alignments = aligner.align(unp_seq, np_seq)\n", + "%time alignments[0].aligned" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "FDPRHRE----------EAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKK--------------------P---------------------D----HPLMVSALLQLRGDVKETYKVIFVAQVLTGP\n|||||||----------||||||||||||||||||||||||||||||||||||||||||||--------------------|---------------------|----|||||||||||||||||||||||||||||||\nFDPRHREASSRPGPLSPEAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKKTVEFSIDSVASLQHKGFSIEPSRGSVERGQTKTISISWVPPADFDPDHPLMVSALLQLRGDVKETYKVIFVAQVLTGP\n\n" + } + ], + "source": [ + "for i in str(alignments[0]).split('\\n'):\n", + " print(i[1500:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py index 34d878a..b27a3a6 100644 --- a/pdb_profiling/__init__.py +++ b/pdb_profiling/__init__.py @@ -6,7 +6,7 @@ # @Copyright (c) 2020 MinghuiGroup, Soochow University from re import compile as re_compile -__version__ = '0.1.6' +__version__ = '0.1.7' common_pat = r'^(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]' @@ -34,6 +34,7 @@ def default_config(folder='./'): from pdb_profiling.processors.pdbe.record import Base from pdb_profiling.processors.pdbe.api import ProcessPDBe from pdb_profiling.processors.proteins.record import Identifier + from pdb_profiling.processors import UniProtFASTA # Use Existing Handled PDBe API Results (e.g. tsv format results) ProcessPDBe.use_existing = True # Use Existing API Results (e.g. json format results downloaded from web) @@ -43,6 +44,8 @@ def default_config(folder='./'): # Set WebFetcher's Semaphore Base.set_web_semaphore(30).result() Identifier.set_web_semaphore(30).result() + UniProtFASTA.set_web_semaphore(30).result() # Set Folder that store downloaded and handled files Base.set_folder(folder) Identifier.set_folder(folder) + UniProtFASTA.set_folder(folder) diff --git a/pdb_profiling/processors/eutils/api.py b/pdb_profiling/processors/eutils/api.py index 676e79b..4d39377 100644 --- a/pdb_profiling/processors/eutils/api.py +++ b/pdb_profiling/processors/eutils/api.py @@ -22,7 +22,7 @@ class EutilsAPI(Abclog): ''' headers = {"Content-Type": "text/plain"} api_set = frozenset(('efetch.fcgi', 'einfo.fcgi', 'esearch.fcgi', - 'epost.fcgi', 'esummary.fcgi')) + 'epost.fcgi', 'esummary.fcgi', 'egquery.fcgi')) @classmethod def dumpsParams(cls, params: Dict) -> str: @@ -34,7 +34,7 @@ def task_unit(cls, suffix: str, params: Dict, folder: Path) -> Tuple: url=f'{BASE_URL}{suffix}', headers=cls.headers, params=params) - return 'get', args, folder/f'{cls.dumpsParams(params)}.{params.get("retmode", params.get("rettype", "txt"))}' + return 'get', args, folder/f'{cls.dumpsParams(params)}.{params.get("retmode", params.get("rettype", "xml"))}' @classmethod def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path) -> Generator: diff --git a/pdb_profiling/processors/pdbe/api.py b/pdb_profiling/processors/pdbe/api.py index 3d47164..ad471b7 100644 --- a/pdb_profiling/processors/pdbe/api.py +++ b/pdb_profiling/processors/pdbe/api.py @@ -115,8 +115,8 @@ def __init__(self, name_group): def output(self): if self.pdb_range: - pdb_range = json.dumps(self.pdb_range) - unp_range = json.dumps(self.unp_range) + pdb_range = json.dumps(self.pdb_range).decode('utf-8') + unp_range = json.dumps(self.unp_range).decode('utf-8') return pdb_range, unp_range else: return self.default_pdb_range, self.default_unp_range @@ -261,12 +261,13 @@ def related_UNP_PDB(cls, filePath: Union[str, Path], related_unp: Optional[Itera return set(pdb_list), set(dfrm['SP_PRIMARY']) @classmethod - def reformat(cls, path: str) -> pd.DataFrame: - dfrm = pd.read_csv(path, sep='\t', converters=cls.converters) + def reformat(cls, path: Optional[str]=None, dfrm:Optional[pd.DataFrame]=None) -> pd.DataFrame: + if path is not None: + dfrm = pd.read_csv(path, sep='\t', converters=cls.converters) group_info_col = ['pdb_id', 'chain_id', 'UniProt'] range_info_col = ['pdb_start', 'pdb_end', 'unp_start', 'unp_end'] reader = SeqRangeReader(group_info_col) - dfrm[['sifts_pdb_range', 'sifts_unp_range']] = pd.DataFrame(dfrm.apply( + dfrm[['pdb_range', 'unp_range']] = pd.DataFrame(dfrm.apply( lambda x: reader.check(tuple(x[i] for i in group_info_col), tuple( x[i] for i in range_info_col)), axis=1).values.tolist(), index=dfrm.index) @@ -276,52 +277,72 @@ def reformat(cls, path: str) -> pd.DataFrame: return dfrm @staticmethod - def dealWithInDe(dfrm: pd.DataFrame) -> pd.DataFrame: + def sort_2_range(unp_range: List, pdb_range: List): + unp_range, pdb_range = zip( + *sorted(zip(unp_range, pdb_range), key=lambda x: x[0][0])) + return unp_range, pdb_range + + @classmethod + def dealWithInDel(cls, dfrm: pd.DataFrame, sort_by_unp:bool=True) -> pd.DataFrame: def get_gap_list(li: List): return [li[i+1][0] - li[i][1] - 1 for i in range(len(li)-1)] def get_range_diff(lyst_a: List, lyst_b: List): - array_a = np.array([ran[1] - ran[0] + 1 for ran in lyst_a]) - array_b = np.array([ran[1] - ran[0] + 1 for ran in lyst_b]) - return (array_a - array_b).tolist() + array_a = np.array([right - left + 1 for left, right in lyst_a]) + array_b = np.array([right - left + 1 for left, right in lyst_b]) + return array_a - array_b def add_tage_to_range(df: pd.DataFrame, tage_name: str): # ADD TAGE FOR SIFTS df[tage_name] = 'Safe' # No Insertion But Deletion[Pure Deletion] df.loc[df[(df['group_info'] == 1) & ( - df['sifts_unp_pdb_var'] > 0)].index, tage_name] = 'Deletion' + df['diff+'] > 0)].index, tage_name] = 'Deletion' # Insertion & No Deletion df.loc[df[ - (df['group_info'] != 1) & - (df['var_0_count'] == df['group_info']) & - (df['unp_GAP_0_count'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion' + (df['group_info'] == 1) & + (df['diff-'] > 0)].index, tage_name] = 'Insertion (Specail Case)' + df.loc[df[ + (df['group_info'] > 1) & + (df['diff0'] == df['group_info']) & + (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion' # Insertion & Deletion df.loc[df[ - (df['group_info'] != 1) & - ((df['var_0_count'] != df['group_info']) | - (df['unp_GAP_0_count'] != (df['group_info'] - 1)))].index, tage_name] = 'Insertion & Deletion' - - dfrm['pdb_GAP_list'] = dfrm.apply(lambda x: json.dumps( - get_gap_list(json.loads(x['sifts_pdb_range']))), axis=1) - dfrm['unp_GAP_list'] = dfrm.apply(lambda x: json.dumps( - get_gap_list(json.loads(x['sifts_unp_range']))), axis=1) - dfrm['var_list'] = dfrm.apply(lambda x: json.dumps(get_range_diff( - json.loads(x['sifts_unp_range']), json.loads(x['sifts_pdb_range']))), axis=1) - dfrm['delete'] = dfrm.apply( - lambda x: '-' in x['var_list'], axis=1) - dfrm['delete'] = dfrm.apply( - lambda x: True if '-' in x['unp_GAP_list'] else x['delete'], axis=1) - dfrm['var_0_count'] = dfrm.apply( - lambda x: json.loads(x['var_list']).count(0), axis=1) - dfrm['unp_GAP_0_count'] = dfrm.apply( - lambda x: json.loads(x['unp_GAP_list']).count(0), axis=1) + (df['group_info'] > 1) & + ((df['diff0'] != df['group_info']) | + (df['unp_gaps0'] != (df['group_info'] - 1)))].index, tage_name] = 'Insertion & Deletion' + + dfrm.pdb_range = dfrm.pdb_range.apply(json.loads) + dfrm.unp_range = dfrm.unp_range.apply(json.loads) dfrm['group_info'] = dfrm.apply(lambda x: len( - json.loads(x['sifts_pdb_range'])), axis=1) - dfrm['sifts_unp_pdb_var'] = dfrm.apply( - lambda x: json.loads(x['var_list'])[0], axis=1) - add_tage_to_range(dfrm, tage_name='sifts_range_tage') - return dfrm + x['pdb_range']), axis=1) + + focus_index = dfrm[dfrm.group_info.gt(1)].index + if sort_by_unp and (len(focus_index) > 0): + focus_df = dfrm.loc[focus_index].apply(lambda x: cls.sort_2_range( + x['unp_range'], x['pdb_range']), axis=1, result_type='expand') + focus_df.index = focus_index + focus_df.columns = ['unp_range', 'pdb_range'] + dfrm.loc[focus_index, ['unp_range', 'pdb_range']] = focus_df + + dfrm['pdb_gaps'] = dfrm.pdb_range.apply(get_gap_list) + dfrm['unp_gaps'] = dfrm.unp_range.apply(get_gap_list) + dfrm['range_diff'] = dfrm.apply(lambda x: get_range_diff(x['unp_range'], x['pdb_range']), axis=1) + dfrm['diff0'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x == 0)) + dfrm['diff+'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x > 0)) + dfrm['diff-'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x < 0)) + dfrm['unp_gaps0'] = dfrm.unp_gaps.apply(lambda x: x.count(0)) + add_tage_to_range(dfrm, tage_name='sifts_range_tag') + dfrm['repeated'] = dfrm.apply( + lambda x: x['diff-'] > 0 and x['sifts_range_tag'] != 'Insertion (Specail Case)', axis=1) + dfrm['repeated'] = dfrm.apply( + lambda x: True if any(i < 0 for i in x['unp_gaps']) else x['repeated'], axis=1) + dfrm['reversed'] = dfrm.pdb_gaps.apply(lambda x: any(i < 0 for i in x)) + dfrm.pdb_range = dfrm.pdb_range.apply(lambda x: json.dumps(x).decode('utf-8')) + dfrm.unp_range = dfrm.unp_range.apply(lambda x: json.dumps(x).decode('utf-8')) + temp_cols = ['start', 'end', 'group_info', 'pdb_gaps', 'unp_gaps', 'range_diff', + 'diff0', 'diff+', 'diff-', 'unp_gaps0'] + return dfrm.drop(columns=temp_cols), dfrm[temp_cols] ''' @staticmethod @@ -761,7 +782,7 @@ class PDBArchive(Abclog): ''' Download files from PDB Archive - * wwwPDB/RCSB: PDB_ARCHIVE_URL_WWPDB: str = 'https://ftp.wwpdb.org/pub/pdb/data/structures/' + * wwPDB/RCSB: PDB_ARCHIVE_URL_WWPDB: str = 'https://ftp.wwpdb.org/pub/pdb/data/structures/' * EBI: PDB_ARCHIVE_URL_EBI: str = 'http://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/' ''' root = PDB_ARCHIVE_URL_EBI @@ -801,7 +822,7 @@ class PDBVersioned(PDBArchive): ''' Download files from PDB Versioned - * wwwPDB Versioned: PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/entries/' + * wwPDB Versioned: PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/entries/' >>> PDBVersioned.single_retrieve( ('2wmg', '_v1-2'), 'entries/', diff --git a/pdb_profiling/processors/proteins/record.py b/pdb_profiling/processors/proteins/record.py index 285e06d..5c0c324 100644 --- a/pdb_profiling/processors/proteins/record.py +++ b/pdb_profiling/processors/proteins/record.py @@ -73,8 +73,7 @@ def __init__(self, identifier: str, folder: Optional[Union[Path, str]] = None): except AttributeError: raise AttributeError( "Please specify class variable `folder` via set_folder() first or pass `folder` in this method!") - self.ensembl_status = None - self.refseq_status = None + self.status = None def __repr__(self): return f'<{self.source} {self.level} {self.identifier} {self.version}>' @@ -89,9 +88,9 @@ async def set_status(self): self.ensembl_api_web_semaphore, headers={'Content-Type': 'application/json'}) if res is None: - self.ensembl_status = False + self.status = False else: - self.ensembl_status = await a_load_json(res) + self.status = await a_load_json(res) @unsync async def fetch_from_ProteinsAPI(self): @@ -116,7 +115,17 @@ async def fetch_from_ProteinsAPI(self): f"Can't find dbReference with {self.identifier}") @unsync - async def map2unp(self): + async def get_all_level_identifiers(self): + try: + return dict(zip(('protein', 'transcript', 'gene'), await self.sqlite_api.database.fetch_one( + query=f""" + SELECT protein,transcript,gene FROM dbReferences + WHERE type == '{self.source}' AND {self.level} == '{self.raw_identifier}'"""))) + except TypeError: + return + + @unsync + async def map2unp_from_localDB(self): try: entry, isoform = await self.sqlite_api.database.fetch_one( query=f""" @@ -149,19 +158,21 @@ async def fetch_sequence(self, newest: bool = True): self.seq_folder['RefSeq'], self.eutils_api_web_semaphore) if res is not None: + self.status = True return await a_seq_reader(res) else: + self.status = False self.logger.warning(f'Invalid Identifier!') elif self.source == 'Ensembl': - if self.ensembl_status is None: + if self.status is None: await self.set_status() - if self.ensembl_status is False: + if self.status is False: self.logger.warning(f'Invalid Identifier!') return - elif self.ensembl_status['is_current'] != '1': + elif self.status['is_current'] != '1': self.logger.warning( - f'Not exists in current archive: \n{self.ensembl_status}') + f'Not exists in current archive: \n{self.status}') return if not newest: self.logger.warning( @@ -171,3 +182,11 @@ async def fetch_sequence(self, newest: bool = True): dict(type='protein'), self.seq_folder['Ensembl'], self.ensembl_api_web_semaphore).then(a_seq_reader) + + @unsync + async def map2unp(self): + res = await self.map2unp_from_localDB() + if res is None: + await self.fetch_from_ProteinsAPI() + res = await self.map2unp_from_localDB() + return res diff --git a/pdb_profiling/processors/uniprot/api.py b/pdb_profiling/processors/uniprot/api.py index aa0f606..13a0baa 100644 --- a/pdb_profiling/processors/uniprot/api.py +++ b/pdb_profiling/processors/uniprot/api.py @@ -18,6 +18,7 @@ from pdb_profiling.log import Abclog from pdb_profiling.fetcher.webfetch import UnsyncFetch from pdb_profiling.processors.uniprot.process import ExtractIsoAlt +from pdb_profiling.utils import init_semaphore, init_folder_from_suffix QUERY_COLUMNS: List[str] = [ @@ -385,6 +386,15 @@ class UniProtFASTA(Abclog): params = {'include': 'yes'} obj = {} + @classmethod + @unsync + async def set_web_semaphore(cls, web_semaphore_value:int): + cls.web_semaphore = await init_semaphore(web_semaphore_value) + + @classmethod + def set_folder(cls, folder: Union[Path, str]): + cls.folder = init_folder_from_suffix(folder, 'UniProt/fasta/') + @classmethod @unsync async def process(cls, path: Union[str, Path, Unfuture]):