diff --git a/README.md b/README.md
index 624d036..d1445fa 100644
--- a/README.md
+++ b/README.md
@@ -8,10 +8,9 @@
+# PyEnsembl
-PyEnsembl
-=======
-PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files.
+PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files.
# Example Usage
@@ -25,7 +24,7 @@ data = EnsemblRelease(77)
gene_names = data.gene_names_at_locus(contig=6, position=29945884)
# get all exons associated with HLA-A
-exon_ids = data.exon_ids_of_gene_name('HLA-A')
+exon_ids = data.exon_ids_of_gene_name("HLA-A")
```
# Installation
@@ -52,6 +51,7 @@ Alternatively, you can create the `EnsemblRelease` object from inside a Python
process and call `ensembl_object.download()` followed by `ensembl_object.index()`.
## Cache Location
+
By default, PyEnsembl uses the platform-specific `Cache` folder
and caches the files into the `pyensembl` sub-directory.
You can override this default by setting the environment key `PYENSEMBL_CACHE_DIR`
@@ -66,11 +66,11 @@ or
```python
import os
-os.environ['PYENSEMBL_CACHE_DIR'] = '/custom/cache/dir'
+os.environ["PYENSEMBL_CACHE_DIR"] = "/custom/cache/dir"
# ... PyEnsembl API usage
```
-# Usage tips
+# Usage tips
## List installed genomes
@@ -80,6 +80,7 @@ pyensembl list
```python
from pyensembl.shell import collect_all_installed_ensembl_releases
+
collect_all_installed_ensembl_releases()
```
@@ -87,10 +88,11 @@ collect_all_installed_ensembl_releases()
```python
from pyensembl import EnsemblRelease
+
data = EnsemblRelease(
release=100,
- species=find_species_by_name('drosophila_melanogaster'),
- )
+ species=find_species_by_name("drosophila_melanogaster"),
+)
```
## Data structure
@@ -98,13 +100,13 @@ data = EnsemblRelease(
### Gene object
```python
-gene=data.gene_by_id(gene_id='FBgn0011747')
+gene = data.gene_by_id(gene_id="FBgn0011747")
```
### Transcript object
```python
-transcript=gene.transcripts[0]
+transcript = gene.transcripts[0]
```
### Protein information
@@ -125,11 +127,12 @@ For example:
```python
from pyensembl import Genome
+
data = Genome(
- reference_name='GRCh38',
- annotation_name='my_genome_features',
+ reference_name="GRCh38",
+ annotation_name="my_genome_features",
# annotation_version=None,
- gtf_path_or_url='/My/local/gtf/path_to_my_genome_features.gtf', # Path or URL of GTF file
+ gtf_path_or_url="/My/local/gtf/path_to_my_genome_features.gtf", # Path or URL of GTF file
# transcript_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing transcript sequences
# protein_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing protein sequences
# cache_directory_path=None, # Where to place downloaded and cached files for this genome
@@ -142,8 +145,8 @@ gene_names = data.gene_names_at_locus(contig=6, position=29945884)
# API
The `EnsemblRelease` object has methods to let you access all possible
-combinations of the annotation features *gene\_name*, *gene\_id*,
-*transcript\_name*, *transcript\_id*, *exon\_id* as well as the location of
+combinations of the annotation features _gene_name_, _gene_id_,
+_transcript_name_, _transcript_id_, _exon_id_ as well as the location of
these genomic elements (contig, start position, end position, strand).
## Genes
diff --git a/docs/conf.py b/docs/conf.py
index bbc0aaf..aefddaa 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,47 +18,47 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, os.path.abspath('..'))
+sys.path.insert(0, os.path.abspath(".."))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
+ "sphinx.ext.autodoc",
]
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
+source_suffix = ".rst"
# The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = u'pyensembl'
-copyright = u'2016, Hammer Lab'
-author = u'Hammer Lab'
+project = "pyensembl"
+copyright = "2016, Hammer Lab"
+author = "Hammer Lab"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
-version = u'0.8.10'
+version = "0.8.10"
# The full version, including alpha/beta/rc tags.
-release = u'0.8.10'
+release = "0.8.10"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -69,37 +69,37 @@
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all
# documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
@@ -109,156 +109,155 @@
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'alabaster'
+html_theme = "alabaster"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (relative to this directory) to use as a favicon of
# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
-#html_search_language = 'en'
+# html_search_language = 'en'
# A dictionary with options for the search language support, empty by default.
# Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
+# html_search_options = {'type': 'default'}
# The name of a javascript file (relative to the configuration directory) that
# implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
+# html_search_scorer = 'scorer.js'
# Output file base name for HTML help builder.
-htmlhelp_basename = 'pyensembldoc'
+htmlhelp_basename = "pyensembldoc"
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
-
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
+ # Latex figure (float) alignment
+ #'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- (master_doc, 'pyensembl.tex', u'pyensembl Documentation',
- u'Hammer Lab', 'manual'),
+ (
+ master_doc,
+ "pyensembl.tex",
+ "pyensembl Documentation",
+ "Hammer Lab",
+ "manual",
+ ),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
# If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [
- (master_doc, 'pyensembl', u'pyensembl Documentation',
- [author], 1)
-]
+man_pages = [(master_doc, "pyensembl", "pyensembl Documentation", [author], 1)]
# If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
@@ -267,19 +266,25 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- (master_doc, 'pyensembl', u'pyensembl Documentation',
- author, 'pyensembl', 'One line description of project.',
- 'Miscellaneous'),
+ (
+ master_doc,
+ "pyensembl",
+ "pyensembl Documentation",
+ author,
+ "pyensembl",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
]
# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
# If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py
index eeb28fb..991af8c 100644
--- a/pyensembl/__init__.py
+++ b/pyensembl/__init__.py
@@ -10,27 +10,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from .config import MAX_ENSEMBL_RELEASE, MAX_ENSEMBLGENOME_RELEASE
from .database import Database
from .download_cache import DownloadCache
from .ensembl_release import EnsemblRelease, cached_release
-from .ensembl_release_versions import MAX_ENSEMBL_RELEASE
from .exon import Exon
-from .genome import Genome
from .gene import Gene
+from .genome import Genome
from .locus import Locus
-from .reference_name import (
- ensembl_grch36,
- ensembl_grch37,
- ensembl_grch38,
- normalize_reference_name,
+from .reference_name import ( # ensembl_grch36,; ensembl_grch37,; ensembl_grch38,
find_species_by_reference,
- which_reference,
genome_for_reference_name,
+ normalize_reference_name,
+ which_reference,
)
-
from .search import find_nearest_locus
from .sequence_data import SequenceData
-from .species import find_species_by_name, check_species_object, normalize_species_name
+from .species import (
+ check_species_object,
+ find_species_by_name,
+ normalize_species_name,
+)
from .transcript import Transcript
from .version import __version__
@@ -41,6 +41,7 @@
"EnsemblRelease",
"cached_release",
"MAX_ENSEMBL_RELEASE",
+ "MAX_ENSEMBLGENOME_RELEASE",
"Gene",
"Transcript",
"Exon",
@@ -56,7 +57,7 @@
"Genome",
"Locus",
"Exon",
- "ensembl_grch36",
- "ensembl_grch37",
- "ensembl_grch38",
+ # "ensembl_grch36",
+ # "ensembl_grch37",
+ # "ensembl_grch38",
]
diff --git a/pyensembl/common.py b/pyensembl/common.py
index ccc5eb1..a9a3964 100644
--- a/pyensembl/common.py
+++ b/pyensembl/common.py
@@ -11,7 +11,6 @@
# limitations under the License.
import pickle
-
from functools import wraps
@@ -28,10 +27,11 @@ def load_pickle(filepath):
def _memoize_cache_key(args, kwargs):
- """Turn args tuple and kwargs dictionary into a hashable key.
+ """
+ Turn args tuple and kwargs dictionary into a hashable key.
- Expects that all arguments to a memoized function are either hashable
- or can be uniquely identified from type(arg) and repr(arg).
+ Expects that all arguments to a memoized function are either
+ hashable or can be uniquely identified from type(arg) and repr(arg).
"""
cache_key_list = []
@@ -51,9 +51,9 @@ def _memoize_cache_key(args, kwargs):
def memoize(fn):
- """Simple reset-able memoization decorator for functions and methods,
- assumes that all arguments to the function can be hashed and
- compared.
+ """
+ Simple reset-able memoization decorator for functions and methods, assumes
+ that all arguments to the function can be hashed and compared.
"""
cache = {}
diff --git a/pyensembl/config.py b/pyensembl/config.py
new file mode 100644
index 0000000..faaa3a5
--- /dev/null
+++ b/pyensembl/config.py
@@ -0,0 +1,181 @@
+# TODO: save the config in a YAML file, or a TOML file?
+
+MIN_ENSEMBL_RELEASE = 54
+MAX_ENSEMBL_RELEASE = 110
+MIN_ENSEMBLGENOME_RELEASE = 50
+MAX_ENSEMBLGENOME_RELEASE = 57
+
+
+SPECIES_DATA = [
+ {
+ "latin_name": "homo_sapiens",
+ "synonyms": ["human"],
+ "reference_assemblies": {
+ "NCBI36": (54, 54),
+ "GRCh37": (55, 75),
+ "GRCh38": (76, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "mus_musculus",
+ "synonyms": ["mouse", "house mouse"],
+ "reference_assemblies": {
+ "NCBIM37": (54, 67),
+ "GRCm38": (68, 102),
+ "GRCm39": (103, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "canis_familiaris",
+ "synonyms": ["dog"],
+ "reference_assemblies": {"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "felis_catus",
+ "synonyms": ["cat"],
+ "reference_assemblies": {
+ "Felis_catus_6.2": (75, 90),
+ "Felis_catus_8.0": (91, 92),
+ "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "gallus_gallus",
+ "synonyms": ["chicken"],
+ "reference_assemblies": {
+ "Galgal4": (75, 85),
+ "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "rattus_norvegicus",
+ "synonyms": ["rat", "brown_rat", "lab_rat"],
+ "reference_assemblies": {
+ "Rnor_5.0": (75, 79),
+ "Rnor_6.0": (80, 104),
+ "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "macaca_fascicularis",
+ "synonyms": ["macaque", "Crab-eating_macaque"],
+ "reference_assemblies": {
+ "Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE)
+ },
+ },
+ {
+ "latin_name": "chlorocebus_sabaeus",
+ "synonyms": ["green_monkey", "african_green_monkey"],
+ "reference_assemblies": {"ChlSab1.1": (86, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "macaca_mulatta",
+ "synonyms": ["rhesus"],
+ "reference_assemblies": {"Mmul_10": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "oryctolagus_cuniculus",
+ "synonyms": ["rabbit"],
+ "reference_assemblies": {"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "meriones_unguiculatus",
+ "synonyms": ["gerbil"],
+ "reference_assemblies": {"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "mesocricetus_auratus",
+ "synonyms": ["syrian_hamster"],
+ "reference_assemblies": {"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "cricetulus_griseus_chok1gshd",
+ "synonyms": ["chinese_hamster"],
+ "reference_assemblies": {"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "heterocephalus_glaber_female",
+ "synonyms": ["naked_mole_rat"],
+ "reference_assemblies": {
+ "HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE)
+ },
+ },
+ {
+ "latin_name": "cavia_porcellus",
+ "synonyms": ["guinea_pig"],
+ "reference_assemblies": {"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "sus_scrofa",
+ "synonyms": ["pig"],
+ "reference_assemblies": {"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "danio_rerio",
+ "synonyms": ["zebrafish"],
+ "reference_assemblies": {
+ "Zv8": (54, 59),
+ "Zv9": (60, 79),
+ "GRCz10": (80, 91),
+ "GRCz11": (92, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "drosophila_melanogaster",
+ "synonyms": ["drosophila", "fruit fly", "fly"],
+ "reference_assemblies": {
+ "BDGP5": (75, 78),
+ "BDGP6": (79, 95),
+ "BDGP6.22": (96, 98),
+ "BDGP6.28": (99, 102),
+ "BDGP6.32": (103, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "caenorhabditis_elegans",
+ "synonyms": ["nematode", "C_elegans"],
+ "reference_assemblies": {
+ "WS200": (55, 57),
+ "WS210": (58, 60),
+ "WS220": (61, 66),
+ "WBcel235": (67, MAX_ENSEMBL_RELEASE),
+ },
+ },
+ {
+ "latin_name": "saccharomyces_cerevisiae",
+ "synonyms": ["yeast", "budding_yeast"],
+ "reference_assemblies": {"R64-1-1": (75, MAX_ENSEMBL_RELEASE)},
+ },
+ {
+ "latin_name": "arabidopsis_thaliana",
+ "synonyms": ["cress", "thale_cress", "hehe"],
+ "reference_assemblies": {
+ "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE),
+ },
+ "database": "plants",
+ },
+ {
+ "latin_name": "oryza_sativa",
+ "synonyms": ["rice"],
+ "reference_assemblies": {
+ "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE),
+ },
+ "database": "plants",
+ },
+ {
+ "latin_name": "zea_mays",
+ "synonyms": ["maize"],
+ "reference_assemblies": {
+ "Zm-B73-REFERENCE-NAM-5.0": (55, MAX_ENSEMBLGENOME_RELEASE),
+ },
+ "database": "plants",
+ },
+ {
+ "latin_name": "glycine_max",
+ "synonyms": ["soybean"],
+ "reference_assemblies": {
+ "Glycine_max_v2.1": (55, MAX_ENSEMBLGENOME_RELEASE),
+ },
+ "database": "plants",
+ },
+]
diff --git a/pyensembl/database.py b/pyensembl/database.py
index 4286908..b5fcd99 100644
--- a/pyensembl/database.py
+++ b/pyensembl/database.py
@@ -11,16 +11,16 @@
# limitations under the License.
import logging
-from os.path import split, join, exists, splitext
import sqlite3
+from os.path import exists, join, split, splitext
import datacache
+from gtfparse import create_missing_features, read_gtf
from typechecks import require_integer, require_string
-from gtfparse import read_gtf, create_missing_features
from .common import memoize
-from .normalization import normalize_chromosome, normalize_strand
from .locus import Locus
+from .normalization import normalize_chromosome, normalize_strand
# any time we update the database schema, increment this version number
DATABASE_SCHEMA_VERSION = 3
@@ -31,9 +31,9 @@
class Database(object):
"""
- Wrapper around sqlite3 database so that the rest of the
- library doesn't have to worry about constructing the .db file or
- writing SQL queries directly.
+ Wrapper around sqlite3 database so that the rest of the library doesn't
+ have to worry about constructing the .db file or writing SQL queries
+ directly.
"""
def __init__(
@@ -104,8 +104,8 @@ def local_db_path(self):
def _all_possible_indices(self, column_names):
"""
- Create list of tuples containing all possible index groups
- we might want to create over tables in this database.
+ Create list of tuples containing all possible index groups we might
+ want to create over tables in this database.
If a set of genome annotations is missing some column we want
to index on, we have to drop any indices which use that column.
@@ -136,7 +136,8 @@ def _all_possible_indices(self, column_names):
# other GTFs)
if column_name not in column_set:
logger.info(
- "Skipping database index for {%s}", ", ".join(column_group)
+ "Skipping database index for {%s}",
+ ", ".join(column_group),
)
skip = True
if skip:
@@ -149,7 +150,8 @@ def _all_possible_indices(self, column_names):
PRIMARY_KEY_COLUMNS = {"gene": "gene_id", "transcript": "transcript_id"}
def _get_primary_key(self, feature_name, feature_df):
- """Name of primary key for a feature table (e.g. "gene" -> "gene_id")
+ """
+ Name of primary key for a feature table (e.g. "gene" -> "gene_id")
Since we're potentially going to run this code over unseen data,
make sure that the primary is unique and never null.
@@ -163,18 +165,21 @@ def _get_primary_key(self, feature_name, feature_df):
if primary_key_values.isnull().any():
raise ValueError(
"Column '%s' can't be primary key of table '%s'"
- " because it contains nulls values" % (primary_key, feature_name)
+ " because it contains nulls values"
+ % (primary_key, feature_name)
)
elif len(primary_key_values.unique()) < len(primary_key_values):
raise ValueError(
"Column '%s' can't be primary key of table '%s'"
- " because it contains repeated values" % (primary_key, feature_name)
+ " because it contains repeated values"
+ % (primary_key, feature_name)
)
else:
return primary_key
def _feature_indices(self, all_index_groups, primary_key, feature_df):
- """Choose subset of index group tuples from `all_index_groups` which are
+ """
+ Choose subset of index group tuples from `all_index_groups` which are
applicable to a particular feature (not same as its primary key, have
non-null values).
"""
@@ -194,9 +199,8 @@ def _feature_indices(self, all_index_groups, primary_key, feature_df):
def create(self, overwrite=False):
"""
- Create the local database (including indexing) if it's not
- already set up. If `overwrite` is True, always re-create
- the database from scratch.
+ Create the local database (including indexing) if it's not already set
+ up. If `overwrite` is True, always re-create the database from scratch.
Returns a connection to the database.
"""
@@ -204,8 +208,19 @@ def create(self, overwrite=False):
datacache.ensure_dir(self.cache_directory_path)
df = self._load_gtf_as_dataframe(
- usecols=self.restrict_gtf_columns, features=self.restrict_gtf_features
+ usecols=self.restrict_gtf_columns,
+ features=self.restrict_gtf_features,
)
+        # Some species, such as maize, do not have a gene_name or
+        # transcript_name but do have gene_id and transcript_id; use the ids as aliases for the names
+ if "gene_id" in df.columns and "gene_name" not in df.columns:
+ df["gene_name"] = df["gene_id"]
+ if (
+ "transcript_id" in df.columns
+ and "transcript_name" not in df.columns
+ ):
+ df["transcript_name"] = df["transcript_id"]
+
all_index_groups = self._all_possible_indices(df.columns)
if self.restrict_gtf_features:
@@ -261,7 +276,7 @@ def _get_connection(self):
@property
def connection(self):
"""
- Get a connection to the database or raise an exception
+ Get a connection to the database or raise an exception.
"""
connection = self._get_connection()
if connection:
@@ -275,6 +290,7 @@ def connection(self):
def connect_or_create(self, overwrite=False):
"""
Return a connection to the database if it exists, otherwise create it.
+
Overwrite the existing database if `overwrite` is True.
"""
connection = self._get_connection()
@@ -306,8 +322,8 @@ def column_values_at_locus(
sorted=False,
):
"""
- Get the non-null values of a column from the database
- at a particular range of loci
+ Get the non-null values of a column from the database at a particular
+ range of loci.
"""
# TODO: combine with the query method, since they overlap
@@ -408,8 +424,8 @@ def distinct_column_values_at_locus(
def run_sql_query(self, sql, required=False, query_params=[]):
"""
- Given an arbitrary SQL query, run it against the database
- and return the results.
+ Given an arbitrary SQL query, run it against the database and return
+ the results.
Parameters
----------
@@ -454,8 +470,8 @@ def query(
required=False,
):
"""
- Construct a SQL query and run against the sqlite3 database,
- filtered both by the feature type and a user-provided column/value.
+ Construct a SQL query and run against the sqlite3 database, filtered
+ both by the feature type and a user-provided column/value.
"""
sql = """
SELECT %s%s
@@ -468,7 +484,9 @@ def query(
filter_column,
)
query_params = [filter_value]
- return self.run_sql_query(sql, required=required, query_params=query_params)
+ return self.run_sql_query(
+ sql, required=required, query_params=query_params
+ )
def query_one(
self,
@@ -490,7 +508,9 @@ def query_one(
if len(results) == 0:
if required:
- raise ValueError("%s not found: %s" % (filter_column, filter_value))
+ raise ValueError(
+ "%s not found: %s" % (filter_column, filter_value)
+ )
else:
return None
elif len(results) > 1:
@@ -505,8 +525,8 @@ def query_feature_values(
self, column, feature, distinct=True, contig=None, strand=None
):
"""
- Run a SQL query against the sqlite3 database, filtered
- only on the feature type.
+ Run a SQL query against the sqlite3 database, filtered only on the
+ feature type.
"""
query = """
SELECT %s%s
@@ -541,7 +561,6 @@ def query_loci(self, filter_column, filter_value, feature):
"""
Query for loci satisfying a given filter and feature type.
-
Parameters
----------
filter_column : str
@@ -571,8 +590,8 @@ def query_loci(self, filter_column, filter_value, feature):
def query_locus(self, filter_column, filter_value, feature):
"""
- Query for unique locus, raises error if missing or more than
- one locus in the database.
+ Query for unique locus, raises error if missing or more than one locus
+ in the database.
Parameters
----------
@@ -588,7 +607,9 @@ def query_locus(self, filter_column, filter_value, feature):
Returns single Locus object.
"""
loci = self.query_loci(
- filter_column=filter_column, filter_value=filter_value, feature=feature
+ filter_column=filter_column,
+ filter_value=filter_value,
+ feature=feature,
)
if len(loci) == 0:
@@ -605,7 +626,7 @@ def query_locus(self, filter_column, filter_value, feature):
def _load_gtf_as_dataframe(self, usecols=None, features=None):
"""
- Parse this genome source's GTF file and load it as a Pandas DataFrame
+ Parse this genome source's GTF file and load it as a Pandas DataFrame.
"""
logger.info("Reading GTF from %s", self.gtf_path)
df = read_gtf(
@@ -621,7 +642,9 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None):
column_names = set(df.keys())
expect_gene_feature = features is None or "gene" in features
- expect_transcript_feature = features is None or "transcript" in features
+ expect_transcript_feature = (
+ features is None or "transcript" in features
+ )
observed_features = set(df["feature"])
# older Ensembl releases don't have "gene" or "transcript"
@@ -635,7 +658,9 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None):
dataframe=df,
unique_keys={"gene": "gene_id"},
extra_columns={
- "gene": {"gene_name", "gene_biotype"}.intersection(column_names),
+ "gene": {"gene_name", "gene_biotype"}.intersection(
+ column_names
+ ),
},
missing_value="",
)
diff --git a/pyensembl/download_cache.py b/pyensembl/download_cache.py
index c33d6fe..48ebd00 100644
--- a/pyensembl/download_cache.py
+++ b/pyensembl/download_cache.py
@@ -11,14 +11,13 @@
# limitations under the License.
+import logging
from os import listdir, remove
-from os.path import join, exists, split, abspath, isdir
+from os.path import abspath, exists, isdir, join, split
from shutil import copy2, rmtree
-import logging
import datacache
-
logger = logging.getLogger(__name__)
CACHE_BASE_SUBDIR = "pyensembl"
@@ -29,9 +28,11 @@ def cache_subdirectory(
reference_name=None, annotation_name=None, annotation_version=None
):
"""
- Which cache subdirectory to use for a given annotation database
- over a particular reference. All arguments can be omitted to just get
- the base subdirectory for all pyensembl cached datasets.
+ Which cache subdirectory to use for a given annotation database over a
+ particular reference.
+
+ All arguments can be omitted to just get the base subdirectory for
+ all pyensembl cached datasets.
"""
if reference_name is None:
reference_name = ""
@@ -135,7 +136,7 @@ def cache_directory_path(self):
def _fields(self):
"""
- Fields used for hashing, string representation, equality comparison
+ Fields used for hashing, string representation, equality comparison.
"""
return (
(
@@ -150,7 +151,10 @@ def _fields(self):
)
def __eq__(self, other):
- return other.__class__ is DownloadCache and self._fields() == other._fields()
+ return (
+ other.__class__ is DownloadCache
+ and self._fields() == other._fields()
+ )
def __hash__(self):
return hash(self._fields())
@@ -202,7 +206,9 @@ def cached_path(self, path_or_url):
# for stripping decompression extensions for both local
# and remote files
local_filename = datacache.build_local_filename(
- download_url=path_or_url, filename=remote_filename, decompress=False
+ download_url=path_or_url,
+ filename=remote_filename,
+ decompress=False,
)
else:
local_filename = remote_filename
@@ -210,10 +216,14 @@ def cached_path(self, path_or_url):
# if we expect the download function to decompress this file then
# we should use its name without the compression extension
if self.decompress_on_download:
- local_filename = self._remove_compression_suffix_if_present(local_filename)
+ local_filename = self._remove_compression_suffix_if_present(
+ local_filename
+ )
if len(local_filename) == 0:
- raise ValueError("Can't determine local filename for %s" % (path_or_url,))
+ raise ValueError(
+ "Can't determine local filename for %s" % (path_or_url,)
+ )
return join(self.cache_directory_path, local_filename)
@@ -254,8 +264,8 @@ def download_or_copy_if_necessary(
self, path_or_url, download_if_missing=False, overwrite=False
):
"""
- Download a remote file or copy
- Get the local path to a possibly remote file.
+ Download a remote file or copy a local one, returning the local path
+ to a possibly remote file.
Download if file is missing from the cache directory and
`download_if_missing` is True. Download even if local file exists if
@@ -295,7 +305,11 @@ def _raise_missing_file_error(self, missing_urls_dict):
raise ValueError(error_message)
def local_path_or_install_error(
- self, field_name, path_or_url, download_if_missing=False, overwrite=False
+ self,
+ field_name,
+ path_or_url,
+ download_if_missing=False,
+ overwrite=False,
):
try:
return self.download_or_copy_if_necessary(
@@ -308,13 +322,13 @@ def local_path_or_install_error(
def delete_cached_files(self, prefixes=[], suffixes=[]):
"""
- Deletes any cached files matching the prefixes or suffixes given
+ Deletes any cached files matching the prefixes or suffixes given.
"""
if isdir(self.cache_directory_path):
for filename in listdir():
- delete = any([filename.endswith(ext) for ext in suffixes]) or any(
- [filename.startswith(pre) for pre in prefixes]
- )
+ delete = any(
+ [filename.endswith(ext) for ext in suffixes]
+ ) or any([filename.startswith(pre) for pre in prefixes])
if delete:
path = join(self.cache_directory_path, filename)
logger.info("Deleting %s", path)
diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py
index 8ad47ab..497b503 100644
--- a/pyensembl/ensembl_release.py
+++ b/pyensembl/ensembl_release.py
@@ -11,34 +11,23 @@
# limitations under the License.
"""
-Contains the EnsemblRelease class, which extends the Genome class
-to be specific to (a particular release of) Ensembl.
+Contains the EnsemblRelease class, which extends the Genome class to be
+specific to (a particular release of) Ensembl.
"""
from weakref import WeakValueDictionary
+from .ensembl_release_versions import check_release_number
+from .ensembl_url_templates import make_fasta_url, make_gtf_url
from .genome import Genome
-from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
from .species import check_species_object, human
-from .ensembl_url_templates import ENSEMBL_FTP_SERVER, make_gtf_url, make_fasta_url
-
class EnsemblRelease(Genome):
"""
- Bundles together the genomic annotation and sequence data associated with
- a particular release of the Ensembl database.
+ Bundles together the genomic annotation and sequence data associated with a
+ particular release of the Ensembl database.
"""
- @classmethod
- def normalize_init_values(cls, release, species, server):
- """
- Normalizes the arguments which uniquely specify an EnsemblRelease
- genome.
- """
- release = check_release_number(release)
- species = check_species_object(species)
- return (release, species, server)
-
# Using a WeakValueDictionary instead of an ordinary dict to prevent a
# memory leak in cases where we test many different releases in sequence.
# When all the references to a particular EnsemblRelease die then that
@@ -47,13 +36,21 @@ def normalize_init_values(cls, release, species, server):
@classmethod
def cached(
- cls, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
+ cls,
+ release=None,
+ species=human,
+ database=None,
+ server=None,
+ # server=ENSEMBL_FTP_SERVER,
):
"""
Construct EnsemblRelease if it's never been made before, otherwise
return an old instance.
"""
- init_args_tuple = cls.normalize_init_values(release, species, server)
+ species = check_species_object(species)
+ release = check_release_number(release, species.database)
+ init_args_tuple = (release, species, database, server)
+
if init_args_tuple in cls._genome_cache:
genome = cls._genome_cache[init_args_tuple]
else:
@@ -61,14 +58,23 @@ def cached(
return genome
def __init__(
- self, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
+ self,
+ release=None,
+ species=human,
+ database=None,
+ server=None,
+ # server=ENSEMBL_FTP_SERVER,
):
- self.release, self.species, self.server = self.normalize_init_values(
- release=release, species=species, server=server
- )
+ self.species = check_species_object(species)
+ self.release = check_release_number(release, self.species.database)
+ self.database = database
+ self.server = server
self.gtf_url = make_gtf_url(
- ensembl_release=self.release, species=self.species, server=self.server
+ ensembl_release=self.release,
+ species=self.species.latin_name,
+ server=self.server,
+ database=self.species.database,
)
self.transcript_fasta_urls = [
@@ -77,12 +83,14 @@ def __init__(
species=self.species.latin_name,
sequence_type="cdna",
server=server,
+ database=self.species.database,
),
make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="ncrna",
server=server,
+ database=self.species.database,
),
]
@@ -92,6 +100,7 @@ def __init__(
species=self.species.latin_name,
sequence_type="pep",
server=self.server,
+ database=self.species.database,
)
]
@@ -130,7 +139,11 @@ def __hash__(self):
return hash((self.release, self.species))
def to_dict(self):
- return {"release": self.release, "species": self.species, "server": self.server}
+ return {
+ "release": self.release,
+ "species": self.species,
+ "server": self.server,
+ }
@classmethod
def from_dict(cls, state_dict):
@@ -144,7 +157,9 @@ def cached_release(release, species="human"):
"""
Create an EnsemblRelease instance only if it's hasn't already been made,
otherwise returns the old instance.
- Keeping this function for backwards compatibility but this functionality
- has been moving into the cached method of EnsemblRelease.
+
+ Keeping this function for backwards compatibility but this
+ functionality has been moving into the cached method of
+ EnsemblRelease.
"""
return EnsemblRelease.cached(release=release, species=species)
diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py
index 79649bd..246a380 100644
--- a/pyensembl/ensembl_release_versions.py
+++ b/pyensembl/ensembl_release_versions.py
@@ -10,23 +10,35 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-MIN_ENSEMBL_RELEASE = 54
-MAX_ENSEMBL_RELEASE = 110
+from .config import (
+ MAX_ENSEMBL_RELEASE,
+ MAX_ENSEMBLGENOME_RELEASE,
+ MIN_ENSEMBL_RELEASE,
+ MIN_ENSEMBLGENOME_RELEASE,
+)
-def check_release_number(release):
+def check_release_number(release, database=None):
"""
- Check to make sure a release is in the valid range of
- Ensembl releases.
+ Check to make sure a release is in the valid range of Ensembl releases.
"""
+ if release is None:
+ return (
+ MAX_ENSEMBL_RELEASE
+ if database is None
+ else MAX_ENSEMBLGENOME_RELEASE
+ )
try:
release = int(release)
- except:
+ except ValueError:
raise ValueError("Invalid Ensembl release: %s" % release)
-
- if release < MIN_ENSEMBL_RELEASE:
+ if database is None:
+ min_release = MIN_ENSEMBL_RELEASE
+ else:
+ min_release = MIN_ENSEMBLGENOME_RELEASE
+ if release < min_release:
raise ValueError(
"Invalid Ensembl releases %d, must be greater than %d"
- % (release, MIN_ENSEMBL_RELEASE)
+ % (release, min_release)
)
return release
diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py
index ded3570..e00968b 100644
--- a/pyensembl/ensembl_url_templates.py
+++ b/pyensembl/ensembl_url_templates.py
@@ -11,19 +11,23 @@
# limitations under the License.
"""
-Templates for URLs and paths to specific relase, species, and file type
-on the Ensembl ftp server.
+Templates for URLs and paths to specific release, species, and file type on the
+Ensembl FTP server.
For example, the human chromosomal DNA sequences for release 78 are in:
https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/
+For plant, fungi and metazoa species, the URL is as follows:
+
+ https://ftp.ensemblgenomes.ebi.ac.uk/pub/release-57/plants/fasta/glycine_max/cdna/
"""
-from .species import Species, find_species_by_name
from .ensembl_release_versions import check_release_number
+from .species import Species, find_species_by_name
ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
+ENSEMBLGENOME_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk"
# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
@@ -31,27 +35,58 @@
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
+DATABASE_FASTA_SUBDIR_TEMPLATE = (
+ "/pub/release-%(release)d/%(database)s/fasta/%(species)s/%(type)s/"
+)
+DATABASE_GTF_SUBDIR_TEMPLATE = (
+ "/pub/release-%(release)d/%(database)s/gtf/%(species)s/"
+)
+
+# GTF annotation file example: Homo_sapiens.GRCh38.gtf.gz
+GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz"
+
+# cDNA & protein FASTA file for releases before (and including) Ensembl 75
+# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz
+OLD_FASTA_FILENAME_TEMPLATE = (
+ "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz"
+)
+
+# ncRNA FASTA file for releases before (and including) Ensembl 75
+# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz
+
+OLD_FASTA_FILENAME_TEMPLATE_NCRNA = (
+ "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz"
+)
+
+# cDNA & protein FASTA file for releases after Ensembl 75
+# example: Homo_sapiens.GRCh37.cdna.all.fa.gz
+NEW_FASTA_FILENAME_TEMPLATE = (
+ "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz"
+)
+
+# ncRNA FASTA file for releases after Ensembl 75
+# example: Homo_sapiens.GRCh37.ncrna.fa.gz
+NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"
+
def normalize_release_properties(ensembl_release, species):
"""
Make sure a given release is valid, normalize it to be an integer,
normalize the species name, and get its associated reference.
"""
- ensembl_release = check_release_number(ensembl_release)
if not isinstance(species, Species):
species = find_species_by_name(species)
+ ensembl_release = check_release_number(
+ ensembl_release, database=species.database
+ )
reference_name = species.which_reference(ensembl_release)
return ensembl_release, species.latin_name, reference_name
-# GTF annotation file example: Homo_sapiens.GTCh38.gtf.gz
-GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz"
-
-
def make_gtf_filename(ensembl_release, species):
"""
Return GTF filename expect on Ensembl FTP server for a specific
- species/release combination
+ species/release combination.
"""
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
@@ -63,41 +98,45 @@ def make_gtf_filename(ensembl_release, species):
}
-def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
+def make_gtf_url(ensembl_release, species, server=None, database=None):
"""
Returns a URL and a filename, which can be joined together.
"""
- ensembl_release, species, _ = normalize_release_properties(ensembl_release, species)
- subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species}
- filename = make_gtf_filename(ensembl_release=ensembl_release, species=species)
+ if server is None:
+ if database is None:
+ server = ENSEMBL_FTP_SERVER
+ else:
+ server = ENSEMBLGENOME_FTP_SERVER
+ ensembl_release, species, _ = normalize_release_properties(
+ ensembl_release, species
+ )
+ if database is None:
+ subdir = GTF_SUBDIR_TEMPLATE % {
+ "release": ensembl_release,
+ "species": species,
+ }
+ else:
+ subdir = DATABASE_GTF_SUBDIR_TEMPLATE % {
+ "release": ensembl_release,
+ "database": database,
+ "species": species,
+ }
+ filename = make_gtf_filename(
+ ensembl_release=ensembl_release, species=species
+ )
return server + subdir + filename
-# cDNA & protein FASTA file for releases before (and including) Ensembl 75
-# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz
-OLD_FASTA_FILENAME_TEMPLATE = (
- "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz"
-)
-
-# ncRNA FASTA file for releases before (and including) Ensembl 75
-# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz
-
-OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz"
-
-# cDNA & protein FASTA file for releases after Ensembl 75
-# example: Homo_sapiens.GRCh37.cdna.all.fa.gz
-NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz"
-
-# ncRNA FASTA file for releases after Ensembl 75
-# example: Homo_sapiens.GRCh37.ncrna.fa.gz
-NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"
-
-
-def make_fasta_filename(ensembl_release, species, sequence_type):
+def make_fasta_filename(ensembl_release, species, database, sequence_type):
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
)
- if ensembl_release <= 75:
+ # for the plant (Ensembl Genomes) database the FASTA naming convention
+ # differs: the condition below uses the "old name" for releases <= 31
+ # and the "new name" afterwards (release 31 used both naming schemes)
+ if (ensembl_release <= 75 and database is None) or (
+ ensembl_release <= 31 and database is not None
+ ):
if sequence_type == "ncrna":
return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % {
"Species": species.capitalize(),
@@ -125,23 +164,47 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
}
-def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER):
- """Construct URL to FASTA file with cDNA transcript or protein sequences
+def make_fasta_url(
+ ensembl_release,
+ species,
+ sequence_type,
+ server=None,
+ database=None,
+):
+ """
+ Construct URL to FASTA file with cDNA transcript or protein sequences.
Parameter examples:
ensembl_release = 75
species = "Homo_sapiens"
sequence_type = "cdna" (other option: "pep")
"""
- ensembl_release, species, reference_name = normalize_release_properties(
+ if server is None:
+ if database is None:
+ server = ENSEMBL_FTP_SERVER
+ else:
+ server = ENSEMBLGENOME_FTP_SERVER
+ ensembl_release, species, _ = normalize_release_properties(
ensembl_release, species
)
- subdir = FASTA_SUBDIR_TEMPLATE % {
- "release": ensembl_release,
- "species": species,
- "type": sequence_type,
- }
+ if database is None:
+ subdir = FASTA_SUBDIR_TEMPLATE % {
+ "release": ensembl_release,
+ "species": species,
+ "type": sequence_type,
+ }
+ else:
+ subdir = DATABASE_FASTA_SUBDIR_TEMPLATE % {
+ "release": ensembl_release,
+ "database": database,
+ "species": species,
+ "type": sequence_type,
+ }
+
filename = make_fasta_filename(
- ensembl_release=ensembl_release, species=species, sequence_type=sequence_type
+ ensembl_release=ensembl_release,
+ species=species,
+ database=database,
+ sequence_type=sequence_type,
)
return server + subdir + filename
diff --git a/pyensembl/exon.py b/pyensembl/exon.py
index a520290..a84b75f 100644
--- a/pyensembl/exon.py
+++ b/pyensembl/exon.py
@@ -15,7 +15,9 @@
class Exon(Locus):
- def __init__(self, exon_id, contig, start, end, strand, gene_name, gene_id):
+ def __init__(
+ self, exon_id, contig, start, end, strand, gene_name, gene_id
+ ):
Locus.__init__(self, contig, start, end, strand)
self.exon_id = exon_id
self.gene_name = gene_name
diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py
index e339a8a..b55750b 100644
--- a/pyensembl/fasta.py
+++ b/pyensembl/fasta.py
@@ -19,9 +19,8 @@
"""
-from gzip import GzipFile
import logging
-
+from gzip import GzipFile
logger = logging.getLogger(__name__)
@@ -33,7 +32,8 @@ def _parse_header_id(line):
"""
if type(line) is not bytes:
raise TypeError(
- "Expected header line to be of type %s but got %s" % (bytes, type(line))
+ "Expected header line to be of type %s but got %s"
+ % (bytes, type(line))
)
if len(line) <= 1:
diff --git a/pyensembl/gene.py b/pyensembl/gene.py
index f26de48..b787c64 100644
--- a/pyensembl/gene.py
+++ b/pyensembl/gene.py
@@ -17,7 +17,9 @@
class Gene(LocusWithGenome):
- def __init__(self, gene_id, gene_name, contig, start, end, strand, biotype, genome):
+ def __init__(
+ self, gene_id, gene_name, contig, start, end, strand, biotype, genome
+ ):
LocusWithGenome.__init__(
self,
contig=contig,
@@ -98,7 +100,8 @@ def transcripts(self):
# its particular information, might be more efficient if we
# just get all the columns here, but how do we keep that modular?
return [
- self.genome.transcript_by_id(result[0]) for result in transcript_id_results
+ self.genome.transcript_by_id(result[0])
+ for result in transcript_id_results
]
@memoized_property
diff --git a/pyensembl/genome.py b/pyensembl/genome.py
index 05b6efc..a5e202d 100644
--- a/pyensembl/genome.py
+++ b/pyensembl/genome.py
@@ -11,8 +11,8 @@
# limitations under the License.
"""
-Contains the Genome class, with its millions of accessors and wrappers
-around an arbitrary genomic database.
+Contains the Genome class, with its millions of accessors and wrappers around
+an arbitrary genomic database.
"""
@@ -21,8 +21,8 @@
from serializable import Serializable
-from .download_cache import DownloadCache
from .database import Database
+from .download_cache import DownloadCache
from .exon import Exon
from .gene import Gene
from .sequence_data import SequenceData
@@ -31,8 +31,8 @@
class Genome(Serializable):
"""
- Bundles together the genomic annotation and sequence data associated with
- a particular genomic database source (e.g. a single Ensembl release) and
+ Bundles together the genomic annotation and sequence data associated with a
+ particular genomic database source (e.g. a single Ensembl release) and
provides a wide variety of helper methods for accessing this data.
"""
@@ -148,7 +148,7 @@ def to_dict(self):
def _init_lazy_fields(self):
"""
- Member data that gets loaded or constructed on demand
+ Member data that gets loaded or constructed on demand.
"""
self.gtf_path = None
self._protein_sequences = None
@@ -163,11 +163,15 @@ def _init_lazy_fields(self):
self._exons = {}
def _get_cached_path(
- self, field_name, path_or_url, download_if_missing=False, overwrite=False
+ self,
+ field_name,
+ path_or_url,
+ download_if_missing=False,
+ overwrite=False,
):
"""
- Get the local path for a possibly remote file, invoking either
- a download or install error message if it's missing.
+ Get the local path for a possibly remote file, invoking either a
+ download or install error message if it's missing.
"""
if len(field_name) == 0:
raise ValueError("Expected non-empty field name")
@@ -188,7 +192,9 @@ def _get_gtf_path(self, download_if_missing=False, overwrite=False):
overwrite=overwrite,
)
- def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False):
+ def _get_transcript_fasta_paths(
+ self, download_if_missing=False, overwrite=False
+ ):
if not self.requires_transcript_fasta:
raise ValueError("No transcript FASTA source for %s" % self)
return [
@@ -201,7 +207,9 @@ def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False
for path in self._transcript_fasta_paths_or_urls
]
- def _get_protein_fasta_paths(self, download_if_missing=False, overwrite=False):
+ def _get_protein_fasta_paths(
+ self, download_if_missing=False, overwrite=False
+ ):
# get the path for peptide FASTA files containing
# this genome's protein sequences
if not self.requires_protein_fasta:
@@ -233,7 +241,9 @@ def _set_local_paths(self, download_if_missing=True, overwrite=False):
def required_local_files(self):
paths = []
if self._gtf_path_or_url:
- paths.append(self.download_cache.cached_path(self._gtf_path_or_url))
+ paths.append(
+ self.download_cache.cached_path(self._gtf_path_or_url)
+ )
if self._transcript_fasta_paths_or_urls:
paths.extend(
[
@@ -273,8 +283,8 @@ def download(self, overwrite=False):
def index(self, overwrite=False):
"""
Assuming that all necessary data for this Genome has been downloaded,
- generate the GTF database and save efficient representation of
- FASTA sequence files.
+ generate the GTF database and save efficient representation of FASTA
+ sequence files.
"""
if self.requires_gtf:
self.db.connect_or_create(overwrite=overwrite)
@@ -291,10 +301,13 @@ def db(self):
# make sure GTF file exists locally
# and populate self.gtf_path
self._set_local_paths(
- download_if_missing=True, ## if set at False the files are not downloaded in interactive python, works anyways via command line though
- overwrite=False)
+ download_if_missing=True,  # if set to False, files are not downloaded in interactive Python (works via the command line though)
+ overwrite=False,
+ )
if self.gtf_path is None:
- raise ValueError("Property 'gtf_path' of %s cannot be None" % self)
+ raise ValueError(
+ "Property 'gtf_path' of %s cannot be None" % self
+ )
# Database object turns the GTF dataframes into sqlite3 tables
# and wraps them with methods like `query_one`
@@ -347,7 +360,8 @@ def protein_sequences(self):
self._set_local_paths(download_if_missing=False, overwrite=False)
if self.protein_fasta_paths is None:
raise ValueError(
- "Property 'protein_fasta_paths' of %s cannot be None" % self
+ "Property 'protein_fasta_paths' of %s cannot be None"
+ % self
)
self._protein_sequences = SequenceData(
fasta_paths=self.protein_fasta_paths,
@@ -359,13 +373,16 @@ def protein_sequences(self):
def transcript_sequences(self):
if self._transcript_sequences is None:
if not self.requires_transcript_fasta:
- raise ValueError("Missing transcript FASTA source for %s" % self)
+ raise ValueError(
+ "Missing transcript FASTA source for %s" % self
+ )
# make sure transcript FASTA file exists locally
# and populate self.transcript_fasta_paths
self._set_local_paths(download_if_missing=False, overwrite=False)
if self.transcript_fasta_paths is None:
raise ValueError(
- "Property 'transcript_fasta_paths' of %s cannot be None" % (self,)
+ "Property 'transcript_fasta_paths' of %s cannot be None"
+ % (self,)
)
self._transcript_sequences = SequenceData(
fasta_paths=self.transcript_fasta_paths,
@@ -375,8 +392,8 @@ def transcript_sequences(self):
def install_string(self):
"""
- Add every missing file to the install string shown to the user
- in an error message.
+ Add every missing file to the install string shown to the user in an
+ error message.
"""
args = [
"--reference-name",
@@ -450,7 +467,7 @@ def __hash__(self):
def clear_cache(self):
"""
- Clear any in-memory cached values
+ Clear any in-memory cached values.
"""
for maybe_fn in self.__dict__.values():
# clear cache associated with all memoization decorators,
@@ -460,7 +477,7 @@ def clear_cache(self):
def delete_index_files(self):
"""
- Delete all data aside from source GTF and FASTA files
+ Delete all data aside from source GTF and FASTA files.
"""
self.clear_cache()
db_path = self.db.local_db_path()
@@ -471,9 +488,8 @@ def _all_feature_values(
self, column, feature, distinct=True, contig=None, strand=None
):
"""
- Cached lookup of all values for a particular feature property from
- the database, caches repeated queries in memory and
- stores them as a CSV.
+ Cached lookup of all values for a particular feature property from the
+ database, caches repeated queries in memory and stores them as a CSV.
Parameters
----------
@@ -504,23 +520,31 @@ def _all_feature_values(
)
def transcript_sequence(self, transcript_id):
- """Return cDNA nucleotide sequence of transcript, or None if
- transcript doesn't have cDNA sequence.
+ """
+ Return cDNA nucleotide sequence of transcript, or None if transcript
+ doesn't have cDNA sequence.
"""
if self.transcript_sequences is None:
- raise ValueError("No transcript FASTA supplied to this Genome: %s" % self)
+ raise ValueError(
+ "No transcript FASTA supplied to this Genome: %s" % self
+ )
return self.transcript_sequences.get(transcript_id)
def protein_sequence(self, protein_id):
- """Return cDNA nucleotide sequence of transcript, or None if
- transcript doesn't have cDNA sequence.
+ """
+ Return cDNA nucleotide sequence of transcript, or None if transcript
+ doesn't have cDNA sequence.
"""
if self.protein_sequences is None:
- raise ValueError("No protein FASTA supplied to this Genome: %s" % self)
+ raise ValueError(
+ "No protein FASTA supplied to this Genome: %s" % self
+ )
return self.protein_sequences.get(protein_id)
def genes_at_locus(self, contig, position, end=None, strand=None):
- gene_ids = self.gene_ids_at_locus(contig, position, end=end, strand=strand)
+ gene_ids = self.gene_ids_at_locus(
+ contig, position, end=end, strand=strand
+ )
return [self.gene_by_id(gene_id) for gene_id in gene_ids]
def transcripts_at_locus(self, contig, position, end=None, strand=None):
@@ -528,11 +552,14 @@ def transcripts_at_locus(self, contig, position, end=None, strand=None):
contig, position, end=end, strand=strand
)
return [
- self.transcript_by_id(transcript_id) for transcript_id in transcript_ids
+ self.transcript_by_id(transcript_id)
+ for transcript_id in transcript_ids
]
def exons_at_locus(self, contig, position, end=None, strand=None):
- exon_ids = self.exon_ids_at_locus(contig, position, end=end, strand=strand)
+ exon_ids = self.exon_ids_at_locus(
+ contig, position, end=end, strand=strand
+ )
return [self.exon_by_id(exon_id) for exon_id in exon_ids]
def gene_ids_at_locus(self, contig, position, end=None, strand=None):
@@ -575,7 +602,9 @@ def transcript_ids_at_locus(self, contig, position, end=None, strand=None):
strand=strand,
)
- def transcript_names_at_locus(self, contig, position, end=None, strand=None):
+ def transcript_names_at_locus(
+ self, contig, position, end=None, strand=None
+ ):
return self.db.distinct_column_values_at_locus(
column="transcript_name",
feature="transcript",
@@ -605,7 +634,7 @@ def protein_ids_at_locus(self, contig, position, end=None, strand=None):
def locus_of_gene_id(self, gene_id):
"""
- Given a gene ID returns Locus with: chromosome, start, stop, strand
+ Given a gene ID returns Locus with: chromosome, start, stop, strand.
"""
return self.db.query_locus(
filter_column="gene_id", filter_value=gene_id, feature="gene"
@@ -614,9 +643,9 @@ def locus_of_gene_id(self, gene_id):
def loci_of_gene_names(self, gene_name):
"""
Given a gene name returns list of Locus objects with fields:
- chromosome, start, stop, strand
- You can get multiple results since a gene might have multiple copies
- in the genome.
+
+ chromosome, start, stop, strand. You can get multiple results
+ since a gene might have multiple copies in the genome.
"""
return self.db.query_loci("gene_name", gene_name, "gene")
@@ -629,7 +658,7 @@ def locus_of_transcript_id(self, transcript_id):
def locus_of_exon_id(self, exon_id):
"""
- Given an exon ID returns Locus
+ Given an exon ID returns Locus.
"""
return self.db.query_locus("exon_id", exon_id, feature="exon")
@@ -641,8 +670,8 @@ def locus_of_exon_id(self, exon_id):
def contigs(self):
"""
- Returns all contig names for any gene in the genome
- (field called "seqname" in Ensembl GTF files)
+ Returns all contig names for any gene in the genome (field called
+ "seqname" in Ensembl GTF files)
"""
return self.db.query_feature_values("seqname", "gene")
@@ -703,7 +732,9 @@ def gene_by_id(self, gene_id):
gene_name, gene_biotype = None, None
if len(result) < 4 or len(result) > 6:
- raise ValueError("Result is not the expected length: %d" % len(result))
+ raise ValueError(
+ "Result is not the expected length: %d" % len(result)
+ )
contig, start, end, strand = result[:4]
if len(result) == 5:
if "gene_name" in field_names:
@@ -737,8 +768,8 @@ def genes_by_name(self, gene_name):
def gene_by_protein_id(self, protein_id):
"""
- Get the gene ID associated with the given protein ID,
- return its Gene object
+ Get the gene ID associated with the given protein ID, return its Gene
+ object.
"""
gene_id = self.gene_id_of_protein_id(protein_id)
return self.gene_by_id(gene_id)
@@ -762,8 +793,8 @@ def _query_gene_name(self, property_name, property_value, feature_type):
def gene_names(self, contig=None, strand=None):
"""
- Return all genes in the database,
- optionally restrict to a chromosome and/or strand.
+ Return all genes in the database, optionally restrict to a chromosome
+ and/or strand.
"""
return self._all_feature_values(
column="gene_name", feature="gene", contig=contig, strand=strand
@@ -773,10 +804,14 @@ def gene_name_of_gene_id(self, gene_id):
return self._query_gene_name("gene_id", gene_id, "gene")
def gene_name_of_transcript_id(self, transcript_id):
- return self._query_gene_name("transcript_id", transcript_id, "transcript")
+ return self._query_gene_name(
+ "transcript_id", transcript_id, "transcript"
+ )
def gene_name_of_transcript_name(self, transcript_name):
- return self._query_gene_name("transcript_name", transcript_name, "transcript")
+ return self._query_gene_name(
+ "transcript_name", transcript_name, "transcript"
+ )
def gene_name_of_exon_id(self, exon_id):
return self._query_gene_name("exon_id", exon_id, "exon")
@@ -800,8 +835,8 @@ def _query_gene_ids(self, property_name, value, feature="gene"):
def gene_ids(self, contig=None, strand=None):
"""
- What are all the gene IDs
- (optionally restrict to a given chromosome/contig and/or strand)
+ What are all the gene IDs (optionally restrict to a given
+ chromosome/contig and/or strand)
"""
return self._all_feature_values(
column="gene_id", feature="gene", contig=contig, strand=strand
@@ -810,6 +845,7 @@ def gene_ids(self, contig=None, strand=None):
def gene_ids_of_gene_name(self, gene_name):
"""
What are the gene IDs associated with a given gene name?
+
(due to copy events, there might be multiple genes per name)
"""
results = self._query_gene_ids("gene_name", gene_name)
@@ -842,17 +878,21 @@ def gene_id_of_protein_id(self, protein_id):
def transcripts(self, contig=None, strand=None):
"""
- Construct Transcript object for every transcript entry in
- the database. Optionally restrict to a particular
- chromosome using the `contig` argument.
+ Construct Transcript object for every transcript entry in the database.
+
+ Optionally restrict to a particular chromosome using the
+ `contig` argument.
"""
transcript_ids = self.transcript_ids(contig=contig, strand=strand)
return [
- self.transcript_by_id(transcript_id) for transcript_id in transcript_ids
+ self.transcript_by_id(transcript_id)
+ for transcript_id in transcript_ids
]
def transcript_by_id(self, transcript_id):
- """Construct Transcript object with given transcript ID"""
+ """
+ Construct Transcript object with given transcript ID.
+ """
if transcript_id not in self._transcripts:
optional_field_names = [
"transcript_name",
@@ -885,8 +925,12 @@ def transcript_by_id(self, transcript_id):
raise ValueError("Transcript not found: %s" % (transcript_id,))
transcript_name, transcript_biotype, tsl = None, None, None
- if len(result) < 5 or len(result) > (5 + len(optional_field_names)):
- raise ValueError("Result is not the expected length: %d" % len(result))
+ if len(result) < 5 or len(result) > (
+ 5 + len(optional_field_names)
+ ):
+ raise ValueError(
+ "Result is not the expected length: %d" % len(result)
+ )
contig, start, end, strand, gene_id = result[:5]
if len(result) > 5:
extra_field_names = [
@@ -895,8 +939,10 @@ def transcript_by_id(self, transcript_id):
extra_data = dict(zip(extra_field_names, result[5:]))
transcript_name = extra_data.get("transcript_name")
transcript_biotype = extra_data.get("transcript_biotype")
- tsl = extra_data.get("transcript_support_level")
- if not tsl or tsl == "NA":
+ tsl = extra_data.get("transcript_support_level", "NA")
+ if tsl:
+ tsl = tsl.split(" ")[0]
+ if not tsl or tsl == "NA" or not tsl.isnumeric():
tsl = None
else:
tsl = int(tsl)
@@ -917,9 +963,12 @@ def transcript_by_id(self, transcript_id):
return self._transcripts[transcript_id]
def transcripts_by_name(self, transcript_name):
- transcript_ids = self.transcript_ids_of_transcript_name(transcript_name)
+ transcript_ids = self.transcript_ids_of_transcript_name(
+ transcript_name
+ )
return [
- self.transcript_by_id(transcript_id) for transcript_id in transcript_ids
+ self.transcript_by_id(transcript_id)
+ for transcript_id in transcript_ids
]
def transcript_by_protein_id(self, protein_id):
@@ -945,25 +994,31 @@ def _query_transcript_names(self, property_name, value):
def transcript_names(self, contig=None, strand=None):
"""
- What are all the transcript names in the database
- (optionally, restrict to a given chromosome and/or strand)
+ What are all the transcript names in the database (optionally, restrict
+ to a given chromosome and/or strand)
"""
return self._all_feature_values(
- column="transcript_name", feature="transcript", contig=contig, strand=strand
+ column="transcript_name",
+ feature="transcript",
+ contig=contig,
+ strand=strand,
)
def transcript_names_of_gene_name(self, gene_name):
return self._query_transcript_names("gene_name", gene_name)
def transcript_name_of_transcript_id(self, transcript_id):
- transcript_names = self._query_transcript_names("transcript_id", transcript_id)
+ transcript_names = self._query_transcript_names(
+ "transcript_id", transcript_id
+ )
if len(transcript_names) == 0:
raise ValueError(
"No transcript names for transcript ID = %s" % transcript_id
)
elif len(transcript_names) > 1:
raise ValueError(
- "Multiple transcript names for transcript ID = %s" % (transcript_id,)
+ "Multiple transcript names for transcript ID = %s"
+ % (transcript_id,)
)
return transcript_names[0]
@@ -973,7 +1028,9 @@ def transcript_name_of_transcript_id(self, transcript_id):
#
###################################################
- def _query_transcript_ids(self, property_name, value, feature="transcript"):
+ def _query_transcript_ids(
+ self, property_name, value, feature="transcript"
+ ):
results = self.db.query(
select_column_names=["transcript_id"],
filter_column=property_name,
@@ -986,7 +1043,10 @@ def _query_transcript_ids(self, property_name, value, feature="transcript"):
def transcript_ids(self, contig=None, strand=None):
return self._all_feature_values(
- column="transcript_id", feature="transcript", contig=contig, strand=strand
+ column="transcript_id",
+ feature="transcript",
+ contig=contig,
+ strand=strand,
)
def transcript_ids_of_gene_id(self, gene_id):
@@ -1005,7 +1065,9 @@ def transcript_id_of_protein_id(self, protein_id):
"""
What is the transcript ID associated with a given protein ID?
"""
- results = self._query_transcript_ids("protein_id", protein_id, feature="CDS")
+ results = self._query_transcript_ids(
+ "protein_id", protein_id, feature="CDS"
+ )
if len(results) == 0:
raise ValueError("Protein ID not found: %s" % protein_id)
elif len(results) > 1:
@@ -1026,15 +1088,16 @@ def transcript_id_of_protein_id(self, protein_id):
def exons(self, contig=None, strand=None):
"""
- Create exon object for all exons in the database, optionally
- restrict to a particular chromosome using the `contig` argument.
+ Create exon object for all exons in the database, optionally restrict
+ to a particular chromosome using the `contig` argument.
"""
# DataFrame with single column called "exon_id"
exon_ids = self.exon_ids(contig=contig, strand=strand)
return [self.exon_by_id(exon_id) for exon_id in exon_ids]
def exon_by_id(self, exon_id):
- """Construct an Exon object from its ID by looking up the exon"s
+ """
+ Construct an Exon object from its ID by looking up the exon's
properties in the given Database.
"""
if exon_id not in self._exons:
@@ -1109,8 +1172,8 @@ def exon_ids_of_transcript_id(self, transcript_id):
def protein_ids(self, contig=None, strand=None):
"""
- What are all the protein IDs
- (optionally restrict to a given chromosome and/or strand)
+ What are all the protein IDs (optionally restrict to a given chromosome
+ and/or strand)
"""
protein_ids = self._all_feature_values(
column="protein_id",
diff --git a/pyensembl/locus.py b/pyensembl/locus.py
index b88b4a3..c087183 100644
--- a/pyensembl/locus.py
+++ b/pyensembl/locus.py
@@ -49,7 +49,8 @@ def __init__(self, contig, start, end, strand):
if end < start:
raise ValueError(
- "Expected start <= end, got start = %d, end = %d" % (start, end)
+ "Expected start <= end, got start = %d, end = %d"
+ % (start, end)
)
self.start = start
self.end = end
@@ -149,7 +150,9 @@ def offset_range(self, start, end):
)
if start < self.start or end > self.end:
- raise ValueError("Range (%d, %d) falls outside %s" % (start, end, self))
+ raise ValueError(
+ "Range (%d, %d) falls outside %s" % (start, end, self)
+ )
if self.on_forward_strand:
return (start - self.start, end - self.start)
@@ -183,7 +186,9 @@ def can_overlap(self, contig, strand=None):
"""
Is this locus on the same contig and (optionally) on the same strand?
"""
- return self.on_contig(contig) and (strand is None or self.on_strand(strand))
+ return self.on_contig(contig) and (
+ strand is None or self.on_strand(strand)
+ )
def distance_to_interval(self, start, end):
"""
@@ -220,15 +225,23 @@ def overlaps(self, contig, start, end, strand=None):
def overlaps_locus(self, other_locus):
return self.overlaps(
- other_locus.contig, other_locus.start, other_locus.end, other_locus.strand
+ other_locus.contig,
+ other_locus.start,
+ other_locus.end,
+ other_locus.strand,
)
def contains(self, contig, start, end, strand=None):
return (
- self.can_overlap(contig, strand) and start >= self.start and end <= self.end
+ self.can_overlap(contig, strand)
+ and start >= self.start
+ and end <= self.end
)
def contains_locus(self, other_locus):
return self.contains(
- other_locus.contig, other_locus.start, other_locus.end, other_locus.strand
+ other_locus.contig,
+ other_locus.start,
+ other_locus.end,
+ other_locus.strand,
)
diff --git a/pyensembl/locus_with_genome.py b/pyensembl/locus_with_genome.py
index 33dd38d..338a222 100644
--- a/pyensembl/locus_with_genome.py
+++ b/pyensembl/locus_with_genome.py
@@ -16,8 +16,8 @@
class LocusWithGenome(Locus):
"""
- Common base class for Gene and Transcript to avoid copying
- their shared logic.
+ Common base class for Gene and Transcript to avoid copying their shared
+ logic.
"""
def __init__(self, contig, start, end, strand, biotype, genome):
@@ -39,16 +39,17 @@ def to_dict(self):
@property
def is_protein_coding(self):
"""
- We're not counting immunoglobulin-like genes from the T-cell receptor or
- or antibodies since they occur in fragments that must be recombined.
- It might be worth consider counting non-sense mediated decay and
- non-stop decay since variants in these could potentially make a
- functional protein. To read more about the biotypes used in Ensembl:
- http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
- http://www.gencodegenes.org/gencode_biotypes.html
-
- For now let's stick with the simple category of 'protein_coding', which
- means that there is an open reading frame in this gene/transcript
- whose successful transcription has been observed.
+ We're not counting immunoglobulin-like genes from the T-cell receptor
+ or antibodies since they occur in fragments that must be recombined.
+ It might be worth considering counting non-sense mediated decay and non-
+ stop decay since variants in these could potentially make a functional
+ protein. To read more about the biotypes used in Ensembl:
+ http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+ http://www.gencodegenes.org/gencode_biotypes.html
+
+ For now let's stick with the simple category of
+ 'protein_coding', which means that there is an open reading
+ frame in this gene/transcript whose successful transcription has
+ been observed.
"""
- return self.biotype == "protein_coding"
+ return self.biotype == "protein_coding"
diff --git a/pyensembl/normalization.py b/pyensembl/normalization.py
index fb0cc33..81f65c5 100644
--- a/pyensembl/normalization.py
+++ b/pyensembl/normalization.py
@@ -11,7 +11,8 @@
# limitations under the License.
from sys import intern
-from typechecks import is_string, is_integer
+
+from typechecks import is_integer, is_string
# Manually memoizing here, since our simple common.memoize function has
# noticable overhead in this instance.
diff --git a/pyensembl/reference_name.py b/pyensembl/reference_name.py
index 1b7639d..5731d80 100644
--- a/pyensembl/reference_name.py
+++ b/pyensembl/reference_name.py
@@ -29,7 +29,9 @@ def normalize_reference_name(name):
def find_species_by_reference(reference_name):
- return Species._reference_names_to_species[normalize_reference_name(reference_name)]
+ return Species._reference_names_to_species[
+ normalize_reference_name(reference_name)
+ ]
def which_reference(species_name, ensembl_release):
@@ -42,7 +44,9 @@ def max_ensembl_release(reference_name):
return max_release
-def genome_for_reference_name(reference_name, allow_older_downloaded_release=True):
+def genome_for_reference_name(
+ reference_name, allow_older_downloaded_release=True
+):
"""
Given a genome reference name, such as "GRCh38", returns the
corresponding Ensembl Release object.
@@ -60,7 +64,9 @@ def genome_for_reference_name(reference_name, allow_older_downloaded_release=Tru
]
if allow_older_downloaded_release:
# go through candidate releases in descending order
- for release in reversed(range(min_ensembl_release, max_ensembl_release + 1)):
+ for release in reversed(
+ range(min_ensembl_release, max_ensembl_release + 1)
+ ):
# check if release has been locally downloaded
candidate = EnsemblRelease.cached(release=release, species=species)
if candidate.required_local_files_exist():
@@ -70,6 +76,6 @@ def genome_for_reference_name(reference_name, allow_older_downloaded_release=Tru
return EnsemblRelease.cached(release=max_ensembl_release, species=species)
-ensembl_grch36 = genome_for_reference_name("ncbi36")
-ensembl_grch37 = genome_for_reference_name("grch37")
-ensembl_grch38 = genome_for_reference_name("grch38")
+# ensembl_grch36 = genome_for_reference_name("ncbi36")
+# ensembl_grch37 = genome_for_reference_name("grch37")
+# ensembl_grch38 = genome_for_reference_name("grch38")
diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py
index 631c748..e18a9e8 100644
--- a/pyensembl/sequence_data.py
+++ b/pyensembl/sequence_data.py
@@ -10,14 +10,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from os import remove
-from os.path import exists, abspath, split, join
import logging
-from collections import Counter
import pickle
-from .common import load_pickle, dump_pickle
-from .fasta import parse_fasta_dictionary
+from collections import Counter
+from os import remove
+from os.path import abspath, exists, join, split
+from .common import dump_pickle, load_pickle
+from .fasta import parse_fasta_dictionary
logger = logging.getLogger(__name__)
@@ -32,10 +32,14 @@ def __init__(self, fasta_paths, cache_directory_path=None):
fasta_paths = [fasta_paths]
self.fasta_paths = [abspath(path) for path in fasta_paths]
- self.fasta_directory_paths = [split(path)[0] for path in self.fasta_paths]
+ self.fasta_directory_paths = [
+ split(path)[0] for path in self.fasta_paths
+ ]
self.fasta_filenames = [split(path)[1] for path in self.fasta_paths]
if cache_directory_path:
- self.cache_directory_paths = [cache_directory_path] * len(self.fasta_paths)
+ self.cache_directory_paths = [cache_directory_path] * len(
+ self.fasta_paths
+ )
else:
self.cache_directory_paths = self.fasta_directory_paths
for path in self.fasta_paths:
@@ -104,7 +108,9 @@ def _load_or_create_fasta_dictionary_pickle(self):
try:
fasta_dictionary_tmp = load_pickle(pickle_path)
self._add_to_fasta_dictionary(fasta_dictionary_tmp)
- logger.info("Loaded sequence dictionary from %s", pickle_path)
+ logger.info(
+ "Loaded sequence dictionary from %s", pickle_path
+ )
continue
except (pickle.UnpicklingError, AttributeError):
# catch either an UnpicklingError or an AttributeError
diff --git a/pyensembl/shell.py b/pyensembl/shell.py
old mode 100755
new mode 100644
index cd7ab3c..546dfa9
--- a/pyensembl/shell.py
+++ b/pyensembl/shell.py
@@ -30,6 +30,9 @@
To list all installed genomes:
%(prog)s list
+To list all available genomes:
+ %(prog)s available
+
To install a genome from source files:
%(prog)s install \
--reference-name "GRCh38" \
@@ -40,14 +43,18 @@
import argparse
import logging.config
-import pkg_resources
import os
-from .ensembl_release import EnsemblRelease, MAX_ENSEMBL_RELEASE
+import pkg_resources
+
+from .config import MAX_ENSEMBL_RELEASE
+from .ensembl_release import EnsemblRelease
from .genome import Genome
-from .species import Species
+from .species import Species, normalize_species_name
-logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf"))
+logging.config.fileConfig(
+ pkg_resources.resource_filename(__name__, "logging.conf")
+)
logger = logging.getLogger(__name__)
@@ -94,7 +101,9 @@
)
path_group.add_argument(
- "--annotation-name", default=None, help="Name of annotation source (e.g. refseq)"
+ "--annotation-name",
+ default=None,
+ help="Name of annotation source (e.g. refseq)",
)
path_group.add_argument(
@@ -140,6 +149,7 @@
"delete-all-files",
"delete-index-files",
"list",
+ "available",
),
help=(
'"install" will download and index any data that is not '
@@ -151,6 +161,25 @@
)
+def collect_all_available_ensembl_releases():
+ for species_name in Species.all_registered_latin_names():
+ species = Species._latin_names_to_species[species_name]
+ # print in tree format
+ print(
+ "* "
+ + species_name
+ + " ("
+ + ", ".join(species.synonyms)
+ + ")"
+ + ":"
+ )
+ for (
+ release_name,
+ release_range,
+ ) in species.reference_assemblies.items():
+ print(" * " + release_name + ":", release_range)
+
+
def collect_all_installed_ensembl_releases():
genomes = []
for species, release in Species.all_species_release_pairs():
@@ -164,11 +193,26 @@ def all_combinations_of_ensembl_genomes(args):
"""
Use all combinations of species and release versions specified by the
commandline arguments to return a list of EnsemblRelease or Genome objects.
- The results will typically be of type EnsemblRelease unless the
+ The results will typically be of type EnsemblRelease unless the
+
--custom-mirror argument was given.
"""
species_list = args.species if args.species else ["human"]
- release_list = args.release if args.release else [MAX_ENSEMBL_RELEASE]
+
+ release_list = (
+ args.release
+ if args.release
+ else [
+ max(
+ i
+ for _, i in Species._latin_names_to_species[
+ normalize_species_name(species_name)
+ ].reference_assemblies.values()
+ )
+ for species_name in species_list
+ ]
+ )
+
genomes = []
for species in species_list:
# Otherwise, use Ensembl release information
@@ -182,11 +226,13 @@ def all_combinations_of_ensembl_genomes(args):
# URL to be a directory with all the same filenames as
# would be provided by Ensembl
gtf_url = os.path.join(
- args.custom_mirror, os.path.basename(ensembl_release.gtf_url)
+ args.custom_mirror,
+ os.path.basename(ensembl_release.gtf_url),
)
transcript_fasta_urls = [
os.path.join(
- args.custom_mirror, os.path.basename(transcript_fasta_url)
+ args.custom_mirror,
+ os.path.basename(transcript_fasta_url),
)
for transcript_fasta_url in ensembl_release.transcript_fasta_urls
]
@@ -244,7 +290,9 @@ def collect_selected_genomes(args):
def run():
args = parser.parse_args()
- if args.action == "list":
+ if args.action == "available":
+ collect_all_available_ensembl_releases()
+ elif args.action == "list":
# TODO: how do we also identify which non-Ensembl genomes are
# installed?
genomes = collect_all_installed_ensembl_releases()
diff --git a/pyensembl/species.py b/pyensembl/species.py
index a236bb1..cb78766 100644
--- a/pyensembl/species.py
+++ b/pyensembl/species.py
@@ -12,7 +12,7 @@
from serializable import Serializable
-from .ensembl_release_versions import MAX_ENSEMBL_RELEASE
+from .config import SPECIES_DATA
# TODO: replace Serializable with data class
@@ -30,15 +30,18 @@ class Species(Serializable):
_reference_names_to_species = {}
@classmethod
- def register(cls, latin_name, synonyms, reference_assemblies):
+ def register(
+ cls, latin_name, synonyms, reference_assemblies, database=None
+ ):
"""
- Create a Species object from the given arguments and enter into
- all the dicts used to look the species up by its fields.
+ Create a Species object from the given arguments and enter into all the
+ dicts used to look the species up by its fields.
"""
species = Species(
latin_name=latin_name,
synonyms=synonyms,
reference_assemblies=reference_assemblies,
+ database=database,
)
cls._latin_names_to_species[species.latin_name] = species
for synonym in synonyms:
@@ -71,8 +74,8 @@ def all_registered_latin_names(cls):
@classmethod
def all_species_release_pairs(cls):
"""
- Generator which yields (species, release) pairs
- for all possible combinations.
+ Generator which yields (species, release) pairs for all possible
+ combinations.
"""
for species_name in cls.all_registered_latin_names():
species = cls._latin_names_to_species[species_name]
@@ -80,7 +83,9 @@ def all_species_release_pairs(cls):
for release in range(release_range[0], release_range[1] + 1):
yield species_name, release
- def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
+ def __init__(
+ self, latin_name, synonyms=[], reference_assemblies={}, database=None
+ ):
"""
Parameters
----------
@@ -95,6 +100,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
self.latin_name = latin_name.lower().replace(" ", "_")
self.synonyms = synonyms
self.reference_assemblies = reference_assemblies
+ self.database = database
self._release_to_genome = {}
for genome_name, (start, end) in self.reference_assemblies.items():
for i in range(start, end + 1):
@@ -114,10 +120,14 @@ def which_reference(self, ensembl_release):
return self._release_to_genome[ensembl_release]
def __str__(self):
- return "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % (
- self.latin_name,
- self.synonyms,
- self.reference_assemblies,
+ return (
+ "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s, database=%s)"
+ % (
+ self.latin_name,
+ self.synonyms,
+ self.reference_assemblies,
+ self.database,
+ )
)
def __eq__(self, other):
@@ -126,6 +136,7 @@ def __eq__(self, other):
and self.latin_name == other.latin_name
and self.synonyms == other.synonyms
and self.reference_assemblies == other.reference_assemblies
+ and self.database == other.database
)
def to_dict(self):
@@ -141,15 +152,17 @@ def __hash__(self):
self.latin_name,
tuple(self.synonyms),
frozenset(self.reference_assemblies.items()),
+ self.database,
)
)
def normalize_species_name(name):
"""
- If species name was "Homo sapiens" then replace spaces with underscores
- and return "homo_sapiens". Also replace common names like "human" with
- "homo_sapiens".
+ If species name was "Homo sapiens" then replace spaces with underscores and
+ return "homo_sapiens".
+
+ Also replace common names like "human" with "homo_sapiens".
"""
lower_name = name.lower().strip()
@@ -173,6 +186,8 @@ def find_species_by_name(species_name):
def check_species_object(species_name_or_object):
"""
Helper for validating user supplied species names or objects.
+
+ Return `Species` Object
"""
if isinstance(species_name_or_object, Species):
return species_name_or_object
@@ -185,168 +200,10 @@ def check_species_object(species_name_or_object):
)
-human = Species.register(
- latin_name="homo_sapiens",
- synonyms=["human"],
- reference_assemblies={
- "GRCh38": (76, MAX_ENSEMBL_RELEASE),
- "GRCh37": (55, 75),
- "NCBI36": (54, 54),
- },
-)
-
-mouse = Species.register(
- latin_name="mus_musculus",
- synonyms=["mouse", "house mouse"],
- reference_assemblies={
- "NCBIM37": (54, 67),
- "GRCm38": (68, 102),
- "GRCm39": (103, MAX_ENSEMBL_RELEASE),
- },
-)
-
-dog = Species.register(
- latin_name="canis_familiaris",
- synonyms=["dog"],
- reference_assemblies={"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-cat = Species.register(
- latin_name="felis_catus",
- synonyms=["cat"],
- reference_assemblies={
- "Felis_catus_6.2": (75, 90),
- "Felis_catus_8.0": (91, 92),
- "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE),
- },
-)
-
-chicken = Species.register(
- latin_name="gallus_gallus",
- synonyms=["chicken"],
- reference_assemblies={
- "Galgal4": (75, 85),
- "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE),
- },
-)
-
-# Does the black rat (Rattus Rattus) get used for research too?
-brown_rat = Species.register(
- latin_name="rattus_norvegicus",
- synonyms=["brown rat", "lab rat", "rat"],
- reference_assemblies={
- "Rnor_5.0": (75, 79),
- "Rnor_6.0": (80, 104),
- "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE),
- },
-)
-
-macaque = Species.register(
- latin_name="macaca_fascicularis",
- synonyms=["macaque", "Crab-eating macaque"],
- reference_assemblies={
- "Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE),
- },
-)
-
-green_monkey = Species.register(
- latin_name="chlorocebus_sabaeus",
- synonyms=["green_monkey", "african_green_monkey"],
- reference_assemblies={
- "ChlSab1.1": (86, MAX_ENSEMBL_RELEASE),
- },
-)
-
-rhesus = Species.register(
- latin_name="macaca_mulatta",
- synonyms=["rhesus"],
- reference_assemblies={"Mmul_10": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-rabbit = Species.register(
- latin_name="oryctolagus_cuniculus",
- synonyms=["rabbit"],
- reference_assemblies={"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-gerbil = Species.register(
- latin_name="meriones_unguiculatus",
- synonyms=["gerbil"],
- reference_assemblies={"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-syrian_hamster = Species.register(
- latin_name="mesocricetus_auratus",
- synonyms=["syrian_hamster"],
- reference_assemblies={"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-chinese_hamster = Species.register(
- latin_name="cricetulus_griseus_chok1gshd",
- synonyms=["chinese_hamster"],
- reference_assemblies={"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-naked_mole_rat = Species.register(
- latin_name="heterocephalus_glaber_female",
- synonyms=["naked_mole_rat"],
- reference_assemblies={"HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-guinea_pig = Species.register(
- latin_name="cavia_porcellus",
- synonyms=["guinea_pig"],
- reference_assemblies={"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-pig = Species.register(
- latin_name="sus_scrofa",
- synonyms=["pig"],
- reference_assemblies={"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)},
-)
-
-zebrafish = Species.register(
- latin_name="danio_rerio",
- synonyms=["zebrafish"],
- reference_assemblies={
- "ZFISH7": (47, 53),
- "Zv8": (54, 59),
- "Zv9": (60, 79),
- "GRCz10": (80, 91),
- "GRCz11": (92, MAX_ENSEMBL_RELEASE),
- },
-)
-
-fly = Species.register(
- latin_name="drosophila_melanogaster",
- synonyms=["drosophila", "fruit fly", "fly"],
- reference_assemblies={
- "BDGP5": (75, 78),
- "BDGP6": (79, 95),
- "BDGP6.22": (96, 98),
- "BDGP6.28": (99, 102),
- "BDGP6.32": (103, MAX_ENSEMBL_RELEASE),
- },
-)
-
-nematode = Species.register(
- latin_name="caenorhabditis_elegans",
- synonyms=["nematode", "C_elegans"],
- reference_assemblies={
- "WS180": (47, 49),
- "WS190": (50, 54),
- "WS200": (55, 57),
- "WS210": (58, 59),
- "WS220": (61, 66),
- "WBcel215": (67, 70),
- "WBcel235": (71, MAX_ENSEMBL_RELEASE),
- },
-)
-
-yeast = Species.register(
- latin_name="saccharomyces_cerevisiae",
- synonyms=["yeast", "budding_yeast"],
- reference_assemblies={
- "R64-1-1": (76, MAX_ENSEMBL_RELEASE),
- },
-)
+for data in SPECIES_DATA:
+ globals()[data["synonyms"][0]] = Species.register(
+ latin_name=data["latin_name"],
+ synonyms=data["synonyms"],
+ reference_assemblies=data["reference_assemblies"],
+ database=data.get("database", None),
+ )
diff --git a/pyensembl/species.py.orig b/pyensembl/species.py.orig
new file mode 100644
index 0000000..cb78766
--- /dev/null
+++ b/pyensembl/species.py.orig
@@ -0,0 +1,209 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from serializable import Serializable
+
+from .config import SPECIES_DATA
+
+# TODO: replace Serializable with data class
+
+
+class Species(Serializable):
+ """
+ Container for combined information about a species name, its synonyn names
+ and which reference to use for this species in each Ensembl release.
+ """
+
+ # as species instances get created, they get registered in these
+ # dictionaries
+ _latin_names_to_species = {}
+ _common_names_to_species = {}
+ _reference_names_to_species = {}
+
+ @classmethod
+ def register(
+ cls, latin_name, synonyms, reference_assemblies, database=None
+ ):
+ """
+ Create a Species object from the given arguments and enter into all the
+ dicts used to look the species up by its fields.
+ """
+ species = Species(
+ latin_name=latin_name,
+ synonyms=synonyms,
+ reference_assemblies=reference_assemblies,
+ database=database,
+ )
+ cls._latin_names_to_species[species.latin_name] = species
+ for synonym in synonyms:
+ if synonym in cls._common_names_to_species:
+ raise ValueError(
+ "Can't use synonym '%s' for both %s and %s"
+ % (synonym, species, cls._common_names_to_species[synonym])
+ )
+ cls._common_names_to_species[synonym] = species
+ for reference_name in reference_assemblies:
+ if reference_name in cls._reference_names_to_species:
+ raise ValueError(
+ "Can't use reference '%s' for both %s and %s"
+ % (
+ reference_name,
+ species,
+ cls._reference_names_to_species[reference_name],
+ )
+ )
+ cls._reference_names_to_species[reference_name] = species
+ return species
+
+ @classmethod
+ def all_registered_latin_names(cls):
+ """
+ Returns latin name of every registered species.
+ """
+ return list(cls._latin_names_to_species.keys())
+
+ @classmethod
+ def all_species_release_pairs(cls):
+ """
+ Generator which yields (species, release) pairs for all possible
+ combinations.
+ """
+ for species_name in cls.all_registered_latin_names():
+ species = cls._latin_names_to_species[species_name]
+ for _, release_range in species.reference_assemblies.items():
+ for release in range(release_range[0], release_range[1] + 1):
+ yield species_name, release
+
+ def __init__(
+ self, latin_name, synonyms=[], reference_assemblies={}, database=None
+ ):
+ """
+ Parameters
+ ----------
+ latin_name : str
+
+ synonyms : list of strings
+
+ reference_assemblies : dict
+ Mapping of names of reference genomes onto inclusive ranges of
+ Ensembl releases Example: {"GRCh37": (54, 75)}
+ """
+ self.latin_name = latin_name.lower().replace(" ", "_")
+ self.synonyms = synonyms
+ self.reference_assemblies = reference_assemblies
+ self.database = database
+ self._release_to_genome = {}
+ for genome_name, (start, end) in self.reference_assemblies.items():
+ for i in range(start, end + 1):
+ if i in self._release_to_genome:
+ raise ValueError(
+ "Ensembl release %d for %s already has an associated genome"
+ % (i, latin_name)
+ )
+ self._release_to_genome[i] = genome_name
+
+ def which_reference(self, ensembl_release):
+ if ensembl_release not in self._release_to_genome:
+ raise ValueError(
+ "No genome for %s in Ensembl release %d"
+ % (self.latin_name, ensembl_release)
+ )
+ return self._release_to_genome[ensembl_release]
+
+ def __str__(self):
+ return (
+ "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s, database=%s)"
+ % (
+ self.latin_name,
+ self.synonyms,
+ self.reference_assemblies,
+ self.database,
+ )
+ )
+
+ def __eq__(self, other):
+ return (
+ other.__class__ is Species
+ and self.latin_name == other.latin_name
+ and self.synonyms == other.synonyms
+ and self.reference_assemblies == other.reference_assemblies
+ and self.database == other.database
+ )
+
+ def to_dict(self):
+ return {"latin_name": self.latin_name}
+
+ @classmethod
+ def from_dict(cls, state_dict):
+ return cls._latin_names_to_species[state_dict["latin_name"]]
+
+ def __hash__(self):
+ return hash(
+ (
+ self.latin_name,
+ tuple(self.synonyms),
+ frozenset(self.reference_assemblies.items()),
+ self.database,
+ )
+ )
+
+
+def normalize_species_name(name):
+ """
+ If species name was "Homo sapiens" then replace spaces with underscores and
+ return "homo_sapiens".
+
+ Also replace common names like "human" with "homo_sapiens".
+ """
+ lower_name = name.lower().strip()
+
+ # if given a common name such as "human", look up its latin equivalent
+ if lower_name in Species._common_names_to_species:
+ return Species._common_names_to_species[lower_name].latin_name
+
+ return lower_name.replace(" ", "_")
+
+
+def find_species_by_name(species_name):
+ latin_name = normalize_species_name(species_name)
+ if latin_name not in Species._latin_names_to_species:
+ raise ValueError(
+ "Species not found: %s, for non-Ensembl data see https://github.com/openvax/pyensembl#non-ensembl-data"
+ % (species_name,)
+ )
+ return Species._latin_names_to_species[latin_name]
+
+
+def check_species_object(species_name_or_object):
+ """
+ Helper for validating user supplied species names or objects.
+
+ Return `Species` Object
+ """
+ if isinstance(species_name_or_object, Species):
+ return species_name_or_object
+ elif isinstance(species_name_or_object, str):
+ return find_species_by_name(species_name_or_object)
+ else:
+ raise ValueError(
+ "Unexpected type for species: %s : %s"
+ % (species_name_or_object, type(species_name_or_object))
+ )
+
+
+for data in SPECIES_DATA:
+ globals()[data["synonyms"][0]] = Species.register(
+ latin_name=data["latin_name"],
+ synonyms=data["synonyms"],
+ reference_assemblies=data["reference_assemblies"],
+ database=data.get("database", None),
+ )
diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py
index 9d30c5c..694e702 100644
--- a/pyensembl/transcript.py
+++ b/pyensembl/transcript.py
@@ -24,18 +24,20 @@ class Transcript(LocusWithGenome):
and not using the sequence, avoid the memory/performance overhead
of fetching and storing sequences from a FASTA file.
"""
+
def __init__(
- self,
- transcript_id,
- transcript_name,
- contig,
- start,
- end,
- strand,
- biotype,
- gene_id,
- genome,
- support_level=None):
+ self,
+ transcript_id,
+ transcript_name,
+ contig,
+ start,
+ end,
+ strand,
+ biotype,
+ gene_id,
+ genome,
+ support_level=None,
+ ):
LocusWithGenome.__init__(
self,
contig=contig,
@@ -43,7 +45,8 @@ def __init__(
end=end,
strand=strand,
biotype=biotype,
- genome=genome)
+ genome=genome,
+ )
self.transcript_id = transcript_id
self.transcript_name = transcript_name
self.gene_id = gene_id
@@ -71,16 +74,18 @@ def __str__(self):
" biotype='%s',"
" contig='%s',"
" start=%d,"
- " end=%d, strand='%s', genome='%s')") % (
- self.transcript_id,
- self.name,
- self.gene_id,
- self.biotype,
- self.contig,
- self.start,
- self.end,
- self.strand,
- self.genome.reference_name)
+ " end=%d, strand='%s', genome='%s')"
+ ) % (
+ self.transcript_id,
+ self.name,
+ self.gene_id,
+ self.biotype,
+ self.contig,
+ self.start,
+ self.end,
+ self.strand,
+ self.genome.reference_name,
+ )
def __len__(self):
"""
@@ -90,9 +95,10 @@ def __len__(self):
def __eq__(self, other):
return (
- other.__class__ is Transcript and
- self.id == other.id and
- self.genome == other.genome)
+ other.__class__ is Transcript
+ and self.id == other.id
+ and self.genome == other.genome
+ )
def __hash__(self):
return hash(self.id)
@@ -123,7 +129,8 @@ def exons(self):
columns,
filter_column="transcript_id",
filter_value=self.id,
- feature="exon")
+ feature="exon",
+ )
# fill this list in its correct order (by exon_number) by using
# the exon_number as a 1-based list offset
@@ -133,15 +140,17 @@ def exons(self):
exon = self.genome.exon_by_id(exon_id)
if exon is None:
raise ValueError(
- "Missing exon %s for transcript %s" % (
- exon_number, self.id))
+ "Missing exon %s for transcript %s"
+ % (exon_number, self.id)
+ )
exon_number = int(exon_number)
if exon_number < 1:
raise ValueError("Invalid exon number: %s" % exon_number)
elif exon_number > len(exons):
raise ValueError(
- "Invalid exon number: %s (max expected = %d)" % (
- exon_number, len(exons)))
+ "Invalid exon number: %s (max expected = %d)"
+ % (exon_number, len(exons))
+ )
# exon_number is 1-based, convert to list index by subtracting 1
exon_idx = exon_number - 1
@@ -164,12 +173,14 @@ def _transcript_feature_position_ranges(self, feature, required=True):
select_column_names=["start", "end"],
filter_column="transcript_id",
filter_value=self.id,
- feature=feature)
+ feature=feature,
+ )
if required and len(results) == 0:
raise ValueError(
- "Transcript %s does not contain feature %s" % (
- self.id, feature))
+ "Transcript %s does not contain feature %s"
+ % (self.id, feature)
+ )
return results
@memoize
@@ -178,19 +189,21 @@ def _transcript_feature_positions(self, feature):
Get unique positions for feature, raise an error if feature is absent.
"""
ranges = self._transcript_feature_position_ranges(
- feature, required=True)
+ feature, required=True
+ )
results = []
# a feature (such as a stop codon), maybe be split over multiple
# contiguous ranges. Collect all the nucleotide positions into a
# single list.
- for (start, end) in ranges:
+ for start, end in ranges:
# since ranges are [inclusive, inclusive] and
# Python ranges are [inclusive, exclusive) we have to increment
# the end position
for position in range(start, end + 1):
if position in results:
raise ValueError(
- "Repeated position %d for %s" % (position, feature))
+ "Repeated position %d for %s" % (position, feature)
+ )
results.append(position)
return results
@@ -207,10 +220,9 @@ def _codon_positions(self, feature):
results = self._transcript_feature_positions(feature)
if len(results) != 3:
raise ValueError(
- "Expected 3 positions for %s of %s but got %d" % (
- feature,
- self.id,
- len(results)))
+ "Expected 3 positions for %s of %s but got %d"
+ % (feature, self.id, len(results))
+ )
return results
@memoized_property
@@ -219,7 +231,8 @@ def contains_start_codon(self):
Does this transcript have an annotated start_codon entry?
"""
start_codons = self._transcript_feature_position_ranges(
- "start_codon", required=False)
+ "start_codon", required=False
+ )
return len(start_codons) > 0
@memoized_property
@@ -228,9 +241,10 @@ def contains_stop_codon(self):
Does this transcript have an annotated stop_codon entry?
"""
stop_codons = self._transcript_feature_position_ranges(
- "stop_codon", required=False)
+ "stop_codon", required=False
+ )
return len(stop_codons) > 0
-
+
@memoized_property
def start_codon_complete(self):
"""
@@ -266,9 +280,10 @@ def exon_intervals(self):
select_column_names=["exon_number", "start", "end"],
filter_column="transcript_id",
filter_value=self.id,
- feature="exon")
+ feature="exon",
+ )
sorted_intervals = [None] * len(results)
- for (exon_number, start, end) in results:
+ for exon_number, start, end in results:
sorted_intervals[int(exon_number) - 1] = (start, end)
return sorted_intervals
@@ -281,15 +296,15 @@ def spliced_offset(self, position):
"""
if type(position) is not int:
raise TypeError(
- "Position argument must be an integer, got %s : %s" % (
- position, type(position)))
+ "Position argument must be an integer, got %s : %s"
+ % (position, type(position))
+ )
if position < self.start or position > self.end:
raise ValueError(
- "Invalid position: %d (must be between %d and %d)" % (
- position,
- self.start,
- self.end))
+ "Invalid position: %d (must be between %d and %d)"
+ % (position, self.start, self.end)
+ )
# offset from beginning of unspliced transcript (including introns)
unspliced_offset = self.offset(position)
@@ -306,7 +321,8 @@ def spliced_offset(self, position):
# Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii...
for exon in self.exons:
exon_unspliced_start, exon_unspliced_end = self.offset_range(
- exon.start, exon.end)
+ exon.start, exon.end
+ )
# If the relative position is not within this exon, keep a running
# total of the total exonic length-so-far.
#
@@ -320,11 +336,13 @@ def spliced_offset(self, position):
exon_offset = unspliced_offset - exon_unspliced_start
return total_spliced_offset + exon_offset
else:
- exon_length = len(exon) # exon_end_position - exon_start_position + 1
+ exon_length = len(
+ exon
+ ) # exon_end_position - exon_start_position + 1
total_spliced_offset += exon_length
raise ValueError(
- "Couldn't find position %d on any exon of %s" % (
- position, self.id))
+ "Couldn't find position %d on any exon of %s" % (position, self.id)
+ )
@memoized_property
def start_codon_unspliced_offsets(self):
@@ -333,9 +351,7 @@ def start_codon_unspliced_offsets(self):
of nucleotides in start codon.
"""
return [
- self.offset(position)
- for position
- in self.start_codon_positions
+ self.offset(position) for position in self.start_codon_positions
]
@memoized_property
@@ -345,9 +361,7 @@ def stop_codon_unspliced_offsets(self):
of nucleotides in stop codon.
"""
return [
- self.offset(position)
- for position
- in self.stop_codon_positions
+ self.offset(position) for position in self.stop_codon_positions
]
def _contiguous_offsets(self, offsets):
@@ -358,8 +372,7 @@ def _contiguous_offsets(self, offsets):
offsets.sort()
for i in range(len(offsets) - 1):
if offsets[i] + 1 != offsets[i + 1]:
- raise ValueError(
- "Offsets not contiguous: %s" % (offsets,))
+ raise ValueError("Offsets not contiguous: %s" % (offsets,))
return offsets
@memoized_property
@@ -370,8 +383,7 @@ def start_codon_spliced_offsets(self):
"""
offsets = [
self.spliced_offset(position)
- for position
- in self.start_codon_positions
+ for position in self.start_codon_positions
]
return self._contiguous_offsets(offsets)
@@ -383,8 +395,7 @@ def stop_codon_spliced_offsets(self):
"""
offsets = [
self.spliced_offset(position)
- for position
- in self.stop_codon_positions
+ for position in self.stop_codon_positions
]
return self._contiguous_offsets(offsets)
@@ -403,11 +414,11 @@ def complete(self):
a coding sequence whose length is divisible by 3
"""
return (
- self.contains_start_codon and
- self.start_codon_complete and
- self.contains_stop_codon and
- self.coding_sequence is not None and
- len(self.coding_sequence) % 3 == 0
+ self.contains_start_codon
+ and self.start_codon_complete
+ and self.contains_stop_codon
+ and self.coding_sequence is not None
+ and len(self.coding_sequence) % 3 == 0
)
@memoized_property
@@ -459,7 +470,7 @@ def coding_sequence(self):
# pylint: disable=invalid-slice-index
# TODO(tavi) Figure out pylint is not happy with this slice
- return self.sequence[start:end + 1]
+ return self.sequence[start : end + 1]
@memoized_property
def five_prime_utr_sequence(self):
@@ -469,7 +480,7 @@ def five_prime_utr_sequence(self):
"""
# pylint: disable=invalid-slice-index
# TODO(tavi) Figure out pylint is not happy with this slice
- return self.sequence[:self.first_start_codon_spliced_offset]
+ return self.sequence[: self.first_start_codon_spliced_offset]
@memoized_property
def three_prime_utr_sequence(self):
@@ -477,7 +488,7 @@ def three_prime_utr_sequence(self):
cDNA sequence of 3' UTR
(untranslated region at the end of the transcript)
"""
- return self.sequence[self.last_stop_codon_spliced_offset + 1:]
+ return self.sequence[self.last_stop_codon_spliced_offset + 1 :]
@memoized_property
def protein_id(self):
@@ -487,7 +498,8 @@ def protein_id(self):
filter_value=self.id,
feature="CDS",
distinct=True,
- required=False)
+ required=False,
+ )
if result_tuple:
return result_tuple[0]
else:
diff --git a/setup.py b/setup.py
index 45dc0a4..65dee28 100644
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,7 @@
# limitations under the License.
from __future__ import print_function
+
import os
import re
diff --git a/tests/test_ucsc_gtf.py b/tests/test_ucsc_gtf.py
index 24e444f..b40c3ff 100644
--- a/tests/test_ucsc_gtf.py
+++ b/tests/test_ucsc_gtf.py
@@ -31,13 +31,11 @@ def test_ucsc_gencode_genome():
genome.index()
genes = genome.genes()
for gene in genes:
- assert gene.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),)
+ assert gene.id, "Gene with missing ID in %s" % (genome,)
assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes)
transcripts = genome.transcripts()
for transcript in transcripts:
- assert transcript.id, "Transcript with missing ID in %s" % (
- genome.gtf.dataframe(),
- )
+ assert transcript.id, "Transcript with missing ID in %s" % (genome,)
assert len(transcripts) == 7, "Expected 7 transcripts, got %d: %s" % (
len(transcripts),
transcripts,