diff --git a/README.md b/README.md index 624d036..d1445fa 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,9 @@ PyPI +# PyEnsembl -PyEnsembl -======= -PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files. +PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files. # Example Usage @@ -25,7 +24,7 @@ data = EnsemblRelease(77) gene_names = data.gene_names_at_locus(contig=6, position=29945884) # get all exons associated with HLA-A -exon_ids = data.exon_ids_of_gene_name('HLA-A') +exon_ids = data.exon_ids_of_gene_name("HLA-A") ``` # Installation @@ -52,6 +51,7 @@ Alternatively, you can create the `EnsemblRelease` object from inside a Python process and call `ensembl_object.download()` followed by `ensembl_object.index()`. ## Cache Location + By default, PyEnsembl uses the platform-specific `Cache` folder and caches the files into the `pyensembl` sub-directory. You can override this default by setting the environment key `PYENSEMBL_CACHE_DIR` @@ -66,11 +66,11 @@ or ```python import os -os.environ['PYENSEMBL_CACHE_DIR'] = '/custom/cache/dir' +os.environ["PYENSEMBL_CACHE_DIR"] = "/custom/cache/dir" # ... 
PyEnsembl API usage ``` -# Usage tips +# Usage tips ## List installed genomes @@ -80,6 +80,7 @@ pyensembl list ```python from pyensembl.shell import collect_all_installed_ensembl_releases + collect_all_installed_ensembl_releases() ``` @@ -87,10 +88,11 @@ collect_all_installed_ensembl_releases() ```python from pyensembl import EnsemblRelease + data = EnsemblRelease( release=100, - species=find_species_by_name('drosophila_melanogaster'), - ) + species=find_species_by_name("drosophila_melanogaster"), +) ``` ## Data structure @@ -98,13 +100,13 @@ data = EnsemblRelease( ### Gene object ```python -gene=data.gene_by_id(gene_id='FBgn0011747') +gene = data.gene_by_id(gene_id="FBgn0011747") ``` ### Transcript object ```python -transcript=gene.transcripts[0] +transcript = gene.transcripts[0] ``` ### Protein information @@ -125,11 +127,12 @@ For example: ```python from pyensembl import Genome + data = Genome( - reference_name='GRCh38', - annotation_name='my_genome_features', + reference_name="GRCh38", + annotation_name="my_genome_features", # annotation_version=None, - gtf_path_or_url='/My/local/gtf/path_to_my_genome_features.gtf', # Path or URL of GTF file + gtf_path_or_url="/My/local/gtf/path_to_my_genome_features.gtf", # Path or URL of GTF file # transcript_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing transcript sequences # protein_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing protein sequences # cache_directory_path=None, # Where to place downloaded and cached files for this genome @@ -142,8 +145,8 @@ gene_names = data.gene_names_at_locus(contig=6, position=29945884) # API The `EnsemblRelease` object has methods to let you access all possible -combinations of the annotation features *gene\_name*, *gene\_id*, -*transcript\_name*, *transcript\_id*, *exon\_id* as well as the location of +combinations of the annotation features _gene_name_, _gene_id_, +_transcript_name_, _transcript_id_, _exon_id_ as well as the 
location of these genomic elements (contig, start position, end position, strand). ## Genes diff --git a/docs/conf.py b/docs/conf.py index bbc0aaf..aefddaa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,47 +18,47 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', + "sphinx.ext.autodoc", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'pyensembl' -copyright = u'2016, Hammer Lab' -author = u'Hammer Lab' +project = "pyensembl" +copyright = "2016, Hammer Lab" +author = "Hammer Lab" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'0.8.10' +version = "0.8.10" # The full version, including alpha/beta/rc tags. 
-release = u'0.8.10' +release = "0.8.10" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -69,37 +69,37 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -109,156 +109,155 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. 
-#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. 
-htmlhelp_basename = 'pyensembldoc' +htmlhelp_basename = "pyensembldoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'pyensembl.tex', u'pyensembl Documentation', - u'Hammer Lab', 'manual'), + ( + master_doc, + "pyensembl.tex", + "pyensembl Documentation", + "Hammer Lab", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, 'pyensembl', u'pyensembl Documentation', - [author], 1) -] +man_pages = [(master_doc, "pyensembl", "pyensembl Documentation", [author], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -267,19 +266,25 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'pyensembl', u'pyensembl Documentation', - author, 'pyensembl', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "pyensembl", + "pyensembl Documentation", + author, + "pyensembl", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index eeb28fb..991af8c 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -10,27 +10,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from .config import MAX_ENSEMBL_RELEASE, MAX_ENSEMBLGENOME_RELEASE from .database import Database from .download_cache import DownloadCache from .ensembl_release import EnsemblRelease, cached_release -from .ensembl_release_versions import MAX_ENSEMBL_RELEASE from .exon import Exon -from .genome import Genome from .gene import Gene +from .genome import Genome from .locus import Locus -from .reference_name import ( - ensembl_grch36, - ensembl_grch37, - ensembl_grch38, - normalize_reference_name, +from .reference_name import ( # ensembl_grch36,; ensembl_grch37,; ensembl_grch38, find_species_by_reference, - which_reference, genome_for_reference_name, + normalize_reference_name, + which_reference, ) - from .search import find_nearest_locus from .sequence_data import SequenceData -from .species import find_species_by_name, check_species_object, normalize_species_name +from .species import ( + check_species_object, + find_species_by_name, + normalize_species_name, +) from .transcript import Transcript from .version import __version__ @@ -41,6 +41,7 @@ "EnsemblRelease", "cached_release", "MAX_ENSEMBL_RELEASE", + "MAX_ENSEMBLGENOME_RELEASE", "Gene", "Transcript", "Exon", @@ -56,7 +57,7 @@ "Genome", "Locus", "Exon", - "ensembl_grch36", - "ensembl_grch37", - "ensembl_grch38", + # "ensembl_grch36", + # "ensembl_grch37", + # "ensembl_grch38", ] diff --git a/pyensembl/common.py b/pyensembl/common.py index ccc5eb1..a9a3964 100644 --- a/pyensembl/common.py +++ b/pyensembl/common.py @@ -11,7 +11,6 @@ # limitations under the License. import pickle - from functools import wraps @@ -28,10 +27,11 @@ def load_pickle(filepath): def _memoize_cache_key(args, kwargs): - """Turn args tuple and kwargs dictionary into a hashable key. + """ + Turn args tuple and kwargs dictionary into a hashable key. - Expects that all arguments to a memoized function are either hashable - or can be uniquely identified from type(arg) and repr(arg). 
+ Expects that all arguments to a memoized function are either + hashable or can be uniquely identified from type(arg) and repr(arg). """ cache_key_list = [] @@ -51,9 +51,9 @@ def _memoize_cache_key(args, kwargs): def memoize(fn): - """Simple reset-able memoization decorator for functions and methods, - assumes that all arguments to the function can be hashed and - compared. + """ + Simple reset-able memoization decorator for functions and methods, assumes + that all arguments to the function can be hashed and compared. """ cache = {} diff --git a/pyensembl/config.py b/pyensembl/config.py new file mode 100644 index 0000000..faaa3a5 --- /dev/null +++ b/pyensembl/config.py @@ -0,0 +1,181 @@ +# TODO: save the config in a YAML or TOML file? + +MIN_ENSEMBL_RELEASE = 54 +MAX_ENSEMBL_RELEASE = 110 +MIN_ENSEMBLGENOME_RELEASE = 50 +MAX_ENSEMBLGENOME_RELEASE = 57 + + +SPECIES_DATA = [ + { + "latin_name": "homo_sapiens", + "synonyms": ["human"], + "reference_assemblies": { + "NCBI36": (54, 54), + "GRCh37": (55, 75), + "GRCh38": (76, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "mus_musculus", + "synonyms": ["mouse", "house mouse"], + "reference_assemblies": { + "NCBIM37": (54, 67), + "GRCm38": (68, 102), + "GRCm39": (103, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "canis_familiaris", + "synonyms": ["dog"], + "reference_assemblies": {"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "felis_catus", + "synonyms": ["cat"], + "reference_assemblies": { + "Felis_catus_6.2": (75, 90), + "Felis_catus_8.0": (91, 92), + "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "gallus_gallus", + "synonyms": ["chicken"], + "reference_assemblies": { + "Galgal4": (75, 85), + "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "rattus_norvegicus", + "synonyms": ["rat", "brown_rat", "lab_rat"], + "reference_assemblies": { + "Rnor_5.0": (75, 79), + "Rnor_6.0": (80, 104), + "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE), 
+ }, + }, + { + "latin_name": "macaca_fascicularis", + "synonyms": ["macaque", "Crab-eating_macaque"], + "reference_assemblies": { + "Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE) + }, + }, + { + "latin_name": "chlorocebus_sabaeus", + "synonyms": ["green_monkey", "african_green_monkey"], + "reference_assemblies": {"ChlSab1.1": (86, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "macaca_mulatta", + "synonyms": ["rhesus"], + "reference_assemblies": {"Mmul_10": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "oryctolagus_cuniculus", + "synonyms": ["rabbit"], + "reference_assemblies": {"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "meriones_unguiculatus", + "synonyms": ["gerbil"], + "reference_assemblies": {"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "mesocricetus_auratus", + "synonyms": ["syrian_hamster"], + "reference_assemblies": {"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "cricetulus_griseus_chok1gshd", + "synonyms": ["chinese_hamster"], + "reference_assemblies": {"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "heterocephalus_glaber_female", + "synonyms": ["naked_mole_rat"], + "reference_assemblies": { + "HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE) + }, + }, + { + "latin_name": "cavia_porcellus", + "synonyms": ["guinea_pig"], + "reference_assemblies": {"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "sus_scrofa", + "synonyms": ["pig"], + "reference_assemblies": {"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "danio_rerio", + "synonyms": ["zebrafish"], + "reference_assemblies": { + "Zv8": (54, 59), + "Zv9": (60, 79), + "GRCz10": (80, 91), + "GRCz11": (92, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "drosophila_melanogaster", + "synonyms": ["drosophila", "fruit fly", "fly"], + "reference_assemblies": { + "BDGP5": (75, 78), + "BDGP6": (79, 95), + "BDGP6.22": (96, 98), + "BDGP6.28": (99, 102), + "BDGP6.32": (103, 
MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "caenorhabditis_elegans", + "synonyms": ["nematode", "C_elegans"], + "reference_assemblies": { + "WS200": (55, 57), + "WS210": (58, 60), + "WS220": (61, 66), + "WBcel235": (67, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "saccharomyces_cerevisiae", + "synonyms": ["yeast", "budding_yeast"], + "reference_assemblies": {"R64-1-1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "arabidopsis_thaliana", + "synonyms": ["cress", "thale_cress", "hehe"], + "reference_assemblies": { + "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + "database": "plants", + }, + { + "latin_name": "oryza_sativa", + "synonyms": ["rice"], + "reference_assemblies": { + "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + "database": "plants", + }, + { + "latin_name": "zea_mays", + "synonyms": ["maize"], + "reference_assemblies": { + "Zm-B73-REFERENCE-NAM-5.0": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + "database": "plants", + }, + { + "latin_name": "glycine_max", + "synonyms": ["soybean"], + "reference_assemblies": { + "Glycine_max_v2.1": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + "database": "plants", + }, +] diff --git a/pyensembl/database.py b/pyensembl/database.py index 4286908..b5fcd99 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -11,16 +11,16 @@ # limitations under the License. 
import logging -from os.path import split, join, exists, splitext import sqlite3 +from os.path import exists, join, split, splitext import datacache +from gtfparse import create_missing_features, read_gtf from typechecks import require_integer, require_string -from gtfparse import read_gtf, create_missing_features from .common import memoize -from .normalization import normalize_chromosome, normalize_strand from .locus import Locus +from .normalization import normalize_chromosome, normalize_strand # any time we update the database schema, increment this version number DATABASE_SCHEMA_VERSION = 3 @@ -31,9 +31,9 @@ class Database(object): """ - Wrapper around sqlite3 database so that the rest of the - library doesn't have to worry about constructing the .db file or - writing SQL queries directly. + Wrapper around sqlite3 database so that the rest of the library doesn't + have to worry about constructing the .db file or writing SQL queries + directly. """ def __init__( @@ -104,8 +104,8 @@ def local_db_path(self): def _all_possible_indices(self, column_names): """ - Create list of tuples containing all possible index groups - we might want to create over tables in this database. + Create list of tuples containing all possible index groups we might + want to create over tables in this database. If a set of genome annotations is missing some column we want to index on, we have to drop any indices which use that column. @@ -136,7 +136,8 @@ def _all_possible_indices(self, column_names): # other GTFs) if column_name not in column_set: logger.info( - "Skipping database index for {%s}", ", ".join(column_group) + "Skipping database index for {%s}", + ", ".join(column_group), ) skip = True if skip: @@ -149,7 +150,8 @@ def _all_possible_indices(self, column_names): PRIMARY_KEY_COLUMNS = {"gene": "gene_id", "transcript": "transcript_id"} def _get_primary_key(self, feature_name, feature_df): - """Name of primary key for a feature table (e.g. 
"gene" -> "gene_id") + """ + Name of primary key for a feature table (e.g. "gene" -> "gene_id") Since we're potentially going to run this code over unseen data, make sure that the primary is unique and never null. @@ -163,18 +165,21 @@ def _get_primary_key(self, feature_name, feature_df): if primary_key_values.isnull().any(): raise ValueError( "Column '%s' can't be primary key of table '%s'" - " because it contains nulls values" % (primary_key, feature_name) + " because it contains nulls values" + % (primary_key, feature_name) ) elif len(primary_key_values.unique()) < len(primary_key_values): raise ValueError( "Column '%s' can't be primary key of table '%s'" - " because it contains repeated values" % (primary_key, feature_name) + " because it contains repeated values" + % (primary_key, feature_name) ) else: return primary_key def _feature_indices(self, all_index_groups, primary_key, feature_df): - """Choose subset of index group tuples from `all_index_groups` which are + """ + Choose subset of index group tuples from `all_index_groups` which are applicable to a particular feature (not same as its primary key, have non-null values). """ @@ -194,9 +199,8 @@ def _feature_indices(self, all_index_groups, primary_key, feature_df): def create(self, overwrite=False): """ - Create the local database (including indexing) if it's not - already set up. If `overwrite` is True, always re-create - the database from scratch. + Create the local database (including indexing) if it's not already set + up. If `overwrite` is True, always re-create the database from scratch. Returns a connection to the database. 
""" @@ -204,8 +208,19 @@ def create(self, overwrite=False): datacache.ensure_dir(self.cache_directory_path) df = self._load_gtf_as_dataframe( - usecols=self.restrict_gtf_columns, features=self.restrict_gtf_features + usecols=self.restrict_gtf_columns, + features=self.restrict_gtf_features, ) + # Some species, such as maize, do not have gene_name and transcript_name + # but do have gene_id and transcript_id; use the IDs as aliases for the names + if "gene_id" in df.columns and "gene_name" not in df.columns: + df["gene_name"] = df["gene_id"] + if ( + "transcript_id" in df.columns + and "transcript_name" not in df.columns + ): + df["transcript_name"] = df["transcript_id"] + all_index_groups = self._all_possible_indices(df.columns) if self.restrict_gtf_features: @@ -261,7 +276,7 @@ def _get_connection(self): @property def connection(self): """ - Get a connection to the database or raise an exception + Get a connection to the database or raise an exception. """ connection = self._get_connection() if connection: @@ -275,6 +290,7 @@ def connection(self): def connect_or_create(self, overwrite=False): """ Return a connection to the database if it exists, otherwise create it. + Overwrite the existing database if `overwrite` is True. """ connection = self._get_connection() @@ -306,8 +322,8 @@ def column_values_at_locus( sorted=False, ): """ - Get the non-null values of a column from the database - at a particular range of loci + Get the non-null values of a column from the database at a particular + range of loci. """ # TODO: combine with the query method, since they overlap @@ -408,8 +424,8 @@ def distinct_column_values_at_locus( def run_sql_query(self, sql, required=False, query_params=[]): """ - Given an arbitrary SQL query, run it against the database - and return the results. + Given an arbitrary SQL query, run it against the database and return + the results. 
Parameters ---------- @@ -454,8 +470,8 @@ def query( required=False, ): """ - Construct a SQL query and run against the sqlite3 database, - filtered both by the feature type and a user-provided column/value. + Construct a SQL query and run against the sqlite3 database, filtered + both by the feature type and a user-provided column/value. """ sql = """ SELECT %s%s @@ -468,7 +484,9 @@ def query( filter_column, ) query_params = [filter_value] - return self.run_sql_query(sql, required=required, query_params=query_params) + return self.run_sql_query( + sql, required=required, query_params=query_params + ) def query_one( self, @@ -490,7 +508,9 @@ def query_one( if len(results) == 0: if required: - raise ValueError("%s not found: %s" % (filter_column, filter_value)) + raise ValueError( + "%s not found: %s" % (filter_column, filter_value) + ) else: return None elif len(results) > 1: @@ -505,8 +525,8 @@ def query_feature_values( self, column, feature, distinct=True, contig=None, strand=None ): """ - Run a SQL query against the sqlite3 database, filtered - only on the feature type. + Run a SQL query against the sqlite3 database, filtered only on the + feature type. """ query = """ SELECT %s%s @@ -541,7 +561,6 @@ def query_loci(self, filter_column, filter_value, feature): """ Query for loci satisfying a given filter and feature type. - Parameters ---------- filter_column : str @@ -571,8 +590,8 @@ def query_loci(self, filter_column, filter_value, feature): def query_locus(self, filter_column, filter_value, feature): """ - Query for unique locus, raises error if missing or more than - one locus in the database. + Query for unique locus, raises error if missing or more than one locus + in the database. Parameters ---------- @@ -588,7 +607,9 @@ def query_locus(self, filter_column, filter_value, feature): Returns single Locus object. 
""" loci = self.query_loci( - filter_column=filter_column, filter_value=filter_value, feature=feature + filter_column=filter_column, + filter_value=filter_value, + feature=feature, ) if len(loci) == 0: @@ -605,7 +626,7 @@ def query_locus(self, filter_column, filter_value, feature): def _load_gtf_as_dataframe(self, usecols=None, features=None): """ - Parse this genome source's GTF file and load it as a Pandas DataFrame + Parse this genome source's GTF file and load it as a Pandas DataFrame. """ logger.info("Reading GTF from %s", self.gtf_path) df = read_gtf( @@ -621,7 +642,9 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None): column_names = set(df.keys()) expect_gene_feature = features is None or "gene" in features - expect_transcript_feature = features is None or "transcript" in features + expect_transcript_feature = ( + features is None or "transcript" in features + ) observed_features = set(df["feature"]) # older Ensembl releases don't have "gene" or "transcript" @@ -635,7 +658,9 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None): dataframe=df, unique_keys={"gene": "gene_id"}, extra_columns={ - "gene": {"gene_name", "gene_biotype"}.intersection(column_names), + "gene": {"gene_name", "gene_biotype"}.intersection( + column_names + ), }, missing_value="", ) diff --git a/pyensembl/download_cache.py b/pyensembl/download_cache.py index c33d6fe..48ebd00 100644 --- a/pyensembl/download_cache.py +++ b/pyensembl/download_cache.py @@ -11,14 +11,13 @@ # limitations under the License. 
+import logging from os import listdir, remove -from os.path import join, exists, split, abspath, isdir +from os.path import abspath, exists, isdir, join, split from shutil import copy2, rmtree -import logging import datacache - logger = logging.getLogger(__name__) CACHE_BASE_SUBDIR = "pyensembl" @@ -29,9 +28,11 @@ def cache_subdirectory( reference_name=None, annotation_name=None, annotation_version=None ): """ - Which cache subdirectory to use for a given annotation database - over a particular reference. All arguments can be omitted to just get - the base subdirectory for all pyensembl cached datasets. + Which cache subdirectory to use for a given annotation database over a + particular reference. + + All arguments can be omitted to just get the base subdirectory for + all pyensembl cached datasets. """ if reference_name is None: reference_name = "" @@ -135,7 +136,7 @@ def cache_directory_path(self): def _fields(self): """ - Fields used for hashing, string representation, equality comparison + Fields used for hashing, string representation, equality comparison. 
""" return ( ( @@ -150,7 +151,10 @@ def _fields(self): ) def __eq__(self, other): - return other.__class__ is DownloadCache and self._fields() == other._fields() + return ( + other.__class__ is DownloadCache + and self._fields() == other._fields() + ) def __hash__(self): return hash(self._fields()) @@ -202,7 +206,9 @@ def cached_path(self, path_or_url): # for stripping decompression extensions for both local # and remote files local_filename = datacache.build_local_filename( - download_url=path_or_url, filename=remote_filename, decompress=False + download_url=path_or_url, + filename=remote_filename, + decompress=False, ) else: local_filename = remote_filename @@ -210,10 +216,14 @@ def cached_path(self, path_or_url): # if we expect the download function to decompress this file then # we should use its name without the compression extension if self.decompress_on_download: - local_filename = self._remove_compression_suffix_if_present(local_filename) + local_filename = self._remove_compression_suffix_if_present( + local_filename + ) if len(local_filename) == 0: - raise ValueError("Can't determine local filename for %s" % (path_or_url,)) + raise ValueError( + "Can't determine local filename for %s" % (path_or_url,) + ) return join(self.cache_directory_path, local_filename) @@ -254,8 +264,8 @@ def download_or_copy_if_necessary( self, path_or_url, download_if_missing=False, overwrite=False ): """ - Download a remote file or copy - Get the local path to a possibly remote file. + Download a remote file or copy a local one. Get the local path to a + possibly remote file. Download if file is missing from the cache directory and `download_if_missing` is True.
Download even if local file exists if @@ -295,7 +305,11 @@ def _raise_missing_file_error(self, missing_urls_dict): raise ValueError(error_message) def local_path_or_install_error( - self, field_name, path_or_url, download_if_missing=False, overwrite=False + self, + field_name, + path_or_url, + download_if_missing=False, + overwrite=False, ): try: return self.download_or_copy_if_necessary( @@ -308,13 +322,13 @@ def local_path_or_install_error( def delete_cached_files(self, prefixes=[], suffixes=[]): """ - Deletes any cached files matching the prefixes or suffixes given + Deletes any cached files matching the prefixes or suffixes given. """ if isdir(self.cache_directory_path): for filename in listdir(): - delete = any([filename.endswith(ext) for ext in suffixes]) or any( - [filename.startswith(pre) for pre in prefixes] - ) + delete = any( + [filename.endswith(ext) for ext in suffixes] + ) or any([filename.startswith(pre) for pre in prefixes]) if delete: path = join(self.cache_directory_path, filename) logger.info("Deleting %s", path) diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 8ad47ab..497b503 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -11,34 +11,23 @@ # limitations under the License. """ -Contains the EnsemblRelease class, which extends the Genome class -to be specific to (a particular release of) Ensembl. +Contains the EnsemblRelease class, which extends the Genome class to be +specific to (a particular release of) Ensembl. 
""" from weakref import WeakValueDictionary +from .ensembl_release_versions import check_release_number +from .ensembl_url_templates import make_fasta_url, make_gtf_url from .genome import Genome -from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE from .species import check_species_object, human -from .ensembl_url_templates import ENSEMBL_FTP_SERVER, make_gtf_url, make_fasta_url - class EnsemblRelease(Genome): """ - Bundles together the genomic annotation and sequence data associated with - a particular release of the Ensembl database. + Bundles together the genomic annotation and sequence data associated with a + particular release of the Ensembl database. """ - @classmethod - def normalize_init_values(cls, release, species, server): - """ - Normalizes the arguments which uniquely specify an EnsemblRelease - genome. - """ - release = check_release_number(release) - species = check_species_object(species) - return (release, species, server) - # Using a WeakValueDictionary instead of an ordinary dict to prevent a # memory leak in cases where we test many different releases in sequence. # When all the references to a particular EnsemblRelease die then that @@ -47,13 +36,21 @@ def normalize_init_values(cls, release, species, server): @classmethod def cached( - cls, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER + cls, + release=None, + species=human, + database=None, + server=None, + # server=ENSEMBL_FTP_SERVER, ): """ Construct EnsemblRelease if it's never been made before, otherwise return an old instance. 
""" - init_args_tuple = cls.normalize_init_values(release, species, server) + species = check_species_object(species) + release = check_release_number(release, species.database) + init_args_tuple = (release, species, database, server) + if init_args_tuple in cls._genome_cache: genome = cls._genome_cache[init_args_tuple] else: @@ -61,14 +58,23 @@ def cached( return genome def __init__( - self, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER + self, + release=None, + species=human, + database=None, + server=None, + # server=EMBL_FTP_SERVER,, ): - self.release, self.species, self.server = self.normalize_init_values( - release=release, species=species, server=server - ) + self.species = check_species_object(species) + self.release = check_release_number(release, self.species.database) + self.database = database + self.server = server self.gtf_url = make_gtf_url( - ensembl_release=self.release, species=self.species, server=self.server + ensembl_release=self.release, + species=self.species.latin_name, + server=self.server, + database=self.species.database, ) self.transcript_fasta_urls = [ @@ -77,12 +83,14 @@ def __init__( species=self.species.latin_name, sequence_type="cdna", server=server, + database=self.species.database, ), make_fasta_url( ensembl_release=self.release, species=self.species.latin_name, sequence_type="ncrna", server=server, + database=self.species.database, ), ] @@ -92,6 +100,7 @@ def __init__( species=self.species.latin_name, sequence_type="pep", server=self.server, + database=self.species.database, ) ] @@ -130,7 +139,11 @@ def __hash__(self): return hash((self.release, self.species)) def to_dict(self): - return {"release": self.release, "species": self.species, "server": self.server} + return { + "release": self.release, + "species": self.species, + "server": self.server, + } @classmethod def from_dict(cls, state_dict): @@ -144,7 +157,9 @@ def cached_release(release, species="human"): """ Create an EnsemblRelease instance only if 
it's hasn't already been made, otherwise returns the old instance. - Keeping this function for backwards compatibility but this functionality - has been moving into the cached method of EnsemblRelease. + + Keeping this function for backwards compatibility but this + functionality has been moving into the cached method of + EnsemblRelease. """ return EnsemblRelease.cached(release=release, species=species) diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index 79649bd..246a380 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -10,23 +10,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -MIN_ENSEMBL_RELEASE = 54 -MAX_ENSEMBL_RELEASE = 110 +from .config import ( + MAX_ENSEMBL_RELEASE, + MAX_ENSEMBLGENOME_RELEASE, + MIN_ENSEMBL_RELEASE, + MIN_ENSEMBLGENOME_RELEASE, +) -def check_release_number(release): +def check_release_number(release, database=None): """ - Check to make sure a release is in the valid range of - Ensembl releases. + Check to make sure a release is in the valid range of Ensembl releases. """ + if release is None: + return ( + MAX_ENSEMBL_RELEASE + if database is None + else MAX_ENSEMBLGENOME_RELEASE + ) try: release = int(release) - except: + except ValueError: raise ValueError("Invalid Ensembl release: %s" % release) - - if release < MIN_ENSEMBL_RELEASE: + if database is None: + min_release = MIN_ENSEMBL_RELEASE + else: + min_release = MIN_ENSEMBLGENOME_RELEASE + if release < min_release: raise ValueError( "Invalid Ensembl releases %d, must be greater than %d" - % (release, MIN_ENSEMBL_RELEASE) + % (release, min_release) ) return release diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index ded3570..e00968b 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -11,19 +11,23 @@ # limitations under the License. 
""" -Templates for URLs and paths to specific relase, species, and file type -on the Ensembl ftp server. +Templates for URLs and paths to specific release, species, and file type on the +Ensembl ftp server. For example, the human chromosomal DNA sequences for release 78 are in: https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/ +For plant, fungi and metazoa species, the URL is as follows: + + https://ftp.ensemblgenomes.ebi.ac.uk/pub/release-57/plants/fasta/glycine_max/cdna/ """ -from .species import Species, find_species_by_name from .ensembl_release_versions import check_release_number +from .species import Species, find_species_by_name ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org" +ENSEMBLGENOME_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk" # Example directories # FASTA files: /pub/release-78/fasta/homo_sapiens/ @@ -31,27 +35,58 @@ FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/" GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/" +DATABASE_FASTA_SUBDIR_TEMPLATE = ( + "/pub/release-%(release)d/%(database)s/fasta/%(species)s/%(type)s/" +) +DATABASE_GTF_SUBDIR_TEMPLATE = ( + "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" +) + +# GTF annotation file example: Homo_sapiens.GRCh38.78.gtf.gz +GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz" + +# cDNA & protein FASTA file for releases before (and including) Ensembl 75 +# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz +OLD_FASTA_FILENAME_TEMPLATE = ( + "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz" +) + +# ncRNA FASTA file for releases before (and including) Ensembl 75 +# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz + +OLD_FASTA_FILENAME_TEMPLATE_NCRNA = ( + "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" +) + +# cDNA & protein FASTA file for releases after Ensembl 75 +# example: Homo_sapiens.GRCh37.cdna.all.fa.gz +NEW_FASTA_FILENAME_TEMPLATE = ( + "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" +) +
+# ncRNA FASTA file for releases after Ensembl 75 +# example: Homo_sapiens.GRCh37.ncrna.fa.gz +NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz" + def normalize_release_properties(ensembl_release, species): """ Make sure a given release is valid, normalize it to be an integer, normalize the species name, and get its associated reference. """ - ensembl_release = check_release_number(ensembl_release) if not isinstance(species, Species): species = find_species_by_name(species) + ensembl_release = check_release_number( + ensembl_release, database=species.database + ) reference_name = species.which_reference(ensembl_release) return ensembl_release, species.latin_name, reference_name -# GTF annotation file example: Homo_sapiens.GTCh38.gtf.gz -GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz" - - def make_gtf_filename(ensembl_release, species): """ Return GTF filename expect on Ensembl FTP server for a specific - species/release combination + species/release combination. """ ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species @@ -63,41 +98,45 @@ def make_gtf_filename(ensembl_release, species): } -def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER): +def make_gtf_url(ensembl_release, species, server=None, database=None): """ Returns a URL and a filename, which can be joined together. 
""" - ensembl_release, species, _ = normalize_release_properties(ensembl_release, species) - subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species} - filename = make_gtf_filename(ensembl_release=ensembl_release, species=species) + if server is None: + if database is None: + server = ENSEMBL_FTP_SERVER + else: + server = ENSEMBLGENOME_FTP_SERVER + ensembl_release, species, _ = normalize_release_properties( + ensembl_release, species + ) + if database is None: + subdir = GTF_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "species": species, + } + else: + subdir = DATABASE_GTF_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "database": database, + "species": species, + } + filename = make_gtf_filename( + ensembl_release=ensembl_release, species=species + ) return server + subdir + filename -# cDNA & protein FASTA file for releases before (and including) Ensembl 75 -# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz -OLD_FASTA_FILENAME_TEMPLATE = ( - "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz" -) - -# ncRNA FASTA file for releases before (and including) Ensembl 75 -# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz - -OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" - -# cDNA & protein FASTA file for releases after Ensembl 75 -# example: Homo_sapiens.GRCh37.cdna.all.fa.gz -NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" - -# ncRNA FASTA file for releases after Ensembl 75 -# example: Homo_sapiens.GRCh37.ncrna.fa.gz -NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz" - - -def make_fasta_filename(ensembl_release, species, sequence_type): +def make_fasta_filename(ensembl_release, species, database, sequence_type): ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species ) - if ensembl_release <= 75: + # for plant database, starting from release 32 (inclusive), the fasta files use the
"old name" + # for releases before 31, the fasta files use the "new name" + # release 31 uses both old and new names + if (ensembl_release <= 75 and database is None) or ( + ensembl_release <= 31 and database is not None + ): if sequence_type == "ncrna": return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % { "Species": species.capitalize(), @@ -125,23 +164,47 @@ def make_fasta_filename(ensembl_release, species, sequence_type): } -def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER): - """Construct URL to FASTA file with cDNA transcript or protein sequences +def make_fasta_url( + ensembl_release, + species, + sequence_type, + server=None, + database=None, +): + """ + Construct URL to FASTA file with cDNA transcript or protein sequences. Parameter examples: ensembl_release = 75 species = "Homo_sapiens" sequence_type = "cdna" (other option: "pep") """ - ensembl_release, species, reference_name = normalize_release_properties( + if server is None: + if database is None: + server = ENSEMBL_FTP_SERVER + else: + server = ENSEMBLGENOME_FTP_SERVER + ensembl_release, species, _ = normalize_release_properties( ensembl_release, species ) - subdir = FASTA_SUBDIR_TEMPLATE % { - "release": ensembl_release, - "species": species, - "type": sequence_type, - } + if database is None: + subdir = FASTA_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "species": species, + "type": sequence_type, + } + else: + subdir = DATABASE_FASTA_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "database": database, + "species": species, + "type": sequence_type, + } + filename = make_fasta_filename( - ensembl_release=ensembl_release, species=species, sequence_type=sequence_type + ensembl_release=ensembl_release, + species=species, + database=database, + sequence_type=sequence_type, ) return server + subdir + filename diff --git a/pyensembl/exon.py b/pyensembl/exon.py index a520290..a84b75f 100644 --- a/pyensembl/exon.py +++ b/pyensembl/exon.py @@ -15,7 +15,9 @@ class
Exon(Locus): - def __init__(self, exon_id, contig, start, end, strand, gene_name, gene_id): + def __init__( + self, exon_id, contig, start, end, strand, gene_name, gene_id + ): Locus.__init__(self, contig, start, end, strand) self.exon_id = exon_id self.gene_name = gene_name diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py index e339a8a..b55750b 100644 --- a/pyensembl/fasta.py +++ b/pyensembl/fasta.py @@ -19,9 +19,8 @@ """ -from gzip import GzipFile import logging - +from gzip import GzipFile logger = logging.getLogger(__name__) @@ -33,7 +32,8 @@ def _parse_header_id(line): """ if type(line) is not bytes: raise TypeError( - "Expected header line to be of type %s but got %s" % (bytes, type(line)) + "Expected header line to be of type %s but got %s" + % (bytes, type(line)) ) if len(line) <= 1: diff --git a/pyensembl/gene.py b/pyensembl/gene.py index f26de48..b787c64 100644 --- a/pyensembl/gene.py +++ b/pyensembl/gene.py @@ -17,7 +17,9 @@ class Gene(LocusWithGenome): - def __init__(self, gene_id, gene_name, contig, start, end, strand, biotype, genome): + def __init__( + self, gene_id, gene_name, contig, start, end, strand, biotype, genome + ): LocusWithGenome.__init__( self, contig=contig, @@ -98,7 +100,8 @@ def transcripts(self): # its particular information, might be more efficient if we # just get all the columns here, but how do we keep that modular? return [ - self.genome.transcript_by_id(result[0]) for result in transcript_id_results + self.genome.transcript_by_id(result[0]) + for result in transcript_id_results ] @memoized_property diff --git a/pyensembl/genome.py b/pyensembl/genome.py index 05b6efc..a5e202d 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -11,8 +11,8 @@ # limitations under the License. """ -Contains the Genome class, with its millions of accessors and wrappers -around an arbitrary genomic database. +Contains the Genome class, with its millions of accessors and wrappers around +an arbitrary genomic database. 
""" @@ -21,8 +21,8 @@ from serializable import Serializable -from .download_cache import DownloadCache from .database import Database +from .download_cache import DownloadCache from .exon import Exon from .gene import Gene from .sequence_data import SequenceData @@ -31,8 +31,8 @@ class Genome(Serializable): """ - Bundles together the genomic annotation and sequence data associated with - a particular genomic database source (e.g. a single Ensembl release) and + Bundles together the genomic annotation and sequence data associated with a + particular genomic database source (e.g. a single Ensembl release) and provides a wide variety of helper methods for accessing this data. """ @@ -148,7 +148,7 @@ def to_dict(self): def _init_lazy_fields(self): """ - Member data that gets loaded or constructed on demand + Member data that gets loaded or constructed on demand. """ self.gtf_path = None self._protein_sequences = None @@ -163,11 +163,15 @@ def _init_lazy_fields(self): self._exons = {} def _get_cached_path( - self, field_name, path_or_url, download_if_missing=False, overwrite=False + self, + field_name, + path_or_url, + download_if_missing=False, + overwrite=False, ): """ - Get the local path for a possibly remote file, invoking either - a download or install error message if it's missing. + Get the local path for a possibly remote file, invoking either a + download or install error message if it's missing. 
""" if len(field_name) == 0: raise ValueError("Expected non-empty field name") @@ -188,7 +192,9 @@ def _get_gtf_path(self, download_if_missing=False, overwrite=False): overwrite=overwrite, ) - def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False): + def _get_transcript_fasta_paths( + self, download_if_missing=False, overwrite=False + ): if not self.requires_transcript_fasta: raise ValueError("No transcript FASTA source for %s" % self) return [ @@ -201,7 +207,9 @@ def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False for path in self._transcript_fasta_paths_or_urls ] - def _get_protein_fasta_paths(self, download_if_missing=False, overwrite=False): + def _get_protein_fasta_paths( + self, download_if_missing=False, overwrite=False + ): # get the path for peptide FASTA files containing # this genome's protein sequences if not self.requires_protein_fasta: @@ -233,7 +241,9 @@ def _set_local_paths(self, download_if_missing=True, overwrite=False): def required_local_files(self): paths = [] if self._gtf_path_or_url: - paths.append(self.download_cache.cached_path(self._gtf_path_or_url)) + paths.append( + self.download_cache.cached_path(self._gtf_path_or_url) + ) if self._transcript_fasta_paths_or_urls: paths.extend( [ @@ -273,8 +283,8 @@ def download(self, overwrite=False): def index(self, overwrite=False): """ Assuming that all necessary data for this Genome has been downloaded, - generate the GTF database and save efficient representation of - FASTA sequence files. + generate the GTF database and save efficient representation of FASTA + sequence files. 
""" if self.requires_gtf: self.db.connect_or_create(overwrite=overwrite) @@ -291,10 +301,13 @@ def db(self): # make sure GTF file exists locally # and populate self.gtf_path self._set_local_paths( - download_if_missing=True, ## if set at False the files are not downloaded in interactive python, works anyways via command line though - overwrite=False) + download_if_missing=True, ## if set at False the files are not downloaded in interactive python, works anyways via command line though + overwrite=False, + ) if self.gtf_path is None: - raise ValueError("Property 'gtf_path' of %s cannot be None" % self) + raise ValueError( + "Property 'gtf_path' of %s cannot be None" % self + ) # Database object turns the GTF dataframes into sqlite3 tables # and wraps them with methods like `query_one` @@ -347,7 +360,8 @@ def protein_sequences(self): self._set_local_paths(download_if_missing=False, overwrite=False) if self.protein_fasta_paths is None: raise ValueError( - "Property 'protein_fasta_paths' of %s cannot be None" % self + "Property 'protein_fasta_paths' of %s cannot be None" + % self ) self._protein_sequences = SequenceData( fasta_paths=self.protein_fasta_paths, @@ -359,13 +373,16 @@ def protein_sequences(self): def transcript_sequences(self): if self._transcript_sequences is None: if not self.requires_transcript_fasta: - raise ValueError("Missing transcript FASTA source for %s" % self) + raise ValueError( + "Missing transcript FASTA source for %s" % self + ) # make sure transcript FASTA file exists locally # and populate self.transcript_fasta_paths self._set_local_paths(download_if_missing=False, overwrite=False) if self.transcript_fasta_paths is None: raise ValueError( - "Property 'transcript_fasta_paths' of %s cannot be None" % (self,) + "Property 'transcript_fasta_paths' of %s cannot be None" + % (self,) ) self._transcript_sequences = SequenceData( fasta_paths=self.transcript_fasta_paths, @@ -375,8 +392,8 @@ def transcript_sequences(self): def install_string(self): """ 
- Add every missing file to the install string shown to the user - in an error message. + Add every missing file to the install string shown to the user in an + error message. """ args = [ "--reference-name", @@ -450,7 +467,7 @@ def __hash__(self): def clear_cache(self): """ - Clear any in-memory cached values + Clear any in-memory cached values. """ for maybe_fn in self.__dict__.values(): # clear cache associated with all memoization decorators, @@ -460,7 +477,7 @@ def clear_cache(self): def delete_index_files(self): """ - Delete all data aside from source GTF and FASTA files + Delete all data aside from source GTF and FASTA files. """ self.clear_cache() db_path = self.db.local_db_path() @@ -471,9 +488,8 @@ def _all_feature_values( self, column, feature, distinct=True, contig=None, strand=None ): """ - Cached lookup of all values for a particular feature property from - the database, caches repeated queries in memory and - stores them as a CSV. + Cached lookup of all values for a particular feature property from the + database, caches repeated queries in memory and stores them as a CSV. Parameters ---------- @@ -504,23 +520,31 @@ def _all_feature_values( ) def transcript_sequence(self, transcript_id): - """Return cDNA nucleotide sequence of transcript, or None if - transcript doesn't have cDNA sequence. + """ + Return cDNA nucleotide sequence of transcript, or None if transcript + doesn't have cDNA sequence. """ if self.transcript_sequences is None: - raise ValueError("No transcript FASTA supplied to this Genome: %s" % self) + raise ValueError( + "No transcript FASTA supplied to this Genome: %s" % self + ) return self.transcript_sequences.get(transcript_id) def protein_sequence(self, protein_id): - """Return cDNA nucleotide sequence of transcript, or None if - transcript doesn't have cDNA sequence. + """ + Return protein sequence for the given protein ID, or None if no sequence + is available for it.
""" if self.protein_sequences is None: - raise ValueError("No protein FASTA supplied to this Genome: %s" % self) + raise ValueError( + "No protein FASTA supplied to this Genome: %s" % self + ) return self.protein_sequences.get(protein_id) def genes_at_locus(self, contig, position, end=None, strand=None): - gene_ids = self.gene_ids_at_locus(contig, position, end=end, strand=strand) + gene_ids = self.gene_ids_at_locus( + contig, position, end=end, strand=strand + ) return [self.gene_by_id(gene_id) for gene_id in gene_ids] def transcripts_at_locus(self, contig, position, end=None, strand=None): @@ -528,11 +552,14 @@ def transcripts_at_locus(self, contig, position, end=None, strand=None): contig, position, end=end, strand=strand ) return [ - self.transcript_by_id(transcript_id) for transcript_id in transcript_ids + self.transcript_by_id(transcript_id) + for transcript_id in transcript_ids ] def exons_at_locus(self, contig, position, end=None, strand=None): - exon_ids = self.exon_ids_at_locus(contig, position, end=end, strand=strand) + exon_ids = self.exon_ids_at_locus( + contig, position, end=end, strand=strand + ) return [self.exon_by_id(exon_id) for exon_id in exon_ids] def gene_ids_at_locus(self, contig, position, end=None, strand=None): @@ -575,7 +602,9 @@ def transcript_ids_at_locus(self, contig, position, end=None, strand=None): strand=strand, ) - def transcript_names_at_locus(self, contig, position, end=None, strand=None): + def transcript_names_at_locus( + self, contig, position, end=None, strand=None + ): return self.db.distinct_column_values_at_locus( column="transcript_name", feature="transcript", @@ -605,7 +634,7 @@ def protein_ids_at_locus(self, contig, position, end=None, strand=None): def locus_of_gene_id(self, gene_id): """ - Given a gene ID returns Locus with: chromosome, start, stop, strand + Given a gene ID returns Locus with: chromosome, start, stop, strand. 
""" return self.db.query_locus( filter_column="gene_id", filter_value=gene_id, feature="gene" @@ -614,9 +643,9 @@ def locus_of_gene_id(self, gene_id): def loci_of_gene_names(self, gene_name): """ Given a gene name returns list of Locus objects with fields: - chromosome, start, stop, strand - You can get multiple results since a gene might have multiple copies - in the genome. + + chromosome, start, stop, strand You can get multiple results + since a gene might have multiple copies in the genome. """ return self.db.query_loci("gene_name", gene_name, "gene") @@ -629,7 +658,7 @@ def locus_of_transcript_id(self, transcript_id): def locus_of_exon_id(self, exon_id): """ - Given an exon ID returns Locus + Given an exon ID returns Locus. """ return self.db.query_locus("exon_id", exon_id, feature="exon") @@ -641,8 +670,8 @@ def locus_of_exon_id(self, exon_id): def contigs(self): """ - Returns all contig names for any gene in the genome - (field called "seqname" in Ensembl GTF files) + Returns all contig names for any gene in the genome (field called + "seqname" in Ensembl GTF files) """ return self.db.query_feature_values("seqname", "gene") @@ -703,7 +732,9 @@ def gene_by_id(self, gene_id): gene_name, gene_biotype = None, None if len(result) < 4 or len(result) > 6: - raise ValueError("Result is not the expected length: %d" % len(result)) + raise ValueError( + "Result is not the expected length: %d" % len(result) + ) contig, start, end, strand = result[:4] if len(result) == 5: if "gene_name" in field_names: @@ -737,8 +768,8 @@ def genes_by_name(self, gene_name): def gene_by_protein_id(self, protein_id): """ - Get the gene ID associated with the given protein ID, - return its Gene object + Get the gene ID associated with the given protein ID, return its Gene + object. 
""" gene_id = self.gene_id_of_protein_id(protein_id) return self.gene_by_id(gene_id) @@ -762,8 +793,8 @@ def _query_gene_name(self, property_name, property_value, feature_type): def gene_names(self, contig=None, strand=None): """ - Return all genes in the database, - optionally restrict to a chromosome and/or strand. + Return all genes in the database, optionally restrict to a chromosome + and/or strand. """ return self._all_feature_values( column="gene_name", feature="gene", contig=contig, strand=strand @@ -773,10 +804,14 @@ def gene_name_of_gene_id(self, gene_id): return self._query_gene_name("gene_id", gene_id, "gene") def gene_name_of_transcript_id(self, transcript_id): - return self._query_gene_name("transcript_id", transcript_id, "transcript") + return self._query_gene_name( + "transcript_id", transcript_id, "transcript" + ) def gene_name_of_transcript_name(self, transcript_name): - return self._query_gene_name("transcript_name", transcript_name, "transcript") + return self._query_gene_name( + "transcript_name", transcript_name, "transcript" + ) def gene_name_of_exon_id(self, exon_id): return self._query_gene_name("exon_id", exon_id, "exon") @@ -800,8 +835,8 @@ def _query_gene_ids(self, property_name, value, feature="gene"): def gene_ids(self, contig=None, strand=None): """ - What are all the gene IDs - (optionally restrict to a given chromosome/contig and/or strand) + What are all the gene IDs (optionally restrict to a given + chromosome/contig and/or strand) """ return self._all_feature_values( column="gene_id", feature="gene", contig=contig, strand=strand @@ -810,6 +845,7 @@ def gene_ids(self, contig=None, strand=None): def gene_ids_of_gene_name(self, gene_name): """ What are the gene IDs associated with a given gene name? 
+ (due to copy events, there might be multiple genes per name) """ results = self._query_gene_ids("gene_name", gene_name) @@ -842,17 +878,21 @@ def gene_id_of_protein_id(self, protein_id): def transcripts(self, contig=None, strand=None): """ - Construct Transcript object for every transcript entry in - the database. Optionally restrict to a particular - chromosome using the `contig` argument. + Construct Transcript object for every transcript entry in the database. + + Optionally restrict to a particular chromosome using the + `contig` argument. """ transcript_ids = self.transcript_ids(contig=contig, strand=strand) return [ - self.transcript_by_id(transcript_id) for transcript_id in transcript_ids + self.transcript_by_id(transcript_id) + for transcript_id in transcript_ids ] def transcript_by_id(self, transcript_id): - """Construct Transcript object with given transcript ID""" + """ + Construct Transcript object with given transcript ID. + """ if transcript_id not in self._transcripts: optional_field_names = [ "transcript_name", @@ -885,8 +925,12 @@ def transcript_by_id(self, transcript_id): raise ValueError("Transcript not found: %s" % (transcript_id,)) transcript_name, transcript_biotype, tsl = None, None, None - if len(result) < 5 or len(result) > (5 + len(optional_field_names)): - raise ValueError("Result is not the expected length: %d" % len(result)) + if len(result) < 5 or len(result) > ( + 5 + len(optional_field_names) + ): + raise ValueError( + "Result is not the expected length: %d" % len(result) + ) contig, start, end, strand, gene_id = result[:5] if len(result) > 5: extra_field_names = [ @@ -895,8 +939,10 @@ def transcript_by_id(self, transcript_id): extra_data = dict(zip(extra_field_names, result[5:])) transcript_name = extra_data.get("transcript_name") transcript_biotype = extra_data.get("transcript_biotype") - tsl = extra_data.get("transcript_support_level") - if not tsl or tsl == "NA": + tsl = extra_data.get("transcript_support_level", "NA") + if 
tsl: + tsl = tsl.split(" ")[0] + if not tsl or tsl == "NA" or not tsl.isnumeric(): tsl = None else: tsl = int(tsl) @@ -917,9 +963,12 @@ def transcript_by_id(self, transcript_id): return self._transcripts[transcript_id] def transcripts_by_name(self, transcript_name): - transcript_ids = self.transcript_ids_of_transcript_name(transcript_name) + transcript_ids = self.transcript_ids_of_transcript_name( + transcript_name + ) return [ - self.transcript_by_id(transcript_id) for transcript_id in transcript_ids + self.transcript_by_id(transcript_id) + for transcript_id in transcript_ids ] def transcript_by_protein_id(self, protein_id): @@ -945,25 +994,31 @@ def _query_transcript_names(self, property_name, value): def transcript_names(self, contig=None, strand=None): """ - What are all the transcript names in the database - (optionally, restrict to a given chromosome and/or strand) + What are all the transcript names in the database (optionally, restrict + to a given chromosome and/or strand) """ return self._all_feature_values( - column="transcript_name", feature="transcript", contig=contig, strand=strand + column="transcript_name", + feature="transcript", + contig=contig, + strand=strand, ) def transcript_names_of_gene_name(self, gene_name): return self._query_transcript_names("gene_name", gene_name) def transcript_name_of_transcript_id(self, transcript_id): - transcript_names = self._query_transcript_names("transcript_id", transcript_id) + transcript_names = self._query_transcript_names( + "transcript_id", transcript_id + ) if len(transcript_names) == 0: raise ValueError( "No transcript names for transcript ID = %s" % transcript_id ) elif len(transcript_names) > 1: raise ValueError( - "Multiple transcript names for transcript ID = %s" % (transcript_id,) + "Multiple transcript names for transcript ID = %s" + % (transcript_id,) ) return transcript_names[0] @@ -973,7 +1028,9 @@ def transcript_name_of_transcript_id(self, transcript_id): # 
################################################### - def _query_transcript_ids(self, property_name, value, feature="transcript"): + def _query_transcript_ids( + self, property_name, value, feature="transcript" + ): results = self.db.query( select_column_names=["transcript_id"], filter_column=property_name, @@ -986,7 +1043,10 @@ def _query_transcript_ids(self, property_name, value, feature="transcript"): def transcript_ids(self, contig=None, strand=None): return self._all_feature_values( - column="transcript_id", feature="transcript", contig=contig, strand=strand + column="transcript_id", + feature="transcript", + contig=contig, + strand=strand, ) def transcript_ids_of_gene_id(self, gene_id): @@ -1005,7 +1065,9 @@ def transcript_id_of_protein_id(self, protein_id): """ What is the transcript ID associated with a given protein ID? """ - results = self._query_transcript_ids("protein_id", protein_id, feature="CDS") + results = self._query_transcript_ids( + "protein_id", protein_id, feature="CDS" + ) if len(results) == 0: raise ValueError("Protein ID not found: %s" % protein_id) elif len(results) > 1: @@ -1026,15 +1088,16 @@ def transcript_id_of_protein_id(self, protein_id): def exons(self, contig=None, strand=None): """ - Create exon object for all exons in the database, optionally - restrict to a particular chromosome using the `contig` argument. + Create exon object for all exons in the database, optionally restrict + to a particular chromosome using the `contig` argument. """ # DataFrame with single column called "exon_id" exon_ids = self.exon_ids(contig=contig, strand=strand) return [self.exon_by_id(exon_id) for exon_id in exon_ids] def exon_by_id(self, exon_id): - """Construct an Exon object from its ID by looking up the exon"s + """ + Construct an Exon object from its ID by looking up the exon's properties in the given Database.
""" if exon_id not in self._exons: @@ -1109,8 +1172,8 @@ def exon_ids_of_transcript_id(self, transcript_id): def protein_ids(self, contig=None, strand=None): """ - What are all the protein IDs - (optionally restrict to a given chromosome and/or strand) + What are all the protein IDs (optionally restrict to a given chromosome + and/or strand) """ protein_ids = self._all_feature_values( column="protein_id", diff --git a/pyensembl/locus.py b/pyensembl/locus.py index b88b4a3..c087183 100644 --- a/pyensembl/locus.py +++ b/pyensembl/locus.py @@ -49,7 +49,8 @@ def __init__(self, contig, start, end, strand): if end < start: raise ValueError( - "Expected start <= end, got start = %d, end = %d" % (start, end) + "Expected start <= end, got start = %d, end = %d" + % (start, end) ) self.start = start self.end = end @@ -149,7 +150,9 @@ def offset_range(self, start, end): ) if start < self.start or end > self.end: - raise ValueError("Range (%d, %d) falls outside %s" % (start, end, self)) + raise ValueError( + "Range (%d, %d) falls outside %s" % (start, end, self) + ) if self.on_forward_strand: return (start - self.start, end - self.start) @@ -183,7 +186,9 @@ def can_overlap(self, contig, strand=None): """ Is this locus on the same contig and (optionally) on the same strand? 
""" - return self.on_contig(contig) and (strand is None or self.on_strand(strand)) + return self.on_contig(contig) and ( + strand is None or self.on_strand(strand) + ) def distance_to_interval(self, start, end): """ @@ -220,15 +225,23 @@ def overlaps(self, contig, start, end, strand=None): def overlaps_locus(self, other_locus): return self.overlaps( - other_locus.contig, other_locus.start, other_locus.end, other_locus.strand + other_locus.contig, + other_locus.start, + other_locus.end, + other_locus.strand, ) def contains(self, contig, start, end, strand=None): return ( - self.can_overlap(contig, strand) and start >= self.start and end <= self.end + self.can_overlap(contig, strand) + and start >= self.start + and end <= self.end ) def contains_locus(self, other_locus): return self.contains( - other_locus.contig, other_locus.start, other_locus.end, other_locus.strand + other_locus.contig, + other_locus.start, + other_locus.end, + other_locus.strand, ) diff --git a/pyensembl/locus_with_genome.py b/pyensembl/locus_with_genome.py index 33dd38d..338a222 100644 --- a/pyensembl/locus_with_genome.py +++ b/pyensembl/locus_with_genome.py @@ -16,8 +16,8 @@ class LocusWithGenome(Locus): """ - Common base class for Gene and Transcript to avoid copying - their shared logic. + Common base class for Gene and Transcript to avoid copying their shared + logic. """ def __init__(self, contig, start, end, strand, biotype, genome): @@ -39,16 +39,17 @@ def to_dict(self): @property def is_protein_coding(self): """ - We're not counting immunoglobulin-like genes from the T-cell receptor or - or antibodies since they occur in fragments that must be recombined. - It might be worth consider counting non-sense mediated decay and - non-stop decay since variants in these could potentially make a - functional protein. 
To read more about the biotypes used in Ensembl: - http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html - http://www.gencodegenes.org/gencode_biotypes.html - - For now let's stick with the simple category of 'protein_coding', which - means that there is an open reading frame in this gene/transcript - whose successful transcription has been observed. + We're not counting immunoglobulin-like genes from the T-cell receptor + or antibodies since they occur in fragments that must be recombined. + It might be worth considering counting non-sense mediated decay and + non-stop decay since variants in these could potentially make a functional + protein. To read more about the biotypes used in Ensembl: + http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html + http://www.gencodegenes.org/gencode_biotypes.html + + For now let's stick with the simple category of + 'protein_coding', which means that there is an open reading + frame in this gene/transcript whose successful transcription has + been observed. """ - return self.biotype == "protein_coding" + return self.biotype == "protein_coding" diff --git a/pyensembl/normalization.py b/pyensembl/normalization.py index fb0cc33..81f65c5 100644 --- a/pyensembl/normalization.py +++ b/pyensembl/normalization.py @@ -11,7 +11,8 @@ # limitations under the License. from sys import intern -from typechecks import is_string, is_integer + +from typechecks import is_integer, is_string # Manually memoizing here, since our simple common.memoize function has # noticable overhead in this instance.
diff --git a/pyensembl/reference_name.py b/pyensembl/reference_name.py index 1b7639d..5731d80 100644 --- a/pyensembl/reference_name.py +++ b/pyensembl/reference_name.py @@ -29,7 +29,9 @@ def normalize_reference_name(name): def find_species_by_reference(reference_name): - return Species._reference_names_to_species[normalize_reference_name(reference_name)] + return Species._reference_names_to_species[ + normalize_reference_name(reference_name) + ] def which_reference(species_name, ensembl_release): @@ -42,7 +44,9 @@ def max_ensembl_release(reference_name): return max_release -def genome_for_reference_name(reference_name, allow_older_downloaded_release=True): +def genome_for_reference_name( + reference_name, allow_older_downloaded_release=True +): """ Given a genome reference name, such as "GRCh38", returns the corresponding Ensembl Release object. @@ -60,7 +64,9 @@ def genome_for_reference_name(reference_name, allow_older_downloaded_release=Tru ] if allow_older_downloaded_release: # go through candidate releases in descending order - for release in reversed(range(min_ensembl_release, max_ensembl_release + 1)): + for release in reversed( + range(min_ensembl_release, max_ensembl_release + 1) + ): # check if release has been locally downloaded candidate = EnsemblRelease.cached(release=release, species=species) if candidate.required_local_files_exist(): @@ -70,6 +76,6 @@ def genome_for_reference_name(reference_name, allow_older_downloaded_release=Tru return EnsemblRelease.cached(release=max_ensembl_release, species=species) -ensembl_grch36 = genome_for_reference_name("ncbi36") -ensembl_grch37 = genome_for_reference_name("grch37") -ensembl_grch38 = genome_for_reference_name("grch38") +# ensembl_grch36 = genome_for_reference_name("ncbi36") +# ensembl_grch37 = genome_for_reference_name("grch37") +# ensembl_grch38 = genome_for_reference_name("grch38") diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py index 631c748..e18a9e8 100644 --- 
a/pyensembl/sequence_data.py +++ b/pyensembl/sequence_data.py @@ -10,14 +10,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from os import remove -from os.path import exists, abspath, split, join import logging -from collections import Counter import pickle -from .common import load_pickle, dump_pickle -from .fasta import parse_fasta_dictionary +from collections import Counter +from os import remove +from os.path import abspath, exists, join, split +from .common import dump_pickle, load_pickle +from .fasta import parse_fasta_dictionary logger = logging.getLogger(__name__) @@ -32,10 +32,14 @@ def __init__(self, fasta_paths, cache_directory_path=None): fasta_paths = [fasta_paths] self.fasta_paths = [abspath(path) for path in fasta_paths] - self.fasta_directory_paths = [split(path)[0] for path in self.fasta_paths] + self.fasta_directory_paths = [ + split(path)[0] for path in self.fasta_paths + ] self.fasta_filenames = [split(path)[1] for path in self.fasta_paths] if cache_directory_path: - self.cache_directory_paths = [cache_directory_path] * len(self.fasta_paths) + self.cache_directory_paths = [cache_directory_path] * len( + self.fasta_paths + ) else: self.cache_directory_paths = self.fasta_directory_paths for path in self.fasta_paths: @@ -104,7 +108,9 @@ def _load_or_create_fasta_dictionary_pickle(self): try: fasta_dictionary_tmp = load_pickle(pickle_path) self._add_to_fasta_dictionary(fasta_dictionary_tmp) - logger.info("Loaded sequence dictionary from %s", pickle_path) + logger.info( + "Loaded sequence dictionary from %s", pickle_path + ) continue except (pickle.UnpicklingError, AttributeError): # catch either an UnpicklingError or an AttributeError diff --git a/pyensembl/shell.py b/pyensembl/shell.py old mode 100755 new mode 100644 index cd7ab3c..546dfa9 --- a/pyensembl/shell.py +++ b/pyensembl/shell.py @@ -30,6 +30,9 @@ To list all installed genomes: %(prog)s list +To list all available genomes: + 
%(prog)s available + To install a genome from source files: %(prog)s install \ --reference-name "GRCh38" \ @@ -40,14 +43,18 @@ import argparse import logging.config -import pkg_resources import os -from .ensembl_release import EnsemblRelease, MAX_ENSEMBL_RELEASE +import pkg_resources + +from .config import MAX_ENSEMBL_RELEASE +from .ensembl_release import EnsemblRelease from .genome import Genome -from .species import Species +from .species import Species, normalize_species_name -logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf")) +logging.config.fileConfig( + pkg_resources.resource_filename(__name__, "logging.conf") +) logger = logging.getLogger(__name__) @@ -94,7 +101,9 @@ ) path_group.add_argument( - "--annotation-name", default=None, help="Name of annotation source (e.g. refseq)" + "--annotation-name", + default=None, + help="Name of annotation source (e.g. refseq)", ) path_group.add_argument( @@ -140,6 +149,7 @@ "delete-all-files", "delete-index-files", "list", + "available", ), help=( '"install" will download and index any data that is not ' @@ -151,6 +161,25 @@ ) +def collect_all_available_ensembl_releases(): + for species_name in Species.all_registered_latin_names(): + species = Species._latin_names_to_species[species_name] + # print in tree format + print( + "* " + + species_name + + " (" + + ", ".join(species.synonyms) + + ")" + + ":" + ) + for ( + release_name, + release_range, + ) in species.reference_assemblies.items(): + print(" * " + release_name + ":", release_range) + + def collect_all_installed_ensembl_releases(): genomes = [] for species, release in Species.all_species_release_pairs(): @@ -164,11 +193,26 @@ def all_combinations_of_ensembl_genomes(args): """ Use all combinations of species and release versions specified by the commandline arguments to return a list of EnsemblRelease or Genome objects. 
- The results will typically be of type EnsemblRelease unless the + The results will typically be of type EnsemblRelease unless the. + --custom-mirror argument was given. """ species_list = args.species if args.species else ["human"] - release_list = args.release if args.release else [MAX_ENSEMBL_RELEASE] + + release_list = ( + args.release + if args.release + else [ + max( + i + for _, i in Species._latin_names_to_species[ + normalize_species_name(species_name) + ].reference_assemblies.values() + ) + for species_name in species_list + ] + ) + genomes = [] for species in species_list: # Otherwise, use Ensembl release information @@ -182,11 +226,13 @@ def all_combinations_of_ensembl_genomes(args): # URL to be a directory with all the same filenames as # would be provided by Ensembl gtf_url = os.path.join( - args.custom_mirror, os.path.basename(ensembl_release.gtf_url) + args.custom_mirror, + os.path.basename(ensembl_release.gtf_url), ) transcript_fasta_urls = [ os.path.join( - args.custom_mirror, os.path.basename(transcript_fasta_url) + args.custom_mirror, + os.path.basename(transcript_fasta_url), ) for transcript_fasta_url in ensembl_release.transcript_fasta_urls ] @@ -244,7 +290,9 @@ def collect_selected_genomes(args): def run(): args = parser.parse_args() - if args.action == "list": + if args.action == "available": + collect_all_available_ensembl_releases() + elif args.action == "list": # TODO: how do we also identify which non-Ensembl genomes are # installed? 
genomes = collect_all_installed_ensembl_releases() diff --git a/pyensembl/species.py b/pyensembl/species.py index a236bb1..cb78766 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -12,7 +12,7 @@ from serializable import Serializable -from .ensembl_release_versions import MAX_ENSEMBL_RELEASE +from .config import SPECIES_DATA # TODO: replace Serializable with data class @@ -30,15 +30,18 @@ class Species(Serializable): _reference_names_to_species = {} @classmethod - def register(cls, latin_name, synonyms, reference_assemblies): + def register( + cls, latin_name, synonyms, reference_assemblies, database=None + ): """ - Create a Species object from the given arguments and enter into - all the dicts used to look the species up by its fields. + Create a Species object from the given arguments and enter into all the + dicts used to look the species up by its fields. """ species = Species( latin_name=latin_name, synonyms=synonyms, reference_assemblies=reference_assemblies, + database=database, ) cls._latin_names_to_species[species.latin_name] = species for synonym in synonyms: @@ -71,8 +74,8 @@ def all_registered_latin_names(cls): @classmethod def all_species_release_pairs(cls): """ - Generator which yields (species, release) pairs - for all possible combinations. + Generator which yields (species, release) pairs for all possible + combinations. 
""" for species_name in cls.all_registered_latin_names(): species = cls._latin_names_to_species[species_name] @@ -80,7 +83,9 @@ def all_species_release_pairs(cls): for release in range(release_range[0], release_range[1] + 1): yield species_name, release - def __init__(self, latin_name, synonyms=[], reference_assemblies={}): + def __init__( + self, latin_name, synonyms=[], reference_assemblies={}, database=None + ): """ Parameters ---------- @@ -95,6 +100,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}): self.latin_name = latin_name.lower().replace(" ", "_") self.synonyms = synonyms self.reference_assemblies = reference_assemblies + self.database = database self._release_to_genome = {} for genome_name, (start, end) in self.reference_assemblies.items(): for i in range(start, end + 1): @@ -114,10 +120,14 @@ def which_reference(self, ensembl_release): return self._release_to_genome[ensembl_release] def __str__(self): - return "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % ( - self.latin_name, - self.synonyms, - self.reference_assemblies, + return ( + "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s, database=%s)" + % ( + self.latin_name, + self.synonyms, + self.reference_assemblies, + self.database, + ) ) def __eq__(self, other): @@ -126,6 +136,7 @@ def __eq__(self, other): and self.latin_name == other.latin_name and self.synonyms == other.synonyms and self.reference_assemblies == other.reference_assemblies + and self.database == other.database ) def to_dict(self): @@ -141,15 +152,17 @@ def __hash__(self): self.latin_name, tuple(self.synonyms), frozenset(self.reference_assemblies.items()), + self.database, ) ) def normalize_species_name(name): """ - If species name was "Homo sapiens" then replace spaces with underscores - and return "homo_sapiens". Also replace common names like "human" with - "homo_sapiens". + If species name was "Homo sapiens" then replace spaces with underscores and + return "homo_sapiens". 
+ + Also replace common names like "human" with "homo_sapiens". """ lower_name = name.lower().strip() @@ -173,6 +186,8 @@ def find_species_by_name(species_name): def check_species_object(species_name_or_object): """ Helper for validating user supplied species names or objects. + + Return `Species` Object """ if isinstance(species_name_or_object, Species): return species_name_or_object @@ -185,168 +200,10 @@ def check_species_object(species_name_or_object): ) -human = Species.register( - latin_name="homo_sapiens", - synonyms=["human"], - reference_assemblies={ - "GRCh38": (76, MAX_ENSEMBL_RELEASE), - "GRCh37": (55, 75), - "NCBI36": (54, 54), - }, -) - -mouse = Species.register( - latin_name="mus_musculus", - synonyms=["mouse", "house mouse"], - reference_assemblies={ - "NCBIM37": (54, 67), - "GRCm38": (68, 102), - "GRCm39": (103, MAX_ENSEMBL_RELEASE), - }, -) - -dog = Species.register( - latin_name="canis_familiaris", - synonyms=["dog"], - reference_assemblies={"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)}, -) - -cat = Species.register( - latin_name="felis_catus", - synonyms=["cat"], - reference_assemblies={ - "Felis_catus_6.2": (75, 90), - "Felis_catus_8.0": (91, 92), - "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE), - }, -) - -chicken = Species.register( - latin_name="gallus_gallus", - synonyms=["chicken"], - reference_assemblies={ - "Galgal4": (75, 85), - "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE), - }, -) - -# Does the black rat (Rattus Rattus) get used for research too? 
-brown_rat = Species.register( - latin_name="rattus_norvegicus", - synonyms=["brown rat", "lab rat", "rat"], - reference_assemblies={ - "Rnor_5.0": (75, 79), - "Rnor_6.0": (80, 104), - "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE), - }, -) - -macaque = Species.register( - latin_name="macaca_fascicularis", - synonyms=["macaque", "Crab-eating macaque"], - reference_assemblies={ - "Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE), - }, -) - -green_monkey = Species.register( - latin_name="chlorocebus_sabaeus", - synonyms=["green_monkey", "african_green_monkey"], - reference_assemblies={ - "ChlSab1.1": (86, MAX_ENSEMBL_RELEASE), - }, -) - -rhesus = Species.register( - latin_name="macaca_mulatta", - synonyms=["rhesus"], - reference_assemblies={"Mmul_10": (75, MAX_ENSEMBL_RELEASE)}, -) - -rabbit = Species.register( - latin_name="oryctolagus_cuniculus", - synonyms=["rabbit"], - reference_assemblies={"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -gerbil = Species.register( - latin_name="meriones_unguiculatus", - synonyms=["gerbil"], - reference_assemblies={"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -syrian_hamster = Species.register( - latin_name="mesocricetus_auratus", - synonyms=["syrian_hamster"], - reference_assemblies={"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -chinese_hamster = Species.register( - latin_name="cricetulus_griseus_chok1gshd", - synonyms=["chinese_hamster"], - reference_assemblies={"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)}, -) - -naked_mole_rat = Species.register( - latin_name="heterocephalus_glaber_female", - synonyms=["naked_mole_rat"], - reference_assemblies={"HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -guinea_pig = Species.register( - latin_name="cavia_porcellus", - synonyms=["guinea_pig"], - reference_assemblies={"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -pig = Species.register( - latin_name="sus_scrofa", - synonyms=["pig"], - reference_assemblies={"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)}, -) - -zebrafish = Species.register( 
- latin_name="danio_rerio", - synonyms=["zebrafish"], - reference_assemblies={ - "ZFISH7": (47, 53), - "Zv8": (54, 59), - "Zv9": (60, 79), - "GRCz10": (80, 91), - "GRCz11": (92, MAX_ENSEMBL_RELEASE), - }, -) - -fly = Species.register( - latin_name="drosophila_melanogaster", - synonyms=["drosophila", "fruit fly", "fly"], - reference_assemblies={ - "BDGP5": (75, 78), - "BDGP6": (79, 95), - "BDGP6.22": (96, 98), - "BDGP6.28": (99, 102), - "BDGP6.32": (103, MAX_ENSEMBL_RELEASE), - }, -) - -nematode = Species.register( - latin_name="caenorhabditis_elegans", - synonyms=["nematode", "C_elegans"], - reference_assemblies={ - "WS180": (47, 49), - "WS190": (50, 54), - "WS200": (55, 57), - "WS210": (58, 59), - "WS220": (61, 66), - "WBcel215": (67, 70), - "WBcel235": (71, MAX_ENSEMBL_RELEASE), - }, -) - -yeast = Species.register( - latin_name="saccharomyces_cerevisiae", - synonyms=["yeast", "budding_yeast"], - reference_assemblies={ - "R64-1-1": (76, MAX_ENSEMBL_RELEASE), - }, -) +for data in SPECIES_DATA: + globals()[data["synonyms"][0]] = Species.register( + latin_name=data["latin_name"], + synonyms=data["synonyms"], + reference_assemblies=data["reference_assemblies"], + database=data.get("database", None), + ) diff --git a/pyensembl/species.py.orig b/pyensembl/species.py.orig new file mode 100644 index 0000000..cb78766 --- /dev/null +++ b/pyensembl/species.py.orig @@ -0,0 +1,209 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from serializable import Serializable + +from .config import SPECIES_DATA + +# TODO: replace Serializable with data class + + +class Species(Serializable): + """ + Container for combined information about a species name, its synonyn names + and which reference to use for this species in each Ensembl release. + """ + + # as species instances get created, they get registered in these + # dictionaries + _latin_names_to_species = {} + _common_names_to_species = {} + _reference_names_to_species = {} + + @classmethod + def register( + cls, latin_name, synonyms, reference_assemblies, database=None + ): + """ + Create a Species object from the given arguments and enter into all the + dicts used to look the species up by its fields. + """ + species = Species( + latin_name=latin_name, + synonyms=synonyms, + reference_assemblies=reference_assemblies, + database=database, + ) + cls._latin_names_to_species[species.latin_name] = species + for synonym in synonyms: + if synonym in cls._common_names_to_species: + raise ValueError( + "Can't use synonym '%s' for both %s and %s" + % (synonym, species, cls._common_names_to_species[synonym]) + ) + cls._common_names_to_species[synonym] = species + for reference_name in reference_assemblies: + if reference_name in cls._reference_names_to_species: + raise ValueError( + "Can't use reference '%s' for both %s and %s" + % ( + reference_name, + species, + cls._reference_names_to_species[reference_name], + ) + ) + cls._reference_names_to_species[reference_name] = species + return species + + @classmethod + def all_registered_latin_names(cls): + """ + Returns latin name of every registered species. + """ + return list(cls._latin_names_to_species.keys()) + + @classmethod + def all_species_release_pairs(cls): + """ + Generator which yields (species, release) pairs for all possible + combinations. 
+ """ + for species_name in cls.all_registered_latin_names(): + species = cls._latin_names_to_species[species_name] + for _, release_range in species.reference_assemblies.items(): + for release in range(release_range[0], release_range[1] + 1): + yield species_name, release + + def __init__( + self, latin_name, synonyms=[], reference_assemblies={}, database=None + ): + """ + Parameters + ---------- + latin_name : str + + synonyms : list of strings + + reference_assemblies : dict + Mapping of names of reference genomes onto inclusive ranges of + Ensembl releases Example: {"GRCh37": (54, 75)} + """ + self.latin_name = latin_name.lower().replace(" ", "_") + self.synonyms = synonyms + self.reference_assemblies = reference_assemblies + self.database = database + self._release_to_genome = {} + for genome_name, (start, end) in self.reference_assemblies.items(): + for i in range(start, end + 1): + if i in self._release_to_genome: + raise ValueError( + "Ensembl release %d for %s already has an associated genome" + % (i, latin_name) + ) + self._release_to_genome[i] = genome_name + + def which_reference(self, ensembl_release): + if ensembl_release not in self._release_to_genome: + raise ValueError( + "No genome for %s in Ensembl release %d" + % (self.latin_name, ensembl_release) + ) + return self._release_to_genome[ensembl_release] + + def __str__(self): + return ( + "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s, database=%s)" + % ( + self.latin_name, + self.synonyms, + self.reference_assemblies, + self.database, + ) + ) + + def __eq__(self, other): + return ( + other.__class__ is Species + and self.latin_name == other.latin_name + and self.synonyms == other.synonyms + and self.reference_assemblies == other.reference_assemblies + and self.database == other.database + ) + + def to_dict(self): + return {"latin_name": self.latin_name} + + @classmethod + def from_dict(cls, state_dict): + return cls._latin_names_to_species[state_dict["latin_name"]] + + def 
__hash__(self): + return hash( + ( + self.latin_name, + tuple(self.synonyms), + frozenset(self.reference_assemblies.items()), + self.database, + ) + ) + + +def normalize_species_name(name): + """ + If species name was "Homo sapiens" then replace spaces with underscores and + return "homo_sapiens". + + Also replace common names like "human" with "homo_sapiens". + """ + lower_name = name.lower().strip() + + # if given a common name such as "human", look up its latin equivalent + if lower_name in Species._common_names_to_species: + return Species._common_names_to_species[lower_name].latin_name + + return lower_name.replace(" ", "_") + + +def find_species_by_name(species_name): + latin_name = normalize_species_name(species_name) + if latin_name not in Species._latin_names_to_species: + raise ValueError( + "Species not found: %s, for non-Ensembl data see https://github.com/openvax/pyensembl#non-ensembl-data" + % (species_name,) + ) + return Species._latin_names_to_species[latin_name] + + +def check_species_object(species_name_or_object): + """ + Helper for validating user supplied species names or objects. 
+ + Return `Species` Object + """ + if isinstance(species_name_or_object, Species): + return species_name_or_object + elif isinstance(species_name_or_object, str): + return find_species_by_name(species_name_or_object) + else: + raise ValueError( + "Unexpected type for species: %s : %s" + % (species_name_or_object, type(species_name_or_object)) + ) + + +for data in SPECIES_DATA: + globals()[data["synonyms"][0]] = Species.register( + latin_name=data["latin_name"], + synonyms=data["synonyms"], + reference_assemblies=data["reference_assemblies"], + database=data.get("database", None), + ) diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py index 9d30c5c..694e702 100644 --- a/pyensembl/transcript.py +++ b/pyensembl/transcript.py @@ -24,18 +24,20 @@ class Transcript(LocusWithGenome): and not using the sequence, avoid the memory/performance overhead of fetching and storing sequences from a FASTA file. """ + def __init__( - self, - transcript_id, - transcript_name, - contig, - start, - end, - strand, - biotype, - gene_id, - genome, - support_level=None): + self, + transcript_id, + transcript_name, + contig, + start, + end, + strand, + biotype, + gene_id, + genome, + support_level=None, + ): LocusWithGenome.__init__( self, contig=contig, @@ -43,7 +45,8 @@ def __init__( end=end, strand=strand, biotype=biotype, - genome=genome) + genome=genome, + ) self.transcript_id = transcript_id self.transcript_name = transcript_name self.gene_id = gene_id @@ -71,16 +74,18 @@ def __str__(self): " biotype='%s'," " contig='%s'," " start=%d," - " end=%d, strand='%s', genome='%s')") % ( - self.transcript_id, - self.name, - self.gene_id, - self.biotype, - self.contig, - self.start, - self.end, - self.strand, - self.genome.reference_name) + " end=%d, strand='%s', genome='%s')" + ) % ( + self.transcript_id, + self.name, + self.gene_id, + self.biotype, + self.contig, + self.start, + self.end, + self.strand, + self.genome.reference_name, + ) def __len__(self): """ @@ -90,9 +95,10 @@ def 
__len__(self): def __eq__(self, other): return ( - other.__class__ is Transcript and - self.id == other.id and - self.genome == other.genome) + other.__class__ is Transcript + and self.id == other.id + and self.genome == other.genome + ) def __hash__(self): return hash(self.id) @@ -123,7 +129,8 @@ def exons(self): columns, filter_column="transcript_id", filter_value=self.id, - feature="exon") + feature="exon", + ) # fill this list in its correct order (by exon_number) by using # the exon_number as a 1-based list offset @@ -133,15 +140,17 @@ def exons(self): exon = self.genome.exon_by_id(exon_id) if exon is None: raise ValueError( - "Missing exon %s for transcript %s" % ( - exon_number, self.id)) + "Missing exon %s for transcript %s" + % (exon_number, self.id) + ) exon_number = int(exon_number) if exon_number < 1: raise ValueError("Invalid exon number: %s" % exon_number) elif exon_number > len(exons): raise ValueError( - "Invalid exon number: %s (max expected = %d)" % ( - exon_number, len(exons))) + "Invalid exon number: %s (max expected = %d)" + % (exon_number, len(exons)) + ) # exon_number is 1-based, convert to list index by subtracting 1 exon_idx = exon_number - 1 @@ -164,12 +173,14 @@ def _transcript_feature_position_ranges(self, feature, required=True): select_column_names=["start", "end"], filter_column="transcript_id", filter_value=self.id, - feature=feature) + feature=feature, + ) if required and len(results) == 0: raise ValueError( - "Transcript %s does not contain feature %s" % ( - self.id, feature)) + "Transcript %s does not contain feature %s" + % (self.id, feature) + ) return results @memoize @@ -178,19 +189,21 @@ def _transcript_feature_positions(self, feature): Get unique positions for feature, raise an error if feature is absent. """ ranges = self._transcript_feature_position_ranges( - feature, required=True) + feature, required=True + ) results = [] # a feature (such as a stop codon), maybe be split over multiple # contiguous ranges. 
Collect all the nucleotide positions into a # single list. - for (start, end) in ranges: + for start, end in ranges: # since ranges are [inclusive, inclusive] and # Python ranges are [inclusive, exclusive) we have to increment # the end position for position in range(start, end + 1): if position in results: raise ValueError( - "Repeated position %d for %s" % (position, feature)) + "Repeated position %d for %s" % (position, feature) + ) results.append(position) return results @@ -207,10 +220,9 @@ def _codon_positions(self, feature): results = self._transcript_feature_positions(feature) if len(results) != 3: raise ValueError( - "Expected 3 positions for %s of %s but got %d" % ( - feature, - self.id, - len(results))) + "Expected 3 positions for %s of %s but got %d" + % (feature, self.id, len(results)) + ) return results @memoized_property @@ -219,7 +231,8 @@ def contains_start_codon(self): Does this transcript have an annotated start_codon entry? """ start_codons = self._transcript_feature_position_ranges( - "start_codon", required=False) + "start_codon", required=False + ) return len(start_codons) > 0 @memoized_property @@ -228,9 +241,10 @@ def contains_stop_codon(self): Does this transcript have an annotated stop_codon entry? 
""" stop_codons = self._transcript_feature_position_ranges( - "stop_codon", required=False) + "stop_codon", required=False + ) return len(stop_codons) > 0 - + @memoized_property def start_codon_complete(self): """ @@ -266,9 +280,10 @@ def exon_intervals(self): select_column_names=["exon_number", "start", "end"], filter_column="transcript_id", filter_value=self.id, - feature="exon") + feature="exon", + ) sorted_intervals = [None] * len(results) - for (exon_number, start, end) in results: + for exon_number, start, end in results: sorted_intervals[int(exon_number) - 1] = (start, end) return sorted_intervals @@ -281,15 +296,15 @@ def spliced_offset(self, position): """ if type(position) is not int: raise TypeError( - "Position argument must be an integer, got %s : %s" % ( - position, type(position))) + "Position argument must be an integer, got %s : %s" + % (position, type(position)) + ) if position < self.start or position > self.end: raise ValueError( - "Invalid position: %d (must be between %d and %d)" % ( - position, - self.start, - self.end)) + "Invalid position: %d (must be between %d and %d)" + % (position, self.start, self.end) + ) # offset from beginning of unspliced transcript (including introns) unspliced_offset = self.offset(position) @@ -306,7 +321,8 @@ def spliced_offset(self, position): # Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii... for exon in self.exons: exon_unspliced_start, exon_unspliced_end = self.offset_range( - exon.start, exon.end) + exon.start, exon.end + ) # If the relative position is not within this exon, keep a running # total of the total exonic length-so-far. 
# @@ -320,11 +336,13 @@ def spliced_offset(self, position): exon_offset = unspliced_offset - exon_unspliced_start return total_spliced_offset + exon_offset else: - exon_length = len(exon) # exon_end_position - exon_start_position + 1 + exon_length = len( + exon + ) # exon_end_position - exon_start_position + 1 total_spliced_offset += exon_length raise ValueError( - "Couldn't find position %d on any exon of %s" % ( - position, self.id)) + "Couldn't find position %d on any exon of %s" % (position, self.id) + ) @memoized_property def start_codon_unspliced_offsets(self): @@ -333,9 +351,7 @@ def start_codon_unspliced_offsets(self): of nucleotides in start codon. """ return [ - self.offset(position) - for position - in self.start_codon_positions + self.offset(position) for position in self.start_codon_positions ] @memoized_property @@ -345,9 +361,7 @@ def stop_codon_unspliced_offsets(self): of nucleotides in stop codon. """ return [ - self.offset(position) - for position - in self.stop_codon_positions + self.offset(position) for position in self.stop_codon_positions ] def _contiguous_offsets(self, offsets): @@ -358,8 +372,7 @@ def _contiguous_offsets(self, offsets): offsets.sort() for i in range(len(offsets) - 1): if offsets[i] + 1 != offsets[i + 1]: - raise ValueError( - "Offsets not contiguous: %s" % (offsets,)) + raise ValueError("Offsets not contiguous: %s" % (offsets,)) return offsets @memoized_property @@ -370,8 +383,7 @@ def start_codon_spliced_offsets(self): """ offsets = [ self.spliced_offset(position) - for position - in self.start_codon_positions + for position in self.start_codon_positions ] return self._contiguous_offsets(offsets) @@ -383,8 +395,7 @@ def stop_codon_spliced_offsets(self): """ offsets = [ self.spliced_offset(position) - for position - in self.stop_codon_positions + for position in self.stop_codon_positions ] return self._contiguous_offsets(offsets) @@ -403,11 +414,11 @@ def complete(self): a coding sequence whose length is divisible by 3 """ 
return ( - self.contains_start_codon and - self.start_codon_complete and - self.contains_stop_codon and - self.coding_sequence is not None and - len(self.coding_sequence) % 3 == 0 + self.contains_start_codon + and self.start_codon_complete + and self.contains_stop_codon + and self.coding_sequence is not None + and len(self.coding_sequence) % 3 == 0 ) @memoized_property @@ -459,7 +470,7 @@ def coding_sequence(self): # pylint: disable=invalid-slice-index # TODO(tavi) Figure out pylint is not happy with this slice - return self.sequence[start:end + 1] + return self.sequence[start : end + 1] @memoized_property def five_prime_utr_sequence(self): @@ -469,7 +480,7 @@ def five_prime_utr_sequence(self): """ # pylint: disable=invalid-slice-index # TODO(tavi) Figure out pylint is not happy with this slice - return self.sequence[:self.first_start_codon_spliced_offset] + return self.sequence[: self.first_start_codon_spliced_offset] @memoized_property def three_prime_utr_sequence(self): @@ -477,7 +488,7 @@ def three_prime_utr_sequence(self): cDNA sequence of 3' UTR (untranslated region at the end of the transcript) """ - return self.sequence[self.last_stop_codon_spliced_offset + 1:] + return self.sequence[self.last_stop_codon_spliced_offset + 1 :] @memoized_property def protein_id(self): @@ -487,7 +498,8 @@ def protein_id(self): filter_value=self.id, feature="CDS", distinct=True, - required=False) + required=False, + ) if result_tuple: return result_tuple[0] else: diff --git a/setup.py b/setup.py index 45dc0a4..65dee28 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ # limitations under the License. 
from __future__ import print_function + import os import re diff --git a/tests/test_ucsc_gtf.py b/tests/test_ucsc_gtf.py index 24e444f..b40c3ff 100644 --- a/tests/test_ucsc_gtf.py +++ b/tests/test_ucsc_gtf.py @@ -31,13 +31,11 @@ def test_ucsc_gencode_genome(): genome.index() genes = genome.genes() for gene in genes: - assert gene.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),) + assert gene.id, "Gene with missing ID in %s" % (genome,) assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes) transcripts = genome.transcripts() for transcript in transcripts: - assert transcript.id, "Transcript with missing ID in %s" % ( - genome.gtf.dataframe(), - ) + assert transcript.id, "Transcript with missing ID in %s" % (genome,) assert len(transcripts) == 7, "Expected 7 transcripts, got %d: %s" % ( len(transcripts), transcripts,