Skip to content

Commit

Permalink
Add support for downloading plant species with pyEnsembl (#305)
Browse files Browse the repository at this point in the history
This commit introduces the ability to download plant species data using pyEnsembl. We've added the 'is_plant' parameter to the Species class, and registered two new species: Arabidopsis thaliana and Oryza sativa (rice).

We've also added the ENSEMBL_PLANTS_FTP_SERVER URL, along with the PLANTS_GTF_SUBDIR_TEMPLATE and PLANTS_FASTA_SUBDIR_TEMPLATE used to build the download links. The code checks whether the species is a plant to decide which server and which subdirectory templates to use.
  • Loading branch information
pamonlan authored Mar 28, 2024
1 parent 2208f87 commit ad206e3
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 12 deletions.
3 changes: 3 additions & 0 deletions pyensembl/ensembl_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,14 @@ def __init__(
species=self.species.latin_name,
sequence_type="cdna",
server=server,
is_plant = self.species.is_plant,
),
make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="ncrna",
server=server,
is_plant = self.species.is_plant,
),
]

Expand All @@ -92,6 +94,7 @@ def __init__(
species=self.species.latin_name,
sequence_type="pep",
server=self.server,
is_plant = self.species.is_plant,
)
]

Expand Down
31 changes: 24 additions & 7 deletions pyensembl/ensembl_url_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,19 @@
from .ensembl_versions import check_release_number

ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk/"

# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
# GTF annotation files: /pub/release-78/gtf/homo_sapiens/
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/"
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/"

# List of plants
# Let's keep a tuple with all the plant species names that we have added, to build the custom URL
lPlants = ("arabidopsis_thaliana","arabidopsis")

def normalize_release_properties(ensembl_release, species):
"""
Expand Down Expand Up @@ -63,12 +69,18 @@ def make_gtf_filename(ensembl_release, species):
}


def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE):
"""
Returns a URL and a filename, which can be joined together.
"""
if species.is_plant:
server = ENSEMBL_PLANTS_FTP_SERVER
gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE
#else:
#print(f"[+] {species.latin_name} it is not a plant", flush=True)

ensembl_release, species, _ = normalize_release_properties(ensembl_release, species)
subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species}
subdir = gtf_subdir % {"release": ensembl_release, "species": species}
filename = make_gtf_filename(ensembl_release=ensembl_release, species=species)
return server + subdir + filename

Expand All @@ -93,11 +105,11 @@ def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"


def make_fasta_filename(ensembl_release, species, sequence_type):
def make_fasta_filename(ensembl_release, species, sequence_type, is_plant):
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
)
if ensembl_release <= 75:
if ensembl_release <= 75 and not is_plant:
if sequence_type == "ncrna":
return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % {
"Species": species.capitalize(),
Expand Down Expand Up @@ -125,7 +137,7 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
}


def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER):
def make_fasta_url(ensembl_release, species, sequence_type, is_plant, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE):
"""Construct URL to FASTA file with cDNA transcript or protein sequences
Parameter examples:
Expand All @@ -136,12 +148,17 @@ def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_S
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
)
subdir = FASTA_SUBDIR_TEMPLATE % {

if is_plant:
server = ENSEMBL_PLANTS_FTP_SERVER
fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE

subdir = fasta_subdir % {
"release": ensembl_release,
"species": species,
"type": sequence_type,
}
filename = make_fasta_filename(
ensembl_release=ensembl_release, species=species, sequence_type=sequence_type
ensembl_release=ensembl_release, species=species, sequence_type=sequence_type, is_plant = is_plant
)
return server + subdir + filename
2 changes: 1 addition & 1 deletion pyensembl/ensembl_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

MIN_ENSEMBL_RELEASE = 47
MAX_ENSEMBL_RELEASE = 111

MAX_PLANTS_ENSEMBL_RELEASE = 58

def check_release_number(release):
"""
Expand Down
26 changes: 23 additions & 3 deletions pyensembl/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from serializable import Serializable

from .ensembl_versions import MAX_ENSEMBL_RELEASE
from .ensembl_versions import MAX_ENSEMBL_RELEASE, MAX_PLANTS_ENSEMBL_RELEASE

# TODO: replace Serializable with data class

Expand All @@ -30,7 +30,7 @@ class Species(Serializable):
_reference_names_to_species = {}

@classmethod
def register(cls, latin_name, synonyms, reference_assemblies):
def register(cls, latin_name, synonyms, reference_assemblies, is_plant=False):
"""
Create a Species object from the given arguments and enter into
all the dicts used to look the species up by its fields.
Expand All @@ -39,6 +39,7 @@ def register(cls, latin_name, synonyms, reference_assemblies):
latin_name=latin_name,
synonyms=synonyms,
reference_assemblies=reference_assemblies,
is_plant=is_plant,
)
cls._latin_names_to_species[species.latin_name] = species
for synonym in synonyms:
Expand Down Expand Up @@ -80,7 +81,7 @@ def all_species_release_pairs(cls):
for release in range(release_range[0], release_range[1] + 1):
yield species_name, release

def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
def __init__(self, latin_name, synonyms=[], reference_assemblies={}, is_plant=False):
"""
Parameters
----------
Expand All @@ -96,6 +97,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
self.synonyms = synonyms
self.reference_assemblies = reference_assemblies
self._release_to_genome = {}
self.is_plant = is_plant
for genome_name, (start, end) in self.reference_assemblies.items():
for i in range(start, end + 1):
if i in self._release_to_genome:
Expand Down Expand Up @@ -350,3 +352,21 @@ def check_species_object(species_name_or_object):
"R64-1-1": (76, MAX_ENSEMBL_RELEASE),
},
)

arabidopsis_thaliana = Species.register(
latin_name="arabidopsis_thaliana",
synonyms=["arabidopsis"],
reference_assemblies={
"TAIR10": (40, MAX_PLANTS_ENSEMBL_RELEASE),
},
is_plant=True
)

rice = Species.register(
latin_name="oryza_sativa",
synonyms=["rice"],
reference_assemblies={
"IRGSP-1.0": (40, MAX_PLANTS_ENSEMBL_RELEASE),
},
is_plant=True
)
2 changes: 1 addition & 1 deletion pyensembl/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "2.3.11"
__version__ = "2.3.12"

def print_version():
print(f"v{__version__}")
Expand Down

0 comments on commit ad206e3

Please sign in to comment.