diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 1553448..2f71a50 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -77,12 +77,14 @@ def __init__( species=self.species.latin_name, sequence_type="cdna", server=server, + is_plant=self.species.is_plant, ), make_fasta_url( ensembl_release=self.release, species=self.species.latin_name, sequence_type="ncrna", server=server, + is_plant=self.species.is_plant, ), ] @@ -92,6 +94,7 @@ def __init__( species=self.species.latin_name, sequence_type="pep", server=self.server, + is_plant=self.species.is_plant, ) ] diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index e8b6924..cfa8db4 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -24,13 +24,19 @@ from .ensembl_versions import check_release_number ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org" +ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk" # Example directories # FASTA files: /pub/release-78/fasta/homo_sapiens/ # GTF annotation files: /pub/release-78/gtf/homo_sapiens/ FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/" +PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/" GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/" +PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/" +# Plant species +# Tuple of plant species names (Latin name and synonym) kept for building the custom plant URLs +lPlants = ("arabidopsis_thaliana","arabidopsis") def normalize_release_properties(ensembl_release, species): """ @@ -63,12 +69,18 @@ def make_gtf_filename(ensembl_release, species): } -def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER): +def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE): """ Returns a URL and a filename, which can be joined together. 
""" + # Species names are accepted too, so resolve to a Species before reading is_plant (assumes check_species_object returns a Species; confirm) + from .species import check_species_object + if check_species_object(species).is_plant: + server = ENSEMBL_PLANTS_FTP_SERVER + gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE + ensembl_release, species, _ = normalize_release_properties(ensembl_release, species) - subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species} + subdir = gtf_subdir % {"release": ensembl_release, "species": species} filename = make_gtf_filename(ensembl_release=ensembl_release, species=species) return server + subdir + filename @@ -93,11 +105,11 @@ def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER): NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz" -def make_fasta_filename(ensembl_release, species, sequence_type): +def make_fasta_filename(ensembl_release, species, sequence_type, is_plant=False): ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species ) - if ensembl_release <= 75: + if ensembl_release <= 75 and not is_plant: if sequence_type == "ncrna": return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % { "Species": species.capitalize(), @@ -125,7 +137,7 @@ def make_fasta_filename(ensembl_release, species, sequence_type): } -def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER): +def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE, is_plant=False): """Construct URL to FASTA file with cDNA transcript or protein sequences Parameter examples: @@ -136,12 +148,17 @@ def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_S ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species ) - subdir = FASTA_SUBDIR_TEMPLATE % { + + if is_plant: + server = ENSEMBL_PLANTS_FTP_SERVER + fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE + + subdir = fasta_subdir % { "release": ensembl_release, "species": species, "type": sequence_type, } filename = 
make_fasta_filename( - ensembl_release=ensembl_release, species=species, sequence_type=sequence_type + ensembl_release=ensembl_release, species=species, sequence_type=sequence_type, is_plant=is_plant ) return server + subdir + filename diff --git a/pyensembl/ensembl_versions.py b/pyensembl/ensembl_versions.py index 101e7aa..c8cd31a 100644 --- a/pyensembl/ensembl_versions.py +++ b/pyensembl/ensembl_versions.py @@ -12,7 +12,7 @@ MIN_ENSEMBL_RELEASE = 47 MAX_ENSEMBL_RELEASE = 111 - +MAX_PLANTS_ENSEMBL_RELEASE = 58 def check_release_number(release): """ diff --git a/pyensembl/species.py b/pyensembl/species.py index 9eec027..f1b69bb 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -12,7 +12,7 @@ from serializable import Serializable -from .ensembl_versions import MAX_ENSEMBL_RELEASE +from .ensembl_versions import MAX_ENSEMBL_RELEASE, MAX_PLANTS_ENSEMBL_RELEASE # TODO: replace Serializable with data class @@ -30,7 +30,7 @@ class Species(Serializable): _reference_names_to_species = {} @classmethod - def register(cls, latin_name, synonyms, reference_assemblies): + def register(cls, latin_name, synonyms, reference_assemblies, is_plant=False): """ Create a Species object from the given arguments and enter into all the dicts used to look the species up by its fields. 
@@ -39,6 +39,7 @@ def register(cls, latin_name, synonyms, reference_assemblies): latin_name=latin_name, synonyms=synonyms, reference_assemblies=reference_assemblies, + is_plant=is_plant, ) cls._latin_names_to_species[species.latin_name] = species for synonym in synonyms: @@ -80,7 +81,7 @@ def all_species_release_pairs(cls): for release in range(release_range[0], release_range[1] + 1): yield species_name, release - def __init__(self, latin_name, synonyms=[], reference_assemblies={}): + def __init__(self, latin_name, synonyms=[], reference_assemblies={}, is_plant=False): """ Parameters ---------- @@ -96,6 +97,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}): self.synonyms = synonyms self.reference_assemblies = reference_assemblies self._release_to_genome = {} + self.is_plant = is_plant for genome_name, (start, end) in self.reference_assemblies.items(): for i in range(start, end + 1): if i in self._release_to_genome: @@ -350,3 +352,21 @@ def check_species_object(species_name_or_object): "R64-1-1": (76, MAX_ENSEMBL_RELEASE), }, ) + +arabidopsis_thaliana = Species.register( + latin_name="arabidopsis_thaliana", + synonyms=["arabidopsis"], + reference_assemblies={ + "TAIR10": (40, MAX_PLANTS_ENSEMBL_RELEASE), + }, + is_plant=True, +) + +rice = Species.register( + latin_name="oryza_sativa", + synonyms=["rice"], + reference_assemblies={ + "IRGSP-1.0": (40, MAX_PLANTS_ENSEMBL_RELEASE), + }, + is_plant=True, +) diff --git a/pyensembl/version.py b/pyensembl/version.py index 99c0c63..b383d93 100644 --- a/pyensembl/version.py +++ b/pyensembl/version.py @@ -1,4 +1,4 @@ -__version__ = "2.3.11" +__version__ = "2.3.12" def print_version(): print(f"v{__version__}")