integrates indexable sample data into analyses ebi search dump

EBI-Metagenomics · Oct 23, 2023 · 14f0842 · 14f0842
1 parent 974371f
commit 14f0842
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 107 deletions.
diff --git a/emgapi/management/commands/ebi_search_analysis_dump.py b/emgapi/management/commands/ebi_search_analysis_dump.py
@@ -15,10 +15,13 @@
 # limitations under the License.
 
 import logging
-import re
+import pathlib
+from datetime import timedelta
 
 from django.core.management import BaseCommand
+from django.db.models import QuerySet
 from django.template.loader import render_to_string
+from django.utils import timezone
 
 from emgapi.models import AnalysisJob
 from emgapianns.models import (
@@ -31,50 +34,18 @@
 
 
 class Command(BaseCommand):
-    help = "Generate the XML dump of an analysis."
+    help = "Generate the XML dump of analyses for EBI Search."
 
     def add_arguments(self, parser):
         super(Command, self).add_arguments(parser)
         parser.add_argument(
-            "-a", "--accession", help="Analysis accession", required=True
+            "--full",
+            action="store_true",
+            help="Create a full snapshot rather than incremental.",
         )
-        parser.add_argument("-o", "--output", help="Output xml file", required=True)
-
-    # TODO: this was ported directly from EBI Search Dump
-    # we need to improve it, maybe move it to a template tag?
-    # TODO: apply this to InterPro and GO annotations
-    def unicode_and_clean(self, identifier, text):
-        """Converts text to utf8 encoded string with xml subsitutions"""
-        if not text:
-            return text
-        try:
-            text = text.encode("utf8", "strict")
-        except Exception as ex:
-            match = re.search(r"in position (\d+)", str(ex))
-            if match:
-                position = int(match.group(1))
-                matchStart = position - 15
-                if matchStart < 0:
-                    matchStart = 0
-                matchEnd = position + 15
-                if matchEnd > len(text):
-                    matchEnd = len(text)
-                logger.error(
-                    f"Replacing '{identifier}' in {text[position]} [{text[matchStart:matchEnd]}]",
-                    ex,
-                )
-                text = text[:position] + text[position + 1 :]
-                text = self.unicode_and_clean(identifier, text)
-            else:
-                logger.error(f"Failed to convert: {identifier}.", ex)
-        return text.decode("utf8")
+        parser.add_argument("-o", "--output", help="Output dir for xml files", required=True)
 
-    def handle(self, *args, **options):
-        """Render an analysis using the EBI Search XML template"""
-        # TODO: The migration for the analysis should be nearly done, code ported from: https://github.com/EBI-Metagenomics/MetagenomicsSearchDump
-        # In that repo the analyses corresponds to the Run entries.
-        accession: str = options["accession"]
-        analysis: AnalysisJob = AnalysisJob.objects.get(job_id=accession)
+    def get_analysis_context(self, analysis: AnalysisJob):
         analysis_taxonomy: AnalysisJobTaxonomy = AnalysisJobTaxonomy.objects.get(
             analysis_id=str(analysis.job_id)
         )
@@ -101,36 +72,10 @@ def handle(self, *args, **options):
                     tax_lineage_list = list(filter(None, tax.lineage.split(":")))
                     if len(tax_lineage_list) > 1:
                         taxonomy_lists.append(
-                            [
-                                self.unicode_and_clean("tax", tax_el)
-                                for tax_el in tax_lineage_list
-                            ]
+                            tax_lineage_list
                         )
 
-        # TODO: port these metadata "cleaning" rules.
-        # if depth and len(depth) > 0:
-        #     self.depth = checkIfNumerical(extID, depth)
-        # if altitude and len(altitude) > 0:
-        #     self.altitude = checkIfNumerical(extID, altitude)
-        # if elevation and len(elevation) > 0:
-        #     self.elevation = checkIfNumerical(extID, elevation)
-        # if salinity and len(salinity) > 0:
-        #     self.salinity = checkIfNumerical(extID, salinity)
-        # if temperature and len(temperature) > 0:
-        #     self.temperature = checkIfNumerical(extID, temperature)
-        # if pH and len(pH) > 0:
-        #     self.pH = checkIfNumerical(extID, pH[0])
-        # if longitudeStart and len(longitudeStart) > 0:
-        #     self.longitudeStart = encodeToUnicodeAndClean(extID, longitudeStart)
-        # if longitudeEnd and len(longitudeEnd) > 0:
-        #     self.longitudeEnd = checkIfNumerical(extID, longitudeEnd)
-        # if latitudeStart and len(latitudeStart) > 0:
-        #     self.latitudeStart = checkIfNumerical(extID, latitudeStart)
-        # if latitudeEnd and len(latitudeEnd) > 0:
-        #     self.latitudeEnd = checkIfNumerical(extID, latitudeEnd)
-
-        # TODO: from this list (taken from )
-        SAMPLE_ANNOTATIONS_ACCEPT_LIST = {
+        sample_numeric_fields_to_index = {
             "temperature": "temperature",
             "pH": "pH",
             "altitude": "altitude",
@@ -145,23 +90,98 @@ def handle(self, *args, **options):
             "latitude end": "latitudeEnd",
         }
 
+        sample_text_annotations_to_index = {
+            "sequencing method": "sequencing_method",
+            "geographic location (region and locality)": "location_name",
+            "geographic location (country and/or sea,region)": "location_name",
+            "disease status": "disease_status",
+            "phenotype": "phenotype",
+        }
+
+        sample_annotations_to_index = sample_numeric_fields_to_index.copy()
+        sample_annotations_to_index.update(sample_text_annotations_to_index)
+
         sample_metadata = {}
         for sample_metadata_entry in analysis.sample.metadata.all():
-            if sample_metadata_entry.var.var_name in SAMPLE_ANNOTATIONS_ACCEPT_LIST:
+            if (vn := sample_metadata_entry.var.var_name) in sample_annotations_to_index:
+                indexable_name = sample_annotations_to_index[vn]
+                indexable_value = sample_metadata_entry.var_val_ucv
+
+                if indexable_name in sample_numeric_fields_to_index.values():
+                    try:
+                        indexable_value = float(indexable_value.strip())
+                    except ValueError:
+                        logger.warning(
+                            f"Could not float-parse supposedly numeric field {indexable_name} : {indexable_value}")
+                        continue
                 sample_metadata[
-                    SAMPLE_ANNOTATIONS_ACCEPT_LIST[sample_metadata_entry.var.var_name]
-                ] = sample_metadata_entry.var_val_ucv
-
-        print(
-            render_to_string(
-                "ebi_search/analysis.xml",
-                {
-                    "analysis": analysis,
-                    "analysis_biome": biome_list,
-                    "analysis_taxonomies": taxonomy_lists,
-                    "analysis_go_entries": go_annotation.go_terms,
-                    "analysis_ips_entries": ips_annotation.interpro_identifiers,
-                    "sample_metadata": sample_metadata,
-                },
+                    indexable_name
+                ] = indexable_value
+
+        if 'location_name' not in sample_metadata and analysis.sample.geo_loc_name:
+            sample_metadata['location_name'] = analysis.sample.geo_loc_name
+
+        return {
+            "analysis": analysis,
+            "analysis_biome": biome_list,
+            "analysis_taxonomies": taxonomy_lists,
+            "analysis_go_entries": go_annotation.go_terms,
+            "analysis_ips_entries": ips_annotation.interpro_identifiers,
+            "sample_metadata": sample_metadata,
+        }
+
+    @staticmethod
+    def write_without_blank_lines(fp, string):
+        fp.write(
+            "\n".join(
+                filter(
+                    str.strip,
+                    string.splitlines()
+                )
             )
         )
+
+    def handle(self, *args, **options):
+        """Dump EBI Search XML file of analyses"""
+        is_full_snapshot: str = options["full"]
+        output_dir: str = options["output"]
+
+        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+        analyses: QuerySet = AnalysisJob.objects.available(None)
+
+        if not is_full_snapshot:
+            analyses = AnalysisJob.objects_for_indexing.to_add()
+
+            removals = AnalysisJob.objects_for_indexing.to_delete()
+
+            # produce incremental deletion file
+            deletions_file = pathlib.Path(output_dir) / pathlib.Path('analyses-deletes.xml')
+            with open(deletions_file, 'w') as d:
+                self.write_without_blank_lines(d,
+                    render_to_string(
+                        "ebi_search/analyses-deletes.xml",
+                        {
+                            "removals": removals
+                        }
+                    )
+                )
+
+        additions_file = pathlib.Path(output_dir) / pathlib.Path('analyses.xml')
+        with open(additions_file, 'w') as a:
+            self.write_without_blank_lines(a,
+                render_to_string(
+                    "ebi_search/analyses.xml",
+                    {
+                        "additions": (self.get_analysis_context(analysis) for analysis in analyses)
+                    }
+                )
+            )
+
+        nowish = timezone.now() + timedelta(minutes=1)
+        # Small buffer into the future so that the indexing time remains ahead of auto-now updated times.
+
+        for analysis in analyses:
+            analysis.last_indexed = nowish
+
+        AnalysisJob.objects.bulk_update(analyses, fields=["last_indexed"])
diff --git a/emgapi/templates/ebi_search/analysis.xml b/emgapi/templates/ebi_search/analysis.xml
@@ -4,13 +4,30 @@
             <dates>
                 <date type="creation_date" value="{{ analysis.submit_time|date:'Y-m-d' }}"/>
                 <date type="completion_date" value="{{ analysis.complete_time|date:'Y-m-d' }}"/>
+                <date type="sample_collection_date" value="{{ analysis.sample.collection_date|date:'Y-m-d' }}"/>
             </dates>
             <additional_fields>
                 <field name="experiment_type">assembly</field>
                 <field name="pipeline_version">{{ analysis.pipeline.release_version }}</field>
                 <field name="sample_name">{{ analysis.sample.sample_name | safe }}</field>
                 <field name="project_name">{{ analysis.study.study_name | safe }}</field>
                 <field name="biome_name">{{ analysis.study.biome.biome_name | safe }}</field>
+
+                {% if analysis.sample.species %}
+                <field name="species">{{ analysis.sample.species | safe }}</field>
+                {% endif %}
+
+                {% if analysis.sample.environment_feature %}
+                <field name="feature">{{ analysis.sample.environment_feature | safe }}</field>
+                {% endif %}
+
+                {% if analysis.sample.environment_material %}
+                <field name="material">{{ analysis.sample.environment_material | safe }}</field>
+                {% endif %}
+
+                <field name="sample_alias">{{ analysis.sample.sample_alias | safe }}</field>
+                <field name="project_name">{{ analysis.study.study_name | safe }}</field>
+
                 <hierarchical_field name="biome">
                 {% for biome_element in analysis_biome %}
                     {% if forloop.first %}
@@ -20,6 +37,11 @@
                     {% endif %}
                 {% endfor %}
                 </hierarchical_field>
+
+                {% for metadata_key, metadata_value in sample_metadata.items %}
+                <field name="{{ metadata_key | safe }}">{{ metadata_value | safe }}</field>
+                {% endfor %}
+
                 {% for taxonomy_lineage_elements in analysis_taxonomies %}
                 <hierarchical_field name="organism">
                     {% for taxonomy_element in taxonomy_lineage_elements %}
@@ -39,14 +61,8 @@
                 {% for ips in analysis_ips_entries %}
                 <field name="interpro_entry">{{ ips.description | safe }}</field>
                 {% endfor %}
-
-                {% for metadata_key, metadata_value in sample_metadata.items %}
-                <field name="{{ metadata_key | safe }}">{{ metadata_value | safe }}</field>
-                {% endfor %}
-
             </additional_fields>
             <cross_references>
-                <ref dbkey="{{ analysis.sample.accession }}" dbname="metagenomics_samples"/>
                 <ref dbkey="{{ analysis.study.accession }}" dbname="metagenomics_projects"/>
                 <ref dbkey="{{ analysis.study.project_id }}" dbname="ena_project"/>
 

diff --git a/emgapi/templatetags/ebi_search_dump.py b/emgapi/templatetags/ebi_search_dump.py