From 1a3acfdfb81a6004f1851a272e2b824a1a5189ed Mon Sep 17 00:00:00 2001
From: sandyr
Date: Tue, 24 Oct 2023 15:45:07 +0100
Subject: [PATCH] adds ebi search dump for studies/projects (plus some tweaks to analyses)

---
 .../commands/ebi_search_analysis_dump.py      |   3 +-
 .../commands/ebi_search_study_dump.py         | 126 ++++++++++++++++++
 emgapi/migrations/0013_study_last_indexed.py  |  18 +++
 .../0014_alter_study_last_update.py           |  18 +++
 emgapi/models.py                              |   4 +-
 emgapi/templates/ebi_search/analyses.xml      |   3 +
 .../templates/ebi_search/projects-deletes.xml |   8 ++
 emgapi/templates/ebi_search/projects.xml      |  40 +++++++
 8 files changed, 217 insertions(+), 3 deletions(-)
 create mode 100644 emgapi/management/commands/ebi_search_study_dump.py
 create mode 100644 emgapi/migrations/0013_study_last_indexed.py
 create mode 100644 emgapi/migrations/0014_alter_study_last_update.py
 create mode 100644 emgapi/templates/ebi_search/projects-deletes.xml
 create mode 100644 emgapi/templates/ebi_search/projects.xml

diff --git a/emgapi/management/commands/ebi_search_analysis_dump.py b/emgapi/management/commands/ebi_search_analysis_dump.py
index 0e447db10..6a079e1d3 100644
--- a/emgapi/management/commands/ebi_search_analysis_dump.py
+++ b/emgapi/management/commands/ebi_search_analysis_dump.py
@@ -173,7 +173,8 @@ def handle(self, *args, **options):
                 render_to_string(
                     "ebi_search/analyses.xml",
                     {
-                        "additions": (self.get_analysis_context(analysis) for analysis in analyses)
+                        "additions": (self.get_analysis_context(analysis) for analysis in analyses),
+                        "count": analyses.count()
                     }
                 )
             )
diff --git a/emgapi/management/commands/ebi_search_study_dump.py b/emgapi/management/commands/ebi_search_study_dump.py
new file mode 100644
index 000000000..1bee1aa08
--- /dev/null
+++ b/emgapi/management/commands/ebi_search_study_dump.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2023 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import pathlib
+from datetime import timedelta
+
+from django.core.management import BaseCommand
+from django.db.models import QuerySet
+from django.template.loader import render_to_string
+from django.utils import timezone
+
+from emgapi.models import Study
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """Write the EBI Search XML dump(s) of studies to an output directory."""
+
+    help = "Generate the XML dump of studies for EBI Search."
+
+    def add_arguments(self, parser):
+        """Register the --full flag and the required -o/--output option."""
+        super(Command, self).add_arguments(parser)
+        parser.add_argument(
+            "--full",
+            action="store_true",
+            help="Create a full snapshot rather than incremental.",
+        )
+        parser.add_argument("-o", "--output", help="Output dir for xml files", required=True)
+
+    @staticmethod
+    def write_without_blank_lines(fp, string):
+        """Write string to fp, dropping empty and whitespace-only lines.
+
+        The kept lines are joined with a newline; no trailing newline is written.
+        """
+        fp.write(
+            "\n".join(
+                filter(
+                    str.strip,
+                    string.splitlines()
+                )
+            )
+        )
+
+    @staticmethod
+    def get_study_context(study: Study) -> dict:
+        """Build the template context for a single study.
+
+        biome_list is the biome lineage split on ":" without its first element.
+        """
+        biome_list = study.biome.lineage.split(":")[1:]
+
+        return {
+            "study": study,
+            "biome_list": biome_list
+        }
+
+    def handle(self, *args, **options):
+        """Dump EBI Search XML file of studies/projects.
+
+        projects.xml is always written; an incremental run (no --full) also
+        writes projects-deletes.xml. Every dumped study then has its
+        last_indexed timestamp set.
+        """
+        # --full is a store_true flag, so the option value is a bool.
+        is_full_snapshot: bool = options["full"]
+        output_dir: str = options["output"]
+
+        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+        studies: QuerySet = Study.objects.available(None)
+
+        if not is_full_snapshot:
+            studies = Study.objects_for_indexing.to_add()
+
+            removals = Study.objects_for_indexing.to_delete()
+
+            # produce incremental deletion file
+            deletions_file = pathlib.Path(output_dir) / pathlib.Path('projects-deletes.xml')
+            # Explicit UTF-8 so the output does not depend on the locale's default encoding.
+            with open(deletions_file, 'w', encoding='utf-8') as d:
+                self.write_without_blank_lines(d,
+                    render_to_string(
+                        "ebi_search/projects-deletes.xml",
+                        {
+                            "removals": removals
+                        }
+                    )
+                )
+
+        additions_file = pathlib.Path(output_dir) / pathlib.Path('projects.xml')
+        # Explicit UTF-8 so the output does not depend on the locale's default encoding.
+        with open(additions_file, 'w', encoding='utf-8') as a:
+            self.write_without_blank_lines(a,
+                render_to_string(
+                    "ebi_search/projects.xml",
+                    {
+                        "additions": (self.get_study_context(study) for study in studies),
+                        "count": studies.count()
+                    }
+                )
+            )
+
+        nowish = timezone.now() + timedelta(minutes=1)
+        # Small buffer into the future so that the indexing time remains ahead of auto-now updated times.
+
+        for study in studies:
+            study.last_indexed = nowish
+
+        Study.objects.bulk_update(studies, fields=["last_indexed"])
diff --git a/emgapi/migrations/0013_study_last_indexed.py b/emgapi/migrations/0013_study_last_indexed.py
new file mode 100644
index 000000000..e141ee7fd
--- /dev/null
+++ b/emgapi/migrations/0013_study_last_indexed.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.18 on 2023-10-23 16:41
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('emgapi', '0012_auto_20231020_1525'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='study',
+            name='last_indexed',
+            field=models.DateTimeField(blank=True, db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True),
+        ),
+    ]
diff --git a/emgapi/migrations/0014_alter_study_last_update.py b/emgapi/migrations/0014_alter_study_last_update.py
new file mode 100644
index 000000000..4eb998c33
--- /dev/null
+++ b/emgapi/migrations/0014_alter_study_last_update.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.18 on 2023-10-23 17:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('emgapi', '0013_study_last_indexed'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='study',
+            name='last_update',
+            field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'),
+        ),
+    ]
diff --git a/emgapi/models.py b/emgapi/models.py
index 7bb4e7a28..8379497d6 100644
--- a/emgapi/models.py
+++ b/emgapi/models.py
@@ -893,7 +893,7 @@ def mydata(self, request):
         return self.get_queryset().mydata(request)
 
 
-class Study(ENASyncableModel):
+class Study(ENASyncableModel, EbiSearchIndexedModel):
     def __init__(self, *args, **kwargs):
         super(Study, self).__init__(*args, **kwargs)
 
@@ -927,7 +927,7 @@ def _custom_pk(self):
     author_name = models.CharField(
         db_column='AUTHOR_NAME', max_length=100, blank=True, null=True)
     last_update = models.DateTimeField(
-        db_column='LAST_UPDATE')
+        db_column='LAST_UPDATE', auto_now=True)
     submission_account_id = models.CharField(
         db_column='SUBMISSION_ACCOUNT_ID', max_length=15,
         blank=True, null=True)
diff --git a/emgapi/templates/ebi_search/analyses.xml b/emgapi/templates/ebi_search/analyses.xml
index 90cf547e1..c90e395a1 100644
--- a/emgapi/templates/ebi_search/analyses.xml
+++ b/emgapi/templates/ebi_search/analyses.xml
@@ -1,5 +1,8 @@
 
   EMG_run
+  EMG Analysis runs – samples analysed by MGnify pipelines
+  {% now "Y-m-d" %}
+  {{ count }}
 
   {% for a in additions %}
     {% include "ebi_search/analysis.xml" with analysis=a.analysis analysis_biome=a.analysis_biome analysis_taxonomies=a.analysis_taxonomies analysis_go_entries=a.analysis_go_entries analysis_ips_entries=a.analysis_ips_entries sample_metadata=a.sample_metadata only %}
diff --git a/emgapi/templates/ebi_search/projects-deletes.xml b/emgapi/templates/ebi_search/projects-deletes.xml
new file mode 100644
index 000000000..437027a75
--- /dev/null
+++ b/emgapi/templates/ebi_search/projects-deletes.xml
@@ -0,0 +1,8 @@
+
+  EMG_Project
+
+  {% for entry in removals %}
+
+  {% endfor %}
+
+
diff --git a/emgapi/templates/ebi_search/projects.xml b/emgapi/templates/ebi_search/projects.xml
new file mode 100644
index 000000000..d8827e4a1
--- /dev/null
+++ b/emgapi/templates/ebi_search/projects.xml
@@ -0,0 +1,40 @@
+
+  EMG_Project
+  EMG Projects – studies analysed by MGnify
+  {% now "Y-m-d" %}
+  {{ count }}
+
+  {% for addition in additions %}
+  {% with addition.study as study %}
+
+    {{ study.study_name | safe }}
+    {{ study.study_abstract | safe }}
+
+
+
+
+
+      {{ study.secondary_accession }}
+      {{ study.biome.biome_name }}
+
+        {% for biome_element in addition.biome_list %}
+          {% if forloop.first %}
+            {{ biome_element | safe }}
+          {% else %}
+            {{ biome_element | safe }}
+          {% endif %}
+        {% endfor %}
+
+      {{ study.centre_name | safe }}
+
+
+
+      {% for analysis in study.analyses.all %}
+
+      {% endfor %}
+
+
+  {% endwith %}
+  {% endfor %}
+
+