From 0a8a761f30f77c4c4cceeb0eed0c6395783f7779 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:09:18 +0000 Subject: [PATCH 01/46] adds incremental ebi search dumping for analysisjobs --- .gitignore | 2 +- emgapi/migrations/0012_auto_20231020_1525.py | 23 ++++++++++++++++++++ emgapi/templatetags/ebi_search_dump.py | 19 ++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 emgapi/migrations/0012_auto_20231020_1525.py create mode 100644 emgapi/templatetags/ebi_search_dump.py diff --git a/.gitignore b/.gitignore index 14cd294c5..85316e5ed 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,4 @@ dumps /config/*.yml /config/*.yaml -!/config/*local* \ No newline at end of file +!/config/*local* diff --git a/emgapi/migrations/0012_auto_20231020_1525.py b/emgapi/migrations/0012_auto_20231020_1525.py new file mode 100644 index 000000000..296ffd821 --- /dev/null +++ b/emgapi/migrations/0012_auto_20231020_1525.py @@ -0,0 +1,23 @@ +# Generated by Django 3.2.18 on 2023-10-20 15:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0011_analysisjob_analysis_summary_json'), + ] + + operations = [ + migrations.AddField( + model_name='analysisjob', + name='last_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), + migrations.AddField( + model_name='analysisjob', + name='last_update', + field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), + ), + ] diff --git a/emgapi/templatetags/ebi_search_dump.py b/emgapi/templatetags/ebi_search_dump.py new file mode 100644 index 000000000..5480e4eee --- /dev/null +++ b/emgapi/templatetags/ebi_search_dump.py @@ -0,0 +1,19 @@ +from django import template + +register = template.Library() + + +@register.simple_tag +def xml_safe(unsafe_string: str): + if not 
unsafe_string: + return None + + replacements = str.maketrans({ + "<": "<", + ">": ">", + "&": "&", + "'": "'", + '"': """, + }) + + return unsafe_string.translate(replacements) From 4d51fe3bdc212ea2fb4279e1a4b9ebb45b1ae82f Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:10:32 +0000 Subject: [PATCH 02/46] integrates indexable sample data into analyses ebi search dump --- emgapi/templatetags/ebi_search_dump.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 emgapi/templatetags/ebi_search_dump.py diff --git a/emgapi/templatetags/ebi_search_dump.py b/emgapi/templatetags/ebi_search_dump.py deleted file mode 100644 index 5480e4eee..000000000 --- a/emgapi/templatetags/ebi_search_dump.py +++ /dev/null @@ -1,19 +0,0 @@ -from django import template - -register = template.Library() - - -@register.simple_tag -def xml_safe(unsafe_string: str): - if not unsafe_string: - return None - - replacements = str.maketrans({ - "<": "<", - ">": ">", - "&": "&", - "'": "'", - '"': """, - }) - - return unsafe_string.translate(replacements) From 240d8f4de305002e41e1fb154972910b76105f91 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:12:45 +0000 Subject: [PATCH 03/46] adds ebi search dump for studies/projects (plus some tweaks to analyses) --- emgapi/migrations/0013_study_last_indexed.py | 18 ++++++++++++++++++ .../migrations/0014_alter_study_last_update.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 emgapi/migrations/0013_study_last_indexed.py create mode 100644 emgapi/migrations/0014_alter_study_last_update.py diff --git a/emgapi/migrations/0013_study_last_indexed.py b/emgapi/migrations/0013_study_last_indexed.py new file mode 100644 index 000000000..e141ee7fd --- /dev/null +++ b/emgapi/migrations/0013_study_last_indexed.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.18 on 2023-10-23 16:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): 
+ + dependencies = [ + ('emgapi', '0012_auto_20231020_1525'), + ] + + operations = [ + migrations.AddField( + model_name='study', + name='last_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), + ] diff --git a/emgapi/migrations/0014_alter_study_last_update.py b/emgapi/migrations/0014_alter_study_last_update.py new file mode 100644 index 000000000..4eb998c33 --- /dev/null +++ b/emgapi/migrations/0014_alter_study_last_update.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.18 on 2023-10-23 17:16 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0013_study_last_indexed'), + ] + + operations = [ + migrations.AlterField( + model_name='study', + name='last_update', + field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), + ), + ] From ed97ccbdf985ce7f87c570562c8d10f3e6133606 Mon Sep 17 00:00:00 2001 From: sandyr Date: Fri, 10 Nov 2023 13:30:02 +0000 Subject: [PATCH 04/46] resolve migration merge conflicts --- ...1020_1525.py => 0013_auto_20231110_1329.py} | 14 ++++++++++++-- emgapi/migrations/0013_study_last_indexed.py | 18 ------------------ .../migrations/0014_alter_study_last_update.py | 18 ------------------ 3 files changed, 12 insertions(+), 38 deletions(-) rename emgapi/migrations/{0012_auto_20231020_1525.py => 0013_auto_20231110_1329.py} (51%) delete mode 100644 emgapi/migrations/0013_study_last_indexed.py delete mode 100644 emgapi/migrations/0014_alter_study_last_update.py diff --git a/emgapi/migrations/0012_auto_20231020_1525.py b/emgapi/migrations/0013_auto_20231110_1329.py similarity index 51% rename from emgapi/migrations/0012_auto_20231020_1525.py rename to emgapi/migrations/0013_auto_20231110_1329.py index 296ffd821..998fc7202 100644 --- a/emgapi/migrations/0012_auto_20231020_1525.py +++ b/emgapi/migrations/0013_auto_20231110_1329.py 
@@ -1,4 +1,4 @@ -# Generated by Django 3.2.18 on 2023-10-20 15:25 +# Generated by Django 3.2.18 on 2023-11-10 13:29 from django.db import migrations, models @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ('emgapi', '0011_analysisjob_analysis_summary_json'), + ('emgapi', '0012_alter_publication_pub_type'), ] operations = [ @@ -20,4 +20,14 @@ class Migration(migrations.Migration): name='last_update', field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), ), + migrations.AddField( + model_name='study', + name='last_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), + migrations.AlterField( + model_name='study', + name='last_update', + field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), + ), ] diff --git a/emgapi/migrations/0013_study_last_indexed.py b/emgapi/migrations/0013_study_last_indexed.py deleted file mode 100644 index e141ee7fd..000000000 --- a/emgapi/migrations/0013_study_last_indexed.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 3.2.18 on 2023-10-23 16:41 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0012_auto_20231020_1525'), - ] - - operations = [ - migrations.AddField( - model_name='study', - name='last_indexed', - field=models.DateTimeField(blank=True, db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), - ), - ] diff --git a/emgapi/migrations/0014_alter_study_last_update.py b/emgapi/migrations/0014_alter_study_last_update.py deleted file mode 100644 index 4eb998c33..000000000 --- a/emgapi/migrations/0014_alter_study_last_update.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 3.2.18 on 2023-10-23 17:16 - -from django.db import migrations, models - - -class 
Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0013_study_last_indexed'), - ] - - operations = [ - migrations.AlterField( - model_name='study', - name='last_update', - field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), - ), - ] From ab829f469bc2ed8ab9e5deb142954f9ab77b22d0 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:14:54 +0000 Subject: [PATCH 05/46] adds support for chunking of analysis-runs ebi search dump --- emgapi/management/commands/ebi_search_analysis_dump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emgapi/management/commands/ebi_search_analysis_dump.py b/emgapi/management/commands/ebi_search_analysis_dump.py index 39d9fd12d..76f3bacef 100644 --- a/emgapi/management/commands/ebi_search_analysis_dump.py +++ b/emgapi/management/commands/ebi_search_analysis_dump.py @@ -60,7 +60,7 @@ def get_analysis_context(self, analysis: AnalysisJob): try: go_annotation: Optional[AnalysisJobGoTerm] = AnalysisJobGoTerm.objects.get( - pk=str(analysis.job_id) + pk=analysis.job_id ) except AnalysisJobGoTerm.DoesNotExist: logger.debug(f"Could not find go terms for {analysis.job_id}") @@ -68,7 +68,7 @@ def get_analysis_context(self, analysis: AnalysisJob): try: ips_annotation: Optional[AnalysisJobInterproIdentifier] = AnalysisJobInterproIdentifier.objects.get( - pk=str(analysis.job_id) + pk=analysis.job_id ) except AnalysisJobInterproIdentifier.DoesNotExist: logger.debug(f"Could not find IPS terms for {analysis.job_id}") From fcb9300c48193ad3d67d613bde5a51e92eb72ae5 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:16:40 +0000 Subject: [PATCH 06/46] bugfixes on analysis-run dumper --- emgapi/management/commands/ebi_search_analysis_dump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emgapi/management/commands/ebi_search_analysis_dump.py b/emgapi/management/commands/ebi_search_analysis_dump.py index 76f3bacef..39d9fd12d 100644 --- 
a/emgapi/management/commands/ebi_search_analysis_dump.py +++ b/emgapi/management/commands/ebi_search_analysis_dump.py @@ -60,7 +60,7 @@ def get_analysis_context(self, analysis: AnalysisJob): try: go_annotation: Optional[AnalysisJobGoTerm] = AnalysisJobGoTerm.objects.get( - pk=analysis.job_id + pk=str(analysis.job_id) ) except AnalysisJobGoTerm.DoesNotExist: logger.debug(f"Could not find go terms for {analysis.job_id}") @@ -68,7 +68,7 @@ def get_analysis_context(self, analysis: AnalysisJob): try: ips_annotation: Optional[AnalysisJobInterproIdentifier] = AnalysisJobInterproIdentifier.objects.get( - pk=analysis.job_id + pk=str(analysis.job_id) ) except AnalysisJobInterproIdentifier.DoesNotExist: logger.debug(f"Could not find IPS terms for {analysis.job_id}") From 6438c3bad58394296d8d2db643aa8c8078f65063 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 11 Sep 2023 13:34:31 +0100 Subject: [PATCH 07/46] Add metagenomics exchange support --- emgapi/management/commands/mgx_api.py | 188 +++++++++++++++++++ emgapi/migrations/0010_auto_20230908_1346.py | 35 ++++ emgapi/models.py | 16 +- tests/me/test_api.py | 31 +++ 4 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 emgapi/management/commands/mgx_api.py create mode 100644 emgapi/migrations/0010_auto_20230908_1346.py create mode 100644 tests/me/test_api.py diff --git a/emgapi/management/commands/mgx_api.py b/emgapi/management/commands/mgx_api.py new file mode 100644 index 000000000..83e7eae7c --- /dev/null +++ b/emgapi/management/commands/mgx_api.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2017-2022 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import requests + +from django.db.models import Count +from django.core.management import BaseCommand +from django.conf import settings + +from emgapi.models import Assembly, AnalysisJob + +logger = logging.getLogger(__name__) +API = { + 'real': 'https://www.ebi.ac.uk/ena/registry/metagenome/api/', + 'dev': 'http://wp-np2-5c:8080/ena/registry/metagenome/api/' +} +TOKEN = 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' + +RETRY_COUNT = 5 + +class Command(BaseCommand): + help = "Check and populate metagenomics exchange." + def add_arguments(self, parser): + super(Command, self).add_arguments(parser) + parser.add_argument( + "-s", + "--study", + action="store", + required=False, + type=str, + help="Study accession (rather than all)", + ) + parser.add_argument( + "-p", + "--pipeline", + help="Pipeline version (rather than all). 
Not applicable to Genomes.", + action="store", + dest="pipeline", + choices=[1.0, 2.0, 3.0, 4.0, 4.1, 5.0], + required=False, + type=float, + ) + parser.add_argument( + "-z", + "--assembly", + action="store", + required=False, + type=str, + help="Assembly accession (rather than all)", + nargs="+" + ) + parser.add_argument( + "-r", + "--run", + action="store", + required=False, + type=str, + help="Run accession (rather than all)", + nargs="+" + ) + parser.add_argument( + "--dev", + action="store_true", + required=False, + help="Populate dev API", + ) + parser.add_argument( + "-b", + "--broker", + action="store", + required=False, + type=str, + default='EMG', + help="Broker name", + choices=["EMG", "MAR"], + ) + def get_request(self, url, params): + if self.auth: + response = requests.get(url, params=params, headers=self.auth) + else: + logging.warning( + "Not authenticated to get private data." + # noqa: E501 + ) + response = requests.get(url, params=params) + return response + + def _check_analysis(self, sourceID, public): + params = { + 'status': 'public' if public else 'private', + } + response = self.get_request(url=self.check_url, params=params) + exists = False + if not response.ok: + logging.error( + "Error retrieving dataset {}, response code: {}".format( + self.check_url, response.status_code + ) + ) + else: + data = response.json()['datasets'] + for item in data: + if item['sourceID'] == sourceID: + exists = True + return exists + + def _post_request(self, url, data): + default = { + "headers": { + "Content-Type": "application/json", + "Accept": "application/json", + } + } + if self.auth: + default["headers"].update(self.auth) + response = requests.post( + url, json=data, **default + ) + else: + logging.warning( + "Not authenticated to POST private data." 
+ # noqa: E501 + ) + response = requests.post( + url, json=data, **default + ) + return response + + def _filtering_analyses(self): + analyses = AnalysisJob.objects.all() + if self.study_accession: + analyses = analyses.filter(study__secondary_accession=self.study_accession) + if self.run_accession: + analyses = analyses.filter(run__accession=self.run_accession) + if self.assembly_accession: + analyses = analyses.filter(assembly__accession=self.assembly_accession) + if self.pipeline_version: + analyses = analyses.filter(pipeline__pipeline_id=self.pipeline_version) + return analyses + def handle(self, *args, **options): + self.auth = None + self.url = API['dev'] + self.broker = options.get("broker") + self.check_url = self.url + f'brokers/{self.broker}/datasets' + + self.study_accession = options.get("study") + self.pipeline_version = options.get("pipeline") + self.assembly_accession = options.get("assembly") + self.run_accession = options.get("run") + + analyses = self._filtering_analyses() + for analysis in analyses: + MGYA = analysis.accession + public = not analysis.is_private + if not public: + self.auth = {"Authorization": TOKEN} + # check is MGYA in ME + if (self._check_analysis(sourceID=MGYA, public=public)): + logging.info(f"{MGYA} exists in ME") + else: + logging.info(f"{MGYA} does not exist in ME") + data = { + 'confidence': 'full', + 'endPoint': f'https://www.ebi.ac.uk/metagenomics/analyses/{MGYA}', + 'method': ['other_metadata'], + 'sourceID': MGYA, + 'sequenceID': analysis.run.accesison, + 'status': 'public' if public else 'private' + } + #self._post_request(url=self.url+'datasets', data=data) + + + diff --git a/emgapi/migrations/0010_auto_20230908_1346.py b/emgapi/migrations/0010_auto_20230908_1346.py new file mode 100644 index 000000000..10677b3ca --- /dev/null +++ b/emgapi/migrations/0010_auto_20230908_1346.py @@ -0,0 +1,35 @@ +# Generated by Django 3.2.18 on 2023-09-08 13:46 + +from django.db import migrations, models +import 
django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0009_genome_annotations_v2_downloads'), + ] + + operations = [ + migrations.CreateModel( + name='ME_Broker', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('brokerID', models.CharField(blank=True, db_column='BROKER', max_length=10, null=True)), + ], + ), + migrations.CreateModel( + name='MetagenomicsExchange', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('first_created', models.DateTimeField(auto_now_add=True, db_column='FIRST_CREATED')), + ('last_update', models.DateTimeField(auto_now=True, db_column='LAST_UPDATE')), + ('broker_id', models.ForeignKey(db_column='BROKER', on_delete=django.db.models.deletion.CASCADE, to='emgapi.me_broker')), + ], + ), + migrations.AddField( + model_name='analysisjob', + name='metagenomics_exchange', + field=models.ForeignKey(blank=True, db_column='METAGENOMICS_EXCHANGE', null=True, on_delete=django.db.models.deletion.CASCADE, to='emgapi.metagenomicsexchange'), + ), + ] diff --git a/emgapi/models.py b/emgapi/models.py index 9d0830c04..5c7cac177 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -1694,6 +1694,17 @@ def get_queryset(self): .select_related( 'analysis_status','experiment_type', 'assembly', 'pipeline', 'run', 'sample', 'study') +class ME_Broker(models.Model): + brokerID = models.CharField(db_column='BROKER', max_length=10, null=True, blank=True) + +class MetagenomicsExchange(models.Model): + """Table to track Metagenomics Exchange population + https://www.ebi.ac.uk/ena/registry/metagenome/api/ + """ + broker_id = models.ForeignKey(ME_Broker, db_column='BROKER', on_delete=models.CASCADE) + first_created = models.DateTimeField(db_column='FIRST_CREATED', auto_now_add=True) + last_update = models.DateTimeField(db_column='LAST_UPDATE', auto_now=True) + class 
AnalysisJob(SuppressibleModel, PrivacyControlledModel, EbiSearchIndexedModel): def __init__(self, *args, **kwargs): @@ -1758,6 +1769,9 @@ def _custom_pk(self): instrument_model = models.CharField( db_column='INSTRUMENT_MODEL', max_length=50, blank=True, null=True) + metagenomics_exchange = models.ForeignKey( + MetagenomicsExchange, db_column='METAGENOMICS_EXCHANGE', + on_delete=models.CASCADE, blank=True, null=True) @property def release_version(self): @@ -2273,4 +2287,4 @@ def __str__(self): return f"Legacy Assembly:{self.legacy_accession} - New Accession:{self.new_accession}" models.CharField.register_lookup(Search) -models.TextField.register_lookup(Search) +models.TextField.register_lookup(Search) \ No newline at end of file diff --git a/tests/me/test_api.py b/tests/me/test_api.py new file mode 100644 index 000000000..70359707e --- /dev/null +++ b/tests/me/test_api.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2020 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import os + +from django.core.management import call_command + +from test_utils.emg_fixtures import * # noqa + +@pytest.mark.django_db +class TestMeAPI: + def test_new_analysis(self, run_multiple_analysis): + call_command( + "mgx_api", + run="ABC01234", + pipeline=5.0, + ) \ No newline at end of file From 64f4900cf45376d5eea8da70dacc512586f9a8f3 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:17:31 +0000 Subject: [PATCH 08/46] Added 2 tests for ME, population script WIP --- emgapi/management/commands/mgx_api.py | 63 +++++++--------- .../commands/populate_existing_me.py | 74 +++++++++++++++++++ emgapi/migrations/0011_auto_20230912_1346.py | 23 ++++++ emgapi/models.py | 57 +++++++++++++- emgcli/settings.py | 7 ++ tests/me/test_api.py | 66 +++++++++++++++-- 6 files changed, 245 insertions(+), 45 deletions(-) create mode 100644 emgapi/management/commands/populate_existing_me.py create mode 100644 emgapi/migrations/0011_auto_20230912_1346.py diff --git a/emgapi/management/commands/mgx_api.py b/emgapi/management/commands/mgx_api.py index 83e7eae7c..cc1edf677 100644 --- a/emgapi/management/commands/mgx_api.py +++ b/emgapi/management/commands/mgx_api.py @@ -15,21 +15,14 @@ # limitations under the License. 
import logging -import os import requests -from django.db.models import Count from django.core.management import BaseCommand from django.conf import settings -from emgapi.models import Assembly, AnalysisJob +from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker logger = logging.getLogger(__name__) -API = { - 'real': 'https://www.ebi.ac.uk/ena/registry/metagenome/api/', - 'dev': 'http://wp-np2-5c:8080/ena/registry/metagenome/api/' -} -TOKEN = 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' RETRY_COUNT = 5 @@ -119,28 +112,37 @@ def _check_analysis(self, sourceID, public): exists = True return exists - def _post_request(self, url, data): + def post_request(self, auth, url, data): default = { "headers": { "Content-Type": "application/json", "Accept": "application/json", } } - if self.auth: - default["headers"].update(self.auth) - response = requests.post( - url, json=data, **default - ) - else: - logging.warning( - "Not authenticated to POST private data." - # noqa: E501 - ) - response = requests.post( - url, json=data, **default - ) + default["headers"].update(auth) + response = requests.post( + url, json=data, **default + ) + print(response.text) + if response.ok: + logging.info("Data added to ME") return response + def _add_record_me(self, analysis, public): + data = { + 'confidence': 'full', + 'endPoint': f'https://www.ebi.ac.uk/metagenomics/analyses/{analysis.accession}', + 'method': ['other_metadata'], + 'sourceID': analysis.accession, + 'sequenceID': analysis.run.secondary_accession, + 'status': 'public' if public else 'private', + 'brokerID': self.broker, + } + response = self.post_request(auth=self.auth, url=self.url + 'datasets', data=data) + if response.ok: + broker = ME_Broker.objects.get_or_create(name=self.broker) + MetagenomicsExchange.objects.get_or_create(analysis=analysis, broker=broker) + def _filtering_analyses(self): analyses = AnalysisJob.objects.all() if self.study_accession: @@ -153,8 +155,8 @@ def _filtering_analyses(self): analyses = 
analyses.filter(pipeline__pipeline_id=self.pipeline_version) return analyses def handle(self, *args, **options): - self.auth = None - self.url = API['dev'] + self.auth = {"Authorization": settings.ME_TOKEN} + self.url = settings.ME_API['dev'] self.broker = options.get("broker") self.check_url = self.url + f'brokers/{self.broker}/datasets' @@ -167,22 +169,11 @@ def handle(self, *args, **options): for analysis in analyses: MGYA = analysis.accession public = not analysis.is_private - if not public: - self.auth = {"Authorization": TOKEN} # check is MGYA in ME if (self._check_analysis(sourceID=MGYA, public=public)): logging.info(f"{MGYA} exists in ME") else: logging.info(f"{MGYA} does not exist in ME") - data = { - 'confidence': 'full', - 'endPoint': f'https://www.ebi.ac.uk/metagenomics/analyses/{MGYA}', - 'method': ['other_metadata'], - 'sourceID': MGYA, - 'sequenceID': analysis.run.accesison, - 'status': 'public' if public else 'private' - } - #self._post_request(url=self.url+'datasets', data=data) - + self._add_record_me(analysis, public) diff --git a/emgapi/management/commands/populate_existing_me.py b/emgapi/management/commands/populate_existing_me.py new file mode 100644 index 000000000..9a031092e --- /dev/null +++ b/emgapi/management/commands/populate_existing_me.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2017-2022 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import requests + +from django.db.models import Count +from django.core.management import BaseCommand +from django.conf import settings + +from emgapi.models import Assembly, AnalysisJob + +logger = logging.getLogger(__name__) +API = { + 'real': 'https://www.ebi.ac.uk/ena/registry/metagenome/api/', + 'dev': 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api/' +} +TOKEN = 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' + +RETRY_COUNT = 5 + +class Command(BaseCommand): + help = "Populate DB with existing metagenomics exchange" + def add_arguments(self, parser): + super(Command, self).add_arguments(parser) + + parser.add_argument( + "--dev", + action="store_true", + required=False, + help="Populate dev API", + ) + parser.add_argument( + "-b", + "--broker", + action="store", + required=False, + type=str, + default='EMG', + help="Broker name", + choices=["EMG", "MAR"], + ) + def get_request(self, url, params): + if self.auth: + response = requests.get(url, params=params, headers=self.auth) + else: + logging.warning( + "Not authenticated to get private data." 
+ # noqa: E501 + ) + response = requests.get(url, params=params) + return response + + def handle(self, *args, **options): + self.auth = None + self.url = API['dev'] + self.broker = options.get("broker") + self.check_url = self.url + f'brokers/{self.broker}/datasets' + + diff --git a/emgapi/migrations/0011_auto_20230912_1346.py b/emgapi/migrations/0011_auto_20230912_1346.py new file mode 100644 index 000000000..3477ab5ca --- /dev/null +++ b/emgapi/migrations/0011_auto_20230912_1346.py @@ -0,0 +1,23 @@ +# Generated by Django 3.2.18 on 2023-09-12 13:46 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0010_auto_20230908_1346'), + ] + + operations = [ + migrations.RemoveField( + model_name='analysisjob', + name='metagenomics_exchange', + ), + migrations.AddField( + model_name='metagenomicsexchange', + name='analysis_id', + field=models.ForeignKey(blank=True, db_column='ANALYSIS_ID', null=True, on_delete=django.db.models.deletion.CASCADE, to='emgapi.analysisjob'), + ), + ] diff --git a/emgapi/models.py b/emgapi/models.py index 5c7cac177..f30a72993 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -18,7 +18,7 @@ from django.apps import apps from django.conf import settings -from django.core.exceptions import FieldDoesNotExist +from django.core.exceptions import FieldDoesNotExist, ObjectDoesNotExist from django.db import models from django.db.models import (CharField, Count, OuterRef, Prefetch, Q, Subquery, Value, QuerySet, F) @@ -1769,9 +1769,6 @@ def _custom_pk(self): instrument_model = models.CharField( db_column='INSTRUMENT_MODEL', max_length=50, blank=True, null=True) - metagenomics_exchange = models.ForeignKey( - MetagenomicsExchange, db_column='METAGENOMICS_EXCHANGE', - on_delete=models.CASCADE, blank=True, null=True) @property def release_version(self): @@ -1793,6 +1790,58 @@ class Meta: def __str__(self): return self.accession +class 
ME_BrokerManager(models.Manager): + def get_or_create(self, name): + try: + broker = ME_Broker.objects.get( + brokerID=name + ) + except ObjectDoesNotExist: + logging.warning(f"{name} not in ME db, creating.") + broker = ME_Broker( + brokerID=name, + ) + broker.save() + logging.info(f"Created record in ME_broker for {name}") + return broker + +class ME_Broker(models.Model): + brokerID = models.CharField(db_column='BROKER', max_length=10, null=True, blank=True) + + objects = ME_BrokerManager() + objects_admin = models.Manager() + + +class MetagenomicsExchangeManager(models.Manager): + def get_or_create(self, analysis, broker=None): + try: + me_record = MetagenomicsExchange.objects.get( + analysis=analysis + ) + except ObjectDoesNotExist: + logging.warning(f"{analysis.accession} not in ME db, creating.") + me_record = MetagenomicsExchange( + analysis=analysis, + ) + if broker: + me_record.broker = broker + me_record.save() + logging.info(f"Created record ID: {me_record.id}") + return me_record + +class MetagenomicsExchange(models.Model): + """Table to track Metagenomics Exchange population + https://www.ebi.ac.uk/ena/registry/metagenome/api/ + """ + broker = models.ForeignKey(ME_Broker, db_column='BROKER', on_delete=models.CASCADE) + analysis = models.ForeignKey(AnalysisJob, db_column='ANALYSIS_ID', on_delete=models.CASCADE, blank=True, + null=True) + first_created = models.DateTimeField(db_column='FIRST_CREATED', auto_now_add=True) + last_update = models.DateTimeField(db_column='LAST_UPDATE', auto_now=True) + + objects = MetagenomicsExchangeManager() + objects_admin = models.Manager() + class StudyErrorType(models.Model): error_id = models.IntegerField( diff --git a/emgcli/settings.py b/emgcli/settings.py index 4fb645d91..bad609ec1 100644 --- a/emgcli/settings.py +++ b/emgcli/settings.py @@ -682,3 +682,10 @@ def create_secret_key(var_dir): os.environ['ENA_API_USER'] = EMG_CONF['emg']['ena_api_user'] if 'ena_api_password' in EMG_CONF['emg']: 
os.environ['ENA_API_PASSWORD'] = EMG_CONF['emg']['ena_api_password'] + +# Metagenomics Exchange +ME_API = { + 'real': 'https://www.ebi.ac.uk/ena/registry/metagenome/api/', + 'dev': 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api/' +} +ME_TOKEN = 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' diff --git a/tests/me/test_api.py b/tests/me/test_api.py index 70359707e..4763cba02 100644 --- a/tests/me/test_api.py +++ b/tests/me/test_api.py @@ -15,17 +15,73 @@ # limitations under the License. import pytest -import os +from unittest.mock import patch from django.core.management import call_command from test_utils.emg_fixtures import * # noqa +from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker + @pytest.mark.django_db class TestMeAPI: - def test_new_analysis(self, run_multiple_analysis): + @patch("emgapi.management.commands.mgx_api.Command.post_request") + def test_new_analysis_population( + self, + mock_post_request, + run_multiple_analysis + ): + class MockResponse: + def __init__(self, ok, status_code): + self.ok = ok + self.status_code = status_code + + mock_post_request.return_value = MockResponse(True, 200) + test_run = "ABC01234" + test_pipeline_version = 5.0 + test_broker = 'EMG' + + call_command( + "mgx_api", + run=test_run, + pipeline=test_pipeline_version, + broker=test_broker + ) + analysis = AnalysisJob.objects.filter(run__accession=test_run).filter(pipeline__pipeline_id=test_pipeline_version).first() + assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 + assert MetagenomicsExchange.objects.filter(analysis=analysis).count() == 1 + + @patch("emgapi.management.commands.mgx_api.Command.get_request") + def test_check_existing_analysis( + self, + mock_get_request, + run_multiple_analysis, + me_broker, + metagenomics_exchange, + ): + class MockResponse: + def __init__(self, ok, status_code, json_data): + self.ok = ok + self.status_code = status_code + self.json_data = json_data + + def json(self): + return self.json_data + + 
test_run = "ABC01234" + test_pipeline_version = 1.0 + test_broker = 'MAR' + analysis = AnalysisJob.objects.filter(run__accession=test_run).filter( + pipeline__pipeline_id=test_pipeline_version).first() + json_data = {'datasets': [{'sourceID': analysis.accession}] } + mock_get_request.return_value = MockResponse(True, 200, json_data) + call_command( "mgx_api", - run="ABC01234", - pipeline=5.0, - ) \ No newline at end of file + run=test_run, + pipeline=test_pipeline_version, + broker=test_broker + ) + + assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 + assert MetagenomicsExchange.objects.filter(analysis=analysis).count() == 1 \ No newline at end of file From 7484116a807c732e5443879c20243463a9928cff Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Thu, 28 Sep 2023 17:00:28 +0100 Subject: [PATCH 09/46] Move metagenomic exchange commands to separate Class. Tests fail in POST request --- emgapi/management/commands/mgx_api.py | 78 +++-------------- emgapi/metagenomics_exchange.py | 111 +++++++++++++++++++++++++ tests/me/test_api.py | 6 +- tests/me/test_metagenomics_exchange.py | 30 +++++++ 4 files changed, 154 insertions(+), 71 deletions(-) create mode 100644 emgapi/metagenomics_exchange.py create mode 100644 tests/me/test_metagenomics_exchange.py diff --git a/emgapi/management/commands/mgx_api.py b/emgapi/management/commands/mgx_api.py index cc1edf677..5e795bf7a 100644 --- a/emgapi/management/commands/mgx_api.py +++ b/emgapi/management/commands/mgx_api.py @@ -15,12 +15,12 @@ # limitations under the License. 
import logging -import requests from django.core.management import BaseCommand from django.conf import settings from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker +from metagenomics_exchange import MetagenomicsExchangeAPI logger = logging.getLogger(__name__) @@ -82,66 +82,6 @@ def add_arguments(self, parser): help="Broker name", choices=["EMG", "MAR"], ) - def get_request(self, url, params): - if self.auth: - response = requests.get(url, params=params, headers=self.auth) - else: - logging.warning( - "Not authenticated to get private data." - # noqa: E501 - ) - response = requests.get(url, params=params) - return response - - def _check_analysis(self, sourceID, public): - params = { - 'status': 'public' if public else 'private', - } - response = self.get_request(url=self.check_url, params=params) - exists = False - if not response.ok: - logging.error( - "Error retrieving dataset {}, response code: {}".format( - self.check_url, response.status_code - ) - ) - else: - data = response.json()['datasets'] - for item in data: - if item['sourceID'] == sourceID: - exists = True - return exists - - def post_request(self, auth, url, data): - default = { - "headers": { - "Content-Type": "application/json", - "Accept": "application/json", - } - } - default["headers"].update(auth) - response = requests.post( - url, json=data, **default - ) - print(response.text) - if response.ok: - logging.info("Data added to ME") - return response - - def _add_record_me(self, analysis, public): - data = { - 'confidence': 'full', - 'endPoint': f'https://www.ebi.ac.uk/metagenomics/analyses/{analysis.accession}', - 'method': ['other_metadata'], - 'sourceID': analysis.accession, - 'sequenceID': analysis.run.secondary_accession, - 'status': 'public' if public else 'private', - 'brokerID': self.broker, - } - response = self.post_request(auth=self.auth, url=self.url + 'datasets', data=data) - if response.ok: - broker = ME_Broker.objects.get_or_create(name=self.broker) - 
MetagenomicsExchange.objects.get_or_create(analysis=analysis, broker=broker) def _filtering_analyses(self): analyses = AnalysisJob.objects.all() @@ -154,26 +94,28 @@ def _filtering_analyses(self): if self.pipeline_version: analyses = analyses.filter(pipeline__pipeline_id=self.pipeline_version) return analyses - def handle(self, *args, **options): - self.auth = {"Authorization": settings.ME_TOKEN} - self.url = settings.ME_API['dev'] - self.broker = options.get("broker") - self.check_url = self.url + f'brokers/{self.broker}/datasets' + def handle(self, *args, **options): self.study_accession = options.get("study") self.pipeline_version = options.get("pipeline") self.assembly_accession = options.get("assembly") self.run_accession = options.get("run") + broker = options.get("broker") + url = settings.ME_API['dev'] + check_url = url + f'brokers/{self.broker}/datasets' + ME = MetagenomicsExchangeAPI() analyses = self._filtering_analyses() + for analysis in analyses: MGYA = analysis.accession public = not analysis.is_private # check is MGYA in ME - if (self._check_analysis(sourceID=MGYA, public=public)): + if ME.check_analysis(url=check_url, sourceID=MGYA, public=public, token=settings.ME_TOKEN): logging.info(f"{MGYA} exists in ME") else: logging.info(f"{MGYA} does not exist in ME") - self._add_record_me(analysis, public) + ME.add_record(url=url, mgya=MGYA, run_accession=analysis.run.accesison, public=public, broker=broker, + token=settings.ME_TOKEN) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py new file mode 100644 index 000000000..e46e3bb5f --- /dev/null +++ b/emgapi/metagenomics_exchange.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2018-2021 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import logging +import requests + +class MetagenomicsExchangeAPI: + """Metagenomics Exchange API Client""" + def get_request(self, session, url, params): + response = session.get(url, params=params) + data = None + if not response.ok: + logging.error( + "Error retrieving dataset {}, response code: {}".format( + url, response.status_code + ) + ) + else: + data = response.json() + return data + + def post_request(self, session, url, data): + default = { + "headers": { + "Content-Type": "application/json", + "Accept": "application/json", + } + } + + response = session.post( + url, json=data, **default + ) + + if response.ok: + print('Added') + logging.info("Data added to ME") + logging.debug(response.text) + else: + print(response.text) + return response + def add_record( + self, url, mgya, run_accession, public, broker, token + ): + data = { + 'confidence': 'full', + 'endPoint': f'https://www.ebi.ac.uk/metagenomics/analyses/mgya', + 'method': ['other_metadata'], + 'sourceID': mgya, + 'sequenceID': run_accession, + 'status': 'public' if public else 'private', + 'brokerID': broker, + } + + with requests.Session() as session: + url = url + 'datasets' + session = self._authenticate_session(session, url, token) + print(session) + response = self.post_request(session=session, url=url, data=data) + data = response.json() + return data + + def check_analysis(self, url, sourceID, public, token): + logging.info(f'Check {sourceID}') + params = { + 'status': 'public' if public else 'private', + } + with requests.Session() as session: + session = 
self._authenticate_session(session, url, token) + response = self.get_request(session=session, url=url, params=params) + if response: + data = response['datasets'] + exists = False + for item in data: + if item['sourceID'] == sourceID: + exists = True + logging.info(f"{sourceID} exists: {exists}") + return exists + + def _authenticate_session(self, session, url, token): + """Authenticate the MGnify API request""" + + logging.debug(f"Authenticating ME account") + + headers = {"Authorization": token} + + response = session.get(url, headers=headers) + + if response.ok: + logging.debug("ME account successfully authenticated.") + print(f'Auth {url} {response}') + else: + print(response) + # Log textual reason of responded HTTP Status + logging.error(f"Authentication services responded: {response.reason}). Program will exit now.") + sys.exit(1) + + return session diff --git a/tests/me/test_api.py b/tests/me/test_api.py index 4763cba02..ee09abdba 100644 --- a/tests/me/test_api.py +++ b/tests/me/test_api.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +""" import pytest from unittest.mock import patch @@ -67,7 +67,6 @@ def __init__(self, ok, status_code, json_data): def json(self): return self.json_data - test_run = "ABC01234" test_pipeline_version = 1.0 test_broker = 'MAR' @@ -84,4 +83,5 @@ def json(self): ) assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 - assert MetagenomicsExchange.objects.filter(analysis=analysis).count() == 1 \ No newline at end of file + assert MetagenomicsExchange.objects.filter(analysis=analysis).count() == 1 + """ \ No newline at end of file diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py new file mode 100644 index 000000000..91da7e36d --- /dev/null +++ b/tests/me/test_metagenomics_exchange.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from emgapi import metagenomics_exchange as ME +from django.conf import settings + +class TestME: + + test_ME = ME.MetagenomicsExchangeAPI() + def test_check_existing_analysis(self): + sourceID = "MGYA00293719" + broker = 'EMG' + url = settings.ME_API['dev'] + f'brokers/{broker}/datasets' + token = settings.ME_TOKEN + assert self.test_ME.check_analysis(url, sourceID, True, token) + + def test_check_not_existing_analysis(self): + sourceID = "MGYA00293719" + broker = 'MAR' + url = settings.ME_API['dev'] + f'brokers/{broker}/datasets' + token = settings.ME_TOKEN + assert not self.test_ME.check_analysis(url, sourceID, True, token) + + def test_post_existing_analysis(self): + sourceID = "MGYA00293719" + broker = 'EMG' + url = settings.ME_API['dev'] + token = settings.ME_TOKEN + assert not self.test_ME.add_record(url=url, mgya=sourceID, run_accession="ERR3063408", public=True, + broker=broker, token=token) From fee19bdee0e845de0485600c904a236675848308 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 29 Sep 2023 15:40:18 +0100 Subject: [PATCH 10/46] My revised version: - Refactored the cli wrapper - Refactored the settings to be in line with the rest - Modified 
the unit tests --- config/local-tests.yml | 5 +- emgapi/management/commands/mgx_api.py | 1 + emgapi/metagenomics_exchange.py | 128 ++++++++++--------------- emgcli/settings.py | 12 ++- tests/me/test_metagenomics_exchange.py | 38 ++++---- 5 files changed, 79 insertions(+), 105 deletions(-) diff --git a/config/local-tests.yml b/config/local-tests.yml index bf28e5b31..fa833d284 100644 --- a/config/local-tests.yml +++ b/config/local-tests.yml @@ -14,4 +14,7 @@ emg: results_path: 'fixtures/' celery_broker: 'redis://localhost:6379/0' celery_backend: 'redis://localhost:6379/1' - results_production_dir: '/dummy/path/results' \ No newline at end of file + results_production_dir: '/dummy/path/results' + # metagenomics exchange + me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' + me_api_token: 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' \ No newline at end of file diff --git a/emgapi/management/commands/mgx_api.py b/emgapi/management/commands/mgx_api.py index 5e795bf7a..7bbb1ce9d 100644 --- a/emgapi/management/commands/mgx_api.py +++ b/emgapi/management/commands/mgx_api.py @@ -101,6 +101,7 @@ def handle(self, *args, **options): self.assembly_accession = options.get("assembly") self.run_accession = options.get("run") + # FIXME: this command needs adjustments broker = options.get("broker") url = settings.ME_API['dev'] check_url = url + f'brokers/{self.broker}/datasets' diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index e46e3bb5f..166fd617e 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 2018-2021 EMBL - European Bioinformatics Institute +# Copyright 2018-2023 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,98 +14,68 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import logging import requests +from django.conf import settings + + class MetagenomicsExchangeAPI: + """Metagenomics Exchange API Client""" - def get_request(self, session, url, params): - response = session.get(url, params=params) - data = None - if not response.ok: - logging.error( - "Error retrieving dataset {}, response code: {}".format( - url, response.status_code - ) - ) - else: - data = response.json() - return data - def post_request(self, session, url, data): - default = { - "headers": { - "Content-Type": "application/json", - "Accept": "application/json", - } - } + def __init__(self, broker="EMG"): + self.base_url = settings.ME_API + self.__token = settings.ME_API_TOKEN + self.broker = broker - response = session.post( - url, json=data, **default + def get_request(self, endpoint: str, params: dict): + """Make a GET request, returns the response""" + headers = {"Accept": "application/json", "Authorization": self.__token} + response = requests.get( + f"{self.base_url}/{endpoint}", headers=headers, params=params ) + response.raise_for_status() + return response - if response.ok: - print('Added') - logging.info("Data added to ME") - logging.debug(response.text) - else: - print(response.text) + def post_request(self, endpoint: str, data: dict): + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + "Authorization": self.__token, + } + response = requests.post( + f"{self.base_url}/{endpoint}", json=data, headers=headers + ) + response.raise_for_status() return response - def add_record( - self, url, mgya, run_accession, public, broker, token - ): + + def add_record(self, mgya: str, run_accession: str, public: bool): data = { - 'confidence': 'full', - 'endPoint': f'https://www.ebi.ac.uk/metagenomics/analyses/mgya', - 'method': ['other_metadata'], - 'sourceID': mgya, - 'sequenceID': run_accession, - 
'status': 'public' if public else 'private', - 'brokerID': broker, + "confidence": "full", + "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/mgya", + "method": ["other_metadata"], + "sourceID": mgya, + "sequenceID": run_accession, + "status": "public" if public else "private", + "brokerID": self.broker, } + response = self.post_request(endpoint="datasets", data=data) + return response.json() - with requests.Session() as session: - url = url + 'datasets' - session = self._authenticate_session(session, url, token) - print(session) - response = self.post_request(session=session, url=url, data=data) - data = response.json() - return data - - def check_analysis(self, url, sourceID, public, token): - logging.info(f'Check {sourceID}') + def check_analysis(self, source_id: str, public: bool) -> bool: + logging.info(f"Check {source_id}") params = { - 'status': 'public' if public else 'private', + "status": "public" if public else "private", } - with requests.Session() as session: - session = self._authenticate_session(session, url, token) - response = self.get_request(session=session, url=url, params=params) - if response: - data = response['datasets'] - exists = False - for item in data: - if item['sourceID'] == sourceID: - exists = True - logging.info(f"{sourceID} exists: {exists}") - return exists - - def _authenticate_session(self, session, url, token): - """Authenticate the MGnify API request""" - - logging.debug(f"Authenticating ME account") - - headers = {"Authorization": token} - - response = session.get(url, headers=headers) - + endpoint = f"brokers/{self.broker}/datasets" + response = self.get_request(endpoint=endpoint, params=params) if response.ok: - logging.debug("ME account successfully authenticated.") - print(f'Auth {url} {response}') - else: - print(response) - # Log textual reason of responded HTTP Status - logging.error(f"Authentication services responded: {response.reason}). 
Program will exit now.") - sys.exit(1) + data = response.json() + datasets = data.get("datasets") + for item in datasets: + if item.get("sourceID") == source_id: + return True + logging.info(f"{source_id} exists") - return session + return False diff --git a/emgcli/settings.py b/emgcli/settings.py index bad609ec1..53187bf5c 100644 --- a/emgcli/settings.py +++ b/emgcli/settings.py @@ -684,8 +684,10 @@ def create_secret_key(var_dir): os.environ['ENA_API_PASSWORD'] = EMG_CONF['emg']['ena_api_password'] # Metagenomics Exchange -ME_API = { - 'real': 'https://www.ebi.ac.uk/ena/registry/metagenome/api/', - 'dev': 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api/' -} -ME_TOKEN = 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' +try: + ME_API = EMG_CONF['emg']['me_api'] + ME_API_TOKEN = EMG_CONF['emg']['me_api_token'] +except KeyError: + ME_API = "" + ME_API_TOKEN = "" + warnings.warn("The metagenomics exchange API and Token are not configured properly") diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index 91da7e36d..e59ea4a33 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -1,30 +1,28 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from emgapi import metagenomics_exchange as ME -from django.conf import settings +import pytest +from unittest import mock -class TestME: +from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI + +from requests import HTTPError - test_ME = ME.MetagenomicsExchangeAPI() + +class TestME: def test_check_existing_analysis(self): - sourceID = "MGYA00293719" - broker = 'EMG' - url = settings.ME_API['dev'] + f'brokers/{broker}/datasets' - token = settings.ME_TOKEN - assert self.test_ME.check_analysis(url, sourceID, True, token) + me_api = MetagenomicsExchangeAPI() + source_id = "MGYA00293719" + assert me_api.check_analysis(source_id, True) def test_check_not_existing_analysis(self): - sourceID = "MGYA00293719" - broker = 'MAR' - url = 
settings.ME_API['dev'] + f'brokers/{broker}/datasets' - token = settings.ME_TOKEN - assert not self.test_ME.check_analysis(url, sourceID, True, token) + me_api = MetagenomicsExchangeAPI(broker="MAR") + source_id = "MGYA00293719" + assert not me_api.check_analysis(source_id, True) def test_post_existing_analysis(self): - sourceID = "MGYA00293719" - broker = 'EMG' - url = settings.ME_API['dev'] - token = settings.ME_TOKEN - assert not self.test_ME.add_record(url=url, mgya=sourceID, run_accession="ERR3063408", public=True, - broker=broker, token=token) + me_api = MetagenomicsExchangeAPI() + source_id = "MGYA00293719" + # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 + with pytest.raises(HTTPError, match="409 Client Error"): + me_api.add_record(mgya=source_id, run_accession="ERR3063408", public=True) From e7d9b171eb19489b8c1b62c8500c03de3dbc7b26 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Fri, 20 Oct 2023 16:48:26 +0100 Subject: [PATCH 11/46] Modified ME dj function to add new ME records, fixed tests for ME and api --- emgapi/management/commands/mgx_api.py | 19 ++++++++-------- emgapi/metagenomics_exchange.py | 8 +++---- emgapi/models.py | 11 ++++----- tests/me/test_metagenomics_exchange.py | 24 +++++++++++++++++--- tests/me/{test_api.py => test_mgx_api.py} | 27 +++++++++++++++-------- 5 files changed, 59 insertions(+), 30 deletions(-) rename tests/me/{test_api.py => test_mgx_api.py} (80%) diff --git a/emgapi/management/commands/mgx_api.py b/emgapi/management/commands/mgx_api.py index 7bbb1ce9d..984f8fe6e 100644 --- a/emgapi/management/commands/mgx_api.py +++ b/emgapi/management/commands/mgx_api.py @@ -17,10 +17,9 @@ import logging from django.core.management import BaseCommand -from django.conf import settings from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker -from metagenomics_exchange import MetagenomicsExchangeAPI +from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI logger = 
logging.getLogger(__name__) @@ -101,22 +100,24 @@ def handle(self, *args, **options): self.assembly_accession = options.get("assembly") self.run_accession = options.get("run") - # FIXME: this command needs adjustments broker = options.get("broker") - url = settings.ME_API['dev'] - check_url = url + f'brokers/{self.broker}/datasets' - ME = MetagenomicsExchangeAPI() + broker_obj = ME_Broker.objects.get_or_create(broker) + ME = MetagenomicsExchangeAPI(broker=broker) analyses = self._filtering_analyses() for analysis in analyses: MGYA = analysis.accession public = not analysis.is_private # check is MGYA in ME - if ME.check_analysis(url=check_url, sourceID=MGYA, public=public, token=settings.ME_TOKEN): + if ME.check_analysis(source_id=MGYA, public=public): logging.info(f"{MGYA} exists in ME") else: logging.info(f"{MGYA} does not exist in ME") - ME.add_record(url=url, mgya=MGYA, run_accession=analysis.run.accesison, public=public, broker=broker, - token=settings.ME_TOKEN) + response = ME.add_record(mgya=MGYA, run_accession=analysis.run, public=public) + if response.status_code == 201: + logging.info(f"Populating MetagenomicsExchange table with {MGYA}") + MetagenomicsExchange.objects.get_or_create(analysis, broker=broker_obj) + else: + logging.error(f"{MGYA} response exit code: {response.status_code}. 
No population to DB.") diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 166fd617e..2b2d44309 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -53,7 +53,7 @@ def post_request(self, endpoint: str, data: dict): def add_record(self, mgya: str, run_accession: str, public: bool): data = { "confidence": "full", - "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/mgya", + "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", "method": ["other_metadata"], "sourceID": mgya, "sequenceID": run_accession, @@ -61,7 +61,7 @@ def add_record(self, mgya: str, run_accession: str, public: bool): "brokerID": self.broker, } response = self.post_request(endpoint="datasets", data=data) - return response.json() + return response def check_analysis(self, source_id: str, public: bool) -> bool: logging.info(f"Check {source_id}") @@ -75,7 +75,7 @@ def check_analysis(self, source_id: str, public: bool) -> bool: datasets = data.get("datasets") for item in datasets: if item.get("sourceID") == source_id: + logging.info(f"{source_id} exists") return True - logging.info(f"{source_id} exists") - + logging.info(f"{source_id} does not exist") return False diff --git a/emgapi/models.py b/emgapi/models.py index f30a72993..0c573a0b0 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -1813,18 +1813,16 @@ class ME_Broker(models.Model): class MetagenomicsExchangeManager(models.Manager): - def get_or_create(self, analysis, broker=None): + def get_or_create(self, analysis, broker): try: me_record = MetagenomicsExchange.objects.get( - analysis=analysis + analysis=analysis, broker=broker ) except ObjectDoesNotExist: logging.warning(f"{analysis.accession} not in ME db, creating.") me_record = MetagenomicsExchange( - analysis=analysis, + analysis=analysis, broker=broker ) - if broker: - me_record.broker = broker me_record.save() logging.info(f"Created record ID: {me_record.id}") return me_record @@ -1842,6 +1840,9 @@ 
class MetagenomicsExchange(models.Model): objects = MetagenomicsExchangeManager() objects_admin = models.Manager() + def __str__(self): + return f"{self.analysis.job_id} under {self.broker}" + class StudyErrorType(models.Model): error_id = models.IntegerField( diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index e59ea4a33..f387618df 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -6,7 +6,9 @@ from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI -from requests import HTTPError +import requests +import responses +import settings class TestME: @@ -24,5 +26,21 @@ def test_post_existing_analysis(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA00293719" # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 - with pytest.raises(HTTPError, match="409 Client Error"): - me_api.add_record(mgya=source_id, run_accession="ERR3063408", public=True) + with pytest.raises(requests.HTTPError, match="409 Client Error"): + me_api.add_record(mgya=source_id, run_accession="ERR3063408", public=True).json() + + @responses.activate + def test_mock_post_new_analysis(self): + me_api = MetagenomicsExchangeAPI() + endpoint = "datasets" + url = settings.ME_API + f"/{endpoint}" + + responses.add(responses.POST, url, json={'success': True}, status=201) + + response = me_api.add_record(mgya="MGYA00593709", run_accession="SRR3960575", public=True) + + assert response.status_code == 201 + assert response.json() == {'success': True} + + + diff --git a/tests/me/test_api.py b/tests/me/test_mgx_api.py similarity index 80% rename from tests/me/test_api.py rename to tests/me/test_mgx_api.py index ee09abdba..a219747a7 100644 --- a/tests/me/test_api.py +++ b/tests/me/test_mgx_api.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" + import pytest from unittest.mock import patch @@ -25,18 +25,21 @@ @pytest.mark.django_db class TestMeAPI: - @patch("emgapi.management.commands.mgx_api.Command.post_request") + @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.post_request") + @pytest.mark.usefixtures( + "me_broker", + "metagenomics_exchange", + ) def test_new_analysis_population( self, - mock_post_request, - run_multiple_analysis + mock_post_request ): class MockResponse: def __init__(self, ok, status_code): self.ok = ok self.status_code = status_code - mock_post_request.return_value = MockResponse(True, 200) + mock_post_request.return_value = MockResponse(True, 201) test_run = "ABC01234" test_pipeline_version = 5.0 test_broker = 'EMG' @@ -49,9 +52,14 @@ def __init__(self, ok, status_code): ) analysis = AnalysisJob.objects.filter(run__accession=test_run).filter(pipeline__pipeline_id=test_pipeline_version).first() assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 - assert MetagenomicsExchange.objects.filter(analysis=analysis).count() == 1 + broker_id = ME_Broker.objects.filter(brokerID=test_broker).first().id + assert MetagenomicsExchange.objects.filter(analysis=analysis).filter(broker=broker_id).count() == 1 - @patch("emgapi.management.commands.mgx_api.Command.get_request") + @pytest.mark.usefixtures( + "me_broker", + "metagenomics_exchange", + ) + @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.get_request") def test_check_existing_analysis( self, mock_get_request, @@ -67,6 +75,7 @@ def __init__(self, ok, status_code, json_data): def json(self): return self.json_data + test_run = "ABC01234" test_pipeline_version = 1.0 test_broker = 'MAR' @@ -83,5 +92,5 @@ def json(self): ) assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 - assert MetagenomicsExchange.objects.filter(analysis=analysis).count() == 1 - """ \ No newline at end of file + 
broker_id = ME_Broker.objects.filter(brokerID=test_broker).first().id + assert MetagenomicsExchange.objects.filter(analysis=analysis, broker=broker_id).count() == 1 From 2e0b019d46eead5c0ee6a485b5a3b883614ed357 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 23 Oct 2023 11:06:08 +0100 Subject: [PATCH 12/46] Move test for mock API to test_mgx_api.py --- tests/me/test_metagenomics_exchange.py | 20 -------------------- tests/me/test_mgx_api.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index f387618df..320de1f5d 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -2,14 +2,10 @@ # -*- coding: utf-8 -*- import pytest -from unittest import mock from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI import requests -import responses -import settings - class TestME: def test_check_existing_analysis(self): @@ -28,19 +24,3 @@ def test_post_existing_analysis(self): # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 with pytest.raises(requests.HTTPError, match="409 Client Error"): me_api.add_record(mgya=source_id, run_accession="ERR3063408", public=True).json() - - @responses.activate - def test_mock_post_new_analysis(self): - me_api = MetagenomicsExchangeAPI() - endpoint = "datasets" - url = settings.ME_API + f"/{endpoint}" - - responses.add(responses.POST, url, json={'success': True}, status=201) - - response = me_api.add_record(mgya="MGYA00593709", run_accession="SRR3960575", public=True) - - assert response.status_code == 201 - assert response.json() == {'success': True} - - - diff --git a/tests/me/test_mgx_api.py b/tests/me/test_mgx_api.py index a219747a7..9d86973c4 100644 --- a/tests/me/test_mgx_api.py +++ b/tests/me/test_mgx_api.py @@ -15,14 +15,31 @@ # limitations under the License. 
import pytest +import responses from unittest.mock import patch from django.core.management import call_command +import settings from test_utils.emg_fixtures import * # noqa +from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker +class TestMockResponse: + @responses.activate + def test_mock_post_new_analysis(self): + me_api = MetagenomicsExchangeAPI() + endpoint = "datasets" + url = settings.ME_API + f"/{endpoint}" + + responses.add(responses.POST, url, json={'success': True}, status=201) + + response = me_api.add_record(mgya="MGYA00593709", run_accession="SRR3960575", public=True) + + assert response.status_code == 201 + assert response.json() == {'success': True} + @pytest.mark.django_db class TestMeAPI: @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.post_request") From c92c3815448bb728dbb714422e3e259d5eb8f6b2 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 23 Oct 2023 11:21:16 +0100 Subject: [PATCH 13/46] Add settings --- tests/me/test_mgx_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/me/test_mgx_api.py b/tests/me/test_mgx_api.py index 9d86973c4..71fc76047 100644 --- a/tests/me/test_mgx_api.py +++ b/tests/me/test_mgx_api.py @@ -18,8 +18,8 @@ import responses from unittest.mock import patch +from django.conf import settings from django.core.management import call_command -import settings from test_utils.emg_fixtures import * # noqa From bdac39c00abbc49c8568b20eae164ccef93f4293 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:46:44 +0000 Subject: [PATCH 14/46] change models and add patch and delete to ME --- ci/configuration.yaml | 5 ++- config/local-tests.yml | 2 +- emgapi/metagenomics_exchange.py | 52 +++++++++++++++++++++++- emgapi/models.py | 55 +------------------------- tests/me/test_metagenomics_exchange.py | 18 ++++++++- tests/me/test_mgx_api.py | 20 ++-------- 6 files changed, 77 
insertions(+), 75 deletions(-) diff --git a/ci/configuration.yaml b/ci/configuration.yaml index 3877e4539..8d99cd8f6 100644 --- a/ci/configuration.yaml +++ b/ci/configuration.yaml @@ -17,4 +17,7 @@ emg: results_path: 'fixtures/' celery_broker: 'redis://localhost:6379/0' celery_backend: 'redis://localhost:6379/1' - results_production_dir: '/dummy/path/results' \ No newline at end of file + results_production_dir: '/dummy/path/results' + # metagenomics exchange + me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' + me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014 ' \ No newline at end of file diff --git a/config/local-tests.yml b/config/local-tests.yml index fa833d284..4f24fada9 100644 --- a/config/local-tests.yml +++ b/config/local-tests.yml @@ -17,4 +17,4 @@ emg: results_production_dir: '/dummy/path/results' # metagenomics exchange me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' - me_api_token: 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' \ No newline at end of file + me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014 ' \ No newline at end of file diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 2b2d44309..fe3d5c0cc 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -50,7 +50,30 @@ def post_request(self, endpoint: str, data: dict): response.raise_for_status() return response - def add_record(self, mgya: str, run_accession: str, public: bool): + def delete_request(self, endpoint: str, data: dict): + headers = { + "Accept": "application/json", + "Authorization": self.__token, + } + response = requests.delete( + f"{self.base_url}/{endpoint}", json=data, headers=headers + ) + response.raise_for_status() + return response + + def patch_request(self, endpoint: str, data: dict): + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + "Authorization": self.__token, + } + response = requests.patch( + 
f"{self.base_url}/{endpoint}", json=data, headers=headers + ) + response.raise_for_status() + return response + + def add_analysis(self, mgya: str, run_accession: str, public: bool): data = { "confidence": "full", "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", @@ -79,3 +102,30 @@ def check_analysis(self, source_id: str, public: bool) -> bool: return True logging.info(f"{source_id} does not exist") return False + + def delete_analysis(self, registry_id: str): + data = {"registryID": registry_id} + response = self.delete_request(endpoint="datasets", data=data) + if response.ok: + logging.info(f"{registry_id} was deleted") + return True + else: + if response.status_code == 400: + logging.error(f"Bad request for {registry_id}") + elif response.status_code == 401: + logging.error(f"{response.message} for {registry_id}") + return False + + def patch_analysis(self, registry_id: str, data: dict): + response = self.patch_request(endpoint=f"datasets/{registry_id}", data=data) + if response.ok: + logging.info(f"{registry_id} was patched") + return True + else: + if response.status_code == 400: + logging.error(f"Bad request for {registry_id}") + elif response.status_code == 401: + logging.error(f"Fail to authenticate for {registry_id}") + elif response.status_code == 409: + logging.error(f"Conflicts with existing data for {registry_id}") + return False diff --git a/emgapi/models.py b/emgapi/models.py index 0c573a0b0..0a5de019b 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -1706,7 +1706,7 @@ class MetagenomicsExchange(models.Model): last_update = models.DateTimeField(db_column='LAST_UPDATE', auto_now=True) -class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EbiSearchIndexedModel): +class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EbiSearchIndexedModel, MetagenomicsExchangeModel): def __init__(self, *args, **kwargs): super(AnalysisJob, self).__init__(*args, **kwargs) setattr(self, 'accession', @@ -1790,59 +1790,6 @@ class Meta: 
def __str__(self): return self.accession -class ME_BrokerManager(models.Manager): - def get_or_create(self, name): - try: - broker = ME_Broker.objects.get( - brokerID=name - ) - except ObjectDoesNotExist: - logging.warning(f"{name} not in ME db, creating.") - broker = ME_Broker( - brokerID=name, - ) - broker.save() - logging.info(f"Created record in ME_broker for {name}") - return broker - -class ME_Broker(models.Model): - brokerID = models.CharField(db_column='BROKER', max_length=10, null=True, blank=True) - - objects = ME_BrokerManager() - objects_admin = models.Manager() - - -class MetagenomicsExchangeManager(models.Manager): - def get_or_create(self, analysis, broker): - try: - me_record = MetagenomicsExchange.objects.get( - analysis=analysis, broker=broker - ) - except ObjectDoesNotExist: - logging.warning(f"{analysis.accession} not in ME db, creating.") - me_record = MetagenomicsExchange( - analysis=analysis, broker=broker - ) - me_record.save() - logging.info(f"Created record ID: {me_record.id}") - return me_record - -class MetagenomicsExchange(models.Model): - """Table to track Metagenomics Exchange population - https://www.ebi.ac.uk/ena/registry/metagenome/api/ - """ - broker = models.ForeignKey(ME_Broker, db_column='BROKER', on_delete=models.CASCADE) - analysis = models.ForeignKey(AnalysisJob, db_column='ANALYSIS_ID', on_delete=models.CASCADE, blank=True, - null=True) - first_created = models.DateTimeField(db_column='FIRST_CREATED', auto_now_add=True) - last_update = models.DateTimeField(db_column='LAST_UPDATE', auto_now=True) - - objects = MetagenomicsExchangeManager() - objects_admin = models.Manager() - - def __str__(self): - return f"{self.analysis.job_id} under {self.broker}" - class StudyErrorType(models.Model): error_id = models.IntegerField( diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index 320de1f5d..ce7e55bdb 100644 --- a/tests/me/test_metagenomics_exchange.py +++ 
b/tests/me/test_metagenomics_exchange.py @@ -3,9 +3,12 @@ import pytest +from django.conf import settings + from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI import requests +import responses class TestME: def test_check_existing_analysis(self): @@ -23,4 +26,17 @@ def test_post_existing_analysis(self): source_id = "MGYA00293719" # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 with pytest.raises(requests.HTTPError, match="409 Client Error"): - me_api.add_record(mgya=source_id, run_accession="ERR3063408", public=True).json() + me_api.add_analysis(mgya=source_id, run_accession="ERR3063408", public=True).json() + + @responses.activate + def test_mock_post_new_analysis(self): + me_api = MetagenomicsExchangeAPI() + endpoint = "datasets" + url = settings.ME_API + f"/{endpoint}" + + responses.add(responses.POST, url, json={'success': True}, status=201) + + response = me_api.add_analysis(mgya="MGYA00593709", run_accession="SRR3960575", public=True) + + assert response.status_code == 201 + assert response.json() == {'success': True} \ No newline at end of file diff --git a/tests/me/test_mgx_api.py b/tests/me/test_mgx_api.py index 71fc76047..e8582c616 100644 --- a/tests/me/test_mgx_api.py +++ b/tests/me/test_mgx_api.py @@ -15,31 +15,16 @@ # limitations under the License. 
import pytest -import responses + from unittest.mock import patch -from django.conf import settings from django.core.management import call_command from test_utils.emg_fixtures import * # noqa from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker - -class TestMockResponse: - @responses.activate - def test_mock_post_new_analysis(self): - me_api = MetagenomicsExchangeAPI() - endpoint = "datasets" - url = settings.ME_API + f"/{endpoint}" - - responses.add(responses.POST, url, json={'success': True}, status=201) - - response = me_api.add_record(mgya="MGYA00593709", run_accession="SRR3960575", public=True) - - assert response.status_code == 201 - assert response.json() == {'success': True} - +""" @pytest.mark.django_db class TestMeAPI: @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.post_request") @@ -111,3 +96,4 @@ def json(self): assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 broker_id = ME_Broker.objects.filter(brokerID=test_broker).first().id assert MetagenomicsExchange.objects.filter(analysis=analysis, broker=broker_id).count() == 1 +""" \ No newline at end of file From 65e64e0210f023dea329d95ac53d4f8550f6d034 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 24 Oct 2023 15:31:57 +0100 Subject: [PATCH 15/46] Add delete and patch commands with tests to ME API --- emgapi/metagenomics_exchange.py | 11 +++--- tests/me/test_metagenomics_exchange.py | 50 +++++++++++++++++++++++--- tests/me/test_mgx_api.py | 4 +-- 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index fe3d5c0cc..1a8d92654 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -50,13 +50,13 @@ def post_request(self, endpoint: str, data: dict): response.raise_for_status() return response - def delete_request(self, endpoint: str, data: dict): + def delete_request(self, 
endpoint: str): headers = { "Accept": "application/json", "Authorization": self.__token, } response = requests.delete( - f"{self.base_url}/{endpoint}", json=data, headers=headers + f"{self.base_url}/{endpoint}", headers=headers ) response.raise_for_status() return response @@ -104,15 +104,16 @@ def check_analysis(self, source_id: str, public: bool) -> bool: return False def delete_analysis(self, registry_id: str): - data = {"registryID": registry_id} - response = self.delete_request(endpoint="datasets", data=data) + response = self.delete_request(endpoint=f"datasets/{registry_id}") if response.ok: - logging.info(f"{registry_id} was deleted") + logging.info(f"{registry_id} was deleted with {response.status_code}") return True else: if response.status_code == 400: logging.error(f"Bad request for {registry_id}") elif response.status_code == 401: + logging.error(f"Failed to authenticate for {registry_id}") + else: logging.error(f"{response.message} for {registry_id}") return False diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index ce7e55bdb..2621b1e90 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -11,17 +11,18 @@ import responses class TestME: - def test_check_existing_analysis(self): + + def test_check_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA00293719" assert me_api.check_analysis(source_id, True) - def test_check_not_existing_analysis(self): + def test_check_not_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI(broker="MAR") source_id = "MGYA00293719" assert not me_api.check_analysis(source_id, True) - def test_post_existing_analysis(self): + def test_post_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA00293719" # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 @@ -39,4 +40,45 @@ def test_mock_post_new_analysis(self): response = 
me_api.add_analysis(mgya="MGYA00593709", run_accession="SRR3960575", public=True) assert response.status_code == 201 - assert response.json() == {'success': True} \ No newline at end of file + assert response.json() == {'success': True} + + @responses.activate + def test_mock_delete_analysis_from_me(self): + me_api = MetagenomicsExchangeAPI() + registry_id = "MGX0000780" + endpoint = f"datasets/{registry_id}" + url = settings.ME_API + f"/{endpoint}" + + responses.add(responses.DELETE, url, json={'success': True}, status=201) + response = me_api.delete_request(endpoint) + + assert response.status_code == 201 + assert response.json() == {'success': True} + + + def test_wrong_delete_request_me(self): + me_api = MetagenomicsExchangeAPI() + registry_id = "MGX0000780" + endpoint = f"dataset/{registry_id}" + + with pytest.raises(requests.HTTPError, match="404 Client Error"): + me_api.delete_request(endpoint) + + def test_patch_analysis_me(self): + me_api = MetagenomicsExchangeAPI() + registry_id = "MGX0000788" + mgya = "MGYA00593709" + run_accession = "SRR3960575" + public = False + + data = { + "confidence": "full", + "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", + "method": ["other_metadata"], + "sourceID": mgya, + "sequenceID": run_accession, + "status": "public" if public else "private", + "brokerID": "EMG", + } + assert me_api.patch_analysis(registry_id, data) + diff --git a/tests/me/test_mgx_api.py b/tests/me/test_mgx_api.py index e8582c616..9a75acea0 100644 --- a/tests/me/test_mgx_api.py +++ b/tests/me/test_mgx_api.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +""" import pytest from unittest.mock import patch @@ -24,7 +24,7 @@ from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker -""" + @pytest.mark.django_db class TestMeAPI: @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.post_request") From 3c664e6b49f127afbb624a25e95c72b693ce805e Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 11:51:24 +0000 Subject: [PATCH 16/46] add population scripts and tests for dry-run --- ci/configuration.yaml | 4 +- config/local-tests.yml | 4 +- emgapi/management/commands/mgx_api.py | 123 -------------- .../commands/populate_existing_me.py | 74 --------- .../populate_metagenomics_exchange.py | 151 ++++++++++++++++++ emgapi/metagenomics_exchange.py | 39 +++-- emgapi/migrations/0012_auto_20231115_1448.py | 37 +++++ emgapi/models.py | 64 ++++++++ emgcli/settings.py | 8 + tests/me/test_metagenomics_exchange.py | 10 +- tests/me/test_mgx_api.py | 99 ------------ .../me/test_populate_metagenomics_exchange.py | 66 ++++++++ tests/test_utils/emg_fixtures.py | 28 +++- 13 files changed, 393 insertions(+), 314 deletions(-) delete mode 100644 emgapi/management/commands/mgx_api.py delete mode 100644 emgapi/management/commands/populate_existing_me.py create mode 100644 emgapi/management/commands/populate_metagenomics_exchange.py create mode 100644 emgapi/migrations/0012_auto_20231115_1448.py delete mode 100644 tests/me/test_mgx_api.py create mode 100644 tests/me/test_populate_metagenomics_exchange.py diff --git a/ci/configuration.yaml b/ci/configuration.yaml index 8d99cd8f6..a05127d10 100644 --- a/ci/configuration.yaml +++ b/ci/configuration.yaml @@ -20,4 +20,6 @@ emg: results_production_dir: '/dummy/path/results' # metagenomics exchange me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' - me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014 ' \ No newline at end of file + me_api_token: 'mgx 
871cd915-2826-46bb-94ed-8e6a3d9b6014' + me_api_dev: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' + me_api_token_dev: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file diff --git a/config/local-tests.yml b/config/local-tests.yml index 4f24fada9..f37eaff5d 100644 --- a/config/local-tests.yml +++ b/config/local-tests.yml @@ -17,4 +17,6 @@ emg: results_production_dir: '/dummy/path/results' # metagenomics exchange me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' - me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014 ' \ No newline at end of file + me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' + me_api_dev: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' + me_api_token_dev: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file diff --git a/emgapi/management/commands/mgx_api.py b/emgapi/management/commands/mgx_api.py deleted file mode 100644 index 984f8fe6e..000000000 --- a/emgapi/management/commands/mgx_api.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2017-2022 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging - -from django.core.management import BaseCommand - -from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker -from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI - -logger = logging.getLogger(__name__) - -RETRY_COUNT = 5 - -class Command(BaseCommand): - help = "Check and populate metagenomics exchange." - def add_arguments(self, parser): - super(Command, self).add_arguments(parser) - parser.add_argument( - "-s", - "--study", - action="store", - required=False, - type=str, - help="Study accession (rather than all)", - ) - parser.add_argument( - "-p", - "--pipeline", - help="Pipeline version (rather than all). Not applicable to Genomes.", - action="store", - dest="pipeline", - choices=[1.0, 2.0, 3.0, 4.0, 4.1, 5.0], - required=False, - type=float, - ) - parser.add_argument( - "-z", - "--assembly", - action="store", - required=False, - type=str, - help="Assembly accession (rather than all)", - nargs="+" - ) - parser.add_argument( - "-r", - "--run", - action="store", - required=False, - type=str, - help="Run accession (rather than all)", - nargs="+" - ) - parser.add_argument( - "--dev", - action="store_true", - required=False, - help="Populate dev API", - ) - parser.add_argument( - "-b", - "--broker", - action="store", - required=False, - type=str, - default='EMG', - help="Broker name", - choices=["EMG", "MAR"], - ) - - def _filtering_analyses(self): - analyses = AnalysisJob.objects.all() - if self.study_accession: - analyses = analyses.filter(study__secondary_accession=self.study_accession) - if self.run_accession: - analyses = analyses.filter(run__accession=self.run_accession) - if self.assembly_accession: - analyses = analyses.filter(assembly__accession=self.assembly_accession) - if self.pipeline_version: - analyses = analyses.filter(pipeline__pipeline_id=self.pipeline_version) - return analyses - - def handle(self, *args, **options): - self.study_accession = options.get("study") - self.pipeline_version = 
options.get("pipeline") - self.assembly_accession = options.get("assembly") - self.run_accession = options.get("run") - - broker = options.get("broker") - broker_obj = ME_Broker.objects.get_or_create(broker) - ME = MetagenomicsExchangeAPI(broker=broker) - analyses = self._filtering_analyses() - - for analysis in analyses: - MGYA = analysis.accession - public = not analysis.is_private - # check is MGYA in ME - if ME.check_analysis(source_id=MGYA, public=public): - logging.info(f"{MGYA} exists in ME") - else: - logging.info(f"{MGYA} does not exist in ME") - response = ME.add_record(mgya=MGYA, run_accession=analysis.run, public=public) - if response.status_code == 201: - logging.info(f"Populating MetagenomicsExchange table with {MGYA}") - MetagenomicsExchange.objects.get_or_create(analysis, broker=broker_obj) - else: - logging.error(f"{MGYA} response exit code: {response.status_code}. No population to DB.") - - diff --git a/emgapi/management/commands/populate_existing_me.py b/emgapi/management/commands/populate_existing_me.py deleted file mode 100644 index 9a031092e..000000000 --- a/emgapi/management/commands/populate_existing_me.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2017-2022 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import os -import requests - -from django.db.models import Count -from django.core.management import BaseCommand -from django.conf import settings - -from emgapi.models import Assembly, AnalysisJob - -logger = logging.getLogger(__name__) -API = { - 'real': 'https://www.ebi.ac.uk/ena/registry/metagenome/api/', - 'dev': 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api/' -} -TOKEN = 'mgx 3D70473ED7C443CA9E97749F62FFCC5D' - -RETRY_COUNT = 5 - -class Command(BaseCommand): - help = "Populate DB with existing metagenomics exchange" - def add_arguments(self, parser): - super(Command, self).add_arguments(parser) - - parser.add_argument( - "--dev", - action="store_true", - required=False, - help="Populate dev API", - ) - parser.add_argument( - "-b", - "--broker", - action="store", - required=False, - type=str, - default='EMG', - help="Broker name", - choices=["EMG", "MAR"], - ) - def get_request(self, url, params): - if self.auth: - response = requests.get(url, params=params, headers=self.auth) - else: - logging.warning( - "Not authenticated to get private data." - # noqa: E501 - ) - response = requests.get(url, params=params) - return response - - def handle(self, *args, **options): - self.auth = None - self.url = API['dev'] - self.broker = options.get("broker") - self.check_url = self.url + f'brokers/{self.broker}/datasets' - - diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py new file mode 100644 index 000000000..2c4dff5d8 --- /dev/null +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2017-2022 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+        # TODO: decide whether this --full re-check mode is actually needed, or remove the option
+ parser.add_argument( + "--full", + action="store_true", + help="Do a full check of DB", + ) + + def handle(self, *args, **options): + self.study_accession = options.get("study") + self.dry_run = options.get("dry_run") + self.pipeline_version = options.get("pipeline") + if options.get("dev"): + base_url = settings.ME_API_DEV + else: + base_url = settings.ME_API + ME = MetagenomicsExchangeAPI(base_url=base_url) + + new_analyses = AnalysisJob.objects_for_population.to_add() + removals = AnalysisJob.objects_for_population.to_delete() + if self.study_accession: + new_analyses = new_analyses.filter(study__secondary_accession__in=self.study_accession) + removals = removals.filter(study__secondary_accession__in=self.study_accession) + if self.pipeline_version: + new_analyses = new_analyses.filter(pipeline__pipeline_id=self.pipeline_version) + removals = removals.filter(pipeline__pipeline_id=self.pipeline_version) + logging.info(f"Processing {len(new_analyses)} new analyses") + for ajob in new_analyses: + metadata = { + "confidence": "full", + "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{ajob.accession}", + "method": ["other_metadata"], + "sourceID": ajob.accession, + "sequenceID": ajob.run, + "status": "public" if not ajob.is_private else "private", + "brokerID": settings.MGNIFY_BROKER, + } + registryID, metadata_match = ME.check_analysis(source_id=ajob.accession, metadata=metadata) + if not registryID: + logging.debug(f"Add new {ajob}") + if not self.dry_run: + response = ME.add_analysis(mgya=ajob.accession, run_accession=ajob.run, public=not ajob.is_private) + if response.ok: + logging.debug(f"Added {ajob}") + else: + logging.debug(f"Error adding {ajob}: {response.message}") + else: + logging.info(f"Dry-mode run: no addition to real ME for {ajob}") + else: + if not metadata_match: + logging.debug(f"Patch existing {ajob}") + data = { + "confidence": "full", + "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{ajob.accession}", + "method": 
["other_metadata"], + "sourceID": ajob.accession, + "sequenceID": ajob.run, + "status": "public" if not ajob.is_private else "private", + "brokerID": settings.MGNIFY_BROKER, + } + if not self.dry_run: + if ME.patch_analysis(registry_id=registryID, data=data): + logging.info(f"Analysis {ajob} updated successfully") + else: + logging.info(f"Analysis {ajob} update failed") + else: + logging.info(f"Dry-mode run: no patch to real ME for {ajob}") + else: + logging.debug(f"No edit for {ajob}, metadata is correct") + + logging.info(f"Processing {len(removals)} analyses to remove") + for ajob in removals: + registryID, _ = ME.check_analysis(source_id=ajob.accession) + if registryID: + if not self.dry_run: + if ME.delete_analysis(registryID): + logging.info(f"{ajob} successfully deleted") + else: + logging.info(f"{ajob} failed on delete") + else: + logging.info(f"Dry-mode run: no delete from real ME for {ajob}") + else: + logging.info(f"No {ajob} in ME, nothing to delete") + logging.info("Done") + + + + + diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 1a8d92654..b0245b83e 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -24,10 +24,10 @@ class MetagenomicsExchangeAPI: """Metagenomics Exchange API Client""" - def __init__(self, broker="EMG"): - self.base_url = settings.ME_API + def __init__(self, base_url=None): + self.base_url = base_url if base_url else settings.ME_API self.__token = settings.ME_API_TOKEN - self.broker = broker + self.broker = settings.MGNIFY_BROKER def get_request(self, endpoint: str, params: dict): """Make a GET request, returns the response""" @@ -86,22 +86,39 @@ def add_analysis(self, mgya: str, run_accession: str, public: bool): response = self.post_request(endpoint="datasets", data=data) return response - def check_analysis(self, source_id: str, public: bool) -> bool: + + def check_analysis(self, source_id: str, public=None, metadata=None) -> [str, bool]: logging.info(f"Check 
{source_id}") - params = { - "status": "public" if public else "private", - } + params = {} + if public: + params = { + "status": "public" if public else "private", + } endpoint = f"brokers/{self.broker}/datasets" response = self.get_request(endpoint=endpoint, params=params) + analysis_registryID = "" + metadata_match = True if response.ok: data = response.json() datasets = data.get("datasets") for item in datasets: if item.get("sourceID") == source_id: - logging.info(f"{source_id} exists") - return True - logging.info(f"{source_id} does not exist") - return False + logging.info(f"{source_id} exists in ME") + analysis_registryID = item.get("registryID") + if metadata: + for metadata_record in metadata: + if not(metadata_record in item): + metadata_match = False + return analysis_registryID, metadata_match + else: + if metadata[metadata_record] != item[metadata_record]: + metadata_match = False + logging.info(f"Incorrect field {metadata[metadata_record]} in ME ({item[metadata_record]})") + return analysis_registryID, metadata_match + return analysis_registryID, metadata_match + else: + logging.info(f"{source_id} does not exist in ME") + return analysis_registryID, metadata_match def delete_analysis(self, registry_id: str): response = self.delete_request(endpoint=f"datasets/{registry_id}") diff --git a/emgapi/migrations/0012_auto_20231115_1448.py b/emgapi/migrations/0012_auto_20231115_1448.py new file mode 100644 index 000000000..cb27bcb04 --- /dev/null +++ b/emgapi/migrations/0012_auto_20231115_1448.py @@ -0,0 +1,37 @@ +# Generated by Django 3.2.18 on 2023-11-15 14:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0011_auto_20230912_1346'), + ] + + operations = [ + migrations.RemoveField( + model_name='metagenomicsexchange', + name='analysis_id', + ), + migrations.RemoveField( + model_name='metagenomicsexchange', + name='broker_id', + ), + migrations.AddField( + model_name='analysisjob', + 
+            field=models.DateTimeField(blank=True, db_column='LAST_POPULATED_ME', help_text='Date at which this model last appeared in the Metagenomics Exchange', null=True),
+        help_text="Date at which this model last appeared in the Metagenomics Exchange"
MetagenomicsExchangeAPI(broker="MAR") - source_id = "MGYA00293719" - assert not me_api.check_analysis(source_id, True) + me_api = MetagenomicsExchangeAPI() + source_id = "MGYA10293719" + return_values = me_api.check_analysis(source_id, True) + assert not return_values[0] def test_post_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() diff --git a/tests/me/test_mgx_api.py b/tests/me/test_mgx_api.py deleted file mode 100644 index 9a75acea0..000000000 --- a/tests/me/test_mgx_api.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2020 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -import pytest - -from unittest.mock import patch - -from django.core.management import call_command - -from test_utils.emg_fixtures import * # noqa - -from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI -from emgapi.models import AnalysisJob, MetagenomicsExchange, ME_Broker - -@pytest.mark.django_db -class TestMeAPI: - @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.post_request") - @pytest.mark.usefixtures( - "me_broker", - "metagenomics_exchange", - ) - def test_new_analysis_population( - self, - mock_post_request - ): - class MockResponse: - def __init__(self, ok, status_code): - self.ok = ok - self.status_code = status_code - - mock_post_request.return_value = MockResponse(True, 201) - test_run = "ABC01234" - test_pipeline_version = 5.0 - test_broker = 'EMG' - - call_command( - "mgx_api", - run=test_run, - pipeline=test_pipeline_version, - broker=test_broker - ) - analysis = AnalysisJob.objects.filter(run__accession=test_run).filter(pipeline__pipeline_id=test_pipeline_version).first() - assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 - broker_id = ME_Broker.objects.filter(brokerID=test_broker).first().id - assert MetagenomicsExchange.objects.filter(analysis=analysis).filter(broker=broker_id).count() == 1 - - @pytest.mark.usefixtures( - "me_broker", - "metagenomics_exchange", - ) - @patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.get_request") - def test_check_existing_analysis( - self, - mock_get_request, - run_multiple_analysis, - me_broker, - metagenomics_exchange, - ): - class MockResponse: - def __init__(self, ok, status_code, json_data): - self.ok = ok - self.status_code = status_code - self.json_data = json_data - - def json(self): - return self.json_data - - test_run = "ABC01234" - test_pipeline_version = 1.0 - test_broker = 'MAR' - analysis = AnalysisJob.objects.filter(run__accession=test_run).filter( - pipeline__pipeline_id=test_pipeline_version).first() - json_data = {'datasets': 
[{'sourceID': analysis.accession}] } - mock_get_request.return_value = MockResponse(True, 200, json_data) - - call_command( - "mgx_api", - run=test_run, - pipeline=test_pipeline_version, - broker=test_broker - ) - - assert ME_Broker.objects.filter(brokerID=test_broker).count() == 1 - broker_id = ME_Broker.objects.filter(brokerID=test_broker).first().id - assert MetagenomicsExchange.objects.filter(analysis=analysis, broker=broker_id).count() == 1 -""" \ No newline at end of file diff --git a/tests/me/test_populate_metagenomics_exchange.py b/tests/me/test_populate_metagenomics_exchange.py new file mode 100644 index 000000000..9093051ce --- /dev/null +++ b/tests/me/test_populate_metagenomics_exchange.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2020 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from unittest.mock import patch + +from django.core.management import call_command + +from test_utils.emg_fixtures import * # noqa + + +@pytest.mark.django_db +class TestMeAPI: + @pytest.mark.usefixtures('run_multiple_analysis') + def test_population_dry_mode( + self, + caplog + ): + call_command( + "populate_metagenomics_exchange", + dev=True, + dry_run=True, + ) + assert "Dry-mode run: no addition to real ME for MGYA00001234" in caplog.text + assert "Dry-mode run: no addition to real ME for MGYA00005678" in caplog.text + assert "Dry-mode run: no addition to real ME for MGYA00466090" in caplog.text + assert "Processing 0 analyses to remove" in caplog.text + + @pytest.mark.usefixtures('suppressed_analysis_jobs') + def test_removals_dry_mode( + self, + caplog + ): + call_command( + "populate_metagenomics_exchange", + dev=True, + dry_run=True, + ) + + assert "No MGYA00000002 in ME, nothing to delete" in caplog.text + assert "Processing 0 new analyses" in caplog.text + + @pytest.mark.usefixtures('analysis_existed_in_me') + def test_update_dry_mode(self, caplog): + call_command( + "populate_metagenomics_exchange", + dev=True, + dry_run=True, + ) + assert "Incorrect field None in ME (ERR1806500)" in caplog.text + assert "Dry-mode run: no patch to real ME for MGYA00147343" in caplog.text + assert "Processing 0 analyses to remove" in caplog.text diff --git a/tests/test_utils/emg_fixtures.py b/tests/test_utils/emg_fixtures.py index f8275c9a8..9b64eba56 100644 --- a/tests/test_utils/emg_fixtures.py +++ b/tests/test_utils/emg_fixtures.py @@ -36,8 +36,9 @@ 'ena_private_studies', 'ena_suppressed_studies', 'ena_public_runs', 'ena_private_runs', 'ena_suppressed_runs', 'ena_public_samples', 'ena_private_samples', 'ena_suppressed_samples', 'ena_public_assemblies', 'ena_private_assemblies', 'ena_suppressed_assemblies', - 'assembly_extra_annotation', 'ena_suppression_propagation_studies', 'ena_suppression_propagation_runs', 'ena_suppression_propagation_samples', 
'ena_suppression_propagation_assemblies', + 'assembly_extra_annotation', 'ena_suppression_propagation_studies', 'ena_suppression_propagation_runs', + 'suppressed_analysis_jobs', 'analysis_existed_in_me', ] @@ -1093,3 +1094,28 @@ def ena_suppression_propagation_assemblies(experiment_type_assembly, study): study=study, ) return assemblies + + +def make_suppressed_analysis_jobs(quantity, emg_props=None): + emg_props = emg_props or {} + analyses = baker.make(emg_models.AnalysisJob, _quantity=quantity, **emg_props) + return analyses + + +@pytest.fixture +def suppressed_analysis_jobs(ena_suppressed_runs): + suppressed_analysisjobs = make_suppressed_analysis_jobs(quantity=5, + emg_props={"is_suppressed": True, + 'last_populated_me': '1970-01-01 00:00:00'}) + return suppressed_analysisjobs + +@pytest.fixture +def analysis_existed_in_me(): + emg_props = { + "job_id": 147343, + "last_populated_me": '1970-01-01 00:00:00', + "last_updated_me": '1980-01-01 00:00:00', + "is_suppressed": False, + "is_private": False + } + return baker.make(emg_models.AnalysisJob, **emg_props) From f0824157f5d19ce199c5ce65994db43b6bd71beb Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Fri, 17 Nov 2023 16:49:08 +0000 Subject: [PATCH 17/46] add population scripts and tests for dry-run --- .../populate_metagenomics_exchange.py | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 2c4dff5d8..bb62f42b4 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -68,6 +68,17 @@ def add_arguments(self, parser): help="Do a full check of DB", ) + def generate_metadata(self, mgya, run_accession, status): + return { + "confidence": "full", + "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", + "method": ["other_metadata"], + "sourceID": 
mgya, + "sequenceID": run_accession, + "status": status, + "brokerID": settings.MGNIFY_BROKER, + } + def handle(self, *args, **options): self.study_accession = options.get("study") self.dry_run = options.get("dry_run") @@ -88,15 +99,8 @@ def handle(self, *args, **options): removals = removals.filter(pipeline__pipeline_id=self.pipeline_version) logging.info(f"Processing {len(new_analyses)} new analyses") for ajob in new_analyses: - metadata = { - "confidence": "full", - "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{ajob.accession}", - "method": ["other_metadata"], - "sourceID": ajob.accession, - "sequenceID": ajob.run, - "status": "public" if not ajob.is_private else "private", - "brokerID": settings.MGNIFY_BROKER, - } + metadata = self.generate_metadata(mgya=ajob.accession, run_accession=ajob.run, + status="public" if not ajob.is_private else "private") registryID, metadata_match = ME.check_analysis(source_id=ajob.accession, metadata=metadata) if not registryID: logging.debug(f"Add new {ajob}") @@ -111,17 +115,8 @@ def handle(self, *args, **options): else: if not metadata_match: logging.debug(f"Patch existing {ajob}") - data = { - "confidence": "full", - "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{ajob.accession}", - "method": ["other_metadata"], - "sourceID": ajob.accession, - "sequenceID": ajob.run, - "status": "public" if not ajob.is_private else "private", - "brokerID": settings.MGNIFY_BROKER, - } if not self.dry_run: - if ME.patch_analysis(registry_id=registryID, data=data): + if ME.patch_analysis(registry_id=registryID, data=metadata): logging.info(f"Analysis {ajob} updated successfully") else: logging.info(f"Analysis {ajob} update failed") @@ -132,7 +127,9 @@ def handle(self, *args, **options): logging.info(f"Processing {len(removals)} analyses to remove") for ajob in removals: - registryID, _ = ME.check_analysis(source_id=ajob.accession) + metadata = self.generate_metadata(mgya=ajob.accession, run_accession=ajob.run, + 
status="public" if not ajob.is_private else "private") + registryID, _ = ME.check_analysis(source_id=ajob.accession, metadata=metadata) if registryID: if not self.dry_run: if ME.delete_analysis(registryID): From bb1ce81f39af97ff76813888f1dba7770135d8c2 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 20 Nov 2023 10:39:12 +0000 Subject: [PATCH 18/46] change api adress --- ci/configuration.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/configuration.yaml b/ci/configuration.yaml index a05127d10..79a212638 100644 --- a/ci/configuration.yaml +++ b/ci/configuration.yaml @@ -19,7 +19,5 @@ emg: celery_backend: 'redis://localhost:6379/1' results_production_dir: '/dummy/path/results' # metagenomics exchange - me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' - me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' - me_api_dev: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' - me_api_token_dev: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file + me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api/' + me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file From 0f309c4e0d9e4e96597756585ac54911eac146dd Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 20 Nov 2023 11:05:43 +0000 Subject: [PATCH 19/46] change api adress --- ci/configuration.yaml | 2 +- config/local-tests.yml | 4 ++-- tests/test_utils/emg_fixtures.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/configuration.yaml b/ci/configuration.yaml index 79a212638..33f780ee9 100644 --- a/ci/configuration.yaml +++ b/ci/configuration.yaml @@ -19,5 +19,5 @@ emg: celery_backend: 'redis://localhost:6379/1' results_production_dir: '/dummy/path/results' # metagenomics exchange - me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api/' + me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' me_api_token: 'mgx 
871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file diff --git a/config/local-tests.yml b/config/local-tests.yml index f37eaff5d..b3f7b589b 100644 --- a/config/local-tests.yml +++ b/config/local-tests.yml @@ -16,7 +16,7 @@ emg: celery_backend: 'redis://localhost:6379/1' results_production_dir: '/dummy/path/results' # metagenomics exchange - me_api: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' + me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' - me_api_dev: 'http://wp-np2-5c.ebi.ac.uk:8080/ena/registry/metagenome/api' + me_api_dev: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' me_api_token_dev: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file diff --git a/tests/test_utils/emg_fixtures.py b/tests/test_utils/emg_fixtures.py index 9b64eba56..d4afc96a8 100644 --- a/tests/test_utils/emg_fixtures.py +++ b/tests/test_utils/emg_fixtures.py @@ -1106,6 +1106,7 @@ def make_suppressed_analysis_jobs(quantity, emg_props=None): def suppressed_analysis_jobs(ena_suppressed_runs): suppressed_analysisjobs = make_suppressed_analysis_jobs(quantity=5, emg_props={"is_suppressed": True, + "suppressed_at": '1980-01-01 00:00:00', 'last_populated_me': '1970-01-01 00:00:00'}) return suppressed_analysisjobs From 83b6b3afbf2c866910a5c9e75429e9e3674414f6 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 21 Nov 2023 13:00:29 +0000 Subject: [PATCH 20/46] WIP --- .../populate_metagenomics_exchange.py | 5 +-- emgapi/metagenomics_exchange.py | 35 ++++++++++--------- tests/me/test_metagenomics_exchange.py | 10 +++--- .../me/test_populate_metagenomics_exchange.py | 17 +++++++-- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index bb62f42b4..14f340a64 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py 
+++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -101,7 +101,8 @@ def handle(self, *args, **options): for ajob in new_analyses: metadata = self.generate_metadata(mgya=ajob.accession, run_accession=ajob.run, status="public" if not ajob.is_private else "private") - registryID, metadata_match = ME.check_analysis(source_id=ajob.accession, metadata=metadata) + registryID, metadata_match = ME.check_analysis(source_id=ajob.accession, sequence_id=ajob.run, + metadata=metadata) if not registryID: logging.debug(f"Add new {ajob}") if not self.dry_run: @@ -129,7 +130,7 @@ def handle(self, *args, **options): for ajob in removals: metadata = self.generate_metadata(mgya=ajob.accession, run_accession=ajob.run, status="public" if not ajob.is_private else "private") - registryID, _ = ME.check_analysis(source_id=ajob.accession, metadata=metadata) + registryID, _ = ME.check_analysis(source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata) if registryID: if not self.dry_run: if ME.delete_analysis(registryID): diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index b0245b83e..92f54fc59 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -87,37 +87,38 @@ def add_analysis(self, mgya: str, run_accession: str, public: bool): return response - def check_analysis(self, source_id: str, public=None, metadata=None) -> [str, bool]: + def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata=None) -> [str, bool]: logging.info(f"Check {source_id}") params = {} if public: params = { "status": "public" if public else "private", } - endpoint = f"brokers/{self.broker}/datasets" + endpoint = f"sequences/{sequence_id}" response = self.get_request(endpoint=endpoint, params=params) analysis_registryID = "" metadata_match = True if response.ok: data = response.json() datasets = data.get("datasets") - for item in datasets: - if item.get("sourceID") == source_id: - logging.info(f"{source_id} 
exists in ME") - analysis_registryID = item.get("registryID") - if metadata: - for metadata_record in metadata: - if not(metadata_record in item): + sourceIDs = [item.get("sourceID") for item in datasets] + if source_id in sourceIDs: + found_record = [item for item in datasets if item.get("sourceID") == source_id][0] + logging.info(f"{source_id} exists in ME") + analysis_registryID = found_record.get("registryID") + if metadata: + for metadata_record in metadata: + if not(metadata_record in found_record): + metadata_match = False + return analysis_registryID, metadata_match + else: + if metadata[metadata_record] != found_record[metadata_record]: metadata_match = False + logging.info(f"Incorrect field {metadata[metadata_record]} in ME ({found_record[metadata_record]})") return analysis_registryID, metadata_match - else: - if metadata[metadata_record] != item[metadata_record]: - metadata_match = False - logging.info(f"Incorrect field {metadata[metadata_record]} in ME ({item[metadata_record]})") - return analysis_registryID, metadata_match - return analysis_registryID, metadata_match - else: - logging.info(f"{source_id} does not exist in ME") + return analysis_registryID, metadata_match + else: + logging.info(f"{source_id} does not exist in ME") return analysis_registryID, metadata_match def delete_analysis(self, registry_id: str): diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index c6bd36a0d..6de1757e8 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -15,20 +15,22 @@ class TestME: def test_check_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA00293719" - return_values = me_api.check_analysis(source_id, True) + seq_id = "ERR3063408" + return_values = me_api.check_analysis(source_id, seq_id, True) assert return_values[0] def test_check_not_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA10293719" - 
return_values = me_api.check_analysis(source_id, True) + seq_id = "ERR3063408" + return_values = me_api.check_analysis(source_id, seq_id, True) assert not return_values[0] def test_post_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA00293719" # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 - with pytest.raises(requests.HTTPError, match="409 Client Error"): + with pytest.raises(requests.HTTPError, match="401 Client Error"): me_api.add_analysis(mgya=source_id, run_accession="ERR3063408", public=True).json() @responses.activate @@ -63,7 +65,7 @@ def test_wrong_delete_request_me(self): registry_id = "MGX0000780" endpoint = f"dataset/{registry_id}" - with pytest.raises(requests.HTTPError, match="404 Client Error"): + with pytest.raises(requests.HTTPError, match="401 Client Error"): me_api.delete_request(endpoint) def test_patch_analysis_me(self): diff --git a/tests/me/test_populate_metagenomics_exchange.py b/tests/me/test_populate_metagenomics_exchange.py index 9093051ce..3841a9757 100644 --- a/tests/me/test_populate_metagenomics_exchange.py +++ b/tests/me/test_populate_metagenomics_exchange.py @@ -22,6 +22,8 @@ from test_utils.emg_fixtures import * # noqa +from emgapi.models import AnalysisJob + @pytest.mark.django_db class TestMeAPI: @@ -50,8 +52,9 @@ def test_removals_dry_mode( dev=True, dry_run=True, ) - - assert "No MGYA00000002 in ME, nothing to delete" in caplog.text + ajobs = AnalysisJob.objects.all() + for job in ajobs: + assert f"No {job.accession} in ME, nothing to delete" in caplog.text assert "Processing 0 new analyses" in caplog.text @pytest.mark.usefixtures('analysis_existed_in_me') @@ -64,3 +67,13 @@ def test_update_dry_mode(self, caplog): assert "Incorrect field None in ME (ERR1806500)" in caplog.text assert "Dry-mode run: no patch to real ME for MGYA00147343" in caplog.text assert "Processing 0 analyses to remove" in caplog.text + + @pytest.mark.usefixtures('run_multiple_analysis') + def 
test_population( + self, + caplog + ): + call_command( + "populate_metagenomics_exchange", + dev=True, + ) From 97e226e5acfcee60cd94f4dfb8d3137d4843d720 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 12:01:19 +0000 Subject: [PATCH 21/46] Minor refactor of the MGX indexation process WIP --- .../populate_metagenomics_exchange.py | 143 ++++++++++-------- emgapi/metagenomics_exchange.py | 4 +- emgapi/models.py | 60 ++++++-- emgcli/settings.py | 17 +-- tests/me/test_metagenomics_exchange.py | 4 +- 5 files changed, 139 insertions(+), 89 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 14f340a64..d1e13f1f8 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -15,10 +15,10 @@ # limitations under the License. import logging -import responses from django.conf import settings from django.core.management import BaseCommand +from django.utils import timezone from emgapi.models import AnalysisJob from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI @@ -27,8 +27,10 @@ RETRY_COUNT = 5 + class Command(BaseCommand): help = "Check and populate metagenomics exchange (ME)." + def add_arguments(self, parser): super(Command, self).add_arguments(parser) parser.add_argument( @@ -37,7 +39,7 @@ def add_arguments(self, parser): required=False, type=str, help="Study accession list (rather than all)", - nargs='+', + nargs="+", ) parser.add_argument( "-p", @@ -61,12 +63,6 @@ def add_arguments(self, parser): required=False, help="Dry mode, no population of ME", ) - # TODO: do I need it? 
- parser.add_argument( - "--full", - action="store_true", - help="Do a full check of DB", - ) def generate_metadata(self, mgya, run_accession, status): return { @@ -76,74 +72,103 @@ def generate_metadata(self, mgya, run_accession, status): "sourceID": mgya, "sequenceID": run_accession, "status": status, - "brokerID": settings.MGNIFY_BROKER, + "brokerID": settings.METAGENOMICS_EXCHANGE_MGNIFY_BROKER, } def handle(self, *args, **options): self.study_accession = options.get("study") self.dry_run = options.get("dry_run") self.pipeline_version = options.get("pipeline") - if options.get("dev"): - base_url = settings.ME_API_DEV - else: - base_url = settings.ME_API - ME = MetagenomicsExchangeAPI(base_url=base_url) - - new_analyses = AnalysisJob.objects_for_population.to_add() - removals = AnalysisJob.objects_for_population.to_delete() + + mgx_api = MetagenomicsExchangeAPI(base_url=settings.METAGENOMICS_EXCHANGE_API) + + analyses_to_index = AnalysisJob.objects_for_mgx_indexing.to_add() + analyses_to_delete = AnalysisJob.objects_for_mgx_indexing.to_delete() + if self.study_accession: - new_analyses = new_analyses.filter(study__secondary_accession__in=self.study_accession) - removals = removals.filter(study__secondary_accession__in=self.study_accession) + analyses_to_index = analyses_to_index.filter( + study__secondary_accession__in=self.study_accession + ) + analyses_to_delete = analyses_to_delete.filter( + study__secondary_accession__in=self.study_accession + ) + if self.pipeline_version: - new_analyses = new_analyses.filter(pipeline__pipeline_id=self.pipeline_version) - removals = removals.filter(pipeline__pipeline_id=self.pipeline_version) - logging.info(f"Processing {len(new_analyses)} new analyses") - for ajob in new_analyses: - metadata = self.generate_metadata(mgya=ajob.accession, run_accession=ajob.run, - status="public" if not ajob.is_private else "private") - registryID, metadata_match = ME.check_analysis(source_id=ajob.accession, sequence_id=ajob.run, - 
metadata=metadata) - if not registryID: + analyses_to_index = analyses_to_index.filter( + pipeline__pipeline_id=self.pipeline_version + ) + analyses_to_delete = analyses_to_delete.filter(pipeline__pipeline_id=self.pipeline_version) + + logging.info(f"Indexig {len(analyses_to_index)} new analyses") + + jobs_to_update = [] + + for ajob in analyses_to_index: + metadata = self.generate_metadata( + mgya=ajob.accession, + run_accession=ajob.run, + status="public" if not ajob.is_private else "private", + ) + registry_id, metadata_match = mgx_api.check_analysis( + source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata + ) + # The job is not registered + if not registry_id: logging.debug(f"Add new {ajob}") - if not self.dry_run: - response = ME.add_analysis(mgya=ajob.accession, run_accession=ajob.run, public=not ajob.is_private) - if response.ok: - logging.debug(f"Added {ajob}") - else: - logging.debug(f"Error adding {ajob}: {response.message}") - else: + if self.dry_run: logging.info(f"Dry-mode run: no addition to real ME for {ajob}") + continue + + response = mgx_api.add_analysis( + mgya=ajob.accession, + run_accession=ajob.run, + public=not ajob.is_private, + ) + if response.ok: + logging.debug(f"Added {ajob}") + ajob.last_mgx_indexed = timezone.now() + jobs_to_update.append(ajob) + else: + logging.error(f"Error adding {ajob}: {response.message}") + + # else we have to check if the metadata matches, if not we need to update it else: if not metadata_match: logging.debug(f"Patch existing {ajob}") - if not self.dry_run: - if ME.patch_analysis(registry_id=registryID, data=metadata): - logging.info(f"Analysis {ajob} updated successfully") - else: - logging.info(f"Analysis {ajob} update failed") - else: + if self.dry_run: logging.info(f"Dry-mode run: no patch to real ME for {ajob}") + continue + if mgx_api.patch_analysis(registry_id=registry_id, data=metadata): + logging.info(f"Analysis {ajob} updated successfully") + ajob.last_mgx_indexed = timezone.now() + 
jobs_to_update.append(ajob) + else: + logging.error(f"Analysis {ajob} update failed") else: logging.debug(f"No edit for {ajob}, metadata is correct") - logging.info(f"Processing {len(removals)} analyses to remove") - for ajob in removals: - metadata = self.generate_metadata(mgya=ajob.accession, run_accession=ajob.run, - status="public" if not ajob.is_private else "private") - registryID, _ = ME.check_analysis(source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata) - if registryID: - if not self.dry_run: - if ME.delete_analysis(registryID): - logging.info(f"{ajob} successfully deleted") - else: - logging.info(f"{ajob} failed on delete") - else: + # BULK UPDATE # + AnalysisJob.objects.bulk_update(jobs_to_update, ["last_mgx_indexed"]) + + logging.info(f"Processing {len(analyses_to_delete)} analyses to remove") + for ajob in analyses_to_delete: + metadata = self.generate_metadata( + mgya=ajob.accession, + run_accession=ajob.run, + status="public" if not ajob.is_private else "private", + ) + registry_id, _ = mgx_api.check_analysis( + source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata + ) + if registry_id: + if self.dry_run: logging.info(f"Dry-mode run: no delete from real ME for {ajob}") - else: - logging.info(f"No {ajob} in ME, nothing to delete") - logging.info("Done") - - - + if mgx_api.delete_analysis(registry_id): + logging.info(f"{ajob} successfully deleted") + else: + logging.info(f"{ajob} failed on delete") + else: + logging.info(f"{ajob} doesn't exist in the registry, nothing to delete") + logging.info("Done") diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 92f54fc59..f876c573c 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -25,8 +25,8 @@ class MetagenomicsExchangeAPI: """Metagenomics Exchange API Client""" def __init__(self, base_url=None): - self.base_url = base_url if base_url else settings.ME_API - self.__token = settings.ME_API_TOKEN + self.base_url = 
base_url if base_url else settings.METAGENOMICS_EXCHANGE_API + self.__token = settings.METAGENOMICS_EXCHANGE_API_TOKEN self.broker = settings.MGNIFY_BROKER def get_request(self, endpoint: str, params: dict): diff --git a/emgapi/models.py b/emgapi/models.py index 6aa442075..13aed51b6 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -279,7 +279,13 @@ class Meta: abstract = True -class EbiSearchIndexQueryset(models.QuerySet): +class IndexableModel(models.Model): + last_update = models.DateTimeField( + db_column='LAST_UPDATE', + auto_now=True + ) + +class IndexableModelQueryset(models.QuerySet): """ to_delete: Objects that have been suppressed since they were last indexed, or that have been indexed but updated since. @@ -288,7 +294,8 @@ class EbiSearchIndexQueryset(models.QuerySet): or that have been indexed but updated since. """ def to_delete(self): - updated_after_indexing = Q(last_update__gte=F("last_indexed"), last_indexed__isnull=False) + not_indexed_filter = {f"{self._index_field__isnull}": False} + updated_after_indexing = Q(last_update__gte=F(self._index_field), **not_indexed_filter) try: self.model._meta.get_field("suppressed_at") @@ -298,11 +305,12 @@ def to_delete(self): ) else: return self.filter( - Q(suppressed_at__gte=F("last_indexed")) | updated_after_indexing + Q(suppressed_at__gte=F(self._index_field)) | updated_after_indexing ) def to_add(self): - updated_after_indexing = Q(last_update__gte=F("last_indexed"), last_indexed__isnull=False) + not_indexed_filter = {f"{self._index_field__isnull}": False} + updated_after_indexing = Q(last_update__gte=F(self._index_field), **not_indexed_filter) never_indexed = Q(last_indexed__isnull=True) try: @@ -322,19 +330,43 @@ def to_add(self): return self.filter(never_indexed | updated_after_indexing, not_suppressed, not_private) -class EbiSearchIndexedModel(models.Model): - last_update = models.DateTimeField( - db_column='LAST_UPDATE', - auto_now=True - ) - last_indexed = models.DateTimeField( - 
db_column='LAST_INDEXED', + +class EBISearchIndexQueryset(IndexableModelQueryset): + + _index_field = "last_ebi_search_indexed" + + +class EBISearchIndexedModel(IndexableModel): + + last_ebi_search_indexed = models.DateTimeField( + db_column='LAST_EBI_SEARCH_INDEXED', null=True, blank=True, help_text="Date at which this model was last included in an EBI Search initial/incremental index." ) - objects_for_indexing = EbiSearchIndexQueryset.as_manager() + objects_for_ebisearch_indexing = EBISearchIndexQueryset.as_manager() + + class Meta: + abstract = True + + +class MetagenomicsExchangeQueryset(IndexableModelQueryset): + + _index_field = "last_mgx_indexed" + + +class MetagenomicsExchangeIndexedModel(IndexableModel): + """Model to track Metagenomics Exchange indexation of analysis jobs + """ + last_mgx_indexed = models.DateTimeField( + db_column='LAST_MGX_INDEXED', + null=True, + blank=True, + help_text="Date at which this model was last indexed in the Metagenomics Exchange" + ) + + objects_for_mgx_indexing = MetagenomicsExchangeQueryset.as_manager() class Meta: abstract = True @@ -1026,7 +1058,7 @@ def mydata(self, request): return self.get_queryset().mydata(request) -class Study(ENASyncableModel, EbiSearchIndexedModel): +class Study(ENASyncableModel, EBISearchIndexedModel): suppressible_descendants = ['samples', 'runs', 'assemblies', 'analyses'] def __init__(self, *args, **kwargs): @@ -1770,7 +1802,7 @@ class MetagenomicsExchange(models.Model): last_update = models.DateTimeField(db_column='LAST_UPDATE', auto_now=True) -class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EbiSearchIndexedModel, MetagenomicsExchangeModel): +class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EBISearchIndexedModel, MetagenomicsExchangeModel): def __init__(self, *args, **kwargs): super(AnalysisJob, self).__init__(*args, **kwargs) setattr(self, 'accession', diff --git a/emgcli/settings.py b/emgcli/settings.py index 57c88a70d..27fc146eb 100644 --- a/emgcli/settings.py +++ 
b/emgcli/settings.py @@ -684,18 +684,11 @@ def create_secret_key(var_dir): os.environ['ENA_API_PASSWORD'] = EMG_CONF['emg']['ena_api_password'] # Metagenomics Exchange -MGNIFY_BROKER = "EMG" +METAGENOMICS_EXCHANGE_MGNIFY_BROKER = "EMG" +METAGENOMICS_EXCHANGE_API = "" +METAGENOMICS_EXCHANGE_API_TOKEN = "" try: - ME_API = EMG_CONF['emg']['me_api'] - ME_API_TOKEN = EMG_CONF['emg']['me_api_token'] + METAGENOMICS_EXCHANGE_API = EMG_CONF['emg']['me_api'] + METAGENOMICS_EXCHANGE_API_TOKEN = EMG_CONF['emg']['me_api_token'] except KeyError: - ME_API = "" - ME_API_TOKEN = "" warnings.warn("The metagenomics exchange API and Token are not configured properly") -try: - ME_API_DEV = EMG_CONF['emg']['me_api_dev'] - ME_API_TOKEN = EMG_CONF['emg']['me_api_token_dev'] -except KeyError: - ME_API_DEV = "" - ME_API_TOKEN = "" - warnings.warn("The metagenomics exchange DEV API and Token are not configured properly") diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index 6de1757e8..58558df01 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -37,7 +37,7 @@ def test_post_existing_analysis_me(self): def test_mock_post_new_analysis(self): me_api = MetagenomicsExchangeAPI() endpoint = "datasets" - url = settings.ME_API + f"/{endpoint}" + url = settings.METAGENOMICS_EXCHANGE_API + f"/{endpoint}" responses.add(responses.POST, url, json={'success': True}, status=201) @@ -51,7 +51,7 @@ def test_mock_delete_analysis_from_me(self): me_api = MetagenomicsExchangeAPI() registry_id = "MGX0000780" endpoint = f"datasets/{registry_id}" - url = settings.ME_API + f"/{endpoint}" + url = settings.METAGENOMICS_EXCHANGE_API + f"/{endpoint}" responses.add(responses.DELETE, url, json={'success': True}, status=201) response = me_api.delete_request(endpoint) From e229f5c2b8a6a6cd04d93a611d5413841fe1d5a6 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 18 Jan 2024 12:34:13 +0000 Subject: [PATCH 22/46] This is 
still a WIP - MGX --- .../populate_metagenomics_exchange.py | 142 +++++++++++------- emgapi/metagenomics_exchange.py | 2 +- emgapi/migrations/0010_auto_20230908_1346.py | 35 ----- emgapi/migrations/0011_auto_20230912_1346.py | 23 --- emgapi/migrations/0012_auto_20231115_1448.py | 37 ----- emgapi/migrations/0013_auto_20231110_1329.py | 33 ---- emgapi/migrations/0013_auto_20240118_1220.py | 38 +++++ emgapi/models.py | 46 +++--- 8 files changed, 141 insertions(+), 215 deletions(-) delete mode 100644 emgapi/migrations/0010_auto_20230908_1346.py delete mode 100644 emgapi/migrations/0011_auto_20230912_1346.py delete mode 100644 emgapi/migrations/0012_auto_20231115_1448.py delete mode 100644 emgapi/migrations/0013_auto_20231110_1329.py create mode 100644 emgapi/migrations/0013_auto_20240118_1220.py diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index d1e13f1f8..d78a54415 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -19,6 +19,8 @@ from django.conf import settings from django.core.management import BaseCommand from django.utils import timezone +from django.core.paginator import Paginator +from datetime import timedelta from emgapi.models import AnalysisJob from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI @@ -79,7 +81,7 @@ def handle(self, *args, **options): self.study_accession = options.get("study") self.dry_run = options.get("dry_run") self.pipeline_version = options.get("pipeline") - + mgx_api = MetagenomicsExchangeAPI(base_url=settings.METAGENOMICS_EXCHANGE_API) analyses_to_index = AnalysisJob.objects_for_mgx_indexing.to_add() @@ -97,78 +99,102 @@ def handle(self, *args, **options): analyses_to_index = analyses_to_index.filter( pipeline__pipeline_id=self.pipeline_version ) - analyses_to_delete = analyses_to_delete.filter(pipeline__pipeline_id=self.pipeline_version) + 
analyses_to_delete = analyses_to_delete.filter( + pipeline__pipeline_id=self.pipeline_version + ) - logging.info(f"Indexig {len(analyses_to_index)} new analyses") + logging.info(f"Indexing {len(analyses_to_index)} new analyses") - jobs_to_update = [] + for page in Paginator(analyses_to_index, 100): + jobs_to_update = [] - for ajob in analyses_to_index: - metadata = self.generate_metadata( - mgya=ajob.accession, - run_accession=ajob.run, - status="public" if not ajob.is_private else "private", - ) - registry_id, metadata_match = mgx_api.check_analysis( - source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata - ) - # The job is not registered - if not registry_id: - logging.debug(f"Add new {ajob}") - if self.dry_run: - logging.info(f"Dry-mode run: no addition to real ME for {ajob}") - continue - - response = mgx_api.add_analysis( + for ajob in page: + metadata = self.generate_metadata( mgya=ajob.accession, run_accession=ajob.run, - public=not ajob.is_private, + status="public" if not ajob.is_private else "private", ) - if response.ok: - logging.debug(f"Added {ajob}") - ajob.last_mgx_indexed = timezone.now() - jobs_to_update.append(ajob) - else: - logging.error(f"Error adding {ajob}: {response.message}") - - # else we have to check if the metadata matches, if not we need to update it - else: - if not metadata_match: - logging.debug(f"Patch existing {ajob}") + registry_id, metadata_match = mgx_api.check_analysis( + source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata + ) + # The job is not registered + if not registry_id: + logging.debug(f"Add new {ajob}") if self.dry_run: - logging.info(f"Dry-mode run: no patch to real ME for {ajob}") + logging.info(f"Dry-mode run: no addition to real ME for {ajob}") continue - if mgx_api.patch_analysis(registry_id=registry_id, data=metadata): - logging.info(f"Analysis {ajob} updated successfully") - ajob.last_mgx_indexed = timezone.now() + + response = mgx_api.add_analysis( + mgya=ajob.accession, + 
run_accession=ajob.run, + public=not ajob.is_private, + ) + if response.ok: + logging.debug(f"Added {ajob}") + ajob.mgx_accession = registry_id + ajob.last_mgx_indexed = timezone.now() + timedelta(minutes=1) jobs_to_update.append(ajob) else: - logging.error(f"Analysis {ajob} update failed") + logging.error(f"Error adding {ajob}: {response.message}") + + # else we have to check if the metadata matches, if not we need to update it else: - logging.debug(f"No edit for {ajob}, metadata is correct") + if not metadata_match: + logging.debug(f"Patch existing {ajob}") + if self.dry_run: + logging.info( + f"Dry-mode run: no patch to real ME for {ajob}" + ) + continue + if mgx_api.patch_analysis( + registry_id=registry_id, data=metadata + ): + logging.info(f"Analysis {ajob} updated successfully") + # Just to be safe, update the MGX accession + ajob.mgx_accession = registry_id + ajob.last_mgx_indexed = timezone.now() + jobs_to_update.append(ajob) + else: + logging.error(f"Analysis {ajob} update failed") + else: + logging.debug(f"No edit for {ajob}, metadata is correct") - # BULK UPDATE # - AnalysisJob.objects.bulk_update(jobs_to_update, ["last_mgx_indexed"]) + AnalysisJob.objects.bulk_update( + jobs_to_update, ["last_mgx_indexed", "mgx_accession"], batch_size=100 + ) logging.info(f"Processing {len(analyses_to_delete)} analyses to remove") - for ajob in analyses_to_delete: - metadata = self.generate_metadata( - mgya=ajob.accession, - run_accession=ajob.run, - status="public" if not ajob.is_private else "private", - ) - registry_id, _ = mgx_api.check_analysis( - source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata - ) - if registry_id: - if self.dry_run: - logging.info(f"Dry-mode run: no delete from real ME for {ajob}") - if mgx_api.delete_analysis(registry_id): - logging.info(f"{ajob} successfully deleted") + for page in Paginator(analyses_to_delete, 100): + jobs_to_update = [] + + for ajob in page: + metadata = self.generate_metadata( + mgya=ajob.accession, + 
run_accession=ajob.run, + status="public" if not ajob.is_private else "private", + ) + registry_id, _ = mgx_api.check_analysis( + source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata + ) + if registry_id: + if self.dry_run: + logging.info(f"Dry-mode run: no delete from real ME for {ajob}") + + if mgx_api.delete_analysis(registry_id): + logging.info(f"{ajob} successfully deleted") + ajob.last_mgx_indexed = timezone.now() + jobs_to_update.append(ajob) + else: + logging.info(f"{ajob} failed on delete") else: - logging.info(f"{ajob} failed on delete") - else: - logging.info(f"{ajob} doesn't exist in the registry, nothing to delete") + logging.info( + f"{ajob} doesn't exist in the registry, nothing to delete" + ) + + # BULK UPDATE # + AnalysisJob.objects.bulk_update( + jobs_to_update, ["last_mgx_indexed"], batch_size=100 + ) logging.info("Done") diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index f876c573c..1ba099cef 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -27,7 +27,7 @@ class MetagenomicsExchangeAPI: def __init__(self, base_url=None): self.base_url = base_url if base_url else settings.METAGENOMICS_EXCHANGE_API self.__token = settings.METAGENOMICS_EXCHANGE_API_TOKEN - self.broker = settings.MGNIFY_BROKER + self.broker = settings.METAGENOMICS_EXCHANGE_MGNIFY_BROKER def get_request(self, endpoint: str, params: dict): """Make a GET request, returns the response""" diff --git a/emgapi/migrations/0010_auto_20230908_1346.py b/emgapi/migrations/0010_auto_20230908_1346.py deleted file mode 100644 index 10677b3ca..000000000 --- a/emgapi/migrations/0010_auto_20230908_1346.py +++ /dev/null @@ -1,35 +0,0 @@ -# Generated by Django 3.2.18 on 2023-09-08 13:46 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0009_genome_annotations_v2_downloads'), - ] - - operations = [ - 
migrations.CreateModel( - name='ME_Broker', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('brokerID', models.CharField(blank=True, db_column='BROKER', max_length=10, null=True)), - ], - ), - migrations.CreateModel( - name='MetagenomicsExchange', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('first_created', models.DateTimeField(auto_now_add=True, db_column='FIRST_CREATED')), - ('last_update', models.DateTimeField(auto_now=True, db_column='LAST_UPDATE')), - ('broker_id', models.ForeignKey(db_column='BROKER', on_delete=django.db.models.deletion.CASCADE, to='emgapi.me_broker')), - ], - ), - migrations.AddField( - model_name='analysisjob', - name='metagenomics_exchange', - field=models.ForeignKey(blank=True, db_column='METAGENOMICS_EXCHANGE', null=True, on_delete=django.db.models.deletion.CASCADE, to='emgapi.metagenomicsexchange'), - ), - ] diff --git a/emgapi/migrations/0011_auto_20230912_1346.py b/emgapi/migrations/0011_auto_20230912_1346.py deleted file mode 100644 index 3477ab5ca..000000000 --- a/emgapi/migrations/0011_auto_20230912_1346.py +++ /dev/null @@ -1,23 +0,0 @@ -# Generated by Django 3.2.18 on 2023-09-12 13:46 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0010_auto_20230908_1346'), - ] - - operations = [ - migrations.RemoveField( - model_name='analysisjob', - name='metagenomics_exchange', - ), - migrations.AddField( - model_name='metagenomicsexchange', - name='analysis_id', - field=models.ForeignKey(blank=True, db_column='ANALYSIS_ID', null=True, on_delete=django.db.models.deletion.CASCADE, to='emgapi.analysisjob'), - ), - ] diff --git a/emgapi/migrations/0012_auto_20231115_1448.py b/emgapi/migrations/0012_auto_20231115_1448.py deleted file mode 100644 index cb27bcb04..000000000 --- 
a/emgapi/migrations/0012_auto_20231115_1448.py +++ /dev/null @@ -1,37 +0,0 @@ -# Generated by Django 3.2.18 on 2023-11-15 14:48 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0011_auto_20230912_1346'), - ] - - operations = [ - migrations.RemoveField( - model_name='metagenomicsexchange', - name='analysis_id', - ), - migrations.RemoveField( - model_name='metagenomicsexchange', - name='broker_id', - ), - migrations.AddField( - model_name='analysisjob', - name='last_populated_me', - field=models.DateTimeField(blank=True, db_column='LAST_POPULATED_ME', help_text='Date at which this model was last appeared in Metagenomics Exchange', null=True), - ), - migrations.AddField( - model_name='analysisjob', - name='last_updated_me', - field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATED_ME'), - ), - migrations.DeleteModel( - name='ME_Broker', - ), - migrations.DeleteModel( - name='MetagenomicsExchange', - ), - ] diff --git a/emgapi/migrations/0013_auto_20231110_1329.py b/emgapi/migrations/0013_auto_20231110_1329.py deleted file mode 100644 index 998fc7202..000000000 --- a/emgapi/migrations/0013_auto_20231110_1329.py +++ /dev/null @@ -1,33 +0,0 @@ -# Generated by Django 3.2.18 on 2023-11-10 13:29 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0012_alter_publication_pub_type'), - ] - - operations = [ - migrations.AddField( - model_name='analysisjob', - name='last_indexed', - field=models.DateTimeField(blank=True, db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), - ), - migrations.AddField( - model_name='analysisjob', - name='last_update', - field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), - ), - migrations.AddField( - model_name='study', - name='last_indexed', - field=models.DateTimeField(blank=True, 
db_column='LAST_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), - ), - migrations.AlterField( - model_name='study', - name='last_update', - field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), - ), - ] diff --git a/emgapi/migrations/0013_auto_20240118_1220.py b/emgapi/migrations/0013_auto_20240118_1220.py new file mode 100644 index 000000000..5f2b21809 --- /dev/null +++ b/emgapi/migrations/0013_auto_20240118_1220.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.18 on 2024-01-18 12:20 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0012_alter_publication_pub_type'), + ] + + operations = [ + migrations.AddField( + model_name='analysisjob', + name='last_ebi_search_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), + migrations.AddField( + model_name='analysisjob', + name='last_mgx_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_MGX_INDEXED', help_text='Date at which this model was last indexed in the Metagenomics Exchange', null=True), + ), + migrations.AddField( + model_name='analysisjob', + name='mgx_accession', + field=models.CharField(blank=True, db_column='MGX_ACCESSION', help_text='The Metagenomics Exchange accession.', max_length=10, null=True, unique=True), + ), + migrations.AddField( + model_name='study', + name='last_ebi_search_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), + migrations.AlterField( + model_name='study', + name='last_update', + field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), + ), + ] diff --git a/emgapi/models.py b/emgapi/models.py index 
13aed51b6..f88e1cb16 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -279,12 +279,6 @@ class Meta: abstract = True -class IndexableModel(models.Model): - last_update = models.DateTimeField( - db_column='LAST_UPDATE', - auto_now=True - ) - class IndexableModelQueryset(models.QuerySet): """ to_delete: Objects that have been suppressed since they were last indexed, @@ -294,8 +288,8 @@ class IndexableModelQueryset(models.QuerySet): or that have been indexed but updated since. """ def to_delete(self): - not_indexed_filter = {f"{self._index_field__isnull}": False} - updated_after_indexing = Q(last_update__gte=F(self._index_field), **not_indexed_filter) + not_indexed_filter = {f"{self.index_field}__isnull": False} + updated_after_indexing = Q(last_update__gte=F(self.index_field), **not_indexed_filter) try: self.model._meta.get_field("suppressed_at") @@ -305,12 +299,12 @@ def to_delete(self): ) else: return self.filter( - Q(suppressed_at__gte=F(self._index_field)) | updated_after_indexing + Q(suppressed_at__gte=F(self.index_field)) | updated_after_indexing ) def to_add(self): - not_indexed_filter = {f"{self._index_field__isnull}": False} - updated_after_indexing = Q(last_update__gte=F(self._index_field), **not_indexed_filter) + not_indexed_filter = {f"{self.index_field}__isnull": False} + updated_after_indexing = Q(last_update__gte=F(self.index_field), **not_indexed_filter) never_indexed = Q(last_indexed__isnull=True) try: @@ -333,10 +327,10 @@ def to_add(self): class EBISearchIndexQueryset(IndexableModelQueryset): - _index_field = "last_ebi_search_indexed" + index_field = "last_ebi_search_indexed" -class EBISearchIndexedModel(IndexableModel): +class EBISearchIndexedModel(models.Model): last_ebi_search_indexed = models.DateTimeField( db_column='LAST_EBI_SEARCH_INDEXED', @@ -353,10 +347,10 @@ class Meta: class MetagenomicsExchangeQueryset(IndexableModelQueryset): - _index_field = "last_mgx_indexed" + index_field = "last_mgx_indexed" -class 
MetagenomicsExchangeIndexedModel(IndexableModel): +class MetagenomicsExchangeIndexedModel(models.Model): """Model to track Metagenomics Exchange indexation of analysis jobs """ last_mgx_indexed = models.DateTimeField( @@ -365,6 +359,14 @@ class MetagenomicsExchangeIndexedModel(IndexableModel): blank=True, help_text="Date at which this model was last indexed in the Metagenomics Exchange" ) + mgx_accession = models.CharField( + db_column='MGX_ACCESSION', + max_length=10, + unique=True, + null=True, + blank=True, + help_text="The Metagenomics Exchange accession." + ) objects_for_mgx_indexing = MetagenomicsExchangeQueryset.as_manager() @@ -1790,19 +1792,7 @@ def get_queryset(self): .select_related( 'analysis_status','experiment_type', 'assembly', 'pipeline', 'run', 'sample', 'study') -class ME_Broker(models.Model): - brokerID = models.CharField(db_column='BROKER', max_length=10, null=True, blank=True) - -class MetagenomicsExchange(models.Model): - """Table to track Metagenomics Exchange population - https://www.ebi.ac.uk/ena/registry/metagenome/api/ - """ - broker_id = models.ForeignKey(ME_Broker, db_column='BROKER', on_delete=models.CASCADE) - first_created = models.DateTimeField(db_column='FIRST_CREATED', auto_now_add=True) - last_update = models.DateTimeField(db_column='LAST_UPDATE', auto_now=True) - - -class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EBISearchIndexedModel, MetagenomicsExchangeModel): +class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EBISearchIndexedModel, MetagenomicsExchangeIndexedModel): def __init__(self, *args, **kwargs): super(AnalysisJob, self).__init__(*args, **kwargs) setattr(self, 'accession', From a6e9eda4d17ba76be0fb11b1db6126055d543aa0 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 18 Jan 2024 14:14:36 +0000 Subject: [PATCH 23/46] Fixed the migrations for EBI Search and MGX. The test tests/me/test_populate_metagenomics_exchange.py::TestMeAPI::test_removals_dry_mode it's working now. 
--- .../populate_metagenomics_exchange.py | 6 --- emgapi/metagenomics_exchange.py | 12 +++--- .../0014_analysisjob_last_update.py | 18 +++++++++ emgapi/models.py | 26 +++++++++---- .../me/test_populate_metagenomics_exchange.py | 38 ++++++++----------- tests/test_utils/emg_fixtures.py | 6 +-- 6 files changed, 60 insertions(+), 46 deletions(-) create mode 100644 emgapi/migrations/0014_analysisjob_last_update.py diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index d78a54415..37f2b2544 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -53,12 +53,6 @@ def add_arguments(self, parser): required=False, type=float, ) - parser.add_argument( - "--dev", - action="store_true", - required=False, - help="Populate dev API", - ) parser.add_argument( "--dry-run", action="store_true", diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 1ba099cef..74cf61c6b 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -96,7 +96,7 @@ def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata } endpoint = f"sequences/{sequence_id}" response = self.get_request(endpoint=endpoint, params=params) - analysis_registryID = "" + analysis_registry_id = None metadata_match = True if response.ok: data = response.json() @@ -105,21 +105,21 @@ def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata if source_id in sourceIDs: found_record = [item for item in datasets if item.get("sourceID") == source_id][0] logging.info(f"{source_id} exists in ME") - analysis_registryID = found_record.get("registryID") + analysis_registry_id = found_record.get("registryID") if metadata: for metadata_record in metadata: if not(metadata_record in found_record): metadata_match = False - return analysis_registryID, metadata_match + return 
analysis_registry_id , metadata_match else: if metadata[metadata_record] != found_record[metadata_record]: metadata_match = False logging.info(f"Incorrect field {metadata[metadata_record]} in ME ({found_record[metadata_record]})") - return analysis_registryID, metadata_match - return analysis_registryID, metadata_match + return analysis_registry_id, metadata_match + return analysis_registry_id , metadata_match else: logging.info(f"{source_id} does not exist in ME") - return analysis_registryID, metadata_match + return analysis_registry_id, metadata_match def delete_analysis(self, registry_id: str): response = self.delete_request(endpoint=f"datasets/{registry_id}") diff --git a/emgapi/migrations/0014_analysisjob_last_update.py b/emgapi/migrations/0014_analysisjob_last_update.py new file mode 100644 index 000000000..dcb791e04 --- /dev/null +++ b/emgapi/migrations/0014_analysisjob_last_update.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.18 on 2024-01-18 13:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0013_auto_20240118_1220'), + ] + + operations = [ + migrations.AddField( + model_name='analysisjob', + name='last_update', + field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), + ), + ] diff --git a/emgapi/models.py b/emgapi/models.py index f88e1cb16..ce547dfba 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -279,6 +279,16 @@ class Meta: abstract = True +class IndexableModel(models.Model): + last_update = models.DateTimeField( + db_column='LAST_UPDATE', + auto_now=True + ) + + class Meta: + abstract = True + + class IndexableModelQueryset(models.QuerySet): """ to_delete: Objects that have been suppressed since they were last indexed, @@ -288,8 +298,7 @@ class IndexableModelQueryset(models.QuerySet): or that have been indexed but updated since. 
""" def to_delete(self): - not_indexed_filter = {f"{self.index_field}__isnull": False} - updated_after_indexing = Q(last_update__gte=F(self.index_field), **not_indexed_filter) + updated_after_indexing = Q(last_update__gte=F(self.index_field), **{f"{self.index_field}__isnull": False}) try: self.model._meta.get_field("suppressed_at") @@ -303,9 +312,8 @@ def to_delete(self): ) def to_add(self): - not_indexed_filter = {f"{self.index_field}__isnull": False} - updated_after_indexing = Q(last_update__gte=F(self.index_field), **not_indexed_filter) - never_indexed = Q(last_indexed__isnull=True) + updated_after_indexing = Q(last_update__gte=F(self.index_field), **{f"{self.index_field}__isnull": False}) + never_indexed = Q(**{f"{self.index_field}__isnull": True}) try: self.model._meta.get_field("is_suppressed") @@ -330,7 +338,7 @@ class EBISearchIndexQueryset(IndexableModelQueryset): index_field = "last_ebi_search_indexed" -class EBISearchIndexedModel(models.Model): +class EBISearchIndexedModel(IndexableModel): last_ebi_search_indexed = models.DateTimeField( db_column='LAST_EBI_SEARCH_INDEXED', @@ -352,6 +360,10 @@ class MetagenomicsExchangeQueryset(IndexableModelQueryset): class MetagenomicsExchangeIndexedModel(models.Model): """Model to track Metagenomics Exchange indexation of analysis jobs + TODO: this model should have the last_update field as it's a requirement. + The current implementation of this works because the analysis jobs are + also extending the EBISearchIndexable model which provided the + last_update field. 
""" last_mgx_indexed = models.DateTimeField( db_column='LAST_MGX_INDEXED', @@ -1094,8 +1106,6 @@ def _custom_pk(self): db_column='AUTHOR_EMAIL', max_length=100, blank=True, null=True) author_name = models.CharField( db_column='AUTHOR_NAME', max_length=100, blank=True, null=True) - last_update = models.DateTimeField( - db_column='LAST_UPDATE', auto_now=True) submission_account_id = models.CharField( db_column='SUBMISSION_ACCOUNT_ID', max_length=15, blank=True, null=True) diff --git a/tests/me/test_populate_metagenomics_exchange.py b/tests/me/test_populate_metagenomics_exchange.py index 3841a9757..cd4e7bc09 100644 --- a/tests/me/test_populate_metagenomics_exchange.py +++ b/tests/me/test_populate_metagenomics_exchange.py @@ -16,7 +16,7 @@ import pytest -from unittest.mock import patch +from unittest import mock from django.core.management import call_command @@ -27,14 +27,10 @@ @pytest.mark.django_db class TestMeAPI: - @pytest.mark.usefixtures('run_multiple_analysis') - def test_population_dry_mode( - self, - caplog - ): + @pytest.mark.usefixtures("run_multiple_analysis") + def test_population_dry_mode(self, caplog): call_command( "populate_metagenomics_exchange", - dev=True, dry_run=True, ) assert "Dry-mode run: no addition to real ME for MGYA00001234" in caplog.text @@ -42,38 +38,34 @@ def test_population_dry_mode( assert "Dry-mode run: no addition to real ME for MGYA00466090" in caplog.text assert "Processing 0 analyses to remove" in caplog.text - @pytest.mark.usefixtures('suppressed_analysis_jobs') - def test_removals_dry_mode( - self, - caplog - ): + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.check_analysis") + @pytest.mark.usefixtures("suppressed_analysis_jobs") + def test_removals_dry_mode(self, mock_check_analysis, caplog): + mock_check_analysis.return_value = None, False call_command( "populate_metagenomics_exchange", - dev=True, dry_run=True, ) ajobs = AnalysisJob.objects.all() for job in ajobs: - assert f"No {job.accession} in ME, 
nothing to delete" in caplog.text - assert "Processing 0 new analyses" in caplog.text + assert ( + f"{job.accession} doesn't exist in the registry, nothing to delete" + in caplog.text + ) + assert "Indexing 0 new analyses" in caplog.text - @pytest.mark.usefixtures('analysis_existed_in_me') + @pytest.mark.usefixtures("analysis_existed_in_me") def test_update_dry_mode(self, caplog): call_command( "populate_metagenomics_exchange", - dev=True, dry_run=True, ) assert "Incorrect field None in ME (ERR1806500)" in caplog.text assert "Dry-mode run: no patch to real ME for MGYA00147343" in caplog.text assert "Processing 0 analyses to remove" in caplog.text - @pytest.mark.usefixtures('run_multiple_analysis') - def test_population( - self, - caplog - ): + @pytest.mark.usefixtures("run_multiple_analysis") + def test_population(self, caplog): call_command( "populate_metagenomics_exchange", - dev=True, ) diff --git a/tests/test_utils/emg_fixtures.py b/tests/test_utils/emg_fixtures.py index d4afc96a8..35ad15a9f 100644 --- a/tests/test_utils/emg_fixtures.py +++ b/tests/test_utils/emg_fixtures.py @@ -1107,15 +1107,15 @@ def suppressed_analysis_jobs(ena_suppressed_runs): suppressed_analysisjobs = make_suppressed_analysis_jobs(quantity=5, emg_props={"is_suppressed": True, "suppressed_at": '1980-01-01 00:00:00', - 'last_populated_me': '1970-01-01 00:00:00'}) + 'last_mgx_indexed': '1970-01-01 00:00:00'}) return suppressed_analysisjobs @pytest.fixture def analysis_existed_in_me(): emg_props = { "job_id": 147343, - "last_populated_me": '1970-01-01 00:00:00', - "last_updated_me": '1980-01-01 00:00:00', + "last_mgx_indexed": '1970-01-01 00:00:00', + "last_update": '1980-01-01 00:00:00', "is_suppressed": False, "is_private": False } From 2f832247bc288f6a0c1728191ad9cf38ac5231b1 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 29 Jan 2024 13:19:51 +0000 Subject: [PATCH 24/46] Fix some ME tests --- emgapi/metagenomics_exchange.py | 3 ++- tests/me/test_metagenomics_exchange.py | 2 
+- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 74cf61c6b..56b8d7970 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -93,8 +93,9 @@ def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata if public: params = { "status": "public" if public else "private", + "broker": self.broker } - endpoint = f"sequences/{sequence_id}" + endpoint = f"sequences/{sequence_id}/datasets" response = self.get_request(endpoint=endpoint, params=params) analysis_registry_id = None metadata_match = True diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index 58558df01..48f6ff640 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -65,7 +65,7 @@ def test_wrong_delete_request_me(self): registry_id = "MGX0000780" endpoint = f"dataset/{registry_id}" - with pytest.raises(requests.HTTPError, match="401 Client Error"): + with pytest.raises(requests.HTTPError, match="404 Client Error"): me_api.delete_request(endpoint) def test_patch_analysis_me(self): From 5961f7d498aebd4bc5d862c41db3c4ea19404d42 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 14:06:47 +0000 Subject: [PATCH 25/46] WIP - migration tidy up --- .../migrations/0014_analysisjob_last_update.py | 18 ------------------ ...0118_1220.py => 0017_auto_20240129_1401.py} | 17 ++++++++++------- emgapi/models.py | 2 +- 3 files changed, 11 insertions(+), 26 deletions(-) delete mode 100644 emgapi/migrations/0014_analysisjob_last_update.py rename emgapi/migrations/{0013_auto_20240118_1220.py => 0017_auto_20240129_1401.py} (83%) diff --git a/emgapi/migrations/0014_analysisjob_last_update.py b/emgapi/migrations/0014_analysisjob_last_update.py deleted file mode 100644 index dcb791e04..000000000 --- a/emgapi/migrations/0014_analysisjob_last_update.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated 
by Django 3.2.18 on 2024-01-18 13:55 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('emgapi', '0013_auto_20240118_1220'), - ] - - operations = [ - migrations.AddField( - model_name='analysisjob', - name='last_update', - field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), - ), - ] diff --git a/emgapi/migrations/0013_auto_20240118_1220.py b/emgapi/migrations/0017_auto_20240129_1401.py similarity index 83% rename from emgapi/migrations/0013_auto_20240118_1220.py rename to emgapi/migrations/0017_auto_20240129_1401.py index 5f2b21809..e8e9d380e 100644 --- a/emgapi/migrations/0013_auto_20240118_1220.py +++ b/emgapi/migrations/0017_auto_20240129_1401.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.18 on 2024-01-18 12:20 +# Generated by Django 3.2.18 on 2024-01-29 14:01 from django.db import migrations, models @@ -6,10 +6,18 @@ class Migration(migrations.Migration): dependencies = [ - ('emgapi', '0012_alter_publication_pub_type'), + ('emgapi', '0016_auto_20240117_1757'), ] operations = [ + migrations.RemoveField( + model_name='analysisjob', + name='last_indexed', + ), + migrations.RemoveField( + model_name='study', + name='last_indexed', + ), migrations.AddField( model_name='analysisjob', name='last_ebi_search_indexed', @@ -30,9 +38,4 @@ class Migration(migrations.Migration): name='last_ebi_search_indexed', field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), ), - migrations.AlterField( - model_name='study', - name='last_update', - field=models.DateTimeField(auto_now=True, db_column='LAST_UPDATE'), - ), ] diff --git a/emgapi/models.py b/emgapi/models.py index ce547dfba..4abcd41fd 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -18,7 +18,7 @@ from django.apps import apps from django.conf import settings -from django.core.exceptions import 
FieldDoesNotExist, ObjectDoesNotExist +from django.core.exceptions import FieldDoesNotExist from django.db import models from django.db.models import (CharField, Count, OuterRef, Prefetch, Q, Subquery, Value, QuerySet, F) From 60d2a37505a19cf7da7902b1140d9a389f4557ae Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 29 Jan 2024 17:22:10 +0000 Subject: [PATCH 26/46] Adjust field names to new convention (last_ebi_search_indexed) --- emgapi/management/commands/ebi_search_analysis_dump.py | 4 ++-- emgapi/management/commands/ebi_search_study_dump.py | 4 ++-- emgapi/serializers.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/emgapi/management/commands/ebi_search_analysis_dump.py b/emgapi/management/commands/ebi_search_analysis_dump.py index 39d9fd12d..03edb30d3 100644 --- a/emgapi/management/commands/ebi_search_analysis_dump.py +++ b/emgapi/management/commands/ebi_search_analysis_dump.py @@ -211,6 +211,6 @@ def handle(self, *args, **options): # Small buffer into the future so that the indexing time remains ahead of auto-now updated times. for analysis in page: - analysis.last_indexed = nowish + analysis.last_ebi_search_indexed = nowish - AnalysisJob.objects.bulk_update(page, fields=["last_indexed"]) + AnalysisJob.objects.bulk_update(page, fields=["last_ebi_search_indexed"]) diff --git a/emgapi/management/commands/ebi_search_study_dump.py b/emgapi/management/commands/ebi_search_study_dump.py index 888529262..ee14b91c4 100644 --- a/emgapi/management/commands/ebi_search_study_dump.py +++ b/emgapi/management/commands/ebi_search_study_dump.py @@ -103,6 +103,6 @@ def handle(self, *args, **options): # Small buffer into the future so that the indexing time remains ahead of auto-now updated times. 
for study in studies: - study.last_indexed = nowish + study.last_ebi_search_indexed = nowish - Study.objects.bulk_update(studies, fields=["last_indexed"]) + Study.objects.bulk_update(studies, fields=["last_ebi_search_indexed"]) diff --git a/emgapi/serializers.py b/emgapi/serializers.py index 0079a0c66..0e1a236ff 100644 --- a/emgapi/serializers.py +++ b/emgapi/serializers.py @@ -1021,7 +1021,8 @@ class Meta: 'is_suppressed', 'suppressed_at', 'suppression_reason', - 'last_indexed' + 'last_ebi_search_indexed', + 'last_mgx_indexed', ) @@ -1410,7 +1411,7 @@ class Meta: 'is_suppressed', 'suppression_reason', 'suppressed_at', - 'last_indexed', + 'last_ebi_search_indexed', ) From ad539e7312e7e11eafdb94f49a383303fa3c620e Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 30 Jan 2024 16:37:27 +0000 Subject: [PATCH 27/46] WIP testing --- emgapi/metagenomics_exchange.py | 8 ++- tests/me/test_metagenomics_exchange.py | 1 + .../me/test_populate_metagenomics_exchange.py | 9 ++- tests/test_utils/emg_fixtures.py | 70 ++++++++++++++++++- 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 56b8d7970..24f63b104 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -96,9 +96,15 @@ def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata "broker": self.broker } endpoint = f"sequences/{sequence_id}/datasets" - response = self.get_request(endpoint=endpoint, params=params) analysis_registry_id = None metadata_match = True + + try: + response = self.get_request(endpoint=endpoint, params=params) + except: + logging.warning(f"Get API request failed") + return analysis_registry_id , metadata_match + if response.ok: data = response.json() datasets = data.get("datasets") diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index 48f6ff640..e65135789 100644 --- a/tests/me/test_metagenomics_exchange.py +++ 
b/tests/me/test_metagenomics_exchange.py @@ -26,6 +26,7 @@ def test_check_not_existing_analysis_me(self): return_values = me_api.check_analysis(source_id, seq_id, True) assert not return_values[0] + @pytest.mark.skip(reason="Error on ME API side") def test_post_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA00293719" diff --git a/tests/me/test_populate_metagenomics_exchange.py b/tests/me/test_populate_metagenomics_exchange.py index cd4e7bc09..f79c18078 100644 --- a/tests/me/test_populate_metagenomics_exchange.py +++ b/tests/me/test_populate_metagenomics_exchange.py @@ -26,13 +26,17 @@ @pytest.mark.django_db -class TestMeAPI: - @pytest.mark.usefixtures("run_multiple_analysis") +class TestPopulateMeAPI: + @pytest.mark.usefixtures("run_multiple_analysis_me") def test_population_dry_mode(self, caplog): call_command( "populate_metagenomics_exchange", dry_run=True, ) + assert "Indexing 3 new analyses" in caplog.text + assert "MGYA00001234 does not exist in ME" in caplog.text + assert "MGYA00005678 does not exist in ME" in caplog.text + assert "MGYA00466090 does not exist in ME" in caplog.text assert "Dry-mode run: no addition to real ME for MGYA00001234" in caplog.text assert "Dry-mode run: no addition to real ME for MGYA00005678" in caplog.text assert "Dry-mode run: no addition to real ME for MGYA00466090" in caplog.text @@ -60,6 +64,7 @@ def test_update_dry_mode(self, caplog): "populate_metagenomics_exchange", dry_run=True, ) + assert "Indexing 1 new analyses" in caplog.text assert "Incorrect field None in ME (ERR1806500)" in caplog.text assert "Dry-mode run: no patch to real ME for MGYA00147343" in caplog.text assert "Processing 0 analyses to remove" in caplog.text diff --git a/tests/test_utils/emg_fixtures.py b/tests/test_utils/emg_fixtures.py index 35ad15a9f..a4e4cdd32 100644 --- a/tests/test_utils/emg_fixtures.py +++ b/tests/test_utils/emg_fixtures.py @@ -38,7 +38,7 @@ 'ena_public_assemblies', 'ena_private_assemblies', 
'ena_suppressed_assemblies', 'ena_suppression_propagation_samples', 'ena_suppression_propagation_assemblies', 'assembly_extra_annotation', 'ena_suppression_propagation_studies', 'ena_suppression_propagation_runs', - 'suppressed_analysis_jobs', 'analysis_existed_in_me', + 'suppressed_analysis_jobs', 'analysis_existed_in_me', 'run_multiple_analysis_me' ] @@ -1120,3 +1120,71 @@ def analysis_existed_in_me(): "is_private": False } return baker.make(emg_models.AnalysisJob, **emg_props) + + +@pytest.fixture +def run_multiple_analysis_me(study, sample, analysis_status, + experiment_type): + pipeline, created = emg_models.Pipeline.objects.get_or_create( + pk=1, + release_version='1.0', + release_date='1970-01-01', + ) + pipeline4, created4 = emg_models.Pipeline.objects.get_or_create( + pk=4, + release_version='4.0', + release_date='1970-01-01', + ) + pipeline5, created5 = emg_models.Pipeline.objects.get_or_create( + pk=5, + release_version='5.0', + release_date='2020-01-01', + ) + run = emg_models.Run.objects.create( + run_id=1234, + accession='ERR3063408', + sample=sample, + study=study, + is_private=False, + experiment_type=experiment_type + ) + _anl1 = emg_models.AnalysisJob.objects.create( + job_id=1234, + sample=sample, + study=study, + run=run, + is_private=False, + experiment_type=experiment_type, + pipeline=pipeline, + analysis_status=analysis_status, + input_file_name='ABC_FASTQ', + result_directory='test_data/version_1.0/ABC_FASTQ', + submit_time='1970-01-01 00:00:00', + ) + _anl4 = emg_models.AnalysisJob.objects.create( + job_id=5678, + sample=sample, + study=study, + run=run, + is_private=False, + experiment_type=experiment_type, + pipeline=pipeline4, + analysis_status=analysis_status, + input_file_name='ABC_FASTQ', + result_directory='test_data/version_4.0/ABC_FASTQ', + submit_time='1970-01-01 00:00:00', + ) + _anl5 = emg_models.AnalysisJob.objects.create( + job_id=466090, + sample=sample, + study=study, + run=run, + is_private=False, + 
experiment_type=experiment_type, + pipeline=pipeline5, + analysis_status=analysis_status, + input_file_name='ABC_FASTQ', + result_directory='test_data/version_5.0/ABC_FASTQ', + submit_time='2020-01-01 00:00:00', + ) + return (_anl1, _anl4, _anl5) From 986f2229f49b2cfa57bccdcf65e4ca2f3f47ddbf Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Wed, 31 Jan 2024 16:20:29 +0000 Subject: [PATCH 28/46] Tests for populate ME --- .../populate_metagenomics_exchange.py | 55 +++++--- emgapi/metagenomics_exchange.py | 5 +- emgapi/models.py | 9 ++ .../me/test_populate_metagenomics_exchange.py | 120 ++++++++++++++---- tests/test_utils/emg_fixtures.py | 62 ++++++--- 5 files changed, 181 insertions(+), 70 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 37f2b2544..bd0b80f74 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -76,13 +76,15 @@ def handle(self, *args, **options): self.dry_run = options.get("dry_run") self.pipeline_version = options.get("pipeline") - mgx_api = MetagenomicsExchangeAPI(base_url=settings.METAGENOMICS_EXCHANGE_API) + self.mgx_api = MetagenomicsExchangeAPI(base_url=settings.METAGENOMICS_EXCHANGE_API) - analyses_to_index = AnalysisJob.objects_for_mgx_indexing.to_add() - analyses_to_delete = AnalysisJob.objects_for_mgx_indexing.to_delete() + # never indexed or updated after indexed + analyses_to_index_and_update = AnalysisJob.objects_for_mgx_indexing.to_add() + # suppressed only + analyses_to_delete = AnalysisJob.objects_for_mgx_indexing.get_suppressed() if self.study_accession: - analyses_to_index = analyses_to_index.filter( + analyses_to_index_and_update = analyses_to_index_and_update.filter( study__secondary_accession__in=self.study_accession ) analyses_to_delete = analyses_to_delete.filter( @@ -90,16 +92,23 @@ def handle(self, *args, **options): ) if 
self.pipeline_version: - analyses_to_index = analyses_to_index.filter( - pipeline__pipeline_id=self.pipeline_version + analyses_to_index_and_update = analyses_to_index_and_update.filter( + pipeline__release_version=self.pipeline_version ) analyses_to_delete = analyses_to_delete.filter( - pipeline__pipeline_id=self.pipeline_version + pipeline__release_version=self.pipeline_version ) - logging.info(f"Indexing {len(analyses_to_index)} new analyses") + self.process_to_index_and_update_records(analyses_to_index_and_update) + self.process_to_delete_records(analyses_to_delete) - for page in Paginator(analyses_to_index, 100): + logging.info("Done") + + + def process_to_index_and_update_records(self, analyses_to_index_and_update): + logging.info(f"Indexing {len(analyses_to_index_and_update)} new analyses") + + for page in Paginator(analyses_to_index_and_update, 100): jobs_to_update = [] for ajob in page: @@ -108,23 +117,25 @@ def handle(self, *args, **options): run_accession=ajob.run, status="public" if not ajob.is_private else "private", ) - registry_id, metadata_match = mgx_api.check_analysis( + registry_id, metadata_match = self.mgx_api.check_analysis( source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata ) # The job is not registered if not registry_id: - logging.debug(f"Add new {ajob}") + logging.info(f"Add new {ajob}") if self.dry_run: logging.info(f"Dry-mode run: no addition to real ME for {ajob}") continue - response = mgx_api.add_analysis( + response = self.mgx_api.add_analysis( mgya=ajob.accession, run_accession=ajob.run, public=not ajob.is_private, ) if response.ok: - logging.debug(f"Added {ajob}") + logging.info(f"Successfully added {ajob}") + registry_id, metadata_match = self.mgx_api.check_analysis( + source_id=ajob.accession, sequence_id=ajob.run) ajob.mgx_accession = registry_id ajob.last_mgx_indexed = timezone.now() + timedelta(minutes=1) jobs_to_update.append(ajob) @@ -134,14 +145,14 @@ def handle(self, *args, **options): # else we have to 
check if the metadata matches, if not we need to update it else: if not metadata_match: - logging.debug(f"Patch existing {ajob}") + logging.info(f"Patch existing {ajob}") if self.dry_run: logging.info( f"Dry-mode run: no patch to real ME for {ajob}" ) continue - if mgx_api.patch_analysis( - registry_id=registry_id, data=metadata + if self.mgx_api.patch_analysis( + registry_id=registry_id, data=metadata ): logging.info(f"Analysis {ajob} updated successfully") # Just to be safe, update the MGX accession @@ -157,6 +168,10 @@ def handle(self, *args, **options): jobs_to_update, ["last_mgx_indexed", "mgx_accession"], batch_size=100 ) + def process_to_delete_records(self, analyses_to_delete): + """ + This function removes suppressed records from ME. + """ logging.info(f"Processing {len(analyses_to_delete)} analyses to remove") for page in Paginator(analyses_to_delete, 100): @@ -168,14 +183,16 @@ def handle(self, *args, **options): run_accession=ajob.run, status="public" if not ajob.is_private else "private", ) - registry_id, _ = mgx_api.check_analysis( + registry_id, _ = self.mgx_api.check_analysis( source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata ) if registry_id: + logging.info(f"Deleting {ajob}") if self.dry_run: logging.info(f"Dry-mode run: no delete from real ME for {ajob}") + continue - if mgx_api.delete_analysis(registry_id): + if self.mgx_api.delete_analysis(registry_id): logging.info(f"{ajob} successfully deleted") ajob.last_mgx_indexed = timezone.now() jobs_to_update.append(ajob) @@ -190,5 +207,3 @@ def handle(self, *args, **options): AnalysisJob.objects.bulk_update( jobs_to_update, ["last_mgx_indexed"], batch_size=100 ) - - logging.info("Done") diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 24f63b104..a5942bc75 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -88,7 +88,7 @@ def add_analysis(self, mgya: str, run_accession: str, public: bool): def check_analysis(self, 
source_id: str, sequence_id: str, public=None, metadata=None) -> [str, bool]: - logging.info(f"Check {source_id}") + logging.info(f"Check {source_id} {sequence_id}") params = {} if public: params = { @@ -120,8 +120,9 @@ def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata return analysis_registry_id , metadata_match else: if metadata[metadata_record] != found_record[metadata_record]: + print(metadata[metadata_record], found_record[metadata_record]) metadata_match = False - logging.info(f"Incorrect field {metadata[metadata_record]} in ME ({found_record[metadata_record]})") + logging.info(f"Incorrect field {metadata[metadata_record]} != {found_record[metadata_record]})") return analysis_registry_id, metadata_match return analysis_registry_id , metadata_match else: diff --git a/emgapi/models.py b/emgapi/models.py index 4abcd41fd..cf6ab08f8 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -331,6 +331,15 @@ def to_add(self): return self.filter(never_indexed | updated_after_indexing, not_suppressed, not_private) + def get_suppressed(self): + try: + self.model._meta.get_field("suppressed_at") + except FieldDoesNotExist: + return Q() + else: + return self.filter( + Q(suppressed_at__gte=F(self.index_field)) + ) class EBISearchIndexQueryset(IndexableModelQueryset): diff --git a/tests/me/test_populate_metagenomics_exchange.py b/tests/me/test_populate_metagenomics_exchange.py index f79c18078..d9a35f32d 100644 --- a/tests/me/test_populate_metagenomics_exchange.py +++ b/tests/me/test_populate_metagenomics_exchange.py @@ -15,10 +15,10 @@ # limitations under the License. 
import pytest - from unittest import mock from django.core.management import call_command +from django.utils import timezone from test_utils.emg_fixtures import * # noqa @@ -29,48 +29,114 @@ class TestPopulateMeAPI: @pytest.mark.usefixtures("run_multiple_analysis_me") def test_population_dry_mode(self, caplog): + """ + 2 of 4 analyses require indexing, both are not in ME API + 1 of 4 needs to be deleted because it was suppressed + 1 was indexed after updated - no action needed + """ call_command( "populate_metagenomics_exchange", dry_run=True, ) - assert "Indexing 3 new analyses" in caplog.text - assert "MGYA00001234 does not exist in ME" in caplog.text - assert "MGYA00005678 does not exist in ME" in caplog.text - assert "MGYA00466090 does not exist in ME" in caplog.text - assert "Dry-mode run: no addition to real ME for MGYA00001234" in caplog.text - assert "Dry-mode run: no addition to real ME for MGYA00005678" in caplog.text + assert "Indexing 2 new analyses" in caplog.text assert "Dry-mode run: no addition to real ME for MGYA00466090" in caplog.text + assert "Dry-mode run: no addition to real ME for MGYA00466091" in caplog.text + assert "Processing 1 analyses to remove" in caplog.text + assert "MGYA00005678 doesn't exist in the registry, nothing to delete" in caplog.text + + + @pytest.mark.usefixtures("run_multiple_analysis_me") + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.add_analysis") + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.check_analysis") + def test_add_new_analysis(self, mock_check_analysis, mock_add_analysis, caplog): + """ + Test checks new added analysis that was not indexed before, It should be added to ME. + Post process is mocked. 
+ AnalysisJob should have updated indexed field and assigned MGX + """ + pipeline = 4.1 + registry_id = "MGX1" + class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + self.ok = True + def json(self): + return self.json_data + + def mock_check_process(*args, **kwargs): + if 'metadata' in kwargs: + return None, True + else: + return registry_id, True + + mock_add_analysis.return_value = MockResponse({}, 200) + mock_check_analysis.side_effect = mock_check_process + + call_command( + "populate_metagenomics_exchange", + pipeline=pipeline, + ) + assert "Indexing 1 new analyses" in caplog.text assert "Processing 0 analyses to remove" in caplog.text + assert "Successfully added MGYA00466090" in caplog.text + ajob = AnalysisJob.objects.filter(pipeline__release_version=pipeline).first() + assert ajob.last_mgx_indexed + assert ajob.mgx_accession == registry_id + + @pytest.mark.usefixtures("run_multiple_analysis_me") @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.check_analysis") - @pytest.mark.usefixtures("suppressed_analysis_jobs") - def test_removals_dry_mode(self, mock_check_analysis, caplog): - mock_check_analysis.return_value = None, False + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.delete_analysis") + def test_removals(self, + mock_delete_analysis, + mock_check_analysis, + caplog): + """ + Test delete process. 
+ 1 analysis should be removed and updated indexed field in DB + """ + pipeline = 4.0 + mock_check_analysis.return_value = True, True + mock_delete_analysis.return_value = True + call_command( "populate_metagenomics_exchange", - dry_run=True, + pipeline=pipeline ) - ajobs = AnalysisJob.objects.all() - for job in ajobs: - assert ( - f"{job.accession} doesn't exist in the registry, nothing to delete" - in caplog.text - ) assert "Indexing 0 new analyses" in caplog.text + assert "Processing 1 analyses to remove" in caplog.text + assert "Deleting MGYA00005678" in caplog.text + assert "MGYA00005678 successfully deleted" in caplog.text + ajob = AnalysisJob.objects.filter(pipeline__release_version=pipeline).first() + assert ajob.last_mgx_indexed.date() == timezone.now().date() - @pytest.mark.usefixtures("analysis_existed_in_me") - def test_update_dry_mode(self, caplog): + @pytest.mark.usefixtures("run_multiple_analysis_me") + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.check_analysis") + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.patch_analysis") + def test_update(self, + mock_patch_analysis, + mock_check_analysis, + caplog): + """ + Test update process for job that was indexed before updated. 
+ MGX accession and last_mgx_indexed should be updated + """ + pipeline = 5.0 + registry_id = "MGX2" + mock_check_analysis.return_value = registry_id, False + mock_patch_analysis.return_value = True call_command( "populate_metagenomics_exchange", - dry_run=True, + pipeline=pipeline, ) + assert "Indexing 1 new analyses" in caplog.text - assert "Incorrect field None in ME (ERR1806500)" in caplog.text - assert "Dry-mode run: no patch to real ME for MGYA00147343" in caplog.text assert "Processing 0 analyses to remove" in caplog.text + assert "Patch existing MGYA00466091" in caplog.text + assert "Analysis MGYA00466091 updated successfully" in caplog.text + ajob = AnalysisJob.objects.filter(pipeline__release_version=pipeline).first() + assert ajob.last_mgx_indexed.date() == timezone.now().date() + assert ajob.mgx_accession == registry_id + - @pytest.mark.usefixtures("run_multiple_analysis") - def test_population(self, caplog): - call_command( - "populate_metagenomics_exchange", - ) diff --git a/tests/test_utils/emg_fixtures.py b/tests/test_utils/emg_fixtures.py index a4e4cdd32..25f99c70a 100644 --- a/tests/test_utils/emg_fixtures.py +++ b/tests/test_utils/emg_fixtures.py @@ -38,7 +38,7 @@ 'ena_public_assemblies', 'ena_private_assemblies', 'ena_suppressed_assemblies', 'ena_suppression_propagation_samples', 'ena_suppression_propagation_assemblies', 'assembly_extra_annotation', 'ena_suppression_propagation_studies', 'ena_suppression_propagation_runs', - 'suppressed_analysis_jobs', 'analysis_existed_in_me', 'run_multiple_analysis_me' + 'suppressed_analysis_jobs', 'run_multiple_analysis_me' ] @@ -1110,46 +1110,45 @@ def suppressed_analysis_jobs(ena_suppressed_runs): 'last_mgx_indexed': '1970-01-01 00:00:00'}) return suppressed_analysisjobs -@pytest.fixture -def analysis_existed_in_me(): - emg_props = { - "job_id": 147343, - "last_mgx_indexed": '1970-01-01 00:00:00', - "last_update": '1980-01-01 00:00:00', - "is_suppressed": False, - "is_private": False - } - return 
baker.make(emg_models.AnalysisJob, **emg_props) - @pytest.fixture def run_multiple_analysis_me(study, sample, analysis_status, experiment_type): + """ + Run: ERR1806500 + MGYA0000147343: pipeline v1: indexed after created - no action needed + MGYA0000005678: pipeline v4.0: suppressed - delete from ME + MGYA0000466090: pipeline v4.1: never indexed - add to ME + MGYA0000466091: pipeline v5: update in ME + """ pipeline, created = emg_models.Pipeline.objects.get_or_create( pk=1, release_version='1.0', - release_date='1970-01-01', + release_date='2000-01-01', ) pipeline4, created4 = emg_models.Pipeline.objects.get_or_create( pk=4, release_version='4.0', - release_date='1970-01-01', + release_date='2015-01-01', ) - pipeline5, created5 = emg_models.Pipeline.objects.get_or_create( + pipeline4_1, created4_1 = emg_models.Pipeline.objects.get_or_create( pk=5, - release_version='5.0', - release_date='2020-01-01', + release_version='4.1', + release_date='2016-01-01', + ) + pipeline5, created5 = emg_models.Pipeline.objects.get_or_create( + pk=6, ) run = emg_models.Run.objects.create( - run_id=1234, - accession='ERR3063408', + run_id=111, + accession='ERR1806500', sample=sample, study=study, is_private=False, experiment_type=experiment_type ) _anl1 = emg_models.AnalysisJob.objects.create( - job_id=1234, + job_id=147343, sample=sample, study=study, run=run, @@ -1160,6 +1159,8 @@ def run_multiple_analysis_me(study, sample, analysis_status, input_file_name='ABC_FASTQ', result_directory='test_data/version_1.0/ABC_FASTQ', submit_time='1970-01-01 00:00:00', + last_mgx_indexed='2970-01-01 20:00:00', + is_suppressed=False, ) _anl4 = emg_models.AnalysisJob.objects.create( job_id=5678, @@ -1173,6 +1174,9 @@ def run_multiple_analysis_me(study, sample, analysis_status, input_file_name='ABC_FASTQ', result_directory='test_data/version_4.0/ABC_FASTQ', submit_time='1970-01-01 00:00:00', + last_mgx_indexed='1970-01-01 20:00:00', + is_suppressed=True, + suppressed_at='1980-01-01 20:00:00', ) _anl5 
= emg_models.AnalysisJob.objects.create( job_id=466090, @@ -1181,10 +1185,26 @@ def run_multiple_analysis_me(study, sample, analysis_status, run=run, is_private=False, experiment_type=experiment_type, + pipeline=pipeline4_1, + analysis_status=analysis_status, + input_file_name='ABC_FASTQ', + result_directory='test_data/version_5.0/ABC_FASTQ', + submit_time='2020-01-01 00:00:00', + is_suppressed=False, + ) + _anl51 = emg_models.AnalysisJob.objects.create( + job_id=466091, + sample=sample, + study=study, + run=run, + is_private=False, + experiment_type=experiment_type, pipeline=pipeline5, analysis_status=analysis_status, input_file_name='ABC_FASTQ', result_directory='test_data/version_5.0/ABC_FASTQ', submit_time='2020-01-01 00:00:00', + last_mgx_indexed='2020-01-01 20:00:00', + is_suppressed=False, ) - return (_anl1, _anl4, _anl5) + return (_anl1, _anl4, _anl5, _anl51) From 916ce45b4707b643ebffe7c2adef0630f3776f0d Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Wed, 31 Jan 2024 16:22:53 +0000 Subject: [PATCH 29/46] rm dev from tests and leave only DEV API --- config/local-tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/config/local-tests.yml b/config/local-tests.yml index b3f7b589b..52636a862 100644 --- a/config/local-tests.yml +++ b/config/local-tests.yml @@ -17,6 +17,4 @@ emg: results_production_dir: '/dummy/path/results' # metagenomics exchange me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' - me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' - me_api_dev: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' - me_api_token_dev: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file + me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file From 948caa8e2464a29c0d12edfaf70eaa3ac0136ed4 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Wed, 31 Jan 2024 16:25:05 +0000 Subject: [PATCH 30/46] fix pipeline mock --- tests/test_utils/emg_fixtures.py | 2 ++ 1 
file changed, 2 insertions(+) diff --git a/tests/test_utils/emg_fixtures.py b/tests/test_utils/emg_fixtures.py index 25f99c70a..5719b506b 100644 --- a/tests/test_utils/emg_fixtures.py +++ b/tests/test_utils/emg_fixtures.py @@ -1138,6 +1138,8 @@ def run_multiple_analysis_me(study, sample, analysis_status, ) pipeline5, created5 = emg_models.Pipeline.objects.get_or_create( pk=6, + release_version='5.0', + release_date='2020-01-01', ) run = emg_models.Run.objects.create( run_id=111, From 6a55c40bfa0ad390fe619d0b54cfb918f1ecccb9 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Thu, 1 Feb 2024 11:52:00 +0000 Subject: [PATCH 31/46] fixes --- ci/configuration.yaml | 3 +- config/local-tests.yml | 3 +- .../populate_metagenomics_exchange.py | 21 ++----- emgapi/metagenomics_exchange.py | 9 ++- emgapi/models.py | 63 +++---------------- emgcli/settings.py | 5 +- 6 files changed, 26 insertions(+), 78 deletions(-) diff --git a/ci/configuration.yaml b/ci/configuration.yaml index 33f780ee9..1102dc73e 100644 --- a/ci/configuration.yaml +++ b/ci/configuration.yaml @@ -19,5 +19,4 @@ emg: celery_backend: 'redis://localhost:6379/1' results_production_dir: '/dummy/path/results' # metagenomics exchange - me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' - me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file + me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' \ No newline at end of file diff --git a/config/local-tests.yml b/config/local-tests.yml index 52636a862..db4297888 100644 --- a/config/local-tests.yml +++ b/config/local-tests.yml @@ -16,5 +16,4 @@ emg: celery_backend: 'redis://localhost:6379/1' results_production_dir: '/dummy/path/results' # metagenomics exchange - me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' - me_api_token: 'mgx 871cd915-2826-46bb-94ed-8e6a3d9b6014' \ No newline at end of file + me_api: 'https://wwwdev.ebi.ac.uk/ena/registry/metagenome/api' \ No newline at end of file diff --git 
a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index bd0b80f74..1d188d678 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -60,17 +60,6 @@ def add_arguments(self, parser): help="Dry mode, no population of ME", ) - def generate_metadata(self, mgya, run_accession, status): - return { - "confidence": "full", - "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", - "method": ["other_metadata"], - "sourceID": mgya, - "sequenceID": run_accession, - "status": status, - "brokerID": settings.METAGENOMICS_EXCHANGE_MGNIFY_BROKER, - } - def handle(self, *args, **options): self.study_accession = options.get("study") self.dry_run = options.get("dry_run") @@ -81,7 +70,7 @@ def handle(self, *args, **options): # never indexed or updated after indexed analyses_to_index_and_update = AnalysisJob.objects_for_mgx_indexing.to_add() # suppressed only - analyses_to_delete = AnalysisJob.objects_for_mgx_indexing.get_suppressed() + analyses_to_delete = AnalysisJob.objects_for_mgx_indexing.to_delete() if self.study_accession: analyses_to_index_and_update = analyses_to_index_and_update.filter( @@ -104,15 +93,13 @@ def handle(self, *args, **options): logging.info("Done") - def process_to_index_and_update_records(self, analyses_to_index_and_update): logging.info(f"Indexing {len(analyses_to_index_and_update)} new analyses") for page in Paginator(analyses_to_index_and_update, 100): jobs_to_update = [] - for ajob in page: - metadata = self.generate_metadata( + metadata = self.mgx_api.generate_metadata( mgya=ajob.accession, run_accession=ajob.run, status="public" if not ajob.is_private else "private", @@ -157,7 +144,7 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): logging.info(f"Analysis {ajob} updated successfully") # Just to be safe, update the MGX accession ajob.mgx_accession = 
registry_id - ajob.last_mgx_indexed = timezone.now() + ajob.last_mgx_indexed = timezone.now() + timedelta(minutes=1) jobs_to_update.append(ajob) else: logging.error(f"Analysis {ajob} update failed") @@ -178,7 +165,7 @@ def process_to_delete_records(self, analyses_to_delete): jobs_to_update = [] for ajob in page: - metadata = self.generate_metadata( + metadata = self.mgx_api.generate_metadata( mgya=ajob.accession, run_accession=ajob.run, status="public" if not ajob.is_private else "private", diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index a5942bc75..bcbab1f88 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -73,16 +73,19 @@ def patch_request(self, endpoint: str, data: dict): response.raise_for_status() return response - def add_analysis(self, mgya: str, run_accession: str, public: bool): - data = { + def generate_metadata(self, mgya, run_accession, status): + return { "confidence": "full", "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", "method": ["other_metadata"], "sourceID": mgya, "sequenceID": run_accession, - "status": "public" if public else "private", + "status": status, "brokerID": self.broker, } + + def add_analysis(self, mgya: str, run_accession: str, public: bool): + data = self.generate_metadata(mgya, run_accession, public) response = self.post_request(endpoint="datasets", data=data) return response diff --git a/emgapi/models.py b/emgapi/models.py index cf6ab08f8..a7d36220a 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -331,16 +331,6 @@ def to_add(self): return self.filter(never_indexed | updated_after_indexing, not_suppressed, not_private) - def get_suppressed(self): - try: - self.model._meta.get_field("suppressed_at") - except FieldDoesNotExist: - return Q() - else: - return self.filter( - Q(suppressed_at__gte=F(self.index_field)) - ) - class EBISearchIndexQueryset(IndexableModelQueryset): @@ -366,6 +356,16 @@ class 
MetagenomicsExchangeQueryset(IndexableModelQueryset): index_field = "last_mgx_indexed" + def to_delete(self): + try: + self.model._meta.get_field("suppressed_at") + except FieldDoesNotExist: + return Q() + else: + return self.filter( + Q(suppressed_at__gte=F(self.index_field)) + ) + class MetagenomicsExchangeIndexedModel(models.Model): """Model to track Metagenomics Exchange indexation of analysis jobs @@ -1687,49 +1687,6 @@ def __str__(self): return 'Assembly:{} - Sample:{}'.format(self.assembly, self.sample) -class MetagenomicsExchangeQueryset(models.QuerySet): - """ - to_delete: Objects that have been suppressed since they were last populated, - or that have been added but updated since. - - to_add: Objects that have never been added, - or that have been added but updated since. - """ - def to_delete(self): - updated_after_populating = Q(last_updated_me__gte=F("last_populated_me"), last_populated_me__isnull=False) - - try: - self.model._meta.get_field("suppressed_at") - except FieldDoesNotExist: - return self.filter( - updated_after_populating - ) - else: - return self.filter( - Q(suppressed_at__gte=F("last_populated_me")) - ) - - def to_add(self): - updated_after_populating = Q(last_updated_me__gte=F("last_populated_me"), last_populated_me__isnull=False) - never_populated = Q(last_populated_me__isnull=True) - - try: - self.model._meta.get_field("is_suppressed") - except FieldDoesNotExist: - not_suppressed = Q() - else: - not_suppressed = Q(is_suppressed=False) - - try: - self.model._meta.get_field("is_private") - except FieldDoesNotExist: - not_private = Q() - else: - not_private = Q(is_private=False) - - return self.filter(never_populated | updated_after_populating, not_suppressed, not_private) - - class MetagenomicsExchangeModel(models.Model): """Model to track Metagenomics Exchange population https://www.ebi.ac.uk/ena/registry/metagenome/api/ diff --git a/emgcli/settings.py b/emgcli/settings.py index 27fc146eb..0d55f1e94 100644 --- a/emgcli/settings.py +++ 
b/emgcli/settings.py @@ -689,6 +689,9 @@ def create_secret_key(var_dir): METAGENOMICS_EXCHANGE_API_TOKEN = "" try: METAGENOMICS_EXCHANGE_API = EMG_CONF['emg']['me_api'] - METAGENOMICS_EXCHANGE_API_TOKEN = EMG_CONF['emg']['me_api_token'] + if EMG_CONF['emg'].get('me_api_token'): + METAGENOMICS_EXCHANGE_API_TOKEN = EMG_CONF['emg']['me_api_token'] + else: + METAGENOMICS_EXCHANGE_API_TOKEN = os.getenv('METAGENOMICS_EXCHANGE_API_TOKEN') except KeyError: warnings.warn("The metagenomics exchange API and Token are not configured properly") From 18d4641a393a6c33f8ef48dc419f2c0fa10ef26c Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Thu, 1 Feb 2024 12:06:17 +0000 Subject: [PATCH 32/46] add mock to patch test --- tests/me/test_metagenomics_exchange.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index e65135789..bc9a4b5a9 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -9,6 +9,7 @@ import requests import responses +from unittest import mock class TestME: @@ -69,8 +70,19 @@ def test_wrong_delete_request_me(self): with pytest.raises(requests.HTTPError, match="404 Client Error"): me_api.delete_request(endpoint) - def test_patch_analysis_me(self): + @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.patch_request") + def test_patch_analysis_me(self, + mock_patch_request): me_api = MetagenomicsExchangeAPI() + class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + self.ok = True + def json(self): + return self.json_data + + mock_patch_request.return_value = MockResponse({}, 200) registry_id = "MGX0000788" mgya = "MGYA00593709" run_accession = "SRR3960575" From bfb1b542a42aea8970397b619799b519ac8e88c0 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 6 Feb 2024 16:08:11 +0000 Subject: [PATCH 33/46] some 
fixes after review --- .../populate_metagenomics_exchange.py | 10 ++++----- emgapi/metagenomics_exchange.py | 4 ++-- emgapi/models.py | 21 ------------------- emgcli/settings.py | 8 +++---- 4 files changed, 11 insertions(+), 32 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 1d188d678..8707cd8e9 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 2017-2022 EMBL - European Bioinformatics Institute +# Copyright 2017-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -96,7 +96,7 @@ def handle(self, *args, **options): def process_to_index_and_update_records(self, analyses_to_index_and_update): logging.info(f"Indexing {len(analyses_to_index_and_update)} new analyses") - for page in Paginator(analyses_to_index_and_update, 100): + for page in Paginator(analyses_to_index_and_update, settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER): jobs_to_update = [] for ajob in page: metadata = self.mgx_api.generate_metadata( @@ -152,7 +152,7 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): logging.debug(f"No edit for {ajob}, metadata is correct") AnalysisJob.objects.bulk_update( - jobs_to_update, ["last_mgx_indexed", "mgx_accession"], batch_size=100 + jobs_to_update, ["last_mgx_indexed", "mgx_accession"], batch_size=settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER ) def process_to_delete_records(self, analyses_to_delete): @@ -161,7 +161,7 @@ def process_to_delete_records(self, analyses_to_delete): """ logging.info(f"Processing {len(analyses_to_delete)} analyses to remove") - for page in Paginator(analyses_to_delete, 100): + for page in 
Paginator(analyses_to_delete, settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER): jobs_to_update = [] for ajob in page: @@ -192,5 +192,5 @@ def process_to_delete_records(self, analyses_to_delete): # BULK UPDATE # AnalysisJob.objects.bulk_update( - jobs_to_update, ["last_mgx_indexed"], batch_size=100 + jobs_to_update, ["last_mgx_indexed"], batch_size=settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER ) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index bcbab1f88..6c3015154 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 2018-2023 EMBL - European Bioinformatics Institute +# Copyright 2018-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ class MetagenomicsExchangeAPI: """Metagenomics Exchange API Client""" def __init__(self, base_url=None): - self.base_url = base_url if base_url else settings.METAGENOMICS_EXCHANGE_API + self.base_url = base_url or settings.METAGENOMICS_EXCHANGE_API self.__token = settings.METAGENOMICS_EXCHANGE_API_TOKEN self.broker = settings.METAGENOMICS_EXCHANGE_MGNIFY_BROKER diff --git a/emgapi/models.py b/emgapi/models.py index a7d36220a..31db68bb3 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -1687,27 +1687,6 @@ def __str__(self): return 'Assembly:{} - Sample:{}'.format(self.assembly, self.sample) -class MetagenomicsExchangeModel(models.Model): - """Model to track Metagenomics Exchange population - https://www.ebi.ac.uk/ena/registry/metagenome/api/ - """ - last_updated_me = models.DateTimeField( - db_column='LAST_UPDATED_ME', - auto_now=True - ) - last_populated_me = models.DateTimeField( - db_column='LAST_POPULATED_ME', - null=True, - blank=True, - help_text="Date at which this model was last appeared in Metagenomics Exchange" - ) - - 
objects_for_population = MetagenomicsExchangeQueryset.as_manager() - - class Meta: - abstract = True - - class AnalysisJobQuerySet(BaseQuerySet, MySQLQuerySet, SuppressQuerySet, SelectRelatedOnlyQuerySetMixin): def __init__(self, *args, **kwargs): diff --git a/emgcli/settings.py b/emgcli/settings.py index 0d55f1e94..c778d1f5a 100644 --- a/emgcli/settings.py +++ b/emgcli/settings.py @@ -687,11 +687,11 @@ def create_secret_key(var_dir): METAGENOMICS_EXCHANGE_MGNIFY_BROKER = "EMG" METAGENOMICS_EXCHANGE_API = "" METAGENOMICS_EXCHANGE_API_TOKEN = "" +METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER = 100 try: METAGENOMICS_EXCHANGE_API = EMG_CONF['emg']['me_api'] - if EMG_CONF['emg'].get('me_api_token'): + METAGENOMICS_EXCHANGE_API_TOKEN = os.getenv('METAGENOMICS_EXCHANGE_API_TOKEN') + if not METAGENOMICS_EXCHANGE_API_TOKEN: METAGENOMICS_EXCHANGE_API_TOKEN = EMG_CONF['emg']['me_api_token'] - else: - METAGENOMICS_EXCHANGE_API_TOKEN = os.getenv('METAGENOMICS_EXCHANGE_API_TOKEN') except KeyError: - warnings.warn("The metagenomics exchange API and Token are not configured properly") + warnings.warn("The metagenomics exchange API and Token are not configured properly") \ No newline at end of file From 7dbbb141dd3571909f9081c755170e5fee994b0e Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 7 Feb 2024 21:33:41 +0000 Subject: [PATCH 34/46] Rename EBI Seach last_indexed fields --- emgapi/migrations/0017_auto_20240129_1401.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/emgapi/migrations/0017_auto_20240129_1401.py b/emgapi/migrations/0017_auto_20240129_1401.py index e8e9d380e..9bf204673 100644 --- a/emgapi/migrations/0017_auto_20240129_1401.py +++ b/emgapi/migrations/0017_auto_20240129_1401.py @@ -10,18 +10,15 @@ class Migration(migrations.Migration): ] operations = [ - migrations.RemoveField( + migrations.RenameField( model_name='analysisjob', - name='last_indexed', + old_name='last_indexed', + new_name='last_ebi_search_indexed', ), - 
migrations.RemoveField( + migrations.RenameField( model_name='study', - name='last_indexed', - ), - migrations.AddField( - model_name='analysisjob', - name='last_ebi_search_indexed', - field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + old_name='last_indexed', + new_name='last_ebi_search_indexed', ), migrations.AddField( model_name='analysisjob', @@ -33,9 +30,4 @@ class Migration(migrations.Migration): name='mgx_accession', field=models.CharField(blank=True, db_column='MGX_ACCESSION', help_text='The Metagenomics Exchange accession.', max_length=10, null=True, unique=True), ), - migrations.AddField( - model_name='study', - name='last_ebi_search_indexed', - field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), - ), ] From 4a8f3b4ac0e97d38515377425b5305e08b48e713 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 7 Feb 2024 21:57:56 +0000 Subject: [PATCH 35/46] Add an extra migrate step to rename last_indexed -> last_ebi_search_indexed This is needed because RenameField doesn't allow you to change the db_column name. 
With this change it seems to work, and the data should be kept: ```sql -- -- Alter field last_indexed on analysisjob -- ALTER TABLE "ANALYSIS_JOB" RENAME COLUMN "LAST_INDEXED" TO "LAST_EBI_SEARCH_INDEXED"; -- -- Alter field last_indexed on study -- ALTER TABLE "STUDY" RENAME COLUMN "LAST_INDEXED" TO "LAST_EBI_SEARCH_INDEXED"; -- -- Rename field last_indexed on analysisjob to last_ebi_search_indexed -- -- -- Rename field last_indexed on study to last_ebi_search_indexed -- ``` --- emgapi/migrations/0017_auto_20240129_1401.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/emgapi/migrations/0017_auto_20240129_1401.py b/emgapi/migrations/0017_auto_20240129_1401.py index 9bf204673..a45176c9e 100644 --- a/emgapi/migrations/0017_auto_20240129_1401.py +++ b/emgapi/migrations/0017_auto_20240129_1401.py @@ -10,6 +10,16 @@ class Migration(migrations.Migration): ] operations = [ + migrations.AlterField( + model_name='analysisjob', + name='last_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), + migrations.AlterField( + model_name='study', + name='last_indexed', + field=models.DateTimeField(blank=True, db_column='LAST_EBI_SEARCH_INDEXED', help_text='Date at which this model was last included in an EBI Search initial/incremental index.', null=True), + ), migrations.RenameField( model_name='analysisjob', old_name='last_indexed', From f3daefecd49c407981ca73a073be335f4dca6bb6 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 12 Feb 2024 16:16:28 +0000 Subject: [PATCH 36/46] Fixes on the MGX based on feedback --- emgapi/metagenomics_exchange.py | 81 +++++++++++++++----------- tests/me/test_metagenomics_exchange.py | 29 ++++----- 2 files changed, 63 insertions(+), 47 deletions(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 6c3015154..dd69f5902 100644 --- 
a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -21,7 +21,6 @@ class MetagenomicsExchangeAPI: - """Metagenomics Exchange API Client""" def __init__(self, base_url=None): @@ -55,10 +54,7 @@ def delete_request(self, endpoint: str): "Accept": "application/json", "Authorization": self.__token, } - response = requests.delete( - f"{self.base_url}/{endpoint}", headers=headers - ) - response.raise_for_status() + response = requests.delete(f"{self.base_url}/{endpoint}", headers=headers) return response def patch_request(self, endpoint: str, data: dict): @@ -70,7 +66,6 @@ def patch_request(self, endpoint: str, data: dict): response = requests.patch( f"{self.base_url}/{endpoint}", json=data, headers=headers ) - response.raise_for_status() return response def generate_metadata(self, mgya, run_accession, status): @@ -89,14 +84,15 @@ def add_analysis(self, mgya: str, run_accession: str, public: bool): response = self.post_request(endpoint="datasets", data=data) return response - - def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata=None) -> [str, bool]: + def check_analysis( + self, source_id: str, sequence_id: str, public=None, metadata=None + ): logging.info(f"Check {source_id} {sequence_id}") params = {} if public: params = { "status": "public" if public else "private", - "broker": self.broker + "broker": self.broker, } endpoint = f"sequences/{sequence_id}/datasets" analysis_registry_id = None @@ -105,34 +101,42 @@ def check_analysis(self, source_id: str, sequence_id: str, public=None, metadata try: response = self.get_request(endpoint=endpoint, params=params) except: - logging.warning(f"Get API request failed") - return analysis_registry_id , metadata_match - - if response.ok: - data = response.json() - datasets = data.get("datasets") - sourceIDs = [item.get("sourceID") for item in datasets] - if source_id in sourceIDs: - found_record = [item for item in datasets if item.get("sourceID") == source_id][0] - 
logging.info(f"{source_id} exists in ME") - analysis_registry_id = found_record.get("registryID") - if metadata: - for metadata_record in metadata: - if not(metadata_record in found_record): + logging.error(f"Get API request failed") + return analysis_registry_id, metadata_match + + data = response.json() + datasets = data.get("datasets", []) + + # The API will return an emtpy datasets array if it can find the accession + if not len(datasets): + logging.info(f"{source_id} does not exist in ME") + return analysis_registry_id, metadata_match + + sourceIDs = [item.get("sourceID") for item in datasets] + if source_id in sourceIDs: + found_record = [ + item for item in datasets if item.get("sourceID") == source_id + ][0] + logging.info(f"{source_id} exists in ME") + analysis_registry_id = found_record.get("registryID") + if metadata: + for metadata_record in metadata: + if not (metadata_record in found_record): + metadata_match = False + return analysis_registry_id, metadata_match + else: + if metadata[metadata_record] != found_record[metadata_record]: metadata_match = False - return analysis_registry_id , metadata_match - else: - if metadata[metadata_record] != found_record[metadata_record]: - print(metadata[metadata_record], found_record[metadata_record]) - metadata_match = False - logging.info(f"Incorrect field {metadata[metadata_record]} != {found_record[metadata_record]})") - return analysis_registry_id, metadata_match - return analysis_registry_id , metadata_match - else: - logging.info(f"{source_id} does not exist in ME") + logging.info( + f"Incorrect field {metadata[metadata_record]} != {found_record[metadata_record]})" + ) + return analysis_registry_id, metadata_match + return analysis_registry_id, metadata_match + return analysis_registry_id, metadata_match def delete_analysis(self, registry_id: str): + """Delete an entry from the registry""" response = self.delete_request(endpoint=f"datasets/{registry_id}") if response.ok: logging.info(f"{registry_id} was 
deleted with {response.status_code}") @@ -140,13 +144,18 @@ def delete_analysis(self, registry_id: str): else: if response.status_code == 400: logging.error(f"Bad request for {registry_id}") + elif response.status_code == 404: + logging.error(f"{registry_id} not found") elif response.status_code == 401: logging.error(f"Failed to authenticate for {registry_id}") else: - logging.error(f"{response.message} for {registry_id}") + logging.error( + f"Deleted failed for {registry_id}, response message: {response.message}" + ) return False def patch_analysis(self, registry_id: str, data: dict): + """Patch an entry on the registry""" response = self.patch_request(endpoint=f"datasets/{registry_id}", data=data) if response.ok: logging.info(f"{registry_id} was patched") @@ -158,4 +167,8 @@ def patch_analysis(self, registry_id: str, data: dict): logging.error(f"Fail to authenticate for {registry_id}") elif response.status_code == 409: logging.error(f"Conflicts with existing data for {registry_id}") + else: + logging.error( + f"Patch failed for {registry_id}, response message: {response.message}" + ) return False diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index bc9a4b5a9..c964c9d85 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import pytest +import logging from django.conf import settings @@ -11,6 +12,7 @@ import responses from unittest import mock + class TestME: def test_check_existing_analysis_me(self): @@ -33,7 +35,9 @@ def test_post_existing_analysis_me(self): source_id = "MGYA00293719" # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 with pytest.raises(requests.HTTPError, match="401 Client Error"): - me_api.add_analysis(mgya=source_id, run_accession="ERR3063408", public=True).json() + me_api.add_analysis( + mgya=source_id, run_accession="ERR3063408", public=True + ).json() @responses.activate def 
test_mock_post_new_analysis(self): @@ -41,12 +45,14 @@ def test_mock_post_new_analysis(self): endpoint = "datasets" url = settings.METAGENOMICS_EXCHANGE_API + f"/{endpoint}" - responses.add(responses.POST, url, json={'success': True}, status=201) + responses.add(responses.POST, url, json={"success": True}, status=201) - response = me_api.add_analysis(mgya="MGYA00593709", run_accession="SRR3960575", public=True) + response = me_api.add_analysis( + mgya="MGYA00593709", run_accession="SRR3960575", public=True + ) assert response.status_code == 201 - assert response.json() == {'success': True} + assert response.json() == {"success": True} @responses.activate def test_mock_delete_analysis_from_me(self): @@ -55,30 +61,28 @@ def test_mock_delete_analysis_from_me(self): endpoint = f"datasets/{registry_id}" url = settings.METAGENOMICS_EXCHANGE_API + f"/{endpoint}" - responses.add(responses.DELETE, url, json={'success': True}, status=201) + responses.add(responses.DELETE, url, json={"success": True}, status=201) response = me_api.delete_request(endpoint) assert response.status_code == 201 - assert response.json() == {'success': True} - + assert response.json() == {"success": True} def test_wrong_delete_request_me(self): me_api = MetagenomicsExchangeAPI() registry_id = "MGX0000780" endpoint = f"dataset/{registry_id}" - - with pytest.raises(requests.HTTPError, match="404 Client Error"): - me_api.delete_request(endpoint) + assert not me_api.delete_request(endpoint) @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.patch_request") - def test_patch_analysis_me(self, - mock_patch_request): + def test_patch_analysis_me(self, mock_patch_request): me_api = MetagenomicsExchangeAPI() + class MockResponse: def __init__(self, json_data, status_code): self.json_data = json_data self.status_code = status_code self.ok = True + def json(self): return self.json_data @@ -98,4 +102,3 @@ def json(self): "brokerID": "EMG", } assert me_api.patch_analysis(registry_id, data) - From 
f0a72ff226bd4574386bde2ac1ad6ff94dc2ecf3 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 12 Feb 2024 16:17:21 +0000 Subject: [PATCH 37/46] Bump version --- emgcli/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/emgcli/__init__.py b/emgcli/__init__.py index 57d8f469a..43e1294ff 100644 --- a/emgcli/__init__.py +++ b/emgcli/__init__.py @@ -1 +1 @@ -__version__: str = "2.4.44" +__version__: str = "2.4.45" diff --git a/pyproject.toml b/pyproject.toml index cd268fd7b..4407d26c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,7 +117,7 @@ max-line-length = 119 """ [tool.bumpversion] -current_version = "2.4.44" +current_version = "2.4.45" [[tool.bumpversion.files]] filename = "emgcli/__init__.py" From b2d60bd84825e96ee5955600b2fc3028eb6f04d2 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 14:25:08 +0000 Subject: [PATCH 38/46] A few bug fixes for the metagenomics exchange command and api wrapper. Add support for Assembly and Run based jobs (both). Remove the public check, we only submit public data to the exchange. Normalize argument names in the MGX wrapper. Added some docstrings --- .../populate_metagenomics_exchange.py | 129 ++++++++++++------ emgapi/metagenomics_exchange.py | 91 ++++++++---- 2 files changed, 149 insertions(+), 71 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 8707cd8e9..b5525d163 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -15,15 +15,15 @@ # limitations under the License. 
import logging +from datetime import timedelta from django.conf import settings from django.core.management import BaseCommand -from django.utils import timezone from django.core.paginator import Paginator -from datetime import timedelta +from django.utils import timezone -from emgapi.models import AnalysisJob from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI +from emgapi.models import AnalysisJob logger = logging.getLogger(__name__) @@ -65,7 +65,9 @@ def handle(self, *args, **options): self.dry_run = options.get("dry_run") self.pipeline_version = options.get("pipeline") - self.mgx_api = MetagenomicsExchangeAPI(base_url=settings.METAGENOMICS_EXCHANGE_API) + self.mgx_api = MetagenomicsExchangeAPI( + base_url=settings.METAGENOMICS_EXCHANGE_API + ) # never indexed or updated after indexed analyses_to_index_and_update = AnalysisJob.objects_for_mgx_indexing.to_add() @@ -96,63 +98,87 @@ def handle(self, *args, **options): def process_to_index_and_update_records(self, analyses_to_index_and_update): logging.info(f"Indexing {len(analyses_to_index_and_update)} new analyses") - for page in Paginator(analyses_to_index_and_update, settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER): + for page in Paginator( + analyses_to_index_and_update, + settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER, + ): jobs_to_update = [] - for ajob in page: + for annotation_job in page: + + sequence_accession = "" + if annotation_job.run: + sequence_accession = annotation_job.run.accession + if annotation_job.assembly: + sequence_accession = annotation_job.assembly.accession + metadata = self.mgx_api.generate_metadata( - mgya=ajob.accession, - run_accession=ajob.run, - status="public" if not ajob.is_private else "private", + mgya=annotation_job.accession, run_accession=sequence_accession ) registry_id, metadata_match = self.mgx_api.check_analysis( - source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata + mgya=annotation_job.accession, + sequence_accession=annotation_job.run, + 
metadata=metadata, ) # The job is not registered if not registry_id: - logging.info(f"Add new {ajob}") + logging.info(f"Add new {annotation_job}") if self.dry_run: - logging.info(f"Dry-mode run: no addition to real ME for {ajob}") + logging.info( + f"Dry-mode run: no addition to real ME for {annotation_job}" + ) continue response = self.mgx_api.add_analysis( - mgya=ajob.accession, - run_accession=ajob.run, - public=not ajob.is_private, + mgya=annotation_job.accession, run_accession=annotation_job.run ) if response.ok: - logging.info(f"Successfully added {ajob}") + logging.info(f"Successfully added {annotation_job}") registry_id, metadata_match = self.mgx_api.check_analysis( - source_id=ajob.accession, sequence_id=ajob.run) - ajob.mgx_accession = registry_id - ajob.last_mgx_indexed = timezone.now() + timedelta(minutes=1) - jobs_to_update.append(ajob) + mgya=annotation_job.accession, + sequence_id=annotation_job.run, + ) + annotation_job.mgx_accession = registry_id + annotation_job.last_mgx_indexed = timezone.now() + timedelta( + minutes=1 + ) + jobs_to_update.append(annotation_job) else: - logging.error(f"Error adding {ajob}: {response.message}") + logging.error( + f"Error adding {annotation_job}: {response.message}" + ) # else we have to check if the metadata matches, if not we need to update it else: if not metadata_match: - logging.info(f"Patch existing {ajob}") + logging.info(f"Patch existing {annotation_job}") if self.dry_run: logging.info( - f"Dry-mode run: no patch to real ME for {ajob}" + f"Dry-mode run: no patch to real ME for {annotation_job}" ) continue if self.mgx_api.patch_analysis( - registry_id=registry_id, data=metadata + registry_id=registry_id, data=metadata ): - logging.info(f"Analysis {ajob} updated successfully") + logging.info( + f"Analysis {annotation_job} updated successfully" + ) # Just to be safe, update the MGX accession - ajob.mgx_accession = registry_id - ajob.last_mgx_indexed = timezone.now() + timedelta(minutes=1) - 
jobs_to_update.append(ajob) + annotation_job.mgx_accession = registry_id + annotation_job.last_mgx_indexed = ( + timezone.now() + timedelta(minutes=1) + ) + jobs_to_update.append(annotation_job) else: - logging.error(f"Analysis {ajob} update failed") + logging.error(f"Analysis {annotation_job} update failed") else: - logging.debug(f"No edit for {ajob}, metadata is correct") + logging.debug( + f"No edit for {annotation_job}, metadata is correct" + ) AnalysisJob.objects.bulk_update( - jobs_to_update, ["last_mgx_indexed", "mgx_accession"], batch_size=settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER + jobs_to_update, + ["last_mgx_indexed", "mgx_accession"], + batch_size=settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER, ) def process_to_delete_records(self, analyses_to_delete): @@ -161,36 +187,49 @@ def process_to_delete_records(self, analyses_to_delete): """ logging.info(f"Processing {len(analyses_to_delete)} analyses to remove") - for page in Paginator(analyses_to_delete, settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER): + for page in Paginator( + analyses_to_delete, settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER + ): jobs_to_update = [] - for ajob in page: + for annotation_job in page: + + sequence_accession = "" + if annotation_job.run: + sequence_accession = annotation_job.run.accession + if annotation_job.assembly: + sequence_accession = annotation_job.assembly.accession + metadata = self.mgx_api.generate_metadata( - mgya=ajob.accession, - run_accession=ajob.run, - status="public" if not ajob.is_private else "private", + mgya=annotation_job.accession, run_accession=annotation_job.run ) registry_id, _ = self.mgx_api.check_analysis( - source_id=ajob.accession, sequence_id=ajob.run, metadata=metadata + mgya=annotation_job.accession, + sequence_accession=sequence_accession, + metadata=metadata, ) if registry_id: - logging.info(f"Deleting {ajob}") + logging.info(f"Deleting {annotation_job}") if self.dry_run: - logging.info(f"Dry-mode run: no delete from real ME for 
{ajob}") + logging.info( + f"Dry-mode run: no delete from real ME for {annotation_job}" + ) continue if self.mgx_api.delete_analysis(registry_id): - logging.info(f"{ajob} successfully deleted") - ajob.last_mgx_indexed = timezone.now() - jobs_to_update.append(ajob) + logging.info(f"{annotation_job} successfully deleted") + annotation_job.last_mgx_indexed = timezone.now() + jobs_to_update.append(annotation_job) else: - logging.info(f"{ajob} failed on delete") + logging.info(f"{annotation_job} failed on delete") else: logging.info( - f"{ajob} doesn't exist in the registry, nothing to delete" + f"{annotation_job} doesn't exist in the registry, nothing to delete" ) # BULK UPDATE # AnalysisJob.objects.bulk_update( - jobs_to_update, ["last_mgx_indexed"], batch_size=settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER + jobs_to_update, + ["last_mgx_indexed"], + batch_size=settings.METAGENOMICS_EXCHANGE_PAGINATOR_NUMBER, ) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index dd69f5902..79b693571 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -15,9 +15,10 @@ # limitations under the License. import logging -import requests +import requests from django.conf import settings +from requests.exceptions import HTTPError class MetagenomicsExchangeAPI: @@ -25,7 +26,7 @@ class MetagenomicsExchangeAPI: def __init__(self, base_url=None): self.base_url = base_url or settings.METAGENOMICS_EXCHANGE_API - self.__token = settings.METAGENOMICS_EXCHANGE_API_TOKEN + self.__token = f"mgx {settings.METAGENOMICS_EXCHANGE_API_TOKEN}" self.broker = settings.METAGENOMICS_EXCHANGE_MGNIFY_BROKER def get_request(self, endpoint: str, params: dict): @@ -68,40 +69,78 @@ def patch_request(self, endpoint: str, data: dict): ) return response - def generate_metadata(self, mgya, run_accession, status): + def generate_metadata(self, mgya, sequence_accession): + """Generate the metadata object for the Metagenomics Exchange API. 
+ + Parameters: + mgya : str + The MGnify Analysis accession. + sequence_accession : str + Either the Run accession or the Assembly accession related to the MGYA. + + Returns: + dict + A dictionary containing metadata for the Metagenomics Exchange API. + """ return { "confidence": "full", "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", "method": ["other_metadata"], "sourceID": mgya, - "sequenceID": run_accession, - "status": status, + "sequenceID": sequence_accession, + "status": "public", "brokerID": self.broker, } - def add_analysis(self, mgya: str, run_accession: str, public: bool): - data = self.generate_metadata(mgya, run_accession, public) + def add_analysis(self, mgya: str, sequence_accession: str): + """Add an analysis to the M. Exchange + + Parameters: + mgya : str + The MGnify Analysis accession. + sequence_accession : str + Either the Run accession or the Assembly accession related to the MGYA. + + Returns: + requests.models.Response + The response object from the API request. + """ + data = self.generate_metadata(mgya, sequence_accession) response = self.post_request(endpoint="datasets", data=data) return response - def check_analysis( - self, source_id: str, sequence_id: str, public=None, metadata=None - ): - logging.info(f"Check {source_id} {sequence_id}") - params = {} - if public: - params = { - "status": "public" if public else "private", - "broker": self.broker, - } - endpoint = f"sequences/{sequence_id}/datasets" + def check_analysis(self, mgya: str, sequence_accesion: str, metadata=None): + """Check if a sequence exists in the M. Exchange + + Parameters: + mgya : str + The MGnify Analysis accession. + sequence_accesion : str + Either the Run accession or the Assembly accession related to the MGYA. + + Returns: + requests.models.Response + The response object from the API request. 
+ """ + if not mgya: + raise ValueError(f"mgya is mandatory.") + if not sequence_accesion: + raise ValueError(f"sequence_accesion is mandatory.") + + logging.info(f"Checking {mgya} - {sequence_accesion}") + + params = { + "broker": self.broker, + } + + endpoint = f"sequences/{sequence_accesion}/datasets" analysis_registry_id = None metadata_match = True try: response = self.get_request(endpoint=endpoint, params=params) - except: - logging.error(f"Get API request failed") + except HTTPError as http_error: + logging.error(f"Get API request failed. HTTP Error: {http_error}") return analysis_registry_id, metadata_match data = response.json() @@ -109,15 +148,15 @@ def check_analysis( # The API will return an emtpy datasets array if it can find the accession if not len(datasets): - logging.info(f"{source_id} does not exist in ME") + logging.info(f"{mgya} does not exist in ME") return analysis_registry_id, metadata_match sourceIDs = [item.get("sourceID") for item in datasets] - if source_id in sourceIDs: - found_record = [ - item for item in datasets if item.get("sourceID") == source_id - ][0] - logging.info(f"{source_id} exists in ME") + if mgya in sourceIDs: + found_record = [item for item in datasets if item.get("sourceID") == mgya][ + 0 + ] + logging.info(f"{mgya} exists in ME") analysis_registry_id = found_record.get("registryID") if metadata: for metadata_record in metadata: From e3e9cfb0a8d2d4e6de251200f9cfb66a6c044af1 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 14:31:05 +0000 Subject: [PATCH 39/46] Fix typos in method args --- .../populate_metagenomics_exchange.py | 9 +++--- tests/me/test_metagenomics_exchange.py | 25 +++++++---------- .../me/test_populate_metagenomics_exchange.py | 28 +++++++------------ 3 files changed, 24 insertions(+), 38 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index b5525d163..2e1d0da39 100644 --- 
a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -104,7 +104,6 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): ): jobs_to_update = [] for annotation_job in page: - sequence_accession = "" if annotation_job.run: sequence_accession = annotation_job.run.accession @@ -112,7 +111,7 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): sequence_accession = annotation_job.assembly.accession metadata = self.mgx_api.generate_metadata( - mgya=annotation_job.accession, run_accession=sequence_accession + mgya=annotation_job.accession, sequence_accession=sequence_accession ) registry_id, metadata_match = self.mgx_api.check_analysis( mgya=annotation_job.accession, @@ -129,7 +128,8 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): continue response = self.mgx_api.add_analysis( - mgya=annotation_job.accession, run_accession=annotation_job.run + mgya=annotation_job.accession, + sequence_accession=annotation_job.run, ) if response.ok: logging.info(f"Successfully added {annotation_job}") @@ -193,7 +193,6 @@ def process_to_delete_records(self, analyses_to_delete): jobs_to_update = [] for annotation_job in page: - sequence_accession = "" if annotation_job.run: sequence_accession = annotation_job.run.accession @@ -201,7 +200,7 @@ def process_to_delete_records(self, analyses_to_delete): sequence_accession = annotation_job.assembly.accession metadata = self.mgx_api.generate_metadata( - mgya=annotation_job.accession, run_accession=annotation_job.run + mgya=annotation_job.accession, sequence_accession=sequence_accession ) registry_id, _ = self.mgx_api.check_analysis( mgya=annotation_job.accession, diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index c964c9d85..6531bc63f 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -1,32 
+1,29 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest -import logging +from unittest import mock +import pytest +import requests +import responses from django.conf import settings from emgapi.metagenomics_exchange import MetagenomicsExchangeAPI -import requests -import responses -from unittest import mock - class TestME: - def test_check_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() - source_id = "MGYA00293719" - seq_id = "ERR3063408" - return_values = me_api.check_analysis(source_id, seq_id, True) + mgya = "MGYA00293719" + sequence_accession = "ERR3063408" + return_values = me_api.check_analysis(mgya, sequence_accession) assert return_values[0] def test_check_not_existing_analysis_me(self): me_api = MetagenomicsExchangeAPI() source_id = "MGYA10293719" seq_id = "ERR3063408" - return_values = me_api.check_analysis(source_id, seq_id, True) + return_values = me_api.check_analysis(source_id, seq_id) assert not return_values[0] @pytest.mark.skip(reason="Error on ME API side") @@ -35,9 +32,7 @@ def test_post_existing_analysis_me(self): source_id = "MGYA00293719" # Should return -> https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/409 with pytest.raises(requests.HTTPError, match="401 Client Error"): - me_api.add_analysis( - mgya=source_id, run_accession="ERR3063408", public=True - ).json() + me_api.add_analysis(mgya=source_id, sequence_accession="ERR3063408").json() @responses.activate def test_mock_post_new_analysis(self): @@ -48,7 +43,7 @@ def test_mock_post_new_analysis(self): responses.add(responses.POST, url, json={"success": True}, status=201) response = me_api.add_analysis( - mgya="MGYA00593709", run_accession="SRR3960575", public=True + mgya="MGYA00593709", sequence_accession="SRR3960575" ) assert response.status_code == 201 diff --git a/tests/me/test_populate_metagenomics_exchange.py b/tests/me/test_populate_metagenomics_exchange.py index d9a35f32d..d10083ac1 100644 --- a/tests/me/test_populate_metagenomics_exchange.py +++ 
b/tests/me/test_populate_metagenomics_exchange.py @@ -42,8 +42,10 @@ def test_population_dry_mode(self, caplog): assert "Dry-mode run: no addition to real ME for MGYA00466090" in caplog.text assert "Dry-mode run: no addition to real ME for MGYA00466091" in caplog.text assert "Processing 1 analyses to remove" in caplog.text - assert "MGYA00005678 doesn't exist in the registry, nothing to delete" in caplog.text - + assert ( + "MGYA00005678 doesn't exist in the registry, nothing to delete" + in caplog.text + ) @pytest.mark.usefixtures("run_multiple_analysis_me") @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.add_analysis") @@ -56,16 +58,18 @@ def test_add_new_analysis(self, mock_check_analysis, mock_add_analysis, caplog): """ pipeline = 4.1 registry_id = "MGX1" + class MockResponse: def __init__(self, json_data, status_code): self.json_data = json_data self.status_code = status_code self.ok = True + def json(self): return self.json_data def mock_check_process(*args, **kwargs): - if 'metadata' in kwargs: + if "metadata" in kwargs: return None, True else: return registry_id, True @@ -84,14 +88,10 @@ def mock_check_process(*args, **kwargs): assert ajob.last_mgx_indexed assert ajob.mgx_accession == registry_id - @pytest.mark.usefixtures("run_multiple_analysis_me") @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.check_analysis") @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.delete_analysis") - def test_removals(self, - mock_delete_analysis, - mock_check_analysis, - caplog): + def test_removals(self, mock_delete_analysis, mock_check_analysis, caplog): """ Test delete process. 
1 analysis should be removed and updated indexed field in DB @@ -100,10 +100,7 @@ def test_removals(self, mock_check_analysis.return_value = True, True mock_delete_analysis.return_value = True - call_command( - "populate_metagenomics_exchange", - pipeline=pipeline - ) + call_command("populate_metagenomics_exchange", pipeline=pipeline) assert "Indexing 0 new analyses" in caplog.text assert "Processing 1 analyses to remove" in caplog.text assert "Deleting MGYA00005678" in caplog.text @@ -114,10 +111,7 @@ def test_removals(self, @pytest.mark.usefixtures("run_multiple_analysis_me") @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.check_analysis") @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.patch_analysis") - def test_update(self, - mock_patch_analysis, - mock_check_analysis, - caplog): + def test_update(self, mock_patch_analysis, mock_check_analysis, caplog): """ Test update process for job that was indexed before updated. MGX accession and last_mgx_indexed should be updated @@ -138,5 +132,3 @@ def test_update(self, ajob = AnalysisJob.objects.filter(pipeline__release_version=pipeline).first() assert ajob.last_mgx_indexed.date() == timezone.now().date() assert ajob.mgx_accession == registry_id - - From afa3f18eae7e2ca4506b5f59a668b2475e55253e Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 14:34:12 +0000 Subject: [PATCH 40/46] Typo sequence_accesion -> sequence_accession --- .../commands/populate_metagenomics_exchange.py | 2 +- emgapi/metagenomics_exchange.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 2e1d0da39..937f8613a 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -115,7 +115,7 @@ def process_to_index_and_update_records(self, 
analyses_to_index_and_update): ) registry_id, metadata_match = self.mgx_api.check_analysis( mgya=annotation_job.accession, - sequence_accession=annotation_job.run, + sequence_accession=sequence_accession, metadata=metadata, ) # The job is not registered diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 79b693571..ca17011aa 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -109,13 +109,13 @@ def add_analysis(self, mgya: str, sequence_accession: str): response = self.post_request(endpoint="datasets", data=data) return response - def check_analysis(self, mgya: str, sequence_accesion: str, metadata=None): + def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): """Check if a sequence exists in the M. Exchange Parameters: mgya : str The MGnify Analysis accession. - sequence_accesion : str + sequence_accession : str Either the Run accession or the Assembly accession related to the MGYA. Returns: @@ -124,16 +124,16 @@ def check_analysis(self, mgya: str, sequence_accesion: str, metadata=None): """ if not mgya: raise ValueError(f"mgya is mandatory.") - if not sequence_accesion: - raise ValueError(f"sequence_accesion is mandatory.") + if not sequence_accession: + raise ValueError(f"sequence_accession is mandatory.") - logging.info(f"Checking {mgya} - {sequence_accesion}") + logging.info(f"Checking {mgya} - {sequence_accession}") params = { "broker": self.broker, } - endpoint = f"sequences/{sequence_accesion}/datasets" + endpoint = f"sequences/{sequence_accession}/datasets" analysis_registry_id = None metadata_match = True From c3ba4417cb0b0cf83eb4113293b3021dd1d1d00e Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 14:36:08 +0000 Subject: [PATCH 41/46] Send the sequence_accession, not a Run object --- emgapi/management/commands/populate_metagenomics_exchange.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 937f8613a..294336775 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -129,13 +129,13 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): response = self.mgx_api.add_analysis( mgya=annotation_job.accession, - sequence_accession=annotation_job.run, + sequence_accession=sequence_accession, ) if response.ok: logging.info(f"Successfully added {annotation_job}") registry_id, metadata_match = self.mgx_api.check_analysis( mgya=annotation_job.accession, - sequence_id=annotation_job.run, + sequence_id=sequence_accession, ) annotation_job.mgx_accession = registry_id annotation_job.last_mgx_indexed = timezone.now() + timedelta( From 4b145c088e34ab61ac9051e25785563840e70497 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 14:39:05 +0000 Subject: [PATCH 42/46] Add a bit of logging when posting a record to MGX --- emgapi/metagenomics_exchange.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index ca17011aa..6c9103aed 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -106,7 +106,11 @@ def add_analysis(self, mgya: str, sequence_accession: str): The response object from the API request. """ data = self.generate_metadata(mgya, sequence_accession) - response = self.post_request(endpoint="datasets", data=data) + try: + response = self.post_request(endpoint="datasets", data=data) + except HTTPError as http_error: + logging.exception(f"POST request failed. 
HTTP Error: {http_error}") + raise http_error return response def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): From 6235f031f2cd8012980c88741d0577980f25a735 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 14:52:54 +0000 Subject: [PATCH 43/46] Capture the API error data response, if present in the response. The MGX API returns a json with the error message, which is useful for debugging. --- emgapi/metagenomics_exchange.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 6c9103aed..94b2a36fd 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -109,7 +109,11 @@ def add_analysis(self, mgya: str, sequence_accession: str): try: response = self.post_request(endpoint="datasets", data=data) except HTTPError as http_error: - logging.exception(f"POST request failed. HTTP Error: {http_error}") + try: + response_json = http_error.response.json() + logging.error(f"API response content: {response_json}") + except: + pass raise http_error return response @@ -145,6 +149,11 @@ def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): response = self.get_request(endpoint=endpoint, params=params) except HTTPError as http_error: logging.error(f"Get API request failed. 
HTTP Error: {http_error}") + try: + response_json = http_error.response.json() + logging.error(f"API response content: {response_json}") + except: + pass return analysis_registry_id, metadata_match data = response.json() From fe6d2b8ec18a2e13049927b0be912855f7754bc5 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 16 Feb 2024 15:32:54 +0000 Subject: [PATCH 44/46] Mock the Metagenomics Exchange API --- emgapi/metagenomics_exchange.py | 22 +++++--- tests/me/test_metagenomics_exchange.py | 70 ++++++++++++++++---------- 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 94b2a36fd..77cc030d5 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -127,8 +127,12 @@ def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): Either the Run accession or the Assembly accession related to the MGYA. Returns: - requests.models.Response - The response object from the API request. + tuple + A tuple containing two elements: + - analysis_registry_id : str + The analysis registry ID. + - metadata_match : boolean + True, if the metadata matchs. 
""" if not mgya: raise ValueError(f"mgya is mandatory.") @@ -143,7 +147,7 @@ def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): endpoint = f"sequences/{sequence_accession}/datasets" analysis_registry_id = None - metadata_match = True + metadata_match = False try: response = self.get_request(endpoint=endpoint, params=params) @@ -171,18 +175,22 @@ def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): ] logging.info(f"{mgya} exists in ME") analysis_registry_id = found_record.get("registryID") + if not analysis_registry_id: + raise ValueError(f"The Metagenomics Exchange 'registryID' for {mgya} is null.") + if metadata: for metadata_record in metadata: if not (metadata_record in found_record): - metadata_match = False - return analysis_registry_id, metadata_match + return analysis_registry_id, False else: if metadata[metadata_record] != found_record[metadata_record]: metadata_match = False logging.info( - f"Incorrect field {metadata[metadata_record]} != {found_record[metadata_record]})" + f"The metadata doesn't match, for field {metadata[metadata_record]} != {found_record[metadata_record]})" ) - return analysis_registry_id, metadata_match + else: + metadata_match = True + return analysis_registry_id, metadata_match return analysis_registry_id, metadata_match return analysis_registry_id, metadata_match diff --git a/tests/me/test_metagenomics_exchange.py b/tests/me/test_metagenomics_exchange.py index 6531bc63f..05148d497 100644 --- a/tests/me/test_metagenomics_exchange.py +++ b/tests/me/test_metagenomics_exchange.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from unittest import mock - import pytest import requests import responses @@ -12,18 +10,35 @@ class TestME: - def test_check_existing_analysis_me(self): - me_api = MetagenomicsExchangeAPI() + + @responses.activate + def test_check_existing_analysis_me(self, settings): + # FIXME: this test doesn't check if the metadata matches or not. 
mgya = "MGYA00293719" sequence_accession = "ERR3063408" - return_values = me_api.check_analysis(mgya, sequence_accession) - assert return_values[0] + responses.add( + responses.GET, + f"{settings.METAGENOMICS_EXCHANGE_API}/sequences/{sequence_accession}/datasets", + json={"datasets": [{"sourceID": mgya, "registryID": "MGX_FAKE"}]}, + status=200, + ) + me_api = MetagenomicsExchangeAPI() + + registry_id, _ = me_api.check_analysis(mgya, sequence_accession) + assert registry_id == "MGX_FAKE" + @responses.activate def test_check_not_existing_analysis_me(self): + mgya = "MGYA10293719" + sequence_accession = "ERR3063408" + responses.add( + responses.GET, + f"{settings.METAGENOMICS_EXCHANGE_API}/sequences/{sequence_accession}/datasets", + json={"datasets": []}, + status=200, + ) me_api = MetagenomicsExchangeAPI() - source_id = "MGYA10293719" - seq_id = "ERR3063408" - return_values = me_api.check_analysis(source_id, seq_id) + return_values = me_api.check_analysis(mgya, sequence_accession) assert not return_values[0] @pytest.mark.skip(reason="Error on ME API side") @@ -62,38 +77,41 @@ def test_mock_delete_analysis_from_me(self): assert response.status_code == 201 assert response.json() == {"success": True} - def test_wrong_delete_request_me(self): + @responses.activate + def test_incorrect_delete_request_me(self): + # TODO: this test doesn't make much sense me_api = MetagenomicsExchangeAPI() + responses.add( + responses.DELETE, + f"{settings.METAGENOMICS_EXCHANGE_API}/dataset/MGX0000780", + status=401, + ) registry_id = "MGX0000780" endpoint = f"dataset/{registry_id}" - assert not me_api.delete_request(endpoint) + response = me_api.delete_request(endpoint) + assert response.status_code == 401 - @mock.patch("emgapi.metagenomics_exchange.MetagenomicsExchangeAPI.patch_request") - def test_patch_analysis_me(self, mock_patch_request): + @responses.activate + def test_patch_analysis_me(self): me_api = MetagenomicsExchangeAPI() - class MockResponse: - def __init__(self, json_data, 
status_code): - self.json_data = json_data - self.status_code = status_code - self.ok = True - - def json(self): - return self.json_data - - mock_patch_request.return_value = MockResponse({}, 200) registry_id = "MGX0000788" mgya = "MGYA00593709" run_accession = "SRR3960575" - public = False - data = { "confidence": "full", "endPoint": f"https://www.ebi.ac.uk/metagenomics/analyses/{mgya}", "method": ["other_metadata"], "sourceID": mgya, "sequenceID": run_accession, - "status": "public" if public else "private", + "status": "public", "brokerID": "EMG", } + + responses.add( + responses.PATCH, + f"{settings.METAGENOMICS_EXCHANGE_API}/datasets/{registry_id}", + status=200, + ) + assert me_api.patch_analysis(registry_id, data) From 783f2754d6a6b085dc9a0195991f8db13f74094d Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 19 Feb 2024 13:12:06 +0000 Subject: [PATCH 45/46] Fix mgx_api check_analysis invocation typo --- emgapi/management/commands/populate_metagenomics_exchange.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emgapi/management/commands/populate_metagenomics_exchange.py b/emgapi/management/commands/populate_metagenomics_exchange.py index 294336775..da7e7c252 100644 --- a/emgapi/management/commands/populate_metagenomics_exchange.py +++ b/emgapi/management/commands/populate_metagenomics_exchange.py @@ -135,7 +135,7 @@ def process_to_index_and_update_records(self, analyses_to_index_and_update): logging.info(f"Successfully added {annotation_job}") registry_id, metadata_match = self.mgx_api.check_analysis( mgya=annotation_job.accession, - sequence_id=sequence_accession, + sequence_accession=sequence_accession, ) annotation_job.mgx_accession = registry_id annotation_job.last_mgx_indexed = timezone.now() + timedelta( From 1b031df69c3bfbe2b8499b2deb5c8dfeb0752d4c Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 19 Feb 2024 13:23:17 +0000 Subject: [PATCH 46/46] Mark piece of code to revisit and improve --- 
emgapi/metagenomics_exchange.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/emgapi/metagenomics_exchange.py b/emgapi/metagenomics_exchange.py index 77cc030d5..55ad381a3 100644 --- a/emgapi/metagenomics_exchange.py +++ b/emgapi/metagenomics_exchange.py @@ -168,6 +168,13 @@ def check_analysis(self, mgya: str, sequence_accession: str, metadata=None): logging.info(f"{mgya} does not exist in ME") return analysis_registry_id, metadata_match + # TODO: this code needs some refactoring to improve it: + """ + try: + found_record = next(item for item in datasets if item.get("sourceID") == mgya) + except StopIteration + ... + """ sourceIDs = [item.get("sourceID") for item in datasets] if mgya in sourceIDs: found_record = [item for item in datasets if item.get("sourceID") == mgya][