Skip to content

Commit

Permalink
adds incremental ebi search dumping for analysisjobs
Browse files Browse the repository at this point in the history
  • Loading branch information
SandyRogers committed Oct 20, 2023
1 parent 2b02f66 commit 974371f
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 66 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ fixtures/*.sig
loglockdir
logs

secret.key
secret.key

dumps
23 changes: 23 additions & 0 deletions emgapi/migrations/0012_auto_20231020_1525.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 3.2.18 on 2023-10-20 15:25

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the EBI Search indexing timestamp columns to ``analysisjob``.

    Two fields are added: a nullable ``last_indexed`` datetime and an
    ``auto_now`` ``last_update`` datetime.
    """

    dependencies = [
        ("emgapi", "0011_analysisjob_analysis_summary_json"),
    ]

    operations = [
        migrations.AddField(
            model_name="analysisjob",
            name="last_indexed",
            field=models.DateTimeField(
                blank=True,
                db_column="LAST_INDEXED",
                help_text=(
                    "Date at which this model was last included in an "
                    "EBI Search initial/incremental index."
                ),
                null=True,
            ),
        ),
        migrations.AddField(
            model_name="analysisjob",
            name="last_update",
            field=models.DateTimeField(
                auto_now=True,
                db_column="LAST_UPDATE",
            ),
        ),
    ]
66 changes: 64 additions & 2 deletions emgapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
import logging

from django.conf import settings
from django.core.exceptions import FieldDoesNotExist
from django.db import models
from django.db.models import (CharField, Count, OuterRef, Prefetch, Q,
Subquery, Value)
Subquery, Value, F)
from django.db.models.functions import Cast, Concat
from django.utils import timezone

Expand Down Expand Up @@ -160,6 +161,67 @@ class Meta:
abstract = True


class EbiSearchIndexQueryset(models.QuerySet):
    """
    Queryset selecting the rows an EBI Search dump has to remove or (re)add.

    to_delete: Objects that have been suppressed since they were last indexed,
        or that have been indexed but updated since.
    to_add: Objects that have never been indexed,
        or that have been indexed but updated since.
        Suppressed / private objects are left out when the model
        declares ``is_suppressed`` / ``is_private``.
    """

    def _model_has_field(self, field_name):
        """Tell whether the queryset's model declares a field called ``field_name``."""
        try:
            self.model._meta.get_field(field_name)
        except FieldDoesNotExist:
            return False
        return True

    def to_delete(self):
        """Return objects whose existing index entry should be removed."""
        stale = Q(last_update__gte=F("last_indexed"), last_indexed__isnull=False)

        # Models without a `suppressed_at` column can only be matched on staleness.
        if not self._model_has_field("suppressed_at"):
            return self.filter(stale)

        suppressed_since_indexing = Q(suppressed_at__gte=F("last_indexed"))
        return self.filter(suppressed_since_indexing | stale)

    def to_add(self):
        """Return objects that should be written to the index."""
        stale = Q(last_update__gte=F("last_indexed"), last_indexed__isnull=False)
        unindexed = Q(last_indexed__isnull=True)

        # An empty Q() imposes no condition, so models lacking the flag are not restricted by it.
        visible = Q(is_suppressed=False) if self._model_has_field("is_suppressed") else Q()
        public = Q(is_private=False) if self._model_has_field("is_private") else Q()

        return self.filter(unindexed | stale, visible, public)


class EbiSearchIndexedModel(models.Model):
    """Abstract base carrying the timestamps used for EBI Search indexing.

    Exposes ``objects_for_indexing``, a manager built from
    ``EbiSearchIndexQueryset``.
    """

    # auto_now: Django stamps this with the current time whenever the object is saved.
    last_update = models.DateTimeField(auto_now=True, db_column="LAST_UPDATE")
    # Stays NULL until the object has been included in an index.
    last_indexed = models.DateTimeField(
        blank=True,
        null=True,
        db_column="LAST_INDEXED",
        help_text=(
            "Date at which this model was last included in an "
            "EBI Search initial/incremental index."
        ),
    )

    objects_for_indexing = EbiSearchIndexQueryset.as_manager()

    class Meta:
        abstract = True


class BaseQuerySet(models.QuerySet):
"""Auth mechanism to filter private / suppressed models
"""
Expand Down Expand Up @@ -1538,7 +1600,7 @@ def available(self, request):
)


class AnalysisJob(SuppressibleModel, PrivacyControlledModel):
class AnalysisJob(SuppressibleModel, PrivacyControlledModel, EbiSearchIndexedModel):
def __init__(self, *args, **kwargs):
super(AnalysisJob, self).__init__(*args, **kwargs)
setattr(self, 'accession',
Expand Down
8 changes: 8 additions & 0 deletions emgapi/templates/ebi_search/analyses-deletes.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<database xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.ebi.ac.uk/ebisearch/XML4dbDumps.xsd">
<name>EMG_run</name>
<entries>
{# Emits one empty entry per object in `removals`; the id is the accession joined to the pipeline release version with an underscore. #}
{% for entry in removals %}
<entry id="{{ entry.accession }}_{{ entry.pipeline.release_version }}" />
{% endfor %}
</entries>
</database>
8 changes: 8 additions & 0 deletions emgapi/templates/ebi_search/analyses.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<database xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.ebi.ac.uk/ebisearch/XML4dbDumps.xsd">
<name>EMG_run</name>
<entries>
{# Renders ebi_search/analysis.xml once per item in `additions`; `only` limits the included template's context to the variables passed explicitly here. #}
{% for a in additions %}
{% include "ebi_search/analysis.xml" with analysis=a.analysis analysis_biome=a.analysis_biome analysis_taxonomies=a.analysis_taxonomies analysis_go_entries=a.analysis_go_entries analysis_ips_entries=a.analysis_ips_entries sample_metadata=a.sample_metadata only %}
{% endfor %}
</entries>
</database>
127 changes: 64 additions & 63 deletions emgapi/templates/ebi_search/analysis.xml
Original file line number Diff line number Diff line change
@@ -1,69 +1,70 @@
<entry xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="{{ analysis.accession }}_{{ analysis.pipeline.release_version }}">
<name>{{ analysis.accession }}</name>
<dates>
<date type="creation_date" value="{{ analysis.submit_time|date:'Y-m-d' }}"/>
<date type="completion_date" value="{{ analysis.complete_time|date:'Y-m-d' }}"/>
</dates>
<additional_fields>
<field name="experiment_type">assembly</field>
<field name="pipeline_version">{{ analysis.pipeline.release_version }}</field>
<field name="sample_name">{{ analysis.sample.sample_name }}</field>
<field name="project_name">{{ analysis.study.study_name }}</field>
<field name="biome_name">{{ analysis.study.biome.biome_name }}</field>
<hierarchical_field name="biome">
{% for biome_element in analysis_biome %}
{% if forloop.first %}
<root>{{ biome_element }}</root>
{% else %}
<child>{{ biome_element }}</child>
{% endif %}
{% endfor %}
</hierarchical_field>
{% for taxonomy_lineage_elements in analysis_taxonomies %}
<hierarchical_field name="organism">
{% for taxonomy_element in taxonomy_lineage_elements %}
{% if forloop.first %}
<root>{{ taxonomy_element }}</root>
{% else %}
<child>{{ taxonomy_element }}</child>
{% endif %}
{% endfor %}
</hierarchical_field>
{% endfor %}

{% for go_slim in analysis_go_slim_entries %}
<field name="go_term">{{ go_slim.description }}</field>
{% endfor %}
<entry xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="{{ analysis.accession }}_{{ analysis.pipeline.release_version }}">
<name>{{ analysis.accession }}</name>
<dates>
<date type="creation_date" value="{{ analysis.submit_time|date:'Y-m-d' }}"/>
<date type="completion_date" value="{{ analysis.complete_time|date:'Y-m-d' }}"/>
</dates>
<additional_fields>
<field name="experiment_type">assembly</field>
<field name="pipeline_version">{{ analysis.pipeline.release_version }}</field>
<field name="sample_name">{{ analysis.sample.sample_name | safe }}</field>
<field name="project_name">{{ analysis.study.study_name | safe }}</field>
<field name="biome_name">{{ analysis.study.biome.biome_name | safe }}</field>
<hierarchical_field name="biome">
{% for biome_element in analysis_biome %}
{% if forloop.first %}
<root>{{ biome_element | safe }}</root>
{% else %}
<child>{{ biome_element | safe }}</child>
{% endif %}
{% endfor %}
</hierarchical_field>
{% for taxonomy_lineage_elements in analysis_taxonomies %}
<hierarchical_field name="organism">
{% for taxonomy_element in taxonomy_lineage_elements %}
{% if forloop.first %}
<root>{{ taxonomy_element | safe }}</root>
{% else %}
<child>{{ taxonomy_element | safe }}</child>
{% endif %}
{% endfor %}
</hierarchical_field>
{% endfor %}

{% for go_slim in analysis_go_entries %}
<field name="go_term">{{ go_slim.description | safe }}</field>
{% endfor %}

{% for ips in analysis_ips_entries %}
<field name="interpro_entry">{{ ips.description }}</field>
{% endfor %}
{% for ips in analysis_ips_entries %}
<field name="interpro_entry">{{ ips.description | safe }}</field>
{% endfor %}

{% for metadata_key, metadata_value in sample_metadata.items %}
<field name="{{ metadata_key }}">{{ metadata_value }}</field>
{% endfor %}
{% for metadata_key, metadata_value in sample_metadata.items %}
<field name="{{ metadata_key | safe }}">{{ metadata_value | safe }}</field>
{% endfor %}

</additional_fields>
<cross_references>
<ref dbkey="{{ analysis.sample.accession }}" dbname="metagenomics_samples"/>
<ref dbkey="{{ analysis.study.accession }}" dbname="metagenomics_projects"/>
<ref dbkey="{{ analysis.study.project_id }}" dbname="ena_project"/>
</additional_fields>
<cross_references>
<ref dbkey="{{ analysis.sample.accession }}" dbname="metagenomics_samples"/>
<ref dbkey="{{ analysis.study.accession }}" dbname="metagenomics_projects"/>
<ref dbkey="{{ analysis.study.project_id }}" dbname="ena_project"/>

{# Assembly analysis #}
{% if analysis.assembly and analysis.assembly.accession|slice:":4" == "GCA_" %}
<ref dbkey="{{ analysis.assembly.legacy_accession }}" dbname="ena_wgs_sequence_set"/>
<red dbkey="{{ analysis.assembly.accession }}" dbname="assembly"/>
{% endif %}
{# Assembly analysis #}
{% if analysis.assembly and analysis.assembly.accession|slice:":4" == "GCA_" %}
<ref dbkey="{{ analysis.assembly.legacy_accession }}" dbname="ena_wgs_sequence_set"/>
<ref dbkey="{{ analysis.assembly.accession }}" dbname="assembly"/>
{% endif %}

{# WGS/Amplicon analysis #}
{% if analysis.run %}
<red dbkey="{{ analysis.run.accession }}" dbname="ena_run"/>
{% endif %}
{% for go in analysis_go_entries %}
<ref dbkey="{{ go.accession }}" dbname="go"/>
{% endfor %}
{% for ips in analysis_ips_entries %}
<ref dbkey="{{ ips.accession }}" dbname="interpro"/>
{% endfor %}
</cross_references>
</entry>
{# WGS/Amplicon analysis #}
{% if analysis.run %}
<ref dbkey="{{ analysis.run.accession }}" dbname="ena_run"/>
{% endif %}
{% for go in analysis_go_entries %}
<ref dbkey="{{ go.accession }}" dbname="go"/>
{% endfor %}
{% for ips in analysis_ips_entries %}
<ref dbkey="{{ ips.accession }}" dbname="interpro"/>
{% endfor %}
</cross_references>
</entry>
19 changes: 19 additions & 0 deletions emgapi/templatetags/ebi_search_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from django import template

register = template.Library()


@register.simple_tag
def xml_safe(unsafe_string: str):
    """Escape the five XML special characters in ``unsafe_string``.

    Replaces ``<``, ``>``, ``&``, ``'`` and ``"`` with their predefined XML
    entities in a single pass, so an ``&`` introduced by one replacement is
    never escaped again.

    Args:
        unsafe_string: Text to embed in XML. Truthy non-string values are
            converted with ``str()`` before escaping.

    Returns:
        The escaped text marked as safe for template output, or an empty
        string when the input is falsy (``None``, ``""``, ...).
    """
    # Imported locally so the block does not depend on an extra module-level import.
    from django.utils.safestring import mark_safe

    if not unsafe_string:
        # Returning None here would be rendered by the template engine as
        # the literal text "None".
        return ""

    replacements = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "&": "&amp;",
        "'": "&apos;",
        '"': "&quot;",
    })

    # simple_tag passes its result through conditional_escape() when
    # autoescaping is on; without mark_safe the entities produced above
    # would be escaped a second time ("&lt;" -> "&amp;lt;").
    return mark_safe(str(unsafe_string).translate(replacements))

0 comments on commit 974371f

Please sign in to comment.