Skip to content

Commit

Permalink
Adds option for genome uploader to update genome metadata in place (without removing/re-adding catalogue)
Browse files Browse the repository at this point in the history
  • Loading branch information
SandyRogers committed Jan 23, 2024
1 parent c00eb19 commit 408af70
Show file tree
Hide file tree
Showing 19 changed files with 213,368 additions and 3 deletions.
24 changes: 21 additions & 3 deletions emgapianns/management/commands/import_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def add_arguments(self, parser):
"E.g. root:Host-Associated:Human:Digestive\\ System:Large\\ intestine")
parser.add_argument('pipeline_version', action='store', type=str,
help='Pipeline version tag that catalogue was produced by. E.g. "1.2.1"')
parser.add_argument('--update-metadata-only', dest='update_metadata_only', action='store_true', default=False,
help="Only update the metadata of genomes in an existing catalogue; "
"i.e. reparse the MGYG*.json files.")
parser.add_argument('--database', type=str,
default='default')

Expand Down Expand Up @@ -105,6 +108,12 @@ def handle_v2(self, *args, **options):
self.catalogue_dir = os.path.join(self.results_directory, catalogue_dir)

self.database = options['database']

if options['update_metadata_only']:
assert emg_models.GenomeCatalogue.objects.filter(
catalogue_id=self.make_slug(catalogue_name, version)
).exists()

self.catalogue_obj = self.get_catalogue(
catalogue_name,
version, gold_biome,
Expand All @@ -123,12 +132,15 @@ def handle_v2(self, *args, **options):
sanity_check_catalogue_dir(self.catalogue_dir)

for d in genome_dirs:
self.upload_dir(d)
self.upload_dir(d, update_metadata_only=options['update_metadata_only'])

self.upload_catalogue_files()
self.catalogue_obj.calculate_genome_count()
self.catalogue_obj.save()

def make_slug(self, catalogue_name, catalogue_version):
    """Return the slug built from a catalogue name and version.

    The name and version are combined as ``<name>-v<version>``, every
    ``.`` in the combined string is replaced with ``-``, and the result
    is passed through ``slugify``.
    """
    versioned_name = '{0}-v{1}'.format(catalogue_name, catalogue_version)
    # Swap dots for hyphens before handing the string to slugify.
    dotless_name = versioned_name.replace('.', '-')
    return slugify(dotless_name)

def get_catalogue(self, catalogue_name, catalogue_version, gold_biome, catalogue_dir, pipeline_version_tag):
logging.warning('GOLD')
logging.warning(gold_biome)
Expand All @@ -137,7 +149,7 @@ def get_catalogue(self, catalogue_name, catalogue_version, gold_biome, catalogue
catalogue, _ = emg_models.GenomeCatalogue.objects \
.using(self.database) \
.get_or_create(
catalogue_id=slugify('{0}-v{1}'.format(catalogue_name, catalogue_version).replace('.', '-')),
catalogue_id=self.make_slug(catalogue_name, catalogue_version),
defaults={
'version': catalogue_version,
'name': '{0} v{1}'.format(catalogue_name, catalogue_version),
Expand All @@ -148,9 +160,11 @@ def get_catalogue(self, catalogue_name, catalogue_version, gold_biome, catalogue
})
return catalogue

def upload_dir(self, directory):
def upload_dir(self, directory, update_metadata_only=False):
logger.info('Uploading dir: {}'.format(directory))
genome, has_pangenome = self.create_genome(directory)
if update_metadata_only:
return
self.upload_cog_results(genome, directory)
self.upload_kegg_class_results(genome, directory)
self.upload_kegg_module_results(genome, directory)
Expand Down Expand Up @@ -208,6 +222,10 @@ def create_genome(self, genome_dir):
defaults=data)
g.save(using=self.database)

# in case we are updating and the geo range metadata has changed:
if g.pangenome_geographic_range.exists():
g.pangenome_geographic_range.clear()

if geo_locations:
[self.attach_geo_location(g, l) for l in geo_locations]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"accession": "MGYG000000001",
"completeness": 90.59,
"contamination": 0.7,
"eggnog_coverage": 93.78,
"ena_sample_accession": "ERS370061",
"ena_study_accession": "ERP105624",
"gc_content": 28.26,
"genome_accession": "GUT_GENOME000001",
"geographic_origin": "Europe",
"gold_biome": "root:Host-Associated:Human:Digestive System:Large intestine",
"ipr_coverage": 86.42,
"length": 3219617,
"n_50": 47258,
"nc_rnas": 63,
"num_contigs": 137,
"num_proteins": 3182,
"pangenome": {
"geographic_range": [
"North America",
"South America"
],
"num_genomes_non_redundant": 4,
"num_genomes_total": 4,
"pangenome_accessory_size": 1804,
"pangenome_core_size": 1350,
"pangenome_size": 3154
},
"rna_16s": 99.74,
"rna_23s": 99.83,
"rna_5s": 88.24,
"taxon_lineage": "d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Peptostreptococcales;f__Peptostreptococcaceae;g__GCA-900066495;s__GCA-900066495 sp902362365",
"trnas": 20,
"type": "Isolate"
}
Loading

0 comments on commit 408af70

Please sign in to comment.