Skip to content

Commit e625081

Browse files
authored
fix: avoid assembly duplication due to duplicate taxonomy ids and add checks for duplicate entity ids (#367) (#368)
1 parent a3a70ca commit e625081

File tree

7 files changed

+353
-11099
lines changed

7 files changed

+353
-11099
lines changed

catalog/build/intermediate/genomes-from-ncbi.tsv

+1-162
Large diffs are not rendered by default.

catalog/build/py/package/catalog_build/build.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels):
109109
def get_species_df(taxonomy_ids, taxonomic_group_sets, taxonomic_levels):
  """
  Fetch NCBI taxonomy dataset reports for the given taxonomy IDs and build a DataFrame from them.

  Duplicate taxonomy IDs are collapsed before querying, so each distinct ID is requested only once.

  :param taxonomy_ids: Iterable of taxonomy IDs; each is converted with `str()` before use.
  :param taxonomic_group_sets: Passed through unchanged to `get_species_row`.
  :param taxonomic_levels: Passed through unchanged to `get_species_row`.
  :return: DataFrame with one row (as produced by `get_species_row`) per item returned by `get_batched_ncbi_results`.
  """
  # Deduplicate on the string form while keeping first-seen order. `dict.fromkeys` is
  # order-preserving, whereas iterating a `set` yields an arbitrary order (and, for strings,
  # one that can change between runs), which would make the request batches non-reproducible.
  unique_ids = list(dict.fromkeys(str(id) for id in taxonomy_ids))
  species_info = get_batched_ncbi_results(
    # Single quotes inside the replacement field keep this f-string valid on Python < 3.12.
    lambda ids: f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{','.join(ids)}/dataset_report",
    unique_ids,
    "taxa"  # NOTE(review): presumably the name of the results field in the API response -- confirm against get_batched_ncbi_results
  )
  return pd.DataFrame([get_species_row(info, taxonomic_group_sets, taxonomic_levels) for info in species_info])

catalog/build/py/package/setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="catalog_build",
5-
version="1.4.0",
5+
version="1.4.1",
66
packages=["catalog_build"],
77
install_requires=["pandas", "requests", "PyYAML", "BeautifulSoup4", "lxml"],
88
)

catalog/build/ts/build-catalog.ts

+34
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ import {
66
BRCDataCatalogOrganism,
77
WorkflowCategory,
88
} from "../../../app/apis/catalog/brc-analytics-catalog/common/entities";
9+
import {
10+
getGenomeId,
11+
getOrganismId,
12+
} from "../../../app/apis/catalog/brc-analytics-catalog/common/utils";
913
import {
1014
Organisms as SourceOrganisms,
1115
Workflow as SourceWorkflow,
@@ -28,6 +32,9 @@ async function buildCatalog(): Promise<void> {
2832
const organisms = buildOrganisms(genomes);
2933
const workflows = await buildWorkflows();
3034

35+
verifyUniqueIds("assembly", genomes, getGenomeId);
36+
verifyUniqueIds("organism", organisms, getOrganismId);
37+
3138
console.log("Assemblies:", genomes.length);
3239
await saveJson("catalog/output/assemblies.json", genomes);
3340

@@ -228,6 +235,33 @@ async function saveJson(filePath: string, data: unknown): Promise<void> {
228235
await fsp.writeFile(filePath, JSON.stringify(data, undefined, 2) + "\n");
229236
}
230237

238+
/**
 * Take a collection of entities and check for duplicate IDs, as calculated by the given function, and throw an error if there are any.
 * @param entityName - Name of the entity type, to use in the error message.
 * @param entities - Entities to check; any iterable (array, readonly array, Set, generator) is accepted.
 * @param getId - Function to get an entity's ID.
 * @throws Error naming every ID that occurs more than once; each duplicated ID is listed once, in order of first occurrence.
 */
function verifyUniqueIds<T>(
  entityName: string,
  entities: Iterable<T>,
  getId: (entity: T) => string
): void {
  const idCounts = new Map<string, number>();
  for (const entity of entities) {
    const id = getId(entity);
    idCounts.set(id, (idCounts.get(id) ?? 0) + 1);
  }
  // A Map iterates in insertion order, so duplicates are reported in the order their IDs first appeared.
  const duplicateIds = Array.from(idCounts)
    .filter(([, count]) => count > 1)
    .map(([id]) => id);
  if (duplicateIds.length > 0) {
    throw new Error(
      `Duplicate ${entityName} IDs found: ${duplicateIds.join(", ")}`
    );
  }
}
264+
231265
function accumulateArrayValue<T>(array: T[] | undefined, value: T): T[] {
232266
if (!array) return [value];
233267
if (array.includes(value)) return array;

0 commit comments

Comments
 (0)