Skip to content

Commit 919ac99

Browse files
authored
Implement Stable ID Generation for Cell Sets (#93)
* Add sorting by key columns in report generation for consistent DataFrame output
* Refactor entity generation order and replace UUID v4 with UUID v5
* Update version to 0.2.3
* Add new assert statement for stable ID generation in test_generate_rdf_graph_with_merge
1 parent b3cf832 commit 919ac99

File tree

5 files changed

+400
-237
lines changed

5 files changed

+400
-237
lines changed

pandasaurus_cxg/anndata_analyzer.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,12 @@ def co_annotation_report(
108108
"""
109109
# Call the core method to generate the full DataFrame
110110
full_df = self._generate_co_annotation_dataframe(disease, enrich)
111-
112111
# Return only the first 5 columns
113112
return full_df.iloc[:, :5]
114113

115-
def _generate_co_annotation_dataframe(self, disease: Optional[str] = None, enrich: bool = False):
114+
def _generate_co_annotation_dataframe(
115+
self, disease: Optional[str] = None, enrich: bool = False
116+
):
116117
"""
117118
Core method to generate a full co-annotation dataframe.
118119
@@ -171,7 +172,7 @@ def _generate_co_annotation_dataframe(self, disease: Optional[str] = None, enric
171172
for record in temp_result
172173
]
173174
# unique_result = AnndataAnalyzer._remove_duplicates(result)
174-
self.report_df = pd.DataFrame(
175+
report_df = pd.DataFrame(
175176
[
176177
inner_list[:2]
177178
+ inner_list[5:6]
@@ -190,6 +191,9 @@ def _generate_co_annotation_dataframe(self, disease: Optional[str] = None, enric
190191
"field_name2_cell_count",
191192
],
192193
)
194+
self.report_df = report_df.sort_values(
195+
["field_name1", "value1", "predicate", "field_name2", "value2"]
196+
).reset_index(drop=True)
193197
return self.report_df
194198

195199
def enriched_co_annotation_report(self, disease: Optional[str] = None):

pandasaurus_cxg/graph_generator/graph_generator.py

+39-33
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,42 @@ def generate_rdf_graph(self, merge: bool = False):
9191
"""
9292
if len(self.graph) != 0:
9393
return
94-
# preprocess
94+
95+
# generate dataset entity and has_source property
96+
citation_dict = {}
97+
uns = self.ea.enricher_manager.anndata.uns
98+
if citation_field_name in uns.keys():
99+
citation_dict = parse_citation_field_into_dict(uns[citation_field_name])
100+
cxg_versioned_dataset_id = (
101+
citation_dict.get("download_link").split("/")[-1].split(".")[0]
102+
)
103+
dataset_class = URIRef(get_cxg_dataset_url(cxg_versioned_dataset_id))
104+
else:
105+
# if citation_field_name doesn't exist we use random uuid as cxg_versioned_dataset_id
106+
cxg_versioned_dataset_id = str(uuid.uuid4())
107+
dataset_class = URIRef(self.ns[cxg_versioned_dataset_id])
108+
self.graph.add((dataset_class, RDF.type, URIRef(DATASET.get("iri"))))
109+
self.graph.add((dataset_class, RDFS.label, Literal(DATASET.get("label"))))
110+
for key, value in uns.items():
111+
if not isinstance(value, str):
112+
continue
113+
if key == citation_field_name:
114+
for citation_key, citation_value in citation_dict.items():
115+
self.graph.add(
116+
(
117+
dataset_class,
118+
URIRef(self.ns[remove_special_characters(citation_key)]),
119+
Literal(citation_value),
120+
)
121+
)
122+
123+
self.graph.add(
124+
(dataset_class, URIRef(self.ns[remove_special_characters(key)]), Literal(value))
125+
)
126+
has_source = URIRef(HAS_SOURCE["iri"])
127+
self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))
128+
129+
# preprocess for cell clusters
95130
column_group = ["field_name1", "value1"]
96131
df = self.df.sort_values(by=column_group).reset_index(drop=True)
97132
grouped_df = df.groupby(column_group)
@@ -124,38 +159,9 @@ def generate_rdf_graph(self, merge: bool = False):
124159
) else temp_dict.update({key: value})
125160

126161
if temp_dict not in grouped_dict_uuid.values():
127-
grouped_dict_uuid[str(uuid.uuid4())] = temp_dict
128-
129-
# generate dataset entity and has_source property
130-
uns = self.ea.enricher_manager.anndata.uns
131-
citation_dict = {}
132-
if citation_field_name in uns.keys():
133-
citation_dict = parse_citation_field_into_dict(uns[citation_field_name])
134-
dataset_class = URIRef(
135-
get_cxg_dataset_url(citation_dict.get("download_link").split("/")[-1].split(".")[0])
136-
)
137-
else:
138-
dataset_class = URIRef(self.ns[str(uuid.uuid4())])
139-
self.graph.add((dataset_class, RDF.type, URIRef(DATASET.get("iri"))))
140-
self.graph.add((dataset_class, RDFS.label, Literal(DATASET.get("label"))))
141-
for key, value in uns.items():
142-
if not isinstance(value, str):
143-
continue
144-
if key == citation_field_name:
145-
for citation_key, citation_value in citation_dict.items():
146-
self.graph.add(
147-
(
148-
dataset_class,
149-
URIRef(self.ns[remove_special_characters(citation_key)]),
150-
Literal(citation_value),
151-
)
152-
)
153-
154-
self.graph.add(
155-
(dataset_class, URIRef(self.ns[remove_special_characters(key)]), Literal(value))
156-
)
157-
has_source = URIRef(HAS_SOURCE["iri"])
158-
self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))
162+
grouped_dict_uuid[
163+
str(uuid.uuid5(uuid.UUID(cxg_versioned_dataset_id), str(temp_dict)))
164+
] = temp_dict
159165

160166
# generate a resource for each free-text cell_type annotation and cell_type_ontology_term annotation
161167
cell_set_class = URIRef(CLUSTER.get("iri"))

0 commit comments

Comments
 (0)