From 2055a00842a57e991c5624828a3d785b0aa159c5 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Sat, 30 Nov 2024 19:44:24 -0500 Subject: [PATCH 1/4] Subclass Sync - Direct in source, indirect in Mondo - Update: Added Python code to implement this case - Update: Includes some refactoring around common '-confirmed' cases and in general --- src/scripts/sync_subclassof.py | 52 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/scripts/sync_subclassof.py b/src/scripts/sync_subclassof.py index 5b1d793d..f68909f3 100644 --- a/src/scripts/sync_subclassof.py +++ b/src/scripts/sync_subclassof.py @@ -63,6 +63,9 @@ from src.scripts.utils import CACHE_DIR, MONDO_PREFIX_MAP, PREFIX_MAP, get_owned_prefix_map +COMMON_SORT_COLS = ['subject_mondo_id', 'object_mondo_id', 'subject_source_id', 'object_source_id'] + + def _edges_with_metadata_from_plain_edges(edges: Set[RELATIONSHIP], ns_data_map: Dict[CURIE, Dict]) -> List[Dict]: """From simple (subject, predicate, object) edge tuples, create dictionaries with mondo mappings and labels @@ -114,6 +117,20 @@ def _get_mondo_data_map( return mondo_data_map +def _confirmed_df(rows: List[Dict], outpath: Union[str, Path], save=True): + """Create -confirmed cases (exist in both Mondo and source) dataframe""" + subheader = deepcopy(ROBOT_SUBHEADER) + subheader[0]['object_mondo_id'] = 'SC %' + df = pd.DataFrame(rows) + if len(df) == 0: + df = pd.DataFrame(columns=list(subheader[0].keys())) + df = df.sort_values(COMMON_SORT_COLS) + df = pd.concat([pd.DataFrame(subheader), df]) + if save: + df.to_csv(outpath, sep='\t', index=False) + return df + + def _get_direct_scr_rels( curies: List[CURIE], db: SqlImplementation, prefix_map: PREFIX_MAP = None, verbose=True ) -> Set[RELATIONSHIP]: @@ -298,10 +315,12 @@ def sync_subclassof( # - rels_indirect_source_source: Indirect relationships (source IDs) from source # - rels_indirect_source_mondo: Relationships from rels_indirect_source_source that appear in Mondo # - rels_indirect_source_mondo_and_1or2_ids_not_in_mondo: from rels_indirect_source_source not in Mondo (unused) + # - rels_indirect_mondo_source: Relationships from rels_indirect_mondo_mondo that appear in source logging.info('- Collecting indirect subclass relations') rels_indirect_mondo_mondo = ancestors_mondo_mondo.difference(rels_direct_mondo_mondo) - # todo: remove unused, commented out vars? - # rels_indirect_mondo_source = '' # not needed? + rels_indirect_mondo_source, rels_indirect_mondo_mondo_and_1or2_ids_not_in_source = _convert_edge_namespace( + rels_indirect_mondo_mondo, mondo_source_map) + # todo: remove unused, commented out vars? or leave for future cases? # rels_indirect_source_source = ancestors_source_source.difference(rels_direct_source_source) # rels_indirect_source_mondo, rels_indirect_source_mondo_and_1or2_ids_not_in_mondo = _convert_edge_namespace( # rels_indirect_source_source, source_mondo_map) @@ -323,7 +342,7 @@ def sync_subclassof( '\n\nExiting.') # Determine hierarchy differences ----------------------------------------------------------------------------------- - # todo: remove unused, commented out vars? (they were created in anticipation of possible cases) + # todo: remove unused, commented out vars? or leave for future cases? logging.info('Calculating various differences in hierarchies between source and Mondo') # Find which edges appear in both Mondo and source, or only in one or the other # - direct <--> direct @@ -341,6 +360,12 @@ def sync_subclassof( # in_source_only_direct: List[Dict] = _edges_with_metadata_from_plain_edges( # in_source_only_direct_source_edges, source_data_map) + # - direct <--> indirect + in_source_direct_mondo_indirect_edges: Set[RELATIONSHIP] = \ + rels_direct_source_source.intersection(rels_indirect_mondo_source) + in_source_direct_mondo_indirect: List[Dict] = _edges_with_metadata_from_plain_edges( + in_source_direct_mondo_indirect_edges, source_data_map) + # - indirect <--> indirect # - Indirect SCRs that exist in both Mondo and source # in_both_indirect_mondo_edges: Set[RELATIONSHIP] = ( @@ -372,20 +397,13 @@ def sync_subclassof( # Google doc about cases: # https://docs.google.com/document/d/1H8fJKiKD-L1tfS-2LJu8t0_2YXJ1PQJ_7Zkoj7Vf7xA/edit#heading=h.9hixairfgxa1 logging.info('Creating outputs for synchronization cases') - common_sort_cols = ['subject_mondo_id', 'object_mondo_id', 'subject_source_id', 'object_source_id'] # Case 1: SCR direct in source and Mondo - subheader = deepcopy(ROBOT_SUBHEADER) - subheader[0]['object_mondo_id'] = 'SC %' - df1 = pd.DataFrame(in_both_direct) - if len(df1) == 0: - df1 = pd.DataFrame(columns=list(subheader[0].keys())) - df1 = df1.sort_values(common_sort_cols) - df1 = pd.concat([pd.DataFrame(subheader), df1]) - df1.to_csv(outpath_confirmed, sep='\t', index=False) + _confirmed_df(in_both_direct, outpath_confirmed) # Case 2: SCR is direct in source, but indirect Mondo - pass # no output for this case + _confirmed_df(in_source_direct_mondo_indirect, + outpath_confirmed.replace('confirmed', 'confirmed-direct-source-indirect-mondo')) # Case 3: SCR is direct in the source, but not at all in Mondo subheader = deepcopy(ROBOT_SUBHEADER) @@ -402,14 +420,14 @@ def sync_subclassof( f'higher number than expected.') # - format # only object_mondo_id should ever be missing, but checking both to be safe - df3 = df3[~df3['subject_mondo_id'].isna() & ~df3['object_mondo_id'].isna()].sort_values(common_sort_cols) + df3 = df3[~df3['subject_mondo_id'].isna() & ~df3['object_mondo_id'].isna()].sort_values(COMMON_SORT_COLS) # - obsolete cases obsoletes = (df3['subject_mondo_label'].str.startswith('obsolete') | df3['object_mondo_label'].str.startswith('obsolete')) # todo: sort_values: from here and below for this section, should remove; should not be necessary - df3_obs = df3[obsoletes].sort_values(common_sort_cols) + df3_obs = df3[obsoletes].sort_values(COMMON_SORT_COLS) # - filter: non-obsolete cases - df3 = df3[~obsoletes].sort_values(common_sort_cols) + df3 = df3[~obsoletes].sort_values(COMMON_SORT_COLS) # - self-parentage / proxy merges self_parentage_cases = df3['subject_mondo_id'] == df3['object_mondo_id'] df3_self_parentage = df3[self_parentage_cases] @@ -419,7 +437,7 @@ def sync_subclassof( logging.warning(sp_err) df3_self_parentage.to_csv(outpath_self_parentage, sep='\t', index=False) # - filter: non-self parentage cases - df3 = df3[~self_parentage_cases].sort_values(common_sort_cols) + df3 = df3[~self_parentage_cases].sort_values(COMMON_SORT_COLS) # - save df3_obs = pd.concat([pd.DataFrame(subheader), df3_obs]) df3_obs.to_csv(outpath_added_obsolete, sep='\t', index=False) From cf5cf23f9a27ed0f4a003ed0dd5c955343e3b757 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Mon, 2 Dec 2024 19:03:34 -0500 Subject: [PATCH 2/4] Subclass Sync - Direct in source, indirect in Mondo - Update: Formalized new output *confirmed-direct-source-indirect-mondo.robot.tsv --- src/ontology/mondo-ingest.Makefile | 9 +++++++-- src/scripts/sync_subclassof.py | 15 +++++++++++---- src/scripts/sync_subclassof_config.py | 2 ++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index 4186b392..d86d5d27 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -553,7 +553,7 @@ sync: sync-subclassof sync-synonyms # Synchronization: SubclassOf .PHONY: sync-subclassof -sync-subclassof: $(REPORTDIR)/sync-subClassOf.confirmed.tsv $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv +sync-subclassof: $(REPORTDIR)/sync-subClassOf.confirmed.tsv $(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv # todo: drop this? This is really just an alias here for quality of life, but not used by anything. .PHONY: sync-subclassof-% @@ -570,11 +570,16 @@ $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv: $(foreach n,$(ALL_COMPONE $(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv) awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed.robot.tsv > $@ -$(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv +# TODO: implement this goal && add to master goal +$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv) + awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv > $@ + +$(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.confirmed-direct-source-indirect-mondo.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv python3 $(SCRIPTSDIR)/sync_subclassof.py \ --outpath-added $(REPORTDIR)/$*.subclass.added.robot.tsv \ --outpath-added-obsolete $(REPORTDIR)/$*.subclass.added-obsolete.robot.tsv \ --outpath-confirmed $(REPORTDIR)/$*.subclass.confirmed.robot.tsv \ + --outpath-confirmed-direct-source-indirect-mondo $(REPORTDIR)/$*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv \ --outpath-direct-in-mondo-only $(REPORTDIR)/$*.subclass.direct-in-mondo-only.tsv \ --outpath-self-parentage $(TMPDIR)/$*.subclass.self-parentage.tsv \ --mondo-db-path $(TMPDIR)/mondo.db \ diff --git a/src/scripts/sync_subclassof.py b/src/scripts/sync_subclassof.py index f68909f3..0312d249 100644 --- a/src/scripts/sync_subclassof.py +++ b/src/scripts/sync_subclassof.py @@ -225,6 +225,7 @@ def _convert_edge_namespace( def sync_subclassof( outpath_added: str = EX_DEFAULTS['outpath_added'], outpath_confirmed: str = EX_DEFAULTS['outpath_confirmed'], + outpath_confirmed_direct_source_indirect_mondo: str = EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'], outpath_added_obsolete: str = EX_DEFAULTS['outpath_added_obsolete'], mondo_db_path: str = EX_DEFAULTS['mondo_db_path'], mondo_ingest_db_path: str = EX_DEFAULTS['mondo_ingest_db_path'], mondo_mappings_path: str = EX_DEFAULTS['mondo_mappings_path'], @@ -402,8 +403,7 @@ def sync_subclassof( _confirmed_df(in_both_direct, outpath_confirmed) # Case 2: SCR is direct in source, but indirect Mondo - _confirmed_df(in_source_direct_mondo_indirect, - outpath_confirmed.replace('confirmed', 'confirmed-direct-source-indirect-mondo')) + _confirmed_df(in_source_direct_mondo_indirect, outpath_confirmed_direct_source_indirect_mondo) # Case 3: SCR is direct in the source, but not at all in Mondo subheader = deepcopy(ROBOT_SUBHEADER) @@ -486,8 +486,13 @@ def cli(): # todo: #remove-temp-defaults 'into Mondo, except for that these terms are obsolete in Mondo.') parser.add_argument( '-c', '--outpath-confirmed', required=False, default=EX_DEFAULTS['outpath_confirmed'], - help='Path to output robot template containing subclass relations for given ontology that exist in Mondo and ' - 'are confirmed to also exist in the source.') + help='Path to output robot template containing direct subclass relations for given ontology that exist in ' + 'Mondo and are confirmed to also exist in the source.') + parser.add_argument( + '-C', '--confirmed-direct-source-indirect-mondo', required=False, + default=EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'], + help='Path to output robot template containing subclass relations for given ontology that exist in Mondo as ' + 'indirect relations and are confirmed to also exist in the source as direct relations.') parser.add_argument( '-M', '--outpath-direct-in-mondo-only', required=False, default=EX_DEFAULTS['outpath_direct_in_mondo_only'], @@ -526,6 +531,8 @@ def run_defaults(use_cache=True): # todo: #remove-temp-defaults sync_subclassof(**{ 'outpath_added': str(REPORTS_DIR / f'{name}.subclass.added.robot.tsv'), 'outpath_confirmed': str(REPORTS_DIR / f'{name}.subclass.confirmed.robot.tsv'), + 'outpath_confirmed_direct_source_indirect_mondo': \ + str(REPORTS_DIR / f'{name}.subclass.confirmed-direct-source-indirect-mondo.robot.tsv'), 'onto_config_path': str(METADATA_DIR / f'{name}.yml'), 'mondo_db_path': str(TMP_DIR / 'mondo.db'), 'mondo_ingest_db_path': str(TMP_DIR / 'mondo-ingest.db'), diff --git a/src/scripts/sync_subclassof_config.py b/src/scripts/sync_subclassof_config.py index 2b9d00ce..030f208e 100644 --- a/src/scripts/sync_subclassof_config.py +++ b/src/scripts/sync_subclassof_config.py @@ -30,6 +30,8 @@ 'outpath_added': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.added.robot.tsv'), 'outpath_added_obsolete': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.added-obsolete.robot.tsv'), 'outpath_confirmed': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.confirmed.robot.tsv'), + 'outpath_confirmed_direct_source_indirect_mondo': \ + str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.confirmed-direct-source-indirect-mondo.robot.tsv'), 'onto_config_path': str(METADATA_DIR / f'{EX_ONTO_NAME}.yml'), 'mondo_db_path': str(TMP_DIR / 'mondo.db'), 'mondo_ingest_db_path': str(TMP_DIR / 'mondo-ingest.db'), From 4da04bc19e34d9b7cfb6fc1ef1b6d5088364965a Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Mon, 2 Dec 2024 20:58:45 -0500 Subject: [PATCH 3/4] Subclass Sync - Direct in source, indirect in Mondo - Needed to fix an inconsistent named param - Added some previously missing make prereqs - Removed a todo --- src/ontology/mondo-ingest.Makefile | 5 ++--- src/scripts/sync_subclassof.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index d86d5d27..5dbbc798 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -567,11 +567,10 @@ $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv: $(foreach n,$(ALL_COMPONENT_ $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.direct-in-mondo-only.tsv) $(TMPDIR)/mondo.db python3 $(SCRIPTSDIR)/sync_subclassof_collate_direct_in_mondo_only.py --outpath $@ -$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv) +$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv) $(TMPDIR)/mondo.db awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed.robot.tsv > $@ -# TODO: implement this goal && add to master goal -$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv) +$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv) $(TMPDIR)/mondo.db awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv > $@ $(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.confirmed-direct-source-indirect-mondo.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv diff --git a/src/scripts/sync_subclassof.py b/src/scripts/sync_subclassof.py index 0312d249..2fd580ce 100644 --- a/src/scripts/sync_subclassof.py +++ b/src/scripts/sync_subclassof.py @@ -489,7 +489,7 @@ def cli(): # todo: #remove-temp-defaults help='Path to output robot template containing direct subclass relations for given ontology that exist in ' 'Mondo and are confirmed to also exist in the source.') parser.add_argument( - '-C', '--confirmed-direct-source-indirect-mondo', required=False, + '-C', '--outpath-confirmed-direct-source-indirect-mondo', required=False, default=EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'], help='Path to output robot template containing subclass relations for given ontology that exist in Mondo as ' 'indirect relations and are confirmed to also exist in the source as direct relations.') From bf7f6f17e3ab09b6c4383ff5b08e34b54a37b3b2 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Wed, 4 Dec 2024 16:20:58 -0500 Subject: [PATCH 4/4] Subclass Sync - Direct in source, indirect in Mondo - Delete: Some erroneously declared prereqs from goals. --- src/ontology/mondo-ingest.Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index 5dbbc798..b766175b 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -567,10 +567,10 @@ $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv: $(foreach n,$(ALL_COMPONENT_ $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.direct-in-mondo-only.tsv) $(TMPDIR)/mondo.db python3 $(SCRIPTSDIR)/sync_subclassof_collate_direct_in_mondo_only.py --outpath $@ -$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv) $(TMPDIR)/mondo.db +$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv) awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed.robot.tsv > $@ -$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv) $(TMPDIR)/mondo.db +$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv) awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv > $@ $(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.confirmed-direct-source-indirect-mondo.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv