Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subclass Sync - Direct in source, indirect in Mondo #714

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions src/ontology/mondo-ingest.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ sync: sync-subclassof sync-synonyms

# Synchronization: SubclassOf
.PHONY: sync-subclassof
sync-subclassof: $(REPORTDIR)/sync-subClassOf.confirmed.tsv $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv
sync-subclassof: $(REPORTDIR)/sync-subClassOf.confirmed.tsv $(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv

# todo: drop this? This is really just an alias here for quality of life, but not used by anything.
.PHONY: sync-subclassof-%
Expand All @@ -567,14 +567,18 @@ $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv: $(foreach n,$(ALL_COMPONENT_
# Collated report built from the per-component "direct-in-mondo-only" subclass reports (and mondo.db).
# Only the output path is passed to the script. NOTE(review): presumably the script locates the per-component
# inputs itself -- confirm against sync_subclassof_collate_direct_in_mondo_only.py.
$(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.direct-in-mondo-only.tsv) $(TMPDIR)/mondo.db
python3 $(SCRIPTSDIR)/sync_subclassof_collate_direct_in_mondo_only.py --outpath $@

$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv)
# Collate the per-component "confirmed" ROBOT templates into a single report.
# awk keeps lines 1-2 of the first input only (NR counts across all inputs) and, from every input, only the lines
# after the second (FNR restarts per file), so the two leading header rows are not repeated.
# Inputs come from the declared prerequisites rather than a directory glob, so templates left over from components
# that are no longer in ALL_COMPONENT_IDS cannot leak into the report. $(filter) drops the non-template
# prerequisite (mondo.db); $(sort) keeps a stable, name-ordered concatenation.
$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv) $(TMPDIR)/mondo.db
	awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(sort $(filter %.subclass.confirmed.robot.tsv,$^)) > $@

$(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv
# Collate the per-component "confirmed: direct in source, indirect in Mondo" ROBOT templates into a single report.
# awk keeps lines 1-2 of the first input only (NR counts across all inputs) and, from every input, only the lines
# after the second (FNR restarts per file), so the two leading header rows are not repeated.
# Inputs come from the declared prerequisites rather than a directory glob, so templates left over from components
# that are no longer in ALL_COMPONENT_IDS cannot leak into the report. $(filter) drops the non-template
# prerequisite (mondo.db); $(sort) keeps a stable, name-ordered concatenation.
$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv) $(TMPDIR)/mondo.db
	awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(sort $(filter %.subclass.confirmed-direct-source-indirect-mondo.robot.tsv,$^)) > $@

$(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.confirmed-direct-source-indirect-mondo.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv
python3 $(SCRIPTSDIR)/sync_subclassof.py \
--outpath-added $(REPORTDIR)/$*.subclass.added.robot.tsv \
--outpath-added-obsolete $(REPORTDIR)/$*.subclass.added-obsolete.robot.tsv \
--outpath-confirmed $(REPORTDIR)/$*.subclass.confirmed.robot.tsv \
--outpath-confirmed-direct-source-indirect-mondo $(REPORTDIR)/$*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv \
--outpath-direct-in-mondo-only $(REPORTDIR)/$*.subclass.direct-in-mondo-only.tsv \
--outpath-self-parentage $(TMPDIR)/$*.subclass.self-parentage.tsv \
--mondo-db-path $(TMPDIR)/mondo.db \
Expand Down
63 changes: 44 additions & 19 deletions src/scripts/sync_subclassof.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@
from src.scripts.utils import CACHE_DIR, MONDO_PREFIX_MAP, PREFIX_MAP, get_owned_prefix_map


COMMON_SORT_COLS = ['subject_mondo_id', 'object_mondo_id', 'subject_source_id', 'object_source_id']


def _edges_with_metadata_from_plain_edges(edges: Set[RELATIONSHIP], ns_data_map: Dict[CURIE, Dict]) -> List[Dict]:
"""From simple (subject, predicate, object) edge tuples, create dictionaries with mondo mappings and labels

Expand Down Expand Up @@ -114,6 +117,20 @@ def _get_mondo_data_map(
return mondo_data_map


def _confirmed_df(rows: List[Dict], outpath: Union[str, Path], save=True):
    """Assemble the ROBOT template dataframe for -confirmed cases (relations that exist in both Mondo and source).

    The result has the ROBOT subheader as its first row, followed by the data rows sorted by COMMON_SORT_COLS. When
    `save` is true, it is also written to `outpath` as tab-separated text without the index. The dataframe is
    returned in either case.
    """
    # Work on a copy so the module-level ROBOT_SUBHEADER is not mutated by the column override below
    robot_subheader = deepcopy(ROBOT_SUBHEADER)
    robot_subheader[0]['object_mondo_id'] = 'SC %'

    data = pd.DataFrame(rows)
    if not len(data):
        # No rows: fall back to an empty frame carrying the subheader's columns, which the sort below relies on
        data = pd.DataFrame(columns=list(robot_subheader[0].keys()))

    result = pd.concat([pd.DataFrame(robot_subheader), data.sort_values(COMMON_SORT_COLS)])
    if not save:
        return result
    result.to_csv(outpath, sep='\t', index=False)
    return result


def _get_direct_scr_rels(
curies: List[CURIE], db: SqlImplementation, prefix_map: PREFIX_MAP = None, verbose=True
) -> Set[RELATIONSHIP]:
Expand Down Expand Up @@ -208,6 +225,7 @@ def _convert_edge_namespace(

def sync_subclassof(
outpath_added: str = EX_DEFAULTS['outpath_added'], outpath_confirmed: str = EX_DEFAULTS['outpath_confirmed'],
outpath_confirmed_direct_source_indirect_mondo: str = EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'],
outpath_added_obsolete: str = EX_DEFAULTS['outpath_added_obsolete'],
mondo_db_path: str = EX_DEFAULTS['mondo_db_path'], mondo_ingest_db_path: str = EX_DEFAULTS['mondo_ingest_db_path'],
mondo_mappings_path: str = EX_DEFAULTS['mondo_mappings_path'],
Expand Down Expand Up @@ -298,10 +316,12 @@ def sync_subclassof(
# - rels_indirect_source_source: Indirect relationships (source IDs) from source
# - rels_indirect_source_mondo: Relationships from rels_indirect_source_source that appear in Mondo
# - rels_indirect_source_mondo_and_1or2_ids_not_in_mondo: from rels_indirect_source_source not in Mondo (unused)
# - rels_indirect_mondo_source: Relationships from rels_indirect_mondo_mondo that appear in source
logging.info('- Collecting indirect subclass relations')
rels_indirect_mondo_mondo = ancestors_mondo_mondo.difference(rels_direct_mondo_mondo)
# todo: remove unused, commented out vars?
# rels_indirect_mondo_source = '' # not needed?
rels_indirect_mondo_source, rels_indirect_mondo_mondo_and_1or2_ids_not_in_source = _convert_edge_namespace(
rels_indirect_mondo_mondo, mondo_source_map)
# todo: remove unused, commented out vars? or leave for future cases?
# rels_indirect_source_source = ancestors_source_source.difference(rels_direct_source_source)
# rels_indirect_source_mondo, rels_indirect_source_mondo_and_1or2_ids_not_in_mondo = _convert_edge_namespace(
# rels_indirect_source_source, source_mondo_map)
Expand All @@ -323,7 +343,7 @@ def sync_subclassof(
'\n\nExiting.')

# Determine hierarchy differences -----------------------------------------------------------------------------------
# todo: remove unused, commented out vars? (they were created in anticipation of possible cases)
# todo: remove unused, commented out vars? or leave for future cases?
logging.info('Calculating various differences in hierarchies between source and Mondo')
# Find which edges appear in both Mondo and source, or only in one or the other
# - direct <--> direct
Expand All @@ -341,6 +361,12 @@ def sync_subclassof(
# in_source_only_direct: List[Dict] = _edges_with_metadata_from_plain_edges(
# in_source_only_direct_source_edges, source_data_map)

# - direct <--> indirect
in_source_direct_mondo_indirect_edges: Set[RELATIONSHIP] = \
rels_direct_source_source.intersection(rels_indirect_mondo_source)
in_source_direct_mondo_indirect: List[Dict] = _edges_with_metadata_from_plain_edges(
in_source_direct_mondo_indirect_edges, source_data_map)

# - indirect <--> indirect
# - Indirect SCRs that exist in both Mondo and source
# in_both_indirect_mondo_edges: Set[RELATIONSHIP] = (
Expand Down Expand Up @@ -372,20 +398,12 @@ def sync_subclassof(
# Google doc about cases:
# https://docs.google.com/document/d/1H8fJKiKD-L1tfS-2LJu8t0_2YXJ1PQJ_7Zkoj7Vf7xA/edit#heading=h.9hixairfgxa1
logging.info('Creating outputs for synchronization cases')
common_sort_cols = ['subject_mondo_id', 'object_mondo_id', 'subject_source_id', 'object_source_id']

# Case 1: SCR direct in source and Mondo
subheader = deepcopy(ROBOT_SUBHEADER)
subheader[0]['object_mondo_id'] = 'SC %'
df1 = pd.DataFrame(in_both_direct)
if len(df1) == 0:
df1 = pd.DataFrame(columns=list(subheader[0].keys()))
df1 = df1.sort_values(common_sort_cols)
df1 = pd.concat([pd.DataFrame(subheader), df1])
df1.to_csv(outpath_confirmed, sep='\t', index=False)
_confirmed_df(in_both_direct, outpath_confirmed)

# Case 2: SCR is direct in source, but indirect in Mondo
pass # no output for this case
_confirmed_df(in_source_direct_mondo_indirect, outpath_confirmed_direct_source_indirect_mondo)

# Case 3: SCR is direct in the source, but not at all in Mondo
subheader = deepcopy(ROBOT_SUBHEADER)
Expand All @@ -402,14 +420,14 @@ def sync_subclassof(
f'higher number than expected.')
# - format
# only object_mondo_id should ever be missing, but checking both to be safe
df3 = df3[~df3['subject_mondo_id'].isna() & ~df3['object_mondo_id'].isna()].sort_values(common_sort_cols)
df3 = df3[~df3['subject_mondo_id'].isna() & ~df3['object_mondo_id'].isna()].sort_values(COMMON_SORT_COLS)
# - obsolete cases
obsoletes = (df3['subject_mondo_label'].str.startswith('obsolete') |
df3['object_mondo_label'].str.startswith('obsolete'))
# todo: the sort_values() calls from here to the end of this section should be unnecessary (df3 is already sorted above); remove them
df3_obs = df3[obsoletes].sort_values(common_sort_cols)
df3_obs = df3[obsoletes].sort_values(COMMON_SORT_COLS)
# - filter: non-obsolete cases
df3 = df3[~obsoletes].sort_values(common_sort_cols)
df3 = df3[~obsoletes].sort_values(COMMON_SORT_COLS)
# - self-parentage / proxy merges
self_parentage_cases = df3['subject_mondo_id'] == df3['object_mondo_id']
df3_self_parentage = df3[self_parentage_cases]
Expand All @@ -419,7 +437,7 @@ def sync_subclassof(
logging.warning(sp_err)
df3_self_parentage.to_csv(outpath_self_parentage, sep='\t', index=False)
# - filter: non-self parentage cases
df3 = df3[~self_parentage_cases].sort_values(common_sort_cols)
df3 = df3[~self_parentage_cases].sort_values(COMMON_SORT_COLS)
# - save
df3_obs = pd.concat([pd.DataFrame(subheader), df3_obs])
df3_obs.to_csv(outpath_added_obsolete, sep='\t', index=False)
Expand Down Expand Up @@ -468,8 +486,13 @@ def cli(): # todo: #remove-temp-defaults
'into Mondo, except for that these terms are obsolete in Mondo.')
parser.add_argument(
'-c', '--outpath-confirmed', required=False, default=EX_DEFAULTS['outpath_confirmed'],
help='Path to output robot template containing subclass relations for given ontology that exist in Mondo and '
'are confirmed to also exist in the source.')
help='Path to output robot template containing direct subclass relations for given ontology that exist in '
'Mondo and are confirmed to also exist in the source.')
parser.add_argument(
'-C', '--outpath-confirmed-direct-source-indirect-mondo', required=False,
default=EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'],
help='Path to output robot template containing subclass relations for given ontology that exist in Mondo as '
'indirect relations and are confirmed to also exist in the source as direct relations.')
parser.add_argument(
'-M', '--outpath-direct-in-mondo-only', required=False,
default=EX_DEFAULTS['outpath_direct_in_mondo_only'],
Expand Down Expand Up @@ -508,6 +531,8 @@ def run_defaults(use_cache=True): # todo: #remove-temp-defaults
sync_subclassof(**{
'outpath_added': str(REPORTS_DIR / f'{name}.subclass.added.robot.tsv'),
'outpath_confirmed': str(REPORTS_DIR / f'{name}.subclass.confirmed.robot.tsv'),
'outpath_confirmed_direct_source_indirect_mondo': \
str(REPORTS_DIR / f'{name}.subclass.confirmed-direct-source-indirect-mondo.robot.tsv'),
'onto_config_path': str(METADATA_DIR / f'{name}.yml'),
'mondo_db_path': str(TMP_DIR / 'mondo.db'),
'mondo_ingest_db_path': str(TMP_DIR / 'mondo-ingest.db'),
Expand Down
2 changes: 2 additions & 0 deletions src/scripts/sync_subclassof_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
'outpath_added': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.added.robot.tsv'),
'outpath_added_obsolete': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.added-obsolete.robot.tsv'),
'outpath_confirmed': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.confirmed.robot.tsv'),
'outpath_confirmed_direct_source_indirect_mondo': \
str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.confirmed-direct-source-indirect-mondo.robot.tsv'),
'onto_config_path': str(METADATA_DIR / f'{EX_ONTO_NAME}.yml'),
'mondo_db_path': str(TMP_DIR / 'mondo.db'),
'mondo_ingest_db_path': str(TMP_DIR / 'mondo-ingest.db'),
Expand Down