From ba2a1cdfde1a2325e7f8808845213f96693a7f71 Mon Sep 17 00:00:00 2001 From: Daniel Obraczka Date: Mon, 8 Apr 2024 17:24:56 +0200 Subject: [PATCH] Statistics is missing multi version of moviegraphbenchmark (#40) * Add MGB multi setting to statistics creation * Move statistics function, adapt and update csv * Adapt import and README * Move import again --- README.md | 2 +- dataset_statistics.csv | 7 ++-- sylloge/__init__.py | 2 -- sylloge/base.py | 56 ------------------------------- sylloge/create_statistic.py | 66 +++++++++++++++++++++++++++++++++++-- 5 files changed, 69 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index f44770b..33f84d8 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ Datasets More broad statistics are provided in `dataset_statistics.csv`. You can also get a pandas DataFrame with statistics for specific datasets for example to create tables for publications: ``` >>> ds = MovieGraphBenchmark(graph_pair="multi") ->>> from sylloge.base import create_statistics_df +>>> from sylloge.create_statistic import create_statistics_df >>> stats_df = create_statistics_df([ds]) >>> stats_df.loc[("MovieGraphBenchmark","moviegraphbenchmark_multi","imdb")] Entities Relation Triples Attribute Triples ... Clusters Intra-dataset Matches All Matches diff --git a/dataset_statistics.csv b/dataset_statistics.csv index 1022ef2..dff10d1 100644 --- a/dataset_statistics.csv +++ b/dataset_statistics.csv @@ -37,8 +37,11 @@ MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,imdb,5129,17507,20800,3,13,608 MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,tvdb,7814,15455,20902,3,9,7683,1483,22663,25583 MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tmdb,6061,27903,23761,4,30,9991,1920,64,26138 MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tvdb,7814,15455,20902,3,9,7683,1920,22663,26138 -MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,8885,0,5619 -MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,8885,0,5619 +MovieGraphBenchmark,moviegraphbenchmark_multi,imdb,5129,17507,20800,3,13,6082,3598,1,31230 +MovieGraphBenchmark,moviegraphbenchmark_multi,tmdb,6061,27903,23761,4,30,9991,3598,64,31230 +MovieGraphBenchmark,moviegraphbenchmark_multi,tvdb,7814,15455,20902,3,9,7683,3598,22663,31230 +MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,9162,0,9162 +MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,9162,0,9162 OAEI,oaei_marvelcinematicuniverse_marvel,marvelcinematicuniverse,216033,1094598,130517,130,110,56566,1654,0,1654 OAEI,oaei_marvelcinematicuniverse_marvel,marvel,1472619,5152898,1580468,63,127,749980,1654,0,1654 OAEI,oaei_memoryalpha_memorybeta,memoryalpha,254537,2096198,430730,180,287,226110,9296,0,9296 diff --git a/sylloge/__init__.py b/sylloge/__init__.py index f0f3db5..76725ff 100644 --- a/sylloge/__init__.py +++ b/sylloge/__init__.py @@ -10,7 +10,6 @@ TrainTestValSplit, ZipEADataset, ZipEADatasetWithPreSplitFolds, - create_statistics_df, ) from .id_mapped import IdMappedEADataset from .med_bbk_loader import MED_BBK @@ -35,7 +34,6 @@ "BinaryZipEADatasetWithPreSplitFolds", "ZipEADatasetWithPreSplitFolds", "TrainTestValSplit", - "create_statistics_df", ] __version__ = version(__package__) logging.getLogger(__name__).setLevel(logging.INFO) diff --git a/sylloge/base.py b/sylloge/base.py index c130523..3c2c441 100644 --- a/sylloge/base.py +++ b/sylloge/base.py @@ -11,7 +11,6 @@ Callable, Dict, Generic, - Iterable, List, Literal, Mapping, @@ -1085,58 +1084,3 @@ class BinaryZipEADatasetWithPreSplitFolds( def __repr__(self) -> str: return self._binary_repr_adjustment(super().__repr__()) - - -def create_statistics_df( - datasets: Iterable[MultiSourceEADataset], seperate_attribute_relations: bool = True -): - rows = [] - triples_col = ( - ["Relation Triples", "Attribute Triples"] - if seperate_attribute_relations - else ["Triples"] - ) - index_cols = ["Dataset family", "Task Name", "Dataset Name"] - columns = [ - *index_cols, - "Entities", - *triples_col, - "Relations", - "Properties", - "Literals", - "Clusters", - "Intra-dataset Matches", - "All Matches", - ] - for ds in datasets: - ds_family = str(ds.__class__.__name__).split(".")[-1] - ds_stats, num_clusters = ds.statistics() - all_matches = ds.ent_links.number_of_links - intra_dataset_matches = (0,) * len(ds.dataset_names) - if isinstance(ds.ent_links, PrefixedClusterHelper): - intra_dataset_matches = ds.ent_links.number_of_intra_links - for i, (ds_side, ds_side_name) in enumerate(zip(ds_stats, ds.dataset_names)): - if seperate_attribute_relations: - triples = [ds_side.rel_triples, ds_side.attr_triples] - else: - triples = [ds_side.triples] - rows.append( - [ - ds_family, - ds.canonical_name, - ds_side_name, - ds_side.entities, - *triples, - ds_side.relations, - ds_side.properties, - ds_side.literals, - num_clusters, - intra_dataset_matches[i], - all_matches, - ] - ) - statistics_df = pd.DataFrame( - rows, - columns=columns, - ) - return statistics_df.set_index(index_cols) diff --git a/sylloge/create_statistic.py b/sylloge/create_statistic.py index e45d041..7ca87e1 100644 --- a/sylloge/create_statistic.py +++ b/sylloge/create_statistic.py @@ -1,9 +1,68 @@ from typing import Dict, Iterable, Tuple import pandas as pd +from eche import ClusterHelper, PrefixedClusterHelper from sylloge import MED_BBK, OAEI, MovieGraphBenchmark, MultiSourceEADataset, OpenEA -from sylloge.base import create_statistics_df + + +def create_statistics_df( + datasets: Iterable[MultiSourceEADataset], seperate_attribute_relations: bool = True +): + rows = [] + triples_col = ( + ["Relation Triples", "Attribute Triples"] + if seperate_attribute_relations + else ["Triples"] + ) + index_cols = ["Dataset family", "Task Name", "Dataset Name"] + columns = [ + *index_cols, + "Entities", + *triples_col, + "Relations", + "Properties", + "Literals", + "Clusters", + "Intra-dataset Matches", + "All Matches", + ] + for ds in datasets: + ds_family = str(ds.__class__.__name__).split(".")[-1] + ds_stats, num_clusters = ds.statistics() + intra_dataset_matches = (0,) * len(ds.dataset_names) + if isinstance(ds.ent_links, ClusterHelper): + all_matches = ds.ent_links.number_of_links + if isinstance(ds.ent_links, PrefixedClusterHelper): + intra_dataset_matches = ds.ent_links.number_of_intra_links + else: + all_matches = len(ds.ent_links) + for i, (ds_side, ds_side_name) in enumerate(zip(ds_stats, ds.dataset_names)): + if seperate_attribute_relations: + triples = [ds_side.rel_triples, ds_side.attr_triples] + else: + triples = [ds_side.triples] + rows.append( + [ + ds_family, + ds.canonical_name, + ds_side_name, + ds_side.entities, + *triples, + ds_side.relations, + ds_side.properties, + ds_side.literals, + num_clusters, + intra_dataset_matches[i], + all_matches, + ] + ) + statistics_df = pd.DataFrame( + rows, + columns=columns, + ) + return statistics_df.set_index(index_cols) + all_classes_with_args: Tuple[Tuple[type[MultiSourceEADataset], Dict[str, str]], ...] = ( (OpenEA, {"graph_pair": "D_W", "size": "15K", "version": "V1"}), @@ -25,6 +84,7 @@ (MovieGraphBenchmark, {"graph_pair": "imdb-tmdb"}), (MovieGraphBenchmark, {"graph_pair": "imdb-tvdb"}), (MovieGraphBenchmark, {"graph_pair": "tmdb-tvdb"}), + (MovieGraphBenchmark, {"graph_pair": "multi"}), (MED_BBK, {}), (OAEI, {"task": "marvelcinematicuniverse-marvel"}), (OAEI, {"task": "memoryalpha-memorybeta"}), @@ -34,7 +94,7 @@ ) -def create_statistic( +def create_and_write_statistic( classes_with_args: Iterable[ Tuple[type[MultiSourceEADataset], Dict[str, str]] ] = all_classes_with_args, @@ -49,4 +109,4 @@ def create_statistic( if __name__ == "__main__": - create_statistic() + create_and_write_statistic()