diff --git a/README.md b/README.md
index d22d629..f44770b 100644
--- a/README.md
+++ b/README.md
@@ -35,22 +35,49 @@ OpenEA(backend=pandas, graph_pair=D_W, size=15K, version=V1, rel_triples_left=38
 2 http://dbpedia.org/resource/E840454 http://dbpedia.org/ontology/activeYearsStartYear 1948^^
 3 http://dbpedia.org/resource/E971710 http://purl.org/dc/elements/1.1/description English singer-songwriter
 4 http://dbpedia.org/resource/E022831 http://dbpedia.org/ontology/militaryCommand Commandant of the Marine Corps
->>> ds.ent_links.head()
-                                  left                                    right
-0  http://dbpedia.org/resource/E123186    http://www.wikidata.org/entity/Q21197
-1  http://dbpedia.org/resource/E228902  http://www.wikidata.org/entity/Q5909974
-2  http://dbpedia.org/resource/E718575   http://www.wikidata.org/entity/Q707008
-3  http://dbpedia.org/resource/E469216  http://www.wikidata.org/entity/Q1471945
-4  http://dbpedia.org/resource/E649433  http://www.wikidata.org/entity/Q1198381
+
+The gold standard entity links are stored as an [eche](https://github.com/dobraczka/eche) `ClusterHelper`, which provides convenient functionality:
+
+>>> ds.ent_links.clusters[0]
+{'http://www.wikidata.org/entity/Q21197', 'http://dbpedia.org/resource/E123186'}
+>>> ('http://www.wikidata.org/entity/Q21197', 'http://dbpedia.org/resource/E123186') in ds.ent_links
+True
+>>> ('http://dbpedia.org/resource/E123186', 'http://www.wikidata.org/entity/Q21197') in ds.ent_links
+True
+>>> ds.ent_links.links('http://www.wikidata.org/entity/Q21197')
+'http://dbpedia.org/resource/E123186'
+>>> ds.ent_links.all_pairs()
+
 ```
-You can get a canonical name for a dataset instance to use e.g. to create folders to store experiment results:
+Most datasets are binary matching tasks, but some, such as the `MovieGraphBenchmark`, provide a multi-source setting:
 ```
- >>> ds.canonical_name
- 'openea_d_w_15k_v1'
+>>> ds = MovieGraphBenchmark(graph_pair="multi")
+>>> ds
+MovieGraphBenchmark(backend=pandas,graph_pair=multi, rel_triples_0=17507, attr_triples_0=20800 rel_triples_1=27903, attr_triples_1=23761 rel_triples_2=15455, attr_triples_2=20902, ent_links=3598, folds=5)
+>>> ds.dataset_names
+('imdb', 'tmdb', 'tvdb')
+```
+
+Here, the [`PrefixedClusterHelper`](https://eche.readthedocs.io/en/latest/reference/eche/#eche.PrefixedClusterHelper) provides various convenience functions:
+
 ```
+# Get matching pairs between a specific pair of datasets
+>>> list(ds.ent_links.pairs_in_ds_tuple(("imdb","tmdb")))[0]
+('https://www.scads.de/movieBenchmark/resource/IMDB/nm0641721', 'https://www.scads.de/movieBenchmark/resource/TMDB/person1236714')
+
+# Get the number of intra-dataset pairs
+>>> ds.ent_links.number_of_intra_links
+(1, 64, 22663)
+```
+
+For all datasets you can get a canonical name for a dataset instance, which can be used e.g. to create folders to store experiment results:
+
+```
+>>> ds.canonical_name
+'openea_d_w_15k_v1'
 ```
 
 You can use [dask](https://www.dask.org/) as backend for larger datasets:
@@ -127,3 +154,16 @@ Datasets
 | [MED-BBK](https://sylloge.readthedocs.io/en/latest/source/datasets.html#sylloge.MED_BBK) | 2020 | 1 | Baidu Baike | [Paper](https://aclanthology.org/2020.coling-industry.17.pdf), [Repo](https://github.com/ZihengZZH/industry-eval-EA/tree/main#benchmark) |
 | [MovieGraphBenchmark](https://sylloge.readthedocs.io/en/latest/source/datasets.html#sylloge.MovieGraphBenchmark) | 2022 | 3 | IMDB, TMDB, TheTVDB | [Paper](http://ceur-ws.org/Vol-2873/paper8.pdf), [Repo](https://github.com/ScaDS/MovieGraphBenchmark) |
 | [OAEI](https://sylloge.readthedocs.io/en/latest/source/datasets.html#sylloge.OAEI) | 2022 | 5 | Fandom wikis | [Paper](https://ceur-ws.org/Vol-3324/oaei22_paper0.pdf), [Website](http://oaei.ontologymatching.org/2022/knowledgegraph/index.html) |
+
+Broader statistics are provided in `dataset_statistics.csv`. You can also get a pandas DataFrame with statistics for specific datasets, for example to create tables for publications:
+```
+>>> ds = MovieGraphBenchmark(graph_pair="multi")
+>>> from sylloge.base import create_statistics_df
+>>> stats_df = create_statistics_df([ds])
+>>> stats_df.loc[("MovieGraphBenchmark","moviegraphbenchmark_multi","imdb")]
+                                                               Entities  Relation Triples  Attribute Triples  ...  Clusters  Intra-dataset Matches  All Matches
+Dataset family      Task Name                 Dataset Name                                                    ...
+MovieGraphBenchmark moviegraphbenchmark_multi imdb                 5129             17507              20800  ...      3598                      1        31230
+
+[1 rows x 9 columns]
+```
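Since `ent_links` is no longer a plain two-column DataFrame, downstream code that expects one needs a small adapter. Below is a minimal sketch, assuming `all_pairs()` yields plain 2-tuples for a binary task; the `pairs_df` name and the `left`/`right` column labels are illustrative and not part of the sylloge API, and the orientation of each pair is not guaranteed to match the left/right graphs.

```python
# Minimal sketch: rebuild a two-column link table from the ClusterHelper-based
# ent_links. Assumes a binary task where all_pairs() yields 2-tuples; the
# column names are illustrative, not part of the sylloge API.
import pandas as pd

from sylloge import OpenEA

ds = OpenEA()  # the D-W 15K V1 task shown in the example above
pairs_df = pd.DataFrame(ds.ent_links.all_pairs(), columns=["left", "right"])
print(pairs_df.head())
```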
diff --git a/docs/index.rst b/docs/index.rst
index 8f6087a..1f9505e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -23,13 +23,42 @@ This simple library aims to collect entity-alignment benchmark datasets and make
     # 2 http://dbpedia.org/resource/E840454 http://dbpedia.org/ontology/activeYearsStartYear 1948^^
     # 3 http://dbpedia.org/resource/E971710 http://purl.org/dc/elements/1.1/description English singer-songwriter
     # 4 http://dbpedia.org/resource/E022831 http://dbpedia.org/ontology/militaryCommand Commandant of the Marine Corps
-    print(ds.ent_links.head())
-    #                                   left                                    right
-    # 0   http://dbpedia.org/resource/E123186    http://www.wikidata.org/entity/Q21197
-    # 1   http://dbpedia.org/resource/E228902  http://www.wikidata.org/entity/Q5909974
-    # 2   http://dbpedia.org/resource/E718575   http://www.wikidata.org/entity/Q707008
-    # 3   http://dbpedia.org/resource/E469216  http://www.wikidata.org/entity/Q1471945
-    # 4   http://dbpedia.org/resource/E649433  http://www.wikidata.org/entity/Q1198381
+
+The gold standard entity links are stored as an `eche <https://github.com/dobraczka/eche>`_ ``ClusterHelper``, which provides convenient functionality:
+
+.. code-block:: python
+
+    print(ds.ent_links.clusters[0])
+    # {'http://www.wikidata.org/entity/Q21197', 'http://dbpedia.org/resource/E123186'}
+    print(('http://www.wikidata.org/entity/Q21197', 'http://dbpedia.org/resource/E123186') in ds.ent_links)
+    # True
+    print(('http://dbpedia.org/resource/E123186', 'http://www.wikidata.org/entity/Q21197') in ds.ent_links)
+    # True
+    print(ds.ent_links.links('http://www.wikidata.org/entity/Q21197'))
+    # 'http://dbpedia.org/resource/E123186'
+    print(ds.ent_links.all_pairs())
+    #
+
+Most datasets are binary matching tasks, but some, such as the ``MovieGraphBenchmark``, provide a multi-source setting:
+
+.. code-block:: python
+
+    ds = MovieGraphBenchmark(graph_pair="multi")
+    print(ds)
+    # MovieGraphBenchmark(backend=pandas,graph_pair=multi, rel_triples_0=17507, attr_triples_0=20800 rel_triples_1=27903, attr_triples_1=23761 rel_triples_2=15455, attr_triples_2=20902, ent_links=3598, folds=5)
+    print(ds.dataset_names)
+    # ('imdb', 'tmdb', 'tvdb')
+
+Here, the `PrefixedClusterHelper <https://eche.readthedocs.io/en/latest/reference/eche/#eche.PrefixedClusterHelper>`_ provides various convenience functions:
+
+.. code-block:: python
+
+    # Get matching pairs between a specific pair of datasets
+    print(list(ds.ent_links.pairs_in_ds_tuple(("imdb","tmdb")))[0])
+    # ('https://www.scads.de/movieBenchmark/resource/IMDB/nm0641721', 'https://www.scads.de/movieBenchmark/resource/TMDB/person1236714')
+
+    # Get the number of intra-dataset pairs
+    print(ds.ent_links.number_of_intra_links)
+    # (1, 64, 22663)
 
 You can get a canonical name for a dataset instance to use e.g. to create folders to store experiment results:
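The `canonical_name` mentioned in both documents is handy for keeping experiment artifacts apart. Below is a minimal sketch of that folder-per-dataset idea, relying only on the documented `canonical_name` attribute; the `experiment_results` base directory and the file name are illustrative.

```python
# Minimal sketch: one results folder per dataset, keyed by canonical_name.
# The base directory and file name are illustrative only.
from pathlib import Path

from sylloge import MovieGraphBenchmark

ds = MovieGraphBenchmark(graph_pair="multi")
results_dir = Path("experiment_results") / ds.canonical_name
results_dir.mkdir(parents=True, exist_ok=True)
(results_dir / "metrics.json").write_text("{}")  # placeholder for run outputs
```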
diff --git a/sylloge/__init__.py b/sylloge/__init__.py
index 76725ff..f0f3db5 100644
--- a/sylloge/__init__.py
+++ b/sylloge/__init__.py
@@ -10,6 +10,7 @@
     TrainTestValSplit,
     ZipEADataset,
     ZipEADatasetWithPreSplitFolds,
+    create_statistics_df,
 )
 from .id_mapped import IdMappedEADataset
 from .med_bbk_loader import MED_BBK
@@ -34,6 +35,7 @@
     "BinaryZipEADatasetWithPreSplitFolds",
     "ZipEADatasetWithPreSplitFolds",
     "TrainTestValSplit",
+    "create_statistics_df",
 ]
 __version__ = version(__package__)
 logging.getLogger(__name__).setLevel(logging.INFO)
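With `create_statistics_df` exported from the package root, the statistics example from the README no longer needs the `sylloge.base` import path. A minimal sketch of the shortened import; the `stats.csv` file name is illustrative.

```python
# Minimal sketch: after this change, create_statistics_df can be imported
# directly from sylloge instead of sylloge.base.
from sylloge import MovieGraphBenchmark, create_statistics_df

stats_df = create_statistics_df([MovieGraphBenchmark(graph_pair="multi")])
stats_df.to_csv("stats.csv")  # e.g. as a starting point for a publication table
```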