Skip to content

Commit

Permalink
Add dataset statistics (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
dobraczka authored Nov 2, 2023
1 parent 0bb288d commit 86df934
Show file tree
Hide file tree
Showing 3 changed files with 276 additions and 63 deletions.
51 changes: 51 additions & 0 deletions dataset_statistics.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Dataset family,Task Name,Dataset Name,Entities,Relation Triples,Attribute Triples,Relations,Properties,Literals,Matches
OpenEA,openea_d_w_15k_v1,DBpedia,15000,38265,52134,248,341,28236,15000
OpenEA,openea_d_w_15k_v1,Wikidata,15000,42746,138246,169,649,118515,15000
OpenEA,openea_d_w_15k_v2,DBpedia,15000,73983,51378,167,174,25690,15000
OpenEA,openea_d_w_15k_v2,Wikidata,15000,83365,175686,121,457,146977,15000
OpenEA,openea_d_y_15k_v1,DBpedia,15000,30291,52093,165,256,25297,15000
OpenEA,openea_d_y_15k_v1,YAGO,15000,26638,117114,28,34,105710,15000
OpenEA,openea_d_y_15k_v2,DBpedia,15000,68063,49602,72,89,22561,15000
OpenEA,openea_d_y_15k_v2,YAGO,15000,60970,116151,21,19,104546,15000
OpenEA,openea_en_de_15k_v1,DBpedia_EN,15000,47676,62403,215,285,28973,15000
OpenEA,openea_en_de_15k_v1,DBpedia_DE,15000,50419,133776,131,193,35630,15000
OpenEA,openea_en_de_15k_v2,DBpedia_EN,15000,84867,59511,169,170,23831,15000
OpenEA,openea_en_de_15k_v2,DBpedia_DE,15000,92632,161315,96,115,33185,15000
OpenEA,openea_en_fr_15k_v1,DBpedia_EN,15000,47334,57164,267,307,30281,15000
OpenEA,openea_en_fr_15k_v1,DBpedia_FR,15000,40864,54401,210,403,28760,15000
OpenEA,openea_en_fr_15k_v2,DBpedia_EN,15000,96318,52396,193,188,22761,15000
OpenEA,openea_en_fr_15k_v2,DBpedia_FR,15000,80112,56114,166,220,21645,15000
OpenEA,openea_d_w_100k_v1,DBpedia,100000,293990,334911,413,492,133931,100000
OpenEA,openea_d_w_100k_v1,Wikidata,100000,251708,687860,261,874,542921,100000
OpenEA,openea_d_w_100k_v2,DBpedia,100000,616457,360696,318,327,137483,100000
OpenEA,openea_d_w_100k_v2,Wikidata,100000,588203,878219,239,760,682367,100000
OpenEA,openea_d_y_100k_v1,DBpedia,100000,294188,360415,287,378,101386,100000
OpenEA,openea_d_y_100k_v1,YAGO,100000,400518,649787,32,37,497633,100000
OpenEA,openea_d_y_100k_v2,DBpedia,100000,576547,374785,230,276,97433,100000
OpenEA,openea_d_y_100k_v2,YAGO,100000,865265,755161,31,35,578596,100000
OpenEA,openea_en_de_100k_v1,DBpedia_EN,100000,335359,423666,381,450,147142,100000
OpenEA,openea_en_de_100k_v1,DBpedia_DE,100000,336240,586207,196,251,199527,100000
OpenEA,openea_en_de_100k_v2,DBpedia_EN,100000,622588,430752,323,325,139867,100000
OpenEA,openea_en_de_100k_v2,DBpedia_DE,100000,629395,656458,170,188,200356,100000
OpenEA,openea_en_fr_100k_v1,DBpedia_EN,100000,309607,384248,400,465,145103,100000
OpenEA,openea_en_fr_100k_v1,DBpedia_FR,100000,258285,340725,300,518,157791,100000
OpenEA,openea_en_fr_100k_v2,DBpedia_EN,100000,649902,396150,379,363,145382,100000
OpenEA,openea_en_fr_100k_v2,DBpedia_FR,100000,561391,342768,287,467,157564,100000
MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,9162
MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,9162
MovieGraphBenchmark,moviegraphbenchmark_imdb_tmdb,imdb,5129,17507,20800,3,13,6082,1978
MovieGraphBenchmark,moviegraphbenchmark_imdb_tmdb,tmdb,6061,27903,23761,4,30,9991,1978
MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,imdb,5129,17507,20800,3,13,6082,2488
MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,tvdb,7814,15455,20902,3,9,7683,2488
MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tmdb,6061,27903,23761,4,30,9991,2483
MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tvdb,7814,15455,20902,3,9,7683,2483
OAEI,oaei_starwars_swg,starwars,536869,6675247,1570786,561,603,622454,1096
OAEI,oaei_starwars_swg,swg,47692,178085,76269,50,146,32765,1096
OAEI,oaei_starwars_swtor,starwars,536869,6675247,1570786,561,603,622454,1358
OAEI,oaei_starwars_swtor,swtor,22791,105543,40605,137,346,16984,1358
OAEI,oaei_marvelcinematicuniverse_marvel,marvelcinematicuniverse,216033,1094598,130517,130,110,56566,1654
OAEI,oaei_marvelcinematicuniverse_marvel,marvel,1472619,5152898,1580468,63,127,749980,1654
OAEI,oaei_memoryalpha_memorybeta,memoryalpha,254537,2096198,430730,180,287,226110,9296
OAEI,oaei_memoryalpha_memorybeta,memorybeta,212302,2048728,494181,327,332,231196,9296
OAEI,oaei_memoryalpha_stexpanded,memoryalpha,254537,2096198,430730,180,287,226110,1725
OAEI,oaei_memoryalpha_stexpanded,stexpanded,55402,412179,155207,133,194,70310,1725
133 changes: 123 additions & 10 deletions sylloge/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Any,
Dict,
Generic,
Iterable,
Literal,
Mapping,
Optional,
Expand All @@ -26,7 +27,14 @@
from slugify import slugify

from .dask import read_dask_df_archive_csv
from .typing import BACKEND_LITERAL, COLUMNS, EA_SIDES
from .typing import (
BACKEND_LITERAL,
COLUMNS,
EA_SIDES,
LABEL_HEAD,
LABEL_RELATION,
LABEL_TAIL,
)
from .utils import fix_dataclass_init_docs

BASE_DATASET_KEY = "sylloge"
Expand All @@ -42,6 +50,20 @@
logger = logging.getLogger(__name__)


@dataclass
class DatasetStatistics:
    """Count-based summary of one side of an entity alignment dataset.

    Every field is a plain integer count; ``triples`` is derived from the
    two triple counts.
    """

    # number of relation triples
    rel_triples: int
    # number of attribute triples
    attr_triples: int
    # number of distinct entities
    entities: int
    # number of distinct relation labels
    relations: int
    # number of distinct attribute property labels
    properties: int
    # number of distinct literal values
    literals: int

    @property
    def triples(self) -> int:
        """Return the total number of triples (relation plus attribute)."""
        total = self.rel_triples
        total += self.attr_triples
        return total


@fix_dataclass_init_docs
@dataclass
class TrainTestValSplit(Generic[DataFrameType]):
Expand Down Expand Up @@ -134,20 +156,50 @@ def canonical_name(self) -> str:
assert isinstance(name, str) # for mypy
return slugify(name, separator="_")

def _statistics_side(self, left: bool) -> DatasetStatistics:
    """Compute the statistics for one side of the dataset.

    :param left: if true use the left triple frames, otherwise the right ones
    :return: triple counts and distinct-value counts for the chosen side
    """
    attr_frame, rel_frame = (
        (self.attr_triples_left, self.rel_triples_left)
        if left
        else (self.attr_triples_right, self.rel_triples_right)
    )

    attr_count = len(attr_frame)
    rel_count = len(rel_frame)

    # An entity is anything that occurs as head of an attribute triple or as
    # head or tail of a relation triple.
    entity_ids = (
        set(attr_frame[LABEL_HEAD])
        | set(rel_frame[LABEL_HEAD])
        | set(rel_frame[LABEL_TAIL])
    )
    entity_count = len(entity_ids)

    # Literals are the distinct tails of attribute triples; relations and
    # properties are the distinct predicate labels of the respective frame.
    literal_count = len(set(attr_frame[LABEL_TAIL]))
    relation_count = len(set(rel_frame[LABEL_RELATION]))
    property_count = len(set(attr_frame[LABEL_RELATION]))

    return DatasetStatistics(
        rel_triples=rel_count,
        attr_triples=attr_count,
        entities=entity_count,
        relations=relation_count,
        properties=property_count,
        literals=literal_count,
    )

def statistics(self) -> Tuple[DatasetStatistics, DatasetStatistics, int]:
    """Summarise both sides of the dataset together with its gold standard.

    :return: statistics of left dataset, statistics of right dataset and
        number of gold standard matches (the length of ``self.ent_links``)
    """
    left_stats = self._statistics_side(True)
    right_stats = self._statistics_side(False)
    num_matches = len(self.ent_links)
    return left_stats, right_stats, num_matches

@property
def _param_repr(self) -> str:
    """Return the parameter part that ``__repr__`` embeds in its output.

    This implementation always raises; presumably concrete datasets
    override it (confirm against the subclasses).

    :raises NotImplementedError: always
    """
    raise NotImplementedError()

@property
def _statistics(self) -> str:
    """Return the size part of the ``repr`` as ``key=value`` pairs.

    Lengths are only reported when ``rel_triples_left`` is a pandas
    ``DataFrame``; for any other frame type every size is rendered as
    ``unknown_len`` so that building the string never measures the frames.

    :return: comma separated sizes of the triple frames, entity links and folds
    """
    if not isinstance(self.rel_triples_left, pd.DataFrame):
        placeholder = "unknown_len"
        fold_info = placeholder if self.folds else None
        return f"rel_triples_left={placeholder}, rel_triples_right={placeholder}, attr_triples_left={placeholder}, attr_triples_right={placeholder}, ent_links={placeholder}, folds={fold_info}"

    num_rel_left = len(self.rel_triples_left)
    num_rel_right = len(self.rel_triples_right)
    num_attr_left = len(self.attr_triples_left)
    num_attr_right = len(self.attr_triples_right)
    num_links = len(self.ent_links)
    # folds may be None or empty, in which case no fold count is shown
    num_folds = len(self.folds) if self.folds else None  # type: ignore
    return f"rel_triples_left={num_rel_left}, rel_triples_right={num_rel_right}, attr_triples_left={num_attr_left}, attr_triples_right={num_attr_right}, ent_links={num_links}, folds={num_folds}"

def __repr__(self) -> str:
    """Return a one-line description of the dataset.

    The result has the form ``ClassName(backend=..., <params><sizes>)`` where
    the parameter part comes from ``self._param_repr`` and the size part from
    ``self._statistics``.

    :return: the textual representation
    """
    # The statements that used to follow this return were unreachable and
    # have been dropped; the size part stays with ``_statistics``.
    return f"{self.__class__.__name__}(backend={self.backend}, {self._param_repr}{self._statistics})"

def _additional_backend_handling(self, backend: BACKEND_LITERAL):
pass
Expand Down Expand Up @@ -675,3 +727,64 @@ def initial_read(self, backend: BACKEND_LITERAL):
)
folds.append(TrainTestValSplit(train=train, test=test, val=val))
return {**super().initial_read(backend=backend), **dict(folds=folds)}


def create_statistics_df(
    datasets: Iterable[EADataset], seperate_attribute_relations: bool = True
):
    """Collect the statistics of several datasets in one table.

    Each dataset contributes one row per side, paired with the entries of
    its ``dataset_names``.

    :param datasets: the datasets to summarise
    :param seperate_attribute_relations: if true report relation and attribute
        triples in two columns, otherwise a single combined ``Triples`` column
    :return: a pandas ``DataFrame`` indexed by dataset family, task name and
        dataset name
    """
    index_cols = ["Dataset family", "Task Name", "Dataset Name"]
    if seperate_attribute_relations:
        triples_col = ["Relation Triples", "Attribute Triples"]
    else:
        triples_col = ["Triples"]
    columns = (
        index_cols
        + ["Entities"]
        + triples_col
        + ["Relations", "Properties", "Literals", "Matches"]
    )

    rows = []
    for ds in datasets:
        # the dataset family is the name of the dataset's class
        ds_family = str(ds.__class__.__name__).rsplit(".", 1)[-1]
        left_stats, right_stats, num_ent_links = ds.statistics()
        for side_stats, side_name in zip(
            (left_stats, right_stats), ds.dataset_names
        ):
            row = [ds_family, ds.canonical_name, side_name, side_stats.entities]
            if seperate_attribute_relations:
                row += [side_stats.rel_triples, side_stats.attr_triples]
            else:
                row.append(side_stats.triples)
            row += [
                side_stats.relations,
                side_stats.properties,
                side_stats.literals,
                num_ent_links,
            ]
            rows.append(row)

    table = pd.DataFrame(rows, columns=columns)
    return table.set_index(index_cols)
Loading

0 comments on commit 86df934

Please sign in to comment.