Skip to content

Commit

Permalink
add datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
Samoed committed Nov 18, 2024
1 parent 88c38d1 commit 9208f84
Show file tree
Hide file tree
Showing 12 changed files with 565 additions and 0 deletions.
92 changes: 92 additions & 0 deletions mteb/descriptive_stats/Classification/ArxivClassification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"test": {
"num_samples": 2500,
"number_of_characters": 137209409,
"number_texts_intersect_with_train": 159,
"min_text_length": 3895,
"average_text_length": 54883.7636,
"max_text_length": 559979,
"unique_text": 2495,
"unique_labels": 11,
"labels": {
"4": {
"count": 234
},
"1": {
"count": 194
},
"7": {
"count": 236
},
"3": {
"count": 233
},
"9": {
"count": 219
},
"5": {
"count": 196
},
"2": {
"count": 205
},
"10": {
"count": 212
},
"8": {
"count": 318
},
"0": {
"count": 212
},
"6": {
"count": 241
}
}
},
"train": {
"num_samples": 28388,
"number_of_characters": 1602729054,
"number_texts_intersect_with_train": null,
"min_text_length": 2852,
"average_text_length": 56457.97710300127,
"max_text_length": 2553775,
"unique_text": 27321,
"unique_labels": 11,
"labels": {
"8": {
"count": 3527
},
"9": {
"count": 2560
},
"3": {
"count": 2631
},
"5": {
"count": 2117
},
"1": {
"count": 2137
},
"6": {
"count": 2443
},
"0": {
"count": 2456
},
"10": {
"count": 2581
},
"7": {
"count": 2768
},
"2": {
"count": 2569
},
"4": {
"count": 2599
}
}
}
}
38 changes: 38 additions & 0 deletions mteb/descriptive_stats/Classification/DKHateClassification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"test": {
"num_samples": 329,
"number_of_characters": 29011,
"number_texts_intersect_with_train": 4,
"min_text_length": 1,
"average_text_length": 88.17933130699087,
"max_text_length": 2434,
"unique_text": 326,
"unique_labels": 2,
"labels": {
"0": {
"count": 288
},
"1": {
"count": 41
}
}
},
"train": {
"num_samples": 2960,
"number_of_characters": 307722,
"number_texts_intersect_with_train": null,
"min_text_length": 1,
"average_text_length": 103.96013513513513,
"max_text_length": 5403,
"unique_text": 2902,
"unique_labels": 2,
"labels": {
"0": {
"count": 2576
},
"1": {
"count": 384
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"test": {
"num_samples": 2224,
"number_of_characters": 3209177,
"number_texts_intersect_with_train": 0,
"min_text_length": 4,
"average_text_length": 1442.9752697841727,
"max_text_length": 11140,
"unique_text": 2224,
"unique_labels": 2,
"labels": {
"1": {
"count": 1112
},
"0": {
"count": 1112
}
}
},
"train": {
"num_samples": 20028,
"number_of_characters": 29162515,
"number_texts_intersect_with_train": null,
"min_text_length": 4,
"average_text_length": 1456.0872278809666,
"max_text_length": 22676,
"unique_text": 20028,
"unique_labels": 2,
"labels": {
"1": {
"count": 10014
},
"0": {
"count": 10014
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"train": {
"num_samples": 2048,
"number_of_characters": 282368,
"number_texts_intersect_with_train": null,
"min_text_length": 11,
"average_text_length": 137.875,
"max_text_length": 2698,
"unique_text": 2044,
"unique_labels": 4,
"labels": {
"4": {
"count": 512
},
"3": {
"count": 512
},
"0": {
"count": 279
},
"1": {
"count": 745
}
}
}
}
26 changes: 26 additions & 0 deletions mteb/descriptive_stats/Classification/MyanmarNews.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"train": {
"num_samples": 2048,
"number_of_characters": 354794,
"number_texts_intersect_with_train": null,
"min_text_length": 2,
"average_text_length": 173.2392578125,
"max_text_length": 2268,
"unique_text": 2042,
"unique_labels": 4,
"labels": {
"2": {
"count": 523
},
"0": {
"count": 511
},
"3": {
"count": 507
},
"1": {
"count": 507
}
}
}
}
80 changes: 80 additions & 0 deletions mteb/descriptive_stats/Classification/PatentClassification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"test": {
"num_samples": 2048,
"number_of_characters": 38376596,
"number_texts_intersect_with_train": 9,
"min_text_length": 2168,
"average_text_length": 18738.572265625,
"max_text_length": 226050,
"unique_text": 2048,
"unique_labels": 9,
"labels": {
"7": {
"count": 424
},
"0": {
"count": 309
},
"6": {
"count": 453
},
"2": {
"count": 161
},
"1": {
"count": 266
},
"8": {
"count": 206
},
"4": {
"count": 64
},
"5": {
"count": 147
},
"3": {
"count": 18
}
}
},
"train": {
"num_samples": 25000,
"number_of_characters": 465511243,
"number_texts_intersect_with_train": null,
"min_text_length": 1551,
"average_text_length": 18620.44972,
"max_text_length": 331797,
"unique_text": 24950,
"unique_labels": 9,
"labels": {
"6": {
"count": 5408
},
"0": {
"count": 3614
},
"7": {
"count": 5321
},
"8": {
"count": 2562
},
"2": {
"count": 2099
},
"4": {
"count": 705
},
"1": {
"count": 3357
},
"3": {
"count": 204
},
"5": {
"count": 1730
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"validation": {
"num_samples": 2048,
"number_of_characters": 574726,
"number_texts_intersect_with_train": 503,
"min_text_length": 19,
"average_text_length": 280.6279296875,
"max_text_length": 4159,
"unique_text": 2032,
"unique_labels": 2,
"labels": {
"1": {
"count": 1027
},
"0": {
"count": 1021
}
}
},
"test": {
"num_samples": 2048,
"number_of_characters": 558426,
"number_texts_intersect_with_train": 505,
"min_text_length": 3,
"average_text_length": 272.6689453125,
"max_text_length": 4181,
"unique_text": 2028,
"unique_labels": 2,
"labels": {
"0": {
"count": 1022
},
"1": {
"count": 1026
}
}
},
"train": {
"num_samples": 62089,
"number_of_characters": 17328750,
"number_texts_intersect_with_train": null,
"min_text_length": 2,
"average_text_length": 279.0953308959719,
"max_text_length": 4995,
"unique_text": 51988,
"unique_labels": 2,
"labels": {
"0": {
"count": 31091
},
"1": {
"count": 30998
}
}
}
}
Loading

0 comments on commit 9208f84

Please sign in to comment.