From 0abe1a0563bfe6db822e6f30f7a9117330354a2f Mon Sep 17 00:00:00 2001 From: Imene Kerboua <33312980+imenelydiaker@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:24:33 +0100 Subject: [PATCH] Add descriptive stats to mising tasks and add number of qrels (#1476) * add code for comupting number of qrels * add stats fever hotpotqa msmarco topiocqa * miracl mrtidy * multilongdoc miracl reranking * add multi eurlex * fix tests for descriptive stats * fix tests --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> --- mteb/abstasks/AbsTaskRetrieval.py | 3 + .../MultiEURLEXMultilabelClassification.json | 3750 +++++++++++++++++ .../Reranking/MIRACLReranking.json | 536 +++ .../Retrieval/AlloprofRetrieval.json | 2 + mteb/descriptive_stats/Retrieval/FEVER.json | 30 + .../descriptive_stats/Retrieval/HotpotQA.json | 86 + .../Retrieval/MIRACLRetrieval.json | 536 +++ mteb/descriptive_stats/Retrieval/MSMARCO.json | 86 + .../Retrieval/MrTidyRetrieval.json | 340 ++ .../Retrieval/MultiLongDocRetrieval.json | 790 ++++ .../descriptive_stats/Retrieval/TopiOCQA.json | 30 + .../Retrieval/XPQARetrieval.json | 74 + tests/test_TaskMetadata.py | 9 - tests/test_benchmark/mock_tasks.py | 16 + 14 files changed, 6279 insertions(+), 9 deletions(-) create mode 100644 mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json create mode 100644 mteb/descriptive_stats/Reranking/MIRACLReranking.json create mode 100644 mteb/descriptive_stats/Retrieval/FEVER.json create mode 100644 mteb/descriptive_stats/Retrieval/HotpotQA.json create mode 100644 mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/MSMARCO.json create mode 100644 mteb/descriptive_stats/Retrieval/MrTidyRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/MultiLongDocRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/TopiOCQA.json diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 5345a50a5d..bc86928a53 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -58,6 +58,7 @@ class RetrievalDescriptiveStatistics(DescriptiveStatistics): num_samples: int num_queries: int num_documents: int + num_relevant_docs: int number_of_characters: int min_document_length: int @@ -419,6 +420,7 @@ def _calculate_metrics_from_split( query_len, doc_len = calculate_length(queries, corpus) num_documents = len(corpus) num_queries = len(queries) + num_relevant_docs = sum(len(relevant_docs[qid]) for qid in relevant_docs) none_queries = sum(q is None or len(q) == 0 for q in queries.values()) # create a list of number of relevant docs per query @@ -466,6 +468,7 @@ def _calculate_metrics_from_split( num_samples=num_documents + num_queries, num_queries=num_queries, num_documents=num_documents, + num_relevant_docs=num_relevant_docs, min_document_length=min(doc_len), average_document_length=sum(doc_len) / num_documents, max_document_length=max(doc_len), diff --git a/mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json b/mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json new file mode 100644 index 0000000000..9a5c08b6cc --- /dev/null +++ b/mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json @@ -0,0 +1,3750 @@ +{ + "test": { + "num_samples": 115000, + "number_of_characters": 1381657027, + "number_texts_intersect_with_train": 0, + "min_text_length": 563, + "average_text_length": 12014.408930434782, + "max_text_length": 1458188, + "unique_texts": 115000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 50784 + }, + "15": { + "count": 30981 + }, + "5": { + "count": 24978 + }, + "6": { + "count": 45080 + }, + "3": { + "count": 63687 + }, + "17": { + "count": 37743 + }, + "1": { + "count": 15019 + }, + "20": { + "count": 14030 + }, + "0": { + "count": 17802 + }, + "2": { + "count": 22402 + }, + "19": { + "count": 10212 + }, + "9": { + "count": 3772 + }, + "4": { + "count": 9062 + }, + "10": { + "count": 7705 + }, + "11": { + "count": 12213 + }, + "7": { + "count": 14306 + }, + "12": { + "count": 11799 + }, + "8": { + "count": 13800 + }, + "13": { + "count": 2346 + }, + "14": { + "count": 4255 + }, + "16": { + "count": 1311 + } + }, + "hf_subset_descriptive_stats": { + "en": { + "num_samples": 5000, + "number_of_characters": 58601463, + "number_texts_intersect_with_train": 0, + "min_text_length": 700, + "average_text_length": 11720.2926, + "max_text_length": 1269363, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "de": { + "num_samples": 5000, + "number_of_characters": 64327081, + "number_texts_intersect_with_train": 0, + "min_text_length": 688, + "average_text_length": 12865.4162, + "max_text_length": 1361562, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "fr": { + "num_samples": 5000, + "number_of_characters": 65405549, + "number_texts_intersect_with_train": 0, + "min_text_length": 676, + "average_text_length": 13081.1098, + "max_text_length": 1440461, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "it": { + "num_samples": 5000, + "number_of_characters": 63817393, + "number_texts_intersect_with_train": 0, + "min_text_length": 696, + "average_text_length": 12763.4786, + "max_text_length": 1404333, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "es": { + "num_samples": 5000, + "number_of_characters": 65401450, + "number_texts_intersect_with_train": 0, + "min_text_length": 683, + "average_text_length": 13080.29, + "max_text_length": 1458188, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "pl": { + "num_samples": 5000, + "number_of_characters": 61412963, + "number_texts_intersect_with_train": 0, + "min_text_length": 697, + "average_text_length": 12282.5926, + "max_text_length": 1381409, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "ro": { + "num_samples": 5000, + "number_of_characters": 64184661, + "number_texts_intersect_with_train": 0, + "min_text_length": 645, + "average_text_length": 12836.9322, + "max_text_length": 1450509, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "nl": { + "num_samples": 5000, + "number_of_characters": 64289871, + "number_texts_intersect_with_train": 0, + "min_text_length": 721, + "average_text_length": 12857.9742, + "max_text_length": 1442428, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "el": { + "num_samples": 5000, + "number_of_characters": 64990715, + "number_texts_intersect_with_train": 0, + "min_text_length": 695, + "average_text_length": 12998.143, + "max_text_length": 1436873, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "hu": { + "num_samples": 5000, + "number_of_characters": 62123205, + "number_texts_intersect_with_train": 0, + "min_text_length": 635, + "average_text_length": 12424.641, + "max_text_length": 1405731, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "pt": { + "num_samples": 5000, + "number_of_characters": 62412308, + "number_texts_intersect_with_train": 0, + "min_text_length": 662, + "average_text_length": 12482.4616, + "max_text_length": 1400357, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "cs": { + "num_samples": 5000, + "number_of_characters": 53917338, + "number_texts_intersect_with_train": 0, + "min_text_length": 563, + "average_text_length": 10783.4676, + "max_text_length": 1183634, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "sv": { + "num_samples": 5000, + "number_of_characters": 58062387, + "number_texts_intersect_with_train": 0, + "min_text_length": 660, + "average_text_length": 11612.4774, + "max_text_length": 1257482, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "bg": { + "num_samples": 5000, + "number_of_characters": 61177134, + "number_texts_intersect_with_train": 0, + "min_text_length": 661, + "average_text_length": 12235.4268, + "max_text_length": 1309869, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "da": { + "num_samples": 5000, + "number_of_characters": 58869790, + "number_texts_intersect_with_train": 0, + "min_text_length": 680, + "average_text_length": 11773.958, + "max_text_length": 1297978, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "fi": { + "num_samples": 5000, + "number_of_characters": 60438431, + "number_texts_intersect_with_train": 0, + "min_text_length": 707, + "average_text_length": 12087.6862, + "max_text_length": 1330363, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "sk": { + "num_samples": 5000, + "number_of_characters": 55654070, + "number_texts_intersect_with_train": 0, + "min_text_length": 595, + "average_text_length": 11130.814, + "max_text_length": 1229063, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "lt": { + "num_samples": 5000, + "number_of_characters": 56226783, + "number_texts_intersect_with_train": 0, + "min_text_length": 597, + "average_text_length": 11245.3566, + "max_text_length": 1274867, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "hr": { + "num_samples": 5000, + "number_of_characters": 55110710, + "number_texts_intersect_with_train": 0, + "min_text_length": 610, + "average_text_length": 11022.142, + "max_text_length": 1252581, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "sl": { + "num_samples": 5000, + "number_of_characters": 53100297, + "number_texts_intersect_with_train": 0, + "min_text_length": 573, + "average_text_length": 10620.0594, + "max_text_length": 1208117, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "et": { + "num_samples": 5000, + "number_of_characters": 54492156, + "number_texts_intersect_with_train": 0, + "min_text_length": 599, + "average_text_length": 10898.4312, + "max_text_length": 1370495, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "lv": { + "num_samples": 5000, + "number_of_characters": 54692551, + "number_texts_intersect_with_train": 0, + "min_text_length": 614, + "average_text_length": 10938.5102, + "max_text_length": 1230284, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + }, + "mt": { + "num_samples": 5000, + "number_of_characters": 62948721, + "number_texts_intersect_with_train": 0, + "min_text_length": 703, + "average_text_length": 12589.7442, + "max_text_length": 1403346, + "unique_texts": 5000, + "min_labels_per_text": 1, + "average_label_per_text": 3.5938, + "max_labels_per_text": 9, + "unique_labels": 21, + "labels": { + "18": { + "count": 2208 + }, + "15": { + "count": 1347 + }, + "5": { + "count": 1086 + }, + "6": { + "count": 1960 + }, + "3": { + "count": 2769 + }, + "17": { + "count": 1641 + }, + "1": { + "count": 653 + }, + "20": { + "count": 610 + }, + "0": { + "count": 774 + }, + "2": { + "count": 974 + }, + "19": { + "count": 444 + }, + "9": { + "count": 164 + }, + "4": { + "count": 394 + }, + "10": { + "count": 335 + }, + "11": { + "count": 531 + }, + "7": { + "count": 622 + }, + "12": { + "count": 513 + }, + "8": { + "count": 600 + }, + "13": { + "count": 102 + }, + "14": { + "count": 185 + }, + "16": { + "count": 57 + } + } + } + } + }, + "train": { + "num_samples": 817239, + "number_of_characters": 6311709460, + "number_texts_intersect_with_train": null, + "min_text_length": 450, + "average_text_length": 7723.211276015952, + "max_text_length": 939852, + "unique_texts": 817106, + "min_labels_per_text": 1, + "average_label_per_text": 3.279778620452524, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 85901 + }, + "20": { + "count": 55421 + }, + "7": { + "count": 71231 + }, + "3": { + "count": 445523 + }, + "0": { + "count": 105847 + }, + "2": { + "count": 131330 + }, + "17": { + "count": 392812 + }, + "19": { + "count": 96924 + }, + "6": { + "count": 293802 + }, + "12": { + "count": 63033 + }, + "18": { + "count": 316672 + }, + "4": { + "count": 74760 + }, + "5": { + "count": 128614 + }, + "10": { + "count": 34808 + }, + "8": { + "count": 55990 + }, + "15": { + "count": 216563 + }, + "14": { + "count": 17360 + }, + "9": { + "count": 31691 + }, + "11": { + "count": 39649 + }, + "13": { + "count": 9126 + }, + "16": { + "count": 13306 + } + }, + "hf_subset_descriptive_stats": { + "en": { + "num_samples": 55000, + "number_of_characters": 386261559, + "number_texts_intersect_with_train": null, + "min_text_length": 566, + "average_text_length": 7022.937436363636, + "max_text_length": 850450, + "unique_texts": 54986, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "de": { + "num_samples": 55000, + "number_of_characters": 415962273, + "number_texts_intersect_with_train": null, + "min_text_length": 592, + "average_text_length": 7562.950418181818, + "max_text_length": 888009, + "unique_texts": 54992, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "fr": { + "num_samples": 55000, + "number_of_characters": 423976667, + "number_texts_intersect_with_train": null, + "min_text_length": 551, + "average_text_length": 7708.666672727273, + "max_text_length": 926327, + "unique_texts": 54991, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "it": { + "num_samples": 55000, + "number_of_characters": 423891859, + "number_texts_intersect_with_train": null, + "min_text_length": 566, + "average_text_length": 7707.124709090909, + "max_text_length": 895850, + "unique_texts": 54992, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "es": { + "num_samples": 52785, + "number_of_characters": 423682977, + "number_texts_intersect_with_train": null, + "min_text_length": 569, + "average_text_length": 8026.57908496732, + "max_text_length": 939852, + "unique_texts": 52775, + "min_labels_per_text": 1, + "average_label_per_text": 3.2420384578952355, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 5414 + }, + "20": { + "count": 3043 + }, + "7": { + "count": 4066 + }, + "3": { + "count": 28995 + }, + "0": { + "count": 5887 + }, + "2": { + "count": 8557 + }, + "17": { + "count": 26280 + }, + "19": { + "count": 6704 + }, + "6": { + "count": 18832 + }, + "12": { + "count": 3541 + }, + "18": { + "count": 21935 + }, + "4": { + "count": 4870 + }, + "5": { + "count": 8222 + }, + "10": { + "count": 2053 + }, + "8": { + "count": 3261 + }, + "15": { + "count": 13176 + }, + "14": { + "count": 1050 + }, + "9": { + "count": 1892 + }, + "11": { + "count": 2188 + }, + "13": { + "count": 530 + }, + "16": { + "count": 635 + } + } + }, + "pl": { + "num_samples": 23197, + "number_of_characters": 191501869, + "number_texts_intersect_with_train": null, + "min_text_length": 538, + "average_text_length": 8255.458421347588, + "max_text_length": 834133, + "unique_texts": 23196, + "min_labels_per_text": 1, + "average_label_per_text": 3.327456136569384, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2228 + }, + "20": { + "count": 1999 + }, + "7": { + "count": 2407 + }, + "3": { + "count": 12498 + }, + "0": { + "count": 3717 + }, + "19": { + "count": 2289 + }, + "6": { + "count": 8410 + }, + "17": { + "count": 10886 + }, + "5": { + "count": 3669 + }, + "2": { + "count": 3816 + }, + "10": { + "count": 1107 + }, + "8": { + "count": 1866 + }, + "18": { + "count": 7637 + }, + "15": { + "count": 6788 + }, + "4": { + "count": 2037 + }, + "14": { + "count": 517 + }, + "9": { + "count": 1020 + }, + "13": { + "count": 304 + }, + "12": { + "count": 2159 + }, + "11": { + "count": 1415 + }, + "16": { + "count": 418 + } + } + }, + "ro": { + "num_samples": 15921, + "number_of_characters": 157122999, + "number_texts_intersect_with_train": null, + "min_text_length": 650, + "average_text_length": 9868.915206331261, + "max_text_length": 882427, + "unique_texts": 15920, + "min_labels_per_text": 1, + "average_label_per_text": 3.434143583945732, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 1801 + }, + "20": { + "count": 1721 + }, + "7": { + "count": 2032 + }, + "3": { + "count": 8085 + }, + "0": { + "count": 3121 + }, + "19": { + "count": 1596 + }, + "6": { + "count": 6154 + }, + "2": { + "count": 2293 + }, + "5": { + "count": 2626 + }, + "10": { + "count": 910 + }, + "8": { + "count": 1516 + }, + "18": { + "count": 5269 + }, + "15": { + "count": 5020 + }, + "4": { + "count": 1525 + }, + "17": { + "count": 6103 + }, + "14": { + "count": 413 + }, + "9": { + "count": 765 + }, + "13": { + "count": 247 + }, + "12": { + "count": 1891 + }, + "11": { + "count": 1224 + }, + "16": { + "count": 363 + } + } + }, + "nl": { + "num_samples": 55000, + "number_of_characters": 426734054, + "number_texts_intersect_with_train": null, + "min_text_length": 590, + "average_text_length": 7758.800981818182, + "max_text_length": 921418, + "unique_texts": 54987, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "el": { + "num_samples": 55000, + "number_of_characters": 428853513, + "number_texts_intersect_with_train": null, + "min_text_length": 598, + "average_text_length": 7797.3366, + "max_text_length": 930674, + "unique_texts": 54988, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "hu": { + "num_samples": 22664, + "number_of_characters": 187808803, + "number_texts_intersect_with_train": null, + "min_text_length": 552, + "average_text_length": 8286.657386163079, + "max_text_length": 853678, + "unique_texts": 22663, + "min_labels_per_text": 1, + "average_label_per_text": 3.3263766325450055, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2159 + }, + "20": { + "count": 1959 + }, + "7": { + "count": 2365 + }, + "3": { + "count": 12159 + }, + "0": { + "count": 3608 + }, + "19": { + "count": 2236 + }, + "6": { + "count": 8188 + }, + "17": { + "count": 10693 + }, + "5": { + "count": 3555 + }, + "18": { + "count": 7423 + }, + "10": { + "count": 1067 + }, + "14": { + "count": 510 + }, + "15": { + "count": 6643 + }, + "8": { + "count": 1838 + }, + "9": { + "count": 1014 + }, + "2": { + "count": 3783 + }, + "13": { + "count": 302 + }, + "4": { + "count": 1985 + }, + "12": { + "count": 2114 + }, + "11": { + "count": 1382 + }, + "16": { + "count": 406 + } + } + }, + "pt": { + "num_samples": 52370, + "number_of_characters": 403330428, + "number_texts_intersect_with_train": null, + "min_text_length": 546, + "average_text_length": 7701.554859652473, + "max_text_length": 900744, + "unique_texts": 52356, + "min_labels_per_text": 1, + "average_label_per_text": 3.249665839220928, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 5376 + }, + "20": { + "count": 3032 + }, + "7": { + "count": 4035 + }, + "3": { + "count": 28786 + }, + "0": { + "count": 5852 + }, + "2": { + "count": 8513 + }, + "17": { + "count": 26076 + }, + "19": { + "count": 6673 + }, + "6": { + "count": 18764 + }, + "12": { + "count": 3521 + }, + "18": { + "count": 21803 + }, + "4": { + "count": 4859 + }, + "5": { + "count": 8188 + }, + "10": { + "count": 2048 + }, + "8": { + "count": 3254 + }, + "15": { + "count": 13146 + }, + "14": { + "count": 1044 + }, + "9": { + "count": 1881 + }, + "11": { + "count": 2179 + }, + "13": { + "count": 519 + }, + "16": { + "count": 636 + } + } + }, + "cs": { + "num_samples": 23187, + "number_of_characters": 168437584, + "number_texts_intersect_with_train": null, + "min_text_length": 450, + "average_text_length": 7264.311208867038, + "max_text_length": 743409, + "unique_texts": 23186, + "min_labels_per_text": 1, + "average_label_per_text": 3.3279855091214903, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2230 + }, + "20": { + "count": 1995 + }, + "7": { + "count": 2407 + }, + "3": { + "count": 12501 + }, + "0": { + "count": 3719 + }, + "19": { + "count": 2282 + }, + "6": { + "count": 8402 + }, + "17": { + "count": 10880 + }, + "5": { + "count": 3664 + }, + "2": { + "count": 3818 + }, + "10": { + "count": 1106 + }, + "8": { + "count": 1868 + }, + "18": { + "count": 7630 + }, + "15": { + "count": 6783 + }, + "4": { + "count": 2041 + }, + "14": { + "count": 523 + }, + "9": { + "count": 1020 + }, + "13": { + "count": 305 + }, + "12": { + "count": 2159 + }, + "11": { + "count": 1416 + }, + "16": { + "count": 417 + } + } + }, + "sv": { + "num_samples": 42490, + "number_of_characters": 314595142, + "number_texts_intersect_with_train": null, + "min_text_length": 553, + "average_text_length": 7403.980748411391, + "max_text_length": 808204, + "unique_texts": 42482, + "min_labels_per_text": 1, + "average_label_per_text": 3.3235820192986587, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 3803 + }, + "20": { + "count": 2729 + }, + "7": { + "count": 3536 + }, + "3": { + "count": 23831 + }, + "0": { + "count": 5288 + }, + "2": { + "count": 7279 + }, + "17": { + "count": 22229 + }, + "19": { + "count": 5071 + }, + "6": { + "count": 15684 + }, + "12": { + "count": 3140 + }, + "5": { + "count": 7025 + }, + "18": { + "count": 16072 + }, + "10": { + "count": 1720 + }, + "8": { + "count": 2864 + }, + "15": { + "count": 11415 + }, + "4": { + "count": 3929 + }, + "14": { + "count": 871 + }, + "9": { + "count": 1716 + }, + "11": { + "count": 1954 + }, + "13": { + "count": 465 + }, + "16": { + "count": 598 + } + } + }, + "bg": { + "num_samples": 15986, + "number_of_characters": 152499367, + "number_texts_intersect_with_train": null, + "min_text_length": 604, + "average_text_length": 9539.557550356561, + "max_text_length": 798373, + "unique_texts": 15985, + "min_labels_per_text": 1, + "average_label_per_text": 3.4323783310396596, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 1810 + }, + "20": { + "count": 1730 + }, + "7": { + "count": 2059 + }, + "3": { + "count": 8104 + }, + "0": { + "count": 3125 + }, + "19": { + "count": 1599 + }, + "6": { + "count": 6159 + }, + "2": { + "count": 2294 + }, + "5": { + "count": 2629 + }, + "10": { + "count": 915 + }, + "8": { + "count": 1573 + }, + "18": { + "count": 5273 + }, + "15": { + "count": 5052 + }, + "4": { + "count": 1526 + }, + "17": { + "count": 6106 + }, + "14": { + "count": 412 + }, + "9": { + "count": 768 + }, + "13": { + "count": 250 + }, + "12": { + "count": 1896 + }, + "11": { + "count": 1226 + }, + "16": { + "count": 364 + } + } + }, + "da": { + "num_samples": 55000, + "number_of_characters": 387088427, + "number_texts_intersect_with_train": null, + "min_text_length": 544, + "average_text_length": 7037.9714, + "max_text_length": 839799, + "unique_texts": 54995, + "min_labels_per_text": 1, + "average_label_per_text": 3.231618181818182, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 6150 + }, + "20": { + "count": 3096 + }, + "7": { + "count": 4175 + }, + "3": { + "count": 30222 + }, + "0": { + "count": 6056 + }, + "2": { + "count": 8803 + }, + "17": { + "count": 26931 + }, + "19": { + "count": 7065 + }, + "6": { + "count": 19431 + }, + "12": { + "count": 3640 + }, + "18": { + "count": 22975 + }, + "4": { + "count": 5065 + }, + "5": { + "count": 8444 + }, + "10": { + "count": 2151 + }, + "8": { + "count": 3317 + }, + "15": { + "count": 13519 + }, + "14": { + "count": 1122 + }, + "9": { + "count": 1926 + }, + "11": { + "count": 2233 + }, + "13": { + "count": 541 + }, + "16": { + "count": 877 + } + } + }, + "fi": { + "num_samples": 42497, + "number_of_characters": 320275075, + "number_texts_intersect_with_train": null, + "min_text_length": 555, + "average_text_length": 7536.416099959997, + "max_text_length": 818453, + "unique_texts": 42486, + "min_labels_per_text": 1, + "average_label_per_text": 3.323575781819893, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 3804 + }, + "20": { + "count": 2728 + }, + "7": { + "count": 3537 + }, + "3": { + "count": 23835 + }, + "0": { + "count": 5288 + }, + "2": { + "count": 7280 + }, + "17": { + "count": 22233 + }, + "19": { + "count": 5072 + }, + "6": { + "count": 15686 + }, + "12": { + "count": 3140 + }, + "5": { + "count": 7028 + }, + "18": { + "count": 16075 + }, + "10": { + "count": 1720 + }, + "8": { + "count": 2865 + }, + "15": { + "count": 11418 + }, + "4": { + "count": 3929 + }, + "14": { + "count": 871 + }, + "9": { + "count": 1716 + }, + "11": { + "count": 1954 + }, + "13": { + "count": 465 + }, + "16": { + "count": 598 + } + } + }, + "sk": { + "num_samples": 22971, + "number_of_characters": 171894895, + "number_texts_intersect_with_train": null, + "min_text_length": 461, + "average_text_length": 7483.126333202734, + "max_text_length": 764206, + "unique_texts": 22970, + "min_labels_per_text": 1, + "average_label_per_text": 3.3276304906186063, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2200 + }, + "20": { + "count": 1974 + }, + "7": { + "count": 2381 + }, + "3": { + "count": 12392 + }, + "0": { + "count": 3666 + }, + "19": { + "count": 2261 + }, + "6": { + "count": 8355 + }, + "17": { + "count": 10833 + }, + "5": { + "count": 3609 + }, + "2": { + "count": 3790 + }, + "10": { + "count": 1086 + }, + "8": { + "count": 1848 + }, + "18": { + "count": 7565 + }, + "15": { + "count": 6693 + }, + "4": { + "count": 2013 + }, + "14": { + "count": 517 + }, + "9": { + "count": 1009 + }, + "13": { + "count": 300 + }, + "12": { + "count": 2130 + }, + "11": { + "count": 1403 + }, + "16": { + "count": 414 + } + } + }, + "lt": { + "num_samples": 23188, + "number_of_characters": 174821647, + "number_texts_intersect_with_train": null, + "min_text_length": 509, + "average_text_length": 7539.315464895636, + "max_text_length": 806603, + "unique_texts": 23186, + "min_labels_per_text": 1, + "average_label_per_text": 3.32805761600828, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2233 + }, + "20": { + "count": 1999 + }, + "7": { + "count": 2403 + }, + "3": { + "count": 12497 + }, + "0": { + "count": 3716 + }, + "19": { + "count": 2288 + }, + "6": { + "count": 8409 + }, + "17": { + "count": 10873 + }, + "5": { + "count": 3662 + }, + "2": { + "count": 3819 + }, + "10": { + "count": 1107 + }, + "8": { + "count": 1867 + }, + "18": { + "count": 7638 + }, + "15": { + "count": 6782 + }, + "4": { + "count": 2042 + }, + "14": { + "count": 519 + }, + "9": { + "count": 1021 + }, + "13": { + "count": 302 + }, + "12": { + "count": 2162 + }, + "11": { + "count": 1414 + }, + "16": { + "count": 418 + } + } + }, + "hr": { + "num_samples": 7944, + "number_of_characters": 78244345, + "number_texts_intersect_with_train": null, + "min_text_length": 724, + "average_text_length": 9849.489551863042, + "max_text_length": 756731, + "unique_texts": 7944, + "min_labels_per_text": 1, + "average_label_per_text": 3.50365055387714, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 1153 + }, + "20": { + "count": 937 + }, + "7": { + "count": 1312 + }, + "3": { + "count": 4036 + }, + "0": { + "count": 1755 + }, + "2": { + "count": 719 + }, + "5": { + "count": 1694 + }, + "10": { + "count": 629 + }, + "8": { + "count": 907 + }, + "18": { + "count": 2663 + }, + "15": { + "count": 2944 + }, + "17": { + "count": 1983 + }, + "6": { + "count": 2648 + }, + "14": { + "count": 251 + }, + "19": { + "count": 737 + }, + "9": { + "count": 361 + }, + "13": { + "count": 167 + }, + "12": { + "count": 1211 + }, + "4": { + "count": 730 + }, + "11": { + "count": 754 + }, + "16": { + "count": 242 + } + } + }, + "sl": { + "num_samples": 23184, + "number_of_characters": 165759223, + "number_texts_intersect_with_train": null, + "min_text_length": 486, + "average_text_length": 7149.724939613527, + "max_text_length": 727123, + "unique_texts": 23183, + "min_labels_per_text": 1, + "average_label_per_text": 3.3279416839199447, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2234 + }, + "20": { + "count": 2001 + }, + "7": { + "count": 2406 + }, + "3": { + "count": 12489 + }, + "0": { + "count": 3718 + }, + "19": { + "count": 2280 + }, + "6": { + "count": 8392 + }, + "17": { + "count": 10862 + }, + "5": { + "count": 3670 + }, + "2": { + "count": 3813 + }, + "10": { + "count": 1108 + }, + "8": { + "count": 1866 + }, + "18": { + "count": 7643 + }, + "15": { + "count": 6788 + }, + "4": { + "count": 2045 + }, + "14": { + "count": 523 + }, + "9": { + "count": 1020 + }, + "13": { + "count": 304 + }, + "12": { + "count": 2157 + }, + "11": { + "count": 1418 + }, + "16": { + "count": 418 + } + } + }, + "et": { + "num_samples": 23126, + "number_of_characters": 167111710, + "number_texts_intersect_with_train": null, + "min_text_length": 505, + "average_text_length": 7226.139842601401, + "max_text_length": 466834, + "unique_texts": 23125, + "min_labels_per_text": 1, + "average_label_per_text": 3.32703450661593, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2211 + }, + "20": { + "count": 2002 + }, + "7": { + "count": 2400 + }, + "3": { + "count": 12457 + }, + "0": { + "count": 3710 + }, + "19": { + "count": 2281 + }, + "6": { + "count": 8405 + }, + "17": { + "count": 10865 + }, + "5": { + "count": 3641 + }, + "2": { + "count": 3816 + }, + "10": { + "count": 1101 + }, + "8": { + "count": 1862 + }, + "18": { + "count": 7586 + }, + "15": { + "count": 6749 + }, + "4": { + "count": 2023 + }, + "14": { + "count": 519 + }, + "9": { + "count": 1019 + }, + "13": { + "count": 306 + }, + "12": { + "count": 2154 + }, + "11": { + "count": 1416 + }, + "16": { + "count": 418 + } + } + }, + "lv": { + "num_samples": 23208, + "number_of_characters": 170528142, + "number_texts_intersect_with_train": null, + "min_text_length": 512, + "average_text_length": 7347.81721820062, + "max_text_length": 743348, + "unique_texts": 23207, + "min_labels_per_text": 1, + "average_label_per_text": 3.327171664943123, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 2229 + }, + "20": { + "count": 2001 + }, + "7": { + "count": 2410 + }, + "3": { + "count": 12510 + }, + "0": { + "count": 3720 + }, + "19": { + "count": 2286 + }, + "6": { + "count": 8413 + }, + "17": { + "count": 10891 + }, + "5": { + "count": 3672 + }, + "2": { + "count": 3819 + }, + "10": { + "count": 1103 + }, + "8": { + "count": 1867 + }, + "18": { + "count": 7631 + }, + "15": { + "count": 6783 + }, + "4": { + "count": 2040 + }, + "14": { + "count": 522 + }, + "9": { + "count": 1022 + }, + "13": { + "count": 305 + }, + "12": { + "count": 2157 + }, + "11": { + "count": 1418 + }, + "16": { + "count": 418 + } + } + }, + "mt": { + "num_samples": 17521, + "number_of_characters": 171326902, + "number_texts_intersect_with_train": null, + "min_text_length": 596, + "average_text_length": 9778.374636150904, + "max_text_length": 913989, + "unique_texts": 17520, + "min_labels_per_text": 1, + "average_label_per_text": 3.4455795902060387, + "max_labels_per_text": 10, + "unique_labels": 21, + "labels": { + "1": { + "count": 1966 + }, + "20": { + "count": 1899 + }, + "7": { + "count": 2250 + }, + "3": { + "count": 8794 + }, + "0": { + "count": 3565 + }, + "19": { + "count": 1814 + }, + "6": { + "count": 6884 + }, + "2": { + "count": 2300 + }, + "5": { + "count": 2952 + }, + "10": { + "count": 971 + }, + "8": { + "count": 1649 + }, + "18": { + "count": 6004 + }, + "15": { + "count": 5750 + }, + "4": { + "count": 1711 + }, + "17": { + "count": 6502 + }, + "14": { + "count": 444 + }, + "9": { + "count": 965 + }, + "13": { + "count": 268 + }, + "12": { + "count": 2021 + }, + "11": { + "count": 1257 + }, + "16": { + "count": 404 + } + } + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Reranking/MIRACLReranking.json b/mteb/descriptive_stats/Reranking/MIRACLReranking.json new file mode 100644 index 0000000000..8f37b97947 --- /dev/null +++ b/mteb/descriptive_stats/Reranking/MIRACLReranking.json @@ -0,0 +1,536 @@ +{ + "dev": { + "number_of_characters": 584993395, + "num_samples": 1260008, + "num_queries": 12524, + "num_documents": 1247484, + "num_relevant_docs": 1247483, + "min_document_length": 5, + "average_document_length": 0.3661874621237627, + "max_document_length": 176, + "unique_documents": 1247484, + "min_query_length": 7, + "average_query_length": 46673.31379750878, + "max_query_length": 48058, + "unique_queries": 12524, + "none_queries": 0, + "min_relevant_docs_per_query": 0, + "average_relevant_docs_per_query": 1.8850207601405302, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 1247483, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 1, + "average_top_ranked_per_query": 99.60747365059086, + "max_top_ranked_per_query": 100, + "hf_subset_descriptive_stats": { + "ar": { + "number_of_characters": 139203930, + "num_samples": 290077, + "num_queries": 2896, + "num_documents": 287181, + "num_relevant_docs": 287181, + "min_document_length": 12, + "average_document_length": 0.29728986249090295, + "max_document_length": 101, + "unique_documents": 287181, + "min_query_length": 9, + "average_query_length": 48038.17472375691, + "max_query_length": 48058, + "unique_queries": 2896, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.7178867403314917, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 287181, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 1, + "average_top_ranked_per_query": 99.16470994475138, + "max_top_ranked_per_query": 100 + }, + "bn": { + "number_of_characters": 22936306, + "num_samples": 41466, + "num_queries": 411, + "num_documents": 41055, + "num_relevant_docs": 41055, + "min_document_length": 16, + "average_document_length": 0.470320302033857, + "max_document_length": 112, + "unique_documents": 41055, + "min_query_length": 12, + "average_query_length": 55759.11678832117, + "max_query_length": 16749, + "unique_queries": 411, + "none_queries": 0, + "min_relevant_docs_per_query": 55, + "average_relevant_docs_per_query": 1.9172749391727495, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 41055, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 55, + "average_top_ranked_per_query": 99.8905109489051, + "max_top_ranked_per_query": 100 + }, + "de": { + "number_of_characters": 16502961, + "num_samples": 30704, + "num_queries": 304, + "num_documents": 30400, + "num_relevant_docs": 30400, + "min_document_length": 15, + "average_document_length": 0.4606578947368421, + "max_document_length": 87, + "unique_documents": 30400, + "min_query_length": 13, + "average_query_length": 54239.99013157895, + "max_query_length": 5224, + "unique_queries": 304, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.542763157894737, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 30400, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "en": { + "number_of_characters": 51198664, + "num_samples": 79487, + "num_queries": 787, + "num_documents": 78700, + "num_relevant_docs": 78700, + "min_document_length": 16, + "average_document_length": 0.40310038119440916, + "max_document_length": 122, + "unique_documents": 78700, + "min_query_length": 19, + "average_query_length": 65015.171537484115, + "max_query_length": 8110, + "unique_queries": 787, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 2.3824650571791612, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 78700, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "es": { + "number_of_characters": 34643777, + "num_samples": 62317, + "num_queries": 617, + "num_documents": 61700, + "num_relevant_docs": 61700, + "min_document_length": 19, + "average_document_length": 0.47573743922204215, + "max_document_length": 88, + "unique_documents": 61700, + "min_query_length": 21, + "average_query_length": 56101.1734197731, + "max_query_length": 21550, + "unique_queries": 617, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 3.053484602917342, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 61700, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "fa": { + "number_of_characters": 27767876, + "num_samples": 63832, + "num_queries": 632, + "num_documents": 63200, + "num_relevant_docs": 63200, + "min_document_length": 18, + "average_document_length": 0.411503164556962, + "max_document_length": 82, + "unique_documents": 63200, + "min_query_length": 14, + "average_query_length": 43895.362341772154, + "max_query_length": 8151, + "unique_queries": 632, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.4667721518987342, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 63200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "fi": { + "number_of_characters": 52848523, + "num_samples": 117879, + "num_queries": 1183, + "num_documents": 116696, + "num_relevant_docs": 116696, + "min_document_length": 14, + "average_document_length": 0.3929526290532666, + "max_document_length": 130, + "unique_documents": 116696, + "min_query_length": 13, + "average_query_length": 44634.54522400676, + "max_query_length": 6755, + "unique_queries": 1183, + "none_queries": 0, + "min_relevant_docs_per_query": 3, + "average_relevant_docs_per_query": 1.7557058326289094, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 116696, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 3, + "average_top_ranked_per_query": 98.64412510566356, + "max_top_ranked_per_query": 100 + }, + "fr": { + "number_of_characters": 17084953, + "num_samples": 34643, + "num_queries": 343, + "num_documents": 34300, + "num_relevant_docs": 34300, + "min_document_length": 16, + "average_document_length": 0.4388338192419825, + "max_document_length": 83, + "unique_documents": 34300, + "min_query_length": 25, + "average_query_length": 49766.475218658896, + "max_query_length": 4404, + "unique_queries": 343, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.3877551020408163, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 34300, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "hi": { + "number_of_characters": 21162593, + "num_samples": 35350, + "num_queries": 350, + "num_documents": 35000, + "num_relevant_docs": 35000, + "min_document_length": 24, + "average_document_length": 0.5334, + "max_document_length": 120, + "unique_documents": 35000, + "min_query_length": 13, + "average_query_length": 60411.21142857143, + "max_query_length": 29681, + "unique_queries": 350, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.9142857142857144, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 35000, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "id": { + "number_of_characters": 51428701, + "num_samples": 94149, + "num_queries": 939, + "num_documents": 93210, + "num_relevant_docs": 93210, + "min_document_length": 13, + "average_document_length": 0.3831563137002468, + "max_document_length": 93, + "unique_documents": 93210, + "min_query_length": 9, + "average_query_length": 54731.615548455804, + "max_query_length": 13961, + "unique_queries": 939, + "none_queries": 0, + "min_relevant_docs_per_query": 3, + "average_relevant_docs_per_query": 2.774227902023429, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 93210, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 3, + "average_top_ranked_per_query": 99.26517571884985, + "max_top_ranked_per_query": 100 + }, + "ja": { + "number_of_characters": 17053080, + "num_samples": 80497, + "num_queries": 797, + "num_documents": 79700, + "num_relevant_docs": 79700, + "min_document_length": 7, + "average_document_length": 0.177465495608532, + "max_document_length": 48, + "unique_documents": 79700, + "min_query_length": 7, + "average_query_length": 21378.840652446674, + "max_query_length": 6592, + "unique_queries": 797, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.7465495608531996, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 79700, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "ko": { + "number_of_characters": 5439444, + "num_samples": 21414, + "num_queries": 213, + "num_documents": 21201, + "num_relevant_docs": 21200, + "min_document_length": 5, + "average_document_length": 0.21725390311777745, + "max_document_length": 92, + "unique_documents": 21201, + "min_query_length": 11, + "average_query_length": 25515.671361502347, + "max_query_length": 4838, + "unique_queries": 213, + "none_queries": 0, + "min_relevant_docs_per_query": 0, + "average_relevant_docs_per_query": 1.9812206572769953, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 21200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 1, + "average_top_ranked_per_query": 99.53521126760563, + "max_top_ranked_per_query": 100 + }, + "ru": { + "number_of_characters": 59556512, + "num_samples": 125947, + "num_queries": 1247, + "num_documents": 124700, + "num_relevant_docs": 124700, + "min_document_length": 15, + "average_document_length": 0.4415878107457899, + "max_document_length": 108, + "unique_documents": 124700, + "min_query_length": 8, + "average_query_length": 47715.67441860465, + "max_query_length": 12427, + "unique_queries": 1247, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.9534883720930232, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 124700, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "sw": { + "number_of_characters": 14840684, + "num_samples": 48581, + "num_queries": 481, + "num_documents": 48100, + "num_relevant_docs": 48100, + "min_document_length": 13, + "average_document_length": 0.38885654885654886, + "max_document_length": 75, + "unique_documents": 48100, + "min_query_length": 10, + "average_query_length": 30814.927234927236, + "max_query_length": 6048, + "unique_queries": 481, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.3846153846153846, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 48100, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "te": { + "number_of_characters": 3910478, + "num_samples": 8484, + "num_queries": 84, + "num_documents": 8400, + "num_relevant_docs": 8400, + "min_document_length": 24, + "average_document_length": 0.3846428571428571, + "max_document_length": 64, + "unique_documents": 8400, + "min_query_length": 19, + "average_query_length": 46514.84523809524, + "max_query_length": 8736, + "unique_queries": 84, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.119047619047619, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 8400, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "th": { + "number_of_characters": 38321622, + "num_samples": 73671, + "num_queries": 730, + "num_documents": 72941, + "num_relevant_docs": 72941, + "min_document_length": 14, + "average_document_length": 0.42866152095529264, + "max_document_length": 176, + "unique_documents": 72941, + "min_query_length": 15, + "average_query_length": 52452.54109589041, + "max_query_length": 12078, + "unique_queries": 730, + "none_queries": 0, + "min_relevant_docs_per_query": 41, + "average_relevant_docs_per_query": 1.632876712328767, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 72941, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 41, + "average_top_ranked_per_query": 99.91917808219178, + "max_top_ranked_per_query": 100 + }, + "yo": { + "number_of_characters": 4939804, + "num_samples": 12019, + "num_queries": 119, + "num_documents": 11900, + "num_relevant_docs": 11900, + "min_document_length": 25, + "average_document_length": 0.376890756302521, + "max_document_length": 56, + "unique_documents": 11900, + "min_query_length": 7, + "average_query_length": 41473.268907563026, + "max_query_length": 5793, + "unique_queries": 119, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 0.8823529411764706, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 11900, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + }, + "zh": { + "number_of_characters": 6153487, + "num_samples": 39491, + "num_queries": 391, + "num_documents": 39100, + "num_relevant_docs": 39100, + "min_document_length": 7, + "average_document_length": 0.10859335038363171, + "max_document_length": 22, + "unique_documents": 39100, + "min_query_length": 7, + "average_query_length": 15726.959079283888, + "max_query_length": 2629, + "unique_queries": 391, + "none_queries": 0, + "min_relevant_docs_per_query": 100, + "average_relevant_docs_per_query": 1.4194373401534526, + "max_relevant_docs_per_query": 100, + "unique_relevant_docs": 39100, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": 100, + "average_top_ranked_per_query": 100.0, + "max_top_ranked_per_query": 100 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json b/mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json index 2899e4e4d7..6ddd5ce0b8 100644 --- a/mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json @@ -4,6 +4,7 @@ "num_samples": 4872, "num_queries": 2316, "num_documents": 2556, + "num_relevant_docs": 2316, "min_document_length": 8, "average_document_length": 154.68348982785602, "max_document_length": 2863, @@ -12,6 +13,7 @@ "average_query_length": 3868.990932642487, "max_query_length": 47930, "unique_queries": 2316, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, "max_relevant_docs_per_query": 1, diff --git a/mteb/descriptive_stats/Retrieval/FEVER.json b/mteb/descriptive_stats/Retrieval/FEVER.json new file mode 100644 index 0000000000..18a770b3e0 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/FEVER.json @@ -0,0 +1,30 @@ +{ + "test": { + "number_of_characters": 2921128337, + "num_samples": 5423234, + "num_queries": 6666, + "num_documents": 5416568, + "num_relevant_docs": 7937, + "min_document_length": 14, + "average_document_length": 0.061047881241406, + "max_document_length": 189, + "unique_documents": 5416568, + "min_query_length": 2, + "average_query_length": 438163.46639663965, + "max_query_length": 374597, + "unique_queries": 6666, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1906690669066906, + "max_relevant_docs_per_query": 15, + "unique_relevant_docs": 1499, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/HotpotQA.json b/mteb/descriptive_stats/Retrieval/HotpotQA.json new file mode 100644 index 0000000000..ae51a60fc8 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/HotpotQA.json @@ -0,0 +1,86 @@ +{ + "train": { + "number_of_characters": 1520922083, + "num_samples": 5318329, + "num_queries": 85000, + "num_documents": 5233329, + "num_relevant_docs": 170000, + "min_document_length": 13, + "average_document_length": 1.7143430118763792, + "max_document_length": 654, + "unique_documents": 5233329, + "min_query_length": 9, + "average_query_length": 17787.651317647058, + "max_query_length": 8276, + "unique_queries": 85000, + "none_queries": 0, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 101307, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "dev": { + "number_of_characters": 1512524238, + "num_samples": 5238776, + "num_queries": 5447, + "num_documents": 5233329, + "num_relevant_docs": 10894, + "min_document_length": 18, + "average_document_length": 0.10965792519446035, + "max_document_length": 630, + "unique_documents": 5233329, + "min_query_length": 9, + "average_query_length": 277574.8782816229, + "max_query_length": 8276, + "unique_queries": 5447, + "none_queries": 0, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 10335, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "test": { + "number_of_characters": 1512632888, + "num_samples": 5240734, + "num_queries": 7405, + "num_documents": 5233329, + "num_relevant_docs": 14810, + "min_document_length": 32, + "average_document_length": 0.13041908888204812, + "max_document_length": 288, + "unique_documents": 5233329, + "min_query_length": 9, + "average_query_length": 204179.65725860905, + "max_query_length": 8276, + "unique_queries": 7405, + "none_queries": 0, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 13783, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json b/mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json new file mode 100644 index 0000000000..a8c8e7075b --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json @@ -0,0 +1,536 @@ +{ + "dev": { + "number_of_characters": 35274535649, + "num_samples": 106345647, + "num_queries": 13495, + "num_documents": 106332152, + "num_relevant_docs": 130408, + "min_document_length": 5, + "average_document_length": 0.004631364932781573, + "max_document_length": 176, + "unique_documents": 106332152, + "min_query_length": 1, + "average_query_length": 2613860.1842163764, + "max_query_length": 84925, + "unique_queries": 13495, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.3059651722860317, + "max_relevant_docs_per_query": 20, + "unique_relevant_docs": 119924, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "ar": { + "number_of_characters": 624607465, + "num_samples": 2064310, + "num_queries": 2896, + "num_documents": 2061414, + "num_relevant_docs": 29197, + "min_document_length": 12, + "average_document_length": 0.041416231771007665, + "max_document_length": 101, + "unique_documents": 2061414, + "min_query_length": 1, + "average_query_length": 215649.89261049725, + "max_query_length": 48538, + "unique_queries": 2896, + "none_queries": 0, + "min_relevant_docs_per_query": 7, + "average_relevant_docs_per_query": 1.953729281767956, + "max_relevant_docs_per_query": 17, + "unique_relevant_docs": 25881, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "bn": { + "number_of_characters": 109132820, + "num_samples": 297676, + "num_queries": 411, + "num_documents": 297265, + "num_relevant_docs": 4206, + "min_document_length": 16, + "average_document_length": 0.06495551107597598, + "max_document_length": 112, + "unique_documents": 297265, + "min_query_length": 1, + "average_query_length": 265482.99513381993, + "max_query_length": 17102, + "unique_queries": 411, + "none_queries": 0, + "min_relevant_docs_per_query": 7, + "average_relevant_docs_per_query": 2.099756690997567, + "max_relevant_docs_per_query": 13, + "unique_relevant_docs": 3729, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "de": { + "number_of_characters": 6274005668, + "num_samples": 15866527, + "num_queries": 305, + "num_documents": 15866222, + "num_relevant_docs": 3144, + "min_document_length": 15, + "average_document_length": 0.0008842684792888944, + "max_document_length": 87, + "unique_documents": 15866222, + "min_query_length": 1, + "average_query_length": 20570464.386885244, + "max_query_length": 64939, + "unique_queries": 305, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 2.6590163934426227, + "max_relevant_docs_per_query": 20, + "unique_relevant_docs": 3103, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "en": { + "number_of_characters": 12534362069, + "num_samples": 32894020, + "num_queries": 799, + "num_documents": 32893221, + "num_relevant_docs": 8350, + "min_document_length": 16, + "average_document_length": 0.0009776482515956707, + "max_document_length": 122, + "unique_documents": 32893221, + "min_query_length": 1, + "average_query_length": 15687521.790988736, + "max_query_length": 36444, + "unique_queries": 799, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 2.911138923654568, + "max_relevant_docs_per_query": 16, + "unique_relevant_docs": 7921, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 3984898056, + "num_samples": 10374601, + "num_queries": 648, + "num_documents": 10373953, + "num_relevant_docs": 6443, + "min_document_length": 19, + "average_document_length": 0.0029591419972695076, + "max_document_length": 88, + "unique_documents": 10373953, + "min_query_length": 1, + "average_query_length": 6149486.663580247, + "max_query_length": 56999, + "unique_queries": 648, + "none_queries": 0, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 4.609567901234568, + "max_relevant_docs_per_query": 10, + "unique_relevant_docs": 6410, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fa": { + "number_of_characters": 548173051, + "num_samples": 2207804, + "num_queries": 632, + "num_documents": 2207172, + "num_relevant_docs": 6571, + "min_document_length": 18, + "average_document_length": 0.011782951215401427, + "max_document_length": 82, + "unique_documents": 2207172, + "min_query_length": 1, + "average_query_length": 867321.2721518987, + "max_query_length": 36480, + "unique_queries": 632, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 2.079113924050633, + "max_relevant_docs_per_query": 20, + "unique_relevant_docs": 6405, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fi": { + "number_of_characters": 647319788, + "num_samples": 1884780, + "num_queries": 1271, + "num_documents": 1883509, + "num_relevant_docs": 12008, + "min_document_length": 14, + "average_document_length": 0.026071019570386975, + "max_document_length": 130, + "unique_documents": 1883509, + "min_query_length": 1, + "average_query_length": 509260.96223446104, + "max_query_length": 11549, + "unique_queries": 1271, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.925255704169945, + "max_relevant_docs_per_query": 16, + "unique_relevant_docs": 11365, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 4741061206, + "num_samples": 14637296, + "num_queries": 343, + "num_documents": 14636953, + "num_relevant_docs": 3429, + "min_document_length": 16, + "average_document_length": 0.0010283561066295698, + "max_document_length": 83, + "unique_documents": 14636953, + "min_query_length": 1, + "average_query_length": 13822291.994169096, + "max_query_length": 52598, + "unique_queries": 343, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 2.131195335276968, + "max_relevant_docs_per_query": 10, + "unique_relevant_docs": 3407, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "hi": { + "number_of_characters": 179956335, + "num_samples": 506614, + "num_queries": 350, + "num_documents": 506264, + "num_relevant_docs": 3494, + "min_document_length": 24, + "average_document_length": 0.0368760172558191, + "max_document_length": 120, + "unique_documents": 506264, + "min_query_length": 1, + "average_query_length": 514107.61714285717, + "max_query_length": 44761, + "unique_queries": 350, + "none_queries": 0, + "min_relevant_docs_per_query": 6, + "average_relevant_docs_per_query": 2.1485714285714286, + "max_relevant_docs_per_query": 10, + "unique_relevant_docs": 3342, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "id": { + "number_of_characters": 479789527, + "num_samples": 1447275, + "num_queries": 960, + "num_documents": 1446315, + "num_relevant_docs": 9668, + "min_document_length": 13, + "average_document_length": 0.025195064698907223, + "max_document_length": 93, + "unique_documents": 1446315, + "min_query_length": 1, + "average_query_length": 499742.7989583333, + "max_query_length": 39510, + "unique_queries": 960, + "none_queries": 0, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 3.216666666666667, + "max_relevant_docs_per_query": 17, + "unique_relevant_docs": 8286, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ja": { + "number_of_characters": 956943052, + "num_samples": 6954474, + "num_queries": 860, + "num_documents": 6953614, + "num_relevant_docs": 8354, + "min_document_length": 7, + "average_document_length": 0.0021908032283644158, + "max_document_length": 48, + "unique_documents": 6953614, + "min_query_length": 1, + "average_query_length": 1112706.765116279, + "max_query_length": 25232, + "unique_queries": 860, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.0813953488372094, + "max_relevant_docs_per_query": 16, + "unique_relevant_docs": 8066, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ko": { + "number_of_characters": 247737159, + "num_samples": 1486965, + "num_queries": 213, + "num_documents": 1486752, + "num_relevant_docs": 3057, + "min_document_length": 5, + "average_document_length": 0.0030980284539721486, + "max_document_length": 92, + "unique_documents": 1486752, + "min_query_length": 1, + "average_query_length": 1163063.6291079812, + "max_query_length": 25243, + "unique_queries": 213, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 2.568075117370892, + "max_relevant_docs_per_query": 20, + "unique_relevant_docs": 2835, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ru": { + "number_of_characters": 2969123834, + "num_samples": 9545170, + "num_queries": 1252, + "num_documents": 9543918, + "num_relevant_docs": 13100, + "min_document_length": 15, + "average_document_length": 0.00578944622114314, + "max_document_length": 108, + "unique_documents": 9543918, + "min_query_length": 1, + "average_query_length": 2371460.5271565495, + "max_query_length": 61639, + "unique_queries": 1252, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 2.8434504792332267, + "max_relevant_docs_per_query": 18, + "unique_relevant_docs": 12607, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "sw": { + "number_of_characters": 28413887, + "num_samples": 132406, + "num_queries": 482, + "num_documents": 131924, + "num_relevant_docs": 5092, + "min_document_length": 13, + "average_document_length": 0.14238500955095357, + "max_document_length": 80, + "unique_documents": 131924, + "min_query_length": 1, + "average_query_length": 58911.0020746888, + "max_query_length": 11185, + "unique_queries": 482, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.887966804979253, + "max_relevant_docs_per_query": 17, + "unique_relevant_docs": 3514, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "te": { + "number_of_characters": 197801286, + "num_samples": 518907, + "num_queries": 828, + "num_documents": 518079, + "num_relevant_docs": 1606, + "min_document_length": 14, + "average_document_length": 0.060911559820027446, + "max_document_length": 111, + "unique_documents": 518079, + "min_query_length": 1, + "average_query_length": 238852.32971014493, + "max_query_length": 17811, + "unique_queries": 828, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0314009661835748, + "max_relevant_docs_per_query": 11, + "unique_relevant_docs": 1457, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "th": { + "number_of_characters": 183360331, + "num_samples": 542899, + "num_queries": 733, + "num_documents": 542166, + "num_relevant_docs": 7573, + "min_document_length": 14, + "average_document_length": 0.0579674859729308, + "max_document_length": 176, + "unique_documents": 542166, + "min_query_length": 1, + "average_query_length": 250107.64392905866, + "max_query_length": 31243, + "unique_queries": 733, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.8321964529331514, + "max_relevant_docs_per_query": 15, + "unique_relevant_docs": 6868, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "yo": { + "number_of_characters": 7047836, + "num_samples": 49162, + "num_queries": 119, + "num_documents": 49043, + "num_relevant_docs": 1188, + "min_document_length": 25, + "average_document_length": 0.09145035988826132, + "max_document_length": 56, + "unique_documents": 49043, + "min_query_length": 1, + "average_query_length": 59187.82352941176, + "max_query_length": 10457, + "unique_queries": 119, + "none_queries": 0, + "min_relevant_docs_per_query": 9, + "average_relevant_docs_per_query": 1.2100840336134453, + "max_relevant_docs_per_query": 10, + "unique_relevant_docs": 942, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "zh": { + "number_of_characters": 560802279, + "num_samples": 4934761, + "num_queries": 393, + "num_documents": 4934368, + "num_relevant_docs": 3928, + "min_document_length": 7, + "average_document_length": 0.0008655617092199042, + "max_document_length": 22, + "unique_documents": 4934368, + "min_query_length": 1, + "average_query_length": 1426966.941475827, + "max_query_length": 84925, + "unique_queries": 393, + "none_queries": 0, + "min_relevant_docs_per_query": 8, + "average_relevant_docs_per_query": 2.5292620865139948, + "max_relevant_docs_per_query": 10, + "unique_relevant_docs": 3786, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/MSMARCO.json b/mteb/descriptive_stats/Retrieval/MSMARCO.json new file mode 100644 index 0000000000..3a6908af62 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/MSMARCO.json @@ -0,0 +1,86 @@ +{ + "train": { + "number_of_characters": 2994608051, + "num_samples": 9344762, + "num_queries": 502939, + "num_documents": 8841823, + "num_relevant_docs": 532751, + "min_document_length": 5, + "average_document_length": 1.8895562600608495, + "max_document_length": 215, + "unique_documents": 8841823, + "min_query_length": 4, + "average_query_length": 5920.9982304016985, + "max_query_length": 1670, + "unique_queries": 502939, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0592755781516248, + "max_relevant_docs_per_query": 7, + "unique_relevant_docs": 516472, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "dev": { + "number_of_characters": 2978133099, + "num_samples": 8848803, + "num_queries": 6980, + "num_documents": 8841823, + "num_relevant_docs": 7437, + "min_document_length": 9, + "average_document_length": 0.026258159657799075, + "max_document_length": 186, + "unique_documents": 8841823, + "min_query_length": 4, + "average_query_length": 426633.37091690546, + "max_query_length": 1670, + "unique_queries": 6980, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0654727793696275, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 7433, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "test": { + "number_of_characters": 2977902337, + "num_samples": 8841866, + "num_queries": 43, + "num_documents": 8841823, + "num_relevant_docs": 9260, + "min_document_length": 16, + "average_document_length": 0.00015924317869742472, + "max_document_length": 55, + "unique_documents": 8841823, + "min_query_length": 4, + "average_query_length": 69253509.97674419, + "max_query_length": 1670, + "unique_queries": 43, + "none_queries": 0, + "min_relevant_docs_per_query": 132, + "average_relevant_docs_per_query": 95.3953488372093, + "max_relevant_docs_per_query": 582, + "unique_relevant_docs": 9139, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/MrTidyRetrieval.json b/mteb/descriptive_stats/Retrieval/MrTidyRetrieval.json new file mode 100644 index 0000000000..fd9d85feb0 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/MrTidyRetrieval.json @@ -0,0 +1,340 @@ +{ + "test": { + "number_of_characters": 19085636965, + "num_samples": 58051987, + "num_queries": 8661, + "num_documents": 58043326, + "num_relevant_docs": 10105, + "min_document_length": 6, + "average_document_length": 0.005544547877907617, + "max_document_length": 144, + "unique_documents": 58043326, + "min_query_length": 1, + "average_query_length": 2203592.557556864, + "max_query_length": 61639, + "unique_queries": 8661, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1667243967209329, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 8926, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "bengali": { + "number_of_characters": 110965082, + "num_samples": 304170, + "num_queries": 111, + "num_documents": 304059, + "num_relevant_docs": 130, + "min_document_length": 20, + "average_document_length": 0.018641118993353262, + "max_document_length": 122, + "unique_documents": 304059, + "min_query_length": 1, + "average_query_length": 999634.3603603604, + "max_query_length": 16791, + "unique_queries": 111, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1711711711711712, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 116, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "english": { + "number_of_characters": 12550942597, + "num_samples": 32907844, + "num_queries": 744, + "num_documents": 32907100, + "num_relevant_docs": 935, + "min_document_length": 16, + "average_document_length": 0.0009153647693051043, + "max_document_length": 108, + "unique_documents": 32907100, + "min_query_length": 1, + "average_query_length": 16869506.014784947, + "max_query_length": 36444, + "unique_queries": 744, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.256720430107527, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 908, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "finnish": { + "number_of_characters": 656119952, + "num_samples": 1910011, + "num_queries": 1254, + "num_documents": 1908757, + "num_relevant_docs": 1451, + "min_document_length": 13, + "average_document_length": 0.024742803824688003, + "max_document_length": 89, + "unique_documents": 1908757, + "min_query_length": 1, + "average_query_length": 523183.990430622, + "max_query_length": 29374, + "unique_queries": 1254, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.157097288676236, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 1186, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "russian": { + "number_of_characters": 2994189913, + "num_samples": 9598499, + "num_queries": 995, + "num_documents": 9597504, + "num_relevant_docs": 1168, + "min_document_length": 14, + "average_document_length": 0.004856262628283353, + "max_document_length": 138, + "unique_documents": 9597504, + "min_query_length": 1, + "average_query_length": 3009189.2512562815, + "max_query_length": 61639, + "unique_queries": 995, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1738693467336683, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 1100, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "korean": { + "number_of_characters": 249708451, + "num_samples": 1496547, + "num_queries": 421, + "num_documents": 1496126, + "num_relevant_docs": 492, + "min_document_length": 6, + "average_document_length": 0.006410556330148664, + "max_document_length": 122, + "unique_documents": 1496126, + "min_query_length": 1, + "average_query_length": 593108.9311163896, + "max_query_length": 25243, + "unique_queries": 421, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.168646080760095, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 397, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "japanese": { + "number_of_characters": 971241388, + "num_samples": 7000747, + "num_queries": 720, + "num_documents": 7000027, + "num_relevant_docs": 923, + "min_document_length": 6, + "average_document_length": 0.0018598499691501189, + "max_document_length": 44, + "unique_documents": 7000027, + "min_query_length": 1, + "average_query_length": 1348928.2902777777, + "max_query_length": 25232, + "unique_queries": 720, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.2819444444444446, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 880, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "telugu": { + "number_of_characters": 202749454, + "num_samples": 548870, + "num_queries": 646, + "num_documents": 548224, + "num_relevant_docs": 677, + "min_document_length": 13, + "average_document_length": 0.04451647501751109, + "max_document_length": 119, + "unique_documents": 548224, + "min_query_length": 1, + "average_query_length": 313815.8653250774, + "max_query_length": 17811, + "unique_queries": 646, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0479876160990713, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 600, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "thai": { + "number_of_characters": 192485963, + "num_samples": 570045, + "num_queries": 1190, + "num_documents": 568855, + "num_relevant_docs": 1368, + "min_document_length": 13, + "average_document_length": 0.08903850717669705, + "max_document_length": 144, + "unique_documents": 568855, + "min_query_length": 1, + "average_query_length": 161710.34705882354, + "max_query_length": 31244, + "unique_queries": 1190, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.149579831932773, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 1163, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "swahili": { + "number_of_characters": 29636822, + "num_samples": 137359, + "num_queries": 670, + "num_documents": 136689, + "num_relevant_docs": 743, + "min_document_length": 15, + "average_document_length": 0.2054664237795287, + "max_document_length": 98, + "unique_documents": 136689, + "min_query_length": 1, + "average_query_length": 44192.1447761194, + "max_query_length": 11185, + "unique_queries": 670, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.108955223880597, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 552, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "arabic": { + "number_of_characters": 640057511, + "num_samples": 2107667, + "num_queries": 1081, + "num_documents": 2106586, + "num_relevant_docs": 1257, + "min_document_length": 12, + "average_document_length": 0.015663257991840828, + "max_document_length": 93, + "unique_documents": 2106586, + "min_query_length": 1, + "average_query_length": 592067.0814061054, + "max_query_length": 48538, + "unique_queries": 1081, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1628122109158188, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 1138, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "indonesian": { + "number_of_characters": 487539832, + "num_samples": 1470228, + "num_queries": 829, + "num_documents": 1469399, + "num_relevant_docs": 961, + "min_document_length": 17, + "average_document_length": 0.02276577022306399, + "max_document_length": 128, + "unique_documents": 1469399, + "min_query_length": 1, + "average_query_length": 588065.5971049457, + "max_query_length": 39510, + "unique_queries": 829, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1592279855247285, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 886, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/MultiLongDocRetrieval.json b/mteb/descriptive_stats/Retrieval/MultiLongDocRetrieval.json new file mode 100644 index 0000000000..a7de31f155 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/MultiLongDocRetrieval.json @@ -0,0 +1,790 @@ +{ + "dev": { + "number_of_characters": 6641969996, + "num_samples": 496309, + "num_queries": 2600, + "num_documents": 493709, + "num_relevant_docs": 2600, + "min_document_length": 3, + "average_document_length": 0.49106862544535346, + "max_document_length": 2041, + "unique_documents": 493709, + "min_query_length": 36, + "average_query_length": 2554510.5965384617, + "max_query_length": 471024, + "unique_queries": 2600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 2600, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "ar": { + "number_of_characters": 222400555, + "num_samples": 7807, + "num_queries": 200, + "num_documents": 7607, + "num_relevant_docs": 200, + "min_document_length": 6, + "average_document_length": 1.8212172998553964, + "max_document_length": 194, + "unique_documents": 7607, + "min_query_length": 2173, + "average_query_length": 1111933.505, + "max_query_length": 276627, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "de": { + "number_of_characters": 337742837, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 7, + "average_document_length": 3.0726, + "max_document_length": 2041, + "unique_documents": 10000, + "min_query_length": 104, + "average_query_length": 1688560.555, + "max_query_length": 186335, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "en": { + "number_of_characters": 2666569772, + "num_samples": 200200, + "num_queries": 200, + "num_documents": 200000, + "num_relevant_docs": 200, + "min_document_length": 16, + "average_document_length": 0.08122, + "max_document_length": 180, + "unique_documents": 200000, + "min_query_length": 2137, + "average_query_length": 13332767.64, + "max_query_length": 382998, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 349277698, + "num_samples": 9751, + "num_queries": 200, + "num_documents": 9551, + "num_relevant_docs": 200, + "min_document_length": 19, + "average_document_length": 2.5779499528845147, + "max_document_length": 305, + "unique_documents": 9551, + "min_query_length": 2657, + "average_query_length": 1746265.38, + "max_query_length": 471024, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 360123367, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 13, + "average_document_length": 2.8433, + "max_document_length": 1590, + "unique_documents": 10000, + "min_query_length": 2093, + "average_query_length": 1800474.67, + "max_query_length": 425370, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "hi": { + "number_of_characters": 71144060, + "num_samples": 4006, + "num_queries": 200, + "num_documents": 3806, + "num_relevant_docs": 200, + "min_document_length": 4, + "average_document_length": 4.098528638991067, + "max_document_length": 318, + "unique_documents": 3806, + "min_query_length": 2426, + "average_query_length": 355642.305, + "max_query_length": 227264, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "it": { + "number_of_characters": 366359892, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 9, + "average_document_length": 1.9923, + "max_document_length": 950, + "unique_documents": 10000, + "min_query_length": 2491, + "average_query_length": 1831699.845, + "max_query_length": 312623, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ja": { + "number_of_characters": 144819833, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 3, + "average_document_length": 1.2325, + "max_document_length": 576, + "unique_documents": 10000, + "min_query_length": 1245, + "average_query_length": 724037.54, + "max_query_length": 234888, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ko": { + "number_of_characters": 85323582, + "num_samples": 6376, + "num_queries": 200, + "num_documents": 6176, + "num_relevant_docs": 200, + "min_document_length": 8, + "average_document_length": 1.9056023316062176, + "max_document_length": 664, + "unique_documents": 6176, + "min_query_length": 1490, + "average_query_length": 426559.065, + "max_query_length": 171299, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "pt": { + "number_of_characters": 211070508, + "num_samples": 6769, + "num_queries": 200, + "num_documents": 6569, + "num_relevant_docs": 200, + "min_document_length": 7, + "average_document_length": 3.722788856751408, + "max_document_length": 506, + "unique_documents": 6569, + "min_query_length": 3078, + "average_query_length": 1055230.265, + "max_query_length": 400864, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ru": { + "number_of_characters": 359366331, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 8, + "average_document_length": 1.7575, + "max_document_length": 216, + "unique_documents": 10000, + "min_query_length": 2901, + "average_query_length": 1796743.78, + "max_query_length": 303226, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "th": { + "number_of_characters": 259954258, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 30, + "average_document_length": 2.1562, + "max_document_length": 1123, + "unique_documents": 10000, + "min_query_length": 36, + "average_query_length": 1299663.48, + "max_query_length": 183497, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "zh": { + "number_of_characters": 1207817303, + "num_samples": 200200, + "num_queries": 200, + "num_documents": 200000, + "num_relevant_docs": 200, + "min_document_length": 5, + "average_document_length": 0.02679, + "max_document_length": 476, + "unique_documents": 200000, + "min_query_length": 1038, + "average_query_length": 6039059.725, + "max_query_length": 278468, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "test": { + "number_of_characters": 6642036446, + "num_samples": 497509, + "num_queries": 3800, + "num_documents": 493709, + "num_relevant_docs": 3800, + "min_document_length": 3, + "average_document_length": 0.6256620802942624, + "max_document_length": 2589, + "unique_documents": 493709, + "min_query_length": 36, + "average_query_length": 1747823.039736842, + "max_query_length": 471024, + "unique_queries": 3800, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 3800, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "ar": { + "number_of_characters": 222401855, + "num_samples": 7807, + "num_queries": 200, + "num_documents": 7607, + "num_relevant_docs": 200, + "min_document_length": 7, + "average_document_length": 1.9921125279347969, + "max_document_length": 695, + "unique_documents": 7607, + "min_query_length": 2173, + "average_query_length": 1111933.505, + "max_query_length": 276627, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "de": { + "number_of_characters": 337736841, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 10, + "average_document_length": 2.473, + "max_document_length": 957, + "unique_documents": 10000, + "min_query_length": 104, + "average_query_length": 1688560.555, + "max_query_length": 186335, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "en": { + "number_of_characters": 2666618592, + "num_samples": 200800, + "num_queries": 800, + "num_documents": 200000, + "num_relevant_docs": 800, + "min_document_length": 18, + "average_document_length": 0.32532, + "max_document_length": 255, + "unique_documents": 200000, + "min_query_length": 2137, + "average_query_length": 3333191.91, + "max_query_length": 382998, + "unique_queries": 800, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 800, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 349279473, + "num_samples": 9751, + "num_queries": 200, + "num_documents": 9551, + "num_relevant_docs": 200, + "min_document_length": 40, + "average_document_length": 2.763794367081981, + "max_document_length": 480, + "unique_documents": 9551, + "min_query_length": 2657, + "average_query_length": 1746265.38, + "max_query_length": 471024, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 360124893, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 33, + "average_document_length": 2.9959, + "max_document_length": 2589, + "unique_documents": 10000, + "min_query_length": 2093, + "average_query_length": 1800474.67, + "max_query_length": 425370, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "hi": { + "number_of_characters": 71149213, + "num_samples": 4006, + "num_queries": 200, + "num_documents": 3806, + "num_relevant_docs": 200, + "min_document_length": 6, + "average_document_length": 5.452443510246979, + "max_document_length": 2022, + "unique_documents": 3806, + "min_query_length": 2426, + "average_query_length": 355642.305, + "max_query_length": 227264, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "it": { + "number_of_characters": 366362888, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 12, + "average_document_length": 2.2919, + "max_document_length": 1899, + "unique_documents": 10000, + "min_query_length": 2491, + "average_query_length": 1831699.845, + "max_query_length": 312623, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ja": { + "number_of_characters": 144818654, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 6, + "average_document_length": 1.1146, + "max_document_length": 416, + "unique_documents": 10000, + "min_query_length": 1245, + "average_query_length": 724037.54, + "max_query_length": 234888, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ko": { + "number_of_characters": 85323557, + "num_samples": 6376, + "num_queries": 200, + "num_documents": 6176, + "num_relevant_docs": 200, + "min_document_length": 8, + "average_document_length": 1.9015544041450778, + "max_document_length": 330, + "unique_documents": 6176, + "min_query_length": 1490, + "average_query_length": 426559.065, + "max_query_length": 171299, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "pt": { + "number_of_characters": 211068744, + "num_samples": 6769, + "num_queries": 200, + "num_documents": 6569, + "num_relevant_docs": 200, + "min_document_length": 4, + "average_document_length": 3.4542548333079615, + "max_document_length": 511, + "unique_documents": 6569, + "min_query_length": 3078, + "average_query_length": 1055230.265, + "max_query_length": 400864, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "ru": { + "number_of_characters": 359367730, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 12, + "average_document_length": 1.8974, + "max_document_length": 413, + "unique_documents": 10000, + "min_query_length": 2901, + "average_query_length": 1796743.78, + "max_query_length": 303226, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "th": { + "number_of_characters": 259952294, + "num_samples": 10200, + "num_queries": 200, + "num_documents": 10000, + "num_relevant_docs": 200, + "min_document_length": 11, + "average_document_length": 1.9598, + "max_document_length": 309, + "unique_documents": 10000, + "min_query_length": 36, + "average_query_length": 1299663.48, + "max_query_length": 183497, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 200, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "zh": { + "number_of_characters": 1207831712, + "num_samples": 200800, + "num_queries": 800, + "num_documents": 200000, + "num_relevant_docs": 800, + "min_document_length": 3, + "average_document_length": 0.098835, + "max_document_length": 646, + "unique_documents": 200000, + "min_query_length": 1038, + "average_query_length": 1509764.93125, + "max_query_length": 278468, + "unique_queries": 800, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 800, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/TopiOCQA.json b/mteb/descriptive_stats/Retrieval/TopiOCQA.json new file mode 100644 index 0000000000..bc3bca51ef --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/TopiOCQA.json @@ -0,0 +1,30 @@ +{ + "validation": { + "number_of_characters": 11369989152, + "num_samples": 25703106, + "num_queries": 2514, + "num_documents": 25700592, + "num_relevant_docs": 2514, + "min_document_length": 1, + "average_document_length": 0.0012305553117220023, + "max_document_length": 31, + "unique_documents": 25700592, + "min_query_length": 1, + "average_query_length": 4522656.136038186, + "max_query_length": 28038, + "unique_queries": 2514, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1940, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/XPQARetrieval.json b/mteb/descriptive_stats/Retrieval/XPQARetrieval.json index b00f01f22b..9b33a6cb7d 100644 --- a/mteb/descriptive_stats/Retrieval/XPQARetrieval.json +++ b/mteb/descriptive_stats/Retrieval/XPQARetrieval.json @@ -4,6 +4,7 @@ "num_samples": 81710, "num_queries": 27856, "num_documents": 53854, + "num_relevant_docs": 55424, "min_document_length": 3, "average_document_length": 20.861588739926468, "max_document_length": 298, @@ -12,6 +13,7 @@ "average_query_length": 150.2376866743251, "max_query_length": 4229, "unique_queries": 27856, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9896611143021252, "max_relevant_docs_per_query": 17, @@ -30,6 +32,7 @@ "num_samples": 2245, "num_queries": 750, "num_documents": 1495, + "num_relevant_docs": 1503, "min_document_length": 8, "average_document_length": 14.893645484949833, "max_document_length": 111, @@ -38,6 +41,7 @@ "average_query_length": 123.35466666666666, "max_query_length": 1200, "unique_queries": 750, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.004, "max_relevant_docs_per_query": 5, @@ -56,6 +60,7 @@ "num_samples": 2283, "num_queries": 750, "num_documents": 1533, + "num_relevant_docs": 1544, "min_document_length": 8, "average_document_length": 14.524461839530332, "max_document_length": 111, @@ -64,6 +69,7 @@ "average_query_length": 256.05066666666664, "max_query_length": 4229, "unique_queries": 750, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.058666666666667, "max_relevant_docs_per_query": 5, @@ -82,6 +88,7 @@ "num_samples": 2237, "num_queries": 742, "num_documents": 1495, + "num_relevant_docs": 1502, "min_document_length": 11, "average_document_length": 19.614046822742473, "max_document_length": 162, @@ -90,6 +97,7 @@ "average_query_length": 124.68463611859838, "max_query_length": 1200, "unique_queries": 742, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.024258760107817, "max_relevant_docs_per_query": 5, @@ -108,6 +116,7 @@ "num_samples": 2014, "num_queries": 766, "num_documents": 1248, + "num_relevant_docs": 1250, "min_document_length": 17, "average_document_length": 34.076121794871796, "max_document_length": 144, @@ -116,6 +125,7 @@ "average_query_length": 113.31070496083551, "max_query_length": 383, "unique_queries": 766, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.6318537859007833, "max_relevant_docs_per_query": 5, @@ -134,6 +144,7 @@ "num_samples": 2265, "num_queries": 766, "num_documents": 1499, + "num_relevant_docs": 1504, "min_document_length": 17, "average_document_length": 28.370246831220815, "max_document_length": 144, @@ -142,6 +153,7 @@ "average_query_length": 226.55483028720627, "max_query_length": 1130, "unique_queries": 766, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9634464751958225, "max_relevant_docs_per_query": 5, @@ -160,6 +172,7 @@ "num_samples": 2014, "num_queries": 766, "num_documents": 1248, + "num_relevant_docs": 1250, "min_document_length": 15, "average_document_length": 31.848557692307693, "max_document_length": 144, @@ -168,6 +181,7 @@ "average_query_length": 113.31070496083551, "max_query_length": 383, "unique_queries": 766, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.6318537859007833, "max_relevant_docs_per_query": 5, @@ -186,6 +200,7 @@ "num_samples": 2734, "num_queries": 793, "num_documents": 1941, + "num_relevant_docs": 1942, "min_document_length": 12, "average_document_length": 19.08397733127254, "max_document_length": 140, @@ -194,6 +209,7 @@ "average_query_length": 167.11475409836066, "max_query_length": 266, "unique_queries": 793, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.4489281210592684, "max_relevant_docs_per_query": 5, @@ -212,6 +228,7 @@ "num_samples": 2729, "num_queries": 793, "num_documents": 1936, + "num_relevant_docs": 1961, "min_document_length": 12, "average_document_length": 19.13326446280992, "max_document_length": 140, @@ -220,6 +237,7 @@ "average_query_length": 301.3543505674653, "max_query_length": 1401, "unique_queries": 793, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.472887767969735, "max_relevant_docs_per_query": 5, @@ -238,6 +256,7 @@ "num_samples": 2734, "num_queries": 793, "num_documents": 1941, + "num_relevant_docs": 1942, "min_document_length": 12, "average_document_length": 19.287995878413188, "max_document_length": 133, @@ -246,6 +265,7 @@ "average_query_length": 167.11475409836066, "max_query_length": 266, "unique_queries": 793, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.4489281210592684, "max_relevant_docs_per_query": 5, @@ -264,6 +284,7 @@ "num_samples": 2297, "num_queries": 749, "num_documents": 1548, + "num_relevant_docs": 1550, "min_document_length": 12, "average_document_length": 27.120801033591732, "max_document_length": 110, @@ -272,6 +293,7 @@ "average_query_length": 159.1268357810414, "max_query_length": 359, "unique_queries": 749, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.069425901201602, "max_relevant_docs_per_query": 5, @@ -290,6 +312,7 @@ "num_samples": 2423, "num_queries": 749, "num_documents": 1674, + "num_relevant_docs": 1684, "min_document_length": 12, "average_document_length": 25.079450418160096, "max_document_length": 110, @@ -298,6 +321,7 @@ "average_query_length": 306.890520694259, "max_query_length": 1798, "unique_queries": 749, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.248331108144192, "max_relevant_docs_per_query": 5, @@ -316,6 +340,7 @@ "num_samples": 2297, "num_queries": 749, "num_documents": 1548, + "num_relevant_docs": 1550, "min_document_length": 11, "average_document_length": 23.992894056847547, "max_document_length": 110, @@ -324,6 +349,7 @@ "average_query_length": 159.1268357810414, "max_query_length": 359, "unique_queries": 749, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.069425901201602, "max_relevant_docs_per_query": 5, @@ -342,6 +368,7 @@ "num_samples": 2176, "num_queries": 925, "num_documents": 1251, + "num_relevant_docs": 1286, "min_document_length": 8, "average_document_length": 24.753796962430055, "max_document_length": 97, @@ -350,6 +377,7 @@ "average_query_length": 63.84540540540541, "max_query_length": 246, "unique_queries": 925, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.3902702702702703, "max_relevant_docs_per_query": 5, @@ -368,6 +396,7 @@ "num_samples": 2431, "num_queries": 925, "num_documents": 1506, + "num_relevant_docs": 1670, "min_document_length": 8, "average_document_length": 20.562416998671978, "max_document_length": 97, @@ -376,6 +405,7 @@ "average_query_length": 173.6810810810811, "max_query_length": 2000, "unique_queries": 925, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.8054054054054054, "max_relevant_docs_per_query": 5, @@ -394,6 +424,7 @@ "num_samples": 2163, "num_queries": 912, "num_documents": 1251, + "num_relevant_docs": 1286, "min_document_length": 8, "average_document_length": 25.50519584332534, "max_document_length": 118, @@ -402,6 +433,7 @@ "average_query_length": 64.75548245614036, "max_query_length": 246, "unique_queries": 912, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.4100877192982457, "max_relevant_docs_per_query": 8, @@ -420,6 +452,7 @@ "num_samples": 1935, "num_queries": 663, "num_documents": 1272, + "num_relevant_docs": 1276, "min_document_length": 13, "average_document_length": 25.617924528301888, "max_document_length": 134, @@ -428,6 +461,7 @@ "average_query_length": 114.68778280542986, "max_query_length": 293, "unique_queries": 663, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9245852187028658, "max_relevant_docs_per_query": 5, @@ -446,6 +480,7 @@ "num_samples": 1964, "num_queries": 663, "num_documents": 1301, + "num_relevant_docs": 1316, "min_document_length": 13, "average_document_length": 25.046887009992314, "max_document_length": 134, @@ -454,6 +489,7 @@ "average_query_length": 241.5052790346908, "max_query_length": 1561, "unique_queries": 663, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9849170437405732, "max_relevant_docs_per_query": 5, @@ -472,6 +508,7 @@ "num_samples": 1935, "num_queries": 663, "num_documents": 1272, + "num_relevant_docs": 1276, "min_document_length": 11, "average_document_length": 25.56132075471698, "max_document_length": 131, @@ -480,6 +517,7 @@ "average_query_length": 114.68778280542986, "max_query_length": 293, "unique_queries": 663, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9245852187028658, "max_relevant_docs_per_query": 5, @@ -498,6 +536,7 @@ "num_samples": 2426, "num_queries": 825, "num_documents": 1601, + "num_relevant_docs": 1601, "min_document_length": 5, "average_document_length": 12.004996876951905, "max_document_length": 49, @@ -506,6 +545,7 @@ "average_query_length": 79.62424242424242, "max_query_length": 368, "unique_queries": 825, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9406060606060607, "max_relevant_docs_per_query": 5, @@ -524,6 +564,7 @@ "num_samples": 2570, "num_queries": 825, "num_documents": 1745, + "num_relevant_docs": 1748, "min_document_length": 5, "average_document_length": 11.01432664756447, "max_document_length": 49, @@ -532,6 +573,7 @@ "average_query_length": 267.0690909090909, "max_query_length": 1116, "unique_queries": 825, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.1187878787878787, "max_relevant_docs_per_query": 5, @@ -550,6 +592,7 @@ "num_samples": 2423, "num_queries": 822, "num_documents": 1601, + "num_relevant_docs": 1601, "min_document_length": 13, "average_document_length": 26.398500936914427, "max_document_length": 154, @@ -558,6 +601,7 @@ "average_query_length": 79.91484184914842, "max_query_length": 368, "unique_queries": 822, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.9476885644768855, "max_relevant_docs_per_query": 6, @@ -576,6 +620,7 @@ "num_samples": 1543, "num_queries": 654, "num_documents": 889, + "num_relevant_docs": 1023, "min_document_length": 4, "average_document_length": 16.050618672665916, "max_document_length": 149, @@ -584,6 +629,7 @@ "average_query_length": 42.448012232415905, "max_query_length": 231, "unique_queries": 654, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.5642201834862386, "max_relevant_docs_per_query": 5, @@ -602,6 +648,7 @@ "num_samples": 1823, "num_queries": 654, "num_documents": 1169, + "num_relevant_docs": 1277, "min_document_length": 4, "average_document_length": 12.206159110350727, "max_document_length": 149, @@ -610,6 +657,7 @@ "average_query_length": 200.93272171253824, "max_query_length": 1948, "unique_queries": 654, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.952599388379205, "max_relevant_docs_per_query": 5, @@ -628,6 +676,7 @@ "num_samples": 1503, "num_queries": 614, "num_documents": 889, + "num_relevant_docs": 1023, "min_document_length": 5, "average_document_length": 30.35658042744657, "max_document_length": 298, @@ -636,6 +685,7 @@ "average_query_length": 45.21335504885994, "max_query_length": 231, "unique_queries": 614, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.6661237785016287, "max_relevant_docs_per_query": 9, @@ -654,6 +704,7 @@ "num_samples": 2364, "num_queries": 785, "num_documents": 1579, + "num_relevant_docs": 1633, "min_document_length": 8, "average_document_length": 26.707409753008232, "max_document_length": 150, @@ -662,6 +713,7 @@ "average_query_length": 101.9171974522293, "max_query_length": 219, "unique_queries": 785, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.080254777070064, "max_relevant_docs_per_query": 5, @@ -680,6 +732,7 @@ "num_samples": 2538, "num_queries": 785, "num_documents": 1753, + "num_relevant_docs": 1873, "min_document_length": 8, "average_document_length": 24.056474614945806, "max_document_length": 150, @@ -688,6 +741,7 @@ "average_query_length": 252.27388535031847, "max_query_length": 1459, "unique_queries": 785, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.385987261146497, "max_relevant_docs_per_query": 5, @@ -706,6 +760,7 @@ "num_samples": 2356, "num_queries": 777, "num_documents": 1579, + "num_relevant_docs": 1633, "min_document_length": 5, "average_document_length": 26.67067764407853, "max_document_length": 180, @@ -714,6 +769,7 @@ "average_query_length": 102.96653796653797, "max_query_length": 219, "unique_queries": 777, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.101673101673102, "max_relevant_docs_per_query": 6, @@ -732,6 +788,7 @@ "num_samples": 2422, "num_queries": 800, "num_documents": 1622, + "num_relevant_docs": 1712, "min_document_length": 9, "average_document_length": 21.005548705302097, "max_document_length": 126, @@ -740,6 +797,7 @@ "average_query_length": 154.05875, "max_query_length": 500, "unique_queries": 800, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.14, "max_relevant_docs_per_query": 5, @@ -758,6 +816,7 @@ "num_samples": 2439, "num_queries": 800, "num_documents": 1639, + "num_relevant_docs": 1775, "min_document_length": 9, "average_document_length": 20.787675411836485, "max_document_length": 126, @@ -766,6 +825,7 @@ "average_query_length": 228.2825, "max_query_length": 1206, "unique_queries": 800, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.21875, "max_relevant_docs_per_query": 5, @@ -784,6 +844,7 @@ "num_samples": 2419, "num_queries": 797, "num_documents": 1622, + "num_relevant_docs": 1712, "min_document_length": 9, "average_document_length": 22.887792848335387, "max_document_length": 136, @@ -792,6 +853,7 @@ "average_query_length": 154.63864491844416, "max_query_length": 500, "unique_queries": 797, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.148055207026349, "max_relevant_docs_per_query": 6, @@ -810,6 +872,7 @@ "num_samples": 2057, "num_queries": 782, "num_documents": 1275, + "num_relevant_docs": 1329, "min_document_length": 3, "average_document_length": 20.40392156862745, "max_document_length": 146, @@ -818,6 +881,7 @@ "average_query_length": 105.79923273657289, "max_query_length": 441, "unique_queries": 782, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.6994884910485935, "max_relevant_docs_per_query": 5, @@ -836,6 +900,7 @@ "num_samples": 2266, "num_queries": 782, "num_documents": 1484, + "num_relevant_docs": 1584, "min_document_length": 3, "average_document_length": 17.53032345013477, "max_document_length": 146, @@ -844,6 +909,7 @@ "average_query_length": 184.0076726342711, "max_query_length": 1240, "unique_queries": 782, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.0255754475703327, "max_relevant_docs_per_query": 5, @@ -862,6 +928,7 @@ "num_samples": 2044, "num_queries": 769, "num_documents": 1275, + "num_relevant_docs": 1329, "min_document_length": 6, "average_document_length": 20.975686274509805, "max_document_length": 162, @@ -870,6 +937,7 @@ "average_query_length": 107.58777633289986, "max_query_length": 441, "unique_queries": 769, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.728218465539662, "max_relevant_docs_per_query": 17, @@ -888,6 +956,7 @@ "num_samples": 2529, "num_queries": 824, "num_documents": 1705, + "num_relevant_docs": 1707, "min_document_length": 5, "average_document_length": 5.901466275659824, "max_document_length": 29, @@ -896,6 +965,7 @@ "average_query_length": 43.36771844660194, "max_query_length": 236, "unique_queries": 824, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.0716019417475726, "max_relevant_docs_per_query": 5, @@ -914,6 +984,7 @@ "num_samples": 2587, "num_queries": 824, "num_documents": 1763, + "num_relevant_docs": 1865, "min_document_length": 5, "average_document_length": 5.7073170731707314, "max_document_length": 29, @@ -922,6 +993,7 @@ "average_query_length": 231.748786407767, "max_query_length": 965, "unique_queries": 824, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.2633495145631066, "max_relevant_docs_per_query": 5, @@ -940,6 +1012,7 @@ "num_samples": 2525, "num_queries": 820, "num_documents": 1705, + "num_relevant_docs": 1707, "min_document_length": 10, "average_document_length": 19.835777126099707, "max_document_length": 130, @@ -948,6 +1021,7 @@ "average_query_length": 43.579268292682926, "max_query_length": 236, "unique_queries": 820, + "none_queries": 0, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.0817073170731706, "max_relevant_docs_per_query": 6, diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index b72539c3e0..40acaca430 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -531,26 +531,17 @@ def test_empty_descriptive_stat_in_new_datasets(task: AbsTask): # DON'T ADD NEW DATASETS TO THIS LIST # THIS IS ONLY INTENDED FOR HISTORIC DATASETS exceptions = [ - "FEVER", - "HotpotQA", - "MSMARCO", "MSMARCOv2", - "TopiOCQA", - "MIRACLRetrieval", - "MrTidyRetrieval", "BrightRetrieval", - "MultiLongDocRetrieval", "NeuCLIR2022Retrieval", "NeuCLIR2023Retrieval", "BibleNLPBitextMining", "FloresBitextMining", "FilipinoHateSpeechClassification", "SwissJudgementClassification", - "MultiEURLEXMultilabelClassification", "MindSmallReranking", "WebLINXCandidatesReranking", "VoyageMMarcoReranking", - "MIRACLReranking", ] if task.metadata.name.startswith("Mock"): diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index e4cf91e0d8..ad011dfcef 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -1130,6 +1130,7 @@ class MockRerankingTask(AbsTaskReranking): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1199,6 +1200,7 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "num_samples": 8, "num_queries": 4, "num_documents": 4, + "num_relevant_docs": 8, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1226,6 +1228,7 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1253,6 +1256,7 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1334,6 +1338,7 @@ class MockRetrievalTask(AbsTaskRetrieval): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1397,6 +1402,7 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): "num_samples": 8, "num_queries": 4, "num_documents": 4, + "num_relevant_docs": 8, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1424,6 +1430,7 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1451,6 +1458,7 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1715,6 +1723,7 @@ class MockInstructionRetrieval(AbsTaskRetrieval): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1783,6 +1792,7 @@ class MockInstructionReranking(AbsTaskReranking): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1856,6 +1866,7 @@ class MockMultilingualInstructionRetrieval(AbsTaskRetrieval, MultilingualTask): "num_samples": 8, "num_queries": 4, "num_documents": 4, + "num_relevant_docs": 8, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1883,6 +1894,7 @@ class MockMultilingualInstructionRetrieval(AbsTaskRetrieval, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1910,6 +1922,7 @@ class MockMultilingualInstructionRetrieval(AbsTaskRetrieval, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -1997,6 +2010,7 @@ class MockMultilingualInstructionReranking(AbsTaskReranking, MultilingualTask): "num_samples": 8, "num_queries": 4, "num_documents": 4, + "num_relevant_docs": 8, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -2024,6 +2038,7 @@ class MockMultilingualInstructionReranking(AbsTaskReranking, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29, @@ -2051,6 +2066,7 @@ class MockMultilingualInstructionReranking(AbsTaskReranking, MultilingualTask): "num_samples": 4, "num_queries": 2, "num_documents": 2, + "num_relevant_docs": 4, "min_document_length": 23, "average_document_length": 26.0, "max_document_length": 29,