Fetch model results from HF #43

Merged · 21 commits · Nov 15, 2024 (diff shown from 1 commit)
179 changes: 179 additions & 0 deletions refresh.py
@@ -0,0 +1,179 @@
from __future__ import annotations

import json
import logging
import re
from pathlib import Path

from huggingface_hub import HfApi, get_hf_file_metadata, hf_hub_download, hf_hub_url
from huggingface_hub.errors import NotASafetensorsRepoError
from huggingface_hub.hf_api import ModelInfo
from huggingface_hub.repocard import metadata_load
from mteb import get_task, ModelMeta
from tqdm.autonotebook import tqdm

API = HfApi()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


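# Map HF library tags to the display names used in the model metadata.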
library_mapping = {
    "sentence-transformers": "Sentence Transformers",
}


def get_model_dir(model_id: str) -> Path:
    external_result_dir = Path("results") / model_id.replace("/", "__") / "external"
    external_result_dir.mkdir(parents=True, exist_ok=True)
    return external_result_dir


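# e.g. "MTEB AmazonReviewsClassification (en)" -> "AmazonReviewsClassification".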
def simplify_dataset_name(name: str) -> str:
    return name.replace("MTEB ", "").split()[0]


def get_model_parameters_memory(model_info: ModelInfo) -> tuple[int, float]:
    # Prefer safetensors metadata, which carries an exact parameter count.
    try:
        safetensors = API.get_safetensors_metadata(model_info.id)
        num_parameters = sum(safetensors.parameter_count.values())
        return num_parameters, round(num_parameters * 4 / 1024**3, 2)
    except NotASafetensorsRepoError:
        logger.info(f"Could not find SafeTensors metadata for {model_info.id}")

    # Fall back to the PyTorch checkpoint size, assuming fp32 (4 bytes per parameter).
    filenames = [sib.rfilename for sib in model_info.siblings]
    bytes_per_param = 4
    if "pytorch_model.bin" in filenames:
        url = hf_hub_url(model_info.id, filename="pytorch_model.bin")
        meta = get_hf_file_metadata(url)
        return round(meta.size / bytes_per_param), round(meta.size / 1024**3, 2)
    if "pytorch_model.bin.index.json" in filenames:
        index_path = hf_hub_download(model_info.id, filename="pytorch_model.bin.index.json")
        with open(index_path) as f:
            index = json.load(f)
        if "metadata" in index and "total_size" in index["metadata"]:
            total_size = index["metadata"]["total_size"]
            return round(total_size / bytes_per_param), round(total_size / 1024**3, 2)
    raise ValueError(f"Could not find the model parameters for {model_info.id}")


def get_dim_seq_size(model: ModelInfo) -> tuple[int, int, int, float]:
    siblings = model.siblings or []
    filenames = [sib.rfilename for sib in siblings]
    dim, seq = None, None
    # Sentence Transformers models: the pooling config stores the embedding dimension.
    for filename in filenames:
        if re.match(r"\d+_Pooling/config.json", filename):
            st_config_path = hf_hub_download(model.id, filename=filename)
            with open(st_config_path) as f:
                dim = json.load(f).get("word_embedding_dimension", None)
            break
    # A Dense projection layer, if present, overrides the pooling dimension.
    for filename in filenames:
        if re.match(r"\d+_Dense/config.json", filename):
            st_config_path = hf_hub_download(model.id, filename=filename)
            with open(st_config_path) as f:
                dim = json.load(f).get("out_features", dim)
    if "config.json" in filenames:
        config_path = hf_hub_download(model.id, filename="config.json")
        with open(config_path) as f:
            config = json.load(f)
        if not dim:
            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", None)))
        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", None))))
    if dim is None or seq is None:
        raise Exception(f"Could not find dim or seq for model {model.id}")
    parameters, memory = get_model_parameters_memory(model)
    return dim, seq, parameters, memory


def create_model_meta(model_info: ModelInfo) -> None:
    readme_path = hf_hub_download(model_info.id, filename="README.md", etag_timeout=30)
    meta = metadata_load(readme_path)
    dim, seq, parameters, memory = get_dim_seq_size(model_info)

    release_date = str(model_info.created_at.date()) if model_info.created_at else ""
    library = [library_mapping[model_info.library_name]] if model_info.library_name in library_mapping else []
    model_meta = ModelMeta(
        name=model_info.id,
        revision=model_info.sha,
        release_date=release_date,
        open_weights=True,
        framework=library,
        license=meta.get("license"),
        embed_dim=dim,
        max_tokens=seq,
        n_parameters=parameters,
        languages=meta.get("language", []),
    )
    model_dir = get_model_dir(model_info.id)
    model_meta_path = model_dir / "model_meta.json"
    with model_meta_path.open("w") as f:
        json.dump(model_meta.model_dump(), f, indent=4)


def parse_readme(model_info: ModelInfo) -> None:
    model_id = model_info.id
    try:
        readme_path = hf_hub_download(model_id, filename="README.md", etag_timeout=30)
    except Exception:
        # Hub downloads occasionally fail transiently; retry once before giving up.
        logger.warning(f"Could not fetch README for {model_id}, trying again")
        readme_path = hf_hub_download(model_id, filename="README.md", etag_timeout=30)
    meta = metadata_load(readme_path)
    if "model-index" not in meta:
        logger.info(f"Could not find model-index in {model_id}")
        return
    model_index = meta["model-index"][0]
    results = model_index.get("results", [])
    model_results = {}
    for result in results:
        # Group results by dataset type; each dataset becomes one output file.
        dataset = result["dataset"]
        dataset_type = dataset.get("type", "")
        if dataset_type not in model_results:
            output_dict = {
                "dataset_revision": dataset.get("revision", ""),
                "task_name": simplify_dataset_name(dataset.get("name", "")),
                "evaluation_time": 0,
                "mteb_version": "0.0.0",
                "scores": {},
            }
        else:
            output_dict = model_results[dataset_type]
        try:
            mteb_task = get_task(output_dict["task_name"])
            mteb_task_eval_languages = mteb_task.metadata.eval_langs
        except Exception:
            logger.info(f"Error getting task for {model_id} {output_dict['task_name']}")
            continue
        scores_dict = output_dict["scores"]
        current_split = dataset.get("split", "")
        current_config = dataset.get("config", "")
        cur_split_metrics = {
            "hf_subset": current_config,
            "languages": mteb_task_eval_languages
            if isinstance(mteb_task_eval_languages, list)
            else mteb_task_eval_languages.get(current_config, "None"),
        }
        for metric in result["metrics"]:
            cur_split_metrics[metric["type"]] = metric["value"]

        # The model card lists raw metrics; copy the task's main metric into main_score.
        if "main_score" not in cur_split_metrics:
            cur_split_metrics["main_score"] = cur_split_metrics.get(mteb_task.metadata.main_score, 0)
        scores_dict.setdefault(current_split, []).append(cur_split_metrics)
        model_results[dataset_type] = output_dict
    model_dir = get_model_dir(model_id)
    for task_result in model_results.values():
        result_file = model_dir / f"{task_result['task_name']}.json"
        with result_file.open("w") as f:
            json.dump(task_result, f, indent=4)


def get_mteb_data() -> None:
    models = list(API.list_models(filter="mteb", full=True))
    # Restrict to a single model for now; drop this filter to refresh every mteb-tagged model.
    models = [model for model in models if model.id == "intfloat/multilingual-e5-large"]
    for i, model in enumerate(models, start=1):
        logger.info(f"[{i}/{len(models)}] Processing {model.id}")
        create_model_meta(model)
        parse_readme(model)


if __name__ == "__main__":
    get_mteb_data()
50 changes: 50 additions & 0 deletions results/intfloat__multilingual-e5-large/external/AmazonCounterfactualClassification.json
@@ -0,0 +1,50 @@
{
"dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
"task_name": "AmazonCounterfactualClassification",
"evaluation_time": 0,
"mteb_version": "0.0.0",
"scores": {
"test": [
{
"hf_subset": "en",
"languages": [
"eng-Latn"
],
"accuracy": 79.05970149253731,
"ap": 43.486574390835635,
"f1": 73.32700092140148,
"main_score": 79.05970149253731
},
{
"hf_subset": "de",
"languages": [
"deu-Latn"
],
"accuracy": 71.22055674518201,
"ap": 81.55756710830498,
"f1": 69.28271787752661,
"main_score": 71.22055674518201
},
{
"hf_subset": "en-ext",
"languages": [
"eng-Latn"
],
"accuracy": 80.41979010494754,
"ap": 29.34879922376344,
"f1": 67.62475449011278,
"main_score": 80.41979010494754
},
{
"hf_subset": "ja",
"languages": [
"jpn-Jpan"
],
"accuracy": 77.8372591006424,
"ap": 26.557560591210738,
"f1": 64.96619417368707,
"main_score": 77.8372591006424
}
]
}
}
20 changes: 20 additions & 0 deletions results/intfloat__multilingual-e5-large/external/AmazonPolarityClassification.json
@@ -0,0 +1,20 @@
{
"dataset_revision": "e2d317d38cd51312af73b3d32a06d1a08b442046",
"task_name": "AmazonPolarityClassification",
"evaluation_time": 0,
"mteb_version": "0.0.0",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"eng-Latn"
],
"accuracy": 93.489875,
"ap": 90.98758636917603,
"f1": 93.48554819717332,
"main_score": 93.489875
}
]
}
}
64 changes: 64 additions & 0 deletions results/intfloat__multilingual-e5-large/external/AmazonReviewsClassification.json
@@ -0,0 +1,64 @@
{
"dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
"task_name": "AmazonReviewsClassification",
"evaluation_time": 0,
"mteb_version": "0.0.0",
"scores": {
"test": [
{
"hf_subset": "en",
"languages": [
"eng-Latn"
],
"accuracy": 47.564,
"f1": 46.75122173518047,
"main_score": 47.564
},
{
"hf_subset": "de",
"languages": [
"deu-Latn"
],
"accuracy": 45.400000000000006,
"f1": 44.17195682400632,
"main_score": 45.400000000000006
},
{
"hf_subset": "es",
"languages": [
"spa-Latn"
],
"accuracy": 43.068,
"f1": 42.38155696855596,
"main_score": 43.068
},
{
"hf_subset": "fr",
"languages": [
"fra-Latn"
],
"accuracy": 41.89,
"f1": 40.84407321682663,
"main_score": 41.89
},
{
"hf_subset": "ja",
"languages": [
"jpn-Jpan"
],
"accuracy": 40.120000000000005,
"f1": 39.522976223819114,
"main_score": 40.120000000000005
},
{
"hf_subset": "zh",
"languages": [
"cmn-Hans"
],
"accuracy": 38.832,
"f1": 38.0392533394713,
"main_score": 38.832
}
]
}
}
47 changes: 47 additions & 0 deletions results/intfloat__multilingual-e5-large/external/ArguAna.json
@@ -0,0 +1,47 @@
{
"dataset_revision": "None",
"task_name": "ArguAna",
"evaluation_time": 0,
"mteb_version": "0.0.0",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"eng-Latn"
],
"map_at_1": 30.725,
"map_at_10": 46.055,
"map_at_100": 46.900999999999996,
"map_at_1000": 46.911,
"map_at_3": 41.548,
"map_at_5": 44.297,
"mrr_at_1": 31.152,
"mrr_at_10": 46.231,
"mrr_at_100": 47.07,
"mrr_at_1000": 47.08,
"mrr_at_3": 41.738,
"mrr_at_5": 44.468999999999994,
"ndcg_at_1": 30.725,
"ndcg_at_10": 54.379999999999995,
"ndcg_at_100": 58.138,
"ndcg_at_1000": 58.389,
"ndcg_at_3": 45.156,
"ndcg_at_5": 50.123,
"precision_at_1": 30.725,
"precision_at_10": 8.087,
"precision_at_100": 0.9769999999999999,
"precision_at_1000": 0.1,
"precision_at_3": 18.54,
"precision_at_5": 13.542000000000002,
"recall_at_1": 30.725,
"recall_at_10": 80.868,
"recall_at_100": 97.653,
"recall_at_1000": 99.57300000000001,
"recall_at_3": 55.619,
"recall_at_5": 67.71000000000001,
"main_score": 54.379999999999995
}
]
}
}
18 changes: 18 additions & 0 deletions results/intfloat__multilingual-e5-large/external/ArxivClusteringP2P.json
@@ -0,0 +1,18 @@
{
"dataset_revision": "a122ad7f3f0291bf49cc6f4d32aa80929df69d5d",
"task_name": "ArxivClusteringP2P",
"evaluation_time": 0,
"mteb_version": "0.0.0",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"eng-Latn"
],
"v_measure": 44.30960650674069,
"main_score": 44.30960650674069
}
]
}
}