fix: Leaderboard demo data loading (#1507)
* Made get_scores error tolerant

* Added join_revisions, made get_scores failsafe

* Fixed metadata fetching for HF models

* Added failsafe metadata fetching to leaderboard code

* Added revision joining to leaderboard app

* fix

* Only show models that have metadata when filter_models is called

* Ran linting
x-tabdeveloping authored Nov 27, 2024
1 parent cde720e commit 0affa31
Showing 4 changed files with 169 additions and 59 deletions.
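Taken together, these changes make the leaderboard's data loading tolerant of missing metadata and duplicate revisions. Below is a minimal sketch of the intended flow, assuming load_results is importable from the top-level mteb package as in the leaderboard app; the exact call sites and ordering in app.py may differ.

import mteb

results = mteb.load_results()        # BenchmarkResults for all available runs
results = results.join_revisions()   # keep one revision's results per model/task
results = results.filter_models()    # drop models without registry metadata

# get_scores now warns and skips failing entries instead of raising,
# so a single malformed result no longer breaks the whole table.
scores_long = results.get_scores(format="long")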
2 changes: 1 addition & 1 deletion mteb/leaderboard/app.py
@@ -81,7 +81,7 @@ def update_task_info(task_names: str) -> gr.DataFrame:
return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1))


all_results = load_results().filter_models()
all_results = load_results().join_revisions()

# Model sizes in million parameters
min_model_size, max_model_size = 0, 10_000
10 changes: 9 additions & 1 deletion mteb/leaderboard/table.py
@@ -98,6 +98,13 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
return pd.DataFrame.from_records(records)


def failsafe_get_model_meta(model_name):
try:
return get_model_meta(model_name)
except Exception as e:
return None


def scores_to_tables(
scores_long: list[dict], search_query: str | None = None
) -> tuple[gr.DataFrame, gr.DataFrame]:
@@ -132,7 +139,8 @@ def scores_to_tables(
joint_table["borda_rank"] = get_borda_rank(per_task)
joint_table = joint_table.reset_index()
joint_table = joint_table.drop(columns=["model_revision"])
model_metas = joint_table["model_name"].map(get_model_meta)
model_metas = joint_table["model_name"].map(failsafe_get_model_meta)
joint_table = joint_table[model_metas.notna()]
joint_table["model_link"] = model_metas.map(lambda m: m.reference)
joint_table.insert(
1,
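As an aside, the failsafe-metadata pattern introduced in this file can be illustrated on toy data. The lookup below is only a stand-in for failsafe_get_model_meta, which returns None whenever metadata cannot be fetched; rows without metadata are then dropped before the table is built.

import pandas as pd

def toy_meta_lookup(name: str):
    # Stand-in for failsafe_get_model_meta: metadata for known models, None otherwise.
    known = {"model-a": {"reference": "https://example.org/model-a"}}
    return known.get(name)

joint_table = pd.DataFrame({"model_name": ["model-a", "model-b"], "mean_score": [0.71, 0.69]})
metas = joint_table["model_name"].map(toy_meta_lookup)
joint_table = joint_table[metas.notna()]  # "model-b" is dropped: no metadata available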
175 changes: 125 additions & 50 deletions mteb/load_results/benchmark_results.py
@@ -1,12 +1,15 @@
from __future__ import annotations

import json
import warnings
from collections import defaultdict
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Callable, Literal
from typing import Any, Callable, Literal, Optional

import numpy as np
import pandas as pd
from packaging.version import InvalidVersion, Version
from pydantic import BaseModel, ConfigDict

from mteb.abstasks.AbsTask import AbsTask, ScoresDict
@@ -89,36 +92,45 @@ def get_scores(
        format: Literal["wide", "long"] = "wide",
    ) -> dict | list:
        if format == "wide":
            scores = {
                res.task_name: res.get_score(
                    splits=splits,
                    languages=languages,
                    scripts=scripts,
                    getter=getter,
                    aggregation=aggregation,
                )
                for res in self.task_results
            }
            return scores
        if format == "long":
            entries = []
            for task_res in self.task_results:
                entry = dict(  # noqa
                    model_name=self.model_name,
                    model_revision=self.model_revision,
                    task_name=task_res.task_name,
                    score=task_res.get_score(
                        splits=splits,
                        languages=languages,
                        scripts=scripts,
                        getter=getter,
                        aggregation=aggregation,
                    ),
                    mteb_version=task_res.mteb_version,
                    dataset_revision=task_res.dataset_revision,
                    evaluation_time=task_res.evaluation_time,
                    kg_co2_emissions=task_res.kg_co2_emissions,
                )
                entries.append(entry)
            return entries
        if format == "wide":
            scores = {}
            for res in self.task_results:
                try:
                    scores[res.task_name] = res.get_score(
                        splits=splits,
                        languages=languages,
                        scripts=scripts,
                        getter=getter,
                        aggregation=aggregation,
                    )
                except Exception as e:
                    warnings.warn(
                        f"Couldn't get scores for {res.task_name} due to {e}."
                    )
            return scores
        if format == "long":
            entries = []
            for task_res in self.task_results:
                try:
                    entry = dict(  # noqa
                        model_name=self.model_name,
                        model_revision=self.model_revision,
                        task_name=task_res.task_name,
                        score=task_res.get_score(
                            splits=splits,
                            languages=languages,
                            scripts=scripts,
                            getter=getter,
                            aggregation=aggregation,
                        ),
                        mteb_version=task_res.mteb_version,
                        dataset_revision=task_res.dataset_revision,
                        evaluation_time=task_res.evaluation_time,
                        kg_co2_emissions=task_res.kg_co2_emissions,
                    )
                    entries.append(entry)
                except Exception as e:
                    warnings.warn(
                        f"Couldn't get scores for {task_res.task_name} due to {e}."
                    )
            return entries

def __iter__(self):
@@ -198,6 +210,8 @@ def filter_models(
n_parameters_range: tuple[int | None, int | None] = (None, None),
use_instructions: bool | None = None,
) -> BenchmarkResults:
# if model_names is None:
# model_names = [model_res.model_name for model_res in self]
model_metas = get_model_metas(
model_names=model_names,
languages=languages,
@@ -206,13 +220,64 @@
n_parameters_range=n_parameters_range,
use_instructions=use_instructions,
)
model_revision_pairs = {(meta.name, meta.revision) for meta in model_metas}
models = {meta.name for meta in model_metas}
# model_revision_pairs = {(meta.name, meta.revision) for meta in model_metas}
new_model_results = []
for model_res in self:
if (model_res.model_name, model_res.model_revision) in model_revision_pairs:
if model_res.model_name in models:
new_model_results.append(model_res)
return type(self).model_construct(model_results=new_model_results)

def join_revisions(self):
def parse_version(version_str: str) -> Optional[Version]:
try:
return Version(version_str)
except (InvalidVersion, TypeError):
return None

def keep_best(group: pd.DataFrame) -> pd.DataFrame:
is_main_revision = group["revision"] == group["main_revision"]
if is_main_revision.sum() == 1:
return group[is_main_revision]
if group["mteb_version"].notna().any():
group = group.dropna(subset=["mteb_version"])
group = group.sort_values("mteb_version", ascending=False)
return group.head(n=1)
return group.head(n=1)

records = []
for model_result in self:
for task_result in model_result:
records.append(
dict(
model=model_result.model_name,
revision=model_result.model_revision,
task_name=task_result.task_name,
mteb_version=task_result.mteb_version,
task_result=task_result,
)
)
task_df = pd.DataFrame.from_records(records)
model_to_main_revision = {
meta.name: meta.revision for meta in get_model_metas()
}
task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
task_df["mteb_version"] = task_df["mteb_version"].map(parse_version)
task_df = (
task_df.groupby(["model", "task_name"])
.apply(keep_best)
.reset_index(drop=True)
)
model_results = []
for (model, model_revision), group in task_df.groupby(["model", "revision"]):
model_result = ModelResult.model_construct(
model_name=model,
model_revision=model_revision,
task_results=list(group["task_result"]),
)
model_results.append(model_result)
return BenchmarkResults.model_construct(model_results=model_results)
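The selection rule inside keep_best can be seen on a toy group (illustrative data only; the columns mirror the task_df built above): a run on the model's main revision wins outright, and only otherwise does the newest mteb version decide.

import pandas as pd
from packaging.version import Version

group = pd.DataFrame(
    {
        "revision": ["aaa111", "bbb222"],
        "main_revision": ["bbb222", "bbb222"],
        "mteb_version": [Version("1.20.0"), Version("1.12.0")],
    }
)
is_main = group["revision"] == group["main_revision"]
# Exactly one row matches the main revision, so it is kept even though the
# other run was produced with a newer mteb version.
best = group[is_main] if is_main.sum() == 1 else group.sort_values(
    "mteb_version", ascending=False
).head(1)
print(best["revision"].item())  # bbb222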

def get_scores(
self,
splits: list[Split] | None = None,
@@ -225,33 +290,43 @@
        entries = []
        if format == "wide":
            for model_res in self:
                model_scores = model_res.get_scores(
                    splits=splits,
                    languages=languages,
                    scripts=scripts,
                    getter=getter,
                    aggregation=aggregation,
                    format="wide",
                )
                entries.append(
                    {
                        "model": model_res.model_name,
                        "revision": model_res.model_revision,
                        **model_scores,
                    }
                )
        if format == "long":
            for model_res in self:
                entries.extend(
                    model_res.get_scores(
                        splits=splits,
                        languages=languages,
                        scripts=scripts,
                        getter=getter,
                        aggregation=aggregation,
                        format="long",
                    )
                )
        if format == "wide":
            for model_res in self:
                try:
                    model_scores = model_res.get_scores(
                        splits=splits,
                        languages=languages,
                        scripts=scripts,
                        getter=getter,
                        aggregation=aggregation,
                        format="wide",
                    )
                    entries.append(
                        {
                            "model": model_res.model_name,
                            "revision": model_res.model_revision,
                            **model_scores,
                        }
                    )
                except Exception as e:
                    warnings.warn(
                        f"Couldn't get scores for {model_res.model_name}({model_res.model_revision}), due to: {e}"
                    )
        if format == "long":
            for model_res in self:
                try:
                    entries.extend(
                        model_res.get_scores(
                            splits=splits,
                            languages=languages,
                            scripts=scripts,
                            getter=getter,
                            aggregation=aggregation,
                            format="long",
                        )
                    )
                except Exception as e:
                    warnings.warn(
                        f"Couldn't get scores for {model_res.model_name}({model_res.model_revision}), due to: {e}"
                    )
        return entries

def __iter__(self):
41 changes: 34 additions & 7 deletions mteb/models/overview.py
@@ -2,8 +2,10 @@

import logging
from collections.abc import Iterable
from functools import lru_cache
from typing import Any

from huggingface_hub import ModelCard
from sentence_transformers import SentenceTransformer

from mteb.encoder_interface import Encoder
@@ -152,21 +154,46 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta:
return MODEL_REGISTRY[model_name]
else: # assume it is a sentence-transformers model
logger.info(
"Model not found in model registry, assuming it is a sentence-transformers model."
"Model not found in model registry, assuming it is on HF Hub model."
)
logger.info(
f"Attempting to extract metadata by loading the model ({model_name}) using sentence-transformers."
f"Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
)
model = SentenceTransformer(
model_name, revision=revision, trust_remote_code=True
)
meta = model_meta_from_sentence_transformers(model)

meta = model_meta_from_hf_hub(model_name)
meta.revision = revision
meta.name = model_name
return meta


@lru_cache
def model_meta_from_hf_hub(model_name: str) -> ModelMeta:
try:
card = ModelCard.load(model_name)
card_data = card.data.to_dict()
frameworks = ["PyTorch"]
if card_data.get("library_name", None) == "sentence-transformers":
frameworks.append("Sentence Transformers")
return ModelMeta(
name=model_name,
revision=None,
# TODO
release_date=None,
# TODO: We need a mapping between conflicting language codes
languages=None,
license=card_data.get("license", None),
framework=frameworks,
public_training_data=bool(card_data.get("datasets", None)),
)
except Exception as e:
logger.warning(f"Failed to extract metadata from model: {e}.")
return ModelMeta(
name=None,
revision=None,
languages=None,
release_date=None,
)
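A hypothetical usage of the new fallback path (the model id below is made up): an id that is not in MODEL_REGISTRY is resolved via its Hub model card, and if the card cannot be loaded the degenerate ModelMeta above is returned, with name and revision filled in afterwards by get_model_meta.

from mteb.models.overview import get_model_meta

# Made-up id: not in the registry, so metadata is read from the Hub card;
# if that fails too, a mostly-empty ModelMeta comes back instead of an exception.
meta = get_model_meta("some-org/new-embedding-model", revision="main")
print(meta.name, meta.revision)  # the requested id and revision, even on fallback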


def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
try:
name = (
