Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Count unique texts, data leaks in calculate metrics #1438

Merged
merged 3 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions mteb/abstasks/AbsTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,11 @@ def calculate_metadata_metrics(

descriptive_stats = {}
hf_subset_stat = "hf_subset_descriptive_stats"
pbar_split = tqdm.tqdm(self.metadata.eval_splits, desc="Processing Splits...")
eval_splits = self.metadata.eval_splits
if self.metadata.type in ["Classification", "MultilabelClassification"]:
eval_splits += ["train"]
KennethEnevoldsen marked this conversation as resolved.
Show resolved Hide resolved

pbar_split = tqdm.tqdm(eval_splits, desc="Processing Splits...")
for split in pbar_split:
pbar_split.set_postfix_str(f"Split: {split}")
logger.info(f"Processing metadata for split {split}")
Expand All @@ -215,12 +219,8 @@ def calculate_metadata_metrics(
if isinstance(self.metadata.eval_langs, dict)
else self.metadata.eval_langs
)
if self.metadata.type == "Classification":
eval_langs += ["train"]

pbar_subsets = tqdm.tqdm(
self.metadata.eval_langs, desc="Processing Languages..."
)
pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...")
for hf_subset in pbar_subsets:
pbar_subsets.set_postfix_str(f"Language: {hf_subset}")
logger.info(f"Processing metadata for language {hf_subset}")
Expand Down
39 changes: 34 additions & 5 deletions mteb/abstasks/AbsTaskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,31 @@ class BitextDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
unique_pairs: Number of unique (sentence1, sentence2) pairs

min_sentence1_length: Minimum length of sentence1
average_sentence1_length: Average length of sentence1
max_sentence1_length: Maximum length of sentence1
unique_sentence1: Number of unique sentences in sentence1

min_sentence2_length: Minimum length of sentence2
average_sentence2_length: Average length of sentence2
max_sentence2_length: Maximum length of sentence2
unique_sentence2: Number of unique sentences in sentence2
"""

num_samples: int
number_of_characters: int
unique_pairs: int

min_sentence1_length: int
average_sentence1_length: float
max_sentence1_length: int
unique_sentence1: int

min_sentence2_length: int
average_sentence2_length: float
max_sentence2_length: int
unique_sentence2: int


class AbsTaskBitextMining(AbsTask):
Expand Down Expand Up @@ -153,12 +170,24 @@ def _calculate_metrics_from_split(
sent_1, sent_2 = pairs_cols[0]
sentence1 = self.dataset[split][sent_1]
sentence2 = self.dataset[split][sent_2]
total_s1_len = sum([len(s1) for s1 in sentence1])
total_s2_len = sum([len(s2) for s2 in sentence2])

s1_len = [len(s1) for s1 in sentence1]
s2_len = [len(s2) for s2 in sentence2]
total_s1_len = sum(s1_len)
total_s2_len = sum(s2_len)

unique_pairs = len(set(zip(sentence1, sentence2)))
unique_sentence1 = len(set(sentence1))
unique_sentence2 = len(set(sentence2))
return BitextDescriptiveStatistics(
average_sentence1_length=total_s1_len / len(sentence1),
average_sentence2_length=total_s2_len / len(sentence2),
num_samples=len(sentence1),
number_of_characters=total_s1_len + total_s2_len,
unique_pairs=unique_pairs,
min_sentence1_length=min(s1_len),
average_sentence1_length=sum(s1_len) / len(sentence1),
max_sentence1_length=max(s1_len),
unique_sentence1=unique_sentence1,
min_sentence2_length=min(s2_len),
average_sentence2_length=total_s2_len / len(sentence2),
max_sentence2_length=max(s2_len),
unique_sentence2=unique_sentence2,
)
29 changes: 28 additions & 1 deletion mteb/abstasks/AbsTaskClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,26 @@ class ClassificationDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
num_texts_in_train: Number of texts in this split that also appear in the train split (data-leak indicator); None for the train split itself

min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_text: Number of unique texts

unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int
num_texts_in_train: int | None

min_text_length: int
average_text_length: float
max_text_length: int
unique_text: int

unique_labels: int
labels: dict[str, dict[str, int]]

Expand Down Expand Up @@ -206,25 +218,40 @@ def _undersample_data(self, X, y, samples_per_label: int, idxs=None):
def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> ClassificationDescriptiveStatistics:
train_text = []
if hf_subset:
text = self.dataset[hf_subset][split]["text"]
label = self.dataset[hf_subset][split]["label"]
if split != "train":
train_text = self.dataset[hf_subset]["train"]["text"]
elif compute_overall:
text = []
label = []
for hf_subset in self.metadata.eval_langs:
text.extend(self.dataset[hf_subset][split]["text"])
label.extend(self.dataset[hf_subset][split]["label"])
if split != "train":
train_text.extend(self.dataset[hf_subset]["train"]["text"])
else:
text = self.dataset[split]["text"]
label = self.dataset[split]["label"]
if split != "train":
train_text = self.dataset["train"]["text"]

total_text_len = sum([len(t) for t in text])
text_len = [len(t) for t in text]
total_text_len = sum(text_len)
label_count = Counter(label)
num_texts_in_train = (
len(set(text) & set(train_text)) if split != "train" else None
)
return ClassificationDescriptiveStatistics(
num_samples=len(text),
number_of_characters=total_text_len,
num_texts_in_train=num_texts_in_train,
min_text_length=min(text_len),
average_text_length=total_text_len / len(text),
max_text_length=max(text_len),
unique_text=len(set(text)),
unique_labels=len(label_count),
labels={
str(label): {"count": count} for label, count in label_count.items()
Expand Down
26 changes: 25 additions & 1 deletion mteb/abstasks/AbsTaskClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,31 @@ class ClusteringDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.

min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_texts: Number of unique texts

min_labels_per_text: Minimum number of labels per text
average_labels_per_text: Average number of labels per text
max_labels_per_text: Maximum number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int

min_text_length: int
average_text_length: float
max_text_length: int
unique_texts: int

min_labels_per_text: int
average_labels_per_text: float
max_labels_per_text: int

unique_labels: int
labels: dict[str, dict[str, int]]

Expand Down Expand Up @@ -96,7 +111,11 @@ def _calculate_metrics_from_split(
sentences = self.dataset[split]["sentences"]
labels = self.dataset[split]["labels"]

total_text_len = sum([len(t) for t in sentences])
text_len = [len(t) for t in sentences]
all_sentences = []
for s in sentences:
all_sentences.extend(s)
total_text_len = sum(text_len)
total_labels = []
for label in labels:
if isinstance(label, list):
Expand All @@ -107,8 +126,13 @@ def _calculate_metrics_from_split(
return ClusteringDescriptiveStatistics(
num_samples=len(sentences),
number_of_characters=total_text_len,
min_text_length=min(text_len),
average_text_length=total_text_len / len(sentences),
max_text_length=max(text_len),
unique_texts=len(set(all_sentences)),
min_labels_per_text=min(label_counter.values()),
average_labels_per_text=len(total_labels) / len(sentences),
max_labels_per_text=max(label_counter.values()),
unique_labels=len(label_counter),
labels={
str(label): {
Expand Down
21 changes: 20 additions & 1 deletion mteb/abstasks/AbsTaskClusteringFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,30 @@ class ClusteringFastDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.

min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_texts: Number of unique texts

min_labels_per_text: Minimum number of labels per text
average_labels_per_text: Average number of labels per text
max_labels_per_text: Maximum number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int

min_text_length: int
average_text_length: float
max_text_length: int
unique_texts: int

min_labels_per_text: int
average_labels_per_text: float
max_labels_per_text: int
unique_labels: int
labels: dict[str, dict[str, int]]

Expand Down Expand Up @@ -226,7 +240,8 @@ def _calculate_metrics_from_split(
sentences = self.dataset[split]["sentences"]
labels = self.dataset[split]["labels"]

total_text_len = sum([len(t) for t in sentences])
text_len = [len(t) for t in sentences]
total_text_len = sum(text_len)
total_labels = []
for label in labels:
if isinstance(label, list):
Expand All @@ -237,8 +252,12 @@ def _calculate_metrics_from_split(
return ClusteringFastDescriptiveStatistics(
num_samples=len(sentences),
number_of_characters=total_text_len,
min_text_length=min(text_len),
average_text_length=total_text_len / len(sentences),
max_text_length=max(text_len),
min_labels_per_text=min(label_counter.values()),
average_labels_per_text=len(total_labels) / len(sentences),
max_labels_per_text=max(label_counter.values()),
unique_labels=len(label_counter),
labels={
str(label): {
Expand Down
Loading
Loading