diff --git a/_data/leaderboard/vi/bias_toxicity/qa.yml b/_data/leaderboard/vi/bias_toxicity/qa.yml new file mode 100644 index 0000000..103ac14 --- /dev/null +++ b/_data/leaderboard/vi/bias_toxicity/qa.yml @@ -0,0 +1,146 @@ +XQuAD: + URA-LLaMa 70B: + DRR: null + DRG: 0.39 + DRG_std: 0.01 + SAR: null + SAG: 0.41 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.39 + DRG_std: 0.01 + SAR: null + SAG: 0.45 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.43 + DRG_std: 0.01 + SAR: null + SAG: 0.48 + SAG_std: 0.00 + Tox: 0.03 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.35 + DRG_std: 0.03 + SAR: null + SAG: 0.46 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.46 + DRG_std: 0.01 + SAR: null + SAG: 0.42 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.50 + DRG_std: 0.00 + SAR: null + SAG: null + SAG_std: null + Tox: 0.04 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.43 + DRG_std: 0.01 + SAR: null + SAG: 0.48 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.40 + DRG_std: 0.01 + SAR: null + SAG: 0.45 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 +MLQA: + URA-LLaMa 70B: + DRR: null + DRG: 0.14 + DRG_std: 0.02 + SAR: null + SAG: 0.42 + SAG_std: 0.03 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.17 + DRG_std: 0.1 + SAR: null + SAG: 0.38 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.18 + DRG_std: 0.01 + SAR: null + SAG: 0.37 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.27 + DRG_std: 0.01 + SAR: null + SAG: 0.43 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.21 + DRG_std: 0.06 + SAR: null + SAG: 0.45 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.23 + DRG_std: 0.09 + SAR: null + SAG: 0.49 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.18 + DRG_std: 0.01 + SAR: null + SAG: 0.40 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.16 + DRG_std: 0.01 + SAR: null + SAG: 0.41 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/models.yml b/_data/leaderboard/vi/models.yml new file mode 100644 index 0000000..77e4161 --- /dev/null +++ b/_data/leaderboard/vi/models.yml @@ -0,0 +1,9 @@ +models: + - URA-LLaMa 70B + - URA-LLaMa 13B + - URA-LLaMa 7B + - LLaMa-2 13B + - LLaMa-2 7B + - Vietcuna 7B + - GPT-3.5 + - GPT-4 \ No newline at end of file diff --git a/_pages/vi/bias-toxicity/question-answering.md b/_pages/vi/bias-toxicity/question-answering.md index f932c3f..f51972a 100644 --- a/_pages/vi/bias-toxicity/question-answering.md +++ b/_pages/vi/bias-toxicity/question-answering.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/vi/bias-toxicity/question-answering --- # Bias-Toxicity Question Answering Leaderboard +
{{page.lang}}
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsXQuADMLQA
DRR→|DRG→|SAR→|SAG→|Tox↓DRR→|DRG→|SAR→|SAG→|Tox↓
URA-LLaMa 70B-0.39 ± 0.01-0.41 ± 0.000.02 ± 0.00-0.14 ± 0.02-0.42 ± 0.030.02 ± 0.00
URA-LLaMa 13B-0.39 ± 0.01-0.45 ± 0.010.02 ± 0.00-0.17 ± 0.1-0.38 ± 0.000.02 ± 0.00
URA-LLaMa 7B-0.43 ± 0.01-0.48 ± 0.000.03 ± 0.00-0.18 ± 0.01-0.37 ± 0.010.02 ± 0.00
LLaMa-2 13B-0.35 ± 0.03-0.46 ± 0.000.01 ± 0.00-0.27 ± 0.01-0.43 ± 0.000.01 ± 0.00
LLaMa-2 7B-0.46 ± 0.01-0.42 ± 0.000.01 ± 0.00-0.21 ± 0.06-0.45 ± 0.000.01 ± 0.00
Vietcuna 7B-0.50 ± 0.00--0.04 ± 0.00-0.23 ± 0.09-0.49 ± 0.010.04 ± 0.00
GPT-3.5-0.43 ± 0.01-0.48 ± 0.000.02 ± 0.00-0.18 ± 0.01-0.40 ± 0.000.02 ± 0.00
GPT-4-0.40 ± 0.01-0.45 ± 0.000.02 ± 0.00-0.16 ± 0.01-0.41 ± 0.010.02 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard.vi.bias_toxicity.qa %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard.vi.bias_toxicity.qa %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard.vi.models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard.vi.bias_toxicity.qa %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard.vi.models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_site/contact/index.html b/_site/contact/index.html index e8a1d62..dc222e4 100644 --- a/_site/contact/index.html +++ b/_site/contact/index.html @@ -12,7 +12,7 @@ MELT - + @@ -20,7 +20,7 @@