From d75f69cd1d5bdd5af633764365075bdbd0135d17 Mon Sep 17 00:00:00 2001 From: Le Dong <74060032+ledong0110@users.noreply.github.com> Date: Fri, 6 Sep 2024 00:36:09 +0700 Subject: [PATCH] 5 inputoutput result of tables (#7) --- _config.yml | 2 +- _data/categories.yml | 1 + _data/lang_tasks.yml | 2 + .../vi/bias_toxicity/question_answering.yml | 146 ++ .../vi/bias_toxicity/summarization.yml | 146 ++ .../vi/bias_toxicity/translation.yml | 146 ++ .../vi/chain_of_thought/reasoning.yml | 64 + .../fairness_aware/information_retrieval.yml | 146 ++ .../vi/fairness_aware/language_modeling.yml | 236 +++ .../vi/fairness_aware/question_answering.yml | 82 + .../vi/fairness_aware/sentiment_analysis.yml | 222 +++ .../vi/fairness_aware/text_classification.yml | 222 +++ .../vi/fairness_aware/toxicity_detection.yml | 222 +++ .../vi/few_shot/information_retrieval.yml | 164 ++ _data/leaderboard/vi/few_shot/knowledge.yml | 147 ++ .../vi/few_shot/language_modeling.yml | 236 +++ _data/leaderboard/vi/few_shot/reasoning.yml | 192 +++ .../vi/few_shot/sentiment_analysis.yml | 200 +++ .../vi/few_shot/text_classification.yml | 200 +++ .../vi/few_shot/toxicity_detection.yml | 200 +++ _data/leaderboard/vi/few_shot/translation.yml | 164 ++ .../vi/medium_prompt/question_answering.yml | 82 + .../vi/medium_prompt/summarization.yml | 274 ++++ _data/leaderboard/vi/models.yml | 11 + .../vi/randomized_choice/knowledge.yml | 100 ++ .../information_retrieval.yml | 146 ++ .../vi/robustness_aware/knowledge.yml | 147 ++ .../robustness_aware/question_answering.yml | 92 ++ .../robustness_aware/sentiment_analysis.yml | 200 +++ .../vi/robustness_aware/summarization.yml | 308 ++++ .../robustness_aware/text_classification.yml | 200 +++ .../robustness_aware/toxicity_detection.yml | 200 +++ .../vi/robustness_aware/translation.yml | 164 ++ .../vi/weaker_prompt/question_answering.yml | 82 + .../vi/weaker_prompt/summarization.yml | 274 ++++ .../vi/zero_shot/information_retrieval.yml | 146 ++ _data/leaderboard/vi/zero_shot/knowledge.yml | 131 ++ .../vi/zero_shot/language_modeling.yml | 236 +++ .../vi/zero_shot/question_answering.yml | 82 + _data/leaderboard/vi/zero_shot/reasoning.yml | 171 +++ .../vi/zero_shot/sentiment_analysis.yml | 200 +++ .../vi/zero_shot/summarization.yml | 291 ++++ .../vi/zero_shot/text_classification.yml | 200 +++ .../vi/zero_shot/toxicity_detection.yml | 178 +++ .../ind/bias-toxicity/question-answering.md | 217 ++- _pages/ind/bias-toxicity/summarization.md | 217 ++- _pages/ind/bias-toxicity/translation.md | 330 +--- _pages/ind/chain-of-thought/reasoning.md | 115 +- .../fairness-aware/information-retrieval.md | 167 +-- .../ind/fairness-aware/language-modeling.md | 242 ++- .../ind/fairness-aware/question-answering.md | 127 +- .../ind/fairness-aware/sentiment-analysis.md | 243 ++- .../ind/fairness-aware/text-classification.md | 223 +-- .../ind/fairness-aware/toxicity-detection.md | 243 ++- _pages/ind/few-shot/information-retrieval.md | 198 +-- _pages/ind/few-shot/knowledge.md | 234 +-- _pages/ind/few-shot/language-modeling.md | 242 ++- _pages/ind/few-shot/reasoning.md | 197 +-- _pages/ind/few-shot/sentiment-analysis.md | 232 ++- _pages/ind/few-shot/text-classification.md | 230 ++- _pages/ind/few-shot/toxicity-detection.md | 230 ++- _pages/ind/few-shot/translation.md | 178 +-- .../ind/medium-prompt/question-answering.md | 97 +- _pages/ind/medium-prompt/summarization.md | 269 ++-- _pages/ind/randomized-choice/knowledge.md | 160 +- .../robustness-aware/information-retrieval.md | 187 +-- _pages/ind/robustness-aware/knowledge.md 
| 232 +-- .../robustness-aware/question-answering.md | 118 +- .../robustness-aware/sentiment-analysis.md | 230 ++- _pages/ind/robustness-aware/summarization.md | 326 ++-- .../robustness-aware/text-classification.md | 230 ++- .../robustness-aware/toxicity-detection.md | 230 ++- _pages/ind/robustness-aware/translation.md | 198 +-- .../ind/weaker-prompt/question-answering.md | 113 +- _pages/ind/weaker-prompt/summarization.md | 269 ++-- _pages/ind/zero-shot/information-retrieval.md | 187 +-- _pages/ind/zero-shot/knowledge.md | 224 +-- _pages/ind/zero-shot/language-modeling.md | 247 ++- _pages/ind/zero-shot/question-answering.md | 127 +- _pages/ind/zero-shot/reasoning.md | 165 +- _pages/ind/zero-shot/sentiment-analysis.md | 223 +-- _pages/ind/zero-shot/summarization.md | 307 ++-- _pages/ind/zero-shot/text-classification.md | 217 ++- _pages/ind/zero-shot/toxicity-detection.md | 197 +-- _pages/kr/bias-toxicity/question-answering.md | 217 ++- _pages/kr/bias-toxicity/summarization.md | 217 ++- _pages/kr/bias-toxicity/translation.md | 330 +--- _pages/kr/chain-of-thought/reasoning.md | 115 +- .../fairness-aware/information-retrieval.md | 167 +-- _pages/kr/fairness-aware/language-modeling.md | 242 ++- .../kr/fairness-aware/question-answering.md | 127 +- .../kr/fairness-aware/sentiment-analysis.md | 243 ++- .../kr/fairness-aware/text-classification.md | 223 +-- .../kr/fairness-aware/toxicity-detection.md | 243 ++- _pages/kr/few-shot/information-retrieval.md | 198 +-- _pages/kr/few-shot/knowledge.md | 234 +-- _pages/kr/few-shot/language-modeling.md | 242 ++- _pages/kr/few-shot/reasoning.md | 197 +-- _pages/kr/few-shot/sentiment-analysis.md | 232 ++- _pages/kr/few-shot/text-classification.md | 230 ++- _pages/kr/few-shot/toxicity-detection.md | 230 ++- _pages/kr/few-shot/translation.md | 178 +-- _pages/kr/medium-prompt/question-answering.md | 97 +- _pages/kr/medium-prompt/summarization.md | 269 ++-- _pages/kr/randomized-choice/knowledge.md | 160 +- .../robustness-aware/information-retrieval.md | 187 +-- _pages/kr/robustness-aware/knowledge.md | 232 +-- .../kr/robustness-aware/question-answering.md | 118 +- .../kr/robustness-aware/sentiment-analysis.md | 230 ++- _pages/kr/robustness-aware/summarization.md | 326 ++-- .../robustness-aware/text-classification.md | 230 ++- .../kr/robustness-aware/toxicity-detection.md | 230 ++- _pages/kr/robustness-aware/translation.md | 198 +-- _pages/kr/weaker-prompt/question-answering.md | 113 +- _pages/kr/weaker-prompt/summarization.md | 269 ++-- _pages/kr/zero-shot/information-retrieval.md | 187 +-- _pages/kr/zero-shot/knowledge.md | 224 +-- _pages/kr/zero-shot/language-modeling.md | 247 ++- _pages/kr/zero-shot/question-answering.md | 127 +- _pages/kr/zero-shot/reasoning.md | 165 +- _pages/kr/zero-shot/sentiment-analysis.md | 223 +-- _pages/kr/zero-shot/summarization.md | 307 ++-- _pages/kr/zero-shot/text-classification.md | 217 ++- _pages/kr/zero-shot/toxicity-detection.md | 197 +-- _pages/leaderboard.md | 1 + _pages/vi/bias-toxicity/question-answering.md | 217 ++- _pages/vi/bias-toxicity/summarization.md | 217 ++- _pages/vi/bias-toxicity/translation.md | 330 +--- _pages/vi/chain-of-thought/reasoning.md | 115 +- .../fairness-aware/information-retrieval.md | 167 +-- _pages/vi/fairness-aware/language-modeling.md | 242 ++- .../vi/fairness-aware/question-answering.md | 127 +- .../vi/fairness-aware/sentiment-analysis.md | 243 ++- .../vi/fairness-aware/text-classification.md | 223 +-- .../vi/fairness-aware/toxicity-detection.md | 243 ++- 
_pages/vi/few-shot/information-retrieval.md | 198 +-- _pages/vi/few-shot/knowledge.md | 234 +-- _pages/vi/few-shot/language-modeling.md | 242 ++- _pages/vi/few-shot/reasoning.md | 197 +-- _pages/vi/few-shot/sentiment-analysis.md | 232 ++- _pages/vi/few-shot/text-classification.md | 230 ++- _pages/vi/few-shot/toxicity-detection.md | 230 ++- _pages/vi/few-shot/translation.md | 178 +-- _pages/vi/medium-prompt/question-answering.md | 97 +- _pages/vi/medium-prompt/summarization.md | 269 ++-- _pages/vi/randomized-choice/knowledge.md | 160 +- .../robustness-aware/information-retrieval.md | 187 +-- _pages/vi/robustness-aware/knowledge.md | 232 +-- .../vi/robustness-aware/question-answering.md | 118 +- .../vi/robustness-aware/sentiment-analysis.md | 230 ++- _pages/vi/robustness-aware/summarization.md | 326 ++-- .../robustness-aware/text-classification.md | 230 ++- .../vi/robustness-aware/toxicity-detection.md | 230 ++- _pages/vi/robustness-aware/translation.md | 198 +-- _pages/vi/weaker-prompt/question-answering.md | 113 +- _pages/vi/weaker-prompt/summarization.md | 269 ++-- _pages/vi/zero-shot/information-retrieval.md | 187 +-- _pages/vi/zero-shot/knowledge.md | 224 +-- _pages/vi/zero-shot/language-modeling.md | 247 ++- _pages/vi/zero-shot/question-answering.md | 127 +- _pages/vi/zero-shot/reasoning.md | 165 +- _pages/vi/zero-shot/sentiment-analysis.md | 223 +-- _pages/vi/zero-shot/summarization.md | 307 ++-- _pages/vi/zero-shot/text-classification.md | 217 ++- _pages/vi/zero-shot/toxicity-detection.md | 197 +-- _site/contact/index.html | 26 +- _site/demo/index.html | 26 +- _site/index.html | 42 +- .../ind/bias-toxicity/question-answering.html | 165 +- .../ind/bias-toxicity/summarization.html | 165 +- .../ind/bias-toxicity/translation.html | 278 +--- .../ind/chain-of-thought/reasoning.html | 91 +- .../fairness-aware/information-retrieval.html | 131 +- .../ind/fairness-aware/language-modeling.html | 182 +-- .../fairness-aware/question-answering.html | 111 +- .../fairness-aware/sentiment-analysis.html | 191 +-- .../fairness-aware/text-classification.html | 175 +-- .../fairness-aware/toxicity-detection.html | 191 +-- .../ind/few-shot/information-retrieval.html | 158 +- _site/leaderboard/ind/few-shot/knowledge.html | 150 +- .../ind/few-shot/language-modeling.html | 182 +-- _site/leaderboard/ind/few-shot/reasoning.html | 169 +-- .../ind/few-shot/sentiment-analysis.html | 178 +-- .../ind/few-shot/text-classification.html | 178 +-- .../ind/few-shot/toxicity-detection.html | 178 +-- .../leaderboard/ind/few-shot/translation.html | 142 +- .../ind/medium-prompt/question-answering.html | 81 +- .../ind/medium-prompt/summarization.html | 181 +-- .../ind/randomized-choice/knowledge.html | 112 +- .../information-retrieval.html | 147 +- .../ind/robustness-aware/knowledge.html | 148 +- .../robustness-aware/question-answering.html | 102 +- .../robustness-aware/sentiment-analysis.html | 178 +-- .../ind/robustness-aware/summarization.html | 238 +-- .../robustness-aware/text-classification.html | 178 +-- .../robustness-aware/toxicity-detection.html | 178 +-- .../ind/robustness-aware/translation.html | 158 +- .../ind/weaker-prompt/question-answering.html | 97 +- .../ind/weaker-prompt/summarization.html | 181 +-- .../ind/zero-shot/information-retrieval.html | 147 +- .../leaderboard/ind/zero-shot/knowledge.html | 140 +- .../ind/zero-shot/language-modeling.html | 183 +-- .../ind/zero-shot/question-answering.html | 111 +- .../leaderboard/ind/zero-shot/reasoning.html | 141 +- .../ind/zero-shot/sentiment-analysis.html | 175 
+-- .../ind/zero-shot/summarization.html | 219 +-- .../ind/zero-shot/text-classification.html | 165 +- .../ind/zero-shot/toxicity-detection.html | 149 +- _site/leaderboard/index.html | 247 +-- .../kr/bias-toxicity/question-answering.html | 165 +- .../kr/bias-toxicity/summarization.html | 165 +- .../kr/bias-toxicity/translation.html | 278 +--- .../kr/chain-of-thought/reasoning.html | 91 +- .../fairness-aware/information-retrieval.html | 131 +- .../kr/fairness-aware/language-modeling.html | 182 +-- .../kr/fairness-aware/question-answering.html | 111 +- .../kr/fairness-aware/sentiment-analysis.html | 191 +-- .../fairness-aware/text-classification.html | 175 +-- .../kr/fairness-aware/toxicity-detection.html | 191 +-- .../kr/few-shot/information-retrieval.html | 158 +- _site/leaderboard/kr/few-shot/knowledge.html | 150 +- .../kr/few-shot/language-modeling.html | 182 +-- _site/leaderboard/kr/few-shot/reasoning.html | 169 +-- .../kr/few-shot/sentiment-analysis.html | 178 +-- .../kr/few-shot/text-classification.html | 178 +-- .../kr/few-shot/toxicity-detection.html | 178 +-- .../leaderboard/kr/few-shot/translation.html | 142 +- .../kr/medium-prompt/question-answering.html | 81 +- .../kr/medium-prompt/summarization.html | 181 +-- .../kr/randomized-choice/knowledge.html | 112 +- .../information-retrieval.html | 147 +- .../kr/robustness-aware/knowledge.html | 148 +- .../robustness-aware/question-answering.html | 102 +- .../robustness-aware/sentiment-analysis.html | 178 +-- .../kr/robustness-aware/summarization.html | 238 +-- .../robustness-aware/text-classification.html | 178 +-- .../robustness-aware/toxicity-detection.html | 178 +-- .../kr/robustness-aware/translation.html | 158 +- .../kr/weaker-prompt/question-answering.html | 97 +- .../kr/weaker-prompt/summarization.html | 181 +-- .../kr/zero-shot/information-retrieval.html | 147 +- _site/leaderboard/kr/zero-shot/knowledge.html | 140 +- .../kr/zero-shot/language-modeling.html | 183 +-- .../kr/zero-shot/question-answering.html | 111 +- _site/leaderboard/kr/zero-shot/reasoning.html | 141 +- .../kr/zero-shot/sentiment-analysis.html | 175 +-- .../kr/zero-shot/summarization.html | 219 +-- .../kr/zero-shot/text-classification.html | 165 +- .../kr/zero-shot/toxicity-detection.html | 149 +- .../vi/bias-toxicity/question-answering.html | 895 +++++++++-- .../vi/bias-toxicity/summarization.html | 895 +++++++++-- .../vi/bias-toxicity/translation.html | 968 +++++++++--- .../vi/chain-of-thought/reasoning.html | 333 +++- .../fairness-aware/information-retrieval.html | 707 +++++++-- .../vi/fairness-aware/language-modeling.html | 998 ++++++++++-- .../vi/fairness-aware/question-answering.html | 475 ++++-- .../vi/fairness-aware/sentiment-analysis.html | 921 ++++++++++-- .../fairness-aware/text-classification.html | 865 +++++++++-- .../vi/fairness-aware/toxicity-detection.html | 921 ++++++++++-- .../vi/few-shot/information-retrieval.html | 766 ++++++++-- _site/leaderboard/vi/few-shot/knowledge.html | 795 ++++++++-- .../vi/few-shot/language-modeling.html | 998 ++++++++++-- _site/leaderboard/vi/few-shot/reasoning.html | 863 +++++++++-- .../vi/few-shot/sentiment-analysis.html | 908 +++++++++-- .../vi/few-shot/text-classification.html | 908 +++++++++-- .../vi/few-shot/toxicity-detection.html | 908 +++++++++-- .../leaderboard/vi/few-shot/translation.html | 714 +++++++-- .../vi/medium-prompt/question-answering.html | 421 +++++- .../vi/medium-prompt/summarization.html | 1277 ++++++++++++++-- .../vi/randomized-choice/knowledge.html | 476 +++++- .../information-retrieval.html 
| 755 ++++++++-- .../vi/robustness-aware/knowledge.html | 793 ++++++++-- .../robustness-aware/question-answering.html | 430 +++++- .../robustness-aware/sentiment-analysis.html | 908 +++++++++-- .../vi/robustness-aware/summarization.html | 1334 ++++++++++++++--- .../robustness-aware/text-classification.html | 908 +++++++++-- .../robustness-aware/toxicity-detection.html | 908 +++++++++-- .../vi/robustness-aware/translation.html | 766 ++++++++-- .../vi/weaker-prompt/question-answering.html | 461 +++++- .../vi/weaker-prompt/summarization.html | 1277 ++++++++++++++-- .../vi/zero-shot/information-retrieval.html | 755 ++++++++-- _site/leaderboard/vi/zero-shot/knowledge.html | 785 ++++++++-- .../vi/zero-shot/language-modeling.html | 1035 +++++++++++-- .../vi/zero-shot/question-answering.html | 475 ++++-- _site/leaderboard/vi/zero-shot/reasoning.html | 803 ++++++++-- .../vi/zero-shot/sentiment-analysis.html | 865 +++++++++-- .../vi/zero-shot/summarization.html | 1315 +++++++++++++--- .../vi/zero-shot/text-classification.html | 895 +++++++++-- .../vi/zero-shot/toxicity-detection.html | 847 +++++++++-- _site/prompt/index.html | 26 +- 290 files changed, 47731 insertions(+), 31121 deletions(-) create mode 100644 _data/leaderboard/vi/bias_toxicity/question_answering.yml create mode 100644 _data/leaderboard/vi/bias_toxicity/summarization.yml create mode 100644 _data/leaderboard/vi/bias_toxicity/translation.yml create mode 100644 _data/leaderboard/vi/chain_of_thought/reasoning.yml create mode 100644 _data/leaderboard/vi/fairness_aware/information_retrieval.yml create mode 100644 _data/leaderboard/vi/fairness_aware/language_modeling.yml create mode 100644 _data/leaderboard/vi/fairness_aware/question_answering.yml create mode 100644 _data/leaderboard/vi/fairness_aware/sentiment_analysis.yml create mode 100644 _data/leaderboard/vi/fairness_aware/text_classification.yml create mode 100644 _data/leaderboard/vi/fairness_aware/toxicity_detection.yml create mode 100644 _data/leaderboard/vi/few_shot/information_retrieval.yml create mode 100644 _data/leaderboard/vi/few_shot/knowledge.yml create mode 100644 _data/leaderboard/vi/few_shot/language_modeling.yml create mode 100644 _data/leaderboard/vi/few_shot/reasoning.yml create mode 100644 _data/leaderboard/vi/few_shot/sentiment_analysis.yml create mode 100644 _data/leaderboard/vi/few_shot/text_classification.yml create mode 100644 _data/leaderboard/vi/few_shot/toxicity_detection.yml create mode 100644 _data/leaderboard/vi/few_shot/translation.yml create mode 100644 _data/leaderboard/vi/medium_prompt/question_answering.yml create mode 100644 _data/leaderboard/vi/medium_prompt/summarization.yml create mode 100644 _data/leaderboard/vi/models.yml create mode 100644 _data/leaderboard/vi/randomized_choice/knowledge.yml create mode 100644 _data/leaderboard/vi/robustness_aware/information_retrieval.yml create mode 100644 _data/leaderboard/vi/robustness_aware/knowledge.yml create mode 100644 _data/leaderboard/vi/robustness_aware/question_answering.yml create mode 100644 _data/leaderboard/vi/robustness_aware/sentiment_analysis.yml create mode 100644 _data/leaderboard/vi/robustness_aware/summarization.yml create mode 100644 _data/leaderboard/vi/robustness_aware/text_classification.yml create mode 100644 _data/leaderboard/vi/robustness_aware/toxicity_detection.yml create mode 100644 _data/leaderboard/vi/robustness_aware/translation.yml create mode 100644 _data/leaderboard/vi/weaker_prompt/question_answering.yml create mode 100644 
_data/leaderboard/vi/weaker_prompt/summarization.yml create mode 100644 _data/leaderboard/vi/zero_shot/information_retrieval.yml create mode 100644 _data/leaderboard/vi/zero_shot/knowledge.yml create mode 100644 _data/leaderboard/vi/zero_shot/language_modeling.yml create mode 100644 _data/leaderboard/vi/zero_shot/question_answering.yml create mode 100644 _data/leaderboard/vi/zero_shot/reasoning.yml create mode 100644 _data/leaderboard/vi/zero_shot/sentiment_analysis.yml create mode 100644 _data/leaderboard/vi/zero_shot/summarization.yml create mode 100644 _data/leaderboard/vi/zero_shot/text_classification.yml create mode 100644 _data/leaderboard/vi/zero_shot/toxicity_detection.yml diff --git a/_config.yml b/_config.yml index 0bd5c0c..e97265b 100644 --- a/_config.yml +++ b/_config.yml @@ -2,7 +2,7 @@ title: MELT description: "Multilingual Evaluation Toolkits" # disabled because we are using a custom domain -baseurl: https://ai.stanford.edu/~sttruong/melt +# baseurl: https://ai.stanford.edu/~sttruong/melt color-primary: "#B1040E" color-light: "#E50808" diff --git a/_data/categories.yml b/_data/categories.yml index c28e596..771e26e 100644 --- a/_data/categories.yml +++ b/_data/categories.yml @@ -1,6 +1,7 @@ - zero-shot - few-shot - weaker-prompt +- medium-prompt - fairness-aware - robustness-aware - chain-of-thought diff --git a/_data/lang_tasks.yml b/_data/lang_tasks.yml index 889ad7b..bea5e77 100644 --- a/_data/lang_tasks.yml +++ b/_data/lang_tasks.yml @@ -3,6 +3,7 @@ vi: zero-shot: true few-shot: false weaker-prompt: true + medium-prompt: true fairness-aware: true robustness-aware: true chain-of-thought: false @@ -12,6 +13,7 @@ vi: zero-shot: true few-shot: false weaker-prompt: true + medium-prompt: true fairness-aware: false robustness-aware: true chain-of-thought: false diff --git a/_data/leaderboard/vi/bias_toxicity/question_answering.yml b/_data/leaderboard/vi/bias_toxicity/question_answering.yml new file mode 100644 index 0000000..103ac14 --- /dev/null +++ b/_data/leaderboard/vi/bias_toxicity/question_answering.yml @@ -0,0 +1,146 @@ +XQuAD: + URA-LLaMa 70B: + DRR: null + DRG: 0.39 + DRG_std: 0.01 + SAR: null + SAG: 0.41 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.39 + DRG_std: 0.01 + SAR: null + SAG: 0.45 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.43 + DRG_std: 0.01 + SAR: null + SAG: 0.48 + SAG_std: 0.00 + Tox: 0.03 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.35 + DRG_std: 0.03 + SAR: null + SAG: 0.46 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.46 + DRG_std: 0.01 + SAR: null + SAG: 0.42 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.50 + DRG_std: 0.00 + SAR: null + SAG: null + SAG_std: null + Tox: 0.04 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.43 + DRG_std: 0.01 + SAR: null + SAG: 0.48 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.40 + DRG_std: 0.01 + SAR: null + SAG: 0.45 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 +MLQA: + URA-LLaMa 70B: + DRR: null + DRG: 0.14 + DRG_std: 0.02 + SAR: null + SAG: 0.42 + SAG_std: 0.03 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.17 + DRG_std: 0.1 + SAR: null + SAG: 0.38 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.18 + DRG_std: 0.01 + SAR: null + SAG: 0.37 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.27 + DRG_std: 0.01 + SAR: null + SAG: 0.43 + SAG_std: 0.00 + Tox: 0.01 + 
Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.21 + DRG_std: 0.06 + SAR: null + SAG: 0.45 + SAG_std: 0.00 + Tox: 0.01 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.23 + DRG_std: 0.09 + SAR: null + SAG: 0.49 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.18 + DRG_std: 0.01 + SAR: null + SAG: 0.40 + SAG_std: 0.00 + Tox: 0.02 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.16 + DRG_std: 0.01 + SAR: null + SAG: 0.41 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/bias_toxicity/summarization.yml b/_data/leaderboard/vi/bias_toxicity/summarization.yml new file mode 100644 index 0000000..70e2f28 --- /dev/null +++ b/_data/leaderboard/vi/bias_toxicity/summarization.yml @@ -0,0 +1,146 @@ +VietNews: + URA-LLaMa 70B: + DRR: null + DRG: 0.21 + DRG_std: 0.01 + SAR: null + SAG: 0.31 + SAG_std: 0.01 + Tox: 0.05 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.20 + DRG_std: 0.01 + SAR: null + SAG: 0.29 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.24 + DRG_std: 0.02 + SAR: null + SAG: 0.33 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.26 + DRG_std: 0.01 + SAR: null + SAG: 0.38 + SAG_std: 0.01 + Tox: 0.01 + Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.28 + DRG_std: 0.02 + SAR: null + SAG: 0.39 + SAG_std: 0.01 + Tox: 0.01 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.21 + DRG_std: 0.02 + SAR: null + SAG: 0.32 + SAG_std: 0.02 + Tox: 0.04 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.22 + DRG_std: 0.01 + SAR: null + SAG: 0.29 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.19 + DRG_std: 0.01 + SAR: null + SAG: 0.28 + SAG_std: 0.01 + Tox: 0.06 + Tox_std: 0.00 +WikiLingua: + URA-LLaMa 70B: + DRR: null + DRG: 0.03 + DRG_std: 0.02 + SAR: null + SAG: 0.25 + SAG_std: 0.02 + Tox: 0.03 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.07 + DRG_std: 0.04 + SAR: null + SAG: 0.31 + SAG_std: 0.03 + Tox: 0.02 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.07 + DRG_std: 0.02 + SAR: null + SAG: 0.38 + SAG_std: 0.02 + Tox: 0.03 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.17 + DRG_std: 0.08 + SAR: null + SAG: 0.50 + SAG_std: 0.02 + Tox: 0.01 + Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.39 + DRG_std: 0.05 + SAR: null + SAG: 0.50 + SAG_std: 0.02 + Tox: 0.01 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.17 + DRG_std: 0.04 + SAR: null + SAG: 0.39 + SAG_std: 0.03 + Tox: 0.03 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.03 + DRG_std: 0.02 + SAR: null + SAG: 0.28 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.09 + DRG_std: 0.02 + SAR: null + SAG: 0.28 + SAG_std: 0.01 + Tox: 0.02 + Tox_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/bias_toxicity/translation.yml b/_data/leaderboard/vi/bias_toxicity/translation.yml new file mode 100644 index 0000000..62f6b38 --- /dev/null +++ b/_data/leaderboard/vi/bias_toxicity/translation.yml @@ -0,0 +1,146 @@ +PhoMT (En - Vi): + URA-LLaMa 70B: + DRR: null + DRG: 0.03 + DRG_std: 0.01 + SAR: null + SAG: 0.30 + SAG_std: 0.01 + Tox: 0.05 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.09 + DRG_std: 0.00 + SAR: null + SAG: 0.33 + SAG_std: 0.01 + Tox: 0.05 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.13 + DRG_std: 0.00 + SAR: null + SAG: 0.33 + SAG_std: 0.01 + Tox: 0.05 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.08 + DRG_std: 0.00 + SAR: null + SAG: 0.33 + SAG_std: 0.02 + Tox: 0.05 + Tox_std: 0.00 + 
LLaMa-2 7B: + DRR: null + DRG: 0.17 + DRG_std: 0.01 + SAR: null + SAG: 0.29 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.18 + DRG_std: 0.01 + SAR: null + SAG: 0.36 + SAG_std: 0.01 + Tox: 0.04 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.11 + DRG_std: 0.01 + SAR: null + SAG: 0.34 + SAG_std: 0.01 + Tox: 0.05 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.09 + DRG_std: 0.01 + SAR: null + SAG: 0.34 + SAG_std: 0.01 + Tox: 0.05 + Tox_std: 0.00 +OPUS100 (En - Vi): + URA-LLaMa 70B: + DRR: null + DRG: 0.27 + DRG_std: 0.01 + SAR: null + SAG: 0.47 + SAG_std: 0.01 + Tox: 0.06 + Tox_std: 0.00 + URA-LLaMa 13B: + DRR: null + DRG: 0.27 + DRG_std: 0.01 + SAR: null + SAG: 0.43 + SAG_std: 0.02 + Tox: 0.07 + Tox_std: 0.00 + URA-LLaMa 7B: + DRR: null + DRG: 0.18 + DRG_std: 0.03 + SAR: null + SAG: 0.47 + SAG_std: 0.01 + Tox: 0.07 + Tox_std: 0.00 + LLaMa-2 13B: + DRR: null + DRG: 0.31 + DRG_std: 0.02 + SAR: null + SAG: 0.47 + SAG_std: 0.01 + Tox: 0.06 + Tox_std: 0.00 + LLaMa-2 7B: + DRR: null + DRG: 0.21 + DRG_std: 0.02 + SAR: null + SAG: 0.45 + SAG_std: 0.02 + Tox: 0.05 + Tox_std: 0.00 + Vietcuna 7B: + DRR: null + DRG: 0.16 + DRG_std: 0.03 + SAR: null + SAG: 0.43 + SAG_std: 0.02 + Tox: 0.07 + Tox_std: 0.00 + GPT-3.5: + DRR: null + DRG: 0.16 + DRG_std: 0.03 + SAR: null + SAG: 0.43 + SAG_std: 0.03 + Tox: 0.07 + Tox_std: 0.00 + GPT-4: + DRR: null + DRG: 0.14 + DRG_std: 0.03 + SAR: null + SAG: 0.41 + SAG_std: 0.01 + Tox: 0.07 + Tox_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/chain_of_thought/reasoning.yml b/_data/leaderboard/vi/chain_of_thought/reasoning.yml new file mode 100644 index 0000000..f236873 --- /dev/null +++ b/_data/leaderboard/vi/chain_of_thought/reasoning.yml @@ -0,0 +1,64 @@ +MATH: + URA-LLaMa 70B: + EM: 0.00 + EM_std: 0.00 + F1: 0.12 + F1_std: 0.01 + Equ: 0.18 + Equ_std: 0.02 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.23 + F1_std: 0.01 + Equ: 0.17 + Equ_std: 0.01 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.23 + F1_std: 0.01 + Equ: 0.09 + Equ_std: 0.01 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.12 + F1_std: 0.01 + Equ: 0.18 + Equ_std: 0.02 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.10 + F1_std: 0.00 + Equ: 0.12 + Equ_std: 0.02 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.13 + F1_std: 0.01 + Equ: 0.10 + Equ_std: 0.01 + MixSUra: + EM: 0.00 + EM_std: 0.00 + F1: 0.17 + F1_std: 0.01 + Equ: 0.33 + Equ_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.32 + F1_std: 0.01 + Equ: 0.78 + Equ_std: 0.02 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.32 + F1_std: 0.01 + Equ: 0.79 + Equ_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/fairness_aware/information_retrieval.yml b/_data/leaderboard/vi/fairness_aware/information_retrieval.yml new file mode 100644 index 0000000..e7043fe --- /dev/null +++ b/_data/leaderboard/vi/fairness_aware/information_retrieval.yml @@ -0,0 +1,146 @@ +mMARCO: + URA-LLaMa 70B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 7B: + M@10: 0.10 + M@10_std: 0.00 + M@10B: 0.10 + M@10B_std: 0.00 + N@10: 0.14 + N@10_std: 0.00 + N@10B: 0.14 + N@10B_std: 0.00 + LLaMa-2 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + LLaMa-2 7B: + M@10: 0.05 + M@10_std: 0.00 + M@10B: 0.10 + 
M@10B_std: 0.00 + N@10: 0.07 + N@10_std: 0.00 + N@10B: 0.16 + N@10B_std: 0.00 + Vietcuna 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-3.5: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-4: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null +mRobust04: + URA-LLaMa 70B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 7B: + M@10: 0.01 + M@10_std: 0.00 + M@10B: 0.01 + M@10B_std: 0.00 + N@10: 0.00 + N@10_std: 0.00 + N@10B: 0.00 + N@10B_std: 0.00 + LLaMa-2 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + LLaMa-2 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + Vietcuna 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-3.5: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-4: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/fairness_aware/language_modeling.yml b/_data/leaderboard/vi/fairness_aware/language_modeling.yml new file mode 100644 index 0000000..69ccccb --- /dev/null +++ b/_data/leaderboard/vi/fairness_aware/language_modeling.yml @@ -0,0 +1,236 @@ +MLQA-MLM: + URA-LLaMa 70B: + EM: 0.01 + EM_std: 0.00 + CER: 0.58 + CER_std: 0.01 + WER: 0.70 + WER_std: 0.01 + CED: 653.57 + CED_std: 12.05 + WED: 150.64 + WED_std: 2.73 + PLX: 1.25 + PLX_std: 0.06 + URA-LLaMa 13B: + EM: 0.02 + EM_std: 0.00 + CER: 0.40 + CER_std: 0.01 + WER: 0.56 + WER_std: 0.01 + CED: 518.38 + CED_std: 11.19 + WED: 125.24 + WED_std: 2.66 + PLX: 1.48 + PLX_std: 0.11 + URA-LLaMa 7B: + EM: 0.01 + EM_std: 0.00 + CER: 0.40 + CER_std: 0.01 + WER: 0.55 + WER_std: 0.01 + CED: 492.93 + CED_std: 11.32 + WED: 117.82 + WED_std: 2.72 + PLX: 1.22 + PLX_std: 0.01 + LLaMa-2 13B: + EM: 0.01 + EM_std: 0.00 + CER: 0.76 + CER_std: 0.00 + WER: 0.89 + WER_std: 0.00 + CED: 782.03 + CED_std: 11.71 + WED: 192.66 + WED_std: 2.83 + PLX: 1.27 + PLX_std: 0.04 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + CER: 0.79 + CER_std: 0.00 + WER: 0.96 + WER_std: 0.00 + CED: 761.38 + CED_std: 10.65 + WED: 197.18 + WED_std: 2.66 + PLX: 1.75 + PLX_std: 0.20 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + CER: 1.04 + CER_std: 0.00 + WER: 1.06 + WER_std: 0.00 + CED: 940.71 + CED_std: 12.48 + WED: 208.05 + WED_std: 2.81 + PLX: 1.40 + PLX_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + CER: 0.56 + CER_std: null + WER: 0.63 + WER_std: null + CED: 535.76 + CED_std: null + WED: 133.64 + WED_std: null + PLX: 1.00 + PLX_std: null + GPT-3.5: + EM: 0.03 + EM_std: 0.00 + CER: 0.29 + CER_std: 0.01 + WER: 0.46 + WER_std: 0.01 + CED: 398.19 + CED_std: 11.01 + WED: 96.42 + WED_std: 2.54 + PLX: null + PLX_std: null + GPT-4: + EM: 0.06 + EM_std: 0.00 + CER: 0.36 + CER_std: 0.01 + WER: 0.41 + WER_std: 0.01 + CED: 347.82 + CED_std: 10.23 + WED: 86.96 + WED_std: 2.41 + PLX: 
null + PLX_std: null +VSEC: + URA-LLaMa 70B: + EM: 0.30 + EM_std: 0.00 + CER: 0.11 + CER_std: 0.00 + WER: 0.14 + WER_std: 0.00 + CED: 15.19 + CED_std: 0.42 + WED: 4.12 + WED_std: 0.11 + PLX: 1.13 + PLX_std: 0.00 + URA-LLaMa 13B: + EM: 0.32 + EM_std: 0.00 + CER: 0.07 + CER_std: 0.00 + WER: 0.21 + WER_std: 0.00 + CED: 2.98 + CED_std: 0.11 + WED: 1.24 + WED_std: 0.03 + PLX: 1.15 + PLX_std: 0.00 + URA-LLaMa 7B: + EM: 0.20 + EM_std: 0.00 + CER: 0.54 + CER_std: 0.01 + WER: 0.67 + WER_std: 0.01 + CED: 41.77 + CED_std: 1.57 + WED: 10.12 + WED_std: 0.35 + PLX: 1.07 + PLX_std: 0.00 + LLaMa-2 13B: + EM: 0.15 + EM_std: 0.00 + CER: 0.07 + CER_std: 0.00 + WER: 0.22 + WER_std: 0.00 + CED: 3.39 + CED_std: 0.16 + WED: 1.52 + WED_std: 0.04 + PLX: 1.01 + PLX_std: 0.00 + LLaMa-2 7B: + EM: 0.12 + EM_std: 0.00 + CER: 0.35 + CER_std: 0.01 + WER: 0.48 + WER_std: 0.01 + CED: 47.54 + CED_std: 0.85 + WED: 11.82 + WED_std: 0.19 + PLX: 1.06 + PLX_std: 0.00 + Vietcuna 7B: + EM: 0.06 + EM_std: 0.00 + CER: 4.78 + CER_std: 0.06 + WER: 4.80 + WER_std: 0.06 + CED: 634.48 + CED_std: 8.58 + WED: 145.12 + WED_std: 1.94 + PLX: 1.46 + PLX_std: 0.01 + MixSUra: + EM: 0.07 + EM_std: null + CER: 0.20 + CER_std: null + WER: 0.29 + WER_std: null + CED: 25.96 + CED_std: null + WED: 8.79 + WED_std: null + PLX: 1.00 + PLX_std: null + GPT-3.5: + EM: 0.59 + EM_std: 0.00 + CER: 0.06 + CER_std: 0.00 + WER: 0.19 + WER_std: 0.00 + CED: 1.99 + CED_std: 0.08 + WED: 0.74 + WED_std: 0.02 + PLX: null + PLX_std: null + GPT-4: + EM: 0.67 + EM_std: 0.00 + CER: 0.01 + CER_std: 0.00 + WER: 0.02 + WER_std: 0.00 + CED: 1.30 + CED_std: 0.04 + WED: 0.54 + WED_std: 0.01 + PLX: null + PLX_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/fairness_aware/question_answering.yml b/_data/leaderboard/vi/fairness_aware/question_answering.yml new file mode 100644 index 0000000..32056c4 --- /dev/null +++ b/_data/leaderboard/vi/fairness_aware/question_answering.yml @@ -0,0 +1,82 @@ +XQuAD: + URA-LLaMa 70B: + EM: 0.04 + EM_std: 0.00 + F1: 0.27 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.13 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.13 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.03 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.04 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.26 + F1_std: 0.00 +MLQA: + URA-LLaMa 70B: + EM: 0.03 + EM_std: 0.00 + F1: 0.25 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.14 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.15 + F1_std: 0.01 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.04 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.05 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.23 + F1_std: 0.00 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/fairness_aware/sentiment_analysis.yml b/_data/leaderboard/vi/fairness_aware/sentiment_analysis.yml new file mode 100644 index 0000000..354e1d2 --- /dev/null +++ b/_data/leaderboard/vi/fairness_aware/sentiment_analysis.yml @@ -0,0 +1,222 @@ +VLSP 2016: + URA-LLaMa 70B: + AC: 0.65 + AC_std: 0.01 + F1: 0.49 + F1_std: 0.01 + AR: 0.58 + AR_std: 0.01 + ECE: 0.13 + ECE_std: 0.01 + A@10: 0.77 + A@10_std: 0.04 + URA-LLaMa 13B: + AC: 0.59 + AC_std: 0.01 + F1: 0.57 
+ F1_std: 0.01 + AR: 0.62 + AR_std: 0.01 + ECE: 0.07 + ECE_std: 0.01 + A@10: 0.83 + A@10_std: 0.04 + URA-LLaMa 7B: + AC: 0.74 + AC_std: 0.02 + F1: 0.39 + F1_std: 0.06 + AR: 0.83 + AR_std: 0.01 + ECE: 0.21 + ECE_std: 0.02 + A@10: 0.98 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.51 + AC_std: 0.01 + F1: 0.1 + F1_std: 0.06 + AR: 0.56 + AR_std: 0.01 + ECE: 0.32 + ECE_std: 0.02 + A@10: 0.79 + A@10_std: 0.04 + LLaMa-2 7B: + AC: 0.45 + AC_std: 0.02 + F1: 0.34 + F1_std: 0.01 + AR: 0.53 + AR_std: 0.01 + ECE: 0.26 + ECE_std: 0.02 + A@10: 0.50 + A@10_std: 0.0 + Vietcuna 7B: + AC: 0.04 + AC_std: 0.01 + F1: 0.04 + F1_std: 0.01 + AR: 0.49 + AR_std: 0.01 + ECE: 0.71 + ECE_std: 0.01 + A@10: 0.05 + A@10_std: 0.02 + MixSUra 8x7B: + AC: 0.62 + AC_std: null + F1: 0.62 + F1_std: null + AR: 0.59 + AR_std: null + ECE: 0.30 + ECE_std: null + A@10: 0.59 + A@10_std: null + Gemini Pro: + AC: 0.67 + AC_std: null + F1: 0.50 + F1_std: null + AR: null + AR_std: null + ECE: 0.34 + ECE_std: null + A@10: 0.59 + A@10_std: null + GPT-3.5: + AC: 0.66 + AC_std: 0.01 + F1: 0.60 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.33 + ECE_std: 0.01 + A@10: 0.52 + A@10_std: 0.05 + GPT-4: + AC: 0.75 + AC_std: 0.01 + F1: 0.74 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.41 + ECE_std: 0.00 + A@10: 0.73 + A@10_std: 0.04 +UiT-VSFC: + URA-LLaMa 70B: + AC: 0.76 + AC_std: 0.01 + F1: 0.48 + F1_std: 0.01 + AR: 0.61 + AR_std: 0.01 + ECE: 0.17 + ECE_std: 0.01 + A@10: 0.66 + A@10_std: 0.03 + URA-LLaMa 13B: + AC: 0.75 + AC_std: 0.01 + F1: 0.46 + F1_std: 0.08 + AR: 0.83 + AR_std: 0.01 + ECE: 0.11 + ECE_std: 0.01 + A@10: 0.88 + A@10_std: 0.02 + URA-LLaMa 7B: + AC: 0.73 + AC_std: 0.01 + F1: 0.73 + F1_std: 0.01 + AR: 0.78 + AR_std: 0.01 + ECE: 0.13 + ECE_std: 0.01 + A@10: 0.94 + A@10_std: 0.01 + LLaMa-2 13B: + AC: 0.63 + AC_std: 0.01 + F1: 0.41 + F1_std: 0.02 + AR: 0.70 + AR_std: 0.01 + ECE: 0.13 + ECE_std: 0.01 + A@10: 0.89 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.51 + AC_std: 0.01 + F1: 0.55 + F1_std: 0.01 + AR: 0.68 + AR_std: 0.01 + ECE: 0.22 + ECE_std: 0.01 + A@10: 0.64 + A@10_std: 0.03 + Vietcuna 7B: + AC: 0.03 + AC_std: 0.00 + F1: 0.03 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.01 + ECE: 0.50 + ECE_std: 0.00 + A@10: 0.01 + A@10_std: 0.01 + MixSUra 8x7B: + AC: 0.74 + AC_std: null + F1: 0.46 + F1_std: null + AR: 0.61 + AR_std: null + ECE: 0.24 + ECE_std: null + A@10: 0.66 + A@10_std: null + Gemini Pro: + AC: 0.79 + AC_std: null + F1: 0.50 + F1_std: null + AR: null + AR_std: null + ECE: 0.46 + ECE_std: null + A@10: 0.82 + A@10_std: null + GPT-3.5: + AC: 0.86 + AC_std: 0.01 + F1: 0.71 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.86 + A@10_std: 0.02 + GPT-4: + AC: 0.85 + AC_std: 0.01 + F1: 0.71 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.87 + A@10_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/fairness_aware/text_classification.yml b/_data/leaderboard/vi/fairness_aware/text_classification.yml new file mode 100644 index 0000000..4eb1fcb --- /dev/null +++ b/_data/leaderboard/vi/fairness_aware/text_classification.yml @@ -0,0 +1,222 @@ +UiT-VSMEC: + URA-LLaMa 70B: + AC: 0.24 + AC_std: 0.02 + F1: 0.14 + F1_std: 0.01 + AR: 0.58 + AR_std: 0.01 + ECE: 0.26 + ECE_std: 0.02 + A@10: 0.37 + A@10_std: 0.06 + URA-LLaMa 13B: + AC: 0.31 + AC_std: 0.02 + F1: 0.11 + F1_std: 0.01 + AR: 0.58 + AR_std: 0.01 + ECE: 0.23 + ECE_std: 0.02 + A@10: 0.57 + A@10_std: 0.06 + URA-LLaMa 7B: + AC: 0.29 + AC_std: 0.02 + F1: 0.11 + F1_std: 0.01 + AR: 0.60 + AR_std: 0.01 + ECE: 0.12 + ECE_std: 0.02 
+ A@10: 0.41 + A@10_std: 0.06 + LLaMa-2 13B: + AC: 0.18 + AC_std: 0.02 + F1: 0.08 + F1_std: 0.01 + AR: 0.55 + AR_std: 0.01 + ECE: 0.45 + ECE_std: 0.01 + A@10: 0.44 + A@10_std: 0.06 + LLaMa-2 7B: + AC: 0.25 + AC_std: 0.02 + F1: 0.11 + F1_std: 0.01 + AR: 0.57 + AR_std: 0.01 + ECE: 0.22 + ECE_std: 0.02 + A@10: 0.53 + A@10_std: 0.06 + Vietcuna 7B: + AC: 0.15 + AC_std: 0.01 + F1: 0.05 + F1_std: 0.01 + AR: 0.46 + AR_std: 0.01 + ECE: 0.85 + ECE_std: 0.01 + A@10: 0.16 + A@10_std: 0.04 + MixSUra: + AC: 0.40 + AC_std: null + F1: 0.36 + F1_std: null + AR: 0.72 + AR_std: null + ECE: 0.53 + ECE_std: null + A@10: 0.79 + A@10_std: null + Gemini Pro: + AC: 0.48 + AC_std: null + F1: 0.38 + F1_std: null + AR: null + AR_std: null + ECE: 0.34 + ECE_std: null + A@10: 0.43 + A@10_std: null + GPT-3.5: + AC: 0.44 + AC_std: 0.02 + F1: 0.42 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.30 + ECE_std: 0.02 + A@10: 0.36 + A@10_std: 0.06 + GPT-4: + AC: 0.49 + AC_std: 0.02 + F1: 0.47 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.35 + ECE_std: 0.02 + A@10: 0.36 + A@10_std: 0.06 +PhoATIS: + URA-LLaMa 70B: + AC: 0.15 + AC_std: 0.01 + F1: 0.22 + F1_std: 0.03 + AR: 0.31 + AR_std: 0.00 + ECE: 0.81 + ECE_std: 0.01 + A@10: 0.13 + A@10_std: 0.04 + URA-LLaMa 13B: + AC: 0.01 + AC_std: 0.01 + F1: 0.05 + F1_std: 0.02 + AR: 0.58 + AR_std: 0.00 + ECE: 0.84 + ECE_std: 0.01 + A@10: 0.00 + A@10_std: 0.01 + URA-LLaMa 7B: + AC: 0.00 + AC_std: 0.01 + F1: 0.00 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.00 + ECE: 0.30 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.03 + LLaMa-2 13B: + AC: 0.02 + AC_std: 0.01 + F1: 0.01 + F1_std: 0.02 + AR: 0.57 + AR_std: 0.01 + ECE: 0.90 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.01 + LLaMa-2 7B: + AC: 0.02 + AC_std: 0.00 + F1: 0.06 + F1_std: 0.01 + AR: 0.57 + AR_std: 0.01 + ECE: 0.68 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.01 + Vietcuna 7B: + AC: 0.04 + AC_std: 0.01 + F1: 0.01 + F1_std: 0.00 + AR: 0.77 + AR_std: 0.01 + ECE: 0.21 + ECE_std: 0.01 + A@10: 0.07 + A@10_std: 0.03 + MixSUra: + AC: 0.81 + AC_std: null + F1: 0.58 + F1_std: null + AR: 0.96 + AR_std: null + ECE: 0.14 + ECE_std: null + A@10: 0.91 + A@10_std: null + Gemini Pro: + AC: 0.79 + AC_std: null + F1: 0.67 + F1_std: null + AR: null + AR_std: null + ECE: 0.73 + ECE_std: null + A@10: 0.68 + A@10_std: null + GPT-3.5: + AC: 0.68 + AC_std: 0.02 + F1: 0.66 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.62 + ECE_std: 0.02 + A@10: 0.67 + A@10_std: 0.05 + GPT-4: + AC: 0.83 + AC_std: 0.01 + F1: 0.76 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.77 + ECE_std: 0.01 + A@10: 0.87 + A@10_std: 0.04 \ No newline at end of file diff --git a/_data/leaderboard/vi/fairness_aware/toxicity_detection.yml b/_data/leaderboard/vi/fairness_aware/toxicity_detection.yml new file mode 100644 index 0000000..5e80dd7 --- /dev/null +++ b/_data/leaderboard/vi/fairness_aware/toxicity_detection.yml @@ -0,0 +1,222 @@ +UiT-ViCTSD: + URA-LLaMa 70B: + AC: 0.41 + AC_std: 0.02 + F1: 0.26 + F1_std: 0.01 + AR: 0.75 + AR_std: 0.01 + ECE: 0.53 + ECE_std: 0.01 + A@10: 0.33 + A@10_std: 0.05 + URA-LLaMa 13B: + AC: 0.43 + AC_std: 0.02 + F1: 0.29 + F1_std: 0.07 + AR: 0.66 + AR_std: 0.01 + ECE: 0.36 + ECE_std: 0.02 + A@10: 0.42 + A@10_std: 0.05 + URA-LLaMa 7B: + AC: 0.42 + AC_std: 0.02 + F1: 0.39 + F1_std: 0.01 + AR: 0.60 + AR_std: 0.01 + ECE: 0.30 + ECE_std: 0.01 + A@10: 0.66 + A@10_std: 0.05 + LLaMa-2 13B: + AC: 0.27 + AC_std: 0.01 + F1: 0.18 + F1_std: 0.01 + AR: 0.67 + AR_std: 0.01 + ECE: 0.53 + ECE_std: 0.01 + A@10: 0.57 + A@10_std: 0.05 + LLaMa-2 7B: + AC: 0.15 + AC_std: 0.01 + F1: 0.11 
+ F1_std: 0.01 + AR: 0.62 + AR_std: 0.01 + ECE: 0.67 + ECE_std: 0.01 + A@10: 0.07 + A@10_std: 0.03 + Vietcuna 7B: + AC: 0.08 + AC_std: 0.01 + F1: 0.09 + F1_std: 0.01 + AR: 0.50 + AR_std: 0.01 + ECE: 0.42 + ECE_std: 0.01 + A@10: 0.06 + A@10_std: 0.03 + MixSUra: + AC: 0.69 + AC_std: null + F1: 0.38 + F1_std: null + AR: null + AR_std: null + ECE: 0.29 + ECE_std: null + A@10: 0.78 + A@10_std: null + Gemini Pro: + AC: 0.81 + AC_std: null + F1: 0.43 + F1_std: null + AR: null + AR_std: null + ECE: 0.31 + ECE_std: null + A@10: 0.82 + A@10_std: null + GPT-3.5: + AC: 0.60 + AC_std: 0.02 + F1: 0.52 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.11 + ECE_std: 0.02 + A@10: 0.63 + A@10_std: 0.05 + GPT-4: + AC: 0.87 + AC_std: 0.01 + F1: 0.69 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.37 + ECE_std: 0.01 + A@10: 0.86 + A@10_std: 0.03 +UiT-ViHSD: + URA-LLaMa 70B: + AC: 0.15 + AC_std: 0.00 + F1: 0.40 + F1_std: 0.00 + AR: 0.64 + AR_std: 0.01 + ECE: 0.58 + ECE_std: 0.00 + A@10: 0.24 + A@10_std: 0.02 + URA-LLaMa 13B: + AC: 0.24 + AC_std: 0.01 + F1: 0.15 + F1_std: 0.00 + AR: 0.61 + AR_std: 0.01 + ECE: 0.43 + ECE_std: 0.01 + A@10: 0.21 + A@10_std: 0.02 + URA-LLaMa 7B: + AC: 0.16 + AC_std: 0.00 + F1: 0.10 + F1_std: 0.00 + AR: 0.67 + AR_std: 0.01 + ECE: 0.33 + ECE_std: 0.00 + A@10: 0.28 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.16 + AC_std: 0.00 + F1: 0.10 + F1_std: 0.00 + AR: 0.62 + AR_std: 0.01 + ECE: 0.59 + ECE_std: 0.00 + A@10: 0.42 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.01 + AC_std: 0.00 + F1: 0.01 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.01 + ECE: 0.71 + ECE_std: 0.00 + A@10: 0.01 + A@10_std: 0.00 + Vietcuna 7B: + AC: 0.62 + AC_std: 0.01 + F1: 0.21 + F1_std: 0.00 + AR: 0.50 + AR_std: 0.00 + ECE: 0.29 + ECE_std: 0.01 + A@10: 0.62 + A@10_std: 0.02 + MixSUra: + AC: 0.56 + AC_std: null + F1: 0.31 + F1_std: null + AR: 0.68 + AR_std: null + ECE: 0.32 + ECE_std: null + A@10: 0.92 + A@10_std: null + Gemini Pro: + AC: 0.70 + AC_std: null + F1: 0.37 + F1_std: null + AR: null + AR_std: null + ECE: 0.36 + ECE_std: null + A@10: 0.69 + A@10_std: null + GPT-3.5: + AC: 0.61 + AC_std: 0.01 + F1: 0.46 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.29 + ECE_std: 0.01 + A@10: 0.62 + A@10_std: 0.02 + GPT-4: + AC: 0.76 + AC_std: 0.01 + F1: 0.56 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.43 + ECE_std: 0.01 + A@10: 0.76 + A@10_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/information_retrieval.yml b/_data/leaderboard/vi/few_shot/information_retrieval.yml new file mode 100644 index 0000000..4bccc5c --- /dev/null +++ b/_data/leaderboard/vi/few_shot/information_retrieval.yml @@ -0,0 +1,164 @@ +mMARCO: + URA-LLaMa 70B: + M@10: 0.05 + M@10_std: 0.00 + M@10B: 0.11 + M@10B_std: 0.00 + N@10: 0.06 + N@10_std: 0.00 + N@10B: 0.14 + N@10B_std: 0.00 + URA-LLaMa 13B: + M@10: 0.04 + M@10_std: 0.00 + M@10B: 0.10 + M@10B_std: 0.00 + N@10: 0.06 + N@10_std: 0.00 + N@10B: 0.14 + N@10B_std: 0.00 + URA-LLaMa 7B: + M@10: 0.04 + M@10_std: 0.00 + M@10B: 0.11 + M@10B_std: 0.00 + N@10: 0.06 + N@10_std: 0.00 + N@10B: 0.16 + N@10B_std: 0.00 + LLaMa-2 13B: + M@10: 0.07 + M@10_std: 0.00 + M@10B: 0.15 + M@10B_std: 0.00 + N@10: 0.09 + N@10_std: 0.00 + N@10B: 0.21 + N@10B_std: 0.00 + LLaMa-2 7B: + M@10: 0.05 + M@10_std: 0.00 + M@10B: 0.11 + M@10B_std: 0.00 + N@10: 0.07 + N@10_std: 0.00 + N@10B: 0.16 + N@10B_std: 0.00 + Vietcuna 7B: + M@10: 0.00 + M@10_std: 0.00 + M@10B: 0.00 + M@10B_std: 0.00 + N@10: 0.00 + N@10_std: 0.00 + N@10B: 0.00 + N@10B_std: 0.00 + MixSUra: + M@10: 0.01 + M@10_std: null + M@10B: 0.07 + 
M@10B_std: null + N@10: 0.04 + N@10_std: null + N@10B: 0.11 + N@10B_std: null + GPT-3.5: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-4: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null +mRobust04: + URA-LLaMa 70B: + M@10: 0.04 + M@10_std: 0.00 + M@10B: 0.04 + M@10B_std: 0.00 + N@10: 0.03 + N@10_std: 0.00 + N@10B: 0.04 + N@10B_std: 0.00 + URA-LLaMa 13B: + M@10: 0.03 + M@10_std: 0.00 + M@10B: 0.05 + M@10B_std: 0.00 + N@10: 0.04 + N@10_std: 0.00 + N@10B: 0.04 + N@10B_std: 0.00 + URA-LLaMa 7B: + M@10: 0.03 + M@10_std: 0.00 + M@10B: 0.03 + M@10B_std: 0.00 + N@10: 0.02 + N@10_std: 0.00 + N@10B: 0.02 + N@10B_std: 0.00 + LLaMa-2 13B: + M@10: 0.05 + M@10_std: 0.00 + M@10B: 0.04 + M@10B_std: 0.00 + N@10: 0.04 + N@10_std: 0.00 + N@10B: 0.04 + N@10B_std: 0.00 + LLaMa-2 7B: + M@10: 0.02 + M@10_std: 0.00 + M@10B: 0.03 + M@10B_std: 0.00 + N@10: 0.03 + N@10_std: 0.00 + N@10B: 0.02 + N@10B_std: 0.00 + Vietcuna 7B: + M@10: 0.00 + M@10_std: 0.00 + M@10B: 0.00 + M@10B_std: 0.00 + N@10: 0.00 + N@10_std: 0.00 + N@10B: 0.00 + N@10B_std: 0.00 + MixSUra: + M@10: 0.04 + M@10_std: null + M@10B: 0.04 + M@10B_std: null + N@10: 0.02 + N@10_std: null + N@10B: 0.02 + N@10B_std: null + GPT-3.5: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-4: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/knowledge.yml b/_data/leaderboard/vi/few_shot/knowledge.yml new file mode 100644 index 0000000..8acbdc3 --- /dev/null +++ b/_data/leaderboard/vi/few_shot/knowledge.yml @@ -0,0 +1,147 @@ +ZaloE2E: + num_fields: 2 + URA-LLaMa 70B: + EM: 0.34 + EM_std: 0.02 + F1: 0.50 + F1_std: 0.02 + URA-LLaMa 13B: + EM: 0.26 + EM_std: 0.02 + F1: 0.40 + F1_std: 0.02 + URA-LLaMa 7B: + EM: 0.14 + EM_std: 0.02 + F1: 0.25 + F1_std: 0.02 + LLaMa-2 13B: + EM: 0.22 + EM_std: 0.02 + F1: 0.36 + F1_std: 0.02 + LLaMa-2 7B: + EM: 0.07 + EM_std: 0.01 + F1: 0.15 + F1_std: 0.01 + Vietcuna 7B: + EM: 0.07 + EM_std: 0.01 + F1: 0.19 + F1_std: 0.01 + MixSUra: + EM: 0.19 + EM_std: null + F1: 0.34 + F1_std: null + GPT-3.5: + EM: 0.49 + EM_std: 0.02 + F1: 0.64 + F1_std: 0.02 + GPT-4: + EM: 0.49 + EM_std: 0.02 + F1: 0.64 + F1_std: 0.02 +ViMMRC: + URA-LLaMa 70B: + AC: 0.78 + AC_std: 0.02 + F1: 0.63 + F1_std: 0.03 + AR: 0.90 + AR_std: 0.01 + ECE: 0.13 + ECE_std: 0.02 + A@10: 0.96 + A@10_std: 0.03 + URA-LLaMa 13B: + AC: 0.62 + AC_std: 0.02 + F1: 0.50 + F1_std: 0.02 + AR: 0.69 + AR_std: 0.02 + ECE: 0.18 + ECE_std: 0.02 + A@10: 0.65 + A@10_std: 0.07 + URA-LLaMa 7B: + AC: 0.42 + AC_std: 0.02 + F1: 0.33 + F1_std: 0.02 + AR: 0.61 + AR_std: 0.02 + ECE: 0.13 + ECE_std: 0.02 + A@10: 0.39 + A@10_std: 0.07 + LLaMa-2 13B: + AC: 0.58 + AC_std: 0.02 + F1: 0.46 + F1_std: 0.02 + AR: 0.62 + AR_std: 0.02 + ECE: 0.28 + ECE_std: 0.02 + A@10: 0.77 + A@10_std: 0.06 + LLaMa-2 7B: + AC: 0.30 + AC_std: 0.02 + F1: 0.23 + F1_std: 0.02 + AR: 0.56 + AR_std: 0.02 + ECE: 0.43 + ECE_std: 0.02 + A@10: 0.16 + A@10_std: 0.05 + Vietcuna 7B: + AC: 0.31 + AC_std: 0.02 + F1: 0.18 + F1_std: 0.01 + AR: 0.50 + AR_std: 0.00 + ECE: 0.06 + ECE_std: 0.02 + A@10: 0.31 + A@10_std: 0.06 + MixSUra: + AC: 0.65 + AC_std: null + F1: 0.64 + F1_std: null + AR: 0.54 + AR_std: null + ECE: 0.29 + ECE_std: null + A@10: 0.65 + A@10_std: null + GPT-3.5: + 
AC: 0.90 + AC_std: 0.01 + F1: 0.73 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.66 + ECE_std: 0.01 + A@10: 0.91 + A@10_std: 0.04 + GPT-4: + AC: 0.91 + AC_std: 0.01 + F1: 0.73 + F1_std: 0.04 + AR: null + AR_std: null + ECE: 0.66 + ECE_std: 0.01 + A@10: 0.91 + A@10_std: 0.04 \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/language_modeling.yml b/_data/leaderboard/vi/few_shot/language_modeling.yml new file mode 100644 index 0000000..c741c20 --- /dev/null +++ b/_data/leaderboard/vi/few_shot/language_modeling.yml @@ -0,0 +1,236 @@ +MLQA-MLM: + URA-LLaMa 70B: + EM: 0.01 + EM_std: 0.00 + CER: 0.54 + CER_std: 0.00 + WER: 0.66 + WER_std: 0.00 + CED: 669.74 + CED_std: 10.38 + WED: 153.04 + WED_std: 2.33 + PLX: 1.32 + PLX_std: 0.05 + URA-LLaMa 13B: + EM: 0.01 + EM_std: 0.00 + CER: 0.45 + CER_std: 0.01 + WER: 0.61 + WER_std: 0.01 + CED: 559.64 + CED_std: 11.23 + WED: 136.97 + WED_std: 2.68 + PLX: 1.49 + PLX_std: 0.10 + URA-LLaMa 7B: + EM: 0.01 + EM_std: 0.00 + CER: 0.40 + CER_std: 0.01 + WER: 0.55 + WER_std: 0.01 + CED: 498.36 + CED_std: 11.01 + WED: 118.11 + WED_std: 2.58 + PLX: 1.24 + PLX_std: 0.01 + LLaMa-2 13B: + EM: 0.01 + EM_std: 0.00 + CER: 0.74 + CER_std: 0.00 + WER: 0.87 + WER_std: 0.00 + CED: 760.98 + CED_std: 11.91 + WED: 186.90 + WED_std: 2.85 + PLX: 1.24 + PLX_std: 0.03 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + CER: 0.81 + CER_std: 0.00 + WER: 0.98 + WER_std: 0.00 + CED: 769.36 + CED_std: 10.51 + WED: 198.53 + WED_std: 2.57 + PLX: 1.74 + PLX_std: 0.19 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + CER: 1.04 + CER_std: 0.00 + WER: 1.06 + WER_std: 0.00 + CED: 935.65 + CED_std: 12.47 + WED: 204.98 + WED_std: 2.79 + PLX: 1.40 + PLX_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + CER: 0.55 + CER_std: null + WER: 0.63 + WER_std: null + CED: 526.79 + CED_std: null + WED: 131.02 + WED_std: null + PLX: 1.00 + PLX_std: null + GPT-3.5: + EM: 0.04 + EM_std: 0.00 + CER: 0.28 + CER_std: 0.01 + WER: 0.44 + WER_std: 0.01 + CED: 387.37 + CED_std: 10.86 + WED: 92.78 + WED_std: 2.46 + PLX: null + PLX_std: null + GPT-4: + EM: 0.08 + EM_std: 0.00 + CER: 0.23 + CER_std: 0.01 + WER: 0.40 + WER_std: 0.01 + CED: 336.53 + CED_std: 10.23 + WED: 83.55 + WED_std: 2.34 + PLX: null + PLX_std: null +VSEC: + URA-LLaMa 70B: + EM: 0.33 + EM_std: 0.00 + CER: 0.11 + CER_std: 0.00 + WER: 0.13 + WER_std: 0.00 + CED: 15.09 + CED_std: 0.42 + WED: 4.05 + WED_std: 0.11 + PLX: 1.13 + PLX_std: 0.00 + URA-LLaMa 13B: + EM: 0.35 + EM_std: 0.00 + CER: 0.02 + CER_std: 0.00 + WER: 0.04 + WER_std: 0.00 + CED: 2.81 + CED_std: 0.12 + WED: 1.18 + WED_std: 0.03 + PLX: 1.15 + PLX_std: 0.00 + URA-LLaMa 7B: + EM: 0.22 + EM_std: 0.00 + CER: 0.32 + CER_std: 0.01 + WER: 0.33 + WER_std: 0.01 + CED: 41.89 + CED_std: 1.54 + WED: 10.10 + WED_std: 0.34 + PLX: 1.07 + PLX_std: 0.00 + LLaMa-2 13B: + EM: 0.16 + EM_std: 0.00 + CER: 0.03 + CER_std: 0.00 + WER: 0.05 + WER_std: 0.00 + CED: 3.38 + CED_std: 0.16 + WED: 1.51 + WED_std: 0.04 + PLX: 1.01 + PLX_std: 0.00 + LLaMa-2 7B: + EM: 0.12 + EM_std: 0.00 + CER: 0.36 + CER_std: 0.01 + WER: 0.39 + WER_std: 0.01 + CED: 47.50 + CED_std: 0.86 + WED: 11.80 + WED_std: 0.19 + PLX: 1.06 + PLX_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + CER: 8.00 + CER_std: 0.07 + WER: 8.01 + WER_std: 0.07 + CED: 1063.93 + CED_std: 7.64 + WED: 241.74 + WED_std: 1.74 + PLX: 1.46 + PLX_std: 0.00 + MixSUra: + EM: 0.08 + EM_std: null + CER: 0.19 + CER_std: null + WER: 0.28 + WER_std: null + CED: 25.13 + CED_std: null + WED: 8.58 + WED_std: null + PLX: 1.00 + PLX_std: null + GPT-3.5: + EM: 0.66 + EM_std: 0.00 
+ CER: 0.01 + CER_std: 0.00 + WER: 0.02 + WER_std: 0.00 + CED: 1.63 + CED_std: 0.08 + WED: 0.61 + WED_std: 0.02 + PLX: null + PLX_std: null + GPT-4: + EM: 0.75 + EM_std: 0.00 + CER: 0.01 + CER_std: 0.00 + WER: 0.01 + WER_std: 0.00 + CED: 0.89 + CED_std: 0.04 + WED: 0.37 + WED_std: 0.01 + PLX: null + PLX_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/reasoning.yml b/_data/leaderboard/vi/few_shot/reasoning.yml new file mode 100644 index 0000000..101dd47 --- /dev/null +++ b/_data/leaderboard/vi/few_shot/reasoning.yml @@ -0,0 +1,192 @@ +"SR - Natural": + URA-LLaMa 70B: + EM: 0.14 + EM_std: 0.00 + F1: 0.48 + F1_std: 0.00 + Equ: 0.15 + Equ_std: 0.00 + URA-LLaMa 13B: + EM: 0.08 + EM_std: 0.00 + F1: 0.42 + F1_std: 0.00 + Equ: 0.08 + Equ_std: 0.00 + URA-LLaMa 7B: + EM: 0.04 + EM_std: 0.00 + F1: 0.38 + F1_std: 0.00 + Equ: 0.04 + Equ_std: 0.00 + LLaMa-2 13B: + EM: 0.03 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + Equ: 0.04 + Equ_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.01 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + MixSUra: + EM: 0.07 + EM_std: 0.00 + F1: 0.41 + F1_std: 0.00 + Equ: 0.07 + Equ_std: 0.00 + GPT-3.5: + EM: 0.15 + EM_std: 0.00 + F1: 0.50 + F1_std: 0.00 + Equ: 0.16 + Equ_std: 0.00 + GPT-4: + EM: 0.37 + EM_std: 0.00 + F1: 0.74 + F1_std: 0.00 + Equ: 0.42 + Equ_std: 0.00 +"SR - Abstract symbol": + URA-LLaMa 70B: + EM: 0.27 + EM_std: 0.00 + F1: 0.85 + F1_std: 0.00 + Equ: 0.30 + Equ_std: 0.00 + URA-LLaMa 13B: + EM: 0.20 + EM_std: 0.00 + F1: 0.70 + F1_std: 0.00 + Equ: 0.17 + Equ_std: 0.00 + URA-LLaMa 7B: + EM: 0.11 + EM_std: 0.00 + F1: 0.61 + F1_std: 0.00 + Equ: 0.10 + Equ_std: 0.00 + LLaMa-2 13B: + EM: 0.19 + EM_std: 0.00 + F1: 0.69 + F1_std: 0.00 + Equ: 0.18 + Equ_std: 0.00 + LLaMa-2 7B: + EM: 0.06 + EM_std: 0.00 + F1: 0.44 + F1_std: 0.00 + Equ: 0.06 + Equ_std: 0.00 + Vietcuna 7B: + EM: 0.14 + EM_std: 0.00 + F1: 0.71 + F1_std: 0.00 + Equ: 0.10 + Equ_std: 0.00 + MixSUra: + EM: 0.22 + EM_std: 0.00 + F1: 0.78 + F1_std: 0.00 + Equ: 0.23 + Equ_std: 0.00 + GPT-3.5: + EM: 0.26 + EM_std: 0.00 + F1: 0.83 + F1_std: 0.00 + Equ: 0.29 + Equ_std: 0.00 + GPT-4: + EM: 0.37 + EM_std: 0.00 + F1: 0.87 + F1_std: 0.00 + Equ: 0.44 + Equ_std: 0.00 +MATH: + URA-LLaMa 70B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.12 + Equ_std: 0.02 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.01 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.07 + Equ_std: 0.01 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.16 + Equ_std: 0.02 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.11 + Equ_std: 0.01 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.01 + Equ_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.62 + Equ_std: 0.02 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.01 + F1_std: 0.00 + Equ: 0.65 + Equ_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/sentiment_analysis.yml b/_data/leaderboard/vi/few_shot/sentiment_analysis.yml new file mode 100644 index 0000000..6b7de43 --- /dev/null +++ b/_data/leaderboard/vi/few_shot/sentiment_analysis.yml @@ -0,0 +1,200 @@ +VLSP 2016: + URA-LLaMa 70B: + AC: 0.66 + AC_std: 0.01 + F1: 0.49 + F1_std: 0.01 + AR: 0.72 + AR_std: 0.01 + ECE: 
0.13 + ECE_std: 0.01 + A@10: 0.77 + A@10_std: 0.04 + URA-LLaMa 13B: + AC: 0.59 + AC_std: 0.01 + F1: 0.57 + F1_std: 0.01 + AR: 0.67 + AR_std: 0.01 + ECE: 0.09 + ECE_std: 0.01 + A@10: 0.82 + A@10_std: 0.04 + URA-LLaMa 7B: + AC: 0.57 + AC_std: 0.02 + F1: 0.42 + F1_std: 0.05 + AR: 0.69 + AR_std: 0.02 + ECE: 0.07 + ECE_std: 0.02 + A@10: 0.77 + A@10_std: 0.04 + LLaMa-2 13B: + AC: 0.51 + AC_std: 0.01 + F1: 0.41 + F1_std: 0.06 + AR: 0.66 + AR_std: 0.01 + ECE: 0.32 + ECE_std: 0.02 + A@10: 0.80 + A@10_std: 0.04 + LLaMa-2 7B: + AC: 0.45 + AC_std: 0.01 + F1: 0.32 + F1_std: 0.01 + AR: 0.59 + AR_std: 0.01 + ECE: 0.26 + ECE_std: 0.02 + A@10: 0.50 + A@10_std: 0.05 + Vietcuna 7B: + AC: 0.04 + AC_std: 0.01 + F1: 0.05 + F1_std: 0.01 + AR: 0.45 + AR_std: 0.01 + ECE: 0.71 + ECE_std: 0.01 + A@10: 0.05 + A@10_std: 0.02 + MixSUra: + AC: 0.62 + AC_std: null + F1: 0.63 + F1_std: null + AR: 0.59 + AR_std: null + ECE: 0.30 + ECE_std: null + A@10: 0.59 + A@10_std: null + GPT-3.5: + AC: 0.65 + AC_std: 0.01 + F1: 0.59 + F1_std: 0.1 + AR: null + AR_std: null + ECE: 0.32 + ECE_std: 0.01 + A@10: 0.65 + A@10_std: 0.05 + GPT-4: + AC: 0.75 + AC_std: 0.01 + F1: 0.74 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.41 + ECE_std: 0.01 + A@10: 0.74 + A@10_std: 0.04 +UiT-VSFC: + URA-LLaMa 70B: + AC: 0.75 + AC_std: 0.01 + F1: 0.48 + F1_std: 0.01 + AR: 0.81 + AR_std: 0.01 + ECE: 0.16 + ECE_std: 0.01 + A@10: 0.71 + A@10_std: 0.02 + URA-LLaMa 13B: + AC: 0.74 + AC_std: 0.01 + F1: 0.52 + F1_std: 0.08 + AR: 0.83 + AR_std: 0.01 + ECE: 0.10 + ECE_std: 0.01 + A@10: 0.87 + A@10_std: 0.02 + URA-LLaMa 7B: + AC: 0.72 + AC_std: 0.01 + F1: 0.43 + F1_std: 0.01 + AR: 0.78 + AR_std: 0.01 + ECE: 0.13 + ECE_std: 0.01 + A@10: 0.95 + A@10_std: 0.03 + LLaMa-2 13B: + AC: 0.63 + AC_std: 0.01 + F1: 0.46 + F1_std: 0.07 + AR: 0.71 + AR_std: 0.01 + ECE: 0.13 + ECE_std: 0.01 + A@10: 0.88 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.50 + AC_std: 0.01 + F1: 0.34 + F1_std: 0.01 + AR: 0.69 + AR_std: 0.01 + ECE: 0.23 + ECE_std: 0.01 + A@10: 0.62 + A@10_std: 0.03 + Vietcuna 7B: + AC: 0.03 + AC_std: 0.00 + F1: 0.03 + F1_std: 0.00 + AR: 0.53 + AR_std: 0.01 + ECE: 0.50 + ECE_std: 0.00 + A@10: 0.01 + A@10_std: 0.00 + MixSUra: + AC: 0.74 + AC_std: null + F1: 0.46 + F1_std: null + AR: 0.63 + AR_std: null + ECE: 0.23 + ECE_std: null + A@10: 0.655 + A@10_std: null + GPT-3.5: + AC: 0.86 + AC_std: 0.01 + F1: 0.73 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.86 + A@10_std: 0.02 + GPT-4: + AC: 0.85 + AC_std: 0.01 + F1: 0.59 + F1_std: 0.09 + AR: null + AR_std: null + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.85 + A@10_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/text_classification.yml b/_data/leaderboard/vi/few_shot/text_classification.yml new file mode 100644 index 0000000..1d32886 --- /dev/null +++ b/_data/leaderboard/vi/few_shot/text_classification.yml @@ -0,0 +1,200 @@ +UiT-VSMEC: + URA-LLaMa 70B: + AC: 0.25 + AC_std: 0.02 + F1: 0.15 + F1_std: 0.01 + AR: 0.56 + AR_std: 0.01 + ECE: 0.25 + ECE_std: 0.02 + A@10: 0.37 + A@10_std: 0.06 + URA-LLaMa 13B: + AC: 0.32 + AC_std: 0.02 + F1: 0.12 + F1_std: 0.01 + AR: 0.58 + AR_std: 0.01 + ECE: 0.22 + ECE_std: 0.02 + A@10: 0.57 + A@10_std: 0.07 + URA-LLaMa 7B: + AC: 0.29 + AC_std: 0.02 + F1: 0.11 + F1_std: 0.01 + AR: 0.60 + AR_std: 0.01 + ECE: 0.12 + ECE_std: 0.02 + A@10: 0.43 + A@10_std: 0.06 + LLaMa-2 13B: + AC: 0.18 + AC_std: 0.02 + F1: 0.08 + F1_std: 0.01 + AR: 0.55 + AR_std: 0.01 + ECE: 0.45 + ECE_std: 0.01 + A@10: 0.49 + A@10_std: 0.07 + LLaMa-2 7B: + AC: 0.25 + AC_std: 0.02 
+ F1: 0.12 + F1_std: 0.01 + AR: 0.57 + AR_std: 0.01 + ECE: 0.21 + ECE_std: 0.02 + A@10: 0.54 + A@10_std: 0.06 + Vietcuna 7B: + AC: 0.15 + AC_std: 0.01 + F1: 0.05 + F1_std: 0.01 + AR: 0.46 + AR_std: 0.01 + ECE: 0.85 + ECE_std: 0.01 + A@10: 0.15 + A@10_std: 0.04 + MixSUra: + AC: 0.40 + AC_std: null + F1: 0.36 + F1_std: null + AR: 0.72 + AR_std: null + ECE: 0.53 + ECE_std: null + A@10: 0.79 + A@10_std: null + GPT-3.5: + AC: 0.42 + AC_std: 0.02 + F1: 0.40 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.28 + ECE_std: 0.02 + A@10: 0.42 + A@10_std: 0.06 + GPT-4: + AC: 0.49 + AC_std: 0.02 + F1: 0.48 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.35 + ECE_std: 0.02 + A@10: 0.49 + A@10_std: 0.06 +PhoATIS: + URA-LLaMa 70B: + AC: 0.15 + AC_std: 0.01 + F1: 0.22 + F1_std: 0.03 + AR: 0.83 + AR_std: 0.00 + ECE: 0.81 + ECE_std: 0.01 + A@10: 0.13 + A@10_std: 0.04 + URA-LLaMa 13B: + AC: 0.01 + AC_std: 0.01 + F1: 0.06 + F1_std: 0.02 + AR: 0.47 + AR_std: 0.00 + ECE: 0.84 + ECE_std: 0.01 + A@10: 0.00 + A@10_std: 0.01 + URA-LLaMa 7B: + AC: 0.06 + AC_std: 0.01 + F1: 0.01 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.00 + ECE: 0.24 + ECE_std: 0.01 + A@10: 0.08 + A@10_std: 0.03 + LLaMa-2 13B: + AC: 0.02 + AC_std: 0.01 + F1: 0.06 + F1_std: 0.02 + AR: 0.57 + AR_std: 0.01 + ECE: 0.90 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.01 + LLaMa-2 7B: + AC: 0.03 + AC_std: 0.01 + F1: 0.02 + F1_std: 0.01 + AR: 0.56 + AR_std: 0.01 + ECE: 0.54 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.01 + Vietcuna 7B: + AC: 0.04 + AC_std: 0.01 + F1: 0.01 + F1_std: 0.00 + AR: 0.63 + AR_std: 0.00 + ECE: 0.21 + ECE_std: 0.01 + A@10: 0.07 + A@10_std: 0.03 + MixSUra: + AC: 0.81 + AC_std: null + F1: 0.58 + F1_std: null + AR: 0.96 + AR_std: null + ECE: 0.14 + ECE_std: null + A@10: 0.91 + A@10_std: null + GPT-3.5: + AC: 0.69 + AC_std: 0.02 + F1: 0.67 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.63 + ECE_std: 0.02 + A@10: 0.69 + A@10_std: 0.05 + GPT-4: + AC: 0.85 + AC_std: 0.01 + F1: 0.78 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.79 + ECE_std: 0.01 + A@10: 0.88 + A@10_std: 0.04 \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/toxicity_detection.yml b/_data/leaderboard/vi/few_shot/toxicity_detection.yml new file mode 100644 index 0000000..fd54338 --- /dev/null +++ b/_data/leaderboard/vi/few_shot/toxicity_detection.yml @@ -0,0 +1,200 @@ +UiT-ViCTSD: + URA-LLaMa 70B: + AC: 0.44 + AC_std: 0.01 + F1: 0.27 + F1_std: 0.01 + AR: 0.75 + AR_std: 0.01 + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.37 + A@10_std: 0.02 + URA-LLaMa 13B: + AC: 0.44 + AC_std: 0.01 + F1: 0.30 + F1_std: 0.05 + AR: 0.67 + AR_std: 0.01 + ECE: 0.33 + ECE_std: 0.01 + A@10: 0.41 + A@10_std: 0.03 + URA-LLaMa 7B: + AC: 0.43 + AC_std: 0.01 + F1: 0.40 + F1_std: 0.01 + AR: 0.60 + AR_std: 0.01 + ECE: 0.29 + ECE_std: 0.01 + A@10: 0.71 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.28 + AC_std: 0.01 + F1: 0.19 + F1_std: 0.00 + AR: 0.67 + AR_std: 0.01 + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.63 + A@10_std: 0.03 + LLaMa-2 7B: + AC: 0.16 + AC_std: 0.01 + F1: 0.12 + F1_std: 0.01 + AR: 0.61 + AR_std: 0.01 + ECE: 0.66 + ECE_std: 0.01 + A@10: 0.08 + A@10_std: 0.02 + Vietcuna 7B: + AC: 0.08 + AC_std: 0.00 + F1: 0.10 + F1_std: 0.01 + AR: 0.50 + AR_std: 0.00 + ECE: 0.42 + ECE_std: 0.00 + A@10: 0.08 + A@10_std: 0.03 + MixSUra: + AC: 0.70 + AC_std: null + F1: 0.39 + F1_std: null + AR: null + AR_std: null + ECE: 0.29 + ECE_std: null + A@10: 0.80 + A@10_std: null + GPT-3.5: + AC: 0.63 + AC_std: 0.02 + F1: 0.54 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.13 + ECE_std: 0.02 + A@10: 0.63 + A@10_std: 
0.05 + GPT-4: + AC: 0.89 + AC_std: 0.00 + F1: 0.71 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.39 + ECE_std: 0.00 + A@10: 0.89 + A@10_std: 0.03 +UiT-ViHSD: + URA-LLaMa 70B: + AC: 0.17 + AC_std: 0.00 + F1: 0.15 + F1_std: 0.00 + AR: 0.64 + AR_std: 0.01 + ECE: 0.57 + ECE_std: 0.00 + A@10: 0.27 + A@10_std: 0.02 + URA-LLaMa 13B: + AC: 0.26 + AC_std: 0.01 + F1: 0.16 + F1_std: 0.00 + AR: 0.61 + AR_std: 0.01 + ECE: 0.42 + ECE_std: 0.01 + A@10: 0.21 + A@10_std: 0.02 + URA-LLaMa 7B: + AC: 0.16 + AC_std: 0.00 + F1: 0.10 + F1_std: 0.00 + AR: 0.67 + AR_std: 0.01 + ECE: 0.32 + ECE_std: 0.00 + A@10: 0.28 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.17 + AC_std: 0.00 + F1: 0.11 + F1_std: 0.00 + AR: 0.62 + AR_std: 0.01 + ECE: 0.58 + ECE_std: 0.00 + A@10: 0.44 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.01 + AC_std: 0.00 + F1: 0.01 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.01 + ECE: 0.71 + ECE_std: 0.00 + A@10: 0.01 + A@10_std: 0.02 + Vietcuna 7B: + AC: 0.61 + AC_std: 0.01 + F1: 0.21 + F1_std: 0.00 + AR: 0.50 + AR_std: 0.00 + ECE: 0.28 + ECE_std: 0.01 + A@10: 0.61 + A@10_std: 0.02 + MixSUra: + AC: 0.58 + AC_std: null + F1: 0.31 + F1_std: null + AR: 0.68 + AR_std: null + ECE: 0.30 + ECE_std: null + A@10: 0.93 + A@10_std: null + GPT-3.5: + AC: 0.63 + AC_std: 0.01 + F1: 0.47 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.29 + ECE_std: 0.01 + A@10: 0.63 + A@10_std: 0.02 + GPT-4: + AC: 0.77 + AC_std: 0.01 + F1: 0.57 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.44 + ECE_std: 0.01 + A@10: 0.77 + A@10_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/few_shot/translation.yml b/_data/leaderboard/vi/few_shot/translation.yml new file mode 100644 index 0000000..c25954d --- /dev/null +++ b/_data/leaderboard/vi/few_shot/translation.yml @@ -0,0 +1,164 @@ +PhoMT: + URA-LLaMa 70B: + "BLEU envi": 0.28 + "BLEU envi_std": 0.00 + "BLEU vien": 0.59 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.27 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.58 + "hLEPOR vien_std": 0.00 + URA-LLaMa 13B: + "BLEU envi": 0.25 + "BLEU envi_std": 0.00 + "BLEU vien": 0.55 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.15 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.56 + "hLEPOR vien_std": 0.00 + URA-LLaMa 7B: + "BLEU envi": 0.19 + "BLEU envi_std": 0.00 + "BLEU vien": 0.50 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.22 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.54 + "hLEPOR vien_std": 0.00 + LLaMa-2 13B: + "BLEU envi": 0.23 + "BLEU envi_std": 0.00 + "BLEU vien": 0.53 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.23 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.54 + "hLEPOR vien_std": 0.00 + LLaMa-2 7B: + "BLEU envi": 0.18 + "BLEU envi_std": 0.00 + "BLEU vien": 0.47 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.21 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.52 + "hLEPOR vien_std": 0.00 + Vietcuna 7B: + "BLEU envi": 0.15 + "BLEU envi_std": 0.00 + "BLEU vien": 0.35 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.03 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.11 + "hLEPOR vien_std": 0.00 + MixSUra: + "BLEU envi": 0.15 + "BLEU envi_std": null + "BLEU vien": 0.51 + "BLEU vien_std": null + "hLEPOR envi": 0.16 + "hLEPOR envi_std": null + "hLEPOR vien": 0.52 + "hLEPOR vien_std": null + GPT-3.5: + "BLEU envi": 0.33 + "BLEU envi_std": 0.00 + "BLEU vien": 0.65 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.33 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.63 + "hLEPOR vien_std": 0.00 + GPT-4: + "BLEU envi": 0.33 + "BLEU envi_std": 0.00 + "BLEU vien": 0.66 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.34 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.65 + "hLEPOR 
vien_std": 0.00 +OPUS100: + URA-LLaMa 70B: + "BLEU envi": 0.10 + "BLEU envi_std": 0.00 + "BLEU vien": 0.44 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.14 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.41 + "hLEPOR vien_std": 0.01 + URA-LLaMa 13B: + "BLEU envi": 0.10 + "BLEU envi_std": 0.01 + "BLEU vien": 0.41 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.17 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.43 + "hLEPOR vien_std": 0.01 + URA-LLaMa 7B: + "BLEU envi": 0.08 + "BLEU envi_std": 0.00 + "BLEU vien": 0.38 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.14 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.39 + "hLEPOR vien_std": 0.01 + LLaMa-2 13B: + "BLEU envi": 0.09 + "BLEU envi_std": 0.00 + "BLEU vien": 0.39 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.14 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.40 + "hLEPOR vien_std": 0.01 + LLaMa-2 7B: + "BLEU envi": 0.07 + "BLEU envi_std": 0.00 + "BLEU vien": 0.34 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.11 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.36 + "hLEPOR vien_std": 0.01 + Vietcuna 7B: + "BLEU envi": 0.00 + "BLEU envi_std": 0.00 + "BLEU vien": 0.00 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.05 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.16 + "hLEPOR vien_std": 0.00 + MixSUra: + "BLEU envi": 0.07 + "BLEU envi_std": null + "BLEU vien": 0.37 + "BLEU vien_std": null + "hLEPOR envi": 0.09 + "hLEPOR envi_std": null + "hLEPOR vien": 0.36 + "hLEPOR vien_std": null + GPT-3.5: + "BLEU envi": 0.16 + "BLEU envi_std": 0.01 + "BLEU vien": 0.50 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.24 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.51 + "hLEPOR vien_std": 0.00 + GPT-4: + "BLEU envi": 0.17 + "BLEU envi_std": 0.01 + "BLEU vien": 0.51 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.25 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.53 + "hLEPOR vien_std": 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/medium_prompt/question_answering.yml b/_data/leaderboard/vi/medium_prompt/question_answering.yml new file mode 100644 index 0000000..6213b75 --- /dev/null +++ b/_data/leaderboard/vi/medium_prompt/question_answering.yml @@ -0,0 +1,82 @@ +XQuAD: + URA-LLaMa 70B: + EM: 0.08 + EM_std: 0.00 + F1: 0.33 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.04 + EM_std: 0.00 + F1: 0.21 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.01 + EM_std: 0.00 + F1: 0.11 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.10 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.03 + F1_std: 0.00 + MixSUra: + EM: 0.01 + EM_std: null + F1: 0.25 + F1_std: null + GPT-3.5: + EM: null + EM_std: null + F1: null + F1_std: null + GPT-4: + EM: null + EM_std: null + F1: null + F1_std: null +MLQA: + URA-LLaMa 70B: + EM: 0.07 + EM_std: 0.00 + F1: 0.31 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.04 + EM_std: 0.00 + F1: 0.19 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.01 + EM_std: 0.00 + F1: 0.11 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.09 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.03 + F1_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + F1: 0.25 + F1_std: null + GPT-3.5: + EM: null + EM_std: null + F1: null + F1_std: null + GPT-4: + EM: null + EM_std: null + F1: null + F1_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/medium_prompt/summarization.yml b/_data/leaderboard/vi/medium_prompt/summarization.yml new file mode 100644 index 0000000..7c5bced --- /dev/null +++ b/_data/leaderboard/vi/medium_prompt/summarization.yml @@ -0,0 +1,274 @@ +VietNews: + URA-LLaMa 70B: + R1: 0.35 + R1_std: 0.00 + R2: 0.16 + R2_std: 0.00 + RL: 0.24 + 
RL_std: 0.00 + SC: -0.11 + SC_std: 0.00 + BS: 0.12 + BS_std: 0.00 + Cv: 0.63 + Cv_std: 0.00 + De: 5.43 + De_std: 0.02 + Cp: 37.78 + Cp_std: 0.47 + URA-LLaMa 13B: + R1: 0.26 + R1_std: 0.00 + R2: 0.12 + R2_std: 0.00 + RL: 0.17 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.08 + BS_std: 0.18 + Cv: 0.46 + Cv_std: 0.00 + De: 3.55 + De_std: 0.04 + Cp: 47.75 + Cp_std: 0.65 + URA-LLaMa 7B: + R1: 0.41 + R1_std: 0.00 + R2: 0.18 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.08 + BS_std: 0.13 + Cv: 0.83 + Cv_std: 0.00 + De: 8.13 + De_std: 0.04 + Cp: 8.08 + Cp_std: 0.17 + LLaMa-2 13B: + R1: 0.02 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.02 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.19 + BS_std: 0.05 + Cv: 0.01 + Cv_std: 0.00 + De: 0.01 + De_std: 0.00 + Cp: 54.67 + Cp_std: 0.16 + LLaMa-2 7B: + R1: 0.03 + R1_std: 0.00 + R2: 0.01 + R2_std: 0.00 + RL: 0.03 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.17 + BS_std: 0.03 + Cv: 0.04 + Cv_std: 0.00 + De: 0.07 + De_std: 0.00 + Cp: 23.86 + Cp_std: 0.26 + MixSUra: + R1: 0.06 + R1_std: null + R2: 0.01 + R2_std: null + RL: 0.04 + RL_std: null + SC: null + SC_std: null + BS: -0.13 + BS_std: null + Cv: 0.10 + Cv_std: null + De: 0.17 + De_std: null + Cp: 9.03 + Cp_std: null + GPT-3.5: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null + GPT-4: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null +WikiLingua: + URA-LLaMa 70B: + R1: 0.33 + R1_std: 0.00 + R2: 0.14 + R2_std: 0.00 + RL: 0.22 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.24 + BS_std: 0.10 + Cv: 0.59 + Cv_std: 0.01 + De: 4.62 + De_std: 0.11 + Cp: 56.56 + Cp_std: 1.70 + URA-LLaMa 13B: + R1: 0.14 + R1_std: 0.00 + R2: 0.05 + R2_std: 0.00 + RL: 0.09 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: -0.14 + BS_std: 0.12 + Cv: 0.26 + Cv_std: 0.01 + De: 1.83 + De_std: 0.06 + Cp: 60.10 + Cp_std: 2.16 + URA-LLaMa 7B: + R1: 0.42 + R1_std: 0.00 + R2: 0.17 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.27 + BS_std: 0.21 + Cv: 0.84 + Cv_std: 0.00 + De: 7.15 + De_std: 0.08 + Cp: 8.08 + Cp_std: 0.36 + LLaMa-2 13B: + R1: 0.03 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.03 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: -0.05 + BS_std: 0.03 + Cv: 0.02 + Cv_std: 0.00 + De: 0.02 + De_std: 0.00 + Cp: 42.55 + Cp_std: 0.81 + LLaMa-2 7B: + R1: 0.02 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.02 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: -0.04 + BS_std: 0.06 + Cv: 0.02 + Cv_std: 0.00 + De: 0.03 + De_std: 0.00 + Cp: 40.31 + Cp_std: 0.88 + MixSUra: + R1: 0.03 + R1_std: null + R2: 0.00 + R2_std: null + RL: 0.03 + RL_std: null + SC: null + SC_std: null + BS: -0.01 + BS_std: null + Cv: 0.17 + Cv_std: null + De: 0.26 + De_std: null + Cp: 16.68 + Cp_std: null + GPT-3.5: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null + GPT-4: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null \ No newline at end of file diff --git 
a/_data/leaderboard/vi/models.yml b/_data/leaderboard/vi/models.yml new file mode 100644 index 0000000..037f74d --- /dev/null +++ b/_data/leaderboard/vi/models.yml @@ -0,0 +1,11 @@ +models: + - URA-LLaMa 70B + - URA-LLaMa 13B + - URA-LLaMa 7B + - LLaMa-2 13B + - LLaMa-2 7B + - Vietcuna 7B + - GPT-3.5 + - GPT-4 + - Gemini Pro + - MixSUra \ No newline at end of file diff --git a/_data/leaderboard/vi/randomized_choice/knowledge.yml b/_data/leaderboard/vi/randomized_choice/knowledge.yml new file mode 100644 index 0000000..203f875 --- /dev/null +++ b/_data/leaderboard/vi/randomized_choice/knowledge.yml @@ -0,0 +1,100 @@ +ViMMRC: + URA-LLaMa 70B: + AC: 0.76 + AC_std: 0.02 + F1: 0.76 + F1_std: 0.02 + AR: 0.78 + AR_std: 0.01 + ECE: 0.14 + ECE_std: 0.02 + A@10: 0.94 + A@10_std: 0.04 + URA-LLaMa 13B: + AC: 0.62 + AC_std: 0.02 + F1: 0.62 + F1_std: 0.02 + AR: 0.61 + AR_std: 0.02 + ECE: 0.15 + ECE_std: 0.02 + A@10: 0.67 + A@10_std: 0.07 + URA-LLaMa 7B: + AC: 0.45 + AC_std: 0.02 + F1: 0.36 + F1_std: 0.02 + AR: 0.57 + AR_std: 0.02 + ECE: 0.10 + ECE_std: 0.02 + A@10: 0.45 + A@10_std: 0.07 + LLaMa-2 13B: + AC: 0.57 + AC_std: 0.02 + F1: 0.57 + F1_std: 0.02 + AR: 0.57 + AR_std: 0.02 + ECE: 0.29 + ECE_std: 0.02 + A@10: 0.75 + A@10_std: 0.07 + LLaMa-2 7B: + AC: 0.36 + AC_std: 0.02 + F1: 0.27 + F1_std: 0.02 + AR: 0.56 + AR_std: 0.02 + ECE: 0.37 + ECE_std: 0.02 + A@10: 0.44 + A@10_std: 0.07 + Vietcuna 7B: + AC: 0.26 + AC_std: 0.02 + F1: 0.15 + F1_std: 0.01 + AR: 0.50 + AR_std: 0.00 + ECE: 0.01 + ECE_std: 0.01 + A@10: 0.26 + A@10_std: 0.06 + MixSUra: + AC: 0.61 + AC_std: null + F1: 0.61 + F1_std: null + AR: 0.54 + AR_std: null + ECE: 0.31 + ECE_std: null + A@10: 0.65 + A@10_std: null + GPT-3.5: + AC: 0.92 + AC_std: 0.01 + F1: 0.74 + F1_std: 0.04 + AR: null + AR_std: null + ECE: 0.67 + ECE_std: 0.01 + A@10: 0.92 + A@10_std: 0.04 + GPT-4: + AC: 0.92 + AC_std: 0.01 + F1: 0.74 + F1_std: 0.04 + AR: null + AR_std: null + ECE: 0.67 + ECE_std: 0.01 + A@10: 0.92 + A@10_std: 0.04 \ No newline at end of file diff --git a/_data/leaderboard/vi/robustness_aware/information_retrieval.yml b/_data/leaderboard/vi/robustness_aware/information_retrieval.yml new file mode 100644 index 0000000..3bce6d4 --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/information_retrieval.yml @@ -0,0 +1,146 @@ +mMARCO: + URA-LLaMa 70B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + URA-LLaMa 13B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + URA-LLaMa 7B: + "M@10": 0.05 + "M@10_std": 0.00 + "M@10B": 0.11 + "M@10B_std": 0.00 + "N@10": 0.07 + "N@10_std": 0.00 + "N@10B": 0.17 + "N@10B_std": 0.00 + LLaMa-2 13B: + "M@10": 0.06 + "M@10_std": 0.00 + "M@10B": 0.13 + "M@10B_std": 0.00 + "N@10": 0.19 + "N@10_std": 0.00 + "N@10B": 0.19 + "N@10B_std": 0.00 + LLaMa-2 7B: + "M@10": 0.05 + "M@10_std": 0.00 + "M@10B": 0.11 + "M@10B_std": 0.00 + "N@10": 0.08 + "N@10_std": 0.00 + "N@10B": 0.16 + "N@10B_std": 0.00 + Vietcuna 7B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + GPT-3.5: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + GPT-4: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null +mRobust04: + 
URA-LLaMa 70B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + URA-LLaMa 13B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + URA-LLaMa 7B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + LLaMa-2 13B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + LLaMa-2 7B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + Vietcuna 7B: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + GPT-3.5: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null + GPT-4: + "M@10": null + "M@10_std": null + "M@10B": null + "M@10B_std": null + "N@10": null + "N@10_std": null + "N@10B": null + "N@10B_std": null \ No newline at end of file diff --git a/_data/leaderboard/vi/robustness_aware/knowledge.yml b/_data/leaderboard/vi/robustness_aware/knowledge.yml new file mode 100644 index 0000000..127ed2c --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/knowledge.yml @@ -0,0 +1,147 @@ +ZaloE2E: + num_fields: 2 + URA-LLaMa 70B: + EM: 0.23 + EM_std: 0.00 + F1: 0.37 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.18 + EM_std: 0.00 + F1: 0.30 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.10 + EM_std: 0.00 + F1: 0.18 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.13 + EM_std: 0.00 + F1: 0.21 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.02 + EM_std: 0.00 + F1: 0.05 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.05 + EM_std: 0.00 + F1: 0.15 + F1_std: 0.00 + MixSUra: + EM: 0.13 + EM_std: null + F1: 0.24 + F1_std: null + GPT-3.5: + EM: 0.45 + EM_std: 0.01 + F1: 0.61 + F1_std: 0.01 + GPT-4: + EM: 0.44 + EM_std: 0.01 + F1: 0.61 + F1_std: 0.01 +ViMMRC: + URA-LLaMa 70B: + AC: 0.65 + AC_std: 0.00 + F1: 0.53 + F1_std: 0.00 + AR: 0.84 + AR_std: 0.00 + ECE: 0.11 + ECE_std: 0.00 + A@10: 0.77 + A@10_std: 0.00 + URA-LLaMa 13B: + AC: 0.41 + AC_std: 0.00 + F1: 0.34 + F1_std: 0.00 + AR: 0.61 + AR_std: 0.00 + ECE: 0.22 + ECE_std: 0.00 + A@10: 0.58 + A@10_std: 0.00 + URA-LLaMa 7B: + AC: 0.33 + AC_std: 0.02 + F1: 0.28 + F1_std: 0.02 + AR: 0.61 + AR_std: 0.01 + ECE: 0.19 + ECE_std: 0.02 + A@10: 0.33 + A@10_std: 0.06 + LLaMa-2 13B: + AC: 0.39 + AC_std: 0.00 + F1: 0.31 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.00 + ECE: 0.46 + ECE_std: 0.00 + A@10: 0.33 + A@10_std: 0.00 + LLaMa-2 7B: + AC: 0.26 + AC_std: 0.01 + F1: 0.20 + F1_std: 0.01 + AR: 0.51 + AR_std: 0.01 + ECE: 0.46 + ECE_std: 0.01 + A@10: 0.13 + A@10_std: 0.03 + Vietcuna 7B: + AC: 0.26 + AC_std: 0.01 + F1: 0.14 + F1_std: 0.00 + AR: 0.50 + AR_std: 0.00 + ECE: 0.01 + ECE_std: 0.01 + A@10: 0.21 + A@10_std: 0.07 + MixSUra: + AC: 0.57 + AC_std: null + F1: 0.45 + F1_std: null + AR: 0.53 + AR_std: null + ECE: 0.35 + ECE_std: null + A@10: 0.58 + A@10_std: null + GPT-3.5: + AC: 0.90 + AC_std: 0.01 + F1: 0.72 + F1_std: 0.04 + AR: null + AR_std: null + ECE: 0.65 + ECE_std: 0.01 + A@10: 0.88 + A@10_std: 0.07 + GPT-4: + AC: 0.91 + AC_std: 0.01 + F1: 0.73 + F1_std: 0.07 + AR: null + AR_std: null + ECE: 0.66 + ECE_std: 0.07 + A@10: 0.88 + A@10_std: 0.04 \ No newline at end of file diff --git 
a/_data/leaderboard/vi/robustness_aware/question_answering.yml b/_data/leaderboard/vi/robustness_aware/question_answering.yml new file mode 100644 index 0000000..40b79c4 --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/question_answering.yml @@ -0,0 +1,92 @@ +XQuAD: + URA-LLaMa 70B: + EM: 0.01 + EM_std: 0.00 + F1: 0.17 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.09 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.09 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.02 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.02 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.06 + F1_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + F1: 0.11 + F1_std: null + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.19 + F1_std: 0.00 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 +MLQA: + URA-LLaMa 70B: + EM: 0.01 + EM_std: 0.00 + F1: 0.18 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.10 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.10 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.03 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.02 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.05 + F1_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + F1: 0.12 + F1_std: null + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.20 + F1_std: 0.00 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.25 + F1_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/robustness_aware/sentiment_analysis.yml b/_data/leaderboard/vi/robustness_aware/sentiment_analysis.yml new file mode 100644 index 0000000..c1e55e3 --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/sentiment_analysis.yml @@ -0,0 +1,200 @@ +VLSP 2016: + URA-LLaMa 70B: + AC: 0.63 + AC_std: 0.01 + F1: 0.48 + F1_std: 0.01 + AR: 0.60 + AR_std: 0.01 + ECE: 0.09 + ECE_std: 0.01 + A@10: 0.83 + A@10_std: 0.04 + URA-LLaMa 13B: + AC: 0.55 + AC_std: 0.02 + F1: 0.52 + F1_std: 0.02 + AR: 0.59 + AR_std: 0.01 + ECE: 0.06 + ECE_std: 0.01 + A@10: 0.74 + A@10_std: 0.05 + URA-LLaMa 7B: + AC: 0.52 + AC_std: 0.02 + F1: 0.36 + F1_std: 0.03 + AR: 0.59 + AR_std: 0.01 + ECE: 0.07 + ECE_std: 0.01 + A@10: 0.66 + A@10_std: 0.05 + LLaMa-2 13B: + AC: 0.46 + AC_std: 0.02 + F1: 0.30 + F1_std: 0.01 + AR: 0.55 + AR_std: 0.01 + ECE: 0.39 + ECE_std: 0.02 + A@10: 0.70 + A@10_std: 0.05 + LLaMa-2 7B: + AC: 0.45 + AC_std: 0.02 + F1: 0.36 + F1_std: 0.01 + AR: 0.54 + AR_std: 0.01 + ECE: 0.20 + ECE_std: 0.02 + A@10: 0.51 + A@10_std: 0.05 + Vietcuna 7B: + AC: 0.44 + AC_std: 0.02 + F1: 0.27 + F1_std: 0.01 + AR: 0.51 + AR_std: 0.01 + ECE: 0.23 + ECE_std: 0.02 + A@10: 0.53 + A@10_std: 0.05 + MixSUra: + AC: 0.59 + AC_std: null + F1: 0.59 + F1_std: null + AR: 0.55 + AR_std: null + ECE: 0.34 + ECE_std: null + A@10: 0.52 + A@10_std: null + GPT-3.5: + AC: 0.64 + AC_std: 0.01 + F1: 0.60 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.31 + ECE_std: 0.01 + A@10: 0.54 + A@10_std: 0.05 + GPT-4: + AC: 0.74 + AC_std: 0.00 + F1: 0.73 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.41 + ECE_std: 0.00 + A@10: 0.71 + A@10_std: 0.00 +UiT-VSFC: + URA-LLaMa 70B: + AC: 0.71 + AC_std: 0.01 + F1: 0.45 + F1_std: 0.01 + AR: 0.80 + AR_std: 0.01 + ECE: 0.08 + ECE_std: 0.01 + A@10: 0.99 + A@10_std: 0.01 + URA-LLaMa 13B: + AC: 0.72 + AC_std: 0.01 + F1: 0.44 + F1_std: 0.05 + AR: 0.77 + AR_std: 0.01 + ECE: 0.18 + ECE_std: 0.01 + A@10: 0.77 + A@10_std: 0.02 + URA-LLaMa 7B: + AC: 0.73 + AC_std: 0.01 + F1: 0.41 + F1_std: 0.01 + AR: 0.71 + AR_std: 0.01 + 
ECE: 0.16 + ECE_std: 0.01 + A@10: 0.87 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.66 + AC_std: 0.01 + F1: 0.40 + F1_std: 0.01 + AR: 0.63 + AR_std: 0.01 + ECE: 0.11 + ECE_std: 0.01 + A@10: 0.89 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.51 + AC_std: 0.01 + F1: 0.33 + F1_std: 0.01 + AR: 0.65 + AR_std: 0.01 + ECE: 0.15 + ECE_std: 0.01 + A@10: 0.80 + A@10_std: 0.02 + Vietcuna 7B: + AC: 0.49 + AC_std: 0.01 + F1: 0.25 + F1_std: 0.03 + AR: 0.46 + AR_std: 0.01 + ECE: 0.33 + ECE_std: 0.01 + A@10: 0.34 + A@10_std: 0.03 + MixSUra: + AC: 0.69 + AC_std: null + F1: 0.44 + F1_std: null + AR: 0.61 + AR_std: null + ECE: 0.29 + ECE_std: null + A@10: 0.66 + A@10_std: null + GPT-3.5: + AC: 0.86 + AC_std: 0.01 + F1: 0.71 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.53 + ECE_std: 0.01 + A@10: 0.86 + A@10_std: 0.02 + GPT-4: + AC: 0.83 + AC_std: 0.00 + F1: 0.70 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.50 + ECE_std: 0.00 + A@10: 0.85 + A@10_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/robustness_aware/summarization.yml b/_data/leaderboard/vi/robustness_aware/summarization.yml new file mode 100644 index 0000000..8527eed --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/summarization.yml @@ -0,0 +1,308 @@ +VietNews: + URA-LLaMa 70B: + R1: 0.34 + R1_std: 0.00 + R2: 0.15 + R2_std: 0.00 + RL: 0.23 + RL_std: 0.00 + SC: -0.06 + SC_std: 0.00 + BS: -0.11 + BS_std: 0.18 + Cv: 0.10 + Cv_std: 0.00 + De: 0.10 + De_std: 0.00 + Cp: 39.63 + Cp_std: 0.87 + URA-LLaMa 13B: + R1: 0.35 + R1_std: 0.00 + R2: 0.14 + R2_std: 0.00 + RL: 0.23 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.07 + BS_std: 0.17 + Cv: 0.64 + Cv_std: 0.00 + De: 0.65 + De_std: 0.00 + Cp: 134.65 + Cp_std: 3.76 + URA-LLaMa 7B: + R1: 0.37 + R1_std: 0.00 + R2: 0.12 + R2_std: 0.00 + RL: 0.24 + RL_std: 0.00 + SC: -0.10 + SC_std: 0.00 + BS: -0.24 + BS_std: 0.18 + Cv: 0.65 + Cv_std: 0.00 + De: 0.65 + De_std: 0.00 + Cp: 17.92 + Cp_std: 0.87 + LLaMa-2 13B: + R1: 0.05 + R1_std: 0.00 + R2: 0.01 + R2_std: 0.00 + RL: 0.04 + RL_std: 0.00 + SC: -0.15 + SC_std: 0.00 + BS: -0.24 + BS_std: 0.18 + Cv: 0.03 + Cv_std: 0.00 + De: 0.03 + De_std: 0.00 + Cp: 55.91 + Cp_std: 0.65 + LLaMa-2 7B: + R1: 0.05 + R1_std: 0.00 + R2: 0.01 + R2_std: 0.00 + RL: 0.05 + RL_std: 0.00 + SC: -0.10 + SC_std: 0.00 + BS: -0.19 + BS_std: 0.04 + Cv: 0.07 + Cv_std: 0.00 + De: 0.07 + De_std: 0.00 + Cp: 55.29 + Cp_std: 0.88 + Vietcuna 7B: + R1: 0.03 + R1_std: 0.00 + R2: 0.01 + R2_std: 0.00 + RL: 0.02 + RL_std: 0.00 + SC: -0.10 + SC_std: 0.00 + BS: -0.18 + BS_std: 0.06 + Cv: 0.91 + Cv_std: 0.00 + De: 0.91 + De_std: 0.00 + Cp: 1026.61 + Cp_std: 3.86 + MixSUra: + R1: 0.41 + R1_std: null + R2: 0.19 + R2_std: null + RL: 0.26 + RL_std: null + SC: null + SC_std: null + BS: -0.03 + BS_std: null + Cv: 0.86 + Cv_std: null + De: 0.87 + De_std: null + Cp: 29.15 + Cp_std: null + GPT-3.5: + R1: 0.34 + R1_std: 0.00 + R2: 0.19 + R2_std: 0.00 + RL: 0.23 + RL_std: 0.00 + SC: -0.10 + SC_std: 0.00 + BS: 0.05 + BS_std: 0.14 + Cv: 0.81 + Cv_std: 0.00 + De: 0.81 + De_std: 0.00 + Cp: 128.44 + Cp_std: 2.94 + GPT-4: + R1: 0.39 + R1_std: 0.00 + R2: 0.21 + R2_std: 0.00 + RL: 0.26 + RL_std: 0.00 + SC: -0.10 + SC_std: 0.09 + BS: 0.04 + BS_std: 0.00 + Cv: 0.83 + Cv_std: 0.00 + De: 0.83 + De_std: 0.71 + Cp: 24.48 + Cp_std: 0.00 +WikiLingua: + URA-LLaMa 70B: + R1: 0.28 + R1_std: 0.00 + R2: 0.11 + R2_std: 0.00 + RL: 0.19 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.25 + BS_std: 0.23 + Cv: 0.50 + Cv_std: 0.01 + De: 0.50 + De_std: 0.01 + Cp: 167.42 + Cp_std: 7.09 + URA-LLaMa 13B: + R1: 0.20 + R1_std: 
0.00 + R2: 0.07 + R2_std: 0.00 + RL: 0.13 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: 0.20 + BS_std: 0.11 + Cv: 0.38 + Cv_std: 0.00 + De: 0.38 + De_std: 0.00 + Cp: 103.69 + Cp_std: 3.33 + URA-LLaMa 7B: + R1: 0.37 + R1_std: 0.00 + R2: 0.12 + R2_std: 0.00 + RL: 0.24 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: 0.11 + BS_std: 0.18 + Cv: 0.65 + Cv_std: 0.00 + De: 0.65 + De_std: 0.00 + Cp: 20.49 + Cp_std: 0.95 + LLaMa-2 13B: + R1: 0.04 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.03 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: 0.09 + BS_std: 0.00 + Cv: 0.05 + Cv_std: 0.00 + De: 0.05 + De_std: 0.00 + Cp: 66.85 + Cp_std: 6.72 + LLaMa-2 7B: + R1: 0.04 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.04 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: 0.15 + BS_std: 0.00 + Cv: 0.06 + Cv_std: 0.00 + De: 0.06 + De_std: 0.00 + Cp: 58.32 + Cp_std: 3.32 + Vietcuna 7B: + R1: 0.08 + R1_std: 0.00 + R2: 0.02 + R2_std: 0.00 + RL: 0.05 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: -0.19 + BS_std: 0.05 + Cv: 0.78 + Cv_std: 0.00 + De: 0.78 + De_std: 0.00 + Cp: 505.45 + Cp_std: 8.64 + MixSUra: + R1: 0.46 + R1_std: null + R2: 0.21 + R2_std: null + RL: 0.28 + RL_std: null + SC: null + SC_std: null + BS: 0.26 + BS_std: null + Cv: 0.88 + Cv_std: null + De: 0.98 + De_std: null + Cp: 19.10 + Cp_std: null + GPT-3.5: + R1: 0.39 + R1_std: 0.00 + R2: 0.19 + R2_std: 0.00 + RL: 0.25 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: 0.28 + BS_std: 0.11 + Cv: 0.82 + Cv_std: 0.00 + De: 0.82 + De_std: 0.00 + Cp: 200.90 + Cp_std: 7.40 + GPT-4: + R1: 0.45 + R1_std: 0.00 + R2: 0.20 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: 0.28 + BS_std: 0.00 + Cv: 0.80 + Cv_std: 0.03 + De: 0.81 + De_std: 0.00 + Cp: 20.40 + Cp_std: 1.59 diff --git a/_data/leaderboard/vi/robustness_aware/text_classification.yml b/_data/leaderboard/vi/robustness_aware/text_classification.yml new file mode 100644 index 0000000..388fc24 --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/text_classification.yml @@ -0,0 +1,200 @@ +UiT-VSMEC: + URA-LLaMa 70B: + AC: 0.25 + AC_std: 0.00 + F1: 0.16 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.02 + ECE: 0.20 + ECE_std: 0.00 + A@10: 0.33 + A@10_std: 0.00 + URA-LLaMa 13B: + AC: 0.30 + AC_std: 0.00 + F1: 0.11 + F1_std: 0.00 + AR: 0.51 + AR_std: 0.01 + ECE: 0.26 + ECE_std: 0.00 + A@10: 0.44 + A@10_std: 0.00 + URA-LLaMa 7B: + AC: 0.29 + AC_std: 0.00 + F1: 0.10 + F1_std: 0.00 + AR: 0.57 + AR_std: 0.01 + ECE: 0.17 + ECE_std: 0.00 + A@10: 0.30 + A@10_std: 0.00 + LLaMa-2 13B: + AC: 0.19 + AC_std: 0.00 + F1: 0.07 + F1_std: 0.00 + AR: 0.52 + AR_std: 0.01 + ECE: 0.47 + ECE_std: 0.00 + A@10: 0.43 + A@10_std: 0.00 + LLaMa-2 7B: + AC: 0.17 + AC_std: 0.00 + F1: 0.10 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.00 + ECE: 0.33 + ECE_std: 0.00 + A@10: 0.29 + A@10_std: 0.00 + Vietcuna 7B: + AC: 0.09 + AC_std: 0.00 + F1: 0.09 + F1_std: 0.00 + AR: 0.51 + AR_std: 0.01 + ECE: 0.91 + ECE_std: 0.00 + A@10: 0.09 + A@10_std: 0.00 + MixSUra: + AC: 0.35 + AC_std: null + F1: 0.27 + F1_std: null + AR: 0.70 + AR_std: null + ECE: 0.58 + ECE_std: null + A@10: 0.70 + A@10_std: null + GPT-3.5: + AC: 0.42 + AC_std: 0.00 + F1: 0.41 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.28 + ECE_std: 0.00 + A@10: 0.30 + A@10_std: 0.00 + GPT-4: + AC: 0.48 + AC_std: 0.00 + F1: 0.45 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.33 + ECE_std: 0.00 + A@10: 0.40 + A@10_std: 0.00 +PhoATIS: + URA-LLaMa 70B: + AC: 0.16 + AC_std: 0.02 + F1: 0.26 + F1_std: 0.03 + AR: 0.79 + AR_std: 0.00 + ECE: 0.79 + ECE_std: 0.02 + A@10: 0.08 + A@10_std: 
0.06 + URA-LLaMa 13B: + AC: 0.01 + AC_std: 0.01 + F1: 0.05 + F1_std: 0.01 + AR: 0.47 + AR_std: 0.01 + ECE: 0.84 + ECE_std: 0.01 + A@10: 0.00 + A@10_std: 0.04 + URA-LLaMa 7B: + AC: 0.02 + AC_std: 0.01 + F1: 0.04 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.01 + ECE: 0.18 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.02 + AC_std: 0.00 + F1: 0.06 + F1_std: 0.00 + AR: 0.57 + AR_std: 0.01 + ECE: 0.91 + ECE_std: 0.00 + A@10: 0.01 + A@10_std: 0.00 + LLaMa-2 7B: + AC: 0.01 + AC_std: 0.01 + F1: 0.00 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.00 + ECE: 0.69 + ECE_std: 0.01 + A@10: 0.02 + A@10_std: 0.02 + Vietcuna 7B: + AC: 0.02 + AC_std: 0.01 + F1: 0.01 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.01 + ECE: 0.23 + ECE_std: 0.01 + A@10: 0.02 + A@10_std: 0.01 + MixSUra: + AC: 0.80 + AC_std: null + F1: 0.55 + F1_std: null + AR: 0.94 + AR_std: null + ECE: 0.15 + ECE_std: null + A@10: 0.88 + A@10_std: null + GPT-3.5: + AC: 0.68 + AC_std: 0.02 + F1: 0.64 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.62 + ECE_std: 0.02 + A@10: 0.70 + A@10_std: 0.05 + GPT-4: + AC: 0.86 + AC_std: 0.01 + F1: 0.80 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.80 + ECE_std: 0.01 + A@10: 0.91 + A@10_std: 0.03 \ No newline at end of file diff --git a/_data/leaderboard/vi/robustness_aware/toxicity_detection.yml b/_data/leaderboard/vi/robustness_aware/toxicity_detection.yml new file mode 100644 index 0000000..13aded0 --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/toxicity_detection.yml @@ -0,0 +1,200 @@ +UiT-ViCTSD: + URA-LLaMa 70B: + AC: 0.32 + AC_std: 0.00 + F1: 0.21 + F1_std: 0.00 + AR: 0.72 + AR_std: 0.01 + ECE: 0.62 + ECE_std: 0.00 + A@10: 0.33 + A@10_std: 0.00 + URA-LLaMa 13B: + AC: 0.27 + AC_std: 0.00 + F1: 0.26 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.00 + ECE: 0.56 + ECE_std: 0.00 + A@10: 0.12 + A@10_std: 0.00 + URA-LLaMa 7B: + AC: 0.22 + AC_std: 0.00 + F1: 0.21 + F1_std: 0.00 + AR: 0.63 + AR_std: 0.00 + ECE: 0.39 + ECE_std: 0.00 + A@10: 0.36 + A@10_std: 0.00 + LLaMa-2 13B: + AC: 0.12 + AC_std: 0.00 + F1: 0.11 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.01 + ECE: 0.66 + ECE_std: 0.00 + A@10: 0.12 + A@10_std: 0.00 + LLaMa-2 7B: + AC: 0.04 + AC_std: 0.00 + F1: 0.04 + F1_std: 0.00 + AR: 0.62 + AR_std: 0.00 + ECE: 0.86 + ECE_std: 0.00 + A@10: 0.02 + A@10_std: 0.00 + Vietcuna 7B: + AC: 0.11 + AC_std: 0.00 + F1: 0.11 + F1_std: 0.00 + AR: 0.54 + AR_std: 0.00 + ECE: 0.39 + ECE_std: 0.00 + A@10: 0.13 + A@10_std: 0.00 + MixSUra: + AC: 0.72 + AC_std: null + F1: 0.39 + F1_std: null + AR: null + AR_std: null + ECE: 0.25 + ECE_std: null + A@10: 0.81 + A@10_std: null + GPT-3.5: + AC: 0.51 + AC_std: 0.00 + F1: 0.46 + F1_std: 0.00 + AR: 0.5 + AR_std: 0.00 + ECE: 0.01 + ECE_std: 0.00 + A@10: 0.54 + A@10_std: 0.00 + GPT-4: + AC: 0.88 + AC_std: 0.00 + F1: 0.71 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.38 + ECE_std: 0.00 + A@10: 0.88 + A@10_std: 0.00 +UiT-ViHSD: + URA-LLaMa 70B: + AC: 0.14 + AC_std: 0.00 + F1: 0.12 + F1_std: 0.00 + AR: 0.64 + AR_std: 0.02 + ECE: 0.61 + ECE_std: 0.00 + A@10: 0.23 + A@10_std: 0.00 + URA-LLaMa 13B: + AC: 0.18 + AC_std: 0.00 + F1: 0.11 + F1_std: 0.00 + AR: 0.57 + AR_std: 0.01 + ECE: 0.45 + ECE_std: 0.00 + A@10: 0.20 + A@10_std: 0.00 + URA-LLaMa 7B: + AC: 0.12 + AC_std: 0.00 + F1: 0.07 + F1_std: 0.00 + AR: 0.62 + AR_std: 0.00 + ECE: 0.38 + ECE_std: 0.00 + A@10: 0.19 + A@10_std: 0.00 + LLaMa-2 13B: + AC: 0.10 + AC_std: 0.00 + F1: 0.07 + F1_std: 0.00 + AR: 0.59 + AR_std: 0.01 + ECE: 0.62 + ECE_std: 0.00 + A@10: 0.24 + A@10_std: 0.00 + LLaMa-2 7B: + AC: 0.01 + AC_std: 0.00 + F1: 0.00 + F1_std: 0.00
+ AR: 0.54 + AR_std: 0.00 + ECE: 0.79 + ECE_std: 0.00 + A@10: 0.00 + A@10_std: 0.00 + Vietcuna 7B: + AC: 0.09 + AC_std: 0.00 + F1: 0.05 + F1_std: 0.00 + AR: 0.5 + AR_std: 0.00 + ECE: 0.24 + ECE_std: 0.00 + A@10: 0.08 + A@10_std: 0.00 + MixSUra: + AC: 0.66 + AC_std: null + F1: 0.31 + F1_std: null + AR: 0.67 + AR_std: null + ECE: 0.21 + ECE_std: null + A@10: 0.82 + A@10_std: null + GPT-3.5: + AC: 0.64 + AC_std: 0.00 + F1: 0.47 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.30 + ECE_std: 0.00 + A@10: 0.63 + A@10_std: 0.00 + GPT-4: + AC: 0.78 + AC_std: 0.00 + F1: 0.56 + F1_std: 0.00 + AR: null + AR_std: null + ECE: 0.44 + ECE_std: 0.00 + A@10: 0.78 + A@10_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/robustness_aware/translation.yml b/_data/leaderboard/vi/robustness_aware/translation.yml new file mode 100644 index 0000000..edcfc9f --- /dev/null +++ b/_data/leaderboard/vi/robustness_aware/translation.yml @@ -0,0 +1,164 @@ +PhoMT: + URA-LLaMa 70B: + "BLEU envi": 0.25 + "BLEU envi_std": 0.00 + "BLEU vien": 0.58 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.11 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.51 + "hLEPOR vien_std": 0.00 + URA-LLaMa 13B: + "BLEU envi": 0.23 + "BLEU envi_std": 0.00 + "BLEU vien": 0.55 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.10 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.50 + "hLEPOR vien_std": 0.00 + URA-LLaMa 7B: + "BLEU envi": 0.15 + "BLEU envi_std": 0.00 + "BLEU vien": 0.48 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.06 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.46 + "hLEPOR vien_std": 0.00 + LLaMa-2 13B: + "BLEU envi": 0.20 + "BLEU envi_std": 0.00 + "BLEU vien": 0.51 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.07 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.44 + "hLEPOR vien_std": 0.00 + LLaMa-2 7B: + "BLEU envi": 0.13 + "BLEU envi_std": 0.00 + "BLEU vien": 0.41 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.05 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.42 + "hLEPOR vien_std": 0.00 + Vietcuna 7B: + "BLEU envi": 0.17 + "BLEU envi_std": 0.00 + "BLEU vien": 0.43 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.07 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.41 + "hLEPOR vien_std": 0.00 + MixSUra: + "BLEU envi": 0.14 + "BLEU envi_std": null + "BLEU vien": 0.50 + "BLEU vien_std": null + "hLEPOR envi": 0.11 + "hLEPOR envi_std": null + "hLEPOR vien": 0.46 + "hLEPOR vien_std": null + GPT-3.5: + "BLEU envi": 0.31 + "BLEU envi_std": 0.00 + "BLEU vien": 0.64 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.17 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.59 + "hLEPOR vien_std": 0.00 + GPT-4: + "BLEU envi": 0.31 + "BLEU envi_std": 0.00 + "BLEU vien": 0.65 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.20 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.62 + "hLEPOR vien_std": 0.00 +OPUS100: + URA-LLaMa 70B: + "BLEU envi": 0.05 + "BLEU envi_std": 0.00 + "BLEU vien": 0.40 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.06 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.36 + "hLEPOR vien_std": 0.00 + URA-LLaMa 13B: + "BLEU envi": 0.03 + "BLEU envi_std": 0.00 + "BLEU vien": 0.38 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.05 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.38 + "hLEPOR vien_std": 0.00 + URA-LLaMa 7B: + "BLEU envi": 0.02 + "BLEU envi_std": 0.00 + "BLEU vien": 0.35 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.03 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.34 + "hLEPOR vien_std": 0.01 + LLaMa-2 13B: + "BLEU envi": 0.03 + "BLEU envi_std": 0.00 + "BLEU vien": 0.36 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.04 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.32 + "hLEPOR vien_std": 
0.00 + LLaMa-2 7B: + "BLEU envi": 0.02 + "BLEU envi_std": 0.00 + "BLEU vien": 0.31 + "BLEU vien_std": 0.00 + "hLEPOR envi": 0.03 + "hLEPOR envi_std": 0.00 + "hLEPOR vien": 0.30 + "hLEPOR vien_std": 0.00 + Vietcuna 7B: + "BLEU envi": 0.09 + "BLEU envi_std": 0.01 + "BLEU vien": 0.38 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.09 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.33 + "hLEPOR vien_std": 0.00 + MixSUra: + "BLEU envi": 0.06 + "BLEU envi_std": null + "BLEU vien": 0.36 + "BLEU vien_std": null + "hLEPOR envi": 0.06 + "hLEPOR envi_std": null + "hLEPOR vien": 0.31 + "hLEPOR vien_std": null + GPT-3.5: + "BLEU envi": 0.15 + "BLEU envi_std": 0.01 + "BLEU vien": 0.49 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.21 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.48 + "hLEPOR vien_std": 0.00 + GPT-4: + "BLEU envi": 0.16 + "BLEU envi_std": 0.01 + "BLEU vien": 0.50 + "BLEU vien_std": 0.01 + "hLEPOR envi": 0.23 + "hLEPOR envi_std": 0.01 + "hLEPOR vien": 0.51 + "hLEPOR vien_std": 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/weaker_prompt/question_answering.yml b/_data/leaderboard/vi/weaker_prompt/question_answering.yml new file mode 100644 index 0000000..408ca87 --- /dev/null +++ b/_data/leaderboard/vi/weaker_prompt/question_answering.yml @@ -0,0 +1,82 @@ +XQuAD: + URA-LLaMa 70B: + EM: 0.21 + EM_std: 0.01 + F1: 0.47 + F1_std: 0.01 + URA-LLaMa 13B: + EM: 0.22 + EM_std: 0.01 + F1: 0.43 + F1_std: 0.01 + URA-LLaMa 7B: + EM: 0.13 + EM_std: 0.00 + F1: 0.32 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.04 + EM_std: 0.00 + F1: 0.28 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.06 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + MixSUra: + EM: 0.13 + EM_std: null + F1: 0.38 + F1_std: null + GPT-3.5: + EM: null + EM_std: null + F1: null + F1_std: null + GPT-4: + EM: null + EM_std: null + F1: null + F1_std: null +MLQA: + URA-LLaMa 70B: + EM: 0.14 + EM_std: 0.01 + F1: 0.41 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.17 + EM_std: 0.01 + F1: 0.40 + F1_std: 0.01 + URA-LLaMa 7B: + EM: 0.10 + EM_std: 0.00 + F1: 0.32 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.04 + EM_std: 0.00 + F1: 0.28 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.05 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + MixSUra: + EM: 0.09 + EM_std: null + F1: 0.36 + F1_std: null + GPT-3.5: + EM: null + EM_std: null + F1: null + F1_std: null + GPT-4: + EM: null + EM_std: null + F1: null + F1_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/weaker_prompt/summarization.yml b/_data/leaderboard/vi/weaker_prompt/summarization.yml new file mode 100644 index 0000000..b978edc --- /dev/null +++ b/_data/leaderboard/vi/weaker_prompt/summarization.yml @@ -0,0 +1,274 @@ +VietNews: + URA-LLaMa 70B: + R1: 0.49 + R1_std: 0.00 + R2: 0.23 + R2_std: 0.00 + RL: 0.31 + RL_std: 0.00 + SC: -0.08 + SC_std: 0.00 + BS: 0.05 + BS_std: 0.11 + Cv: 0.89 + Cv_std: 0.00 + De: 8.90 + De_std: 0.03 + Cp: 18.48 + Cp_std: 0.59 + URA-LLaMa 13B: + R1: 0.27 + R1_std: 0.00 + R2: 0.12 + R2_std: 0.00 + RL: 0.18 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: 0.05 + BS_std: 0.11 + Cv: 0.56 + Cv_std: 0.00 + De: 5.00 + De_std: 0.04 + Cp: 153.55 + Cp_std: 0.99 + URA-LLaMa 7B: + R1: 0.45 + R1_std: 0.00 + R2: 0.21 + R2_std: 0.00 + RL: 0.29 + RL_std: 0.00 + SC: -0.08 + SC_std: 0.00 + BS: 0.03 + BS_std: 0.09 + Cv: 0.91 + Cv_std: 0.00 + De: 9.43 + De_std: 0.03 + Cp: 6.42 + Cp_std: 0.05 + LLaMa-2 13B: + R1: 0.45 + R1_std: 0.00 + R2: 0.22 + R2_std: 0.00 + RL: 0.29 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: 0.00 + BS_std: 0.14 + Cv: 0.92 + Cv_std: 0.00 + De: 9.49 + De_std: 0.02 + Cp: 8.46 + Cp_std: 0.29 + 
LLaMa-2 7B: + R1: 0.36 + R1_std: 0.00 + R2: 0.17 + R2_std: 0.00 + RL: 0.23 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.15 + BS_std: 0.12 + Cv: 0.69 + Cv_std: 0.00 + De: 6.35 + De_std: 0.03 + Cp: 7.59 + Cp_std: 0.21 + MixSUra: + R1: 0.44 + R1_std: null + R2: 0.22 + R2_std: null + RL: 0.29 + RL_std: null + SC: null + SC_std: null + BS: 0.07 + BS_std: null + Cv: 0.97 + Cv_std: null + De: 35.67 + De_std: null + Cp: 9.43 + Cp_std: null + GPT-3.5: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null + GPT-4: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null +WikiLingua: + URA-LLaMa 70B: + R1: 0.47 + R1_std: 0.00 + R2: 0.20 + R2_std: 0.00 + RL: 0.29 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.19 + BS_std: 0.13 + Cv: 0.86 + Cv_std: 0.00 + De: 6.83 + De_std: 0.09 + Cp: 25.30 + Cp_std: 1.86 + URA-LLaMa 13B: + R1: 0.22 + R1_std: 0.00 + R2: 0.09 + R2_std: 0.00 + RL: 0.14 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.20 + BS_std: 0.007 + Cv: 0.48 + Cv_std: 0.00 + De: 3.49 + De_std: 0.04 + Cp: 190.09 + Cp_std: 4.92 + URA-LLaMa 7B: + R1: 0.42 + R1_std: 0.00 + R2: 0.18 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.07 + BS_std: 0.12 + Cv: 0.89 + Cv_std: 0.00 + De: 7.58 + De_std: 0.05 + Cp: 7.14 + Cp_std: 0.14 + LLaMa-2 13B: + R1: 0.47 + R1_std: 0.00 + R2: 0.22 + R2_std: 0.00 + RL: 0.29 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.34 + BS_std: 0.12 + Cv: 0.92 + Cv_std: 0.00 + De: 9.39 + De_std: 0.05 + Cp: 17.94 + Cp_std: 2.84 + LLaMa-2 7B: + R1: 0.45 + R1_std: 0.00 + R2: 0.20 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.36 + BS_std: 0.00 + Cv: 0.83 + Cv_std: 0.00 + De: 7.71 + De_std: 0.07 + Cp: 12.39 + Cp_std: 1.46 + MixSUra: + R1: 0.47 + R1_std: null + R2: 0.22 + R2_std: null + RL: 0.29 + RL_std: null + SC: null + SC_std: null + BS: 0.19 + BS_std: null + Cv: 0.97 + Cv_std: null + De: 28.97 + De_std: null + Cp: 10.27 + Cp_std: null + GPT-3.5: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null + GPT-4: + R1: null + R1_std: null + R2: null + R2_std: null + RL: null + RL_std: null + SC: null + SC_std: null + BS: null + BS_std: null + Cv: null + Cv_std: null + De: null + De_std: null + Cp: null + Cp_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/information_retrieval.yml b/_data/leaderboard/vi/zero_shot/information_retrieval.yml new file mode 100644 index 0000000..22df454 --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/information_retrieval.yml @@ -0,0 +1,146 @@ +mMARCO: + URA-LLaMa 70B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 7B: + M@10: 0.06 + M@10_std: 0.00 + M@10B: 0.14 + M@10B_std: 0.00 + N@10: 0.09 + N@10_std: 0.00 + N@10B: 0.21 + N@10B_std: 0.00 + LLaMa-2 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + LLaMa-2 
7B: + M@10: 0.06 + M@10_std: 0.00 + M@10B: 0.11 + M@10B_std: 0.00 + N@10: 0.08 + N@10_std: 0.00 + N@10B: 0.17 + N@10B_std: 0.00 + Vietcuna 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-3.5: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-4: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null +mRobust04: + URA-LLaMa 70B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + URA-LLaMa 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + LLaMa-2 13B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + LLaMa-2 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + Vietcuna 7B: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-3.5: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null + GPT-4: + M@10: null + M@10_std: null + M@10B: null + M@10B_std: null + N@10: null + N@10_std: null + N@10B: null + N@10B_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/knowledge.yml b/_data/leaderboard/vi/zero_shot/knowledge.yml new file mode 100644 index 0000000..88d0f13 --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/knowledge.yml @@ -0,0 +1,131 @@ +ZaloE2E: + num_fields: 2 + URA-LLaMa 70B: + EM: 0.28 + EM_std: 0.02 + F1: 0.44 + F1_std: 0.02 + URA-LLaMa 13B: + EM: 0.12 + EM_std: 0.01 + F1: 0.22 + F1_std: 0.01 + URA-LLaMa 7B: + EM: 0.09 + EM_std: 0.01 + F1: 0.20 + F1_std: 0.02 + LLaMa-2 13B: + EM: 0.06 + EM_std: 0.01 + F1: 0.10 + F1_std: 0.01 + LLaMa-2 7B: + EM: 0.03 + EM_std: 0.01 + F1: 0.07 + F1_std: 0.01 + Vietcuna 7B: + EM: 0.03 + EM_std: 0.01 + F1: 0.06 + F1_std: 0.01 + GPT-3.5: + EM: 0.37 + EM_std: 0.02 + F1: 0.56 + F1_std: 0.02 + GPT-4: + EM: 0.38 + EM_std: 0.02 + F1: 0.55 + F1_std: 0.02 +ViMMRC: + URA-LLaMa 70B: + AC: 0.80 + AC_std: 0.02 + F1: 0.80 + F1_std: 0.02 + AR: 0.85 + AR_std: 0.01 + ECE: 0.10 + ECE_std: 0.02 + A@10: 0.96 + A@10_std: 0.03 + URA-LLaMa 13B: + AC: 0.40 + AC_std: 0.02 + F1: 0.31 + F1_std: 0.02 + AR: 0.57 + AR_std: 0.02 + ECE: 0.48 + ECE_std: 0.02 + A@10: 0.42 + A@10_std: 0.08 + URA-LLaMa 7B: + AC: 0.30 + AC_std: 0.02 + F1: 0.10 + F1_std: 0.01 + AR: 0.56 + AR_std: 0.02 + ECE: 0.27 + ECE_std: 0.02 + A@10: 0.56 + A@10_std: 0.07 + LLaMa-2 13B: + AC: 0.52 + AC_std: 0.02 + F1: 0.41 + F1_std: 0.02 + AR: 0.64 + AR_std: 0.02 + ECE: 0.33 + ECE_std: 0.02 + A@10: 0.73 + A@10_std: 0.07 + LLaMa-2 7B: + AC: 0.37 + AC_std: 0.02 + F1: 0.25 + F1_std: 0.02 + AR: 0.51 + AR_std: 0.02 + ECE: 0.35 + ECE_std: 0.02 + A@10: 0.29 + A@10_std: 0.06 + Vietcuna 7B: + AC: 0.32 + AC_std: 0.02 + F1: 0.22 + F1_std: 0.02 + AR: 0.50 + AR_std: 0.00 + ECE: 0.07 + ECE_std: 0.02 + A@10: 0.33 + A@10_std: 0.07 + GPT-3.5: + AC: 0.90 + AC_std: 0.01 + F1: 0.72 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.65 + ECE_std: 0.01 + A@10: 0.90 + A@10_std: 0.04 + GPT-4: + AC: 0.92 
+ AC_std: 0.01 + F1: 0.73 + F1_std: 0.06 + AR: null + AR_std: null + ECE: 0.67 + ECE_std: 0.01 + A@10: 0.90 + A@10_std: 0.04 \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/language_modeling.yml b/_data/leaderboard/vi/zero_shot/language_modeling.yml new file mode 100644 index 0000000..8132450 --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/language_modeling.yml @@ -0,0 +1,236 @@ +MLQA-MLM: + URA-LLaMa 70B: + EM: 0.00 + EM_std: 0.00 + CER: 0.50 + CER_std: 0.01 + WER: 0.64 + WER_std: 0.01 + CED: 519.09 + CED_std: 10.96 + WED: 115.82 + WED_std: 2.45 + PLX: 1.08 + PLX_std: 0.01 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + CER: 0.67 + CER_std: 0.00 + WER: 0.78 + WER_std: 0.00 + CED: 697.85 + CED_std: 11.62 + WED: 161.34 + WED_std: 2.64 + PLX: 1.16 + PLX_std: 0.02 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + CER: 0.40 + CER_std: 0.01 + WER: 0.55 + WER_std: 0.01 + CED: 498.36 + CED_std: 11.01 + WED: 118.11 + WED_std: 2.58 + PLX: 1.24 + PLX_std: 0.01 + LLaMa-2 13B: + EM: 0.01 + EM_std: 0.00 + CER: 0.74 + CER_std: 0.00 + WER: 0.87 + WER_std: 0.00 + CED: 760.98 + CED_std: 11.91 + WED: 186.90 + WED_std: 2.85 + PLX: 1.24 + PLX_std: 0.03 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + CER: 0.81 + CER_std: 0.00 + WER: 0.98 + WER_std: 0.00 + CED: 769.36 + CED_std: 10.51 + WED: 198.53 + WED_std: 2.57 + PLX: 1.74 + PLX_std: 0.19 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + CER: 1.04 + CER_std: 0.00 + WER: 1.06 + WER_std: 0.00 + CED: 935.65 + CED_std: 12.47 + WED: 204.98 + WED_std: 2.79 + PLX: 1.40 + PLX_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + CER: 0.55 + CER_std: null + WER: 0.63 + WER_std: null + CED: 526.79 + CED_std: null + WED: 131.02 + WED_std: null + PLX: 1.00 + PLX_std: null + GPT-3.5: + EM: 0.04 + EM_std: 0.00 + CER: 0.28 + CER_std: 0.01 + WER: 0.44 + WER_std: 0.01 + CED: 387.37 + CED_std: 10.86 + WED: 92.78 + WED_std: 2.46 + PLX: null + PLX_std: null + GPT-4: + EM: 0.08 + EM_std: 0.00 + CER: 0.23 + CER_std: 0.01 + WER: 0.40 + WER_std: 0.01 + CED: 336.53 + CED_std: 10.23 + WED: 83.55 + WED_std: 2.34 + PLX: null + PLX_std: null +VSEC: + URA-LLaMa 70B: + EM: 0.00 + EM_std: 0.00 + CER: 0.88 + CER_std: 0.00 + WER: 1.01 + WER_std: 0.00 + CED: 113.51 + CED_std: 0.57 + WED: 29.91 + WED_std: 0.15 + PLX: 1.09 + PLX_std: 0.00 + URA-LLaMa 13B: + EM: 0.01 + EM_std: 0.00 + CER: 0.42 + CER_std: 0.01 + WER: 0.56 + WER_std: 0.01 + CED: 54.88 + CED_std: 0.77 + WED: 14.50 + WED_std: 0.19 + PLX: 1.26 + PLX_std: 0.00 + URA-LLaMa 7B: + EM: 0.01 + EM_std: 0.00 + CER: 3.33 + CER_std: 0.04 + WER: 3.14 + WER_std: 0.03 + CED: 420.34 + CED_std: 5.66 + WED: 85.79 + WED_std: 0.96 + PLX: 1.33 + PLX_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + CER: 1.32 + CER_std: 0.01 + WER: 1.40 + WER_std: 0.01 + CED: 160.06 + CED_std: 1.16 + WED: 38.12 + WED_std: 0.23 + PLX: 1.11 + PLX_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + CER: 1.54 + CER_std: 0.04 + WER: 1.55 + WER_std: 0.03 + CED: 171.28 + CED_std: 5.66 + WED: 40.18 + WED_std: 0.96 + PLX: 1.14 + PLX_std: 0.00 + Vietcuna 7B: + EM: 0.01 + EM_std: 0.00 + CER: 1.11 + CER_std: 0.01 + WER: 1.20 + WER_std: 0.01 + CED: 139.90 + CED_std: 1.39 + WED: 33.94 + WED_std: 0.33 + PLX: 1.61 + PLX_std: 0.00 + MixSUra: + EM: 0.00 + EM_std: null + CER: 0.82 + CER_std: null + WER: 0.97 + WER_std: null + CED: 115.21 + CED_std: null + WED: 30.76 + WED_std: null + PLX: 1.09 + PLX_std: null + GPT-3.5: + EM: 0.02 + EM_std: 0.00 + CER: 0.16 + CER_std: 0.00 + WER: 0.30 + WER_std: 0.00 + CED: 12.63 + CED_std: 0.34 + WED: 3.48 + WED_std: 0.09 + PLX: null + PLX_std: null + 
GPT-4: + EM: 0.60 + EM_std: 0.01 + CER: 0.14 + CER_std: 0.00 + WER: 0.26 + WER_std: 0.00 + CED: 13.58 + CED_std: 0.45 + WED: 3.67 + WED_std: 0.12 + PLX: null + PLX_std: null \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/question_answering.yml b/_data/leaderboard/vi/zero_shot/question_answering.yml new file mode 100644 index 0000000..2fdb6f6 --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/question_answering.yml @@ -0,0 +1,82 @@ +XQuAD: + URA-LLaMa 70B: + EM: 0.06 + EM_std: 0.00 + F1: 0.30 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.14 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.14 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.04 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.05 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.27 + F1_std: 0.00 +MLQA: + URA-LLaMa 70B: + EM: 0.04 + EM_std: 0.00 + F1: 0.28 + F1_std: 0.00 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.15 + F1_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.16 + F1_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.02 + F1: 0.05 + F1_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.06 + F1_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.25 + F1_std: 0.00 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.27 + F1_std: 0.00 \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/reasoning.yml b/_data/leaderboard/vi/zero_shot/reasoning.yml new file mode 100644 index 0000000..2a66b7b --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/reasoning.yml @@ -0,0 +1,171 @@ +"SR - Natural": + URA-LLaMa 70B: + EM: 0.06 + EM_std: 0.00 + F1: 0.34 + F1_std: 0.00 + Equ: 0.06 + Equ_std: 0.00 + URA-LLaMa 13B: + EM: 0.01 + EM_std: 0.00 + F1: 0.31 + F1_std: 0.00 + Equ: 0.02 + Equ_std: 0.00 + URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.26 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.06 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.04 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.04 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + GPT-3.5: + EM: 0.21 + EM_std: 0.00 + F1: 0.59 + F1_std: 0.00 + Equ: 0.32 + Equ_std: 0.00 + GPT-4: + EM: 0.21 + EM_std: 0.00 + F1: 0.59 + F1_std: 0.00 + Equ: 0.32 + Equ_std: 0.00 +"SR - Abstract symbol": + URA-LLaMa 70B: + EM: 0.02 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + Equ: 0.01 + Equ_std: 0.00 + URA-LLaMa 13B: + EM: 0.02 + EM_std: 0.00 + F1: 0.24 + F1_std: 0.00 + Equ: 0.01 + Equ_std: 0.00 + URA-LLaMa 7B: + EM: 0.01 + EM_std: 0.00 + F1: 0.17 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + LLaMa-2 13B: + EM: 0.02 + EM_std: 0.00 + F1: 0.19 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.05 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.10 + F1_std: 0.00 + Equ: 0.00 + Equ_std: 0.00 + GPT-3.5: + EM: 0.09 + EM_std: 0.00 + F1: 0.28 + F1_std: 0.00 + Equ: 0.13 + Equ_std: 0.00 + GPT-4: + EM: 0.09 + EM_std: 0.00 + F1: 0.28 + F1_std: 0.00 + Equ: 0.13 + Equ_std: 0.00 +MATH: + URA-LLaMa 70B: + EM: 0.00 + EM_std: 0.00 + F1: 0.01 + F1_std: 0.00 + Equ: 0.24 + Equ_std: 0.02 + URA-LLaMa 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.14 + Equ_std: 0.02 + 
URA-LLaMa 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.05 + Equ_std: 0.01 + LLaMa-2 13B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.16 + Equ_std: 0.02 + LLaMa-2 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.06 + Equ_std: 0.01 + Vietcuna 7B: + EM: 0.00 + EM_std: 0.00 + F1: 0.00 + F1_std: 0.00 + Equ: 0.01 + Equ_std: 0.00 + GPT-3.5: + EM: 0.00 + EM_std: 0.00 + F1: 0.01 + F1_std: 0.00 + Equ: 0.72 + Equ_std: 0.02 + GPT-4: + EM: 0.00 + EM_std: 0.00 + F1: 0.01 + F1_std: 0.00 + Equ: 0.76 + Equ_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/sentiment_analysis.yml b/_data/leaderboard/vi/zero_shot/sentiment_analysis.yml new file mode 100644 index 0000000..03403da --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/sentiment_analysis.yml @@ -0,0 +1,200 @@ +VLSP 2016: + URA-LLaMa 70B: + AC: 0.63 + AC_std: 0.02 + F1: 0.63 + F1_std: 0.02 + AR: 0.74 + AR_std: 0.01 + ECE: 0.15 + ECE_std: 0.01 + A@10: 0.87 + A@10_std: 0.03 + URA-LLaMa 13B: + AC: 0.52 + AC_std: 0.02 + F1: 0.35 + F1_std: 0.01 + AR: 0.60 + AR_std: 0.01 + ECE: 0.10 + ECE_std: 0.01 + A@10: 0.64 + A@10_std: 0.05 + URA-LLaMa 7B: + AC: 0.35 + AC_std: 0.02 + F1: 0.24 + F1_std: 0.01 + AR: 0.54 + AR_std: 0.01 + ECE: 0.24 + ECE_std: 0.01 + A@10: 0.31 + A@10_std: 0.05 + LLaMa-2 13B: + AC: 0.25 + AC_std: 0.01 + F1: 0.25 + F1_std: 0.01 + AR: 0.49 + AR_std: 0.01 + ECE: 0.39 + ECE_std: 0.01 + A@10: 0.29 + A@10_std: 0.05 + LLaMa-2 7B: + AC: 0.15 + AC_std: 0.01 + F1: 0.15 + F1_std: 0.01 + AR: 0.58 + AR_std: 0.01 + ECE: 0.73 + ECE_std: 0.01 + A@10: 0.12 + A@10_std: 0.03 + Vietcuna 7B: + AC: 0.11 + AC_std: 0.01 + F1: 0.12 + F1_std: 0.01 + AR: 0.49 + AR_std: 0.01 + ECE: 0.68 + ECE_std: 0.01 + A@10: 0.11 + A@10_std: 0.03 + MixSUra: + AC: 0.45 + AC_std: null + F1: 0.30 + F1_std: null + AR: 0.62 + AR_std: null + ECE: 0.50 + ECE_std: null + A@10: 0.49 + A@10_std: null + GPT-3.5: + AC: 0.62 + AC_std: 0.02 + F1: 0.56 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.29 + ECE_std: 0.02 + A@10: 0.62 + A@10_std: 0.05 + GPT-4: + AC: 0.71 + AC_std: 0.01 + F1: 0.68 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.37 + ECE_std: 0.01 + A@10: 0.70 + A@10_std: 0.04 +UiT-VSFC: + URA-LLaMa 70B: + AC: 0.64 + AC_std: 0.01 + F1: 0.54 + F1_std: 0.01 + AR: 0.85 + AR_std: 0.01 + ECE: 0.14 + ECE_std: 0.00 + A@10: 0.98 + A@10_std: 0.01 + URA-LLaMa 13B: + AC: 0.70 + AC_std: 0.01 + F1: 0.40 + F1_std: 0.01 + AR: 0.72 + AR_std: 0.01 + ECE: 0.23 + ECE_std: 0.01 + A@10: 0.95 + A@10_std: 0.01 + URA-LLaMa 7B: + AC: 0.27 + AC_std: 0.01 + F1: 0.18 + F1_std: 0.00 + AR: 0.52 + AR_std: 0.01 + ECE: 0.37 + ECE_std: 0.01 + A@10: 0.03 + A@10_std: 0.01 + LLaMa-2 13B: + AC: 0.29 + AC_std: 0.01 + F1: 0.24 + F1_std: 0.01 + AR: 0.52 + AR_std: 0.01 + ECE: 0.42 + ECE_std: 0.01 + A@10: 0.30 + A@10_std: 0.03 + LLaMa-2 7B: + AC: 0.04 + AC_std: 0.00 + F1: 0.06 + F1_std: 0.01 + AR: 0.49 + AR_std: 0.01 + ECE: 0.79 + ECE_std: 0.00 + A@10: 0.01 + A@10_std: 0.01 + Vietcuna 7B: + AC: 0.05 + AC_std: 0.00 + F1: 0.06 + F1_std: 0.00 + AR: 0.56 + AR_std: 0.01 + ECE: 0.73 + ECE_std: 0.00 + A@10: 0.05 + A@10_std: 0.01 + MixSUra: + AC: 0.55 + AC_std: null + F1: 0.40 + F1_std: null + AR: 0.66 + AR_std: null + ECE: 0.41 + ECE_std: null + A@10: 0.60 + A@10_std: null + GPT-3.5: + AC: 0.86 + AC_std: 0.01 + F1: 0.71 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.86 + A@10_std: 0.02 + GPT-4: + AC: 0.85 + AC_std: 0.01 + F1: 0.71 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.87 + 
A@10_std: 0.02 \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/summarization.yml b/_data/leaderboard/vi/zero_shot/summarization.yml new file mode 100644 index 0000000..342bccd --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/summarization.yml @@ -0,0 +1,291 @@ +VietNews: + URA-LLaMa 70B: + R1: 0.35 + R1_std: 0.00 + R2: 0.16 + R2_std: 0.00 + RL: 0.24 + RL_std: 0.00 + SC: -0.11 + SC_std: 0.00 + BS: 0.12 + BS_std: 0.00 + Cv: 0.63 + Cv_std: 0.00 + De: 5.43 + De_std: 0.02 + Cp: 37.78 + Cp_std: 0.47 + URA-LLaMa 13B: + R1: 0.26 + R1_std: 0.00 + R2: 0.12 + R2_std: 0.00 + RL: 0.17 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.08 + BS_std: 0.18 + Cv: 0.46 + Cv_std: 0.00 + De: 3.55 + De_std: 0.04 + Cp: 47.75 + Cp_std: 0.65 + URA-LLaMa 7B: + R1: 0.41 + R1_std: 0.00 + R2: 0.18 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.08 + BS_std: 0.13 + Cv: 0.83 + Cv_std: 0.00 + De: 8.13 + De_std: 0.04 + Cp: 8.08 + Cp_std: 0.17 + LLaMa-2 13B: + R1: 0.02 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.02 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.19 + BS_std: 0.05 + Cv: 0.01 + Cv_std: 0.00 + De: 0.01 + De_std: 0.00 + Cp: 54.67 + Cp_std: 0.16 + LLaMa-2 7B: + R1: 0.03 + R1_std: 0.00 + R2: 0.01 + R2_std: 0.00 + RL: 0.03 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: -0.17 + BS_std: 0.03 + Cv: 0.04 + Cv_std: 0.00 + De: 0.07 + De_std: 0.00 + Cp: 23.86 + Cp_std: 0.26 + MixSUra: + R1: 0.06 + R1_std: null + R2: 0.01 + R2_std: null + RL: 0.04 + RL_std: null + SC: null + SC_std: null + BS: -0.13 + BS_std: null + Cv: 0.10 + Cv_std: null + De: 0.17 + De_std: null + Cp: 9.03 + Cp_std: null + GPT-3.5: + R1: 0.36 + R1_std: 0.00 + R2: 0.20 + R2_std: 0.00 + RL: 0.24 + RL_std: 0.00 + SC: -0.09 + SC_std: 0.00 + BS: 0.04 + BS_std: 0.13 + Cv: 0.86 + Cv_std: 0.00 + De: 3.97 + De_std: 0.02 + Cp: 13.32 + Cp_std: 0.65 + GPT-4: + R1: 0.41 + R1_std: 0.00 + R2: 0.21 + R2_std: 0.00 + RL: 0.26 + RL_std: 0.00 + SC: -0.08 + SC_std: 0.00 + BS: -0.04 + BS_std: 0.11 + Cv: 0.84 + Cv_std: 0.00 + De: 3.45 + De_std: 0.00 + Cp: 15.43 + Cp_std: 0.49 +WikiLingua: + URA-LLaMa 70B: + R1: 0.37 + R1_std: 0.00 + R2: 0.16 + R2_std: 0.00 + RL: 0.24 + RL_std: 0.00 + SC: -0.22 + SC_std: 0.00 + BS: 0.26 + BS_std: 0.16 + Cv: 0.17 + Cv_std: 0.00 + De: 0.22 + De_std: 0.00 + Cp: 22.24 + Cp_std: 0.97 + URA-LLaMa 13B: + R1: 0.14 + R1_std: 0.00 + R2: 0.05 + R2_std: 0.00 + RL: 0.09 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: -0.14 + BS_std: 0.12 + Cv: 0.26 + Cv_std: 0.01 + De: 1.83 + De_std: 0.06 + Cp: 60.10 + Cp_std: 2.16 + URA-LLaMa 7B: + R1: 0.42 + R1_std: 0.00 + R2: 0.17 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.27 + BS_std: 0.21 + Cv: 0.84 + Cv_std: 0.00 + De: 7.15 + De_std: 0.08 + Cp: 8.08 + Cp_std: 0.36 + LLaMa-2 13B: + R1: 0.04 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.03 + RL_std: 0.00 + SC: -0.17 + SC_std: 0.00 + BS: -0.05 + BS_std: 0.03 + Cv: 0.02 + Cv_std: 0.00 + De: 0.02 + De_std: 0.00 + Cp: 42.55 + Cp_std: 0.81 + LLaMa-2 7B: + R1: 0.04 + R1_std: 0.00 + R2: 0.00 + R2_std: 0.00 + RL: 0.03 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: -0.14 + BS_std: 0.07 + Cv: 0.03 + Cv_std: 0.00 + De: 0.06 + De_std: 0.00 + Cp: 17.84 + Cp_std: 0.50 + Vietcuna 7B: + R1: 0.24 + R1_std: 0.00 + R2: 0.06 + R2_std: 0.00 + RL: 0.15 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: -0.18 + BS_std: 0.07 + Cv: 0.51 + Cv_std: 0.01 + De: 1.16 + De_std: 0.01 + Cp: 238.67 + Cp_std: 3.37 + MixSUra: + R1: 0.03 + R1_std: null + R2: 0.00 + R2_std: null + RL: 0.03 + RL_std: null + 
SC: null + SC_std: null + BS: -0.01 + BS_std: null + Cv: 0.17 + Cv_std: null + De: 0.26 + De_std: null + Cp: 16.68 + Cp_std: null + GPT-3.5: + R1: 0.43 + R1_std: 0.00 + R2: 0.21 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.22 + BS_std: 0.03 + Cv: 0.87 + Cv_std: 0.00 + De: 3.29 + De_std: 0.03 + Cp: 35.50 + Cp_std: 0.82 + GPT-4: + R1: 0.44 + R1_std: 0.00 + R2: 0.21 + R2_std: 0.00 + RL: 0.27 + RL_std: 0.00 + SC: -0.16 + SC_std: 0.00 + BS: 0.24 + BS_std: 0.04 + Cv: 0.82 + Cv_std: 0.00 + De: 2.37 + De_std: 0.01 + Cp: 6.61 + Cp_std: 0.16 \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/text_classification.yml b/_data/leaderboard/vi/zero_shot/text_classification.yml new file mode 100644 index 0000000..7b5a34d --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/text_classification.yml @@ -0,0 +1,200 @@ +UiT-VSMEC: + URA-LLaMa 70B: + AC: 0.40 + AC_std: 0.02 + F1: 0.32 + F1_std: 0.02 + AR: 0.68 + AR_std: 0.01 + ECE: 0.14 + ECE_std: 0.02 + A@10: 0.60 + A@10_std: 0.06 + URA-LLaMa 13B: + AC: 0.29 + AC_std: 0.02 + F1: 0.25 + F1_std: 0.02 + AR: 0.52 + AR_std: 0.01 + ECE: 0.09 + ECE_std: 0.01 + A@10: 0.23 + A@10_std: 0.05 + URA-LLaMa 7B: + AC: 0.13 + AC_std: 0.01 + F1: 0.11 + F1_std: 0.01 + AR: 0.50 + AR_std: 0.01 + ECE: 0.15 + ECE_std: 0.01 + A@10: 0.21 + A@10_std: 0.05 + LLaMa-2 13B: + AC: 0.11 + AC_std: 0.01 + F1: 0.10 + F1_std: 0.01 + AR: 0.49 + AR_std: 0.01 + ECE: 0.31 + ECE_std: 0.01 + A@10: 0.09 + A@10_std: 0.04 + LLaMa-2 7B: + AC: 0.07 + AC_std: 0.01 + F1: 0.08 + F1_std: 0.01 + AR: 0.52 + AR_std: 0.01 + ECE: 0.35 + ECE_std: 0.01 + A@10: 0.07 + A@10_std: 0.03 + Vietcuna 7B: + AC: 0.05 + AC_std: 0.01 + F1: 0.02 + F1_std: 0.01 + AR: 0.52 + AR_std: 0.01 + ECE: 0.95 + ECE_std: 0.01 + A@10: 0.03 + A@10_std: 0.02 + MixSUra: + AC: 0.40 + AC_std: null + F1: 0.36 + F1_std: null + AR: 0.72 + AR_std: null + ECE: 0.53 + ECE_std: null + A@10: 0.79 + A@10_std: null + GPT-3.5: + AC: 0.43 + AC_std: 0.02 + F1: 0.37 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.29 + ECE_std: 0.02 + A@10: 0.43 + A@10_std: 0.06 + GPT-4: + AC: 0.49 + AC_std: 0.02 + F1: 0.46 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.35 + ECE_std: 0.02 + A@10: 0.50 + A@10_std: 0.06 +PhoATIS: + URA-LLaMa 70B: + AC: 0.56 + AC_std: 0.02 + F1: 0.48 + F1_std: 0.03 + AR: 0.85 + AR_std: 0.00 + ECE: 0.25 + ECE_std: 0.02 + A@10: 0.56 + A@10_std: 0.06 + URA-LLaMa 13B: + AC: 0.10 + AC_std: 0.01 + F1: 0.10 + F1_std: 0.01 + AR: 0.72 + AR_std: 0.00 + ECE: 0.52 + ECE_std: 0.01 + A@10: 0.14 + A@10_std: 0.04 + URA-LLaMa 7B: + AC: 0.04 + AC_std: 0.01 + F1: 0.04 + F1_std: 0.02 + AR: 0.77 + AR_std: 0.00 + ECE: 0.30 + ECE_std: 0.01 + A@10: 0.04 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.03 + AC_std: 0.01 + F1: 0.02 + F1_std: 0.00 + AR: 0.45 + AR_std: 0.01 + ECE: 0.28 + ECE_std: 0.01 + A@10: 0.03 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.00 + AC_std: 0.06 + F1: 0.00 + F1_std: 0.06 + AR: 0.61 + AR_std: 0.01 + ECE: 0.32 + ECE_std: 0.00 + A@10: 0.00 + A@10_std: 0.00 + Vietcuna 7B: + AC: 0.05 + AC_std: 0.01 + F1: 0.01 + F1_std: 0.00 + AR: 0.66 + AR_std: 0.00 + ECE: 0.20 + ECE_std: 0.01 + A@10: 0.01 + A@10_std: 0.21 + MixSUra: + AC: 0.81 + AC_std: null + F1: 0.58 + F1_std: null + AR: 0.96 + AR_std: null + ECE: 0.14 + ECE_std: null + A@10: 0.91 + A@10_std: null + GPT-3.5: + AC: 0.44 + AC_std: 0.02 + F1: 0.38 + F1_std: 0.03 + AR: null + AR_std: null + ECE: 0.38 + ECE_std: 0.02 + A@10: 0.44 + A@10_std: 0.05 + GPT-4: + AC: 0.89 + AC_std: 0.01 + F1: 0.69 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.83 + ECE_std: 0.01 + 
A@10: 0.89 + A@10_std: 0.03 \ No newline at end of file diff --git a/_data/leaderboard/vi/zero_shot/toxicity_detection.yml b/_data/leaderboard/vi/zero_shot/toxicity_detection.yml new file mode 100644 index 0000000..5f4798d --- /dev/null +++ b/_data/leaderboard/vi/zero_shot/toxicity_detection.yml @@ -0,0 +1,178 @@ +UiT-ViCTSD: + URA-LLaMa 70B: + AC: 0.61 + AC_std: 0.01 + F1: 0.52 + F1_std: 0.01 + AR: 0.77 + AR_std: 0.01 + ECE: 0.17 + ECE_std: 0.01 + A@10: 0.97 + A@10_std: 0.01 + URA-LLaMa 13B: + AC: 0.46 + AC_std: 0.01 + F1: 0.28 + F1_std: 0.03 + AR: 0.53 + AR_std: 0.02 + ECE: 0.22 + ECE_std: 0.01 + A@10: 0.48 + A@10_std: 0.03 + URA-LLaMa 7B: + AC: 0.25 + AC_std: 0.01 + F1: 0.19 + F1_std: 0.01 + AR: 0.53 + AR_std: 0.01 + ECE: 0.38 + ECE_std: 0.01 + A@10: 0.13 + A@10_std: 0.02 + LLaMa-2 13B: + AC: 0.16 + AC_std: 0.01 + F1: 0.14 + F1_std: 0.00 + AR: 0.40 + AR_std: 0.01 + ECE: 0.50 + ECE_std: 0.01 + A@10: 0.24 + A@10_std: 0.02 + LLaMa-2 7B: + AC: 0.13 + AC_std: 0.01 + F1: 0.14 + F1_std: 0.01 + AR: 0.45 + AR_std: 0.02 + ECE: 0.69 + ECE_std: 0.01 + A@10: 0.09 + A@10_std: 0.01 + Vietcuna 7B: + AC: 0.09 + AC_std: 0.00 + F1: 0.07 + F1_std: 0.00 + AR: 0.50 + AR_std: 0.00 + ECE: 0.41 + ECE_std: 0.00 + A@10: 0.10 + A@10_std: 0.03 + GPT-3.5: + AC: 0.75 + AC_std: 0.01 + F1: 0.61 + F1_std: 0.02 + AR: null + AR_std: null + ECE: 0.25 + ECE_std: 0.01 + A@10: 0.80 + A@10_std: 0.04 + GPT-4: + AC: 0.89 + AC_std: 0.01 + F1: 0.69 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.39 + ECE_std: 0.01 + A@10: 0.89 + A@10_std: 0.03 +UiT-ViHSD: + URA-LLaMa 70B: + AC: 0.38 + AC_std: 0.01 + F1: 0.34 + F1_std: 0.01 + AR: 0.74 + AR_std: 0.01 + ECE: 0.25 + ECE_std: 0.01 + A@10: 0.91 + A@10_std: 0.01 + URA-LLaMa 13B: + AC: 0.33 + AC_std: 0.01 + F1: 0.18 + F1_std: 0.00 + AR: 0.60 + AR_std: 0.01 + ECE: 0.35 + ECE_std: 0.01 + A@10: 0.54 + A@10_std: 0.02 + URA-LLaMa 7B: + AC: 0.19 + AC_std: 0.00 + F1: 0.13 + F1_std: 0.00 + AR: 0.55 + AR_std: 0.01 + ECE: 0.46 + ECE_std: 0.01 + A@10: 0.13 + A@10_std: 0.01 + LLaMa-2 13B: + AC: 0.09 + AC_std: 0.00 + F1: 0.13 + F1_std: 0.00 + AR: 0.38 + AR_std: 0.01 + ECE: 0.63 + ECE_std: 0.00 + A@10: 0.10 + A@10_std: 0.01 + LLaMa-2 7B: + AC: 0.03 + AC_std: 0.00 + F1: 0.05 + F1_std: 0.01 + AR: 0.56 + AR_std: 0.01 + ECE: 0.75 + ECE_std: 0.00 + A@10: 0.00 + A@10_std: 0.00 + Vietcuna 7B: + AC: 0.07 + AC_std: 0.00 + F1: 0.04 + F1_std: 0.00 + AR: 0.50 + AR_std: 0.00 + ECE: 0.26 + ECE_std: 0.00 + A@10: 0.07 + A@10_std: 0.01 + GPT-3.5: + AC: 0.55 + AC_std: 0.01 + F1: 0.42 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.22 + ECE_std: 0.01 + A@10: 0.55 + A@10_std: 0.02 + GPT-4: + AC: 0.75 + AC_std: 0.01 + F1: 0.53 + F1_std: 0.01 + AR: null + AR_std: null + ECE: 0.42 + ECE_std: 0.01 + A@10: 0.75 + A@10_std: 0.02 \ No newline at end of file diff --git a/_pages/ind/bias-toxicity/question-answering.md b/_pages/ind/bias-toxicity/question-answering.md index 73d5eca..b7aefa2 100644 --- a/_pages/ind/bias-toxicity/question-answering.md +++ b/_pages/ind/bias-toxicity/question-answering.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/ind/bias-toxicity/question-answering --- # Bias-Toxicity Question Answering Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | XQuAD | MLQA
DRR→ | DRG→ | SAR→ | SAG→ | Tox↓ | DRR→ | DRG→ | SAR→ | SAG→ | Tox↓
URA-LLaMa 70B | -0.39 ± 0.01 | -0.41 ± 0.00 | 0.02 ± 0.00 | -0.14 ± 0.02 | -0.42 ± 0.03 | 0.02 ± 0.00
URA-LLaMa 13B | -0.39 ± 0.01 | -0.45 ± 0.01 | 0.02 ± 0.00 | -0.17 ± 0.1 | -0.38 ± 0.00 | 0.02 ± 0.00
URA-LLaMa 7B | -0.43 ± 0.01 | -0.48 ± 0.00 | 0.03 ± 0.00 | -0.18 ± 0.01 | -0.37 ± 0.01 | 0.02 ± 0.00
LLaMa-2 13B | -0.35 ± 0.03 | -0.46 ± 0.00 | 0.01 ± 0.00 | -0.27 ± 0.01 | -0.43 ± 0.00 | 0.01 ± 0.00
LLaMa-2 7B | -0.46 ± 0.01 | -0.42 ± 0.00 | 0.01 ± 0.00 | -0.21 ± 0.06 | -0.45 ± 0.00 | 0.01 ± 0.00
Vietcuna 7B | -0.50 ± 0.00 | - | -0.04 ± 0.00 | -0.23 ± 0.09 | -0.49 ± 0.01 | 0.04 ± 0.00
GPT-3.5 | -0.43 ± 0.01 | -0.48 ± 0.00 | 0.02 ± 0.00 | -0.18 ± 0.01 | -0.40 ± 0.00 | 0.02 ± 0.00
GPT-4 | -0.40 ± 0.01 | -0.45 ± 0.00 | 0.02 ± 0.00 | -0.16 ± 0.01 | -0.41 ± 0.01 | 0.02 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/bias-toxicity/summarization.md b/_pages/ind/bias-toxicity/summarization.md index 3776fd6..4741c40 100644 --- a/_pages/ind/bias-toxicity/summarization.md +++ b/_pages/ind/bias-toxicity/summarization.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/ind/bias-toxicity/summarization --- # Bias-Toxicity Summarization Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | VietNews | WikiLingua
DRR→ | DRG→ | SAR→ | SAG→ | Tox↓ | DRR→ | DRG→ | SAR→ | SAG→ | Tox↓
URA-LLaMa 70B | -0.21 ± 0.01 | -0.31 ± 0.01 | 0.05 ± 0.00 | -0.03 ± 0.02 | -0.25 ± 0.02 | 0.03 ± 0.00
URA-LLaMa 13B | -0.20 ± 0.01 | -0.29 ± 0.01 | 0.04 ± 0.00 | -0.07 ± 0.04 | -0.31 ± 0.03 | 0.02 ± 0.00
URA-LLaMa 7B | -0.24 ± 0.02 | -0.33 ± 0.01 | 0.04 ± 0.00 | -0.07 ± 0.02 | -0.38 ± 0.02 | 0.03 ± 0.00
LLaMa-2 13B | -0.26 ± 0.01 | -0.38 ± 0.01 | 0.01 ± 0.00 | -0.17 ± 0.08 | -0.50 ± 0.02 | 0.01 ± 0.00
LLaMa-2 7B | -0.28 ± 0.02 | -0.39 ± 0.01 | 0.01 ± 0.00 | -0.39 ± 0.05 | -0.50 ± 0.02 | 0.01 ± 0.00
Vietcuna 7B | -0.21 ± 0.02 | -0.32 ± 0.02 | 0.04 ± 0.00 | -0.17 ± 0.04 | -0.39 ± 0.03 | 0.03 ± 0.00
GPT-3.5 | -0.22 ± 0.01 | -0.29 ± 0.01 | 0.04 ± 0.00 | -0.03 ± 0.02 | -0.28 ± 0.01 | 0.02 ± 0.00
GPT-4 | -0.19 ± 0.01 | -0.28 ± 0.01 | 0.06 ± 0.00 | -0.09 ± 0.02 | -0.28 ± 0.01 | 0.02 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/bias-toxicity/translation.md b/_pages/ind/bias-toxicity/translation.md index 8196c33..a4ffacc 100644 --- a/_pages/ind/bias-toxicity/translation.md +++ b/_pages/ind/bias-toxicity/translation.md @@ -3,264 +3,94 @@ layout: default permalink: /leaderboard/ind/bias-toxicity/translation --- # Bias-Toxicity Translation Leaderboard +{% assign lang = 'ind' %} - - + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + + + + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsPhoMT (En - Vi)OPUS100 (En - Vi) + {{ dataset[0] }} +
DRR$→|DRG$→|SAR$→|SAG$→|ToxDRR$→|DRG$→|SAR$→|SAG$→|ToxDRR↓DRG↓SAR↓SAG↓Tox↓
URA-LLaMa 70B | -0.03 ± 0.01 | -0.30 ± 0.01 | 0.05 ± 0.00 | -0.27 ± 0.01 | -0.47 ± 0.01 | 0.06 ± 0.00
URA-LLaMa 13B | -0.09 ± 0.00 | -0.33 ± 0.01 | 0.05 ± 0.00 | -0.27 ± 0.01 | -0.43 ± 0.02 | 0.07 ± 0.00
URA-LLaMa 7B | -0.13 ± 0.00 | -0.33 ± 0.01 | 0.05 ± 0.00 | -0.18 ± 0.03 | -0.47 ± 0.01 | 0.07 ± 0.00
LLaMa-2 13B | -0.08 ± 0.00 | -0.33 ± 0.02 | 0.05 ± 0.00 | -0.31 ± 0.02 | -0.47 ± 0.01 | 0.06 ± 0.00
LLaMa-2 7B | -0.17 ± 0.01 | -0.29 ± 0.01 | 0.04 ± 0.00 | -0.21 ± 0.02 | -0.45 ± 0.02 | 0.05 ± 0.00
Vietcuna 7B | -0.18 ± 0.01 | -0.36 ± 0.01 | 0.04 ± 0.00 | -0.16 ± 0.03 | -0.43 ± 0.02 | 0.07 ± 0.00
GPT-3.5 | -0.11 ± 0.01 | -0.34 ± 0.01 | 0.05 ± 0.00 | -0.16 ± 0.03 | -0.43 ± 0.03 | 0.07 ± 0.00
GPT-4 | -0.09 ± 0.01 | -0.34 ± 0.01 | 0.05 ± 0.00 | -0.14 ± 0.03 | -0.41 ± 0.01 | 0.07 ± 0.00
---- -layout: default -permalink: /leaderboard/ind/bias-toxicity/translation ---- -# Bias-Toxicity Translation Leaderboard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + + {% endfor %} -
Models | PhoMT (En $\to$ Vi) | OPUS100 (En $\to$ Vi)
DRR$\to\mid$DRG$\to\mid$SAR$\to\mid$SAG$\to\mid$Tox | DRR$\to\mid$DRG$\to\mid$SAR$\to\mid$SAG$\to\mid$Tox
URA-LLaMa 70B | -0.03 ± 0.01 | -0.30 ± 0.01 | 0.05 ± 0.00 | -0.27 ± 0.01 | -0.47 ± 0.01 | 0.06 ± 0.00
URA-LLaMa 13B | -0.09 ± 0.00 | -0.33 ± 0.01 | 0.05 ± 0.00 | -0.27 ± 0.01 | -0.43 ± 0.02 | 0.07 ± 0.00
URA-LLaMa 7B | -0.13 ± 0.00 | -0.33 ± 0.01 | 0.05 ± 0.00 | -0.18 ± 0.03 | -0.47 ± 0.01 | 0.07 ± 0.00
LLaMa-2 13B | -0.08 ± 0.00 | -0.33 ± 0.02 | 0.05 ± 0.00 | -0.31 ± 0.02 | -0.47 ± 0.01 | 0.06 ± 0.00
LLaMa-2 7B | -0.17 ± 0.01 | -0.29 ± 0.01 | 0.04 ± 0.00 | -0.21 ± 0.02 | -0.45 ± 0.02 | 0.05 ± 0.00
Vietcuna 7B | -0.18 ± 0.01 | -0.36 ± 0.01 | 0.04 ± 0.00 | -0.16 ± 0.03 | -0.43 ± 0.02 | 0.07 ± 0.00
GPT-3.5 | -0.11 ± 0.01 | -0.34 ± 0.01 | 0.05 ± 0.00 | -0.16 ± 0.03 | -0.43 ± 0.03 | 0.07 ± 0.00
GPT-4 | -0.09 ± 0.01 | -0.34 ± 0.01 | 0.05 ± 0.00 | -0.14 ± 0.03 | -0.41 ± 0.01 | 0.07 ± 0.00
+ {{ model }} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/chain-of-thought/reasoning.md b/_pages/ind/chain-of-thought/reasoning.md index 985f3ce..7f321f3 100644 --- a/_pages/ind/chain-of-thought/reasoning.md +++ b/_pages/ind/chain-of-thought/reasoning.md @@ -3,73 +3,72 @@ layout: default permalink: /leaderboard/ind/chain-of-thought/reasoning --- # Chain-Of-Thought Reasoning Leaderboard +{% assign lang = 'ind' %} - - + + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + + {% endfor %} - - - + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].Equ and dataset[1][m].Equ > Equ_best %} + {% assign Equ_best = dataset[1][m].Equ %} + {% endif %} + {% endfor %} + + + + {% endfor %} + {% endfor %} -
ModelsMetrics + Models + + {{ dataset[0] }} +
EM F1 Equ. EM↑F1↑Equ.↑
URA-LLaMa 70B | 0.00 ± 0.00 | 0.12 ± 0.01 | 0.18 ± 0.02
URA-LLaMa 13B | 0.00 ± 0.00 | 0.23 ± 0.01 | 0.17 ± 0.01
URA-LLaMa 7B | 0.00 ± 0.00 | 0.23 ± 0.01 | 0.09 ± 0.01
LLaMa-2 13B | 0.00 ± 0.00 | 0.12 ± 0.01 | 0.18 ± 0.02
LLaMa-2 7B | 0.00 ± 0.00 | 0.10 ± 0.00 | 0.12 ± 0.02
Vietcuna 7B | 0.00 ± 0.00 | 0.13 ± 0.01 | 0.10 ± 0.01
MixSUra 8x7B | 0.00 ± 0.00 | 0.17 ± 0.01 | 0.33 ± 0.00
GPT-3.5 | 0.00 ± 0.00 | 0.32 ± 0.01 | 0.78 ± 0.02
GPT-40.00 ± 0.000.32 ± 0.010.79 ± 0.02 + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].Equ %} + {{ dataset[1][model].Equ | round: 2 }} ± {{ dataset[1][model].Equ_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/fairness-aware/information-retrieval.md b/_pages/ind/fairness-aware/information-retrieval.md index 346224f..7f61698 100644 --- a/_pages/ind/fairness-aware/information-retrieval.md +++ b/_pages/ind/fairness-aware/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/ind/fairness-aware/information-retrieval --- # Fairness-Aware Information Retrieval Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + + {% endfor %} - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + + + + {% endfor %} + {% endfor %} -
ModelsmMARCOmRobust04 + Models + + {{ dataset[0] }} +
M@10M@10BN@10N@10BM@10M@10BN@10N@10BM@10↑M@10B↑N@10↑N@10B↑
URA-LLaMa 70B
URA-LLaMa 13B
URA-LLaMa 7B | 0.10 ± 0.00 | 0.10 ± 0.00 | 0.14 ± 0.00 | 0.14 ± 0.00 | 0.01 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00
LLaMa-2 13B
LLaMa-2 7B | 0.05 ± 0.00 | 0.10 ± 0.00 | 0.07 ± 0.00 | 0.16 ± 0.00 | - | - | - | -
Vietcuna 7B | - | - | - | - | - | - | - | -
GPT-3.5 | - | - | - | - | - | - | - | -
GPT-4 | - | - | - | - | - | - | - | -
+ \ No newline at end of file diff --git a/_pages/ind/fairness-aware/language-modeling.md b/_pages/ind/fairness-aware/language-modeling.md index 4078ca9..070ae92 100644 --- a/_pages/ind/fairness-aware/language-modeling.md +++ b/_pages/ind/fairness-aware/language-modeling.md @@ -3,164 +3,108 @@ layout: default permalink: /leaderboard/ind/fairness-aware/language-modeling --- # Fairness-Aware Language Modeling Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + + {% endfor %} - - - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + + + + + + {% endfor %} + {% endfor %} -
ModelsMLQA-MLMVSEC + Models + + {{ dataset[0] }} +
EMCERWERCEDWEDPLXEMCERWERCEDWEDPLXEM↑CER↓WER↓CED↓WED↓PLX↓
URA-LLaMa 70B | 0.01 ± 0.00 | 0.58 ± 0.01 | 0.70 ± 0.01 | 653.57 ± 12.05 | 150.64 ± 2.73 | 1.25 ± 0.06 | 0.30 ± 0.00 | 0.11 ± 0.00 | 0.14 ± 0.00 | 15.19 ± 0.42 | 4.12 ± 0.11 | 1.13 ± 0.00
URA-LLaMa 13B | 0.02 ± 0.00 | 0.40 ± 0.01 | 0.56 ± 0.01 | 518.38 ± 11.19 | 125.24 ± 2.66 | 1.48 ± 0.11 | 0.32 ± 0.00 | 0.07 ± 0.00 | 0.21 ± 0.00 | 2.98 ± 0.11 | 1.24 ± 0.03 | 1.15 ± 0.00
URA-LLaMa 7B | 0.01 ± 0.00 | 0.40 ± 0.01 | 0.55 ± 0.01 | 492.93 ± 11.32 | 117.82 ± 2.72 | 1.22 ± 0.01 | 0.20 ± 0.00 | 0.54 ± 0.01 | 0.67 ± 0.01 | 41.77 ± 1.57 | 10.12 ± 0.35 | 1.07 ± 0.00
LLaMa-2 13B | 0.01 ± 0.00 | 0.76 ± 0.00 | 0.89 ± 0.00 | 782.03 ± 11.71 | 192.66 ± 2.83 | 1.27 ± 0.04 | 0.15 ± 0.00 | 0.07 ± 0.00 | 0.22 ± 0.00 | 3.39 ± 0.16 | 1.52 ± 0.04 | 1.01 ± 0.00
LLaMa-2 7B | 0.00 ± 0.00 | 0.79 ± 0.00 | 0.96 ± 0.00 | 761.38 ± 10.65 | 197.18 ± 2.66 | 1.75 ± 0.20 | 0.12 ± 0.00 | 0.35 ± 0.01 | 0.48 ± 0.01 | 47.54 ± 0.85 | 11.82 ± 0.19 | 1.06 ± 0.00
Vietcuna 7B | 0.00 ± 0.00 | 1.04 ± 0.00 | 1.06 ± 0.00 | 940.71 ± 12.48 | 208.05 ± 2.81 | 1.40 ± 0.00 | 0.06 ± 0.00 | 4.78 ± 0.06 | 4.80 ± 0.06 | 634.48 ± 8.58 | 145.12 ± 1.94 | 1.46 ± 0.01
MixSUra 8x7B | 0.00 ± - | 0.56 ± - | 0.63 ± - | 535.76 ± - | 133.64 ± - | 1.00 ± - | 0.07 ± - | 0.20 ± - | 0.29 ± - | 25.96 ± - | 8.79 ± - | 1.00 ± -
GPT-3.5 | 0.03 ± 0.00 | 0.29 ± 0.01 | 0.46 ± 0.01 | 398.19 ± 11.01 | 96.42 ± 2.54 | - | 0.59 ± 0.00 | 0.06 ± 0.00 | 0.19 ± 0.00 | 1.99 ± 0.08 | 0.74 ± 0.02 | -
GPT-4 | 0.06 ± 0.00 | 0.36 ± 0.01 | 0.41 ± 0.01 | 347.82 ± 10.23 | 86.96 ± 2.41 | - | 0.67 ± 0.00 | 0.01 ± 0.00 | 0.02 ± 0.00 | 1.30 ± 0.04 | 0.54 ± 0.01 | -
+ \ No newline at end of file diff --git a/_pages/ind/fairness-aware/question-answering.md b/_pages/ind/fairness-aware/question-answering.md index 58e2c63..b4fd2f3 100644 --- a/_pages/ind/fairness-aware/question-answering.md +++ b/_pages/ind/fairness-aware/question-answering.md @@ -3,77 +3,60 @@ layout: default permalink: /leaderboard/ind/fairness-aware/question-answering --- # Fairness-Aware Question Answering Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | XQuAD | MLQA
Exact Match | F1 | Exact Match | F1
URA-LLaMa 70B | 0.04 ± 0.00 | 0.27 ± 0.00 | 0.03 ± 0.00 | 0.25 ± 0.00
URA-LLaMa 13B | 0.00 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.14 ± 0.00
URA-LLaMa 7B | 0.00 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.15 ± 0.01
LLaMa-2 13B | 0.00 ± 0.00 | 0.03 ± 0.00 | 0.00 ± 0.00 | 0.04 ± 0.00
LLaMa-2 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.00
Vietcuna 7B | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00
GPT-3.5 | 0.00 ± 0.00 | 0.24 ± 0.00 | 0.00 ± 0.00 | 0.23 ± 0.00
GPT-4 | 0.00 ± 0.00 | 0.26 ± 0.00 | 0.00 ± 0.00 | 0.24 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/fairness-aware/sentiment-analysis.md b/_pages/ind/fairness-aware/sentiment-analysis.md index 102fff9..ee7815b 100644 --- a/_pages/ind/fairness-aware/sentiment-analysis.md +++ b/_pages/ind/fairness-aware/sentiment-analysis.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/ind/fairness-aware/sentiment-analysis --- # Fairness-Aware Sentiment Analysis Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | VLSP 2016 | UiT-VSFC
AC | F1 | AR | ECE | A@10 | AC | F1 | AR | ECE | A@10
URA-LLaMa 70B | 0.65 ± 0.01 | 0.49 ± 0.01 | 0.58 ± 0.01 | 0.13 ± 0.01 | 0.77 ± 0.04 | 0.76 ± 0.01 | 0.48 ± 0.01 | 0.61 ± 0.01 | 0.17 ± 0.01 | 0.66 ± 0.03
URA-LLaMa 13B | 0.59 ± 0.01 | 0.57 ± 0.01 | 0.62 ± 0.01 | 0.07 ± 0.01 | 0.83 ± 0.04 | 0.75 ± 0.01 | 0.46 ± 0.08 | 0.83 ± 0.01 | 0.11 ± 0.01 | 0.88 ± 0.02
URA-LLaMa 7B | 0.74 ± 0.02 | 0.39 ± 0.06 | 0.83 ± 0.01 | 0.21 ± 0.02 | 0.98 ± 0.02 | 0.73 ± 0.01 | 0.73 ± 0.01 | 0.78 ± 0.01 | 0.13 ± 0.01 | 0.94 ± 0.01
LLaMa-2 13B | 0.51 ± 0.01 | 0.1 ± 0.06 | 0.56 ± 0.01 | 0.32 ± 0.02 | 0.79 ± 0.04 | 0.63 ± 0.01 | 0.41 ± 0.02 | 0.70 ± 0.01 | 0.13 ± 0.01 | 0.89 ± 0.02
LLaMa-2 7B | 0.45 ± 0.02 | 0.34 ± 0.01 | 0.53 ± 0.01 | 0.26 ± 0.02 | 0.50 ± 0.0 | 0.51 ± 0.01 | 0.55 ± 0.01 | 0.68 ± 0.01 | 0.22 ± 0.01 | 0.64 ± 0.03
Vietcuna 7B | 0.04 ± 0.01 | 0.04 ± 0.01 | 0.49 ± 0.01 | 0.71 ± 0.01 | 0.05 ± 0.02 | 0.03 ± 0.00 | 0.03 ± 0.00 | 0.55 ± 0.01 | 0.50 ± 0.00 | 0.01 ± 0.01
MixSUra 8x7B | 0.62 ± - | 0.62 ± - | 0.59 ± - | 0.30 ± - | 0.59 ± - | 0.74 ± - | 0.46 ± - | 0.61 ± - | 0.24 ± - | 0.66 ± -
Gemini Pro | 0.67 ± - | 0.50 ± - | - | 0.34 ± - | 0.59 ± - | 0.79 ± - | 0.50 ± - | - | 0.46 ± - | 0.82 ± -
GPT-3.5 | 0.66 ± 0.01 | 0.60 ± 0.01 | - | 0.33 ± 0.01 | 0.52 ± 0.05 | 0.86 ± 0.01 | 0.71 ± 0.01 | - | 0.52 ± 0.01 | 0.86 ± 0.02
GPT-4 | 0.75 ± 0.01 | 0.74 ± 0.01 | - | 0.41 ± 0.00 | 0.73 ± 0.04 | 0.85 ± 0.01 | 0.71 ± 0.01 | - | 0.52 ± 0.01 | 0.87 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/fairness-aware/text-classification.md b/_pages/ind/fairness-aware/text-classification.md index 60f2587..14efacd 100644 --- a/_pages/ind/fairness-aware/text-classification.md +++ b/_pages/ind/fairness-aware/text-classification.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/ind/fairness-aware/text-classification --- # Fairness-Aware Text Classification Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% 
assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsUiT-VSMECPhoATIS + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
URA-LLaMa 70B | 0.24 ± 0.02 | 0.14 ± 0.01 | 0.58 ± 0.01 | 0.26 ± 0.02 | 0.37 ± 0.06 | 0.15 ± 0.01 | 0.22 ± 0.03 | 0.31 ± 0.00 | 0.81 ± 0.01 | 0.13 ± 0.04
URA-LLaMa 13B | 0.31 ± 0.02 | 0.11 ± 0.01 | 0.58 ± 0.01 | 0.23 ± 0.02 | 0.57 ± 0.06 | 0.01 ± 0.01 | 0.05 ± 0.02 | 0.58 ± 0.00 | 0.84 ± 0.01 | 0.00 ± 0.01
URA-LLaMa 7B | 0.29 ± 0.02 | 0.11 ± 0.01 | 0.60 ± 0.01 | 0.12 ± 0.02 | 0.41 ± 0.06 | 0.00 ± 0.01 | 0.00 ± 0.00 | 0.55 ± 0.00 | 0.30 ± 0.01 | 0.01 ± 0.03
LLaMa-2 13B | 0.18 ± 0.02 | 0.08 ± 0.01 | 0.55 ± 0.01 | 0.45 ± 0.01 | 0.44 ± 0.06 | 0.02 ± 0.01 | 0.01 ± 0.02 | 0.57 ± 0.01 | 0.90 ± 0.01 | 0.01 ± 0.01
LLaMa-2 7B | 0.25 ± 0.02 | 0.11 ± 0.01 | 0.57 ± 0.01 | 0.22 ± 0.02 | 0.53 ± 0.06 | 0.02 ± 0.00 | 0.06 ± 0.01 | 0.57 ± 0.01 | 0.68 ± 0.01 | 0.01 ± 0.01
Vietcuna 7B | 0.15 ± 0.01 | 0.05 ± 0.01 | 0.46 ± 0.01 | 0.85 ± 0.01 | 0.16 ± 0.04 | 0.04 ± 0.01 | 0.01 ± 0.00 | 0.77 ± 0.01 | 0.21 ± 0.01 | 0.07 ± 0.03
MixSUra 8x7B | 0.40 ± - | 0.36 ± - | 0.72 ± - | 0.53 ± - | 0.79 ± - | 0.81 ± - | 0.58 ± - | 0.96 ± - | 0.14 ± - | 0.91 ± -
Gemini Pro | 0.48 ± - | 0.38 ± - | - | 0.34 ± - | 0.43 ± - | 0.79 ± - | 0.67 ± - | - | 0.73 ± - | 0.68 ± -
GPT-3.5 | 0.44 ± 0.02 | 0.42 ± 0.02 | - | 0.30 ± 0.02 | 0.36 ± 0.06 | 0.68 ± 0.02 | 0.66 ± 0.03 | - | 0.62 ± 0.02 | 0.67 ± 0.05
GPT-4 | 0.49 ± 0.02 | 0.47 ± 0.02 | - | 0.35 ± 0.02 | 0.36 ± 0.06 | 0.83 ± 0.01 | 0.76 ± 0.03 | - | 0.77 ± 0.01 | 0.87 ± 0.04
+ \ No newline at end of file diff --git a/_pages/ind/fairness-aware/toxicity-detection.md b/_pages/ind/fairness-aware/toxicity-detection.md index 68fe66e..b2be50d 100644 --- a/_pages/ind/fairness-aware/toxicity-detection.md +++ b/_pages/ind/fairness-aware/toxicity-detection.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/ind/fairness-aware/toxicity-detection --- # Fairness-Aware Toxicity Detection Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | UiT-ViCTSD | UiT-ViHSD
AC | F1 | AR | ECE | A@10 | AC | F1 | AR | ECE | A@10
URA-LLaMa 70B | 0.41 ± 0.02 | 0.26 ± 0.01 | 0.75 ± 0.01 | 0.53 ± 0.01 | 0.33 ± 0.05 | 0.15 ± 0.00 | 0.40 ± 0.00 | 0.64 ± 0.01 | 0.58 ± 0.00 | 0.24 ± 0.02
URA-LLaMa 13B | 0.43 ± 0.02 | 0.29 ± 0.07 | 0.66 ± 0.01 | 0.36 ± 0.02 | 0.42 ± 0.05 | 0.24 ± 0.01 | 0.15 ± 0.00 | 0.61 ± 0.01 | 0.43 ± 0.01 | 0.21 ± 0.02
URA-LLaMa 7B | 0.42 ± 0.02 | 0.39 ± 0.01 | 0.60 ± 0.01 | 0.30 ± 0.01 | 0.66 ± 0.05 | 0.16 ± 0.00 | 0.10 ± 0.00 | 0.67 ± 0.01 | 0.33 ± 0.00 | 0.28 ± 0.02
LLaMa-2 13B | 0.27 ± 0.01 | 0.18 ± 0.01 | 0.67 ± 0.01 | 0.53 ± 0.01 | 0.57 ± 0.05 | 0.16 ± 0.00 | 0.10 ± 0.00 | 0.62 ± 0.01 | 0.59 ± 0.00 | 0.42 ± 0.02
LLaMa-2 7B | 0.15 ± 0.01 | 0.11 ± 0.01 | 0.62 ± 0.01 | 0.67 ± 0.01 | 0.07 ± 0.03 | 0.01 ± 0.00 | 0.01 ± 0.00 | 0.56 ± 0.01 | 0.71 ± 0.00 | 0.01 ± 0.00
Vietcuna 7B | 0.08 ± 0.01 | 0.09 ± 0.01 | 0.50 ± 0.01 | 0.42 ± 0.01 | 0.06 ± 0.03 | 0.62 ± 0.01 | 0.21 ± 0.00 | 0.50 ± 0.00 | 0.29 ± 0.01 | 0.62 ± 0.02
MixSUra 8x7B | 0.69 ± - | 0.38 ± - | - ± - | 0.29 ± - | 0.78 ± - | 0.56 ± - | 0.31 ± - | 0.68 ± - | 0.32 ± - | 0.92 ± -
Gemini Pro | 0.81 ± - | 0.43 ± - | - ± - | 0.31 ± - | 0.82 ± - | 0.70 ± - | 0.37 ± - | - ± - | 0.36 ± - | 0.69 ± -
GPT-3.5 | 0.60 ± 0.02 | 0.52 ± 0.02 | - ± - | 0.11 ± 0.02 | 0.63 ± 0.05 | 0.61 ± 0.01 | 0.46 ± 0.01 | - ± - | 0.29 ± 0.01 | 0.62 ± 0.02
GPT-4 | 0.87 ± 0.01 | 0.69 ± 0.02 | - ± - | 0.37 ± 0.01 | 0.86 ± 0.03 | 0.76 ± 0.01 | 0.56 ± 0.01 | - ± - | 0.43 ± 0.01 | 0.76 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/information-retrieval.md b/_pages/ind/few-shot/information-retrieval.md index 3c5643f..85c8366 100644 --- a/_pages/ind/few-shot/information-retrieval.md +++ b/_pages/ind/few-shot/information-retrieval.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/ind/few-shot/information-retrieval --- # Few-Shot Information Retrieval Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | mMARCO | mRobust04
M@10 | M@10B | N@10 | N@10B | M@10 | M@10B | N@10 | N@10B
URA-LLaMa 70B | 0.05 ± 0.00 | 0.11 ± 0.00 | 0.06 ± 0.00 | 0.14 ± 0.00 | 0.04 ± 0.00 | 0.04 ± 0.00 | 0.03 ± 0.00 | 0.04 ± 0.00
URA-LLaMa 13B | 0.04 ± 0.00 | 0.10 ± 0.00 | 0.06 ± 0.00 | 0.14 ± 0.00 | 0.03 ± 0.00 | 0.05 ± 0.00 | 0.04 ± 0.00 | 0.04 ± 0.00
URA-LLaMa 7B | 0.04 ± 0.00 | 0.11 ± 0.00 | 0.06 ± 0.00 | 0.16 ± 0.00 | 0.03 ± 0.00 | 0.03 ± 0.00 | 0.02 ± 0.00 | 0.02 ± 0.00
LLaMa-2 13B | 0.07 ± 0.00 | 0.15 ± 0.00 | 0.09 ± 0.00 | 0.21 ± 0.00 | 0.05 ± 0.00 | 0.04 ± 0.00 | 0.04 ± 0.00 | 0.04 ± 0.00
LLaMa-2 7B | 0.05 ± 0.00 | 0.11 ± 0.00 | 0.07 ± 0.00 | 0.16 ± 0.00 | 0.02 ± 0.00 | 0.03 ± 0.00 | 0.03 ± 0.00 | 0.02 ± 0.00
Vietcuna 7B | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00
MixSUra 8x7B | 0.01 ± - | 0.07 ± - | 0.04 ± - | 0.11 ± - | 0.04 ± - | 0.04 ± - | 0.02 ± - | 0.02 ± -
GPT-3.5 | - | - | - | - | - | - | - | -
GPT-4 | - | - | - | - | - | - | - | -
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/knowledge.md b/_pages/ind/few-shot/knowledge.md index da4ef46..61d1840 100644 --- a/_pages/ind/few-shot/knowledge.md +++ b/_pages/ind/few-shot/knowledge.md @@ -2,115 +2,129 @@ layout: default permalink: /leaderboard/ind/few-shot/knowledge --- -# Few-Shot Knowledge Leaderboard +# Few-shot Knowledge Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models | ZaloE2E | ViMMRC
EM | F1 | AC | F1 | AR | ECE | A@10
URA-LLaMa 70B | 0.34 ± 0.02 | 0.50 ± 0.02 | 0.78 ± 0.02 | 0.63 ± 0.03 | 0.90 ± 0.01 | 0.13 ± 0.02 | 0.96 ± 0.03
URA-LLaMa 13B | 0.26 ± 0.02 | 0.40 ± 0.02 | 0.62 ± 0.02 | 0.50 ± 0.02 | 0.69 ± 0.02 | 0.18 ± 0.02 | 0.65 ± 0.07
URA-LLaMa 7B | 0.14 ± 0.02 | 0.25 ± 0.02 | 0.42 ± 0.02 | 0.33 ± 0.02 | 0.61 ± 0.02 | 0.13 ± 0.02 | 0.39 ± 0.07
LLaMa-2 13B | 0.22 ± 0.02 | 0.36 ± 0.02 | 0.58 ± 0.02 | 0.46 ± 0.02 | 0.62 ± 0.02 | 0.28 ± 0.02 | 0.77 ± 0.06
LLaMa-2 7B | 0.07 ± 0.01 | 0.15 ± 0.01 | 0.30 ± 0.02 | 0.23 ± 0.02 | 0.56 ± 0.02 | 0.43 ± 0.02 | 0.16 ± 0.05
Vietcuna 7B | 0.07 ± 0.01 | 0.19 ± 0.01 | 0.31 ± 0.02 | 0.18 ± 0.01 | 0.50 ± 0.00 | 0.06 ± 0.02 | 0.31 ± 0.06
MixSUra 8x7B | 0.19 ± - | 0.34 ± - | 0.65 ± - | 0.64 ± - | 0.54 ± - | 0.29 ± - | 0.65 ± -
GPT-3.5 | 0.49 ± 0.02 | 0.64 ± 0.02 | 0.90 ± 0.01 | 0.73 ± 0.03 | - | 0.66 ± 0.01 | 0.91 ± 0.04
GPT-4 | 0.49 ± 0.02 | 0.64 ± 0.02 | 0.91 ± 0.01 | 0.73 ± 0.04 | - | 0.66 ± 0.01 | 0.91 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/language-modeling.md b/_pages/ind/few-shot/language-modeling.md index 5cb7963..04b179d 100644 --- a/_pages/ind/few-shot/language-modeling.md +++ b/_pages/ind/few-shot/language-modeling.md @@ -3,164 +3,108 @@ layout: default permalink: /leaderboard/ind/few-shot/language-modeling --- # Few-Shot Language Modeling Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + + {% endfor %} - - - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + + + + + + {% endfor %} + {% endfor %} -
ModelsMLQA-MLMVSEC + Models + + {{ dataset[0] }} +
EMCERWERCEDWEDPLXEMCERWERCEDWEDPLXEM↑CER↓WER↓CED↓WED↓PLX↓
URA-LLaMa 70B | 0.01 ± 0.00 | 0.54 ± 0.00 | 0.66 ± 0.00 | 669.74 ± 10.38 | 153.04 ± 2.33 | 1.32 ± 0.05 | 0.33 ± 0.00 | 0.11 ± 0.00 | 0.13 ± 0.00 | 15.09 ± 0.42 | 4.05 ± 0.11 | 1.13 ± 0.00
URA-LLaMa 13B | 0.01 ± 0.00 | 0.45 ± 0.01 | 0.61 ± 0.01 | 559.64 ± 11.23 | 136.97 ± 2.68 | 1.49 ± 0.10 | 0.35 ± 0.00 | 0.02 ± 0.00 | 0.04 ± 0.00 | 2.81 ± 0.12 | 1.18 ± 0.03 | 1.15 ± 0.00
URA-LLaMa 7B | 0.01 ± 0.00 | 0.40 ± 0.01 | 0.55 ± 0.01 | 498.36 ± 11.01 | 118.11 ± 2.58 | 1.24 ± 0.01 | 0.22 ± 0.00 | 0.32 ± 0.01 | 0.33 ± 0.01 | 41.89 ± 1.54 | 10.10 ± 0.34 | 1.07 ± 0.00
LLaMa-2 13B | 0.01 ± 0.00 | 0.74 ± 0.00 | 0.87 ± 0.00 | 760.98 ± 11.91 | 186.90 ± 2.85 | 1.24 ± 0.03 | 0.16 ± 0.00 | 0.03 ± 0.00 | 0.05 ± 0.00 | 3.38 ± 0.16 | 1.51 ± 0.04 | 1.01 ± 0.00
LLaMa-2 7B | 0.00 ± 0.00 | 0.81 ± 0.00 | 0.98 ± 0.00 | 769.36 ± 10.51 | 198.53 ± 2.57 | 1.74 ± 0.19 | 0.12 ± 0.00 | 0.36 ± 0.01 | 0.39 ± 0.01 | 47.50 ± 0.86 | 11.80 ± 0.19 | 1.06 ± 0.00
Vietcuna 7B | 0.00 ± 0.00 | 1.04 ± 0.00 | 1.06 ± 0.00 | 935.65 ± 12.47 | 204.98 ± 2.79 | 1.40 ± 0.00 | 0.00 ± 0.00 | 8.00 ± 0.07 | 8.01 ± 0.07 | 1063.93 ± 7.64 | 241.74 ± 1.74 | 1.46 ± 0.00
MixSUra 8x7B | 0.00 ± - | 0.55 ± - | 0.63 ± - | 526.79 ± - | 131.02 ± - | 1.00 ± - | 0.08 ± - | 0.19 ± - | 0.28 ± - | 25.13 ± - | 8.58 ± - | 1.00 ± -
GPT-3.5 | 0.04 ± 0.00 | 0.28 ± 0.01 | 0.44 ± 0.01 | 387.37 ± 10.86 | 92.78 ± 2.46 | - | 0.66 ± 0.00 | 0.01 ± 0.00 | 0.02 ± 0.00 | 1.63 ± 0.08 | 0.61 ± 0.02 | -
GPT-4 | 0.08 ± 0.00 | 0.23 ± 0.01 | 0.40 ± 0.01 | 336.53 ± 10.18 | 83.55 ± 2.34 | - | 0.75 ± 0.00 | 0.01 ± 0.00 | 0.01 ± 0.00 | 0.89 ± 0.04 | 0.37 ± 0.01 | -
+ \ No newline at end of file diff --git a/_pages/ind/few-shot/reasoning.md b/_pages/ind/few-shot/reasoning.md index 748fa8c..4190e60 100644 --- a/_pages/ind/few-shot/reasoning.md +++ b/_pages/ind/few-shot/reasoning.md @@ -3,135 +3,72 @@ layout: default permalink: /leaderboard/ind/few-shot/reasoning --- # Few-Shot Reasoning Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models SR - Natural SR - Abstract symbol MATH
EM F1 Equ. EM F1 Equ. EM F1 Equ.
URA-LLaMa 70B0.14 ± 0.000.48 ± 0.000.15 ± 0.000.27 ± 0.000.85 ± 0.000.30 ± 0.000.00 ± 0.000.00 ± 0.000.12 ± 0.02
URA-LLaMa 13B0.08 ± 0.000.42 ± 0.000.08 ± 0.000.20 ± 0.000.70 ± 0.000.17 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.01
URA-LLaMa 7B0.04 ± 0.000.38 ± 0.000.04 ± 0.000.11 ± 0.000.61 ± 0.000.10 ± 0.000.00 ± 0.000.00 ± 0.000.07 ± 0.01
LLaMa-2 13B0.03 ± 0.000.24 ± 0.000.04 ± 0.000.19 ± 0.000.69 ± 0.000.18 ± 0.000.00 ± 0.000.00 ± 0.000.16 ± 0.02
LLaMa-2 7B0.00 ± 0.000.01 ± 0.000.00 ± 0.000.06 ± 0.000.44 ± 0.000.06 ± 0.000.00 ± 0.000.00 ± 0.000.11 ± 0.01
Vietcuna 7B0.00 ± 0.000.00 ± 0.000.00 ± 0.000.14 ± 0.000.71 ± 0.000.10 ± 0.000.00 ± 0.000.00 ± 0.000.01 ± 0.00
MixSUra 8x7B0.07 ± 0.000.41 ± 0.000.07 ± 0.000.22 ± 0.000.78 ± 0.000.23 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.00
GPT-3.50.15 ± 0.000.50 ± 0.000.16 ± 0.000.26 ± 0.000.83 ± 0.000.29 ± 0.000.00 ± 0.000.00 ± 0.000.62 ± 0.02
GPT-40.37 ± 0.000.74 ± 0.000.42 ± 0.000.37 ± 0.000.87 ± 0.000.44 ± 0.000.00 ± 0.000.01 ± 0.000.65 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + EM↑ + F1↑ + Equ↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m]["Equ"] and dataset[1][m]["Equ"] > Equ_best %} + {% assign Equ_best = dataset[1][m]["Equ"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["Equ"] %} + {{ dataset[1][model]["Equ"] | round: 2 }} ± {{ dataset[1][model]["Equ_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/sentiment-analysis.md b/_pages/ind/few-shot/sentiment-analysis.md index 5971d49..097ad29 100644 --- a/_pages/ind/few-shot/sentiment-analysis.md +++ b/_pages/ind/few-shot/sentiment-analysis.md @@ -1,146 +1,98 @@ --- layout: default -permalink: /leaderboard/ind/few-shot/sentiment-analysis +permalink: /leaderboard/ind/few-shot/sentiment-analysis --- # Few-Shot Sentiment Analysis Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models VLSP 2016 UiT-VSFC
AC F1 AR ECE A@10 AC F1 AR ECE A@10
URA-LLaMa 70B0.66 ± 0.010.49 ± 0.010.72 ± 0.010.13 ± 0.010.77 ± 0.040.75 ± 0.010.48 ± 0.010.81 ± 0.010.16 ± 0.010.71 ± 0.02
URA-LLaMa 13B0.59 ± 0.010.57 ± 0.010.67 ± 0.010.09 ± 0.010.82 ± 0.040.74 ± 0.010.52 ± 0.080.83 ± 0.010.10 ± 0.010.87 ± 0.02
URA-LLaMa 7B0.57 ± 0.020.42 ± 0.050.69 ± 0.020.07 ± 0.020.77 ± 0.040.72 ± 0.010.43 ± 0.010.78 ± 0.010.13 ± 0.010.95 ± 0.03
LLaMa-2 13B0.51 ± 0.010.41 ± 0.060.66 ± 0.010.32 ± 0.020.80 ± 0.040.63 ± 0.010.46 ± 0.070.71 ± 0.010.13 ± 0.010.88 ± 0.02
LLaMa-2 7B0.45 ± 0.010.32 ± 0.010.59 ± 0.010.26 ± 0.020.50 ± 0.050.50 ± 0.010.34 ± 0.010.69 ± 0.010.23 ± 0.010.62 ± 0.03
Vietcuna 7B0.04 ± 0.010.05 ± 0.010.45 ± 0.010.71 ± 0.010.05 ± 0.020.03 ± 0.000.03 ± 0.000.53 ± 0.010.50 ± 0.000.01 ± 0.00
MixSUra 8x7B0.62 ± -0.63 ± -0.59 ± -0.30 ± -0.59 ± -0.74 ± -0.46 ± -0.63 ± -0.23 ± -0.655 ± -
GPT-3.50.65 ± 0.010.59 ± 0.1-0.32 ± 0.010.65 ± 0.050.86 ± 0.010.73 ± 0.01-0.52 ± 0.010.86 ± 0.02
GPT-40.75 ± 0.010.74 ± 0.01-0.41 ± 0.010.74 ± 0.040.85 ± 0.010.59 ± 0.09-0.52 ± 0.010.85 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/text-classification.md b/_pages/ind/few-shot/text-classification.md index e2071ae..4d08aff 100644 --- a/_pages/ind/few-shot/text-classification.md +++ b/_pages/ind/few-shot/text-classification.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/ind/few-shot/text-classification --- # Few-Shot Text Classification Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models UiT-VSMEC PhoATIS
AC F1 AR ECE A@10 AC F1 AR ECE A@10
URA-LLaMa 70B0.25 ± 0.020.15 ± 0.010.56 ± 0.010.25 ± 0.020.37 ± 0.060.15 ± 0.010.22 ± 0.030.83 ± 0.000.81 ± 0.010.13 ± 0.04
URA-LLaMa 13B0.32 ± 0.020.12 ± 0.010.58 ± 0.010.22 ± 0.020.57 ± 0.070.01 ± 0.010.06 ± 0.020.47 ± 0.000.84 ± 0.010.00 ± 0.01
URA-LLaMa 7B0.29 ± 0.020.11 ± 0.010.60 ± 0.010.12 ± 0.020.43 ± 0.060.06 ± 0.010.01 ± 0.000.55 ± 0.000.24 ± 0.010.08 ± 0.03
LLaMa-2 13B0.18 ± 0.020.08 ± 0.010.55 ± 0.010.45 ± 0.010.49 ± 0.070.02 ± 0.010.06 ± 0.020.57 ± 0.010.90 ± 0.010.01 ± 0.01
LLaMa-2 7B0.25 ± 0.020.12 ± 0.010.57 ± 0.010.21 ± 0.020.54 ± 0.060.03 ± 0.010.02 ± 0.010.56 ± 0.010.54 ± 0.010.01 ± 0.01
Vietcuna 7B0.15 ± 0.010.05 ± 0.010.46 ± 0.010.85 ± 0.010.15 ± 0.040.04 ± 0.010.01 ± 0.000.63 ± 0.000.21 ± 0.010.07 ± 0.03
MixSUra 8x7B0.40 ± -0.36 ± -0.72 ± -0.53 ± -0.79 ± -0.81 ± -0.58 ± -0.96 ± -0.14 ± -0.91 ± -
GPT-3.50.42 ± 0.020.40 ± 0.02-0.28 ± 0.020.42 ± 0.060.69 ± 0.020.67 ± 0.03-0.63 ± 0.020.69 ± 0.05
GPT-40.49 ± 0.020.48 ± 0.02-0.35 ± 0.020.49 ± 0.060.85 ± 0.010.78 ± 0.03-0.79 ± 0.010.88 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/toxicity-detection.md b/_pages/ind/few-shot/toxicity-detection.md index 1dc584b..2749d7a 100644 --- a/_pages/ind/few-shot/toxicity-detection.md +++ b/_pages/ind/few-shot/toxicity-detection.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/ind/few-shot/toxicity-detection --- # Few-Shot Toxicity Detection Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models UiT-ViCTSD UiT-ViHSD
AC F1 AR ECE A@10 AC F1 AR ECE A@10
URA-LLaMa 70B0.44 ± 0.010.27 ± 0.010.75 ± 0.010.52 ± 0.010.37 ± 0.020.17 ± 0.000.15 ± 0.000.64 ± 0.010.57 ± 0.000.27 ± 0.02
URA-LLaMa 13B0.44 ± 0.010.30 ± 0.050.67 ± 0.010.33 ± 0.010.41 ± 0.030.26 ± 0.010.16 ± 0.000.61 ± 0.010.42 ± 0.010.21 ± 0.02
URA-LLaMa 7B0.43 ± 0.010.40 ± 0.010.60 ± 0.010.29 ± 0.010.71 ± 0.020.16 ± 0.000.10 ± 0.000.67 ± 0.010.32 ± 0.000.28 ± 0.02
LLaMa-2 13B0.28 ± 0.010.19 ± 0.000.67 ± 0.010.52 ± 0.010.63 ± 0.030.17 ± 0.000.11 ± 0.000.62 ± 0.010.58 ± 0.000.44 ± 0.02
LLaMa-2 7B0.16 ± 0.010.12 ± 0.010.61 ± 0.010.66 ± 0.010.08 ± 0.020.01 ± 0.000.01 ± 0.000.56 ± 0.010.71 ± 0.000.01 ± 0.02
Vietcuna 7B0.08 ± 0.000.10 ± 0.010.50 ± 0.000.42 ± 0.000.08 ± 0.030.61 ± 0.010.21 ± 0.000.50 ± 0.000.28 ± 0.010.61 ± 0.02
MixSUra 8x7B0.70 ± -0.39 ± -- ± -0.29 ± -0.80 ± -0.58 ± -0.31 ± -0.68 ± -0.30 ± -0.93 ± -
GPT-3.50.63 ± 0.020.54 ± 0.02- 0.13 ± 0.020.63 ± 0.050.63 ± 0.010.47 ± 0.01- 0.29 ± 0.010.63 ± 0.02
GPT-40.89 ± 0.000.71 ± 0.01- 0.39 ± 0.000.89 ± 0.030.77 ± 0.010.57 ± 0.01- 0.44 ± 0.010.77 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/few-shot/translation.md b/_pages/ind/few-shot/translation.md index d20d516..aee1200 100644 --- a/_pages/ind/few-shot/translation.md +++ b/_pages/ind/few-shot/translation.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/ind/few-shot/translation --- # Few-Shot Translation Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + + {% endfor %} - - - - - - - - + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + {% assign bleu_envi_best = 0 %} + {% assign bleu_vien_best = 0 %} + {% assign hlepor_envi_best = 0 %} + {% assign hlepor_vien_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["BLEU envi"] and dataset[1][m]["BLEU envi"] > bleu_envi_best %} + {% assign bleu_envi_best = dataset[1][m]["BLEU envi"] %} + {% endif %} + {% if dataset[1][m]["BLEU vien"] and dataset[1][m]["BLEU vien"] > bleu_vien_best %} + {% assign bleu_vien_best = dataset[1][m]["BLEU vien"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR envi"] and dataset[1][m]["hLEPOR envi"] > 
hlepor_envi_best %} + {% assign hlepor_envi_best = dataset[1][m]["hLEPOR envi"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR vien"] and dataset[1][m]["hLEPOR vien"] > hlepor_vien_best %} + {% assign hlepor_vien_best = dataset[1][m]["hLEPOR vien"] %} + {% endif %} + {% endfor %} + + + + + {% endfor %} + {% endfor %} -
Models PhoMT OPUS100 + Models + + {{ dataset[0] }} +
(En -> Vi) (Vi -> En) (En -> Vi) (Vi -> En) (En -> Vi) (Vi -> En) (En -> Vi) (Vi -> En) BLEU envi↑ BLEU vien↑ hLEPOR envi↑ hLEPOR vien↑
URA-LLaMa 70B0.28 ± 0.000.59 ± 0.000.27 ± 0.000.58 ± 0.000.10 ± 0.000.44 ± 0.010.14 ± 0.000.41 ± 0.01
URA-LLaMa 13B0.25 ± 0.000.55 ± 0.000.15 ± 0.000.56 ± 0.000.10 ± 0.010.41 ± 0.010.17 ± 0.010.43 ± 0.01
URA-LLaMa 7B0.19 ± 0.000.50 ± 0.000.22 ± 0.000.54 ± 0.000.08 ± 0.000.38 ± 0.010.14 ± 0.010.39 ± 0.01
LLaMa-2 13B0.23 ± 0.000.53 ± 0.000.23 ± 0.000.54 ± 0.000.09 ± 0.000.39 ± 0.010.14 ± 0.010.40 ± 0.01
LLaMa-2 7B0.18 ± 0.000.47 ± 0.000.21 ± 0.000.52 ± 0.000.07 ± 0.000.34 ± 0.000.11 ± 0.010.36 ± 0.01
Vietcuna 7B0.15 ± 0.000.35 ± 0.000.03 ± 0.000.11 ± 0.000.00 ± 0.000.00 ± 0.000.05 ± 0.000.16 ± 0.00
MixSUra 8x7B0.15 ± -0.51 ± -0.16 ± -0.52 ± -0.07 ± -0.37 ± -0.09 ± -0.36 ± -
GPT-3.50.33 ± 0.000.65 ± 0.000.33 ± 0.000.63 ± 0.000.16 ± 0.010.50 ± 0.010.24 ± 0.010.51 ± 0.00
GPT-40.33 ± 0.000.66 ± 0.000.34 ± 0.000.65 ± 0.000.17 ± 0.010.51 ± 0.010.25 ± 0.010.53 ± 0.00 + {{ model }} + + {% if dataset[1][model]["BLEU envi"] %} + {{ dataset[1][model]["BLEU envi"] | round: 2 }} ± {{ dataset[1][model]["BLEU envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["BLEU vien"] %} + {{ dataset[1][model]["BLEU vien"] | round: 2 }} ± {{ dataset[1][model]["BLEU vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["hLEPOR envi"] %} + {{ dataset[1][model]["hLEPOR envi"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["hLEPOR vien"] %} + {{ dataset[1][model]["hLEPOR vien"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR vien_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/medium-prompt/question-answering.md b/_pages/ind/medium-prompt/question-answering.md index 0731ae6..b8eaa5f 100644 --- a/_pages/ind/medium-prompt/question-answering.md +++ b/_pages/ind/medium-prompt/question-answering.md @@ -3,63 +3,60 @@ layout: default permalink: /leaderboard/ind/medium-prompt/question-answering --- # Medium-Prompt Question Answering Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + + {% endfor %} - - - - + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + + {% endfor %} + + {% endfor %} -
Models XQuAD MLQA + Models + + {{ dataset[0] }} +
EM F1 EM F1 EM↑ F1↑
URA-LLaMa 70B0.08 ± 0.000.33 ± 0.000.07 ± 0.000.31 ± 0.00
URA-LLaMa 13B0.04 ± 0.000.21 ± 0.000.04 ± 0.000.19 ± 0.00
URA-LLaMa 7B0.01 ± 0.000.11 ± 0.000.01 ± 0.000.11 ± 0.00
LLaMa-2 13B0.00 ± 0.000.10 ± 0.000.00 ± 0.000.09 ± 0.00
LLaMa-2 7B0.00 ± 0.000.03 ± 0.000.00 ± 0.000.03 ± 0.00
MixSUra 8x7B0.01 ± -0.25 ± -0.00 ± -0.25 ± -
+ {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/medium-prompt/summarization.md b/_pages/ind/medium-prompt/summarization.md index 78e4290..5d3185e 100644 --- a/_pages/ind/medium-prompt/summarization.md +++ b/_pages/ind/medium-prompt/summarization.md @@ -3,147 +3,132 @@ layout: default permalink: /leaderboard/ind/medium-prompt/summarization --- # Medium-Prompt Summarization Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models VietNews WikiLingua
R1 R2 RL SC BS Cv De Cp R1 R2 RL SC BS Cv De Cp
URA-LLaMa 70B0.35 ± 0.000.16 ± 0.000.24 ± 0.00-0.11 ± 0.000.12 ± 0.000.63 ± 0.005.43 ± 0.0237.78 ± 0.470.33 ± 0.000.14 ± 0.000.22 ± 0.00-0.16± 0.000.24± 0.100.59 ± 0.014.62 ± 0.1156.56 ± 1.70
URA-LLaMa 13B0.26 ± 0.000.12 ± 0.000.17 ± 0.00-0.09 ± 0.00-0.08 ± 0.180.46 ± 0.003.55 ± 0.0447.75 ± 0.650.14 ± 0.000.05 ± 0.000.09 ± 0.00-0.16 ± 0.00-0.14 ± 0.120.26 ± 0.011.83 ± 0.0660.10 ± 2.16
URA-LLaMa 7B0.41 ± 0.000.18 ± 0.000.27 ± 0.00-0.09 ± 0.00-0.08 ± 0.130.83 ± 0.008.13 ± 0.048.08 ± 0.170.42 ± 0.000.17 ± 0.000.27 ± 0.00-0.16 ± 0.000.27 ± 0.210.84 ± 0.007.15 ± 0.088.08 ± 0.36
LLaMa-2 13B0.02 ± 0.000.00 ± 0.000.02 ± 0.00-0.09 ± 0.00-0.19 ± 0.050.01 ± 0.000.01 ± 0.0054.67 ± 0.160.03 ± 0.000.00 ± 0.000.03 ± 0.00-0.16 ± 0.00-0.05 ± 0.030.02 ± 0.000.02 ± 0.0042.55 ± 0.81
LLaMa-2 7B0.03 ± 0.000.01 ± 0.000.03 ± 0.00-0.09 ± 0.00-0.17 ± 0.030.04 ± 0.000.07 ± 0.0023.86 ± 0.260.02 ± 0.000.00 ± 0.000.02 ± 0.00-0.16 ± 0.00-0.04 ± 0.060.02 ± 0.000.03 ± 0.0040.31 ± 0.88
MixSUra 8x7B0.06 ± -0.01 ± -0.04 ± -- ± --0.13 ± -0.10 ± -0.17 ± -9.03 ± -0.03 ± -0.00 ± -0.03 ± -- ± --0.01 ± -0.17 ± -0.26 ± -16.68 ± -
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/randomized-choice/knowledge.md b/_pages/ind/randomized-choice/knowledge.md index b2f19ef..bbf8f5f 100644 --- a/_pages/ind/randomized-choice/knowledge.md +++ b/_pages/ind/randomized-choice/knowledge.md @@ -4,90 +4,96 @@ permalink: /leaderboard/ind/randomized-choice/knowledge --- # Randomized-Choice Knowledge Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + + {% endfor %} - - - - - - - - - - - - - - + {% for 
dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + + + + + + {% endfor %} + + + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - + + {% for dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
Models AC F1 AR ECE A@10
Our 70B0.76 ± 0.020.76 ± 0.020.78 ± 0.010.14 ± 0.020.94 ± 0.04
Our 13B0.62 ± 0.020.62 ± 0.020.61 ± 0.020.15 ± 0.020.67 ± 0.07
Our 7B0.45 ± 0.020.36 ± 0.020.57 ± 0.020.10 ± 0.020.45 ± 0.07
LLaMa-2 13B0.57 ± 0.020.57 ± 0.020.57 ± 0.020.29 ± 0.020.75 ± 0.07
LLaMa-2 7B0.36 ± 0.020.27 ± 0.020.56 ± 0.020.37 ± 0.020.44 ± 0.07
Vietcuna 7B0.26 ± 0.020.15 ± 0.010.50 ± 0.000.01 ± 0.010.26 ± 0.06 + Models + + {{ dataset[0] }} +
MixSUra 7B0.61 ± -0.61 ± -0.54 ± -0.31 ± -0.65 ± -
GPT-3.50.92 ± 0.010.74 ± 0.04-0.67 ± 0.010.92 ± 0.04AC↑F1↑AR↑ECE↓A@10↑
GPT-40.92 ± 0.010.74 ± 0.04-0.67 ± 0.010.92 ± 0.04 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/robustness-aware/information-retrieval.md b/_pages/ind/robustness-aware/information-retrieval.md index 0af94b0..87f6914 100644 --- a/_pages/ind/robustness-aware/information-retrieval.md +++ b/_pages/ind/robustness-aware/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/ind/robustness-aware/information-retrieval --- # Robustness-Aware Information Retrieval Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models mMARCO mRobust04
M@10 M@10B N@10 N@10B M@10 M@10B N@10 N@10B
URA-LLaMa 70B
URA-LLaMa 13B
URA-LLaMa 7B0.05 ± 0.000.11 ± 0.000.07 ± 0.000.17 ± 0.00----
LLaMa-2 13B0.06 ± 0.000.13 ± 0.000.19 ± 0.000.19 ± 0.00
LLaMa-2 7B0.05 ± 0.000.11 ± 0.000.08 ± 0.000.16 ± 0.00----
Vietcuna 7B--------
GPT-3.5--------
GPT-4--------
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/robustness-aware/knowledge.md b/_pages/ind/robustness-aware/knowledge.md index 4a390d8..7005673 100644 --- a/_pages/ind/robustness-aware/knowledge.md +++ b/_pages/ind/robustness-aware/knowledge.md @@ -3,114 +3,128 @@ layout: default permalink: /leaderboard/ind/robustness-aware/knowledge --- # Robustness-Aware Knowledge Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models ZaloE2E ViMMRC
EM F1 AC F1 AR ECE A@10
URA-LLaMa 70B0.23 ± 0.000.37 ± 0.000.65 ± 0.000.53 ± 0.000.84 ± 0.000.11 ± 0.000.77 ± 0.00
URA-LLaMa 13B0.18 ± 0.000.30 ± 0.000.41 ± 0.000.34 ± 0.000.61 ± 0.000.22 ± 0.000.58 ± 0.00
URA-LLaMa 7B0.10 ± 0.000.18 ± 0.000.33 ± 0.020.28 ± 0.020.61 ± 0.010.19 ± 0.020.33 ± 0.06
LLaMa-2 13B0.13 ± 0.000.21 ± 0.000.39 ± 0.000.31 ± 0.000.56 ± 0.000.46 ± 0.000.33 ± 0.00
LLaMa-2 7B0.02 ± 0.000.05 ± 0.000.26 ± 0.010.20 ± 0.010.51 ± 0.010.46 ± 0.010.13 ± 0.03
Vietcuna 7B0.05 ± 0.000.15 ± 0.000.26 ± 0.010.14 ± 0.000.50 ± 0.000.01 ± 0.010.21 ± 0.07
MixSUra 8x7B0.13 ± -0.24 ± -0.57 ± -0.45 ± -0.53 ± -0.35 ± -0.58 ± -
GPT-3.50.45 ± 0.010.61 ± 0.010.90 ± 0.010.72 ± 0.04-0.65 ± 0.010.88 ± 0.07
GPT-40.44 ± 0.010.61 ± 0.010.91 ± 0.010.73 ± 0.07-0.66 ± 0.070.88 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/robustness-aware/question-answering.md b/_pages/ind/robustness-aware/question-answering.md index 334e0ac..4a135b6 100644 --- a/_pages/ind/robustness-aware/question-answering.md +++ b/_pages/ind/robustness-aware/question-answering.md @@ -3,84 +3,60 @@ layout: default permalink: /leaderboard/ind/robustness-aware/question-answering --- # Robustness-Aware Question Answering Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + + {% endfor %} - - - - + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in 
site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + + {% endfor %} + + {% endfor %} -
Models XQuAD MLQA + Models + + {{ dataset[0] }} +
EM F1 EM F1 EM↑ F1↑
URA-LLaMa 70B0.01 ± 0.000.17 ± 0.000.01 ± 0.000.18 ± 0.00
URA-LLaMa 13B0.00 ± 0.000.09 ± 0.000.00 ± 0.000.10 ± 0.00
URA-LLaMa 7B0.00 ± 0.000.09 ± 0.000.00 ± 0.000.10 ± 0.00
LLaMa-2 13B0.00 ± 0.000.02 ± 0.000.00 ± 0.000.03 ± 0.00
LLaMa-2 7B0.00 ± 0.000.02 ± 0.000.00 ± 0.000.02 ± 0.00
Vietcuna 7B0.00 ± 0.000.06 ± 0.000.00 ± 0.000.05 ± 0.00
MixSUra 8x7B0.00 ± -0.11 ± -0.00 ± -0.12 ± -
GPT-3.50.00 ± 0.000.19 ± 0.000.00 ± 0.000.20 ± 0.00
GPT-40.00 ± 0.000.24 ± 0.000.00 ± 0.000.25 ± 0.00
+ {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/robustness-aware/sentiment-analysis.md b/_pages/ind/robustness-aware/sentiment-analysis.md index 7741871..343ad5b 100644 --- a/_pages/ind/robustness-aware/sentiment-analysis.md +++ b/_pages/ind/robustness-aware/sentiment-analysis.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/ind/robustness-aware/sentiment-analysis --- # Robustness-Aware Sentiment Analysis Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models VLSP 2016 UiT-VSFC
AC F1 AR ECE A@10 AC F1 AR ECE A@10
URA-LLaMa 70B0.63 ± 0.010.48 ± 0.010.60 ± 0.010.09 ± 0.010.83 ± 0.040.71 ± 0.010.45 ± 0.010.80 ± 0.010.08 ± 0.010.99 ± 0.01
URA-LLaMa 13B0.55 ± 0.020.52 ± 0.020.59 ± 0.010.06 ± 0.010.74 ± 0.050.72 ± 0.010.44 ± 0.050.77 ± 0.010.18 ± 0.010.77 ± 0.02
URA-LLaMa 7B0.52 ± 0.020.36 ± 0.030.59 ± 0.010.07 ± 0.010.66 ± 0.050.73 ± 0.010.41 ± 0.010.71 ± 0.010.16 ± 0.010.87 ± 0.02
LLaMa-2 13B0.46 ± 0.020.30 ± 0.010.55 ± 0.010.39 ± 0.020.70 ± 0.050.66 ± 0.010.40 ± 0.010.63 ± 0.010.11 ± 0.010.89 ± 0.02
LLaMa-2 7B0.45 ± 0.020.36 ± 0.010.54 ± 0.010.20 ± 0.020.51 ± 0.050.51 ± 0.010.33 ± 0.010.65 ± 0.010.15 ± 0.010.80 ± 0.02
Vietcuna 7B0.44 ± 0.020.27 ± 0.010.51 ± 0.010.23 ± 0.020.53 ± 0.050.49 ± 0.010.25 ± 0.030.46 ± 0.010.33 ± 0.010.34 ± 0.03
MixSUra 8x7B0.59 ± -0.59 ± -0.55 ± -0.34 ± -0.52 ± -0.69 ± -0.44 ± -0.61 ± -0.29 ± -0.66 ± -
GPT-3.50.64 ± 0.010.60 ± 0.01-0.31 ± 0.010.54 ± 0.050.86 ± 0.010.71 ± 0.01-0.53 ± 0.010.86 ± 0.02
GPT-40.74 ± 0.000.73 ± 0.00-0.41 ± 0.000.71 ± 0.000.83 ± 0.000.70 ± 0.00-0.50 ± 0.000.85 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/robustness-aware/summarization.md b/_pages/ind/robustness-aware/summarization.md index 1fe681e..a5771c6 100644 --- a/_pages/ind/robustness-aware/summarization.md +++ b/_pages/ind/robustness-aware/summarization.md @@ -3,204 +3,132 @@ layout: default permalink: /leaderboard/ind/robustness-aware/summarization --- # Robustness-Aware Summarization Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models VietNews WikiLingua
R1 R2 RL SC BS Cv De Cp R1 R2 RL SC BS Cv De Cp
URA-LLaMa 70B0.34 ± 0.000.15 ± 0.000.23 ± 0.00-0.06 ± 0.00-0.11 ± 0.180.10 ± 0.000.10 ± 0.0039.63 ± 0.870.28 ± 0.000.11 ± 0.000.19 ± 0.00-0.16 ± 0.000.25 ± 0.230.50 ± 0.010.50 ± 0.01167.42 ± 7.09
URA-LLaMa 13B0.35 ± 0.000.14 ± 0.000.23 ± 0.00-0.09 ± 0.00-0.07 ± 0.170.64 ± 0.000.65 ± 0.00134.65 ± 3.760.20 ± 0.000.07 ± 0.000.13 ± 0.00-0.17 ± 0.000.20 ± 0.110.38 ± 0.000.38 ± 0.00103.69 ± 3.33
URA-LLaMa 7B0.37 ± 0.000.12 ± 0.000.24 ± 0.00-0.10 ± 0.00-0.24 ± 0.180.65 ± 0.000.65 ± 0.0017.92 ± 0.870.37 ± 0.000.12 ± 0.000.24 ± 0.00-0.17 ± 0.000.11 ± 0.180.65 ± 0.000.65 ± 0.0020.49 ± 0.95
LLaMa-2 13B0.05 ± 0.000.01 ± 0.000.04 ± 0.00-0.15 ± 0.00-0.24 ± 0.180.03 ± 0.000.03 ± 0.0055.91 ± 0.650.04 ± 0.000.00 ± 0.000.03 ± 0.00-0.17 ± 0.000.09 ± 0.000.05 ± 0.000.05 ± 0.0066.85 ± 6.72
LLaMa-2 7B0.05 ± 0.000.01 ± 0.000.05 ± 0.00-0.10 ± 0.00-0.19 ± 0.040.07 ± 0.000.07 ± 0.0055.29 ± 0.880.04 ± 0.000.00 ± 0.000.04 ± 0.00-0.17 ± 0.000.15 ± 0.000.06 ± 0.000.06 ± 0.0058.32 ± 3.32
Vietcuna 7B0.03 ± 0.000.01 ± 0.000.02 ± 0.00-0.10 ± 0.00-0.18 ± 0.060.91 ± 0.000.91 ± 0.001026.61 ± 3.860.08 ± 0.000.02 ± 0.000.05 ± 0.00-0.17 ± 0.00-0.19 ± 0.050.78 ± 0.000.78 ± 0.00505.45 ± 8.64
MixSUra 8x7B0.41 ± -0.19 ± -0.26 ± -- ± --0.03 ± -0.86 ± -0.87 ± -29.15 ± -0.46 ± -0.21 ± -0.28 ± -- ± -0.26 ± -0.88 ± -0.98 ± -19.10 ± -
GPT-3.50.34 ± 0.000.19 ± 0.000.23 ± 0.00-0.10 ± 0.000.05 ± 0.140.81 ± 0.000.81 ± 0.00128.44 ± 2.940.39 ± 0.000.19 ± 0.000.25 ± 0.00-0.17 ± 0.000.28 ± 0.110.82 ± 0.000.82 ± 0.00200.90 ± 7.40
GPT-40.39 ± 0.000.21 ± 0.000.26 ± 0.00-0.10 ± 0.090.04 ± 0.000.83 ± 0.000.83 ± 0.7124.48 ± 0.000.45 ± 0.000.20 ± 0.000.27 ± 0.00-0.17 ± 0.000.28 ± 0.000.80 ± 0.030.81 ± 0.0020.40 ± 1.59
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/robustness-aware/text-classification.md b/_pages/ind/robustness-aware/text-classification.md index 07c0e7e..7ae34de 100644 --- a/_pages/ind/robustness-aware/text-classification.md +++ b/_pages/ind/robustness-aware/text-classification.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/ind/robustness-aware/text-classification --- # Robustness-Aware Text Classification Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Models UiT-VSMEC PhoATIS
AC F1 AR ECE A@10 AC F1 AR ECE A@10
URA-LLaMa 70B0.25 ± 0.000.16 ± 0.000.56 ± 0.020.20 ± 0.000.33 ± 0.000.16 ± 0.020.26 ± 0.030.79 ± 0.000.79 ± 0.020.08 ± 0.06
URA-LLaMa 13B0.30 ± 0.000.11 ± 0.000.51 ± 0.010.26 ± 0.000.44 ± 0.000.01 ± 0.010.05 ± 0.010.47 ± 0.010.84 ± 0.010.00 ± 0.04
URA-LLaMa 7B0.29 ± 0.000.10 ± 0.000.57 ± 0.010.17 ± 0.000.30 ± 0.000.02 ± 0.010.04 ± 0.000.55 ± 0.010.18 ± 0.010.01 ± 0.02
LLaMa-2 13B0.19 ± 0.000.07 ± 0.000.52 ± 0.010.47 ± 0.000.43 ± 0.000.02 ± 0.000.06 ± 0.000.57 ± 0.010.91 ± 0.000.01 ± 0.00
LLaMa-2 7B0.17 ± 0.000.10 ± 0.000.55 ± 0.000.33 ± 0.000.29 ± 0.000.01 ± 0.010.00 ± 0.000.56 ± 0.000.69 ± 0.010.02 ± 0.02
Vietcuna 7B0.09 ± 0.000.09 ± 0.000.51 ± 0.010.91 ± 0.000.09 ± 0.000.02 ± 0.010.01 ± 0.000.55 ± 0.010.23 ± 0.010.02 ± 0.01
MixSUra 8x7B0.35 ± -0.27 ± -0.70 ± -0.58 ± -0.70 ± -0.80 ± -55 ± -0.94 ± -0.15 ± -0.88 ± -
GPT-3.50.42 ± 0.000.41 ± 0.00-0.28 ± 0.000.30 ± 0.000.68 ± 0.020.64 ± 0.03-0.62 ± 0.020.70 ± 0.05
GPT-40.48 ± 0.000.45 ± 0.00-0.33 ± 0.000.40 ± 0.000.86 ± 0.010.80 ± 0.02-0.80 ± 0.010.91 ± 0.03
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/robustness-aware/toxicity-detection.md b/_pages/ind/robustness-aware/toxicity-detection.md index e0ac7b4..ce60db1 100644 --- a/_pages/ind/robustness-aware/toxicity-detection.md +++ b/_pages/ind/robustness-aware/toxicity-detection.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/ind/robustness-aware/toxicity-detection --- # Robustness-Aware Toxicity Detection Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models UiT-ViCTSD UiT-ViHSD
AC F1 AR ECE A@10 AC F1 AR ECE A@10
URA-LLaMa 70B0.32 ± 0.000.21 ± 0.000.72 ± 0.010.62 ± 0.000.33 ± 0.000.14 ± 0.000.12 ± 0.000.64 ± 0.020.61 ± 0.000.23 ± 0.00
URA-LLaMa 13B0.27 ± 0.000.26 ± 0.000.56 ± 0.000.56 ± 0.000.12 ± 0.000.18 ± 0.000.11 ± 0.000.57 ± 0.010.45 ± 0.000.20 ± 0.00
URA-LLaMa 7B0.22 ± 0.000.21 ± 0.000.63 ± 0.000.39 ± 0.000.36 ± 0.000.12 ± 0.000.07 ± 0.000.62 ± 0.000.38 ± 0.000.19 ± 0.00
LLaMa-2 13B0.12 ± 0.000.11 ± 0.000.56 ± 0.010.66 ± 0.000.12 ± 0.000.10 ± 0.000.07 ± 0.000.59 ± 0.010.62 ± 0.000.24 ± 0.00
LLaMa-2 7B0.04 ± 0.000.04 ± 0.000.62 ± 0.000.86 ± 0.000.02 ± 0.000.01 ± 0.000.00 ± 0.000.54 ± 0.000.79 ± 0.000.00 ± 0.00
Vietcuna 7B0.11 ± 0.000.11 ± 0.000.54 ± 0.000.39 ± 0.000.13 ± 0.000.09 ± 0.000.05 ± 0.000.5 ± 0.000.24 ± 0.000.08 ± 0.00
MixSUra 8x7B0.72 ± -0.39 ± -- ± -0.25 ± -0.81 ± -0.66 ± -0.31 ± -0.67 ± -0.21 ± -0.82 ± -
GPT-3.50.51 ± 0.000.46 ± 0.000.5 ± 0.000.01 ± 0.000.54 ± 0.000.64 ± 0.000.47 ± 0.00- ± -0.30 ± 0.000.63 ± 0.00
GPT-40.88 ± 0.000.71 ± 0.00- ± -0.38 ± 0.000.88 ± 0.000.78 ± 0.000.56 ± 0.00- ± -0.44 ± 0.000.78 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/robustness-aware/translation.md b/_pages/ind/robustness-aware/translation.md index bcacd2c..b2ae678 100644 --- a/_pages/ind/robustness-aware/translation.md +++ b/_pages/ind/robustness-aware/translation.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/ind/robustness-aware/translation --- # Robustness-Aware Translation Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Models PhoMT OPUS100
(En → Vi) (Vi → En) (En → Vi) (Vi → En) (En → Vi) (Vi → En) (En → Vi) (Vi → En)
URA-LLaMa 70B0.25 ± 0.000.58 ± 0.000.11 ± 0.000.51 ± 0.000.05 ± 0.000.40 ± 0.010.06 ± 0.000.36 ± 0.00
URA-LLaMa 13B0.23 ± 0.000.55 ± 0.000.10 ± 0.000.50 ± 0.000.03 ± 0.000.38 ± 0.010.05 ± 0.000.38 ± 0.00
URA-LLaMa 7B0.15 ± 0.000.48 ± 0.000.06 ± 0.000.46 ± 0.000.02 ± 0.000.35 ± 0.000.03 ± 0.000.34 ± 0.01
LLaMa-2 13B0.20 ± 0.000.51 ± 0.000.07 ± 0.000.44 ± 0.000.03 ± 0.000.36 ± 0.010.04 ± 0.000.32 ± 0.00
LLaMa-2 7B0.13 ± 0.000.41 ± 0.000.05 ± 0.000.42 ± 0.000.02 ± 0.000.31 ± 0.000.03 ± 0.000.30 ± 0.00
Vietcuna 7B0.17 ± 0.000.43 ± 0.000.07 ± 0.010.41 ± 0.000.09 ± 0.010.38 ± 0.010.09 ± 0.010.33 ± 0.00
MixSUra 8x7B0.14 ± -0.50 ± -0.11 ± -0.46 ± -0.06 ± -0.36 ± -0.06 ± -0.31 ± -
GPT-3.50.31 ± 0.000.64 ± 0.000.17 ± 0.000.59 ± 0.000.15 ± 0.010.49 ± 0.010.21 ± 0.010.48 ± 0.00
GPT-40.31 ± 0.000.65 ± 0.000.20 ± 0.000.62 ± 0.000.16 ± 0.010.50 ± 0.010.23 ± 0.010.51 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + BLEU envi↑ + BLEU vien↑ + hLEPOR envi↑ + hLEPOR vien↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + {% assign bleu_envi_best = 0 %} + {% assign bleu_vien_best = 0 %} + {% assign hlepor_envi_best = 0 %} + {% assign hlepor_vien_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["BLEU envi"] and dataset[1][m]["BLEU envi"] > bleu_envi_best %} + {% assign bleu_envi_best = dataset[1][m]["BLEU envi"] %} + {% endif %} + {% if dataset[1][m]["BLEU vien"] and dataset[1][m]["BLEU vien"] > bleu_vien_best %} + {% assign bleu_vien_best = dataset[1][m]["BLEU vien"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR envi"] and dataset[1][m]["hLEPOR envi"] > hlepor_envi_best %} + {% assign hlepor_envi_best = dataset[1][m]["hLEPOR envi"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR vien"] and dataset[1][m]["hLEPOR vien"] > hlepor_vien_best %} + {% assign hlepor_vien_best = dataset[1][m]["hLEPOR vien"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["BLEU envi"] %} + {{ dataset[1][model]["BLEU envi"] | round: 2 }} ± {{ dataset[1][model]["BLEU envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["BLEU vien"] %} + {{ dataset[1][model]["BLEU vien"] | round: 2 }} ± {{ dataset[1][model]["BLEU vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["hLEPOR envi"] %} + {{ dataset[1][model]["hLEPOR envi"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["hLEPOR vien"] %} + {{ dataset[1][model]["hLEPOR vien"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/weaker-prompt/question-answering.md b/_pages/ind/weaker-prompt/question-answering.md index 46bc765..ecadfab 100644 --- a/_pages/ind/weaker-prompt/question-answering.md +++ b/_pages/ind/weaker-prompt/question-answering.md @@ -3,63 +3,60 @@ layout: default permalink: /leaderboard/ind/weaker-prompt/question-answering --- # Weak-Prompt Question Answering Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.21 ± 0.01 | 0.47 ± 0.01 | 0.14 ± 0.01 | 0.41 ± 0.00 |
| URA-LLaMa 13B | 0.22 ± 0.01 | 0.43 ± 0.01 | 0.17 ± 0.01 | 0.40 ± 0.01 |
| URA-LLaMa 7B | 0.13 ± 0.00 | 0.32 ± 0.00 | 0.10 ± 0.00 | 0.32 ± 0.00 |
| LLaMa-2 13B | 0.04 ± 0.00 | 0.28 ± 0.00 | 0.04 ± 0.00 | 0.28 ± 0.00 |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.24 ± 0.00 | 0.05 ± 0.00 | 0.24 ± 0.00 |
| MixSUra 8x7B | 0.13 ± - | 0.38 ± - | 0.09 ± - | 0.36 ± - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/weaker-prompt/summarization.md b/_pages/ind/weaker-prompt/summarization.md index 16f89b5..0968087 100644 --- a/_pages/ind/weaker-prompt/summarization.md +++ b/_pages/ind/weaker-prompt/summarization.md @@ -3,147 +3,132 @@ layout: default permalink: /leaderboard/ind/weaker-prompt/summarization --- # Weak-Prompt Summarization Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews R1 | VietNews R2 | VietNews RL | VietNews SC | VietNews BS | VietNews Cv | VietNews De | VietNews Cp | WikiLingua R1 | WikiLingua R2 | WikiLingua RL | WikiLingua SC | WikiLingua BS | WikiLingua Cv | WikiLingua De | WikiLingua Cp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.49 ± 0.00 | 0.23 ± 0.00 | 0.31 ± 0.00 | -0.08 ± 0.00 | 0.05 ± 0.11 | 0.89 ± 0.00 | 8.90 ± 0.03 | 18.48 ± 0.59 | 0.47 ± 0.00 | 0.20 ± 0.00 | 0.29 ± 0.00 | -0.16 ± 0.00 | 0.19 ± 0.13 | 0.86 ± 0.00 | 6.83 ± 0.09 | 25.30 ± 1.86 |
| URA-LLaMa 13B | 0.27 ± 0.00 | 0.12 ± 0.00 | 0.18 ± 0.00 | -0.09 ± 0.00 | 0.05 ± 0.11 | 0.56 ± 0.00 | 5.00 ± 0.04 | 153.55 ± 0.99 | 0.22 ± 0.00 | 0.09 ± 0.00 | 0.14 ± 0.00 | -0.16 ± 0.00 | 0.20 ± 0.007 | 0.48 ± 0.00 | 3.49 ± 0.04 | 190.09 ± 4.92 |
| URA-LLaMa 7B | 0.45 ± 0.00 | 0.21 ± 0.00 | 0.29 ± 0.00 | -0.08 ± 0.00 | 0.03 ± 0.09 | 0.91 ± 0.00 | 9.43 ± 0.03 | 6.42 ± 0.05 | 0.42 ± 0.00 | 0.18 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.07 ± 0.12 | 0.89 ± 0.00 | 7.58 ± 0.05 | 7.14 ± 0.14 |
| LLaMa-2 13B | 0.45 ± 0.00 | 0.22 ± 0.00 | 0.29 ± 0.00 | -0.09 ± 0.00 | 0.00 ± 0.14 | 0.92 ± 0.00 | 9.49 ± 0.02 | 8.46 ± 0.29 | 0.47 ± 0.00 | 0.22 ± 0.00 | 0.29 ± 0.00 | -0.16 ± 0.00 | 0.34 ± 0.12 | 0.92 ± 0.00 | 9.39 ± 0.05 | 17.94 ± 2.84 |
| LLaMa-2 7B | 0.36 ± 0.00 | 0.17 ± 0.00 | 0.23 ± 0.00 | -0.09 ± 0.00 | -0.15 ± 0.12 | 0.69 ± 0.00 | 6.35 ± 0.03 | 7.59 ± 0.21 | 0.45 ± 0.00 | 0.20 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.36 ± 0.00 | 0.83 ± 0.00 | 7.71 ± 0.07 | 12.39 ± 1.46 |
| MixSUra 8x7B | 0.44 ± - | 0.22 ± - | 0.29 ± - | - ± - | 0.07 ± - | 0.97 ± - | 35.67 ± - | 9.43 ± - | 0.47 ± - | 0.22 ± - | 0.29 ± - | - ± - | 0.19 ± - | 0.97 ± - | 28.97 ± - | 10.27 ± - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/information-retrieval.md b/_pages/ind/zero-shot/information-retrieval.md index c7d20e4..5609ce6 100644 --- a/_pages/ind/zero-shot/information-retrieval.md +++ b/_pages/ind/zero-shot/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/ind/zero-shot/information-retrieval --- # Zero-Shot Information Retrieval Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
| Models | mMARCO M@10 | mMARCO M@10B | mMARCO N@10 | mMARCO N@10B | mRobust04 M@10 | mRobust04 M@10B | mRobust04 N@10 | mRobust04 N@10B |
|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | - | - | - | - | - | - | - | - |
| URA-LLaMa 13B | - | - | - | - | - | - | - | - |
| URA-LLaMa 7B | 0.06 ± 0.00 | 0.14 ± 0.00 | 0.09 ± 0.00 | 0.21 ± 0.00 | - | - | - | - |
| LLaMa-2 13B | - | - | - | - | - | - | - | - |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.11 ± 0.00 | 0.08 ± 0.00 | 0.17 ± 0.00 | - | - | - | - |
| Vietcuna 7B | - | - | - | - | - | - | - | - |
| GPT-3.5 | - | - | - | - | - | - | - | - |
| GPT-4 | - | - | - | - | - | - | - | - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/knowledge.md b/_pages/ind/zero-shot/knowledge.md index f610dbf..4f366ef 100644 --- a/_pages/ind/zero-shot/knowledge.md +++ b/_pages/ind/zero-shot/knowledge.md @@ -2,105 +2,129 @@ layout: default permalink: /leaderboard/ind/zero-shot/knowledge --- -# Zero-Shot Knowledge Leaderboard +# Zero-shot Knowledge Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | ZaloE2E EM | ZaloE2E F1 | ViMMRC AC | ViMMRC F1 | ViMMRC AR | ViMMRC ECE | ViMMRC A@10 |
|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.28 ± 0.02 | 0.44 ± 0.02 | 0.80 ± 0.02 | 0.80 ± 0.02 | 0.85 ± 0.01 | 0.10 ± 0.02 | 0.96 ± 0.03 |
| URA-LLaMa 13B | 0.12 ± 0.01 | 0.22 ± 0.01 | 0.40 ± 0.02 | 0.31 ± 0.02 | 0.57 ± 0.02 | 0.48 ± 0.02 | 0.42 ± 0.08 |
| URA-LLaMa 7B | 0.09 ± 0.01 | 0.20 ± 0.02 | 0.30 ± 0.02 | 0.10 ± 0.01 | 0.56 ± 0.02 | 0.27 ± 0.02 | 0.56 ± 0.07 |
| LLaMa-2 13B | 0.06 ± 0.01 | 0.10 ± 0.01 | 0.52 ± 0.02 | 0.41 ± 0.02 | 0.64 ± 0.02 | 0.33 ± 0.02 | 0.73 ± 0.07 |
| LLaMa-2 7B | 0.03 ± 0.01 | 0.07 ± 0.01 | 0.37 ± 0.02 | 0.25 ± 0.02 | 0.51 ± 0.02 | 0.35 ± 0.02 | 0.29 ± 0.06 |
| Vietcuna 7B | 0.03 ± 0.01 | 0.06 ± 0.01 | 0.32 ± 0.02 | 0.22 ± 0.02 | 0.50 ± 0.00 | 0.07 ± 0.02 | 0.33 ± 0.07 |
| GPT-3.5 | 0.37 ± 0.02 | 0.56 ± 0.02 | 0.90 ± 0.01 | 0.72 ± 0.01 | - | 0.65 ± 0.01 | 0.90 ± 0.04 |
| GPT-4 | 0.38 ± 0.02 | 0.55 ± 0.02 | 0.92 ± 0.01 | 0.73 ± 0.06 | - | 0.67 ± 0.01 | 0.90 ± 0.04 |
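The zero-shot knowledge template that follows branches on `dataset[1].num_fields == 2` to choose between the two-column EM/F1 layout (ZaloE2E) and the five-column AC/F1/AR/ECE/A@10 layout (ViMMRC). A hedged sketch of the data shape this suggests — nesting and the `num_fields` placement are assumptions; the keys and sample values are taken from the template and the table above:

```yaml
# _data/leaderboard/<lang>/zero_shot/knowledge.yml — assumed layout
ZaloE2E:
  num_fields: 2            # selects the EM/F1 branch of the template
  URA-LLaMa 70B:
    EM: 0.28
    EM_std: 0.02
    F1: 0.44
    F1_std: 0.02
ViMMRC:                    # no num_fields == 2, so AC/F1/AR/ECE/A@10 are rendered
  URA-LLaMa 70B:
    AC: 0.80
    AC_std: 0.02
    F1: 0.80
    F1_std: 0.02
    AR: 0.85
    AR_std: 0.01
    ECE: 0.10
    ECE_std: 0.02
    "A@10": 0.96
    "A@10_std": 0.03
```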
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/language-modeling.md b/_pages/ind/zero-shot/language-modeling.md index 6e42b7f..863b070 100644 --- a/_pages/ind/zero-shot/language-modeling.md +++ b/_pages/ind/zero-shot/language-modeling.md @@ -3,149 +3,108 @@ layout: default permalink: /leaderboard/ind/zero-shot/language-modeling --- # Zero-Shot Language Modeling Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | MLQA-MLM EM | MLQA-MLM CER | MLQA-MLM WER | MLQA-MLM CED | MLQA-MLM WED | MLQA-MLM PLX | VSEC EM | VSEC CER | VSEC WER | VSEC CED | VSEC WED | VSEC PLX |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.00 ± 0.00 | 0.50 ± 0.01 | 0.64 ± 0.01 | 519.09 ± 10.96 | 115.82 ± 2.45 | 1.08 ± 0.01 | 0.00 ± 0.00 | 0.88 ± 0.00 | 1.01 ± 0.00 | 113.51 ± 0.57 | 29.91 ± 0.15 | 1.09 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.67 ± 0.00 | 0.78 ± 0.00 | 697.85 ± 11.62 | 161.34 ± 2.64 | 1.16 ± 0.02 | 0.01 ± 0.00 | 0.42 ± 0.01 | 0.56 ± 0.01 | 54.88 ± 0.77 | 14.50 ± 0.19 | 1.26 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.73 ± 0.00 | 0.88 ± 0.01 | 684.00 ± 13.18 | 166.87 ± 3.18 | 1.25 ± 0.01 | 0.01 ± 0.00 | 3.33 ± 0.04 | 3.14 ± 0.03 | 420.34 ± 5.66 | 85.79 ± 0.96 | 1.33 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.90 ± 0.00 | 1.00 ± 0.00 | 881.97 ± 11.23 | 208.52 ± 2.52 | 1.10 ± 0.01 | 0.00 ± 0.00 | 1.32 ± 0.01 | 1.40 ± 0.01 | 160.06 ± 1.16 | 38.12 ± 0.23 | 1.11 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.95 ± 0.00 | 1.07 ± 0.01 | 860.42 ± 13.18 | 210.21 ± 3.18 | 1.25 ± 0.01 | 0.00 ± 0.00 | 1.54 ± 0.04 | 1.55 ± 0.03 | 171.28 ± 5.66 | 40.18 ± 0.96 | 1.14 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 1.00 ± 0.00 | 1.00 ± 0.00 | 951.53 ± 12.37 | 208.57 ± 2.73 | 1.48 ± 0.01 | 0.01 ± 0.00 | 1.11 ± 0.01 | 1.20 ± 0.01 | 139.90 ± 1.39 | 33.94 ± 0.33 | 1.61 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.34 ± 0.01 | 0.50 ± 0.01 | 422.30 ± 10.79 | 100.33 ± 2.44 | - | 0.02 ± 0.00 | 0.16 ± 0.00 | 0.30 ± 0.00 | 12.63 ± 0.34 | 3.48 ± 0.09 | - |
| GPT-4 | 0.04 ± 0.00 | 0.40 ± 0.01 | 0.45 ± 0.01 | 381.88 ± 10.26 | 93.34 ± 2.39 | - | 0.60 ± 0.01 | 0.14 ± 0.00 | 0.26 ± 0.00 | 13.58 ± 0.45 | 3.67 ± 0.12 | - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + EM↑ + CER↓ + WER↓ + CED↓ + WED↓ + PLX↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/question-answering.md b/_pages/ind/zero-shot/question-answering.md index c657a45..a453107 100644 --- a/_pages/ind/zero-shot/question-answering.md +++ b/_pages/ind/zero-shot/question-answering.md @@ -3,77 +3,60 @@ layout: default permalink: /leaderboard/ind/zero-shot/question-answering --- # Zero-Shot Question Answering Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.06 ± 0.00 | 0.30 ± 0.00 | 0.04 ± 0.00 | 0.28 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.14 ± 0.00 | 0.00 ± 0.00 | 0.15 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.14 ± 0.00 | 0.00 ± 0.00 | 0.16 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.02 | 0.05 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.05 ± 0.00 | 0.00 ± 0.00 | 0.06 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.24 ± 0.00 | 0.00 ± 0.00 | 0.25 ± 0.00 |
| GPT-4 | 0.00 ± 0.00 | 0.27 ± 0.00 | 0.00 ± 0.00 | 0.27 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/reasoning.md b/_pages/ind/zero-shot/reasoning.md index 60b3834..2a20d6d 100644 --- a/_pages/ind/zero-shot/reasoning.md +++ b/_pages/ind/zero-shot/reasoning.md @@ -3,123 +3,72 @@ layout: default permalink: /leaderboard/ind/zero-shot/reasoning --- # Zero-Shot Reasoning Leaderboard +{% assign lang = 'ind' %} - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + + {% endfor %} - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m]["Equ"] and dataset[1][m]["Equ"] > Equ_best %} + {% assign Equ_best = dataset[1][m]["Equ"] %} + {% endif %} + {% endfor %} + + + + {% endfor %} + {% endfor %} -
ModelsSR - NaturalSR - Abstract symbolMATH + Models + + {{ dataset[0] }} +
EMF1Equ.EMF1Equ.EMF1Equ.EM↑F1↑Equ↑
| Models | SR - Natural EM | SR - Natural F1 | SR - Natural Equ. | SR - Abstract symbol EM | SR - Abstract symbol F1 | SR - Abstract symbol Equ. | MATH EM | MATH F1 | MATH Equ. |
|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.06 ± 0.00 | 0.34 ± 0.00 | 0.06 ± 0.00 | 0.02 ± 0.00 | 0.24 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.24 ± 0.02 |
| URA-LLaMa 13B | 0.01 ± 0.00 | 0.31 ± 0.00 | 0.02 ± 0.00 | 0.02 ± 0.00 | 0.24 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.14 ± 0.02 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.26 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.17 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.01 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.06 ± 0.00 | 0.00 ± 0.00 | 0.02 ± 0.00 | 0.19 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.16 ± 0.02 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.06 ± 0.01 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.10 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 |
| GPT-3.5 | 0.21 ± 0.00 | 0.59 ± 0.00 | 0.32 ± 0.00 | 0.09 ± 0.00 | 0.28 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.72 ± 0.02 |
GPT-40.21 ± 0.000.59 ± 0.000.32 ± 0.000.09 ± 0.000.28 ± 0.000.13 ± 0.000.00 ± 0.000.01 ± 0.000.76 ± 0.02 + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["Equ"] %} + {{ dataset[1][model]["Equ"] | round: 2 }} ± {{ dataset[1][model]["Equ_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/zero-shot/sentiment-analysis.md b/_pages/ind/zero-shot/sentiment-analysis.md index 320d74d..e2ee956 100644 --- a/_pages/ind/zero-shot/sentiment-analysis.md +++ b/_pages/ind/zero-shot/sentiment-analysis.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/ind/zero-shot/sentiment-analysis --- # Zero-Shot Sentiment Analysis Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsVLSP 2016UiT-VSFC + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
| Models | VLSP 2016 AC | VLSP 2016 F1 | VLSP 2016 AR | VLSP 2016 ECE | VLSP 2016 A@10 | UiT-VSFC AC | UiT-VSFC F1 | UiT-VSFC AR | UiT-VSFC ECE | UiT-VSFC A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.63 ± 0.02 | 0.63 ± 0.02 | 0.74 ± 0.01 | 0.15 ± 0.01 | 0.87 ± 0.03 | 0.64 ± 0.01 | 0.54 ± 0.01 | 0.85 ± 0.01 | 0.14 ± 0.00 | 0.98 ± 0.01 |
| URA-LLaMa 13B | 0.52 ± 0.02 | 0.35 ± 0.01 | 0.60 ± 0.01 | 0.10 ± 0.01 | 0.64 ± 0.05 | 0.70 ± 0.01 | 0.40 ± 0.01 | 0.72 ± 0.01 | 0.23 ± 0.01 | 0.95 ± 0.01 |
| URA-LLaMa 7B | 0.35 ± 0.02 | 0.24 ± 0.01 | 0.54 ± 0.01 | 0.24 ± 0.01 | 0.31 ± 0.05 | 0.27 ± 0.01 | 0.18 ± 0.00 | 0.52 ± 0.01 | 0.37 ± 0.01 | 0.03 ± 0.01 |
| LLaMa-2 13B | 0.25 ± 0.01 | 0.25 ± 0.01 | 0.49 ± 0.01 | 0.39 ± 0.01 | 0.29 ± 0.05 | 0.29 ± 0.01 | 0.24 ± 0.01 | 0.52 ± 0.01 | 0.42 ± 0.01 | 0.30 ± 0.03 |
| LLaMa-2 7B | 0.15 ± 0.01 | 0.15 ± 0.01 | 0.58 ± 0.01 | 0.73 ± 0.01 | 0.12 ± 0.03 | 0.04 ± 0.00 | 0.06 ± 0.01 | 0.49 ± 0.01 | 0.79 ± 0.00 | 0.01 ± 0.01 |
| Vietcuna 7B | 0.11 ± 0.01 | 0.12 ± 0.01 | 0.49 ± 0.01 | 0.68 ± 0.01 | 0.11 ± 0.03 | 0.05 ± 0.00 | 0.06 ± 0.00 | 0.56 ± 0.01 | 0.73 ± 0.00 | 0.05 ± 0.01 |
| MixSUra 8x7B | 0.45 ± - | 0.30 ± - | 0.62 ± - | 0.50 ± - | 0.49 ± - | 0.55 ± - | 0.40 ± - | 0.66 ± - | 0.41 ± - | 0.60 ± - |
| Gemini Pro | 0.64 ± - | 0.47 ± - | - | 0.31 ± - | 0.53 ± - | 0.76 ± - | 0.49 ± - | - | 0.43 ± - | 0.77 ± - |
| GPT-3.5 | 0.62 ± 0.02 | 0.56 ± 0.01 | - | 0.29 ± 0.02 | 0.62 ± 0.05 | 0.81 ± 0.31 | 0.68 ± 0.31 | - | 0.48 ± 0.01 | 0.83 ± 0.02 |
GPT-40.71 ± 0.010.68 ± 0.01-0.37 ± 0.010.70 ± 0.040.80 ± 0.010.67 ± 0.01-0.47 ± 0.010.85 ± 0.02 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/ind/zero-shot/summarization.md b/_pages/ind/zero-shot/summarization.md index 8f1c268..f81e374 100644 --- a/_pages/ind/zero-shot/summarization.md +++ b/_pages/ind/zero-shot/summarization.md @@ -3,185 +3,132 @@ layout: default permalink: /leaderboard/ind/zero-shot/summarization --- # Zero-Shot Summarization Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews R1 | VietNews R2 | VietNews RL | VietNews SC | VietNews BS | VietNews Cv | VietNews De | VietNews Cp | WikiLingua R1 | WikiLingua R2 | WikiLingua RL | WikiLingua SC | WikiLingua BS | WikiLingua Cv | WikiLingua De | WikiLingua Cp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.42 ± 0.17 | 0.21 ± 0.12 | 0.28 ± 0.00 | -0.11 ± 0.00 | 0.03 ± 0.19 | 0.85 ± 0.00 | 14.59 ± 0.05 | 17.21 ± 0.33 | 0.37 ± 0.00 | 0.16 ± 0.00 | 0.24 ± 0.00 | -0.22 ± 0.00 | 0.26 ± 0.16 | 0.17 ± 0.00 | 0.22 ± 0.00 | 22.24 ± 0.97 |
| URA-LLaMa 13B | 0.38 ± 0.00 | 0.18 ± 0.00 | 0.25 ± 0.00 | -0.09 ± 0.00 | 0.01 ± 0.18 | 0.71 ± 0.00 | 6.01 ± 0.07 | 24.27 ± 0.61 | 0.22 ± 0.00 | 0.08 ± 0.00 | 0.14 ± 0.00 | -0.16 ± 0.00 | -0.13 ± 0.12 | 0.42 ± 0.01 | 3.06 ± 0.10 | 49.58 ± 1.16 |
| URA-LLaMa 7B | 0.38 ± 0.00 | 0.14 ± 0.00 | 0.25 ± 0.00 | -0.09 ± 0.00 | 0.04 ± 0.12 | 0.65 ± 0.00 | 4.88 ± 0.03 | 7.77 ± 0.05 | 0.40 ± 0.00 | 0.15 ± 0.00 | 0.26 ± 0.00 | -0.16 ± 0.00 | 0.19 ± 0.07 | 0.73 ± 0.00 | 4.79 ± 0.07 | 6.22 ± 0.07 |
| LLaMa-2 13B | 0.06 ± 0.00 | 0.02 ± 0.00 | 0.04 ± 0.00 | -0.09 ± 0.00 | -0.18 ± 0.04 | 0.07 ± 0.00 | 0.43 ± 0.01 | 28.25 ± 0.24 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 | -0.16 ± 0.00 | -0.11 ± 0.08 | 0.03 ± 0.00 | 0.07 ± 0.01 | 19.55 ± 0.51 |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.01 ± 0.00 | 0.05 ± 0.00 | -0.09 ± 0.00 | -0.23 ± 0.04 | 0.06 ± 0.00 | 0.21 ± 0.00 | 15.75 ± 0.20 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 | -0.16 ± 0.00 | -0.14 ± 0.07 | 0.03 ± 0.00 | 0.06 ± 0.00 | 17.84 ± 0.50 |
| Vietcuna 7B | 0.28 ± 0.00 | 0.06 ± 0.00 | 0.18 ± 0.00 | -0.09 ± 0.00 | -0.09 ± 0.09 | 0.31 ± 0.00 | 0.80 ± 0.01 | 171.63 ± 1.71 | 0.24 ± 0.00 | 0.06 ± 0.00 | 0.15 ± 0.00 | -0.16 ± 0.00 | -0.18 ± 0.07 | 0.51 ± 0.01 | 1.16 ± 0.01 | 238.67 ± 3.37 |
| GPT-3.5 | 0.36 ± 0.00 | 0.20 ± 0.00 | 0.24 ± 0.00 | -0.09 ± 0.00 | 0.04 ± 0.13 | 0.86 ± 0.00 | 3.97 ± 0.02 | 13.32 ± 0.65 | 0.43 ± 0.00 | 0.21 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.22 ± 0.03 | 0.87 ± 0.00 | 3.29 ± 0.03 | 35.50 ± 0.82 |
| GPT-4 | 0.41 ± 0.00 | 0.21 ± 0.00 | 0.26 ± 0.00 | -0.08 ± 0.00 | -0.04 ± 0.11 | 0.84 ± 0.00 | 3.45 ± 0.00 | 15.43 ± 0.49 | 0.44 ± 0.00 | 0.21 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.24 ± 0.04 | 0.82 ± 0.00 | 2.37 ± 0.01 | 6.61 ± 0.16 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/text-classification.md b/_pages/ind/zero-shot/text-classification.md index ca8ebdb..0d0fb92 100644 --- a/_pages/ind/zero-shot/text-classification.md +++ b/_pages/ind/zero-shot/text-classification.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/ind/zero-shot/text-classification --- # Zero-Shot Text Classification Leaderboard +{% assign lang = 'ind' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
| Models | UiT-VSMEC AC | UiT-VSMEC F1 | UiT-VSMEC AR | UiT-VSMEC ECE | UiT-VSMEC A@10 | PhoATIS AC | PhoATIS F1 | PhoATIS AR | PhoATIS ECE | PhoATIS A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.40 ± 0.02 | 0.32 ± 0.02 | 0.68 ± 0.01 | 0.14 ± 0.02 | 0.60 ± 0.06 | 0.56 ± 0.02 | 0.48 ± 0.03 | 0.85 ± 0.00 | 0.25 ± 0.02 | 0.56 ± 0.06 |
| URA-LLaMa 13B | 0.29 ± 0.02 | 0.25 ± 0.02 | 0.52 ± 0.01 | 0.09 ± 0.01 | 0.23 ± 0.05 | 0.10 ± 0.01 | 0.10 ± 0.01 | 0.72 ± 0.00 | 0.52 ± 0.01 | 0.14 ± 0.04 |
| URA-LLaMa 7B | 0.13 ± 0.01 | 0.11 ± 0.01 | 0.50 ± 0.01 | 0.15 ± 0.01 | 0.21 ± 0.05 | 0.04 ± 0.01 | 0.04 ± 0.02 | 0.77 ± 0.00 | 0.30 ± 0.01 | 0.04 ± 0.02 |
| LLaMa-2 13B | 0.11 ± 0.01 | 0.10 ± 0.01 | 0.49 ± 0.01 | 0.31 ± 0.01 | 0.09 ± 0.04 | 0.03 ± 0.01 | 0.02 ± 0.00 | 0.45 ± 0.01 | 0.28 ± 0.01 | 0.03 ± 0.02 |
| LLaMa-2 7B | 0.07 ± 0.01 | 0.08 ± 0.01 | 0.52 ± 0.01 | 0.35 ± 0.01 | 0.07 ± 0.03 | 0.00 ± 0.06 | 0.00 ± 0.06 | 0.61 ± 0.01 | 0.32 ± 0.00 | 0.00 ± 0.00 |
| Vietcuna 7B | 0.05 ± 0.01 | 0.02 ± 0.01 | 0.52 ± 0.01 | 0.95 ± 0.01 | 0.03 ± 0.02 | 0.05 ± 0.01 | 0.01 ± 0.00 | 0.66 ± 0.00 | 0.20 ± 0.01 | 0.01 ± 0.21 |
| GPT-3.5 | 0.43 ± 0.02 | 0.37 ± 0.02 | - | 0.29 ± 0.02 | 0.43 ± 0.06 | 0.44 ± 0.02 | 0.38 ± 0.03 | - | 0.38 ± 0.02 | 0.44 ± 0.05 |
| GPT-4 | 0.49 ± 0.02 | 0.46 ± 0.02 | - | 0.35 ± 0.02 | 0.50 ± 0.06 | 0.89 ± 0.01 | 0.69 ± 0.02 | - | 0.83 ± 0.01 | 0.89 ± 0.03 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/ind/zero-shot/toxicity-detection.md b/_pages/ind/zero-shot/toxicity-detection.md index 00e7152..eb3e0a9 100644 --- a/_pages/ind/zero-shot/toxicity-detection.md +++ b/_pages/ind/zero-shot/toxicity-detection.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/ind/zero-shot/toxicity-detection --- # Zero-Shot Toxicity Detection Leaderboard +{% assign lang = 'ind' %} - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign 
AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsUiT-ViCTSDUiT-ViHSD + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
| Models | UiT-ViCTSD AC | UiT-ViCTSD F1 | UiT-ViCTSD AR | UiT-ViCTSD ECE | UiT-ViCTSD A@10 | UiT-ViHSD AC | UiT-ViHSD F1 | UiT-ViHSD AR | UiT-ViHSD ECE | UiT-ViHSD A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.61 ± 0.01 | 0.52 ± 0.01 | 0.77 ± 0.01 | 0.17 ± 0.01 | 0.97 ± 0.01 | 0.38 ± 0.01 | 0.34 ± 0.01 | 0.74 ± 0.01 | 0.25 ± 0.01 | 0.91 ± 0.01 |
| URA-LLaMa 13B | 0.46 ± 0.01 | 0.28 ± 0.03 | 0.53 ± 0.02 | 0.22 ± 0.01 | 0.48 ± 0.03 | 0.33 ± 0.01 | 0.18 ± 0.00 | 0.60 ± 0.01 | 0.35 ± 0.01 | 0.54 ± 0.02 |
| URA-LLaMa 7B | 0.25 ± 0.01 | 0.19 ± 0.01 | 0.53 ± 0.01 | 0.38 ± 0.01 | 0.13 ± 0.02 | 0.19 ± 0.00 | 0.13 ± 0.00 | 0.55 ± 0.01 | 0.46 ± 0.01 | 0.13 ± 0.01 |
| LLaMa-2 13B | 0.16 ± 0.01 | 0.14 ± 0.00 | 0.40 ± 0.01 | 0.50 ± 0.01 | 0.24 ± 0.02 | 0.09 ± 0.00 | 0.13 ± 0.00 | 0.38 ± 0.01 | 0.63 ± 0.00 | 0.10 ± 0.01 |
| LLaMa-2 7B | 0.13 ± 0.01 | 0.14 ± 0.01 | 0.45 ± 0.02 | 0.69 ± 0.01 | 0.09 ± 0.01 | 0.03 ± 0.00 | 0.05 ± 0.01 | 0.56 ± 0.01 | 0.75 ± 0.00 | 0.00 ± 0.00 |
| Vietcuna 7B | 0.09 ± 0.00 | 0.07 ± 0.00 | 0.50 ± 0.00 | 0.41 ± 0.00 | 0.10 ± 0.03 | 0.07 ± 0.00 | 0.04 ± 0.00 | 0.50 ± 0.00 | 0.26 ± 0.00 | 0.07 ± 0.01 |
| GPT-3.5 | 0.75 ± 0.01 | 0.61 ± 0.02 | - | 0.25 ± 0.01 | 0.80 ± 0.04 | 0.55 ± 0.01 | 0.42 ± 0.01 | - | 0.22 ± 0.01 | 0.55 ± 0.02 |
GPT-40.89 ± 0.010.69 ± 0.01-0.39 ± 0.010.89 ± 0.030.75 ± 0.010.53 ± 0.01-0.42 ± 0.010.75 ± 0.02 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/bias-toxicity/question-answering.md b/_pages/kr/bias-toxicity/question-answering.md index 40fbb60..9e224f5 100644 --- a/_pages/kr/bias-toxicity/question-answering.md +++ b/_pages/kr/bias-toxicity/question-answering.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/kr/bias-toxicity/question-answering --- # Bias-Toxicity Question Answering Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD DRR | XQuAD DRG | XQuAD SAR | XQuAD SAG | XQuAD Tox | MLQA DRR | MLQA DRG | MLQA SAR | MLQA SAG | MLQA Tox |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | - | 0.39 ± 0.01 | - | 0.41 ± 0.00 | 0.02 ± 0.00 | - | 0.14 ± 0.02 | - | 0.42 ± 0.03 | 0.02 ± 0.00 |
| URA-LLaMa 13B | - | 0.39 ± 0.01 | - | 0.45 ± 0.01 | 0.02 ± 0.00 | - | 0.17 ± 0.1 | - | 0.38 ± 0.00 | 0.02 ± 0.00 |
| URA-LLaMa 7B | - | 0.43 ± 0.01 | - | 0.48 ± 0.00 | 0.03 ± 0.00 | - | 0.18 ± 0.01 | - | 0.37 ± 0.01 | 0.02 ± 0.00 |
| LLaMa-2 13B | - | 0.35 ± 0.03 | - | 0.46 ± 0.00 | 0.01 ± 0.00 | - | 0.27 ± 0.01 | - | 0.43 ± 0.00 | 0.01 ± 0.00 |
| LLaMa-2 7B | - | 0.46 ± 0.01 | - | 0.42 ± 0.00 | 0.01 ± 0.00 | - | 0.21 ± 0.06 | - | 0.45 ± 0.00 | 0.01 ± 0.00 |
| Vietcuna 7B | - | 0.50 ± 0.00 | - | - | 0.04 ± 0.00 | - | 0.23 ± 0.09 | - | 0.49 ± 0.01 | 0.04 ± 0.00 |
| GPT-3.5 | - | 0.43 ± 0.01 | - | 0.48 ± 0.00 | 0.02 ± 0.00 | - | 0.18 ± 0.01 | - | 0.40 ± 0.00 | 0.02 ± 0.00 |
| GPT-4 | - | 0.40 ± 0.01 | - | 0.45 ± 0.00 | 0.02 ± 0.00 | - | 0.16 ± 0.01 | - | 0.41 ± 0.01 | 0.02 ± 0.00 |
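In the table above, DRR and SAR are blank for every model, and the new template prints `-` whenever a metric key is missing, so those columns can simply be omitted from the data. A sketch of the per-dataset entry implied by the template's lookups — the file path and nesting are assumptions; the keys and sample values mirror the template and the URA-LLaMa 70B row:

```yaml
# _data/leaderboard/<lang>/bias_toxicity/question_answering.yml — assumed layout
XQuAD:
  URA-LLaMa 70B:
    DRG: 0.39              # read as dataset[1][model].DRG
    DRG_std: 0.01
    SAG: 0.41
    SAG_std: 0.00
    Tox: 0.02
    Tox_std: 0.00
    # DRR / SAR left out -> each {% if %} guard falls through to "-"
```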
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/bias-toxicity/summarization.md b/_pages/kr/bias-toxicity/summarization.md index 4a632c6..14c703a 100644 --- a/_pages/kr/bias-toxicity/summarization.md +++ b/_pages/kr/bias-toxicity/summarization.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/kr/bias-toxicity/summarization --- # Bias-Toxicity Summarization Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews DRR | VietNews DRG | VietNews SAR | VietNews SAG | VietNews Tox | WikiLingua DRR | WikiLingua DRG | WikiLingua SAR | WikiLingua SAG | WikiLingua Tox |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | - | 0.21 ± 0.01 | - | 0.31 ± 0.01 | 0.05 ± 0.00 | - | 0.03 ± 0.02 | - | 0.25 ± 0.02 | 0.03 ± 0.00 |
| URA-LLaMa 13B | - | 0.20 ± 0.01 | - | 0.29 ± 0.01 | 0.04 ± 0.00 | - | 0.07 ± 0.04 | - | 0.31 ± 0.03 | 0.02 ± 0.00 |
| URA-LLaMa 7B | - | 0.24 ± 0.02 | - | 0.33 ± 0.01 | 0.04 ± 0.00 | - | 0.07 ± 0.02 | - | 0.38 ± 0.02 | 0.03 ± 0.00 |
| LLaMa-2 13B | - | 0.26 ± 0.01 | - | 0.38 ± 0.01 | 0.01 ± 0.00 | - | 0.17 ± 0.08 | - | 0.50 ± 0.02 | 0.01 ± 0.00 |
| LLaMa-2 7B | - | 0.28 ± 0.02 | - | 0.39 ± 0.01 | 0.01 ± 0.00 | - | 0.39 ± 0.05 | - | 0.50 ± 0.02 | 0.01 ± 0.00 |
| Vietcuna 7B | - | 0.21 ± 0.02 | - | 0.32 ± 0.02 | 0.04 ± 0.00 | - | 0.17 ± 0.04 | - | 0.39 ± 0.03 | 0.03 ± 0.00 |
| GPT-3.5 | - | 0.22 ± 0.01 | - | 0.29 ± 0.01 | 0.04 ± 0.00 | - | 0.03 ± 0.02 | - | 0.28 ± 0.01 | 0.02 ± 0.00 |
| GPT-4 | - | 0.19 ± 0.01 | - | 0.28 ± 0.01 | 0.06 ± 0.00 | - | 0.09 ± 0.02 | - | 0.28 ± 0.01 | 0.02 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/bias-toxicity/translation.md b/_pages/kr/bias-toxicity/translation.md index 5cb9225..1ea89b6 100644 --- a/_pages/kr/bias-toxicity/translation.md +++ b/_pages/kr/bias-toxicity/translation.md @@ -3,264 +3,94 @@ layout: default permalink: /leaderboard/kr/bias-toxicity/translation --- # Bias-Toxicity Translation Leaderboard +{% assign lang = 'kr' %} - - + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + + + + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsPhoMT (En - Vi)OPUS100 (En - Vi) + {{ dataset[0] }} +
DRR$→|DRG$→|SAR$→|SAG$→|ToxDRR$→|DRG$→|SAR$→|SAG$→|ToxDRR↓DRG↓SAR↓SAG↓Tox↓
| Models | PhoMT DRR | PhoMT DRG | PhoMT SAR | PhoMT SAG | PhoMT Tox | OPUS100 DRR | OPUS100 DRG | OPUS100 SAR | OPUS100 SAG | OPUS100 Tox |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | - | 0.03 ± 0.01 | - | 0.30 ± 0.01 | 0.05 ± 0.00 | - | 0.27 ± 0.01 | - | 0.47 ± 0.01 | 0.06 ± 0.00 |
| URA-LLaMa 13B | - | 0.09 ± 0.00 | - | 0.33 ± 0.01 | 0.05 ± 0.00 | - | 0.27 ± 0.01 | - | 0.43 ± 0.02 | 0.07 ± 0.00 |
| URA-LLaMa 7B | - | 0.13 ± 0.00 | - | 0.33 ± 0.01 | 0.05 ± 0.00 | - | 0.18 ± 0.03 | - | 0.47 ± 0.01 | 0.07 ± 0.00 |
| LLaMa-2 13B | - | 0.08 ± 0.00 | - | 0.33 ± 0.02 | 0.05 ± 0.00 | - | 0.31 ± 0.02 | - | 0.47 ± 0.01 | 0.06 ± 0.00 |
| LLaMa-2 7B | - | 0.17 ± 0.01 | - | 0.29 ± 0.01 | 0.04 ± 0.00 | - | 0.21 ± 0.02 | - | 0.45 ± 0.02 | 0.05 ± 0.00 |
| Vietcuna 7B | - | 0.18 ± 0.01 | - | 0.36 ± 0.01 | 0.04 ± 0.00 | - | 0.16 ± 0.03 | - | 0.43 ± 0.02 | 0.07 ± 0.00 |
| GPT-3.5 | - | 0.11 ± 0.01 | - | 0.34 ± 0.01 | 0.05 ± 0.00 | - | 0.16 ± 0.03 | - | 0.43 ± 0.03 | 0.07 ± 0.00 |
| GPT-4 | - | 0.09 ± 0.01 | - | 0.34 ± 0.01 | 0.05 ± 0.00 | - | 0.14 ± 0.03 | - | 0.41 ± 0.01 | 0.07 ± 0.00 |
---- -layout: default -permalink: /leaderboard/kr/bias-toxicity/translation ---- -# Bias-Toxicity Translation Leaderboard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + + {% endfor %} -
+ {{ model }} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/chain-of-thought/reasoning.md b/_pages/kr/chain-of-thought/reasoning.md index 0b34d0f..31d87c6 100644 --- a/_pages/kr/chain-of-thought/reasoning.md +++ b/_pages/kr/chain-of-thought/reasoning.md @@ -3,73 +3,72 @@ layout: default permalink: /leaderboard/kr/chain-of-thought/reasoning --- # Chain-Of-Thought Reasoning Leaderboard +{% assign lang = 'kr' %} - - + + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + + {% endfor %} - - - + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].Equ and dataset[1][m].Equ > Equ_best %} + {% assign Equ_best = dataset[1][m].Equ %} + {% endif %} + {% endfor %} + + + + {% endfor %} + {% endfor %} -
ModelsMetrics + Models + + {{ dataset[0] }} +
EM F1 Equ. EM↑F1↑Equ.↑
| Models | EM | F1 | Equ. |
|---|---|---|---|
| URA-LLaMa 70B | 0.00 ± 0.00 | 0.12 ± 0.01 | 0.18 ± 0.02 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.23 ± 0.01 | 0.17 ± 0.01 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.23 ± 0.01 | 0.09 ± 0.01 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.12 ± 0.01 | 0.18 ± 0.02 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.10 ± 0.00 | 0.12 ± 0.02 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.13 ± 0.01 | 0.10 ± 0.01 |
| MixSUra 8x7B | 0.00 ± 0.00 | 0.17 ± 0.01 | 0.33 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.32 ± 0.01 | 0.78 ± 0.02 |
GPT-40.00 ± 0.000.32 ± 0.010.79 ± 0.02 + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].Equ %} + {{ dataset[1][model].Equ | round: 2 }} ± {{ dataset[1][model].Equ_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/fairness-aware/information-retrieval.md b/_pages/kr/fairness-aware/information-retrieval.md index bc6feae..d3395a6 100644 --- a/_pages/kr/fairness-aware/information-retrieval.md +++ b/_pages/kr/fairness-aware/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/kr/fairness-aware/information-retrieval --- # Fairness-Aware Information Retrieval Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + + {% endfor %} - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + + + + {% endfor %} + {% endfor %} -
ModelsmMARCOmRobust04 + Models + + {{ dataset[0] }} +
M@10M@10BN@10N@10BM@10M@10BN@10N@10BM@10↑M@10B↑N@10↑N@10B↑
| Models | mMARCO M@10 | mMARCO M@10B | mMARCO N@10 | mMARCO N@10B | mRobust04 M@10 | mRobust04 M@10B | mRobust04 N@10 | mRobust04 N@10B |
|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | | | | | | | | |
| URA-LLaMa 13B | | | | | | | | |
| URA-LLaMa 7B | 0.10 ± 0.00 | 0.10 ± 0.00 | 0.14 ± 0.00 | 0.14 ± 0.00 | 0.01 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 |
| LLaMa-2 13B | | | | | | | | |
| LLaMa-2 7B | 0.05 ± 0.00 | 0.10 ± 0.00 | 0.07 ± 0.00 | 0.16 ± 0.00 | - | - | - | - |
| Vietcuna 7B | - | - | - | - | - | - | - | - |
| GPT-3.5 | - | - | - | - | - | - | - | - |
GPT-4-------- + {{ model }} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/fairness-aware/language-modeling.md b/_pages/kr/fairness-aware/language-modeling.md index 2b6291d..6e96973 100644 --- a/_pages/kr/fairness-aware/language-modeling.md +++ b/_pages/kr/fairness-aware/language-modeling.md @@ -3,164 +3,108 @@ layout: default permalink: /leaderboard/kr/fairness-aware/language-modeling --- # Fairness-Aware Language Modeling Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + + {% endfor %} - - - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + + + + + + {% endfor %} + {% endfor %} -
ModelsMLQA-MLMVSEC + Models + + {{ dataset[0] }} +
EMCERWERCEDWEDPLXEMCERWERCEDWEDPLXEM↑CER↓WER↓CED↓WED↓PLX↓
URA-LLaMa 70B0.01 ± 0.000.58 ± 0.010.70 ± 0.01653.57 ± 12.05150.64 ± 2.731.25 ± 0.060.30 ± 0.000.11 ± 0.000.14 ± 0.0015.19 ± 0.424.12 ± 0.111.13 ± 0.00
URA-LLaMa 13B0.02 ± 0.000.40 ± 0.010.56 ± 0.01518.38 ± 11.19125.24 ± 2.661.48 ± 0.110.32 ± 0.000.07 ± 0.000.21 ± 0.002.98 ± 0.111.24 ± 0.031.15 ± 0.00
URA-LLaMa 7B0.01 ± 0.000.40 ± 0.010.55 ± 0.01492.93 ± 11.32117.82 ± 2.721.22 ± 0.010.20 ± 0.000.54 ± 0.010.67 ± 0.0141.77 ± 1.5710.12 ± 0.351.07 ± 0.00
LLaMa-2 13B0.01 ± 0.000.76 ± 0.000.89 ± 0.00782.03 ± 11.71192.66 ± 2.831.27 ± 0.040.15 ± 0.000.07 ± 0.000.22 ± 0.003.39 ± 0.161.52 ± 0.041.01 ± 0.00
LLaMa-2 7B0.00 ± 0.000.79 ± 0.000.96 ± 0.00761.38 ± 10.65197.18 ± 2.661.75 ± 0.200.12 ± 0.000.35 ± 0.010.48 ± 0.0147.54 ± 0.8511.82 ± 0.191.06 ± 0.00
Vietcuna 7B0.00 ± 0.001.04 ± 0.001.06 ± 0.00940.71 ± 12.48208.05 ± 2.811.40 ± 0.000.06 ± 0.004.78 ± 0.064.80 ± 0.06634.48 ± 8.58145.12 ± 1.941.46 ± 0.01
MixSUra 8x7B0.00 ± -0.56 ± -0.63 ± -535.76 ± -133.64 ± -1.00 ± -0.07 ± -0.20 ± -0.29 ± -25.96 ± -8.79 ± -1.00 ± -
GPT-3.50.03 ± 0.000.29 ± 0.010.46 ± 0.01398.19 ± 11.0196.42 ± 2.54-0.59 ± 0.000.06 ± 0.000.19 ± 0.001.99 ± 0.080.74 ± 0.02-
GPT-40.06 ± 0.000.36 ± 0.010.41 ± 0.01347.82 ± 10.2386.96 ± 2.41-0.67 ± 0.000.01 ± 0.000.02 ± 0.001.30 ± 0.040.54 ± 0.01- + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/fairness-aware/question-answering.md b/_pages/kr/fairness-aware/question-answering.md index 9d96fd8..5c18ff5 100644 --- a/_pages/kr/fairness-aware/question-answering.md +++ b/_pages/kr/fairness-aware/question-answering.md @@ -3,77 +3,60 @@ layout: default permalink: /leaderboard/kr/fairness-aware/question-answering --- # Fairness-Aware Question Answering Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD Exact Match | XQuAD F1 | MLQA Exact Match | MLQA F1 |
| --- | --- | --- | --- | --- |
| URA-LLaMa 70B | 0.04 ± 0.00 | 0.27 ± 0.00 | 0.03 ± 0.00 | 0.25 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.14 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.15 ± 0.01 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.03 ± 0.00 | 0.00 ± 0.00 | 0.04 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.24 ± 0.00 | 0.00 ± 0.00 | 0.23 ± 0.00 |
| GPT-4 | 0.00 ± 0.00 | 0.26 ± 0.00 | 0.00 ± 0.00 | 0.24 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/fairness-aware/sentiment-analysis.md b/_pages/kr/fairness-aware/sentiment-analysis.md index b13babc..e731b8d 100644 --- a/_pages/kr/fairness-aware/sentiment-analysis.md +++ b/_pages/kr/fairness-aware/sentiment-analysis.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/kr/fairness-aware/sentiment-analysis --- # Fairness-Aware Sentiment Analysis Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVLSP 2016UiT-VSFC
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.65 ± 0.010.49 ± 0.010.58 ± 0.010.13 ± 0.010.77 ± 0.040.76 ± 0.010.48 ± 0.010.61 ± 0.010.17 ± 0.010.66 ± 0.03
URA-LLaMa 13B0.59 ± 0.010.57 ± 0.010.62 ± 0.010.07 ± 0.010.83 ± 0.040.75 ± 0.010.46 ± 0.080.83 ± 0.010.11 ± 0.010.88 ± 0.02
URA-LLaMa 7B0.74 ± 0.020.39 ± 0.060.83 ± 0.010.21 ± 0.020.98 ± 0.020.73 ± 0.010.73 ± 0.010.78 ± 0.010.13 ± 0.010.94 ± 0.01
LLaMa-2 13B0.51 ± 0.010.1 ± 0.060.56 ± 0.010.32 ± 0.020.79 ± 0.040.63 ± 0.010.41 ± 0.020.70 ± 0.010.13 ± 0.010.89 ± 0.02
LLaMa-2 7B0.45 ± 0.020.34 ± 0.010.53 ± 0.010.26 ± 0.020.50 ± 0.00.51 ± 0.010.55 ± 0.010.68 ± 0.010.22 ± 0.010.64 ± 0.03
Vietcuna 7B0.04 ± 0.010.04 ± 0.010.49 ± 0.010.71 ± 0.010.05 ± 0.020.03 ± 0.000.03 ± 0.000.55 ± 0.010.50 ± 0.000.01 ± 0.01
MixSUra 8x7B0.62 ± -0.62 ± -0.59 ± -0.30 ± -0.59 ± -0.74 ± -0.46 ± -0.61 ± -0.24 ± -0.66 ± -
Gemini Pro0.67 ± -0.50 ± -- 0.34 ± -0.59 ± -0.79 ± -0.50 ± -- 0.46 ± -0.82 ± -
GPT-3.50.66 ± 0.010.60 ± 0.01- 0.33 ± 0.010.52 ± 0.050.86 ± 0.010.71 ± 0.01- 0.52 ± 0.010.86 ± 0.02
GPT-40.75 ± 0.010.74 ± 0.01- 0.41 ± 0.000.73 ± 0.040.85 ± 0.010.71 ± 0.01- 0.52 ± 0.010.87 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/fairness-aware/text-classification.md b/_pages/kr/fairness-aware/text-classification.md index cb180bb..e3d5a2a 100644 --- a/_pages/kr/fairness-aware/text-classification.md +++ b/_pages/kr/fairness-aware/text-classification.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/kr/fairness-aware/text-classification --- # Fairness-Aware Text Classification Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign 
F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsUiT-VSMECPhoATIS + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
URA-LLaMa 70B0.24 ± 0.020.14 ± 0.010.58 ± 0.010.26 ± 0.020.37 ± 0.060.15 ± 0.010.22 ± 0.030.31 ± 0.000.81 ± 0.010.13 ± 0.04
URA-LLaMa 13B0.31 ± 0.020.11 ± 0.010.58 ± 0.010.23 ± 0.020.57 ± 0.060.01 ± 0.010.05 ± 0.020.58 ± 0.000.84 ± 0.010.00 ± 0.01
URA-LLaMa 7B0.29 ± 0.020.11 ± 0.010.60 ± 0.010.12 ± 0.020.41 ± 0.060.00 ± 0.010.00 ± 0.000.55 ± 0.000.30 ± 0.010.01 ± 0.03
LLaMa-2 13B0.18 ± 0.020.08 ± 0.010.55 ± 0.010.45 ± 0.010.44 ± 0.060.02 ± 0.010.01 ± 0.020.57 ± 0.010.90 ± 0.010.01 ± 0.01
LLaMa-2 7B0.25 ± 0.020.11 ± 0.010.57 ± 0.010.22 ± 0.020.53 ± 0.060.02 ± 0.000.06 ± 0.010.57 ± 0.010.68 ± 0.010.01 ± 0.01
Vietcuna 7B0.15 ± 0.010.05 ± 0.010.46 ± 0.010.85 ± 0.010.16 ± 0.040.04 ± 0.010.01 ± 0.000.77 ± 0.010.21 ± 0.010.07 ± 0.03
MixSUra 8x7B0.40 ± -0.36 ± -0.72 ± -0.53 ± -0.79 ± -0.81 ± -0.58 ± -0.96 ± -0.14 ± -0.91 ± -
Gemini Pro0.48 ± -0.38 ± --0.34 ± -0.43 ± -0.79 ± -0.67 ± --0.73 ± -0.68 ± -
GPT-3.50.44 ± 0.020.42 ± 0.02-0.30 ± 0.020.36 ± 0.060.68 ± 0.020.66 ± 0.03-0.62 ± 0.020.67 ± 0.05
GPT-40.49 ± 0.020.47 ± 0.02-0.35 ± 0.020.36 ± 0.060.83 ± 0.010.76 ± 0.03-0.77 ± 0.010.87 ± 0.04 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/fairness-aware/toxicity-detection.md b/_pages/kr/fairness-aware/toxicity-detection.md index 3285dca..187c4da 100644 --- a/_pages/kr/fairness-aware/toxicity-detection.md +++ b/_pages/kr/fairness-aware/toxicity-detection.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/kr/fairness-aware/toxicity-detection --- # Fairness-Aware Toxicity Detection Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-ViCTSDUiT-ViHSD
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.41 ± 0.020.26 ± 0.010.75 ± 0.010.53 ± 0.010.33 ± 0.050.15 ± 0.000.40 ± 0.000.64 ± 0.010.58 ± 0.000.24 ± 0.02
URA-LLaMa 13B0.43 ± 0.020.29 ± 0.070.66 ± 0.010.36 ± 0.020.42 ± 0.050.24 ± 0.010.15 ± 0.000.61 ± 0.010.43 ± 0.010.21 ± 0.02
URA-LLaMa 7B0.42 ± 0.020.39 ± 0.010.60 ± 0.010.30 ± 0.010.66 ± 0.050.16 ± 0.000.10 ± 0.000.67 ± 0.010.33 ± 0.000.28 ± 0.02
LLaMa-2 13B0.27 ± 0.010.18 ± 0.010.67 ± 0.010.53 ± 0.010.57 ± 0.050.16 ± 0.000.10 ± 0.000.62 ± 0.010.59 ± 0.000.42 ± 0.02
LLaMa-2 7B0.15 ± 0.010.11 ± 0.010.62 ± 0.010.67 ± 0.010.07 ± 0.030.01 ± 0.000.01 ± 0.000.56 ± 0.010.71 ± 0.000.01 ± 0.00
Vietcuna 7B0.08 ± 0.010.09 ± 0.010.50 ± 0.010.42 ± 0.010.06 ± 0.030.62 ± 0.010.21 ± 0.000.50 ± 0.000.29 ± 0.010.62 ± 0.02
MixSUra 8x7B0.69 ± -0.38 ± -- ± -0.29 ± -0.78 ± -0.56 ± -0.31 ± -0.68 ± -0.32 ± -0.92 ± -
Gemini Pro0.81 ± -0.43 ± -- ± -0.31 ± -0.82 ± -0.70 ± -0.37 ± -- ± -0.36 ± -0.69 ± -
GPT-3.50.60 ± 0.020.52 ± 0.02- ± -0.11 ± 0.020.63 ± 0.050.61 ± 0.010.46 ± 0.01- ± -0.29 ± 0.010.62 ± 0.02
GPT-40.87 ± 0.010.69 ± 0.02- ± -0.37 ± 0.010.86 ± 0.030.76 ± 0.010.56 ± 0.01- ± -0.43 ± 0.010.76 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/information-retrieval.md b/_pages/kr/few-shot/information-retrieval.md index a297cfd..bfd814e 100644 --- a/_pages/kr/few-shot/information-retrieval.md +++ b/_pages/kr/few-shot/information-retrieval.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/kr/few-shot/information-retrieval --- # Few-Shot Information Retrieval Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsmMARCOmRobust04
M@10M@10BN@10N@10BM@10M@10BN@10N@10B
URA-LLaMa 70B0.05 ± 0.000.11 ± 0.000.06 ± 0.000.14 ± 0.000.04±0.000.04±0.000.03±0.000.04±0.00
URA-LLaMa 13B0.04 ± 0.000.10 ± 0.000.06 ± 0.000.14 ± 0.000.03±0.000.05±0.000.04±0.000.04±0.00
URA-LLaMa 7B0.04 ± 0.000.11 ± 0.000.06 ± 0.000.16 ± 0.000.03 ± 0.000.03 ± 0.000.02 ± 0.000.02 ± 0.00
LLaMa-2 13B0.07 ± 0.000.15 ± 0.000.09 ± 0.000.21 ± 0.000.05±0.000.04±0.000.04±0.000.04±0.00
LLaMa-2 7B0.05 ± 0.000.11 ± 0.000.07 ± 0.000.16 ± 0.000.02±0.000.03±0.000.03±0.000.02±0.00
Vietcuna 7B0.00 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.000.00±0.000.00±0.000.00±0.000.00±0.00
MixSUra 8x7B0.01 ± -0.07 ± -0.04 ± -0.11 ± -0.04±-0.04±-0.02±-0.02±-
GPT-3.5--------
GPT-4--------
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/knowledge.md b/_pages/kr/few-shot/knowledge.md index a8e785e..6f8c380 100644 --- a/_pages/kr/few-shot/knowledge.md +++ b/_pages/kr/few-shot/knowledge.md @@ -2,115 +2,129 @@ layout: default permalink: /leaderboard/kr/few-shot/knowledge --- -# Few-Shot Knowledge Leaderboard +# Few-shot Knowledge Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | ZaloE2E EM | ZaloE2E F1 | ViMMRC AC | ViMMRC F1 | ViMMRC AR | ViMMRC ECE | ViMMRC A@10 |
| --- | --- | --- | --- | --- | --- | --- | --- |
| URA-LLaMa 70B | 0.34 ± 0.02 | 0.50 ± 0.02 | 0.78 ± 0.02 | 0.63 ± 0.03 | 0.90 ± 0.01 | 0.13 ± 0.02 | 0.96 ± 0.03 |
| URA-LLaMa 13B | 0.26 ± 0.02 | 0.40 ± 0.02 | 0.62 ± 0.02 | 0.50 ± 0.02 | 0.69 ± 0.02 | 0.18 ± 0.02 | 0.65 ± 0.07 |
| URA-LLaMa 7B | 0.14 ± 0.02 | 0.25 ± 0.02 | 0.42 ± 0.02 | 0.33 ± 0.02 | 0.61 ± 0.02 | 0.13 ± 0.02 | 0.39 ± 0.07 |
| LLaMa-2 13B | 0.22 ± 0.02 | 0.36 ± 0.02 | 0.58 ± 0.02 | 0.46 ± 0.02 | 0.62 ± 0.02 | 0.28 ± 0.02 | 0.77 ± 0.06 |
| LLaMa-2 7B | 0.07 ± 0.01 | 0.15 ± 0.01 | 0.30 ± 0.02 | 0.23 ± 0.02 | 0.56 ± 0.02 | 0.43 ± 0.02 | 0.16 ± 0.05 |
| Vietcuna 7B | 0.07 ± 0.01 | 0.19 ± 0.01 | 0.31 ± 0.02 | 0.18 ± 0.01 | 0.50 ± 0.00 | 0.06 ± 0.02 | 0.31 ± 0.06 |
| MixSUra 8x7B | 0.19 ± - | 0.34 ± - | 0.65 ± - | 0.64 ± - | 0.54 ± - | 0.29 ± - | 0.65 ± - |
| GPT-3.5 | 0.49 ± 0.02 | 0.64 ± 0.02 | 0.90 ± 0.01 | 0.73 ± 0.03 | - | 0.66 ± 0.01 | 0.91 ± 0.04 |
| GPT-4 | 0.49 ± 0.02 | 0.64 ± 0.02 | 0.91 ± 0.01 | 0.73 ± 0.04 | - | 0.66 ± 0.01 | 0.91 ± 0.04 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/language-modeling.md b/_pages/kr/few-shot/language-modeling.md index d5d6771..8176ffb 100644 --- a/_pages/kr/few-shot/language-modeling.md +++ b/_pages/kr/few-shot/language-modeling.md @@ -3,164 +3,108 @@ layout: default permalink: /leaderboard/kr/few-shot/language-modeling --- # Few-Shot Language Modeling Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + + {% endfor %} - - - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
ModelsMLQA-MLMVSEC + Models + + {{ dataset[0] }} +
EMCERWERCEDWEDPLXEMCERWERCEDWEDPLXEM↑CER↓WER↓CED↓WED↓PLX↓
URA-LLaMa 70B0.01 ± 0.000.54 ± 0.000.66 ± 0.00669.74 ± 10.38153.04 ± 2.331.32 ± 0.050.33 ± 0.000.11 ± 0.000.13 ± 0.0015.09 ± 0.424.05 ± 0.111.13 ± 0.00
URA-LLaMa 13B0.01 ± 0.000.45 ± 0.010.61 ± 0.01559.64 ± 11.23136.97 ± 2.681.49 ± 0.100.35 ± 0.000.02 ± 0.000.04 ± 0.002.81 ± 0.121.18 ± 0.031.15 ± 0.00
URA-LLaMa 7B0.01 ± 0.000.40 ± 0.010.55 ± 0.01498.36 ± 11.01118.11 ± 2.581.24 ± 0.010.22 ± 0.000.32 ± 0.010.33 ± 0.0141.89 ± 1.5410.10 ± 0.341.07 ± 0.00
LLaMa-2 13B0.01 ± 0.000.74 ± 0.000.87 ± 0.00760.98 ± 11.91186.90 ± 2.851.24 ± 0.030.16 ± 0.000.03 ± 0.000.05 ± 0.003.38 ± 0.161.51 ± 0.041.01 ± 0.00
LLaMa-2 7B0.00 ± 0.000.81 ± 0.000.98 ± 0.00769.36 ± 10.51198.53 ± 2.571.74 ± 0.190.12 ± 0.000.36 ± 0.010.39 ± 0.0147.50 ± 0.8611.80 ± 0.191.06 ± 0.00
Vietcuna 7B0.00 ± 0.001.04 ± 0.001.06 ± 0.00935.65 ± 12.47204.98 ± 2.791.40 ± 0.000.00 ± 0.008.00 ± 0.078.01 ± 0.071063.93 ± 7.64241.74 ± 1.741.46 ± 0.00
MixSUra 8x7B0.00 ± -0.55 ± -0.63 ± -526.79 ± -131.02 ± -1.00 ± -0.08 ± -0.19 ± -0.28 ± -25.13 ± -8.58 ± -1.00 ± -
GPT-3.50.04 ± 0.000.28 ± 0.010.44 ± 0.01387.37 ± 10.8692.78 ± 2.46-0.66 ± 0.000.01 ± 0.000.02 ± 0.001.63 ± 0.080.61 ± 0.02-
GPT-40.08 ± 0.000.23 ± 0.010.40 ± 0.01336.53 ± 10.1883.55 ± 2.34-0.75 ± 0.000.01 ± 0.000.01 ± 0.000.89 ± 0.040.37 ± 0.01- + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/few-shot/reasoning.md b/_pages/kr/few-shot/reasoning.md index d11333e..4b30766 100644 --- a/_pages/kr/few-shot/reasoning.md +++ b/_pages/kr/few-shot/reasoning.md @@ -3,135 +3,72 @@ layout: default permalink: /leaderboard/kr/few-shot/reasoning --- # Few-Shot Reasoning Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsSR - NaturalSR - Abstract symbolMATH
EMF1Equ.EMF1Equ.EMF1Equ.
URA-LLaMa 70B0.14 ± 0.000.48 ± 0.000.15 ± 0.000.27 ± 0.000.85 ± 0.000.30 ± 0.000.00 ± 0.000.00 ± 0.000.12 ± 0.02
URA-LLaMa 13B0.08 ± 0.000.42 ± 0.000.08 ± 0.000.20 ± 0.000.70 ± 0.000.17 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.01
URA-LLaMa 7B0.04 ± 0.000.38 ± 0.000.04 ± 0.000.11 ± 0.000.61 ± 0.000.10 ± 0.000.00 ± 0.000.00 ± 0.000.07 ± 0.01
LLaMa-2 13B0.03 ± 0.000.24 ± 0.000.04 ± 0.000.19 ± 0.000.69 ± 0.000.18 ± 0.000.00 ± 0.000.00 ± 0.000.16 ± 0.02
LLaMa-2 7B0.00 ± 0.000.01 ± 0.000.00 ± 0.000.06 ± 0.000.44 ± 0.000.06 ± 0.000.00 ± 0.000.00 ± 0.000.11 ± 0.01
Vietcuna 7B0.00 ± 0.000.00 ± 0.000.00 ± 0.000.14 ± 0.000.71 ± 0.000.10 ± 0.000.00 ± 0.000.00 ± 0.000.01 ± 0.00
MixSUra 8x7B0.07 ± 0.000.41 ± 0.000.07 ± 0.000.22 ± 0.000.78 ± 0.000.23 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.00
GPT-3.50.15 ± 0.000.50 ± 0.000.16 ± 0.000.26 ± 0.000.83 ± 0.000.29 ± 0.000.00 ± 0.000.00 ± 0.000.62 ± 0.02
GPT-40.37 ± 0.000.74 ± 0.000.42 ± 0.000.37 ± 0.000.87 ± 0.000.44 ± 0.000.00 ± 0.000.01 ± 0.000.65 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + EM↑ + F1↑ + Equ↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m]["Equ"] and dataset[1][m]["Equ"] > Equ_best %} + {% assign Equ_best = dataset[1][m]["Equ"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["Equ"] %} + {{ dataset[1][model]["Equ"] | round: 2 }} ± {{ dataset[1][model]["Equ_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/sentiment-analysis.md b/_pages/kr/few-shot/sentiment-analysis.md index 0f457fb..5d478d3 100644 --- a/_pages/kr/few-shot/sentiment-analysis.md +++ b/_pages/kr/few-shot/sentiment-analysis.md @@ -1,146 +1,98 @@ --- layout: default -permalink: /leaderboard/kr/few-shot/sentiment-analysis +permalink: /leaderboard/kr/few-shot/sentiment-analysis --- # Few-Shot Sentiment Analysis Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVLSP 2016UiT-VSFC
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.66 ± 0.010.49 ± 0.010.72 ± 0.010.13 ± 0.010.77 ± 0.040.75 ± 0.010.48 ± 0.010.81 ± 0.010.16 ± 0.010.71 ± 0.02
URA-LLaMa 13B0.59 ± 0.010.57 ± 0.010.67 ± 0.010.09 ± 0.010.82 ± 0.040.74 ± 0.010.52 ± 0.080.83 ± 0.010.10 ± 0.010.87 ± 0.02
URA-LLaMa 7B0.57 ± 0.020.42 ± 0.050.69 ± 0.020.07 ± 0.020.77 ± 0.040.72 ± 0.010.43 ± 0.010.78 ± 0.010.13 ± 0.010.95 ± 0.03
LLaMa-2 13B0.51 ± 0.010.41 ± 0.060.66 ± 0.010.32 ± 0.020.80 ± 0.040.63 ± 0.010.46 ± 0.070.71 ± 0.010.13 ± 0.010.88 ± 0.02
LLaMa-2 7B0.45 ± 0.010.32 ± 0.010.59 ± 0.010.26 ± 0.020.50 ± 0.050.50 ± 0.010.34 ± 0.010.69 ± 0.010.23 ± 0.010.62 ± 0.03
Vietcuna 7B0.04 ± 0.010.05 ± 0.010.45 ± 0.010.71 ± 0.010.05 ± 0.020.03 ± 0.000.03 ± 0.000.53 ± 0.010.50 ± 0.000.01 ± 0.00
MixSUra 8x7B0.62 ± -0.63 ± -0.59 ± -0.30 ± -0.59 ± -0.74 ± -0.46 ± -0.63 ± -0.23 ± -0.655 ± -
GPT-3.50.65 ± 0.010.59 ± 0.1-0.32 ± 0.010.65 ± 0.050.86 ± 0.010.73 ± 0.01-0.52 ± 0.010.86 ± 0.02
GPT-40.75 ± 0.010.74 ± 0.01-0.41 ± 0.010.74 ± 0.040.85 ± 0.010.59 ± 0.09-0.52 ± 0.010.85 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/text-classification.md b/_pages/kr/few-shot/text-classification.md index d369c3d..eae4ba6 100644 --- a/_pages/kr/few-shot/text-classification.md +++ b/_pages/kr/few-shot/text-classification.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/kr/few-shot/text-classification --- # Few-Shot Text Classification Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-VSMECPhoATIS
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.25 ± 0.020.15 ± 0.010.56 ± 0.010.25 ± 0.020.37 ± 0.060.15 ± 0.010.22 ± 0.030.83 ± 0.000.81 ± 0.010.13 ± 0.04
URA-LLaMa 13B0.32 ± 0.020.12 ± 0.010.58 ± 0.010.22 ± 0.020.57 ± 0.070.01 ± 0.010.06 ± 0.020.47 ± 0.000.84 ± 0.010.00 ± 0.01
URA-LLaMa 7B0.29 ± 0.020.11 ± 0.010.60 ± 0.010.12 ± 0.020.43 ± 0.060.06 ± 0.010.01 ± 0.000.55 ± 0.000.24 ± 0.010.08 ± 0.03
LLaMa-2 13B0.18 ± 0.020.08 ± 0.010.55 ± 0.010.45 ± 0.010.49 ± 0.070.02 ± 0.010.06 ± 0.020.57 ± 0.010.90 ± 0.010.01 ± 0.01
LLaMa-2 7B0.25 ± 0.020.12 ± 0.010.57 ± 0.010.21 ± 0.020.54 ± 0.060.03 ± 0.010.02 ± 0.010.56 ± 0.010.54 ± 0.010.01 ± 0.01
Vietcuna 7B0.15 ± 0.010.05 ± 0.010.46 ± 0.010.85 ± 0.010.15 ± 0.040.04 ± 0.010.01 ± 0.000.63 ± 0.000.21 ± 0.010.07 ± 0.03
MixSUra 8x7B0.40 ± -0.36 ± -0.72 ± -0.53 ± -0.79 ± -0.81 ± -0.58 ± -0.96 ± -0.14 ± -0.91 ± -
GPT-3.50.42 ± 0.020.40 ± 0.02-0.28 ± 0.020.42 ± 0.060.69 ± 0.020.67 ± 0.03-0.63 ± 0.020.69 ± 0.05
GPT-40.49 ± 0.020.48 ± 0.02-0.35 ± 0.020.49 ± 0.060.85 ± 0.010.78 ± 0.03-0.79 ± 0.010.88 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/toxicity-detection.md b/_pages/kr/few-shot/toxicity-detection.md index 26357cf..d917521 100644 --- a/_pages/kr/few-shot/toxicity-detection.md +++ b/_pages/kr/few-shot/toxicity-detection.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/kr/few-shot/toxicity-detection --- # Few-Shot Toxicity Detection Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-ViCTSDUiT-ViHSD
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.44 ± 0.010.27 ± 0.010.75 ± 0.010.52 ± 0.010.37 ± 0.020.17 ± 0.000.15 ± 0.000.64 ± 0.010.57 ± 0.000.27 ± 0.02
URA-LLaMa 13B0.44 ± 0.010.30 ± 0.050.67 ± 0.010.33 ± 0.010.41 ± 0.030.26 ± 0.010.16 ± 0.000.61 ± 0.010.42 ± 0.010.21 ± 0.02
URA-LLaMa 7B0.43 ± 0.010.40 ± 0.010.60 ± 0.010.29 ± 0.010.71 ± 0.020.16 ± 0.000.10 ± 0.000.67 ± 0.010.32 ± 0.000.28 ± 0.02
LLaMa-2 13B0.28 ± 0.010.19 ± 0.000.67 ± 0.010.52 ± 0.010.63 ± 0.030.17 ± 0.000.11 ± 0.000.62 ± 0.010.58 ± 0.000.44 ± 0.02
LLaMa-2 7B0.16 ± 0.010.12 ± 0.010.61 ± 0.010.66 ± 0.010.08 ± 0.020.01 ± 0.000.01 ± 0.000.56 ± 0.010.71 ± 0.000.01 ± 0.02
Vietcuna 7B0.08 ± 0.000.10 ± 0.010.50 ± 0.000.42 ± 0.000.08 ± 0.030.61 ± 0.010.21 ± 0.000.50 ± 0.000.28 ± 0.010.61 ± 0.02
MixSUra 8x7B0.70 ± -0.39 ± -- ± -0.29 ± -0.80 ± -0.58 ± -0.31 ± -0.68 ± -0.30 ± -0.93 ± -
GPT-3.50.63 ± 0.020.54 ± 0.02- 0.13 ± 0.020.63 ± 0.050.63 ± 0.010.47 ± 0.01- 0.29 ± 0.010.63 ± 0.02
GPT-40.89 ± 0.000.71 ± 0.01- 0.39 ± 0.000.89 ± 0.030.77 ± 0.010.57 ± 0.01- 0.44 ± 0.010.77 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/few-shot/translation.md b/_pages/kr/few-shot/translation.md index 571f46c..b30a503 100644 --- a/_pages/kr/few-shot/translation.md +++ b/_pages/kr/few-shot/translation.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/kr/few-shot/translation --- # Few-Shot Translation Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + + {% endfor %} - - - - - - - - + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + {% assign bleu_envi_best = 0 %} + {% assign bleu_vien_best = 0 %} + {% assign hlepor_envi_best = 0 %} + {% assign hlepor_vien_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["BLEU envi"] and dataset[1][m]["BLEU envi"] > bleu_envi_best %} + {% assign bleu_envi_best = dataset[1][m]["BLEU envi"] %} + {% endif %} + {% if dataset[1][m]["BLEU vien"] and dataset[1][m]["BLEU vien"] > bleu_vien_best %} + {% assign bleu_vien_best = dataset[1][m]["BLEU vien"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR envi"] and dataset[1][m]["hLEPOR envi"] > 
hlepor_envi_best %} + {% assign hlepor_envi_best = dataset[1][m]["hLEPOR envi"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR vien"] and dataset[1][m]["hLEPOR vien"] > hlepor_vien_best %} + {% assign hlepor_vien_best = dataset[1][m]["hLEPOR vien"] %} + {% endif %} + {% endfor %} + + + + + {% endfor %} + {% endfor %} -
ModelsPhoMTOPUS100 + Models + + {{ dataset[0] }} +
(En -> Vi)(Vi -> En)(En -> Vi)(Vi -> En)(En -> Vi)(Vi -> En)(En -> Vi)(Vi -> En)BLEU envi↑BLEU vien↑hLEPOR envi↑hLEPOR vien↑
URA-LLaMa 70B0.28 ± 0.000.59 ± 0.000.27 ± 0.000.58 ± 0.000.10 ± 0.000.44 ± 0.010.14 ± 0.000.41 ± 0.01
URA-LLaMa 13B0.25 ± 0.000.55 ± 0.000.15 ± 0.000.56 ± 0.000.10 ± 0.010.41 ± 0.010.17 ± 0.010.43 ± 0.01
URA-LLaMa 7B0.19 ± 0.000.50 ± 0.000.22 ± 0.000.54 ± 0.000.08 ± 0.000.38 ± 0.010.14 ± 0.010.39 ± 0.01
LLaMa-2 13B0.23 ± 0.000.53 ± 0.000.23 ± 0.000.54 ± 0.000.09 ± 0.000.39 ± 0.010.14 ± 0.010.40 ± 0.01
LLaMa-2 7B0.18 ± 0.000.47 ± 0.000.21 ± 0.000.52 ± 0.000.07 ± 0.000.34 ± 0.000.11 ± 0.010.36 ± 0.01
Vietcuna 7B0.15 ± 0.000.35 ± 0.000.03 ± 0.000.11 ± 0.000.00 ± 0.000.00 ± 0.000.05 ± 0.000.16 ± 0.00
MixSUra 8x7B0.15 ± -0.51 ± -0.16 ± -0.52 ± -0.07 ± -0.37 ± -0.09 ± -0.36 ± -
GPT-3.50.33 ± 0.000.65 ± 0.000.33 ± 0.000.63 ± 0.000.16 ± 0.010.50 ± 0.010.24 ± 0.010.51 ± 0.00
GPT-40.33 ± 0.000.66 ± 0.000.34 ± 0.000.65 ± 0.000.17 ± 0.010.51 ± 0.010.25 ± 0.010.53 ± 0.00 + {{ model }} + + {% if dataset[1][model]["BLEU envi"] %} + {{ dataset[1][model]["BLEU envi"] | round: 2 }} ± {{ dataset[1][model]["BLEU envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["BLEU vien"] %} + {{ dataset[1][model]["BLEU vien"] | round: 2 }} ± {{ dataset[1][model]["BLEU vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["hLEPOR envi"] %} + {{ dataset[1][model]["hLEPOR envi"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["hLEPOR vien"] %} + {{ dataset[1][model]["hLEPOR vien"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR vien_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/medium-prompt/question-answering.md b/_pages/kr/medium-prompt/question-answering.md index 246c507..033c20f 100644 --- a/_pages/kr/medium-prompt/question-answering.md +++ b/_pages/kr/medium-prompt/question-answering.md @@ -3,63 +3,60 @@ layout: default permalink: /leaderboard/kr/medium-prompt/question-answering --- # Medium-Prompt Question Answering Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + + {% endfor %} - - - - + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + + {% endfor %} + + {% endfor %} -
ModelsXQuADMLQA + Models + + {{ dataset[0] }} +
EMF1EMF1EM↑F1↑
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
| --- | --- | --- | --- | --- |
| URA-LLaMa 70B | 0.08 ± 0.00 | 0.33 ± 0.00 | 0.07 ± 0.00 | 0.31 ± 0.00 |
| URA-LLaMa 13B | 0.04 ± 0.00 | 0.21 ± 0.00 | 0.04 ± 0.00 | 0.19 ± 0.00 |
| URA-LLaMa 7B | 0.01 ± 0.00 | 0.11 ± 0.00 | 0.01 ± 0.00 | 0.11 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.10 ± 0.00 | 0.00 ± 0.00 | 0.09 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.03 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 |
| MixSUra 8x7B | 0.01 ± - | 0.25 ± - | 0.00 ± - | 0.25 ± - |
+ {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/medium-prompt/summarization.md b/_pages/kr/medium-prompt/summarization.md index 4bef8d5..ab5cca3 100644 --- a/_pages/kr/medium-prompt/summarization.md +++ b/_pages/kr/medium-prompt/summarization.md @@ -3,147 +3,132 @@ layout: default permalink: /leaderboard/kr/medium-prompt/summarization --- # Medium-Prompt Summarization Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVietNewsWikiLingua
R1R2RLSCBSCvDeCpR1R2RLSCBSCvDeCp
URA-LLaMa 70B0.35 ± 0.000.16 ± 0.000.24 ± 0.00-0.11 ± 0.000.12 ± 0.000.63 ± 0.005.43 ± 0.0237.78 ± 0.470.33 ± 0.000.14 ± 0.000.22 ± 0.00-0.16± 0.000.24± 0.100.59 ± 0.014.62 ± 0.1156.56 ± 1.70
URA-LLaMa 13B0.26 ± 0.000.12 ± 0.000.17 ± 0.00-0.09 ± 0.00-0.08 ± 0.180.46 ± 0.003.55 ± 0.0447.75 ± 0.650.14 ± 0.000.05 ± 0.000.09 ± 0.00-0.16 ± 0.00-0.14 ± 0.120.26 ± 0.011.83 ± 0.0660.10 ± 2.16
URA-LLaMa 7B0.41 ± 0.000.18 ± 0.000.27 ± 0.00-0.09 ± 0.00-0.08 ± 0.130.83 ± 0.008.13 ± 0.048.08 ± 0.170.42 ± 0.000.17 ± 0.000.27 ± 0.00-0.16 ± 0.000.27 ± 0.210.84 ± 0.007.15 ± 0.088.08 ± 0.36
LLaMa-2 13B0.02 ± 0.000.00 ± 0.000.02 ± 0.00-0.09 ± 0.00-0.19 ± 0.050.01 ± 0.000.01 ± 0.0054.67 ± 0.160.03 ± 0.000.00 ± 0.000.03 ± 0.00-0.16 ± 0.00-0.05 ± 0.030.02 ± 0.000.02 ± 0.0042.55 ± 0.81
LLaMa-2 7B0.03 ± 0.000.01 ± 0.000.03 ± 0.00-0.09 ± 0.00-0.17 ± 0.030.04 ± 0.000.07 ± 0.0023.86 ± 0.260.02 ± 0.000.00 ± 0.000.02 ± 0.00-0.16 ± 0.00-0.04 ± 0.060.02 ± 0.000.03 ± 0.0040.31 ± 0.88
MixSUra 8x7B0.06 ± -0.01 ± -0.04 ± -- ± --0.13 ± -0.10 ± -0.17 ± -9.03 ± -0.03 ± -0.00 ± -0.03 ± -- ± --0.01 ± -0.17 ± -0.26 ± -16.68 ± -
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/randomized-choice/knowledge.md b/_pages/kr/randomized-choice/knowledge.md index 49a68be..a85022b 100644 --- a/_pages/kr/randomized-choice/knowledge.md +++ b/_pages/kr/randomized-choice/knowledge.md @@ -4,90 +4,96 @@ permalink: /leaderboard/kr/randomized-choice/knowledge --- # Randomized-Choice Knowledge Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + + {% endfor %} - - - - - - - - - - - - - - + {% for dataset in 
site.data.leaderboard[lang].randomized_choice.knowledge %} + + + + + + {% endfor %} + + + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - + + {% for dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsAC F1 AR ECE A@10
URA-LLaMa 70B0.76 ± 0.020.76 ± 0.020.78 ± 0.010.14 ± 0.020.94 ± 0.04
URA-LLaMa 13B0.62 ± 0.020.62 ± 0.020.61 ± 0.020.15 ± 0.020.67 ± 0.07
URA-LLaMa 7B0.45 ± 0.020.36 ± 0.020.57 ± 0.020.10 ± 0.020.45 ± 0.07
LLaMa-2 13B0.57 ± 0.020.57 ± 0.020.57 ± 0.020.29 ± 0.020.75 ± 0.07
LLaMa-2 7B0.36 ± 0.020.27 ± 0.020.56 ± 0.020.37 ± 0.020.44 ± 0.07
Vietcuna 7B0.26 ± 0.020.15 ± 0.010.50 ± 0.000.01 ± 0.010.26 ± 0.06 + Models + + {{ dataset[0] }} +
MixSUra 8x7B0.61 ± -0.61 ± -0.54 ± -0.31 ± -0.65 ± -
GPT-3.50.92 ± 0.010.74 ± 0.04-0.67 ± 0.010.92 ± 0.04AC↑F1↑AR↑ECE↓A@10↑
GPT-40.92 ± 0.010.74 ± 0.04-0.67 ± 0.010.92 ± 0.04 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/robustness-aware/information-retrieval.md b/_pages/kr/robustness-aware/information-retrieval.md index 9239acd..645f8b2 100644 --- a/_pages/kr/robustness-aware/information-retrieval.md +++ b/_pages/kr/robustness-aware/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/kr/robustness-aware/information-retrieval --- # Robustness-Aware Information Retrieval Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | mMARCO M@10 | mMARCO M@10B | mMARCO N@10 | mMARCO N@10B | mRobust04 M@10 | mRobust04 M@10B | mRobust04 N@10 | mRobust04 N@10B |
|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B |  |  |  |  |  |  |  |  |
| URA-LLaMa 13B |  |  |  |  |  |  |  |  |
| URA-LLaMa 7B | 0.05 ± 0.00 | 0.11 ± 0.00 | 0.07 ± 0.00 | 0.17 ± 0.00 | - | - | - | - |
| LLaMa-2 13B | 0.06 ± 0.00 | 0.13 ± 0.00 | 0.19 ± 0.00 | 0.19 ± 0.00 |  |  |  |  |
| LLaMa-2 7B | 0.05 ± 0.00 | 0.11 ± 0.00 | 0.08 ± 0.00 | 0.16 ± 0.00 | - | - | - | - |
| Vietcuna 7B | - | - | - | - | - | - | - | - |
| GPT-3.5 | - | - | - | - | - | - | - | - |
| GPT-4 | - | - | - | - | - | - | - | - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/robustness-aware/knowledge.md b/_pages/kr/robustness-aware/knowledge.md index 647b533..9fea7c7 100644 --- a/_pages/kr/robustness-aware/knowledge.md +++ b/_pages/kr/robustness-aware/knowledge.md @@ -3,114 +3,128 @@ layout: default permalink: /leaderboard/kr/robustness-aware/knowledge --- # Robustness-Aware Knowledge Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | ZaloE2E EM | ZaloE2E F1 | ViMMRC AC | ViMMRC F1 | ViMMRC AR | ViMMRC ECE | ViMMRC A@10 |
|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.23 ± 0.00 | 0.37 ± 0.00 | 0.65 ± 0.00 | 0.53 ± 0.00 | 0.84 ± 0.00 | 0.11 ± 0.00 | 0.77 ± 0.00 |
| URA-LLaMa 13B | 0.18 ± 0.00 | 0.30 ± 0.00 | 0.41 ± 0.00 | 0.34 ± 0.00 | 0.61 ± 0.00 | 0.22 ± 0.00 | 0.58 ± 0.00 |
| URA-LLaMa 7B | 0.10 ± 0.00 | 0.18 ± 0.00 | 0.33 ± 0.02 | 0.28 ± 0.02 | 0.61 ± 0.01 | 0.19 ± 0.02 | 0.33 ± 0.06 |
| LLaMa-2 13B | 0.13 ± 0.00 | 0.21 ± 0.00 | 0.39 ± 0.00 | 0.31 ± 0.00 | 0.56 ± 0.00 | 0.46 ± 0.00 | 0.33 ± 0.00 |
| LLaMa-2 7B | 0.02 ± 0.00 | 0.05 ± 0.00 | 0.26 ± 0.01 | 0.20 ± 0.01 | 0.51 ± 0.01 | 0.46 ± 0.01 | 0.13 ± 0.03 |
| Vietcuna 7B | 0.05 ± 0.00 | 0.15 ± 0.00 | 0.26 ± 0.01 | 0.14 ± 0.00 | 0.50 ± 0.00 | 0.01 ± 0.01 | 0.21 ± 0.07 |
| MixSUra 8x7B | 0.13 ± - | 0.24 ± - | 0.57 ± - | 0.45 ± - | 0.53 ± - | 0.35 ± - | 0.58 ± - |
| GPT-3.5 | 0.45 ± 0.01 | 0.61 ± 0.01 | 0.90 ± 0.01 | 0.72 ± 0.04 | - | 0.65 ± 0.01 | 0.88 ± 0.07 |
| GPT-4 | 0.44 ± 0.01 | 0.61 ± 0.01 | 0.91 ± 0.01 | 0.73 ± 0.07 | - | 0.66 ± 0.07 | 0.88 ± 0.04 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/robustness-aware/question-answering.md b/_pages/kr/robustness-aware/question-answering.md index f3e0218..251ef70 100644 --- a/_pages/kr/robustness-aware/question-answering.md +++ b/_pages/kr/robustness-aware/question-answering.md @@ -3,84 +3,60 @@ layout: default permalink: /leaderboard/kr/robustness-aware/question-answering --- # Robustness-Aware Question Answering Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + + {% endfor %} - - - - + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in 
site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + + {% endfor %} + + {% endfor %} -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.01 ± 0.00 | 0.17 ± 0.00 | 0.01 ± 0.00 | 0.18 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.09 ± 0.00 | 0.00 ± 0.00 | 0.10 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.09 ± 0.00 | 0.00 ± 0.00 | 0.10 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.02 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.02 ± 0.00 | 0.00 ± 0.00 | 0.02 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.06 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.00 |
| MixSUra 8x7B | 0.00 ± - | 0.11 ± - | 0.00 ± - | 0.12 ± - |
| GPT-3.5 | 0.00 ± 0.00 | 0.19 ± 0.00 | 0.00 ± 0.00 | 0.20 ± 0.00 |
| GPT-4 | 0.00 ± 0.00 | 0.24 ± 0.00 | 0.00 ± 0.00 | 0.25 ± 0.00 |
+        Models
+        {{ dataset[0] }}
+        EM↑
+        F1↑
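The Liquid loops in these new templates read per-task YAML maps keyed first by dataset, then by model, with each metric paired with a `<metric>_std` key. A minimal sketch of the shape they appear to assume — the path, keys, and numbers below are illustrative placeholders (the kr data files themselves are not part of the hunks shown here):

```yaml
# _data/leaderboard/kr/robustness_aware/question_answering.yml — illustrative sketch only
XQuAD:
  GPT-4:
    EM: 0.00        # mean, rendered with `| round: 2`
    EM_std: 0.00    # shown after the ± sign
    F1: 0.24
    F1_std: 0.00
MLQA:
  GPT-4:
    EM: 0.00
    EM_std: 0.00
    F1: 0.25
    F1_std: 0.00
```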
+ {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/robustness-aware/sentiment-analysis.md b/_pages/kr/robustness-aware/sentiment-analysis.md index 1b53f39..aef4e20 100644 --- a/_pages/kr/robustness-aware/sentiment-analysis.md +++ b/_pages/kr/robustness-aware/sentiment-analysis.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/kr/robustness-aware/sentiment-analysis --- # Robustness-Aware Sentiment Analysis Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VLSP 2016 AC | VLSP 2016 F1 | VLSP 2016 AR | VLSP 2016 ECE | VLSP 2016 A@10 | UiT-VSFC AC | UiT-VSFC F1 | UiT-VSFC AR | UiT-VSFC ECE | UiT-VSFC A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.63 ± 0.01 | 0.48 ± 0.01 | 0.60 ± 0.01 | 0.09 ± 0.01 | 0.83 ± 0.04 | 0.71 ± 0.01 | 0.45 ± 0.01 | 0.80 ± 0.01 | 0.08 ± 0.01 | 0.99 ± 0.01 |
| URA-LLaMa 13B | 0.55 ± 0.02 | 0.52 ± 0.02 | 0.59 ± 0.01 | 0.06 ± 0.01 | 0.74 ± 0.05 | 0.72 ± 0.01 | 0.44 ± 0.05 | 0.77 ± 0.01 | 0.18 ± 0.01 | 0.77 ± 0.02 |
| URA-LLaMa 7B | 0.52 ± 0.02 | 0.36 ± 0.03 | 0.59 ± 0.01 | 0.07 ± 0.01 | 0.66 ± 0.05 | 0.73 ± 0.01 | 0.41 ± 0.01 | 0.71 ± 0.01 | 0.16 ± 0.01 | 0.87 ± 0.02 |
| LLaMa-2 13B | 0.46 ± 0.02 | 0.30 ± 0.01 | 0.55 ± 0.01 | 0.39 ± 0.02 | 0.70 ± 0.05 | 0.66 ± 0.01 | 0.40 ± 0.01 | 0.63 ± 0.01 | 0.11 ± 0.01 | 0.89 ± 0.02 |
| LLaMa-2 7B | 0.45 ± 0.02 | 0.36 ± 0.01 | 0.54 ± 0.01 | 0.20 ± 0.02 | 0.51 ± 0.05 | 0.51 ± 0.01 | 0.33 ± 0.01 | 0.65 ± 0.01 | 0.15 ± 0.01 | 0.80 ± 0.02 |
| Vietcuna 7B | 0.44 ± 0.02 | 0.27 ± 0.01 | 0.51 ± 0.01 | 0.23 ± 0.02 | 0.53 ± 0.05 | 0.49 ± 0.01 | 0.25 ± 0.03 | 0.46 ± 0.01 | 0.33 ± 0.01 | 0.34 ± 0.03 |
| MixSUra 8x7B | 0.59 ± - | 0.59 ± - | 0.55 ± - | 0.34 ± - | 0.52 ± - | 0.69 ± - | 0.44 ± - | 0.61 ± - | 0.29 ± - | 0.66 ± - |
| GPT-3.5 | 0.64 ± 0.01 | 0.60 ± 0.01 | - | 0.31 ± 0.01 | 0.54 ± 0.05 | 0.86 ± 0.01 | 0.71 ± 0.01 | - | 0.53 ± 0.01 | 0.86 ± 0.02 |
| GPT-4 | 0.74 ± 0.00 | 0.73 ± 0.00 | - | 0.41 ± 0.00 | 0.71 ± 0.00 | 0.83 ± 0.00 | 0.70 ± 0.00 | - | 0.50 ± 0.00 | 0.85 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/robustness-aware/summarization.md b/_pages/kr/robustness-aware/summarization.md index e4e847d..4533611 100644 --- a/_pages/kr/robustness-aware/summarization.md +++ b/_pages/kr/robustness-aware/summarization.md @@ -3,204 +3,132 @@ layout: default permalink: /leaderboard/kr/robustness-aware/summarization --- # Robustness-Aware Summarization Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews R1 | VietNews R2 | VietNews RL | VietNews SC | VietNews BS | VietNews Cv | VietNews De | VietNews Cp | WikiLingua R1 | WikiLingua R2 | WikiLingua RL | WikiLingua SC | WikiLingua BS | WikiLingua Cv | WikiLingua De | WikiLingua Cp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.34 ± 0.00 | 0.15 ± 0.00 | 0.23 ± 0.00 | -0.06 ± 0.00 | -0.11 ± 0.18 | 0.10 ± 0.00 | 0.10 ± 0.00 | 39.63 ± 0.87 | 0.28 ± 0.00 | 0.11 ± 0.00 | 0.19 ± 0.00 | -0.16 ± 0.00 | 0.25 ± 0.23 | 0.50 ± 0.01 | 0.50 ± 0.01 | 167.42 ± 7.09 |
| URA-LLaMa 13B | 0.35 ± 0.00 | 0.14 ± 0.00 | 0.23 ± 0.00 | -0.09 ± 0.00 | -0.07 ± 0.17 | 0.64 ± 0.00 | 0.65 ± 0.00 | 134.65 ± 3.76 | 0.20 ± 0.00 | 0.07 ± 0.00 | 0.13 ± 0.00 | -0.17 ± 0.00 | 0.20 ± 0.11 | 0.38 ± 0.00 | 0.38 ± 0.00 | 103.69 ± 3.33 |
| URA-LLaMa 7B | 0.37 ± 0.00 | 0.12 ± 0.00 | 0.24 ± 0.00 | -0.10 ± 0.00 | -0.24 ± 0.18 | 0.65 ± 0.00 | 0.65 ± 0.00 | 17.92 ± 0.87 | 0.37 ± 0.00 | 0.12 ± 0.00 | 0.24 ± 0.00 | -0.17 ± 0.00 | 0.11 ± 0.18 | 0.65 ± 0.00 | 0.65 ± 0.00 | 20.49 ± 0.95 |
| LLaMa-2 13B | 0.05 ± 0.00 | 0.01 ± 0.00 | 0.04 ± 0.00 | -0.15 ± 0.00 | -0.24 ± 0.18 | 0.03 ± 0.00 | 0.03 ± 0.00 | 55.91 ± 0.65 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 | -0.17 ± 0.00 | 0.09 ± 0.00 | 0.05 ± 0.00 | 0.05 ± 0.00 | 66.85 ± 6.72 |
| LLaMa-2 7B | 0.05 ± 0.00 | 0.01 ± 0.00 | 0.05 ± 0.00 | -0.10 ± 0.00 | -0.19 ± 0.04 | 0.07 ± 0.00 | 0.07 ± 0.00 | 55.29 ± 0.88 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.04 ± 0.00 | -0.17 ± 0.00 | 0.15 ± 0.00 | 0.06 ± 0.00 | 0.06 ± 0.00 | 58.32 ± 3.32 |
| Vietcuna 7B | 0.03 ± 0.00 | 0.01 ± 0.00 | 0.02 ± 0.00 | -0.10 ± 0.00 | -0.18 ± 0.06 | 0.91 ± 0.00 | 0.91 ± 0.00 | 1026.61 ± 3.86 | 0.08 ± 0.00 | 0.02 ± 0.00 | 0.05 ± 0.00 | -0.17 ± 0.00 | -0.19 ± 0.05 | 0.78 ± 0.00 | 0.78 ± 0.00 | 505.45 ± 8.64 |
| MixSUra 8x7B | 0.41 ± - | 0.19 ± - | 0.26 ± - | - ± - | -0.03 ± - | 0.86 ± - | 0.87 ± - | 29.15 ± - | 0.46 ± - | 0.21 ± - | 0.28 ± - | - ± - | 0.26 ± - | 0.88 ± - | 0.98 ± - | 19.10 ± - |
| GPT-3.5 | 0.34 ± 0.00 | 0.19 ± 0.00 | 0.23 ± 0.00 | -0.10 ± 0.00 | 0.05 ± 0.14 | 0.81 ± 0.00 | 0.81 ± 0.00 | 128.44 ± 2.94 | 0.39 ± 0.00 | 0.19 ± 0.00 | 0.25 ± 0.00 | -0.17 ± 0.00 | 0.28 ± 0.11 | 0.82 ± 0.00 | 0.82 ± 0.00 | 200.90 ± 7.40 |
| GPT-4 | 0.39 ± 0.00 | 0.21 ± 0.00 | 0.26 ± 0.00 | -0.10 ± 0.09 | 0.04 ± 0.00 | 0.83 ± 0.00 | 0.83 ± 0.71 | 24.48 ± 0.00 | 0.45 ± 0.00 | 0.20 ± 0.00 | 0.27 ± 0.00 | -0.17 ± 0.00 | 0.28 ± 0.00 | 0.80 ± 0.03 | 0.81 ± 0.00 | 20.40 ± 1.59 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/robustness-aware/text-classification.md b/_pages/kr/robustness-aware/text-classification.md index 3600278..1951d98 100644 --- a/_pages/kr/robustness-aware/text-classification.md +++ b/_pages/kr/robustness-aware/text-classification.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/kr/robustness-aware/text-classification --- # Robustness-Aware Text Classification Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | UiT-VSMEC AC | UiT-VSMEC F1 | UiT-VSMEC AR | UiT-VSMEC ECE | UiT-VSMEC A@10 | PhoATIS AC | PhoATIS F1 | PhoATIS AR | PhoATIS ECE | PhoATIS A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.25 ± 0.00 | 0.16 ± 0.00 | 0.56 ± 0.02 | 0.20 ± 0.00 | 0.33 ± 0.00 | 0.16 ± 0.02 | 0.26 ± 0.03 | 0.79 ± 0.00 | 0.79 ± 0.02 | 0.08 ± 0.06 |
| URA-LLaMa 13B | 0.30 ± 0.00 | 0.11 ± 0.00 | 0.51 ± 0.01 | 0.26 ± 0.00 | 0.44 ± 0.00 | 0.01 ± 0.01 | 0.05 ± 0.01 | 0.47 ± 0.01 | 0.84 ± 0.01 | 0.00 ± 0.04 |
| URA-LLaMa 7B | 0.29 ± 0.00 | 0.10 ± 0.00 | 0.57 ± 0.01 | 0.17 ± 0.00 | 0.30 ± 0.00 | 0.02 ± 0.01 | 0.04 ± 0.00 | 0.55 ± 0.01 | 0.18 ± 0.01 | 0.01 ± 0.02 |
| LLaMa-2 13B | 0.19 ± 0.00 | 0.07 ± 0.00 | 0.52 ± 0.01 | 0.47 ± 0.00 | 0.43 ± 0.00 | 0.02 ± 0.00 | 0.06 ± 0.00 | 0.57 ± 0.01 | 0.91 ± 0.00 | 0.01 ± 0.00 |
| LLaMa-2 7B | 0.17 ± 0.00 | 0.10 ± 0.00 | 0.55 ± 0.00 | 0.33 ± 0.00 | 0.29 ± 0.00 | 0.01 ± 0.01 | 0.00 ± 0.00 | 0.56 ± 0.00 | 0.69 ± 0.01 | 0.02 ± 0.02 |
| Vietcuna 7B | 0.09 ± 0.00 | 0.09 ± 0.00 | 0.51 ± 0.01 | 0.91 ± 0.00 | 0.09 ± 0.00 | 0.02 ± 0.01 | 0.01 ± 0.00 | 0.55 ± 0.01 | 0.23 ± 0.01 | 0.02 ± 0.01 |
| MixSUra 8x7B | 0.35 ± - | 0.27 ± - | 0.70 ± - | 0.58 ± - | 0.70 ± - | 0.80 ± - | 55 ± - | 0.94 ± - | 0.15 ± - | 0.88 ± - |
| GPT-3.5 | 0.42 ± 0.00 | 0.41 ± 0.00 | - | 0.28 ± 0.00 | 0.30 ± 0.00 | 0.68 ± 0.02 | 0.64 ± 0.03 | - | 0.62 ± 0.02 | 0.70 ± 0.05 |
| GPT-4 | 0.48 ± 0.00 | 0.45 ± 0.00 | - | 0.33 ± 0.00 | 0.40 ± 0.00 | 0.86 ± 0.01 | 0.80 ± 0.02 | - | 0.80 ± 0.01 | 0.91 ± 0.03 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/robustness-aware/toxicity-detection.md b/_pages/kr/robustness-aware/toxicity-detection.md index 275a770..1f72cd4 100644 --- a/_pages/kr/robustness-aware/toxicity-detection.md +++ b/_pages/kr/robustness-aware/toxicity-detection.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/kr/robustness-aware/toxicity-detection --- # Robustness-Aware Toxicity Detection Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | UiT-ViCTSD AC | UiT-ViCTSD F1 | UiT-ViCTSD AR | UiT-ViCTSD ECE | UiT-ViCTSD A@10 | UiT-ViHSD AC | UiT-ViHSD F1 | UiT-ViHSD AR | UiT-ViHSD ECE | UiT-ViHSD A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.32 ± 0.00 | 0.21 ± 0.00 | 0.72 ± 0.01 | 0.62 ± 0.00 | 0.33 ± 0.00 | 0.14 ± 0.00 | 0.12 ± 0.00 | 0.64 ± 0.02 | 0.61 ± 0.00 | 0.23 ± 0.00 |
| URA-LLaMa 13B | 0.27 ± 0.00 | 0.26 ± 0.00 | 0.56 ± 0.00 | 0.56 ± 0.00 | 0.12 ± 0.00 | 0.18 ± 0.00 | 0.11 ± 0.00 | 0.57 ± 0.01 | 0.45 ± 0.00 | 0.20 ± 0.00 |
| URA-LLaMa 7B | 0.22 ± 0.00 | 0.21 ± 0.00 | 0.63 ± 0.00 | 0.39 ± 0.00 | 0.36 ± 0.00 | 0.12 ± 0.00 | 0.07 ± 0.00 | 0.62 ± 0.00 | 0.38 ± 0.00 | 0.19 ± 0.00 |
| LLaMa-2 13B | 0.12 ± 0.00 | 0.11 ± 0.00 | 0.56 ± 0.01 | 0.66 ± 0.00 | 0.12 ± 0.00 | 0.10 ± 0.00 | 0.07 ± 0.00 | 0.59 ± 0.01 | 0.62 ± 0.00 | 0.24 ± 0.00 |
| LLaMa-2 7B | 0.04 ± 0.00 | 0.04 ± 0.00 | 0.62 ± 0.00 | 0.86 ± 0.00 | 0.02 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.54 ± 0.00 | 0.79 ± 0.00 | 0.00 ± 0.00 |
| Vietcuna 7B | 0.11 ± 0.00 | 0.11 ± 0.00 | 0.54 ± 0.00 | 0.39 ± 0.00 | 0.13 ± 0.00 | 0.09 ± 0.00 | 0.05 ± 0.00 | 0.5 ± 0.00 | 0.24 ± 0.00 | 0.08 ± 0.00 |
| MixSUra 8x7B | 0.72 ± - | 0.39 ± - | - ± - | 0.25 ± - | 0.81 ± - | 0.66 ± - | 0.31 ± - | 0.67 ± - | 0.21 ± - | 0.82 ± - |
| GPT-3.5 | 0.51 ± 0.00 | 0.46 ± 0.00 | 0.5 ± 0.00 | 0.01 ± 0.00 | 0.54 ± 0.00 | 0.64 ± 0.00 | 0.47 ± 0.00 | - ± - | 0.30 ± 0.00 | 0.63 ± 0.00 |
| GPT-4 | 0.88 ± 0.00 | 0.71 ± 0.00 | - ± - | 0.38 ± 0.00 | 0.88 ± 0.00 | 0.78 ± 0.00 | 0.56 ± 0.00 | - ± - | 0.44 ± 0.00 | 0.78 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/robustness-aware/translation.md b/_pages/kr/robustness-aware/translation.md index a710c7a..e116582 100644 --- a/_pages/kr/robustness-aware/translation.md +++ b/_pages/kr/robustness-aware/translation.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/kr/robustness-aware/translation --- # Robustness-Aware Translation Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | PhoMT (En → Vi) | PhoMT (Vi → En) | PhoMT (En → Vi) | PhoMT (Vi → En) | OPUS100 (En → Vi) | OPUS100 (Vi → En) | OPUS100 (En → Vi) | OPUS100 (Vi → En) |
|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.25 ± 0.00 | 0.58 ± 0.00 | 0.11 ± 0.00 | 0.51 ± 0.00 | 0.05 ± 0.00 | 0.40 ± 0.01 | 0.06 ± 0.00 | 0.36 ± 0.00 |
| URA-LLaMa 13B | 0.23 ± 0.00 | 0.55 ± 0.00 | 0.10 ± 0.00 | 0.50 ± 0.00 | 0.03 ± 0.00 | 0.38 ± 0.01 | 0.05 ± 0.00 | 0.38 ± 0.00 |
| URA-LLaMa 7B | 0.15 ± 0.00 | 0.48 ± 0.00 | 0.06 ± 0.00 | 0.46 ± 0.00 | 0.02 ± 0.00 | 0.35 ± 0.00 | 0.03 ± 0.00 | 0.34 ± 0.01 |
| LLaMa-2 13B | 0.20 ± 0.00 | 0.51 ± 0.00 | 0.07 ± 0.00 | 0.44 ± 0.00 | 0.03 ± 0.00 | 0.36 ± 0.01 | 0.04 ± 0.00 | 0.32 ± 0.00 |
| LLaMa-2 7B | 0.13 ± 0.00 | 0.41 ± 0.00 | 0.05 ± 0.00 | 0.42 ± 0.00 | 0.02 ± 0.00 | 0.31 ± 0.00 | 0.03 ± 0.00 | 0.30 ± 0.00 |
| Vietcuna 7B | 0.17 ± 0.00 | 0.43 ± 0.00 | 0.07 ± 0.01 | 0.41 ± 0.00 | 0.09 ± 0.01 | 0.38 ± 0.01 | 0.09 ± 0.01 | 0.33 ± 0.00 |
| MixSUra 8x7B | 0.14 ± - | 0.50 ± - | 0.11 ± - | 0.46 ± - | 0.06 ± - | 0.36 ± - | 0.06 ± - | 0.31 ± - |
| GPT-3.5 | 0.31 ± 0.00 | 0.64 ± 0.00 | 0.17 ± 0.00 | 0.59 ± 0.00 | 0.15 ± 0.01 | 0.49 ± 0.01 | 0.21 ± 0.01 | 0.48 ± 0.00 |
| GPT-4 | 0.31 ± 0.00 | 0.65 ± 0.00 | 0.20 ± 0.00 | 0.62 ± 0.00 | 0.16 ± 0.01 | 0.50 ± 0.01 | 0.23 ± 0.01 | 0.51 ± 0.00 |
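Several metric keys in the translation data contain spaces, which is why the template indexes them with bracket notation rather than dot access. A hedged sketch of the assumed key naming (path and values are illustrative, not taken from the real data files):

```yaml
# _data/leaderboard/kr/robustness_aware/translation.yml — assumed key naming, illustrative values
PhoMT:
  GPT-4:
    "BLEU envi": 0.31        # accessed as dataset[1][m]["BLEU envi"] in the Liquid loop
    "BLEU envi_std": 0.00
    "hLEPOR vien": 0.62
    "hLEPOR vien_std": 0.00
```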
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + BLEU envi↑ + BLEU vien↑ + hLEPOR envi↑ + hLEPOR vien↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + {% assign bleu_envi_best = 0 %} + {% assign bleu_vien_best = 0 %} + {% assign hlepor_envi_best = 0 %} + {% assign hlepor_vien_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["BLEU envi"] and dataset[1][m]["BLEU envi"] > bleu_envi_best %} + {% assign bleu_envi_best = dataset[1][m]["BLEU envi"] %} + {% endif %} + {% if dataset[1][m]["BLEU vien"] and dataset[1][m]["BLEU vien"] > bleu_vien_best %} + {% assign bleu_vien_best = dataset[1][m]["BLEU vien"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR envi"] and dataset[1][m]["hLEPOR envi"] > hlepor_envi_best %} + {% assign hlepor_envi_best = dataset[1][m]["hLEPOR envi"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR vien"] and dataset[1][m]["hLEPOR vien"] > hlepor_vien_best %} + {% assign hlepor_vien_best = dataset[1][m]["hLEPOR vien"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["BLEU envi"] %} + {{ dataset[1][model]["BLEU envi"] | round: 2 }} ± {{ dataset[1][model]["BLEU envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["BLEU vien"] %} + {{ dataset[1][model]["BLEU vien"] | round: 2 }} ± {{ dataset[1][model]["BLEU vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["hLEPOR envi"] %} + {{ dataset[1][model]["hLEPOR envi"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["hLEPOR vien"] %} + {{ dataset[1][model]["hLEPOR vien"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/weaker-prompt/question-answering.md b/_pages/kr/weaker-prompt/question-answering.md index 9e75486..f15a887 100644 --- a/_pages/kr/weaker-prompt/question-answering.md +++ b/_pages/kr/weaker-prompt/question-answering.md @@ -3,63 +3,60 @@ layout: default permalink: /leaderboard/kr/weaker-prompt/question-answering --- # Weak-Prompt Question Answering Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.21 ± 0.01 | 0.47 ± 0.01 | 0.14 ± 0.01 | 0.41 ± 0.00 |
| URA-LLaMa 13B | 0.22 ± 0.01 | 0.43 ± 0.01 | 0.17 ± 0.01 | 0.40 ± 0.01 |
| URA-LLaMa 7B | 0.13 ± 0.00 | 0.32 ± 0.00 | 0.10 ± 0.00 | 0.32 ± 0.00 |
| LLaMa-2 13B | 0.04 ± 0.00 | 0.28 ± 0.00 | 0.04 ± 0.00 | 0.28 ± 0.00 |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.24 ± 0.00 | 0.05 ± 0.00 | 0.24 ± 0.00 |
| MixSUra 8x7b | 0.13 ± - | 0.38 ± - | 0.09 ± - | 0.36 ± - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/weaker-prompt/summarization.md b/_pages/kr/weaker-prompt/summarization.md index 76b649e..c079f68 100644 --- a/_pages/kr/weaker-prompt/summarization.md +++ b/_pages/kr/weaker-prompt/summarization.md @@ -3,147 +3,132 @@ layout: default permalink: /leaderboard/kr/weaker-prompt/summarization --- # Weak-Prompt Summarization Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews R1 | VietNews R2 | VietNews RL | VietNews SC | VietNews BS | VietNews Cv | VietNews De | VietNews Cp | WikiLingua R1 | WikiLingua R2 | WikiLingua RL | WikiLingua SC | WikiLingua BS | WikiLingua Cv | WikiLingua De | WikiLingua Cp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.49 ± 0.00 | 0.23 ± 0.00 | 0.31 ± 0.00 | -0.08 ± 0.00 | 0.05 ± 0.11 | 0.89 ± 0.00 | 8.90 ± 0.03 | 18.48 ± 0.59 | 0.47 ± 0.00 | 0.20 ± 0.00 | 0.29 ± 0.00 | -0.16 ± 0.00 | 0.19 ± 0.13 | 0.86 ± 0.00 | 6.83 ± 0.09 | 25.30 ± 1.86 |
| URA-LLaMa 13B | 0.27 ± 0.00 | 0.12 ± 0.00 | 0.18 ± 0.00 | -0.09 ± 0.00 | 0.05 ± 0.11 | 0.56 ± 0.00 | 5.00 ± 0.04 | 153.55 ± 0.99 | 0.22 ± 0.00 | 0.09 ± 0.00 | 0.14 ± 0.00 | -0.16 ± 0.00 | 0.20 ± 0.00 | 70.48 ± 0.00 | 3.49 ± 0.04 | 190.09 ± 4.92 |
| URA-LLaMa 7B | 0.45 ± 0.00 | 0.21 ± 0.00 | 0.29 ± 0.00 | -0.08 ± 0.00 | 0.03 ± 0.09 | 0.91 ± 0.00 | 9.43 ± 0.03 | 6.42 ± 0.05 | 0.42 ± 0.00 | 0.18 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.07 ± 0.12 | 0.89 ± 0.00 | 7.58 ± 0.05 | 7.14 ± 0.14 |
| LLaMa-2 13B | 0.45 ± 0.00 | 0.22 ± 0.00 | 0.29 ± 0.00 | -0.09 ± 0.00 | 0.00 ± 0.14 | 0.92 ± 0.00 | 9.49 ± 0.02 | 8.46 ± 0.29 | 0.47 ± 0.00 | 0.22 ± 0.00 | 0.29 ± 0.00 | -0.16 ± 0.00 | 0.34 ± 0.12 | 0.92 ± 0.00 | 9.39 ± 0.05 | 17.94 ± 2.84 |
| LLaMa-2 7B | 0.36 ± 0.00 | 0.17 ± 0.00 | 0.23 ± 0.00 | -0.09 ± 0.00 | -0.15 ± 0.12 | 0.69 ± 0.00 | 6.35 ± 0.03 | 7.59 ± 0.21 | 0.45 ± 0.00 | 0.20 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.36 ± 0.00 | 0.83 ± 0.00 | 7.71 ± 0.07 | 12.39 ± 1.46 |
| MixSUra 8x7B | 0.44 ± - | 0.22 ± - | 0.29 ± - | - ± - | 0.07 ± - | 0.97 ± - | 35.67 ± - | 9.43 ± - | 0.47 ± - | 0.22 ± - | 0.29 ± - | - ± - | 0.19 ± - | 0.97 ± - | 28.97 ± - | 10.27 ± - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/information-retrieval.md b/_pages/kr/zero-shot/information-retrieval.md index b43d715..1f21b31 100644 --- a/_pages/kr/zero-shot/information-retrieval.md +++ b/_pages/kr/zero-shot/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/kr/zero-shot/information-retrieval --- # Zero-Shot Information Retrieval Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | mMARCO M@10 | mMARCO M@10B | mMARCO N@10 | mMARCO N@10B | mRobust04 M@10 | mRobust04 M@10B | mRobust04 N@10 | mRobust04 N@10B |
|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | - | - | - | - | - | - | - | - |
| URA-LLaMa 13B | - | - | - | - | - | - | - | - |
| URA-LLaMa 7B | 0.06 ± 0.00 | 0.14 ± 0.00 | 0.09 ± 0.00 | 0.21 ± 0.00 | - | - | - | - |
| LLaMa-2 13B | - | - | - | - | - | - | - | - |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.11 ± 0.00 | 0.08 ± 0.00 | 0.17 ± 0.00 | - | - | - | - |
| Vietcuna 7B | - | - | - | - | - | - | - | - |
| GPT-3.5 | - | - | - | - | - | - | - | - |
| GPT-4 | - | - | - | - | - | - | - | - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/knowledge.md b/_pages/kr/zero-shot/knowledge.md index 10caf83..d54075f 100644 --- a/_pages/kr/zero-shot/knowledge.md +++ b/_pages/kr/zero-shot/knowledge.md @@ -2,105 +2,129 @@ layout: default permalink: /leaderboard/kr/zero-shot/knowledge --- -# Zero-Shot Knowledge Leaderboard +# Zero-shot Knowledge Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | ZaloE2E EM | ZaloE2E F1 | ViMMRC AC | ViMMRC F1 | ViMMRC AR | ViMMRC ECE | ViMMRC A@10 |
|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.28 ± 0.02 | 0.44 ± 0.02 | 0.80 ± 0.02 | 0.80 ± 0.02 | 0.85 ± 0.01 | 0.10 ± 0.02 | 0.96 ± 0.03 |
| URA-LLaMa 13B | 0.12 ± 0.01 | 0.22 ± 0.01 | 0.40 ± 0.02 | 0.31 ± 0.02 | 0.57 ± 0.02 | 0.48 ± 0.02 | 0.42 ± 0.08 |
| URA-LLaMa 7B | 0.09 ± 0.01 | 0.20 ± 0.02 | 0.30 ± 0.02 | 0.10 ± 0.01 | 0.56 ± 0.02 | 0.27 ± 0.02 | 0.56 ± 0.07 |
| LLaMa-2 13B | 0.06 ± 0.01 | 0.10 ± 0.01 | 0.52 ± 0.02 | 0.41 ± 0.02 | 0.64 ± 0.02 | 0.33 ± 0.02 | 0.73 ± 0.07 |
| LLaMa-2 7B | 0.03 ± 0.01 | 0.07 ± 0.01 | 0.37 ± 0.02 | 0.25 ± 0.02 | 0.51 ± 0.02 | 0.35 ± 0.02 | 0.29 ± 0.06 |
| Vietcuna 7B | 0.03 ± 0.01 | 0.06 ± 0.01 | 0.32 ± 0.02 | 0.22 ± 0.02 | 0.50 ± 0.00 | 0.07 ± 0.02 | 0.33 ± 0.07 |
| GPT-3.5 | 0.37 ± 0.02 | 0.56 ± 0.02 | 0.90 ± 0.01 | 0.72 ± 0.01 | - | 0.65 ± 0.01 | 0.90 ± 0.04 |
| GPT-4 | 0.38 ± 0.02 | 0.55 ± 0.02 | 0.92 ± 0.01 | 0.73 ± 0.06 | - | 0.67 ± 0.01 | 0.90 ± 0.04 |
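The knowledge templates branch on a per-dataset `num_fields` flag: when it equals 2 they render only the EM/F1 columns (open-ended ZaloE2E), otherwise the AC/F1/AR/ECE/A@10 columns (multiple-choice ViMMRC). A sketch of where that flag is assumed to live (structure and values are illustrative):

```yaml
# _data/leaderboard/kr/zero_shot/knowledge.yml — assumed structure, illustrative values
ZaloE2E:
  num_fields: 2      # triggers the two-column EM/F1 layout in the template
  GPT-4:
    EM: 0.38
    EM_std: 0.02
    F1: 0.55
    F1_std: 0.02
ViMMRC:              # no `num_fields: 2`, so AC/F1/AR/ECE/A@10 are rendered
  GPT-4:
    AC: 0.92
    AC_std: 0.01
```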
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/language-modeling.md b/_pages/kr/zero-shot/language-modeling.md index 2c2a949..64e1df9 100644 --- a/_pages/kr/zero-shot/language-modeling.md +++ b/_pages/kr/zero-shot/language-modeling.md @@ -3,149 +3,108 @@ layout: default permalink: /leaderboard/kr/zero-shot/language-modeling --- # Zero-Shot Language Modeling Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | MLQA-MLM EM | MLQA-MLM CER | MLQA-MLM WER | MLQA-MLM CED | MLQA-MLM WED | MLQA-MLM PLX | VSEC EM | VSEC CER | VSEC WER | VSEC CED | VSEC WED | VSEC PLX |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.00 ± 0.00 | 0.50 ± 0.01 | 0.64 ± 0.01 | 519.09 ± 10.96 | 115.82 ± 2.45 | 1.08 ± 0.01 | 0.00 ± 0.00 | 0.88 ± 0.00 | 1.01 ± 0.00 | 113.51 ± 0.57 | 29.91 ± 0.15 | 1.09 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.67 ± 0.00 | 0.78 ± 0.00 | 697.85 ± 11.62 | 161.34 ± 2.64 | 1.16 ± 0.02 | 0.01 ± 0.00 | 0.42 ± 0.01 | 0.56 ± 0.01 | 54.88 ± 0.77 | 14.50 ± 0.19 | 1.26 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.73 ± 0.00 | 0.88 ± 0.01 | 684.00 ± 13.18 | 166.87 ± 3.18 | 1.25 ± 0.01 | 0.01 ± 0.00 | 3.33 ± 0.04 | 3.14 ± 0.03 | 420.34 ± 5.66 | 85.79 ± 0.96 | 1.33 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.90 ± 0.00 | 1.00 ± 0.00 | 881.97 ± 11.23 | 208.52 ± 2.52 | 1.10 ± 0.01 | 0.00 ± 0.00 | 1.32 ± 0.01 | 1.40 ± 0.01 | 160.06 ± 1.16 | 38.12 ± 0.23 | 1.11 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.95 ± 0.00 | 1.07 ± 0.01 | 860.42 ± 13.18 | 210.21 ± 3.18 | 1.25 ± 0.01 | 0.00 ± 0.00 | 1.54 ± 0.04 | 1.55 ± 0.03 | 171.28 ± 5.66 | 40.18 ± 0.96 | 1.14 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 1.00 ± 0.00 | 1.00 ± 0.00 | 951.53 ± 12.37 | 208.57 ± 2.73 | 1.48 ± 0.01 | 0.01 ± 0.00 | 1.11 ± 0.01 | 1.20 ± 0.01 | 139.90 ± 1.39 | 33.94 ± 0.33 | 1.61 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.34 ± 0.01 | 0.50 ± 0.01 | 422.30 ± 10.79 | 100.33 ± 2.44 | - | 0.02 ± 0.00 | 0.16 ± 0.00 | 0.30 ± 0.00 | 12.63 ± 0.34 | 3.48 ± 0.09 | - |
| GPT-4 | 0.04 ± 0.00 | 0.40 ± 0.01 | 0.45 ± 0.01 | 381.88 ± 10.26 | 93.34 ± 2.39 | - | 0.60 ± 0.01 | 0.14 ± 0.00 | 0.26 ± 0.00 | 13.58 ± 0.45 | 3.67 ± 0.12 | - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + EM↑ + CER↓ + WER↓ + CED↓ + WED↓ + PLX↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/question-answering.md b/_pages/kr/zero-shot/question-answering.md index cfe29fc..e4eb8c2 100644 --- a/_pages/kr/zero-shot/question-answering.md +++ b/_pages/kr/zero-shot/question-answering.md @@ -3,77 +3,60 @@ layout: default permalink: /leaderboard/kr/zero-shot/question-answering --- # Zero-Shot Question Answering Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.06 ± 0.00 | 0.30 ± 0.00 | 0.04 ± 0.00 | 0.28 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.14 ± 0.00 | 0.00 ± 0.00 | 0.15 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.14 ± 0.00 | 0.00 ± 0.00 | 0.16 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.02 | 0.05 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.05 ± 0.00 | 0.00 ± 0.00 | 0.06 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.24 ± 0.00 | 0.00 ± 0.00 | 0.25 ± 0.00 |
| GPT-4 | 0.00 ± 0.00 | 0.27 ± 0.00 | 0.00 ± 0.00 | 0.27 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/reasoning.md b/_pages/kr/zero-shot/reasoning.md index 8dd405a..4eac335 100644 --- a/_pages/kr/zero-shot/reasoning.md +++ b/_pages/kr/zero-shot/reasoning.md @@ -3,123 +3,72 @@ layout: default permalink: /leaderboard/kr/zero-shot/reasoning --- # Zero-Shot Reasoning Leaderboard +{% assign lang = 'kr' %} - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + + {% endfor %} - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m]["Equ"] and dataset[1][m]["Equ"] > Equ_best %} + {% assign Equ_best = dataset[1][m]["Equ"] %} + {% endif %} + {% endfor %} + + + + {% endfor %} + {% endfor %} -
| Models | SR - Natural EM | SR - Natural F1 | SR - Natural Equ. | SR - Abstract symbol EM | SR - Abstract symbol F1 | SR - Abstract symbol Equ. | MATH EM | MATH F1 | MATH Equ. |
|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.06 ± 0.00 | 0.34 ± 0.00 | 0.06 ± 0.00 | 0.02 ± 0.00 | 0.24 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.24 ± 0.02 |
| URA-LLaMa 13B | 0.01 ± 0.00 | 0.31 ± 0.00 | 0.02 ± 0.00 | 0.02 ± 0.00 | 0.24 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.14 ± 0.02 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.26 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.17 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.01 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.06 ± 0.00 | 0.00 ± 0.00 | 0.02 ± 0.00 | 0.19 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.16 ± 0.02 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.06 ± 0.01 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.10 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 |
| GPT-3.5 | 0.21 ± 0.00 | 0.59 ± 0.00 | 0.32 ± 0.00 | 0.09 ± 0.00 | 0.28 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.72 ± 0.02 |
+        Models
+        {{ dataset[0] }}
+        EM↑
+        F1↑
+        Equ↑
GPT-40.21 ± 0.000.59 ± 0.000.32 ± 0.000.09 ± 0.000.28 ± 0.000.13 ± 0.000.00 ± 0.000.01 ± 0.000.76 ± 0.02 + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["Equ"] %} + {{ dataset[1][model]["Equ"] | round: 2 }} ± {{ dataset[1][model]["Equ_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/zero-shot/sentiment-analysis.md b/_pages/kr/zero-shot/sentiment-analysis.md index 99c521a..88dccd8 100644 --- a/_pages/kr/zero-shot/sentiment-analysis.md +++ b/_pages/kr/zero-shot/sentiment-analysis.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/kr/zero-shot/sentiment-analysis --- # Zero-Shot Sentiment Analysis Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
| Models | VLSP 2016 AC | VLSP 2016 F1 | VLSP 2016 AR | VLSP 2016 ECE | VLSP 2016 A@10 | UiT-VSFC AC | UiT-VSFC F1 | UiT-VSFC AR | UiT-VSFC ECE | UiT-VSFC A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.63 ± 0.02 | 0.63 ± 0.02 | 0.74 ± 0.01 | 0.15 ± 0.01 | 0.87 ± 0.03 | 0.64 ± 0.01 | 0.54 ± 0.01 | 0.85 ± 0.01 | 0.14 ± 0.00 | 0.98 ± 0.01 |
| URA-LLaMa 13B | 0.52 ± 0.02 | 0.35 ± 0.01 | 0.60 ± 0.01 | 0.10 ± 0.01 | 0.64 ± 0.05 | 0.70 ± 0.01 | 0.40 ± 0.01 | 0.72 ± 0.01 | 0.23 ± 0.01 | 0.95 ± 0.01 |
| URA-LLaMa 7B | 0.35 ± 0.02 | 0.24 ± 0.01 | 0.54 ± 0.01 | 0.24 ± 0.01 | 0.31 ± 0.05 | 0.27 ± 0.01 | 0.18 ± 0.00 | 0.52 ± 0.01 | 0.37 ± 0.01 | 0.03 ± 0.01 |
| LLaMa-2 13B | 0.25 ± 0.01 | 0.25 ± 0.01 | 0.49 ± 0.01 | 0.39 ± 0.01 | 0.29 ± 0.05 | 0.29 ± 0.01 | 0.24 ± 0.01 | 0.52 ± 0.01 | 0.42 ± 0.01 | 0.30 ± 0.03 |
| LLaMa-2 7B | 0.15 ± 0.01 | 0.15 ± 0.01 | 0.58 ± 0.01 | 0.73 ± 0.01 | 0.12 ± 0.03 | 0.04 ± 0.00 | 0.06 ± 0.01 | 0.49 ± 0.01 | 0.79 ± 0.00 | 0.01 ± 0.01 |
| Vietcuna 7B | 0.11 ± 0.01 | 0.12 ± 0.01 | 0.49 ± 0.01 | 0.68 ± 0.01 | 0.11 ± 0.03 | 0.05 ± 0.00 | 0.06 ± 0.00 | 0.56 ± 0.01 | 0.73 ± 0.00 | 0.05 ± 0.01 |
| MixSUra 8x7B | 0.45 ± - | 0.30 ± - | 0.62 ± - | 0.50 ± - | 0.49 ± - | 0.55 ± - | 0.40 ± - | 0.66 ± - | 0.41 ± - | 0.60 ± - |
| Gemini Pro | 0.64 ± - | 0.47 ± - | - | 0.31 ± - | 0.53 ± - | 0.76 ± - | 0.49 ± - | - | 0.43 ± - | 0.77 ± - |
| GPT-3.5 | 0.62 ± 0.02 | 0.56 ± 0.01 | - | 0.29 ± 0.02 | 0.62 ± 0.05 | 0.81 ± 0.31 | 0.68 ± 0.31 | - | 0.48 ± 0.01 | 0.83 ± 0.02 |
+        Models
+        {{ dataset[0] }}
+        AC↑
+        F1↑
+        AR↑
+        ECE↓
+        A@10↑
GPT-40.71 ± 0.010.68 ± 0.01-0.37 ± 0.010.70 ± 0.040.80 ± 0.010.67 ± 0.01-0.47 ± 0.010.85 ± 0.02 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/kr/zero-shot/summarization.md b/_pages/kr/zero-shot/summarization.md index d878b05..25b89a1 100644 --- a/_pages/kr/zero-shot/summarization.md +++ b/_pages/kr/zero-shot/summarization.md @@ -3,185 +3,132 @@ layout: default permalink: /leaderboard/kr/zero-shot/summarization --- # Zero-Shot Summarization Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVietNewsWikiLingua
R1R2RLSCBSCvDeCpR1R2RLSCBSCvDeCp
URA-LLaMa 70B0.42 ± 0.170.21 ± 0.120.28 ± 0.00-0.11 ± 0.000.03 ± 0.190.85 ± 0.0014.59 ± 0.0517.21 ± 0.330.37 ± 0.000.16 ± 0.000.24 ± 0.00-0.22 ± 0.000.26 ± 0.160.17 ± 0.000.22 ± 0.0022.24 ± 0.97
URA-LLaMa 13B0.38 ± 0.000.18 ± 0.000.25 ± 0.00-0.09 ± 0.000.01 ± 0.180.71 ± 0.006.01 ± 0.0724.27 ± 0.610.22 ± 0.000.08 ± 0.000.14 ± 0.00-0.16 ± 0.00-0.13 ± 0.120.42 ± 0.013.06 ± 0.1049.58 ± 1.16
URA-LLaMa 7B0.38 ± 0.000.14 ± 0.000.25 ± 0.00-0.09 ± 0.000.04 ± 0.120.65 ± 0.004.88 ± 0.037.77 ± 0.050.40 ± 0.000.15 ± 0.000.26 ± 0.00-0.16 ± 0.000.19 ± 0.070.73 ± 0.004.79 ± 0.076.22 ± 0.07
LLaMa-2 13B0.06 ± 0.000.02 ± 0.000.04 ± 0.00-0.09 ± 0.00-0.18 ± 0.040.07 ± 0.000.43 ± 0.0128.25 ± 0.240.04 ± 0.000.00 ± 0.000.03 ± 0.00-0.16 ± 0.00-0.11 ± 0.080.03 ± 0.000.07 ± 0.0119.55 ± 0.51
LLaMa-2 7B0.06 ± 0.000.01 ± 0.000.05 ± 0.00-0.09 ± 0.00-0.23 ± 0.040.06 ± 0.000.21 ± 0.0015.75 ± 0.200.04 ± 0.000.00 ± 0.000.03 ± 0.00-0.16 ± 0.00-0.14 ± 0.070.03 ± 0.000.06 ± 0.0017.84 ± 0.50
Vietcuna 7B0.28 ± 0.000.06 ± 0.000.18 ± 0.00-0.09 ± 0.00-0.09 ± 0.090.31 ± 0.000.80 ± 0.01171.63 ± 1.710.24 ± 0.000.06 ± 0.000.15 ± 0.00-0.16 ± 0.00-0.18 ± 0.070.51 ± 0.011.16 ± 0.01238.67 ± 3.37
GPT-3.50.36 ± 0.000.20 ± 0.000.24 ± 0.00-0.09 ± 0.000.04 ± 0.130.86 ± 0.003.97 ± 0.0213.32 ± 0.650.43 ± 0.000.21 ± 0.000.27 ± 0.00-0.16 ± 0.000.22 ± 0.030.87 ± 0.003.29 ± 0.0335.50 ± 0.82
GPT-40.41 ± 0.000.21 ± 0.000.26 ± 0.00-0.08 ± 0.00-0.04 ± 0.110.84 ± 0.003.45 ± 0.0015.43 ± 0.490.44 ± 0.000.21 ± 0.000.27 ± 0.00-0.16 ± 0.000.24 ± 0.040.82 ± 0.002.37 ± 0.016.61 ± 0.16
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/text-classification.md b/_pages/kr/zero-shot/text-classification.md index 9f55058..c297565 100644 --- a/_pages/kr/zero-shot/text-classification.md +++ b/_pages/kr/zero-shot/text-classification.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/kr/zero-shot/text-classification --- # Zero-Shot Text Classification Leaderboard +{% assign lang = 'kr' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - -
ModelsUiT-VSMECPhoATIS
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.40 ± 0.020.32 ± 0.020.68 ± 0.010.14 ± 0.020.60 ± 0.060.56 ± 0.020.48 ± 0.030.85 ± 0.000.25 ± 0.020.56 ± 0.06
URA-LLaMa 13B0.29 ± 0.020.25 ± 0.020.52 ± 0.010.09 ± 0.010.23 ± 0.050.10 ± 0.010.10 ± 0.010.72 ± 0.000.52 ± 0.010.14 ± 0.04
URA-LLaMa 7B0.13 ± 0.010.11 ± 0.010.50 ± 0.010.15 ± 0.010.21 ± 0.050.04 ± 0.010.04 ± 0.020.77 ± 0.000.30 ± 0.010.04 ± 0.02
LLaMa-2 13B0.11 ± 0.010.10 ± 0.010.49 ± 0.010.31 ± 0.010.09 ± 0.040.03 ± 0.010.02 ± 0.000.45 ± 0.010.28 ± 0.010.03 ± 0.02
LLaMa-2 7B0.07 ± 0.010.08 ± 0.010.52 ± 0.010.35 ± 0.010.07 ± 0.030.00 ± 0.060.00 ± 0.060.61 ± 0.010.32 ± 0.000.00 ± 0.00
Vietcuna 7B0.05 ± 0.010.02 ± 0.010.52 ± 0.010.95 ± 0.010.03 ± 0.020.05 ± 0.010.01 ± 0.000.66 ± 0.000.20 ± 0.010.01 ± 0.21
GPT-3.50.43 ± 0.020.37 ± 0.02-0.29 ± 0.020.43 ± 0.060.44 ± 0.020.38 ± 0.03-0.38 ± 0.020.44 ± 0.05
GPT-40.49 ± 0.020.46 ± 0.02-0.35 ± 0.020.50 ± 0.060.89 ± 0.010.69 ± 0.02-0.83 ± 0.010.89 ± 0.03
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/kr/zero-shot/toxicity-detection.md b/_pages/kr/zero-shot/toxicity-detection.md index 184d581..52336ea 100644 --- a/_pages/kr/zero-shot/toxicity-detection.md +++ b/_pages/kr/zero-shot/toxicity-detection.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/kr/zero-shot/toxicity-detection --- # Zero-Shot Toxicity Detection Leaderboard +{% assign lang = 'kr' %} - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign 
AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsUiT-ViCTSDUiT-ViHSD + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
URA-LLaMa 70B0.61 ± 0.010.52 ± 0.010.77 ± 0.010.17 ± 0.010.97 ± 0.010.38 ± 0.010.34 ± 0.010.74 ± 0.010.25 ± 0.010.91 ± 0.01
URA-LLaMa 13B0.46 ± 0.010.28 ± 0.030.53 ± 0.020.22 ± 0.010.48 ± 0.030.33 ± 0.010.18 ± 0.000.60 ± 0.010.35 ± 0.010.54 ± 0.02
URA-LLaMa 7B0.25 ± 0.010.19 ± 0.010.53 ± 0.010.38 ± 0.010.13 ± 0.020.19 ± 0.000.13 ± 0.000.55 ± 0.010.46 ± 0.010.13 ± 0.01
LLaMa-2 13B0.16 ± 0.010.14 ± 0.000.40 ± 0.010.50 ± 0.010.24 ± 0.020.09 ± 0.000.13 ± 0.000.38 ± 0.010.63 ± 0.000.10 ± 0.01
LLaMa-2 7B0.13 ± 0.010.14 ± 0.010.45 ± 0.020.69 ± 0.010.09 ± 0.010.03 ± 0.000.05 ± 0.010.56 ± 0.010.75 ± 0.000.00 ± 0.00
Vietcuna 7B0.09 ± 0.000.07 ± 0.000.50 ± 0.000.41 ± 0.000.10 ± 0.030.07 ± 0.000.04 ± 0.000.50 ± 0.000.26 ± 0.000.07 ± 0.01
GPT-3.50.75 ± 0.010.61 ± 0.02-0.25 ± 0.010.80 ± 0.040.55 ± 0.010.42 ± 0.01-0.22 ± 0.010.55 ± 0.02
GPT-40.89 ± 0.010.69 ± 0.01-0.39 ± 0.010.89 ± 0.030.75 ± 0.010.53 ± 0.01-0.42 ± 0.010.75 ± 0.02 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/leaderboard.md b/_pages/leaderboard.md index 7d2cc16..0d417e0 100644 --- a/_pages/leaderboard.md +++ b/_pages/leaderboard.md @@ -152,6 +152,7 @@ Below are our detail evaluation results, please choose the task and scenario to Zero-shot Few-shot Weaker Prompt + Medium Prompt Fairness Aware Robustness Aware Chain-of-Thought diff --git a/_pages/vi/bias-toxicity/question-answering.md b/_pages/vi/bias-toxicity/question-answering.md index f932c3f..f06e7c4 100644 --- a/_pages/vi/bias-toxicity/question-answering.md +++ b/_pages/vi/bias-toxicity/question-answering.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/vi/bias-toxicity/question-answering --- # Bias-Toxicity Question Answering Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsXQuADMLQA
DRR→|DRG→|SAR→|SAG→|Tox↓DRR→|DRG→|SAR→|SAG→|Tox↓
URA-LLaMa 70B-0.39 ± 0.01-0.41 ± 0.000.02 ± 0.00-0.14 ± 0.02-0.42 ± 0.030.02 ± 0.00
URA-LLaMa 13B-0.39 ± 0.01-0.45 ± 0.010.02 ± 0.00-0.17 ± 0.10-0.38 ± 0.000.02 ± 0.00
URA-LLaMa 7B-0.43 ± 0.01-0.48 ± 0.000.03 ± 0.00-0.18 ± 0.01-0.37 ± 0.010.02 ± 0.00
LLaMa-2 13B-0.35 ± 0.03-0.46 ± 0.000.01 ± 0.00-0.27 ± 0.01-0.43 ± 0.000.01 ± 0.00
LLaMa-2 7B-0.46 ± 0.01-0.42 ± 0.000.01 ± 0.00-0.21 ± 0.06-0.45 ± 0.000.01 ± 0.00
Vietcuna 7B-0.50 ± 0.00--0.04 ± 0.00-0.23 ± 0.09-0.49 ± 0.010.04 ± 0.00
GPT-3.5-0.43 ± 0.01-0.48 ± 0.000.02 ± 0.00-0.18 ± 0.01-0.40 ± 0.000.02 ± 0.00
GPT-4-0.40 ± 0.01-0.45 ± 0.000.02 ± 0.00-0.16 ± 0.01-0.41 ± 0.010.02 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.question_answering %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/bias-toxicity/summarization.md b/_pages/vi/bias-toxicity/summarization.md index a185baf..f93dbf3 100644 --- a/_pages/vi/bias-toxicity/summarization.md +++ b/_pages/vi/bias-toxicity/summarization.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/vi/bias-toxicity/summarization --- # Bias-Toxicity Summarization Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVietNewsWikiLingua
DRR→|DRG→|SAR→|SAG→|Tox↓DRR→|DRG→|SAR→|SAG→|Tox↓
URA-LLaMa 70B-0.21 ± 0.01-0.31 ± 0.010.05 ± 0.00-0.03 ± 0.02-0.25 ± 0.020.03 ± 0.00
URA-LLaMa 13B-0.20 ± 0.01-0.29 ± 0.010.04 ± 0.00-0.07 ± 0.04-0.31 ± 0.030.02 ± 0.00
URA-LLaMa 7B-0.24 ± 0.02-0.33 ± 0.010.04 ± 0.00-0.07 ± 0.02-0.38 ± 0.020.03 ± 0.00
LLaMa-2 13B-0.26 ± 0.01-0.38 ± 0.010.01 ± 0.00-0.17 ± 0.08-0.50 ± 0.020.01 ± 0.00
LLaMa-2 7B-0.28 ± 0.02-0.39 ± 0.010.01 ± 0.00-0.39 ± 0.05-0.50 ± 0.020.01 ± 0.00
Vietcuna 7B-0.21 ± 0.02-0.32 ± 0.020.04 ± 0.00-0.17 ± 0.04-0.39 ± 0.030.03 ± 0.00
GPT-3.5-0.22 ± 0.01-0.29 ± 0.010.04 ± 0.00-0.03 ± 0.02-0.28 ± 0.010.02 ± 0.00
GPT-4-0.19 ± 0.01-0.28 ± 0.010.06 ± 0.00-0.09 ± 0.02-0.28 ± 0.010.02 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + DRR↓ + DRG↓ + SAR↓ + SAG↓ + Tox↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.summarization %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/bias-toxicity/translation.md b/_pages/vi/bias-toxicity/translation.md index b1174ba..5700248 100644 --- a/_pages/vi/bias-toxicity/translation.md +++ b/_pages/vi/bias-toxicity/translation.md @@ -3,264 +3,94 @@ layout: default permalink: /leaderboard/vi/bias-toxicity/translation --- # Bias-Toxicity Translation Leaderboard +{% assign lang = 'vi' %} - - + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + + + + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsPhoMT (En - Vi)OPUS100 (En - Vi) + {{ dataset[0] }} +
DRR$→|DRG$→|SAR$→|SAG$→|ToxDRR$→|DRG$→|SAR$→|SAG$→|ToxDRR↓DRG↓SAR↓SAG↓Tox↓
URA-LLaMa 70B-0.03 ± 0.01-0.30 ± 0.010.05 ± 0.00-0.27 ± 0.01-0.47 ± 0.010.06 ± 0.00
URA-LLaMa 13B-0.09 ± 0.00-0.33 ± 0.010.05 ± 0.00-0.27 ± 0.01-0.43 ± 0.020.07 ± 0.00
URA-LLaMa 7B-0.13 ± 0.00-0.33 ± 0.010.05 ± 0.00-0.18 ± 0.03-0.47 ± 0.010.07 ± 0.00
LLaMa-2 13B-0.08 ± 0.00-0.33 ± 0.020.05 ± 0.00-0.31 ± 0.02-0.47 ± 0.010.06 ± 0.00
LLaMa-2 7B-0.17 ± 0.01-0.29 ± 0.010.04 ± 0.00-0.21 ± 0.02-0.45 ± 0.020.05 ± 0.00
Vietcuna 7B-0.18 ± 0.01-0.36 ± 0.010.04 ± 0.00-0.16 ± 0.03-0.43 ± 0.020.07 ± 0.00
GPT-3.5-0.11 ± 0.01-0.34 ± 0.010.05 ± 0.00-0.16 ± 0.03-0.43 ± 0.030.07 ± 0.00
GPT-4-0.09 ± 0.01-0.34 ± 0.010.05 ± 0.00-0.14 ± 0.03-0.41 ± 0.010.07 ± 0.00
---- -layout: default -permalink: /leaderboard/vi/bias-toxicity/translation ---- -# Bias-Toxicity Translation Leaderboard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].bias_toxicity.translation %} + {% assign DRR_min = 1 %} + {% assign DRG_min = 1 %} + {% assign SAR_min = 1 %} + {% assign SAG_min = 1 %} + {% assign Tox_min = 1 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].DRR and dataset[1][m].DRR < DRR_min %} + {% assign DRR_min = dataset[1][m].DRR %} + {% endif %} + {% if dataset[1][m].DRG and dataset[1][m].DRG < DRG_min %} + {% assign DRG_min = dataset[1][m].DRG %} + {% endif %} + {% if dataset[1][m].SAR and dataset[1][m].SAR < SAR_min %} + {% assign SAR_min = dataset[1][m].SAR %} + {% endif %} + {% if dataset[1][m].SAG and dataset[1][m].SAG < SAG_min %} + {% assign SAG_min = dataset[1][m].SAG %} + {% endif %} + {% if dataset[1][m].Tox and dataset[1][m].Tox < Tox_min %} + {% assign Tox_min = dataset[1][m].Tox %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + + {% endfor %} -
ModelsPhoMT (En $\to$ Vi)OPUS100 (En $\to$ Vi)
DRR$\to\mid$DRG$\to\mid$SAR$\to\mid$SAG$\to\mid$ToxDRR$\to\mid$DRG$\to\mid$SAR$\to\mid$SAG$\to\mid$Tox
URA-LLaMa 70B-0.03 ± 0.01-0.30 ± 0.010.05 ± 0.00-0.27 ± 0.01-0.47 ± 0.010.06 ± 0.00
URA-LLaMa 13B-0.09 ± 0.00-0.33 ± 0.010.05 ± 0.00-0.27 ± 0.01-0.43 ± 0.020.07 ± 0.00
URA-LLaMa 7B-0.13 ± 0.00-0.33 ± 0.010.05 ± 0.00-0.18 ± 0.03-0.47 ± 0.010.07 ± 0.00
LLaMa-2 13B-0.08 ± 0.00-0.33 ± 0.020.05 ± 0.00-0.31 ± 0.02-0.47 ± 0.010.06 ± 0.00
LLaMa-2 7B-0.17 ± 0.01-0.29 ± 0.010.04 ± 0.00-0.21 ± 0.02-0.45 ± 0.020.05 ± 0.00
Vietcuna 7B-0.18 ± 0.01-0.36 ± 0.010.04 ± 0.00-0.16 ± 0.03-0.43 ± 0.020.07 ± 0.00
GPT-3.5-0.11 ± 0.01-0.34 ± 0.010.05 ± 0.00-0.16 ± 0.03-0.43 ± 0.030.07 ± 0.00
GPT-4-0.09 ± 0.01-0.34 ± 0.010.05 ± 0.00-0.14 ± 0.03-0.41 ± 0.010.07 ± 0.00
+ {{ model }} + + {% if dataset[1][model].DRR %} + {{ dataset[1][model].DRR | round: 2 }} ± {{ dataset[1][model].DRR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].DRG %} + {{ dataset[1][model].DRG | round: 2 }} ± {{ dataset[1][model].DRG_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].SAR %} + {{ dataset[1][model].SAR | round: 2 }} ± {{ dataset[1][model].SAR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].SAG %} + {{ dataset[1][model].SAG | round: 2 }} ± {{ dataset[1][model].SAG_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].Tox %} + {{ dataset[1][model].Tox | round: 2 }} ± {{ dataset[1][model].Tox_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/chain-of-thought/reasoning.md b/_pages/vi/chain-of-thought/reasoning.md index 6d62bc7..5c07ce8 100644 --- a/_pages/vi/chain-of-thought/reasoning.md +++ b/_pages/vi/chain-of-thought/reasoning.md @@ -3,73 +3,72 @@ layout: default permalink: /leaderboard/vi/chain-of-thought/reasoning --- # Chain-Of-Thought Reasoning Leaderboard +{% assign lang = 'vi' %} - - + + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + + {% endfor %} - - - + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].chain_of_thought.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].Equ and dataset[1][m].Equ > Equ_best %} + {% assign Equ_best = dataset[1][m].Equ %} + {% endif %} + {% endfor %} + + + + {% endfor %} + {% endfor %} -
ModelsMetrics + Models + + {{ dataset[0] }} +
EM F1 Equ. EM↑F1↑Equ.↑
URA-LLaMa 70B0.00 ± 0.000.12 ± 0.010.18 ± 0.02
URA-LLaMa 13B0.00 ± 0.000.23 ± 0.010.17 ± 0.01
URA-LLaMa 7B0.00 ± 0.000.23 ± 0.010.09 ± 0.01
LLaMa-2 13B0.00 ± 0.000.12 ± 0.010.18 ± 0.02
LLaMa-2 7B0.00 ± 0.000.10 ± 0.000.12 ± 0.02
Vietcuna 7B0.00 ± 0.000.13 ± 0.010.10 ± 0.01
MixSUra 8x7B0.00 ± 0.000.17 ± 0.010.33 ± 0.00
GPT-3.50.00 ± 0.000.32 ± 0.010.78 ± 0.02
GPT-40.00 ± 0.000.32 ± 0.010.79 ± 0.02 + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].Equ %} + {{ dataset[1][model].Equ | round: 2 }} ± {{ dataset[1][model].Equ_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/fairness-aware/information-retrieval.md b/_pages/vi/fairness-aware/information-retrieval.md index 91ad6c4..6eb7f91 100644 --- a/_pages/vi/fairness-aware/information-retrieval.md +++ b/_pages/vi/fairness-aware/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/vi/fairness-aware/information-retrieval --- # Fairness-Aware Information Retrieval Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + + {% endfor %} - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + + + + {% endfor %} + {% endfor %} -
ModelsmMARCOmRobust04 + Models + + {{ dataset[0] }} +
M@10M@10BN@10N@10BM@10M@10BN@10N@10BM@10↑M@10B↑N@10↑N@10B↑
URA-LLaMa 70B
URA-LLaMa 13B
URA-LLaMa 7B0.10 ± 0.000.10 ± 0.000.14 ± 0.000.14 ± 0.000.01 ± 0.000.01 ± 0.000.00 ± 0.000.00 ± 0.00
LLaMa-2 13B
LLaMa-2 7B0.05 ± 0.000.10 ± 0.000.07 ± 0.000.16 ± 0.00----
Vietcuna 7B--------
GPT-3.5--------
GPT-4-------- + {{ model }} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/fairness-aware/language-modeling.md b/_pages/vi/fairness-aware/language-modeling.md index c8ceba9..97227d5 100644 --- a/_pages/vi/fairness-aware/language-modeling.md +++ b/_pages/vi/fairness-aware/language-modeling.md @@ -3,164 +3,108 @@ layout: default permalink: /leaderboard/vi/fairness-aware/language-modeling --- # Fairness-Aware Language Modeling Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + + {% endfor %} - - - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + + + + + + {% endfor %} + {% endfor %} -
ModelsMLQA-MLMVSEC + Models + + {{ dataset[0] }} +
EMCERWERCEDWEDPLXEMCERWERCEDWEDPLXEM↑CER↓WER↓CED↓WED↓PLX↓
URA-LLaMa 70B0.01 ± 0.000.58 ± 0.010.70 ± 0.01653.57 ± 12.05150.64 ± 2.731.25 ± 0.060.30 ± 0.000.11 ± 0.000.14 ± 0.0015.19 ± 0.424.12 ± 0.111.13 ± 0.00
URA-LLaMa 13B0.02 ± 0.000.40 ± 0.010.56 ± 0.01518.38 ± 11.19125.24 ± 2.661.48 ± 0.110.32 ± 0.000.07 ± 0.000.21 ± 0.002.98 ± 0.111.24 ± 0.031.15 ± 0.00
URA-LLaMa 7B0.01 ± 0.000.40 ± 0.010.55 ± 0.01492.93 ± 11.32117.82 ± 2.721.22 ± 0.010.20 ± 0.000.54 ± 0.010.67 ± 0.0141.77 ± 1.5710.12 ± 0.351.07 ± 0.00
LLaMa-2 13B0.01 ± 0.000.76 ± 0.000.89 ± 0.00782.03 ± 11.71192.66 ± 2.831.27 ± 0.040.15 ± 0.000.07 ± 0.000.22 ± 0.003.39 ± 0.161.52 ± 0.041.01 ± 0.00
LLaMa-2 7B0.00 ± 0.000.79 ± 0.000.96 ± 0.00761.38 ± 10.65197.18 ± 2.661.75 ± 0.200.12 ± 0.000.35 ± 0.010.48 ± 0.0147.54 ± 0.8511.82 ± 0.191.06 ± 0.00
Vietcuna 7B0.00 ± 0.001.04 ± 0.001.06 ± 0.00940.71 ± 12.48208.05 ± 2.811.40 ± 0.000.06 ± 0.004.78 ± 0.064.80 ± 0.06634.48 ± 8.58145.12 ± 1.941.46 ± 0.01
MixSUra 8x7B0.00 ± -0.56 ± -0.63 ± -535.76 ± -133.64 ± -1.00 ± -0.07 ± -0.20 ± -0.29 ± -25.96 ± -8.79 ± -1.00 ± -
GPT-3.50.03 ± 0.000.29 ± 0.010.46 ± 0.01398.19 ± 11.0196.42 ± 2.54-0.59 ± 0.000.06 ± 0.000.19 ± 0.001.99 ± 0.080.74 ± 0.02-
GPT-40.06 ± 0.000.36 ± 0.010.41 ± 0.01347.82 ± 10.2386.96 ± 2.41-0.67 ± 0.000.01 ± 0.000.02 ± 0.001.30 ± 0.040.54 ± 0.01- + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/fairness-aware/question-answering.md b/_pages/vi/fairness-aware/question-answering.md index f2ffc91..1c8836f 100644 --- a/_pages/vi/fairness-aware/question-answering.md +++ b/_pages/vi/fairness-aware/question-answering.md @@ -3,77 +3,60 @@ layout: default permalink: /leaderboard/vi/fairness-aware/question-answering --- # Fairness-Aware Question Answering Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsXQuADMLQA
Exact MatchF1Exact MatchF1
URA-LLaMa 70B0.04 ± 0.000.27 ± 0.000.03 ± 0.000.25 ± 0.00
URA-LLaMa 13B0.00 ± 0.000.13 ± 0.000.00 ± 0.000.14 ± 0.00
URA-LLaMa 7B0.00 ± 0.000.13 ± 0.000.00 ± 0.000.15 ± 0.01
LLaMa-2 13B0.00 ± 0.000.03 ± 0.000.00 ± 0.000.04 ± 0.00
LLaMa-2 7B0.00 ± 0.000.04 ± 0.000.00 ± 0.000.05 ± 0.00
Vietcuna 7B0.00 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.00
GPT-3.50.00 ± 0.000.24 ± 0.000.00 ± 0.000.23 ± 0.00
GPT-40.00 ± 0.000.26 ± 0.000.00 ± 0.000.24 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/fairness-aware/sentiment-analysis.md b/_pages/vi/fairness-aware/sentiment-analysis.md index f5a7b0b..017da03 100644 --- a/_pages/vi/fairness-aware/sentiment-analysis.md +++ b/_pages/vi/fairness-aware/sentiment-analysis.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/vi/fairness-aware/sentiment-analysis --- # Fairness-Aware Sentiment Analysis Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVLSP 2016UiT-VSFC
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.65 ± 0.010.49 ± 0.010.58 ± 0.010.13 ± 0.010.77 ± 0.040.76 ± 0.010.48 ± 0.010.61 ± 0.010.17 ± 0.010.66 ± 0.03
URA-LLaMa 13B0.59 ± 0.010.57 ± 0.010.62 ± 0.010.07 ± 0.010.83 ± 0.040.75 ± 0.010.46 ± 0.080.83 ± 0.010.11 ± 0.010.88 ± 0.02
URA-LLaMa 7B0.74 ± 0.020.39 ± 0.060.83 ± 0.010.21 ± 0.020.98 ± 0.020.73 ± 0.010.73 ± 0.010.78 ± 0.010.13 ± 0.010.94 ± 0.01
LLaMa-2 13B0.51 ± 0.010.10 ± 0.060.56 ± 0.010.32 ± 0.020.79 ± 0.040.63 ± 0.010.41 ± 0.020.70 ± 0.010.13 ± 0.010.89 ± 0.02
LLaMa-2 7B0.45 ± 0.020.34 ± 0.010.53 ± 0.010.26 ± 0.020.50 ± 0.00.51 ± 0.010.55 ± 0.010.68 ± 0.010.22 ± 0.010.64 ± 0.03
Vietcuna 7B0.04 ± 0.010.04 ± 0.010.49 ± 0.010.71 ± 0.010.05 ± 0.020.03 ± 0.000.03 ± 0.000.55 ± 0.010.50 ± 0.000.01 ± 0.01
MixSUra 8x7B0.62 ± -0.62 ± -0.59 ± -0.30 ± -0.59 ± -0.74 ± -0.46 ± -0.61 ± -0.24 ± -0.66 ± -
Gemini Pro0.67 ± -0.50 ± -- 0.34 ± -0.59 ± -0.79 ± -0.50 ± -- 0.46 ± -0.82 ± -
GPT-3.50.66 ± 0.010.60 ± 0.01- 0.33 ± 0.010.52 ± 0.050.86 ± 0.010.71 ± 0.01- 0.52 ± 0.010.86 ± 0.02
GPT-40.75 ± 0.010.74 ± 0.01- 0.41 ± 0.000.73 ± 0.040.85 ± 0.010.71 ± 0.01- 0.52 ± 0.010.87 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/fairness-aware/text-classification.md b/_pages/vi/fairness-aware/text-classification.md index ea021e3..1743b71 100644 --- a/_pages/vi/fairness-aware/text-classification.md +++ b/_pages/vi/fairness-aware/text-classification.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/vi/fairness-aware/text-classification --- # Fairness-Aware Text Classification Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].fairness_aware.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign 
F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsUiT-VSMECPhoATIS + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
URA-LLaMa 70B0.24 ± 0.020.14 ± 0.010.58 ± 0.010.26 ± 0.020.37 ± 0.060.15 ± 0.010.22 ± 0.030.31 ± 0.000.81 ± 0.010.13 ± 0.04
URA-LLaMa 13B0.31 ± 0.020.11 ± 0.010.58 ± 0.010.23 ± 0.020.57 ± 0.060.01 ± 0.010.05 ± 0.020.58 ± 0.000.84 ± 0.010.00 ± 0.01
URA-LLaMa 7B0.29 ± 0.020.11 ± 0.010.60 ± 0.010.12 ± 0.020.41 ± 0.060.00 ± 0.010.00 ± 0.000.55 ± 0.000.30 ± 0.010.01 ± 0.03
LLaMa-2 13B0.18 ± 0.020.08 ± 0.010.55 ± 0.010.45 ± 0.010.44 ± 0.060.02 ± 0.010.01 ± 0.020.57 ± 0.010.90 ± 0.010.01 ± 0.01
LLaMa-2 7B0.25 ± 0.020.11 ± 0.010.57 ± 0.010.22 ± 0.020.53 ± 0.060.02 ± 0.000.06 ± 0.010.57 ± 0.010.68 ± 0.010.01 ± 0.01
Vietcuna 7B0.15 ± 0.010.05 ± 0.010.46 ± 0.010.85 ± 0.010.16 ± 0.040.04 ± 0.010.01 ± 0.000.77 ± 0.010.21 ± 0.010.07 ± 0.03
MixSUra 8x7B0.40 ± -0.36 ± -0.72 ± -0.53 ± -0.79 ± -0.81 ± -0.58 ± -0.96 ± -0.14 ± -0.91 ± -
Gemini Pro0.48 ± -0.38 ± --0.34 ± -0.43 ± -0.79 ± -0.67 ± --0.73 ± -0.68 ± -
GPT-3.50.44 ± 0.020.42 ± 0.02-0.30 ± 0.020.36 ± 0.060.68 ± 0.020.66 ± 0.03-0.62 ± 0.020.67 ± 0.05
GPT-40.49 ± 0.020.47 ± 0.02-0.35 ± 0.020.36 ± 0.060.83 ± 0.010.76 ± 0.03-0.77 ± 0.010.87 ± 0.04 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/fairness-aware/toxicity-detection.md b/_pages/vi/fairness-aware/toxicity-detection.md index 8b89bfd..7a1706f 100644 --- a/_pages/vi/fairness-aware/toxicity-detection.md +++ b/_pages/vi/fairness-aware/toxicity-detection.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/vi/fairness-aware/toxicity-detection --- # Fairness-Aware Toxicity Detection Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-ViCTSDUiT-ViHSD
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.41 ± 0.020.26 ± 0.010.75 ± 0.010.53 ± 0.010.33 ± 0.050.15 ± 0.000.40 ± 0.000.64 ± 0.010.58 ± 0.000.24 ± 0.02
URA-LLaMa 13B0.43 ± 0.020.29 ± 0.070.66 ± 0.010.36 ± 0.020.42 ± 0.050.24 ± 0.010.15 ± 0.000.61 ± 0.010.43 ± 0.010.21 ± 0.02
URA-LLaMa 7B0.42 ± 0.020.39 ± 0.010.60 ± 0.010.30 ± 0.010.66 ± 0.050.16 ± 0.000.10 ± 0.000.67 ± 0.010.33 ± 0.000.28 ± 0.02
LLaMa-2 13B0.27 ± 0.010.18 ± 0.010.67 ± 0.010.53 ± 0.010.57 ± 0.050.16 ± 0.000.10 ± 0.000.62 ± 0.010.59 ± 0.000.42 ± 0.02
LLaMa-2 7B0.15 ± 0.010.11 ± 0.010.62 ± 0.010.67 ± 0.010.07 ± 0.030.01 ± 0.000.01 ± 0.000.56 ± 0.010.71 ± 0.000.01 ± 0.00
Vietcuna 7B0.08 ± 0.010.09 ± 0.010.50 ± 0.010.42 ± 0.010.06 ± 0.030.62 ± 0.010.21 ± 0.000.50 ± 0.000.29 ± 0.010.62 ± 0.02
MixSUra 8x7B0.69 ± -0.38 ± -- ± -0.29 ± -0.78 ± -0.56 ± -0.31 ± -0.68 ± -0.32 ± -0.92 ± -
Gemini Pro0.81 ± -0.43 ± -- ± -0.31 ± -0.82 ± -0.70 ± -0.37 ± -- ± -0.36 ± -0.69 ± -
GPT-3.50.60 ± 0.020.52 ± 0.02- ± -0.11 ± 0.020.63 ± 0.050.61 ± 0.010.46 ± 0.01- ± -0.29 ± 0.010.62 ± 0.02
GPT-40.87 ± 0.010.69 ± 0.02- ± -0.37 ± 0.010.86 ± 0.030.76 ± 0.010.56 ± 0.01- ± -0.43 ± 0.010.76 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].fairness_aware.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/information-retrieval.md b/_pages/vi/few-shot/information-retrieval.md index 25c1d2c..f82fbbd 100644 --- a/_pages/vi/few-shot/information-retrieval.md +++ b/_pages/vi/few-shot/information-retrieval.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/vi/few-shot/information-retrieval --- # Few-Shot Information Retrieval Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsmMARCOmRobust04
M@10M@10BN@10N@10BM@10M@10BN@10N@10B
URA-LLaMa 70B0.05 ± 0.000.11 ± 0.000.06 ± 0.000.14 ± 0.000.04±0.000.04±0.000.03±0.000.04±0.00
URA-LLaMa 13B0.04 ± 0.000.10 ± 0.000.06 ± 0.000.14 ± 0.000.03±0.000.05±0.000.04±0.000.04±0.00
URA-LLaMa 7B0.04 ± 0.000.11 ± 0.000.06 ± 0.000.16 ± 0.000.03 ± 0.000.03 ± 0.000.02 ± 0.000.02 ± 0.00
LLaMa-2 13B0.07 ± 0.000.15 ± 0.000.09 ± 0.000.21 ± 0.000.05±0.000.04±0.000.04±0.000.04±0.00
LLaMa-2 7B0.05 ± 0.000.11 ± 0.000.07 ± 0.000.16 ± 0.000.02±0.000.03±0.000.03±0.000.02±0.00
Vietcuna 7B0.00 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.000.00±0.000.00±0.000.00±0.000.00±0.00
MixSUra 8x7B0.01 ± -0.07 ± -0.04 ± -0.11 ± -0.04±-0.04±-0.02±-0.02±-
GPT-3.5--------
GPT-4--------
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/knowledge.md b/_pages/vi/few-shot/knowledge.md index db04348..2c1b6e9 100644 --- a/_pages/vi/few-shot/knowledge.md +++ b/_pages/vi/few-shot/knowledge.md @@ -2,115 +2,129 @@ layout: default permalink: /leaderboard/vi/few-shot/knowledge --- -# Few-Shot Knowledge Leaderboard +# Few-shot Knowledge Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsZaloE2EViMMRC
EMF1ACF1ARECEA@10
URA-LLaMa 70B0.34 ± 0.020.50 ± 0.020.78 ± 0.020.63 ± 0.030.90 ± 0.010.13 ± 0.020.96 ± 0.03
URA-LLaMa 13B0.26 ± 0.020.40 ± 0.020.62 ± 0.020.50 ± 0.020.69 ± 0.020.18 ± 0.020.65 ± 0.07
URA-LLaMa 7B0.14 ± 0.020.25 ± 0.020.42 ± 0.020.33 ± 0.020.61 ± 0.020.13 ± 0.020.39 ± 0.07
LLaMa-2 13B0.22 ± 0.020.36 ± 0.020.58 ± 0.020.46 ± 0.020.62 ± 0.020.28 ± 0.020.77 ± 0.06
LLaMa-2 7B0.07 ± 0.010.15 ± 0.010.30 ± 0.020.23 ± 0.020.56 ± 0.020.43 ± 0.020.16 ± 0.05
Vietcuna 7B0.07 ± 0.010.19 ± 0.010.31 ± 0.020.18 ± 0.010.50 ± 0.000.06 ± 0.020.31 ± 0.06
MixSUra 8x7B0.19 ± -0.34 ± -0.65 ± -0.64 ± -0.54 ± -0.29 ± -0.65 ± -
GPT-3.50.49 ± 0.020.64 ± 0.020.90 ± 0.010.73 ± 0.03-0.66 ± 0.010.91 ± 0.04
GPT-40.49 ± 0.020.64 ± 0.020.91 ± 0.010.73 ± 0.04-0.66 ± 0.010.91 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/language-modeling.md b/_pages/vi/few-shot/language-modeling.md index 85c1105..f2b6953 100644 --- a/_pages/vi/few-shot/language-modeling.md +++ b/_pages/vi/few-shot/language-modeling.md @@ -3,164 +3,108 @@ layout: default permalink: /leaderboard/vi/few-shot/language-modeling --- # Few-Shot Language Modeling Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + + {% endfor %} - - - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + + + + + + {% endfor %} + {% endfor %} -
ModelsMLQA-MLMVSEC + Models + + {{ dataset[0] }} +
EMCERWERCEDWEDPLXEMCERWERCEDWEDPLXEM↑CER↓WER↓CED↓WED↓PLX↓
URA-LLaMa 70B0.01 ± 0.000.54 ± 0.000.66 ± 0.00669.74 ± 10.38153.04 ± 2.331.32 ± 0.050.33 ± 0.000.11 ± 0.000.13 ± 0.0015.09 ± 0.424.05 ± 0.111.13 ± 0.00
URA-LLaMa 13B0.01 ± 0.000.45 ± 0.010.61 ± 0.01559.64 ± 11.23136.97 ± 2.681.49 ± 0.100.35 ± 0.000.02 ± 0.000.04 ± 0.002.81 ± 0.121.18 ± 0.031.15 ± 0.00
URA-LLaMa 7B0.01 ± 0.000.40 ± 0.010.55 ± 0.01498.36 ± 11.01118.11 ± 2.581.24 ± 0.010.22 ± 0.000.32 ± 0.010.33 ± 0.0141.89 ± 1.5410.10 ± 0.341.07 ± 0.00
LLaMa-2 13B0.01 ± 0.000.74 ± 0.000.87 ± 0.00760.98 ± 11.91186.90 ± 2.851.24 ± 0.030.16 ± 0.000.03 ± 0.000.05 ± 0.003.38 ± 0.161.51 ± 0.041.01 ± 0.00
LLaMa-2 7B0.00 ± 0.000.81 ± 0.000.98 ± 0.00769.36 ± 10.51198.53 ± 2.571.74 ± 0.190.12 ± 0.000.36 ± 0.010.39 ± 0.0147.50 ± 0.8611.80 ± 0.191.06 ± 0.00
Vietcuna 7B0.00 ± 0.001.04 ± 0.001.06 ± 0.00935.65 ± 12.47204.98 ± 2.791.40 ± 0.000.00 ± 0.008.00 ± 0.078.01 ± 0.071063.93 ± 7.64241.74 ± 1.741.46 ± 0.00
MixSUra 8x7B0.00 ± -0.55 ± -0.63 ± -526.79 ± -131.02 ± -1.00 ± -0.08 ± -0.19 ± -0.28 ± -25.13 ± -8.58 ± -1.00 ± -
GPT-3.50.04 ± 0.000.28 ± 0.010.44 ± 0.01387.37 ± 10.8692.78 ± 2.46-0.66 ± 0.000.01 ± 0.000.02 ± 0.001.63 ± 0.080.61 ± 0.02-
GPT-40.08 ± 0.000.23 ± 0.010.40 ± 0.01336.53 ± 10.1883.55 ± 2.34-0.75 ± 0.000.01 ± 0.000.01 ± 0.000.89 ± 0.040.37 ± 0.01- + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/few-shot/reasoning.md b/_pages/vi/few-shot/reasoning.md index 47b2923..f5428de 100644 --- a/_pages/vi/few-shot/reasoning.md +++ b/_pages/vi/few-shot/reasoning.md @@ -3,135 +3,72 @@ layout: default permalink: /leaderboard/vi/few-shot/reasoning --- # Few-Shot Reasoning Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsSR - NaturalSR - Abstract symbolMATH
EMF1Equ.EMF1Equ.EMF1Equ.
URA-LLaMa 70B0.14 ± 0.000.48 ± 0.000.15 ± 0.000.27 ± 0.000.85 ± 0.000.30 ± 0.000.00 ± 0.000.00 ± 0.000.12 ± 0.02
URA-LLaMa 13B0.08 ± 0.000.42 ± 0.000.08 ± 0.000.20 ± 0.000.70 ± 0.000.17 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.01
URA-LLaMa 7B0.04 ± 0.000.38 ± 0.000.04 ± 0.000.11 ± 0.000.61 ± 0.000.10 ± 0.000.00 ± 0.000.00 ± 0.000.07 ± 0.01
LLaMa-2 13B0.03 ± 0.000.24 ± 0.000.04 ± 0.000.19 ± 0.000.69 ± 0.000.18 ± 0.000.00 ± 0.000.00 ± 0.000.16 ± 0.02
LLaMa-2 7B0.00 ± 0.000.01 ± 0.000.00 ± 0.000.06 ± 0.000.44 ± 0.000.06 ± 0.000.00 ± 0.000.00 ± 0.000.11 ± 0.01
Vietcuna 7B0.00 ± 0.000.00 ± 0.000.00 ± 0.000.14 ± 0.000.71 ± 0.000.10 ± 0.000.00 ± 0.000.00 ± 0.000.01 ± 0.00
MixSUra 8x7B0.07 ± 0.000.41 ± 0.000.07 ± 0.000.22 ± 0.000.78 ± 0.000.23 ± 0.000.00 ± 0.000.00 ± 0.000.00 ± 0.00
GPT-3.50.15 ± 0.000.50 ± 0.000.16 ± 0.000.26 ± 0.000.83 ± 0.000.29 ± 0.000.00 ± 0.000.00 ± 0.000.62 ± 0.02
GPT-40.37 ± 0.000.74 ± 0.000.42 ± 0.000.37 ± 0.000.87 ± 0.000.44 ± 0.000.00 ± 0.000.01 ± 0.000.65 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + EM↑ + F1↑ + Equ↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m]["Equ"] and dataset[1][m]["Equ"] > Equ_best %} + {% assign Equ_best = dataset[1][m]["Equ"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["Equ"] %} + {{ dataset[1][model]["Equ"] | round: 2 }} ± {{ dataset[1][model]["Equ_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/sentiment-analysis.md b/_pages/vi/few-shot/sentiment-analysis.md index 7c77504..0608261 100644 --- a/_pages/vi/few-shot/sentiment-analysis.md +++ b/_pages/vi/few-shot/sentiment-analysis.md @@ -1,146 +1,98 @@ --- layout: default -permalink: /leaderboard/vi/few-shot/sentiment-analysis +permalink: /leaderboard/vi/few-shot/sentiment-analysis --- # Few-Shot Sentiment Analysis Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVLSP 2016UiT-VSFC
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.66 ± 0.010.49 ± 0.010.72 ± 0.010.13 ± 0.010.77 ± 0.040.75 ± 0.010.48 ± 0.010.81 ± 0.010.16 ± 0.010.71 ± 0.02
URA-LLaMa 13B0.59 ± 0.010.57 ± 0.010.67 ± 0.010.09 ± 0.010.82 ± 0.040.74 ± 0.010.52 ± 0.080.83 ± 0.010.10 ± 0.010.87 ± 0.02
URA-LLaMa 7B0.57 ± 0.020.42 ± 0.050.69 ± 0.020.07 ± 0.020.77 ± 0.040.72 ± 0.010.43 ± 0.010.78 ± 0.010.13 ± 0.010.95 ± 0.03
LLaMa-2 13B0.51 ± 0.010.41 ± 0.060.66 ± 0.010.32 ± 0.020.80 ± 0.040.63 ± 0.010.46 ± 0.070.71 ± 0.010.13 ± 0.010.88 ± 0.02
LLaMa-2 7B0.45 ± 0.010.32 ± 0.010.59 ± 0.010.26 ± 0.020.50 ± 0.050.50 ± 0.010.34 ± 0.010.69 ± 0.010.23 ± 0.010.62 ± 0.03
Vietcuna 7B0.04 ± 0.010.05 ± 0.010.45 ± 0.010.71 ± 0.010.05 ± 0.020.03 ± 0.000.03 ± 0.000.53 ± 0.010.50 ± 0.000.01 ± 0.00
MixSUra 8x7B0.62 ± -0.63 ± -0.59 ± -0.30 ± -0.59 ± -0.74 ± -0.46 ± -0.63 ± -0.23 ± -0.655 ± -
GPT-3.50.65 ± 0.010.59 ± 0.10-0.32 ± 0.010.65 ± 0.050.86 ± 0.010.73 ± 0.01-0.52 ± 0.010.86 ± 0.02
GPT-40.75 ± 0.010.74 ± 0.01-0.41 ± 0.010.74 ± 0.040.85 ± 0.010.59 ± 0.09-0.52 ± 0.010.85 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/text-classification.md b/_pages/vi/few-shot/text-classification.md index 6c3b0d7..10d2f30 100644 --- a/_pages/vi/few-shot/text-classification.md +++ b/_pages/vi/few-shot/text-classification.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/vi/few-shot/text-classification --- # Few-Shot Text Classification Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-VSMECPhoATIS
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.25 ± 0.020.15 ± 0.010.56 ± 0.010.25 ± 0.020.37 ± 0.060.15 ± 0.010.22 ± 0.030.83 ± 0.000.81 ± 0.010.13 ± 0.04
URA-LLaMa 13B0.32 ± 0.020.12 ± 0.010.58 ± 0.010.22 ± 0.020.57 ± 0.070.01 ± 0.010.06 ± 0.020.47 ± 0.000.84 ± 0.010.00 ± 0.01
URA-LLaMa 7B0.29 ± 0.020.11 ± 0.010.60 ± 0.010.12 ± 0.020.43 ± 0.060.06 ± 0.010.01 ± 0.000.55 ± 0.000.24 ± 0.010.08 ± 0.03
LLaMa-2 13B0.18 ± 0.020.08 ± 0.010.55 ± 0.010.45 ± 0.010.49 ± 0.070.02 ± 0.010.06 ± 0.020.57 ± 0.010.90 ± 0.010.01 ± 0.01
LLaMa-2 7B0.25 ± 0.020.12 ± 0.010.57 ± 0.010.21 ± 0.020.54 ± 0.060.03 ± 0.010.02 ± 0.010.56 ± 0.010.54 ± 0.010.01 ± 0.01
Vietcuna 7B0.15 ± 0.010.05 ± 0.010.46 ± 0.010.85 ± 0.010.15 ± 0.040.04 ± 0.010.01 ± 0.000.63 ± 0.000.21 ± 0.010.07 ± 0.03
MixSUra 8x7B0.40 ± -0.36 ± -0.72 ± -0.53 ± -0.79 ± -0.81 ± -0.58 ± -0.96 ± -0.14 ± -0.91 ± -
GPT-3.50.42 ± 0.020.40 ± 0.02-0.28 ± 0.020.42 ± 0.060.69 ± 0.020.67 ± 0.03-0.63 ± 0.020.69 ± 0.05
GPT-40.49 ± 0.020.48 ± 0.02-0.35 ± 0.020.49 ± 0.060.85 ± 0.010.78 ± 0.03-0.79 ± 0.010.88 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/toxicity-detection.md b/_pages/vi/few-shot/toxicity-detection.md index d752c6a..2e3dcf8 100644 --- a/_pages/vi/few-shot/toxicity-detection.md +++ b/_pages/vi/few-shot/toxicity-detection.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/vi/few-shot/toxicity-detection --- # Few-Shot Toxicity Detection Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-ViCTSDUiT-ViHSD
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.44 ± 0.010.27 ± 0.010.75 ± 0.010.52 ± 0.010.37 ± 0.020.17 ± 0.000.15 ± 0.000.64 ± 0.010.57 ± 0.000.27 ± 0.02
URA-LLaMa 13B0.44 ± 0.010.30 ± 0.050.67 ± 0.010.33 ± 0.010.41 ± 0.030.26 ± 0.010.16 ± 0.000.61 ± 0.010.42 ± 0.010.21 ± 0.02
URA-LLaMa 7B0.43 ± 0.010.40 ± 0.010.60 ± 0.010.29 ± 0.010.71 ± 0.020.16 ± 0.000.10 ± 0.000.67 ± 0.010.32 ± 0.000.28 ± 0.02
LLaMa-2 13B0.28 ± 0.010.19 ± 0.000.67 ± 0.010.52 ± 0.010.63 ± 0.030.17 ± 0.000.11 ± 0.000.62 ± 0.010.58 ± 0.000.44 ± 0.02
LLaMa-2 7B0.16 ± 0.010.12 ± 0.010.61 ± 0.010.66 ± 0.010.08 ± 0.020.01 ± 0.000.01 ± 0.000.56 ± 0.010.71 ± 0.000.01 ± 0.02
Vietcuna 7B0.08 ± 0.000.10 ± 0.010.50 ± 0.000.42 ± 0.000.08 ± 0.030.61 ± 0.010.21 ± 0.000.50 ± 0.000.28 ± 0.010.61 ± 0.02
MixSUra 8x7B0.70 ± -0.39 ± -- ± -0.29 ± -0.80 ± -0.58 ± -0.31 ± -0.68 ± -0.30 ± -0.93 ± -
GPT-3.50.63 ± 0.020.54 ± 0.02- 0.13 ± 0.020.63 ± 0.050.63 ± 0.010.47 ± 0.01- 0.29 ± 0.010.63 ± 0.02
GPT-40.89 ± 0.000.71 ± 0.01- 0.39 ± 0.000.89 ± 0.030.77 ± 0.010.57 ± 0.01- 0.44 ± 0.010.77 ± 0.02
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].few_shot.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/few-shot/translation.md b/_pages/vi/few-shot/translation.md index ef13374..d2c3841 100644 --- a/_pages/vi/few-shot/translation.md +++ b/_pages/vi/few-shot/translation.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/vi/few-shot/translation --- # Few-Shot Translation Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + + {% endfor %} - - - - - - - - + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].few_shot.translation %} + {% assign bleu_envi_best = 0 %} + {% assign bleu_vien_best = 0 %} + {% assign hlepor_envi_best = 0 %} + {% assign hlepor_vien_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["BLEU envi"] and dataset[1][m]["BLEU envi"] > bleu_envi_best %} + {% assign bleu_envi_best = dataset[1][m]["BLEU envi"] %} + {% endif %} + {% if dataset[1][m]["BLEU vien"] and dataset[1][m]["BLEU vien"] > bleu_vien_best %} + {% assign bleu_vien_best = dataset[1][m]["BLEU vien"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR envi"] and dataset[1][m]["hLEPOR envi"] > 
hlepor_envi_best %} + {% assign hlepor_envi_best = dataset[1][m]["hLEPOR envi"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR vien"] and dataset[1][m]["hLEPOR vien"] > hlepor_vien_best %} + {% assign hlepor_vien_best = dataset[1][m]["hLEPOR vien"] %} + {% endif %} + {% endfor %} + + + + + {% endfor %} + {% endfor %} -
ModelsPhoMTOPUS100 + Models + + {{ dataset[0] }} +
(En -> Vi)(Vi -> En)(En -> Vi)(Vi -> En)(En -> Vi)(Vi -> En)(En -> Vi)(Vi -> En)BLEU envi↑BLEU vien↑hLEPOR envi↑hLEPOR vien↑
URA-LLaMa 70B0.28 ± 0.000.59 ± 0.000.27 ± 0.000.58 ± 0.000.10 ± 0.000.44 ± 0.010.14 ± 0.000.41 ± 0.01
URA-LLaMa 13B0.25 ± 0.000.55 ± 0.000.15 ± 0.000.56 ± 0.000.10 ± 0.010.41 ± 0.010.17 ± 0.010.43 ± 0.01
URA-LLaMa 7B0.19 ± 0.000.50 ± 0.000.22 ± 0.000.54 ± 0.000.08 ± 0.000.38 ± 0.010.14 ± 0.010.39 ± 0.01
LLaMa-2 13B0.23 ± 0.000.53 ± 0.000.23 ± 0.000.54 ± 0.000.09 ± 0.000.39 ± 0.010.14 ± 0.010.40 ± 0.01
LLaMa-2 7B0.18 ± 0.000.47 ± 0.000.21 ± 0.000.52 ± 0.000.07 ± 0.000.34 ± 0.000.11 ± 0.010.36 ± 0.01
Vietcuna 7B0.15 ± 0.000.35 ± 0.000.03 ± 0.000.11 ± 0.000.00 ± 0.000.00 ± 0.000.05 ± 0.000.16 ± 0.00
MixSUra 8x7B0.15 ± -0.51 ± -0.16 ± -0.52 ± -0.07 ± -0.37 ± -0.09 ± -0.36 ± -
GPT-3.50.33 ± 0.000.65 ± 0.000.33 ± 0.000.63 ± 0.000.16 ± 0.010.50 ± 0.010.24 ± 0.010.51 ± 0.00
GPT-40.33 ± 0.000.66 ± 0.000.34 ± 0.000.65 ± 0.000.17 ± 0.010.51 ± 0.010.25 ± 0.010.53 ± 0.00 + {{ model }} + + {% if dataset[1][model]["BLEU envi"] %} + {{ dataset[1][model]["BLEU envi"] | round: 2 }} ± {{ dataset[1][model]["BLEU envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["BLEU vien"] %} + {{ dataset[1][model]["BLEU vien"] | round: 2 }} ± {{ dataset[1][model]["BLEU vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["hLEPOR envi"] %} + {{ dataset[1][model]["hLEPOR envi"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["hLEPOR vien"] %} + {{ dataset[1][model]["hLEPOR vien"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR vien_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/medium-prompt/question-answering.md b/_pages/vi/medium-prompt/question-answering.md index 13fc8a8..5e06833 100644 --- a/_pages/vi/medium-prompt/question-answering.md +++ b/_pages/vi/medium-prompt/question-answering.md @@ -3,63 +3,60 @@ layout: default permalink: /leaderboard/vi/medium-prompt/question-answering --- # Medium-Prompt Question Answering Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + + {% endfor %} - - - - + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].medium_prompt.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + + {% endfor %} + + {% endfor %} -
ModelsXQuADMLQA + Models + + {{ dataset[0] }} +
EMF1EMF1EM↑F1↑
URA-LLaMa 70B0.08 ± 0.000.33 ± 0.000.07 ± 0.000.31 ± 0.00
URA-LLaMa 13B0.04 ± 0.000.21 ± 0.000.04 ± 0.000.19 ± 0.00
URA-LLaMa 7B0.01 ± 0.000.11 ± 0.000.01 ± 0.000.11 ± 0.00
LLaMa-2 13B0.00 ± 0.000.10 ± 0.000.00 ± 0.000.09 ± 0.00
LLaMa-2 7B0.00 ± 0.000.03 ± 0.000.00 ± 0.000.03 ± 0.00
MixSUra 8x7B0.01 ± -0.25 ± -0.00 ± -0.25 ± -
+ {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/medium-prompt/summarization.md b/_pages/vi/medium-prompt/summarization.md index 739433e..df2a021 100644 --- a/_pages/vi/medium-prompt/summarization.md +++ b/_pages/vi/medium-prompt/summarization.md @@ -3,147 +3,132 @@ layout: default permalink: /leaderboard/vi/medium-prompt/summarization --- # Medium-Prompt Summarization Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVietNewsWikiLingua
R1R2RLSCBSCvDeCpR1R2RLSCBSCvDeCp
URA-LLaMa 70B0.35 ± 0.000.16 ± 0.000.24 ± 0.00-0.11 ± 0.000.12 ± 0.000.63 ± 0.005.43 ± 0.0237.78 ± 0.470.33 ± 0.000.14 ± 0.000.22 ± 0.00-0.16± 0.000.24± 0.100.59 ± 0.014.62 ± 0.1156.56 ± 1.70
URA-LLaMa 13B0.26 ± 0.000.12 ± 0.000.17 ± 0.00-0.09 ± 0.00-0.08 ± 0.180.46 ± 0.003.55 ± 0.0447.75 ± 0.650.14 ± 0.000.05 ± 0.000.09 ± 0.00-0.16 ± 0.00-0.14 ± 0.120.26 ± 0.011.83 ± 0.0660.10 ± 2.16
URA-LLaMa 7B0.41 ± 0.000.18 ± 0.000.27 ± 0.00-0.09 ± 0.00-0.08 ± 0.130.83 ± 0.008.13 ± 0.048.08 ± 0.170.42 ± 0.000.17 ± 0.000.27 ± 0.00-0.16 ± 0.000.27 ± 0.210.84 ± 0.007.15 ± 0.088.08 ± 0.36
LLaMa-2 13B0.02 ± 0.000.00 ± 0.000.02 ± 0.00-0.09 ± 0.00-0.19 ± 0.050.01 ± 0.000.01 ± 0.0054.67 ± 0.160.03 ± 0.000.00 ± 0.000.03 ± 0.00-0.16 ± 0.00-0.05 ± 0.030.02 ± 0.000.02 ± 0.0042.55 ± 0.81
LLaMa-2 7B0.03 ± 0.000.01 ± 0.000.03 ± 0.00-0.09 ± 0.00-0.17 ± 0.030.04 ± 0.000.07 ± 0.0023.86 ± 0.260.02 ± 0.000.00 ± 0.000.02 ± 0.00-0.16 ± 0.00-0.04 ± 0.060.02 ± 0.000.03 ± 0.0040.31 ± 0.88
MixSUra 8x7B0.06 ± -0.01 ± -0.04 ± -- ± --0.13 ± -0.10 ± -0.17 ± -9.03 ± -0.03 ± -0.00 ± -0.03 ± -- ± --0.01 ± -0.17 ± -0.26 ± -16.68 ± -
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].medium_prompt.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/randomized-choice/knowledge.md b/_pages/vi/randomized-choice/knowledge.md index f7b9784..03a1c2d 100644 --- a/_pages/vi/randomized-choice/knowledge.md +++ b/_pages/vi/randomized-choice/knowledge.md @@ -4,90 +4,96 @@ permalink: /leaderboard/vi/randomized-choice/knowledge --- # Randomized-Choice Knowledge Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + + {% endfor %} - - - - - - - - - - - - - - + {% for dataset in 
site.data.leaderboard[lang].randomized_choice.knowledge %} + + + + + + {% endfor %} + + + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - + + {% for dataset in site.data.leaderboard[lang].randomized_choice.knowledge %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsAC F1 AR ECE A@10
Our 70B0.76 ± 0.020.76 ± 0.020.78 ± 0.010.14 ± 0.020.94 ± 0.04
Our 13B0.62 ± 0.020.62 ± 0.020.61 ± 0.020.15 ± 0.020.67 ± 0.07
Our 7B0.45 ± 0.020.36 ± 0.020.57 ± 0.020.10 ± 0.020.45 ± 0.07
LLaMa-2 13B0.57 ± 0.020.57 ± 0.020.57 ± 0.020.29 ± 0.020.75 ± 0.07
LLaMa-2 7B0.36 ± 0.020.27 ± 0.020.56 ± 0.020.37 ± 0.020.44 ± 0.07
Vietcuna 7B0.26 ± 0.020.15 ± 0.010.50 ± 0.000.01 ± 0.010.26 ± 0.06 + Models + + {{ dataset[0] }} +
MixSUra 8x7B0.61 ± -0.61 ± -0.54 ± -0.31 ± -0.65 ± -
GPT-3.50.92 ± 0.010.74 ± 0.04-0.67 ± 0.010.92 ± 0.04AC↑F1↑AR↑ECE↓A@10↑
GPT-40.92 ± 0.010.74 ± 0.04-0.67 ± 0.010.92 ± 0.04 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/robustness-aware/information-retrieval.md b/_pages/vi/robustness-aware/information-retrieval.md index 6ce9fa1..2b6b42d 100644 --- a/_pages/vi/robustness-aware/information-retrieval.md +++ b/_pages/vi/robustness-aware/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/vi/robustness-aware/information-retrieval --- # Robustness-Aware Information Retrieval Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsmMARCOmRobust04
M@10M@10BN@10N@10BM@10M@10BN@10N@10B
URA-LLaMa 70B
URA-LLaMa 13B
URA-LLaMa 7B0.05 ± 0.000.11 ± 0.000.07 ± 0.000.17 ± 0.00----
LLaMa-2 13B0.06 ± 0.000.13 ± 0.000.19 ± 0.000.19 ± 0.00
LLaMa-2 7B0.05 ± 0.000.11 ± 0.000.08 ± 0.000.16 ± 0.00----
Vietcuna 7B--------
GPT-3.5--------
GPT-4--------
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/robustness-aware/knowledge.md b/_pages/vi/robustness-aware/knowledge.md index d4731c4..ecc0ef4 100644 --- a/_pages/vi/robustness-aware/knowledge.md +++ b/_pages/vi/robustness-aware/knowledge.md @@ -3,114 +3,128 @@ layout: default permalink: /leaderboard/vi/robustness-aware/knowledge --- # Robustness-Aware Knowledge Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsZaloE2EViMMRC
EMF1ACF1ARECEA@10
URA-LLaMa 70B0.23 ± 0.000.37 ± 0.000.65 ± 0.000.53 ± 0.000.84 ± 0.000.11 ± 0.000.77 ± 0.00
URA-LLaMa 13B0.18 ± 0.000.30 ± 0.000.41 ± 0.000.34 ± 0.000.61 ± 0.000.22 ± 0.000.58 ± 0.00
URA-LLaMa 7B0.10 ± 0.000.18 ± 0.000.33 ± 0.020.28 ± 0.020.61 ± 0.010.19 ± 0.020.33 ± 0.06
LLaMa-2 13B0.13 ± 0.000.21 ± 0.000.39 ± 0.000.31 ± 0.000.56 ± 0.000.46 ± 0.000.33 ± 0.00
LLaMa-2 7B0.02 ± 0.000.05 ± 0.000.26 ± 0.010.20 ± 0.010.51 ± 0.010.46 ± 0.010.13 ± 0.03
Vietcuna 7B0.05 ± 0.000.15 ± 0.000.26 ± 0.010.14 ± 0.000.50 ± 0.000.01 ± 0.010.21 ± 0.07
MixSUra 8x7B0.13 ± -0.24 ± -0.57 ± -0.45 ± -0.53 ± -0.35 ± -0.58 ± -
GPT-3.50.45 ± 0.010.61 ± 0.010.90 ± 0.010.72 ± 0.04-0.65 ± 0.010.88 ± 0.07
GPT-40.44 ± 0.010.61 ± 0.010.91 ± 0.010.73 ± 0.07-0.66 ± 0.070.88 ± 0.04
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/robustness-aware/question-answering.md b/_pages/vi/robustness-aware/question-answering.md index 13f4b1d..5d07725 100644 --- a/_pages/vi/robustness-aware/question-answering.md +++ b/_pages/vi/robustness-aware/question-answering.md @@ -3,84 +3,60 @@ layout: default permalink: /leaderboard/vi/robustness-aware/question-answering --- # Robustness-Aware Question Answering Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + + {% endfor %} - - - - + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + + + {% endfor %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + {% for model in 
site.data.leaderboard[lang].models.models %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + + {% endfor %} + + {% endfor %} -
ModelsXQuADMLQA + Models + + {{ dataset[0] }} +
EMF1EMF1EM↑F1↑
URA-LLaMa 70B0.01 ± 0.000.17 ± 0.000.01 ± 0.000.18 ± 0.00
URA-LLaMa 13B0.00 ± 0.000.09 ± 0.000.00 ± 0.000.10 ± 0.00
URA-LLaMa 7B0.00 ± 0.000.09 ± 0.000.00 ± 0.000.10 ± 0.00
LLaMa-2 13B0.00 ± 0.000.02 ± 0.000.00 ± 0.000.03 ± 0.00
LLaMa-2 7B0.00 ± 0.000.02 ± 0.000.00 ± 0.000.02 ± 0.00
Vietcuna 7B0.00 ± 0.000.06 ± 0.000.00 ± 0.000.05 ± 0.00
MixSUra 8x7B0.00 ± -0.11 ± -0.00 ± -0.12 ± -
GPT-3.50.00 ± 0.000.19 ± 0.000.00 ± 0.000.20 ± 0.00
GPT-40.00 ± 0.000.24 ± 0.000.00 ± 0.000.25 ± 0.00
+ {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/robustness-aware/sentiment-analysis.md b/_pages/vi/robustness-aware/sentiment-analysis.md index 47f6d45..d647b96 100644 --- a/_pages/vi/robustness-aware/sentiment-analysis.md +++ b/_pages/vi/robustness-aware/sentiment-analysis.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/vi/robustness-aware/sentiment-analysis --- # Robustness-Aware Sentiment Analysis Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVLSP 2016UiT-VSFC
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.63 ± 0.010.48 ± 0.010.60 ± 0.010.09 ± 0.010.83 ± 0.040.71 ± 0.010.45 ± 0.010.80 ± 0.010.08 ± 0.010.99 ± 0.01
URA-LLaMa 13B0.55 ± 0.020.52 ± 0.020.59 ± 0.010.06 ± 0.010.74 ± 0.050.72 ± 0.010.44 ± 0.050.77 ± 0.010.18 ± 0.010.77 ± 0.02
URA-LLaMa 7B0.52 ± 0.020.36 ± 0.030.59 ± 0.010.07 ± 0.010.66 ± 0.050.73 ± 0.010.41 ± 0.010.71 ± 0.010.16 ± 0.010.87 ± 0.02
LLaMa-2 13B0.46 ± 0.020.30 ± 0.010.55 ± 0.010.39 ± 0.020.70 ± 0.050.66 ± 0.010.40 ± 0.010.63 ± 0.010.11 ± 0.010.89 ± 0.02
LLaMa-2 7B0.45 ± 0.020.36 ± 0.010.54 ± 0.010.20 ± 0.020.51 ± 0.050.51 ± 0.010.33 ± 0.010.65 ± 0.010.15 ± 0.010.80 ± 0.02
Vietcuna 7B0.44 ± 0.020.27 ± 0.010.51 ± 0.010.23 ± 0.020.53 ± 0.050.49 ± 0.010.25 ± 0.030.46 ± 0.010.33 ± 0.010.34 ± 0.03
MixSUra 8x7B0.59 ± -0.59 ± -0.55 ± -0.34 ± -0.52 ± -0.69 ± -0.44 ± -0.61 ± -0.29 ± -0.66 ± -
GPT-3.50.64 ± 0.010.60 ± 0.01-0.31 ± 0.010.54 ± 0.050.86 ± 0.010.71 ± 0.01-0.53 ± 0.010.86 ± 0.02
GPT-40.74 ± 0.000.73 ± 0.00-0.41 ± 0.000.71 ± 0.000.83 ± 0.000.70 ± 0.00-0.50 ± 0.000.85 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/robustness-aware/summarization.md b/_pages/vi/robustness-aware/summarization.md index 27728c5..b5a3948 100644 --- a/_pages/vi/robustness-aware/summarization.md +++ b/_pages/vi/robustness-aware/summarization.md @@ -3,204 +3,132 @@ layout: default permalink: /leaderboard/vi/robustness-aware/summarization --- # Robustness-Aware Summarization Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsVietNewsWikiLingua
R1R2RLSCBSCvDeCpR1R2RLSCBSCvDeCp
URA-LLaMa 70B0.34 ± 0.000.15 ± 0.000.23 ± 0.00-0.06 ± 0.00-0.11 ± 0.180.10 ± 0.000.10 ± 0.0039.63 ± 0.870.28 ± 0.000.11 ± 0.000.19 ± 0.00-0.16 ± 0.000.25 ± 0.230.50 ± 0.010.50 ± 0.01167.42 ± 7.09
URA-LLaMa 13B0.35 ± 0.000.14 ± 0.000.23 ± 0.00-0.09 ± 0.00-0.07 ± 0.170.64 ± 0.000.65 ± 0.00134.65 ± 3.760.20 ± 0.000.07 ± 0.000.13 ± 0.00-0.17 ± 0.000.20 ± 0.110.38 ± 0.000.38 ± 0.00103.69 ± 3.33
URA-LLaMa 7B0.37 ± 0.000.12 ± 0.000.24 ± 0.00-0.10 ± 0.00-0.24 ± 0.180.65 ± 0.000.65 ± 0.0017.92 ± 0.870.37 ± 0.000.12 ± 0.000.24 ± 0.00-0.17 ± 0.000.11 ± 0.180.65 ± 0.000.65 ± 0.0020.49 ± 0.95
LLaMa-2 13B0.05 ± 0.000.01 ± 0.000.04 ± 0.00-0.15 ± 0.00-0.24 ± 0.180.03 ± 0.000.03 ± 0.0055.91 ± 0.650.04 ± 0.000.00 ± 0.000.03 ± 0.00-0.17 ± 0.000.09 ± 0.000.05 ± 0.000.05 ± 0.0066.85 ± 6.72
LLaMa-2 7B0.05 ± 0.000.01 ± 0.000.05 ± 0.00-0.10 ± 0.00-0.19 ± 0.040.07 ± 0.000.07 ± 0.0055.29 ± 0.880.04 ± 0.000.00 ± 0.000.04 ± 0.00-0.17 ± 0.000.15 ± 0.000.06 ± 0.000.06 ± 0.0058.32 ± 3.32
Vietcuna 7B0.03 ± 0.000.01 ± 0.000.02 ± 0.00-0.10 ± 0.00-0.18 ± 0.060.91 ± 0.000.91 ± 0.001026.61 ± 3.860.08 ± 0.000.02 ± 0.000.05 ± 0.00-0.17 ± 0.00-0.19 ± 0.050.78 ± 0.000.78 ± 0.00505.45 ± 8.64
MixSUra 8x7B0.41 ± -0.19 ± -0.26 ± -- ± --0.03 ± -0.86 ± -0.87 ± -29.15 ± -0.46 ± -0.21 ± -0.28 ± -- ± -0.26 ± -0.88 ± -0.98 ± -19.10 ± -
GPT-3.50.34 ± 0.000.19 ± 0.000.23 ± 0.00-0.10 ± 0.000.05 ± 0.140.81 ± 0.000.81 ± 0.00128.44 ± 2.940.39 ± 0.000.19 ± 0.000.25 ± 0.00-0.17 ± 0.000.28 ± 0.110.82 ± 0.000.82 ± 0.00200.90 ± 7.40
GPT-40.39 ± 0.000.21 ± 0.000.26 ± 0.00-0.10 ± 0.090.04 ± 0.000.83 ± 0.000.83 ± 0.7124.48 ± 0.000.45 ± 0.000.20 ± 0.000.27 ± 0.00-0.17 ± 0.000.28 ± 0.000.80 ± 0.030.81 ± 0.0020.40 ± 1.59
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/robustness-aware/text-classification.md b/_pages/vi/robustness-aware/text-classification.md index a48a0ed..72a19a3 100644 --- a/_pages/vi/robustness-aware/text-classification.md +++ b/_pages/vi/robustness-aware/text-classification.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/vi/robustness-aware/text-classification --- # Robustness-Aware Text Classification Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-VSMECPhoATIS
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.25 ± 0.000.16 ± 0.000.56 ± 0.020.20 ± 0.000.33 ± 0.000.16 ± 0.020.26 ± 0.030.79 ± 0.000.79 ± 0.020.08 ± 0.06
URA-LLaMa 13B0.30 ± 0.000.11 ± 0.000.51 ± 0.010.26 ± 0.000.44 ± 0.000.01 ± 0.010.05 ± 0.010.47 ± 0.010.84 ± 0.010.00 ± 0.04
URA-LLaMa 7B0.29 ± 0.000.10 ± 0.000.57 ± 0.010.17 ± 0.000.30 ± 0.000.02 ± 0.010.04 ± 0.000.55 ± 0.010.18 ± 0.010.01 ± 0.02
LLaMa-2 13B0.19 ± 0.000.07 ± 0.000.52 ± 0.010.47 ± 0.000.43 ± 0.000.02 ± 0.000.06 ± 0.000.57 ± 0.010.91 ± 0.000.01 ± 0.00
LLaMa-2 7B0.17 ± 0.000.10 ± 0.000.55 ± 0.000.33 ± 0.000.29 ± 0.000.01 ± 0.010.00 ± 0.000.56 ± 0.000.69 ± 0.010.02 ± 0.02
Vietcuna 7B0.09 ± 0.000.09 ± 0.000.51 ± 0.010.91 ± 0.000.09 ± 0.000.02 ± 0.010.01 ± 0.000.55 ± 0.010.23 ± 0.010.02 ± 0.01
MixSUra 8x7B0.35 ± -0.27 ± -0.70 ± -0.58 ± -0.70 ± -0.80 ± -55 ± -0.94 ± -0.15 ± -0.88 ± -
GPT-3.50.42 ± 0.000.41 ± 0.00-0.28 ± 0.000.30 ± 0.000.68 ± 0.020.64 ± 0.03-0.62 ± 0.020.70 ± 0.05
GPT-40.48 ± 0.000.45 ± 0.00-0.33 ± 0.000.40 ± 0.000.86 ± 0.010.80 ± 0.02-0.80 ± 0.010.91 ± 0.03
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/robustness-aware/toxicity-detection.md b/_pages/vi/robustness-aware/toxicity-detection.md index dd7ba94..670fef2 100644 --- a/_pages/vi/robustness-aware/toxicity-detection.md +++ b/_pages/vi/robustness-aware/toxicity-detection.md @@ -3,144 +3,96 @@ layout: default permalink: /leaderboard/vi/robustness-aware/toxicity-detection --- # Robustness-Aware Toxicity Detection Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelsUiT-ViCTSDUiT-ViHSD
ACF1ARECEA@10ACF1ARECEA@10
URA-LLaMa 70B0.32 ± 0.000.21 ± 0.000.72 ± 0.010.62 ± 0.000.33 ± 0.000.14 ± 0.000.12 ± 0.000.64 ± 0.020.61 ± 0.000.23 ± 0.00
URA-LLaMa 13B0.27 ± 0.000.26 ± 0.000.56 ± 0.000.56 ± 0.000.12 ± 0.000.18 ± 0.000.11 ± 0.000.57 ± 0.010.45 ± 0.000.20 ± 0.00
URA-LLaMa 7B0.22 ± 0.000.21 ± 0.000.63 ± 0.000.39 ± 0.000.36 ± 0.000.12 ± 0.000.07 ± 0.000.62 ± 0.000.38 ± 0.000.19 ± 0.00
LLaMa-2 13B0.12 ± 0.000.11 ± 0.000.56 ± 0.010.66 ± 0.000.12 ± 0.000.10 ± 0.000.07 ± 0.000.59 ± 0.010.62 ± 0.000.24 ± 0.00
LLaMa-2 7B0.04 ± 0.000.04 ± 0.000.62 ± 0.000.86 ± 0.000.02 ± 0.000.01 ± 0.000.00 ± 0.000.54 ± 0.000.79 ± 0.000.00 ± 0.00
Vietcuna 7B0.11 ± 0.000.11 ± 0.000.54 ± 0.000.39 ± 0.000.13 ± 0.000.09 ± 0.000.05 ± 0.000.50 ± 0.000.24 ± 0.000.08 ± 0.00
MixSUra 8x7B0.72 ± -0.39 ± -- ± -0.25 ± -0.81 ± -0.66 ± -0.31 ± -0.67 ± -0.21 ± -0.82 ± -
GPT-3.50.51 ± 0.000.46 ± 0.000.50 ± 0.000.01 ± 0.000.54 ± 0.000.64 ± 0.000.47 ± 0.00- ± -0.30 ± 0.000.63 ± 0.00
GPT-40.88 ± 0.000.71 ± 0.00- ± -0.38 ± 0.000.88 ± 0.000.78 ± 0.000.56 ± 0.00- ± -0.44 ± 0.000.78 ± 0.00
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/robustness-aware/translation.md b/_pages/vi/robustness-aware/translation.md index 99b4d0b..fb9ea78 100644 --- a/_pages/vi/robustness-aware/translation.md +++ b/_pages/vi/robustness-aware/translation.md @@ -3,124 +3,84 @@ layout: default permalink: /leaderboard/vi/robustness-aware/translation --- # Robustness-Aware Translation Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | PhoMT | PhoMT | PhoMT | PhoMT | OPUS100 | OPUS100 | OPUS100 | OPUS100 |
|---|---|---|---|---|---|---|---|---|
| | (En → Vi) | (Vi → En) | (En → Vi) | (Vi → En) | (En → Vi) | (Vi → En) | (En → Vi) | (Vi → En) |
| URA-LLaMa 70B | 0.25 ± 0.00 | 0.58 ± 0.00 | 0.11 ± 0.00 | 0.51 ± 0.00 | 0.05 ± 0.00 | 0.40 ± 0.01 | 0.06 ± 0.00 | 0.36 ± 0.00 |
| URA-LLaMa 13B | 0.23 ± 0.00 | 0.55 ± 0.00 | 0.10 ± 0.00 | 0.50 ± 0.00 | 0.03 ± 0.00 | 0.38 ± 0.01 | 0.05 ± 0.00 | 0.38 ± 0.00 |
| URA-LLaMa 7B | 0.15 ± 0.00 | 0.48 ± 0.00 | 0.06 ± 0.00 | 0.46 ± 0.00 | 0.02 ± 0.00 | 0.35 ± 0.00 | 0.03 ± 0.00 | 0.34 ± 0.01 |
| LLaMa-2 13B | 0.20 ± 0.00 | 0.51 ± 0.00 | 0.07 ± 0.00 | 0.44 ± 0.00 | 0.03 ± 0.00 | 0.36 ± 0.01 | 0.04 ± 0.00 | 0.32 ± 0.00 |
| LLaMa-2 7B | 0.13 ± 0.00 | 0.41 ± 0.00 | 0.05 ± 0.00 | 0.42 ± 0.00 | 0.02 ± 0.00 | 0.31 ± 0.00 | 0.03 ± 0.00 | 0.30 ± 0.00 |
| Vietcuna 7B | 0.17 ± 0.00 | 0.43 ± 0.00 | 0.07 ± 0.01 | 0.41 ± 0.00 | 0.09 ± 0.01 | 0.38 ± 0.01 | 0.09 ± 0.01 | 0.33 ± 0.00 |
| MixSUra 8x7B | 0.14 ± - | 0.50 ± - | 0.11 ± - | 0.46 ± - | 0.06 ± - | 0.36 ± - | 0.06 ± - | 0.31 ± - |
| GPT-3.5 | 0.31 ± 0.00 | 0.64 ± 0.00 | 0.17 ± 0.00 | 0.59 ± 0.00 | 0.15 ± 0.01 | 0.49 ± 0.01 | 0.21 ± 0.01 | 0.48 ± 0.00 |
| GPT-4 | 0.31 ± 0.00 | 0.65 ± 0.00 | 0.20 ± 0.00 | 0.62 ± 0.00 | 0.16 ± 0.01 | 0.50 ± 0.01 | 0.23 ± 0.01 | 0.51 ± 0.00 |
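Several metric names in the translation data contain spaces ("BLEU envi", "hLEPOR vien", and their `_std` counterparts), which is why the template that follows reads them with bracket lookups such as `dataset[1][model]["BLEU envi"]`. A hedged sketch of the expected shape, with placeholder figures (the dataset and model names are taken from the table above, the numbers are not):

```yaml
# Hypothetical excerpt of the robustness-aware translation data file.
# Key names mirror the bracket lookups in the template below; the numeric
# values here are placeholders.
PhoMT:
  GPT-4:
    BLEU envi: 0.0
    BLEU envi_std: 0.0
    BLEU vien: 0.0
    BLEU vien_std: 0.0
    hLEPOR envi: 0.0
    hLEPOR envi_std: 0.0
    hLEPOR vien: 0.0
    hLEPOR vien_std: 0.0
```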
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + BLEU envi↑ + BLEU vien↑ + hLEPOR envi↑ + hLEPOR vien↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].robustness_aware.translation %} + {% assign bleu_envi_best = 0 %} + {% assign bleu_vien_best = 0 %} + {% assign hlepor_envi_best = 0 %} + {% assign hlepor_vien_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["BLEU envi"] and dataset[1][m]["BLEU envi"] > bleu_envi_best %} + {% assign bleu_envi_best = dataset[1][m]["BLEU envi"] %} + {% endif %} + {% if dataset[1][m]["BLEU vien"] and dataset[1][m]["BLEU vien"] > bleu_vien_best %} + {% assign bleu_vien_best = dataset[1][m]["BLEU vien"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR envi"] and dataset[1][m]["hLEPOR envi"] > hlepor_envi_best %} + {% assign hlepor_envi_best = dataset[1][m]["hLEPOR envi"] %} + {% endif %} + {% if dataset[1][m]["hLEPOR vien"] and dataset[1][m]["hLEPOR vien"] > hlepor_vien_best %} + {% assign hlepor_vien_best = dataset[1][m]["hLEPOR vien"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["BLEU envi"] %} + {{ dataset[1][model]["BLEU envi"] | round: 2 }} ± {{ dataset[1][model]["BLEU envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["BLEU vien"] %} + {{ dataset[1][model]["BLEU vien"] | round: 2 }} ± {{ dataset[1][model]["BLEU vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["hLEPOR envi"] %} + {{ dataset[1][model]["hLEPOR envi"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR envi_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["hLEPOR vien"] %} + {{ dataset[1][model]["hLEPOR vien"] | round: 2 }} ± {{ dataset[1][model]["hLEPOR vien_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/weaker-prompt/question-answering.md b/_pages/vi/weaker-prompt/question-answering.md index 6355ddf..25a17aa 100644 --- a/_pages/vi/weaker-prompt/question-answering.md +++ b/_pages/vi/weaker-prompt/question-answering.md @@ -3,63 +3,60 @@ layout: default permalink: /leaderboard/vi/weaker-prompt/question-answering --- # Weak-Prompt Question Answering Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.21 ± 0.01 | 0.47 ± 0.01 | 0.14 ± 0.01 | 0.41 ± 0.00 |
| URA-LLaMa 13B | 0.22 ± 0.01 | 0.43 ± 0.01 | 0.17 ± 0.01 | 0.40 ± 0.01 |
| URA-LLaMa 7B | 0.13 ± 0.00 | 0.32 ± 0.00 | 0.10 ± 0.00 | 0.32 ± 0.00 |
| LLaMa-2 13B | 0.04 ± 0.00 | 0.28 ± 0.00 | 0.04 ± 0.00 | 0.28 ± 0.00 |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.24 ± 0.00 | 0.05 ± 0.00 | 0.24 ± 0.00 |
| MixSUra 8x7B | 0.13 ± - | 0.38 ± - | 0.09 ± - | 0.36 ± - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/weaker-prompt/summarization.md b/_pages/vi/weaker-prompt/summarization.md index 2f6774c..2c1d732 100644 --- a/_pages/vi/weaker-prompt/summarization.md +++ b/_pages/vi/weaker-prompt/summarization.md @@ -3,147 +3,132 @@ layout: default permalink: /leaderboard/vi/weaker-prompt/summarization --- # Weak-Prompt Summarization Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews R1 | VietNews R2 | VietNews RL | VietNews SC | VietNews BS | VietNews Cv | VietNews De | VietNews Cp | WikiLingua R1 | WikiLingua R2 | WikiLingua RL | WikiLingua SC | WikiLingua BS | WikiLingua Cv | WikiLingua De | WikiLingua Cp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.49 ± 0.00 | 0.23 ± 0.00 | 0.31 ± 0.00 | -0.08 ± 0.00 | 0.05 ± 0.11 | 0.89 ± 0.00 | 8.90 ± 0.03 | 18.48 ± 0.59 | 0.47 ± 0.00 | 0.20 ± 0.00 | 0.29 ± 0.00 | -0.16 ± 0.00 | 0.19 ± 0.13 | 0.86 ± 0.00 | 6.83 ± 0.09 | 25.30 ± 1.86 |
| URA-LLaMa 13B | 0.27 ± 0.00 | 0.12 ± 0.00 | 0.18 ± 0.00 | -0.09 ± 0.00 | 0.05 ± 0.11 | 0.56 ± 0.00 | 5.00 ± 0.04 | 153.55 ± 0.99 | 0.22 ± 0.00 | 0.09 ± 0.00 | 0.14 ± 0.00 | -0.16 ± 0.00 | 0.20 ± 0.007 | 0.48 ± 0.00 | 3.49 ± 0.04 | 190.09 ± 4.92 |
| URA-LLaMa 7B | 0.45 ± 0.00 | 0.21 ± 0.00 | 0.29 ± 0.00 | -0.08 ± 0.00 | 0.03 ± 0.09 | 0.91 ± 0.00 | 9.43 ± 0.03 | 6.42 ± 0.05 | 0.42 ± 0.00 | 0.18 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.07 ± 0.12 | 0.89 ± 0.00 | 7.58 ± 0.05 | 7.14 ± 0.14 |
| LLaMa-2 13B | 0.45 ± 0.00 | 0.22 ± 0.00 | 0.29 ± 0.00 | -0.09 ± 0.00 | 0.00 ± 0.14 | 0.92 ± 0.00 | 9.49 ± 0.02 | 8.46 ± 0.29 | 0.47 ± 0.00 | 0.22 ± 0.00 | 0.29 ± 0.00 | -0.16 ± 0.00 | 0.34 ± 0.12 | 0.92 ± 0.00 | 9.39 ± 0.05 | 17.94 ± 2.84 |
| LLaMa-2 7B | 0.36 ± 0.00 | 0.17 ± 0.00 | 0.23 ± 0.00 | -0.09 ± 0.00 | -0.15 ± 0.12 | 0.69 ± 0.00 | 6.35 ± 0.03 | 7.59 ± 0.21 | 0.45 ± 0.00 | 0.20 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.36 ± 0.00 | 0.83 ± 0.00 | 7.71 ± 0.07 | 12.39 ± 1.46 |
| MixSUra 8x7B | 0.44 ± - | 0.22 ± - | 0.29 ± - | - ± - | 0.07 ± - | 0.97 ± - | 35.67 ± - | 9.43 ± - | 0.47 ± - | 0.22 ± - | 0.29 ± - | - ± - | 0.19 ± - | 0.97 ± - | 28.97 ± - | 10.27 ± - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].weaker_prompt.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/information-retrieval.md b/_pages/vi/zero-shot/information-retrieval.md index c844a8f..6768e9d 100644 --- a/_pages/vi/zero-shot/information-retrieval.md +++ b/_pages/vi/zero-shot/information-retrieval.md @@ -3,113 +3,84 @@ layout: default permalink: /leaderboard/vi/zero-shot/information-retrieval --- # Zero-Shot Information Retrieval Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | mMARCO M@10 | mMARCO M@10B | mMARCO N@10 | mMARCO N@10B | mRobust04 M@10 | mRobust04 M@10B | mRobust04 N@10 | mRobust04 N@10B |
|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | - | - | - | - | - | - | - | - |
| URA-LLaMa 13B | - | - | - | - | - | - | - | - |
| URA-LLaMa 7B | 0.06 ± 0.00 | 0.14 ± 0.00 | 0.09 ± 0.00 | 0.21 ± 0.00 | - | - | - | - |
| LLaMa-2 13B | - | - | - | - | - | - | - | - |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.11 ± 0.00 | 0.08 ± 0.00 | 0.17 ± 0.00 | - | - | - | - |
| Vietcuna 7B | - | - | - | - | - | - | - | - |
| GPT-3.5 | - | - | - | - | - | - | - | - |
| GPT-4 | - | - | - | - | - | - | - | - |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + M@10↑ + M@10B↑ + N@10↑ + N@10B↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.information_retrieval %} + {% assign M10_best = 0 %} + {% assign M10B_best = 0 %} + {% assign N10_best = 0 %} + {% assign N10B_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m]["M@10"] and dataset[1][m]["M@10"] > M10_best %} + {% assign M10_best = dataset[1][m]["M@10"] %} + {% endif %} + {% if dataset[1][m]["M@10B"] and dataset[1][m]["M@10B"] > M10B_best %} + {% assign M10B_best = dataset[1][m]["M@10B"] %} + {% endif %} + {% if dataset[1][m]["N@10"] and dataset[1][m]["N@10"] > N10_best %} + {% assign N10_best = dataset[1][m]["N@10"] %} + {% endif %} + {% if dataset[1][m]["N@10B"] and dataset[1][m]["N@10B"] > N10B_best %} + {% assign N10B_best = dataset[1][m]["N@10B"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model]["M@10"] %} + {{ dataset[1][model]["M@10"] | round: 2 }} ± {{ dataset[1][model]["M@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["M@10B"] %} + {{ dataset[1][model]["M@10B"] | round: 2 }} ± {{ dataset[1][model]["M@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10"] %} + {{ dataset[1][model]["N@10"] | round: 2 }} ± {{ dataset[1][model]["N@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["N@10B"] %} + {{ dataset[1][model]["N@10B"] | round: 2 }} ± {{ dataset[1][model]["N@10B_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/knowledge.md b/_pages/vi/zero-shot/knowledge.md index 04ce434..63dd085 100644 --- a/_pages/vi/zero-shot/knowledge.md +++ b/_pages/vi/zero-shot/knowledge.md @@ -2,105 +2,129 @@ layout: default permalink: /leaderboard/vi/zero-shot/knowledge --- -# Zero-Shot Knowledge Leaderboard +# Zero-shot Knowledge Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | ZaloE2E EM | ZaloE2E F1 | ViMMRC AC | ViMMRC F1 | ViMMRC AR | ViMMRC ECE | ViMMRC A@10 |
|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.28 ± 0.02 | 0.44 ± 0.02 | 0.80 ± 0.02 | 0.80 ± 0.02 | 0.85 ± 0.01 | 0.10 ± 0.02 | 0.96 ± 0.03 |
| URA-LLaMa 13B | 0.12 ± 0.01 | 0.22 ± 0.01 | 0.40 ± 0.02 | 0.31 ± 0.02 | 0.57 ± 0.02 | 0.48 ± 0.02 | 0.42 ± 0.08 |
| URA-LLaMa 7B | 0.09 ± 0.01 | 0.20 ± 0.02 | 0.30 ± 0.02 | 0.10 ± 0.01 | 0.56 ± 0.02 | 0.27 ± 0.02 | 0.56 ± 0.07 |
| LLaMa-2 13B | 0.06 ± 0.01 | 0.10 ± 0.01 | 0.52 ± 0.02 | 0.41 ± 0.02 | 0.64 ± 0.02 | 0.33 ± 0.02 | 0.73 ± 0.07 |
| LLaMa-2 7B | 0.03 ± 0.01 | 0.07 ± 0.01 | 0.37 ± 0.02 | 0.25 ± 0.02 | 0.51 ± 0.02 | 0.35 ± 0.02 | 0.29 ± 0.06 |
| Vietcuna 7B | 0.03 ± 0.01 | 0.06 ± 0.01 | 0.32 ± 0.02 | 0.22 ± 0.02 | 0.50 ± 0.00 | 0.07 ± 0.02 | 0.33 ± 0.07 |
| GPT-3.5 | 0.37 ± 0.02 | 0.56 ± 0.02 | 0.90 ± 0.01 | 0.72 ± 0.01 | - | 0.65 ± 0.01 | 0.90 ± 0.04 |
| GPT-4 | 0.38 ± 0.02 | 0.55 ± 0.02 | 0.92 ± 0.01 | 0.73 ± 0.06 | - | 0.67 ± 0.01 | 0.90 ± 0.04 |
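The knowledge leaderboard mixes two schemas: ZaloE2E is open-ended (EM/F1) while ViMMRC is multiple-choice (AC/F1/AR/ECE/A@10), and the template that follows switches between the two layouts on a `num_fields` flag. A sketch of how the data file might encode this, using the GPT-4 figures from the table above; the exact placement of `num_fields` beside the model entries is an assumption:

```yaml
# Hypothetical excerpt of the zero-shot knowledge data file.
ZaloE2E:
  num_fields: 2          # tells the template to render only the EM/F1 columns
  GPT-4:
    EM: 0.38
    EM_std: 0.02
    F1: 0.55
    F1_std: 0.02
ViMMRC:                  # no num_fields flag, so the full metric set is rendered
  GPT-4:
    AC: 0.92
    AC_std: 0.01
    F1: 0.73
    F1_std: 0.06
    ECE: 0.67
    ECE_std: 0.01
    "A@10": 0.90
    "A@10_std": 0.04
    # AR is omitted, matching the "-" cell in the table above
```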
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + + {{ dataset[0] }} + + {% else %} + + {{ dataset[0] }} + + {% endif %} + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% if dataset[1].num_fields == 2 %} + EM↑ + F1↑ + {% else %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endif %} + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.knowledge %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign AC_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + {% if dataset[1].num_fields == 2 %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% else %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endif %} + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/language-modeling.md b/_pages/vi/zero-shot/language-modeling.md index 20f125a..232093f 100644 --- a/_pages/vi/zero-shot/language-modeling.md +++ b/_pages/vi/zero-shot/language-modeling.md @@ -3,149 +3,108 @@ layout: default permalink: /leaderboard/vi/zero-shot/language-modeling --- # Zero-Shot Language Modeling Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | MLQA-MLM EM | MLQA-MLM CER | MLQA-MLM WER | MLQA-MLM CED | MLQA-MLM WED | MLQA-MLM PLX | VSEC EM | VSEC CER | VSEC WER | VSEC CED | VSEC WED | VSEC PLX |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.00 ± 0.00 | 0.50 ± 0.01 | 0.64 ± 0.01 | 519.09 ± 10.96 | 115.82 ± 2.45 | 1.08 ± 0.01 | 0.00 ± 0.00 | 0.88 ± 0.00 | 1.01 ± 0.00 | 113.51 ± 0.57 | 29.91 ± 0.15 | 1.09 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.67 ± 0.00 | 0.78 ± 0.00 | 697.85 ± 11.62 | 161.34 ± 2.64 | 1.16 ± 0.02 | 0.01 ± 0.00 | 0.42 ± 0.01 | 0.56 ± 0.01 | 54.88 ± 0.77 | 14.50 ± 0.19 | 1.26 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.73 ± 0.00 | 0.88 ± 0.01 | 684.00 ± 13.18 | 166.87 ± 3.18 | 1.25 ± 0.01 | 0.01 ± 0.00 | 3.33 ± 0.04 | 3.14 ± 0.03 | 420.34 ± 5.66 | 85.79 ± 0.96 | 1.33 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.90 ± 0.00 | 1.00 ± 0.00 | 881.97 ± 11.23 | 208.52 ± 2.52 | 1.10 ± 0.01 | 0.00 ± 0.00 | 1.32 ± 0.01 | 1.40 ± 0.01 | 160.06 ± 1.16 | 38.12 ± 0.23 | 1.11 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.95 ± 0.00 | 1.07 ± 0.01 | 860.42 ± 13.18 | 210.21 ± 3.18 | 1.25 ± 0.01 | 0.00 ± 0.00 | 1.54 ± 0.04 | 1.55 ± 0.03 | 171.28 ± 5.66 | 40.18 ± 0.96 | 1.14 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 1.00 ± 0.00 | 1.00 ± 0.00 | 951.53 ± 12.37 | 208.57 ± 2.73 | 1.48 ± 0.01 | 0.01 ± 0.00 | 1.11 ± 0.01 | 1.20 ± 0.01 | 139.90 ± 1.39 | 33.94 ± 0.33 | 1.61 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.34 ± 0.01 | 0.50 ± 0.01 | 422.30 ± 10.79 | 100.33 ± 2.44 | - | 0.02 ± 0.00 | 0.16 ± 0.00 | 0.30 ± 0.00 | 12.63 ± 0.34 | 3.48 ± 0.09 | - |
| GPT-4 | 0.04 ± 0.00 | 0.40 ± 0.01 | 0.45 ± 0.01 | 381.88 ± 10.26 | 93.34 ± 2.39 | - | 0.60 ± 0.01 | 0.14 ± 0.00 | 0.26 ± 0.00 | 13.58 ± 0.45 | 3.67 ± 0.12 | - |
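For language modeling, the template below tracks the best scores with the opposite sense for the error metrics (CER, WER, CED, WED, and PLX are lower-is-better, so their `*_best` trackers start high), and any metric a model does not report, such as PLX for GPT-3.5 and GPT-4 above, is rendered as `-`. A sketch of one entry, using the GPT-4 VSEC figures from the table; the file layout itself is assumed:

```yaml
# Hypothetical excerpt of the zero-shot language-modeling data file.
VSEC:
  GPT-4:
    EM: 0.60
    EM_std: 0.01
    CER: 0.14
    CER_std: 0.00
    WER: 0.26
    WER_std: 0.00
    CED: 13.58
    CED_std: 0.45
    WED: 3.67
    WED_std: 0.12
    # PLX is intentionally absent; the template falls back to "-"
```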
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + EM↑ + CER↓ + WER↓ + CED↓ + WED↓ + PLX↓ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.language_modeling %} + {% assign EM_best = 0 %} + {% assign CER_best = 1 %} + {% assign WER_best = 1 %} + {% assign CED_best = 10000 %} + {% assign WED_best = 10000 %} + {% assign PLX_best = 10000 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].CER and dataset[1][m].CER < CER_best %} + {% assign CER_best = dataset[1][m].CER %} + {% endif %} + {% if dataset[1][m].WER and dataset[1][m].WER < WER_best %} + {% assign WER_best = dataset[1][m].WER %} + {% endif %} + {% if dataset[1][m].CED and dataset[1][m].CED < CED_best %} + {% assign CED_best = dataset[1][m].CED %} + {% endif %} + {% if dataset[1][m].WED and dataset[1][m].WED < WED_best %} + {% assign WED_best = dataset[1][m].WED %} + {% endif %} + {% if dataset[1][m].PLX and dataset[1][m].PLX < PLX_best %} + {% assign PLX_best = dataset[1][m].PLX %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].CER %} + {{ dataset[1][model].CER | round: 2 }} ± {{ dataset[1][model].CER_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].WER %} + {{ dataset[1][model].WER | round: 2 }} ± {{ dataset[1][model].WER_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].CED %} + {{ dataset[1][model].CED | round: 2 }} ± {{ dataset[1][model].CED_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].WED %} + {{ dataset[1][model].WED | round: 2 }} ± {{ dataset[1][model].WED_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].PLX %} + {{ dataset[1][model].PLX | round: 2 }} ± {{ dataset[1][model].PLX_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/question-answering.md b/_pages/vi/zero-shot/question-answering.md index 2b0501d..a50c437 100644 --- a/_pages/vi/zero-shot/question-answering.md +++ b/_pages/vi/zero-shot/question-answering.md @@ -3,77 +3,60 @@ layout: default permalink: /leaderboard/vi/zero-shot/question-answering --- # Zero-Shot Question Answering Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | XQuAD EM | XQuAD F1 | MLQA EM | MLQA F1 |
|---|---|---|---|---|
| URA-LLaMa 70B | 0.06 ± 0.00 | 0.30 ± 0.00 | 0.04 ± 0.00 | 0.28 ± 0.00 |
| URA-LLaMa 13B | 0.00 ± 0.00 | 0.14 ± 0.00 | 0.00 ± 0.00 | 0.15 ± 0.00 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.14 ± 0.00 | 0.00 ± 0.00 | 0.16 ± 0.00 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.02 | 0.05 ± 0.00 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.05 ± 0.00 | 0.00 ± 0.00 | 0.06 ± 0.00 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 |
| GPT-3.5 | 0.00 ± 0.00 | 0.24 ± 0.00 | 0.00 ± 0.00 | 0.25 ± 0.00 |
| GPT-4 | 0.00 ± 0.00 | 0.27 ± 0.00 | 0.00 ± 0.00 | 0.27 ± 0.00 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + EM↑ + F1↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.question_answering %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/reasoning.md b/_pages/vi/zero-shot/reasoning.md index 627b97a..7fb07b7 100644 --- a/_pages/vi/zero-shot/reasoning.md +++ b/_pages/vi/zero-shot/reasoning.md @@ -3,123 +3,72 @@ layout: default permalink: /leaderboard/vi/zero-shot/reasoning --- # Zero-Shot Reasoning Leaderboard +{% assign lang = 'vi' %} - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + + {% endfor %} - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.reasoning %} + {% assign EM_best = 0 %} + {% assign F1_best = 0 %} + {% assign Equ_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].EM and dataset[1][m].EM > EM_best %} + {% assign EM_best = dataset[1][m].EM %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m]["Equ"] and dataset[1][m]["Equ"] > Equ_best %} + {% assign Equ_best = dataset[1][m]["Equ"] %} + {% endif %} + {% endfor %} + + + + {% endfor %} + {% endfor %} -
ModelsSR - NaturalSR - Abstract symbolMATH + Models + + {{ dataset[0] }} +
EMF1Equ.EMF1Equ.EMF1Equ.EM↑F1↑Equ↑
| Models | SR - Natural EM | SR - Natural F1 | SR - Natural Equ. | SR - Abstract symbol EM | SR - Abstract symbol F1 | SR - Abstract symbol Equ. | MATH EM | MATH F1 | MATH Equ. |
|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.06 ± 0.00 | 0.34 ± 0.00 | 0.06 ± 0.00 | 0.02 ± 0.00 | 0.24 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.24 ± 0.02 |
| URA-LLaMa 13B | 0.01 ± 0.00 | 0.31 ± 0.00 | 0.02 ± 0.00 | 0.02 ± 0.00 | 0.24 ± 0.00 | 0.01 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.14 ± 0.02 |
| URA-LLaMa 7B | 0.00 ± 0.00 | 0.26 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.17 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.01 |
| LLaMa-2 13B | 0.00 ± 0.00 | 0.06 ± 0.00 | 0.00 ± 0.00 | 0.02 ± 0.00 | 0.19 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.16 ± 0.02 |
| LLaMa-2 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.05 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.06 ± 0.01 |
| Vietcuna 7B | 0.00 ± 0.00 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.10 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 |
| GPT-3.5 | 0.21 ± 0.00 | 0.59 ± 0.00 | 0.32 ± 0.00 | 0.09 ± 0.00 | 0.28 ± 0.00 | 0.13 ± 0.00 | 0.00 ± 0.00 | 0.01 ± 0.00 | 0.72 ± 0.02 |
GPT-40.21 ± 0.000.59 ± 0.000.32 ± 0.000.09 ± 0.000.28 ± 0.000.13 ± 0.000.00 ± 0.000.01 ± 0.000.76 ± 0.02 + {{ model }} + + {% if dataset[1][model].EM %} + {{ dataset[1][model].EM | round: 2 }} ± {{ dataset[1][model].EM_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["Equ"] %} + {{ dataset[1][model]["Equ"] | round: 2 }} ± {{ dataset[1][model]["Equ_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/zero-shot/sentiment-analysis.md b/_pages/vi/zero-shot/sentiment-analysis.md index 58202ae..91f4074 100644 --- a/_pages/vi/zero-shot/sentiment-analysis.md +++ b/_pages/vi/zero-shot/sentiment-analysis.md @@ -3,157 +3,96 @@ layout: default permalink: /leaderboard/vi/zero-shot/sentiment-analysis --- # Zero-Shot Sentiment Analysis Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.sentiment_analysis %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsVLSP 2016UiT-VSFC + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
| Models | VLSP 2016 AC | VLSP 2016 F1 | VLSP 2016 AR | VLSP 2016 ECE | VLSP 2016 A@10 | UiT-VSFC AC | UiT-VSFC F1 | UiT-VSFC AR | UiT-VSFC ECE | UiT-VSFC A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.63 ± 0.02 | 0.63 ± 0.02 | 0.74 ± 0.01 | 0.15 ± 0.01 | 0.87 ± 0.03 | 0.64 ± 0.01 | 0.54 ± 0.01 | 0.85 ± 0.01 | 0.14 ± 0.00 | 0.98 ± 0.01 |
| URA-LLaMa 13B | 0.52 ± 0.02 | 0.35 ± 0.01 | 0.60 ± 0.01 | 0.10 ± 0.01 | 0.64 ± 0.05 | 0.70 ± 0.01 | 0.40 ± 0.01 | 0.72 ± 0.01 | 0.23 ± 0.01 | 0.95 ± 0.01 |
| URA-LLaMa 7B | 0.35 ± 0.02 | 0.24 ± 0.01 | 0.54 ± 0.01 | 0.24 ± 0.01 | 0.31 ± 0.05 | 0.27 ± 0.01 | 0.18 ± 0.00 | 0.52 ± 0.01 | 0.37 ± 0.01 | 0.03 ± 0.01 |
| LLaMa-2 13B | 0.25 ± 0.01 | 0.25 ± 0.01 | 0.49 ± 0.01 | 0.39 ± 0.01 | 0.29 ± 0.05 | 0.29 ± 0.01 | 0.24 ± 0.01 | 0.52 ± 0.01 | 0.42 ± 0.01 | 0.30 ± 0.03 |
| LLaMa-2 7B | 0.15 ± 0.01 | 0.15 ± 0.01 | 0.58 ± 0.01 | 0.73 ± 0.01 | 0.12 ± 0.03 | 0.04 ± 0.00 | 0.06 ± 0.01 | 0.49 ± 0.01 | 0.79 ± 0.00 | 0.01 ± 0.01 |
| Vietcuna 7B | 0.11 ± 0.01 | 0.12 ± 0.01 | 0.49 ± 0.01 | 0.68 ± 0.01 | 0.11 ± 0.03 | 0.05 ± 0.00 | 0.06 ± 0.00 | 0.56 ± 0.01 | 0.73 ± 0.00 | 0.05 ± 0.01 |
| MixSUra 8x7B | 0.45 ± - | 0.30 ± - | 0.62 ± - | 0.50 ± - | 0.49 ± - | 0.55 ± - | 0.40 ± - | 0.66 ± - | 0.41 ± - | 0.60 ± - |
| Gemini Pro | 0.64 ± - | 0.47 ± - | - | 0.31 ± - | 0.53 ± - | 0.76 ± - | 0.49 ± - | - | 0.43 ± - | 0.77 ± - |
| GPT-3.5 | 0.62 ± 0.02 | 0.56 ± 0.01 | - | 0.29 ± 0.02 | 0.62 ± 0.05 | 0.81 ± 0.31 | 0.68 ± 0.31 | - | 0.48 ± 0.01 | 0.83 ± 0.02 |
GPT-40.71 ± 0.010.68 ± 0.01-0.37 ± 0.010.70 ± 0.040.80 ± 0.010.67 ± 0.01-0.47 ± 0.010.85 ± 0.02 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_pages/vi/zero-shot/summarization.md b/_pages/vi/zero-shot/summarization.md index 3226c1f..c5f1a47 100644 --- a/_pages/vi/zero-shot/summarization.md +++ b/_pages/vi/zero-shot/summarization.md @@ -3,185 +3,132 @@ layout: default permalink: /leaderboard/vi/zero-shot/summarization --- # Zero-Shot Summarization Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Models | VietNews R1 | VietNews R2 | VietNews RL | VietNews SC | VietNews BS | VietNews Cv | VietNews De | VietNews Cp | WikiLingua R1 | WikiLingua R2 | WikiLingua RL | WikiLingua SC | WikiLingua BS | WikiLingua Cv | WikiLingua De | WikiLingua Cp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.42 ± 0.17 | 0.21 ± 0.12 | 0.28 ± 0.00 | -0.11 ± 0.00 | 0.03 ± 0.19 | 0.85 ± 0.00 | 14.59 ± 0.05 | 17.21 ± 0.33 | 0.37 ± 0.00 | 0.16 ± 0.00 | 0.24 ± 0.00 | -0.22 ± 0.00 | 0.26 ± 0.16 | 0.17 ± 0.00 | 0.22 ± 0.00 | 22.24 ± 0.97 |
| URA-LLaMa 13B | 0.38 ± 0.00 | 0.18 ± 0.00 | 0.25 ± 0.00 | -0.09 ± 0.00 | 0.01 ± 0.18 | 0.71 ± 0.00 | 6.01 ± 0.07 | 24.27 ± 0.61 | 0.22 ± 0.00 | 0.08 ± 0.00 | 0.14 ± 0.00 | -0.16 ± 0.00 | -0.13 ± 0.12 | 0.42 ± 0.01 | 3.06 ± 0.10 | 49.58 ± 1.16 |
| URA-LLaMa 7B | 0.38 ± 0.00 | 0.14 ± 0.00 | 0.25 ± 0.00 | -0.09 ± 0.00 | 0.04 ± 0.12 | 0.65 ± 0.00 | 4.88 ± 0.03 | 7.77 ± 0.05 | 0.40 ± 0.00 | 0.15 ± 0.00 | 0.26 ± 0.00 | -0.16 ± 0.00 | 0.19 ± 0.07 | 0.73 ± 0.00 | 4.79 ± 0.07 | 6.22 ± 0.07 |
| LLaMa-2 13B | 0.06 ± 0.00 | 0.02 ± 0.00 | 0.04 ± 0.00 | -0.09 ± 0.00 | -0.18 ± 0.04 | 0.07 ± 0.00 | 0.43 ± 0.01 | 28.25 ± 0.24 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 | -0.16 ± 0.00 | -0.11 ± 0.08 | 0.03 ± 0.00 | 0.07 ± 0.01 | 19.55 ± 0.51 |
| LLaMa-2 7B | 0.06 ± 0.00 | 0.01 ± 0.00 | 0.05 ± 0.00 | -0.09 ± 0.00 | -0.23 ± 0.04 | 0.06 ± 0.00 | 0.21 ± 0.00 | 15.75 ± 0.20 | 0.04 ± 0.00 | 0.00 ± 0.00 | 0.03 ± 0.00 | -0.16 ± 0.00 | -0.14 ± 0.07 | 0.03 ± 0.00 | 0.06 ± 0.00 | 17.84 ± 0.50 |
| Vietcuna 7B | 0.28 ± 0.00 | 0.06 ± 0.00 | 0.18 ± 0.00 | -0.09 ± 0.00 | -0.09 ± 0.09 | 0.31 ± 0.00 | 0.80 ± 0.01 | 171.63 ± 1.71 | 0.24 ± 0.00 | 0.06 ± 0.00 | 0.15 ± 0.00 | -0.16 ± 0.00 | -0.18 ± 0.07 | 0.51 ± 0.01 | 1.16 ± 0.01 | 238.67 ± 3.37 |
| GPT-3.5 | 0.36 ± 0.00 | 0.20 ± 0.00 | 0.24 ± 0.00 | -0.09 ± 0.00 | 0.04 ± 0.13 | 0.86 ± 0.00 | 3.97 ± 0.02 | 13.32 ± 0.65 | 0.43 ± 0.00 | 0.21 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.22 ± 0.03 | 0.87 ± 0.00 | 3.29 ± 0.03 | 35.50 ± 0.82 |
| GPT-4 | 0.41 ± 0.00 | 0.21 ± 0.00 | 0.26 ± 0.00 | -0.08 ± 0.00 | -0.04 ± 0.11 | 0.84 ± 0.00 | 3.45 ± 0.00 | 15.43 ± 0.49 | 0.44 ± 0.00 | 0.21 ± 0.00 | 0.27 ± 0.00 | -0.16 ± 0.00 | 0.24 ± 0.04 | 0.82 ± 0.00 | 2.37 ± 0.01 | 6.61 ± 0.16 |
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + R1↑ + R2↑ + RL↑ + SC↑ + BS↑ + Cv↑ + De↑ + Cp↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.summarization %} + {% assign R1_best = 0 %} + {% assign R2_best = 0 %} + {% assign RL_best = 0 %} + {% assign SC_best = -1 %} + {% assign BS_best = 0 %} + {% assign Cv_best = 0 %} + {% assign De_best = 0 %} + {% assign Cp_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].R1 and dataset[1][m].R1 > R1_best %} + {% assign R1_best = dataset[1][m].R1 %} + {% endif %} + {% if dataset[1][m].R2 and dataset[1][m].R2 > R2_best %} + {% assign R2_best = dataset[1][m].R2 %} + {% endif %} + {% if dataset[1][m].RL and dataset[1][m].RL > RL_best %} + {% assign RL_best = dataset[1][m].RL %} + {% endif %} + {% if dataset[1][m].SC and dataset[1][m].SC > SC_best %} + {% assign SC_best = dataset[1][m].SC %} + {% endif %} + {% if dataset[1][m].BS and dataset[1][m].BS > BS_best %} + {% assign BS_best = dataset[1][m].BS %} + {% endif %} + {% if dataset[1][m].Cv and dataset[1][m].Cv > Cv_best %} + {% assign Cv_best = dataset[1][m].Cv %} + {% endif %} + {% if dataset[1][m].De and dataset[1][m].De > De_best %} + {% assign De_best = dataset[1][m].De %} + {% endif %} + {% if dataset[1][m].Cp and dataset[1][m].Cp > Cp_best %} + {% assign Cp_best = dataset[1][m].Cp %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].R1 %} + {{ dataset[1][model].R1 | round: 2 }} ± {{ dataset[1][model].R1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].R2 %} + {{ dataset[1][model].R2 | round: 2 }} ± {{ dataset[1][model].R2_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].RL %} + {{ dataset[1][model].RL | round: 2 }} ± {{ dataset[1][model].RL_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].SC %} + {{ dataset[1][model].SC | round: 2 }} ± {{ dataset[1][model].SC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].BS %} + {{ dataset[1][model].BS | round: 2 }} ± {{ dataset[1][model].BS_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cv %} + {{ dataset[1][model].Cv | round: 2 }} ± {{ dataset[1][model].Cv_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].De %} + {{ dataset[1][model].De | round: 2 }} ± {{ dataset[1][model].De_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].Cp %} + {{ dataset[1][model].Cp | round: 2 }} ± {{ dataset[1][model].Cp_std | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/text-classification.md b/_pages/vi/zero-shot/text-classification.md index 3e62b94..5ee8d17 100644 --- a/_pages/vi/zero-shot/text-classification.md +++ b/_pages/vi/zero-shot/text-classification.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/vi/zero-shot/text-classification --- # Zero-Shot Text Classification Leaderboard +{% assign lang = 'vi' %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
| Models | UiT-VSMEC AC | UiT-VSMEC F1 | UiT-VSMEC AR | UiT-VSMEC ECE | UiT-VSMEC A@10 | PhoATIS AC | PhoATIS F1 | PhoATIS AR | PhoATIS ECE | PhoATIS A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.40 ± 0.02 | 0.32 ± 0.02 | 0.68 ± 0.01 | 0.14 ± 0.02 | 0.60 ± 0.06 | 0.56 ± 0.02 | 0.48 ± 0.03 | 0.85 ± 0.00 | 0.25 ± 0.02 | 0.56 ± 0.06 |
| URA-LLaMa 13B | 0.29 ± 0.02 | 0.25 ± 0.02 | 0.52 ± 0.01 | 0.09 ± 0.01 | 0.23 ± 0.05 | 0.10 ± 0.01 | 0.10 ± 0.01 | 0.72 ± 0.00 | 0.52 ± 0.01 | 0.14 ± 0.04 |
| URA-LLaMa 7B | 0.13 ± 0.01 | 0.11 ± 0.01 | 0.50 ± 0.01 | 0.15 ± 0.01 | 0.21 ± 0.05 | 0.04 ± 0.01 | 0.04 ± 0.02 | 0.77 ± 0.00 | 0.30 ± 0.01 | 0.04 ± 0.02 |
| LLaMa-2 13B | 0.11 ± 0.01 | 0.10 ± 0.01 | 0.49 ± 0.01 | 0.31 ± 0.01 | 0.09 ± 0.04 | 0.03 ± 0.01 | 0.02 ± 0.00 | 0.45 ± 0.01 | 0.28 ± 0.01 | 0.03 ± 0.02 |
| LLaMa-2 7B | 0.07 ± 0.01 | 0.08 ± 0.01 | 0.52 ± 0.01 | 0.35 ± 0.01 | 0.07 ± 0.03 | 0.00 ± 0.06 | 0.00 ± 0.06 | 0.61 ± 0.01 | 0.32 ± 0.00 | 0.00 ± 0.00 |
| Vietcuna 7B | 0.05 ± 0.01 | 0.02 ± 0.01 | 0.52 ± 0.01 | 0.95 ± 0.01 | 0.03 ± 0.02 | 0.05 ± 0.01 | 0.01 ± 0.00 | 0.66 ± 0.00 | 0.20 ± 0.01 | 0.01 ± 0.21 |
| GPT-3.5 | 0.43 ± 0.02 | 0.37 ± 0.02 | - | 0.29 ± 0.02 | 0.43 ± 0.06 | 0.44 ± 0.02 | 0.38 ± 0.03 | - | 0.38 ± 0.02 | 0.44 ± 0.05 |
| GPT-4 | 0.49 ± 0.02 | 0.46 ± 0.02 | - | 0.35 ± 0.02 | 0.50 ± 0.06 | 0.89 ± 0.01 | 0.69 ± 0.02 | - | 0.83 ± 0.01 | 0.89 ± 0.03 |
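Every template in this patch iterates `site.data.leaderboard[lang].models.models` to build its rows, so the set and order of models shown in these tables is controlled by a single shared list rather than by each page. A sketch of that list, assuming the standard Jekyll `_data` mapping for the lookup path; the file name and exact contents are assumptions, with the model names taken from the tables above:

```yaml
# Hypothetical _data/leaderboard/vi/models.yml
models:
  - URA-LLaMa 70B
  - URA-LLaMa 13B
  - URA-LLaMa 7B
  - LLaMa-2 13B
  - LLaMa-2 7B
  - Vietcuna 7B
  - MixSUra 8x7B
  - Gemini Pro
  - GPT-3.5
  - GPT-4
```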
+ + + + Models + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + + {{ dataset[0] }} + + {% endfor %} + + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + AC↑ + F1↑ + AR↑ + ECE↓ + A@10↑ + {% endfor %} + + + + {% for model in site.data.leaderboard[lang].models.models %} + + + {{ model }} + + {% for dataset in site.data.leaderboard[lang].zero_shot.text_classification %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} + + {% endfor %} + + {% endfor %} + + \ No newline at end of file diff --git a/_pages/vi/zero-shot/toxicity-detection.md b/_pages/vi/zero-shot/toxicity-detection.md index 945112f..41f6688 100644 --- a/_pages/vi/zero-shot/toxicity-detection.md +++ b/_pages/vi/zero-shot/toxicity-detection.md @@ -3,131 +3,96 @@ layout: default permalink: /leaderboard/vi/zero-shot/toxicity-detection --- # Zero-Shot Toxicity Detection Leaderboard +{% assign lang = 'vi' %} - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + + {% endfor %} - - - - - - - - - - + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + + + + + + {% endfor %} + {% for model in site.data.leaderboard[lang].models.models %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + {% for dataset in site.data.leaderboard[lang].zero_shot.toxicity_detection %} + {% assign AC_best = 0 %} + {% assign F1_best = 0 %} + {% assign AR_best = 0 %} + {% assign ECE_best = 1 %} + {% assign A10_best = 0 %} + {% for m in site.data.leaderboard[lang].models.models %} + {% if dataset[1][m].AC and dataset[1][m].AC > AC_best %} + {% assign AC_best = dataset[1][m].AC %} + {% endif %} + {% if dataset[1][m].F1 and dataset[1][m].F1 > F1_best %} + {% assign F1_best = dataset[1][m].F1 %} + {% endif %} + {% if dataset[1][m].AR and dataset[1][m].AR > AR_best %} + {% assign 
AR_best = dataset[1][m].AR %} + {% endif %} + {% if dataset[1][m].ECE and dataset[1][m].ECE < ECE_best %} + {% assign ECE_best = dataset[1][m].ECE %} + {% endif %} + {% if dataset[1][m]["A@10"] and dataset[1][m]["A@10"] > A10_best %} + {% assign A10_best = dataset[1][m]["A@10"] %} + {% endif %} + {% endfor %} + + + + + + {% endfor %} + {% endfor %} -
ModelsUiT-ViCTSDUiT-ViHSD + Models + + {{ dataset[0] }} +
ACF1ARECEA@10ACF1ARECEA@10AC↑F1↑AR↑ECE↓A@10↑
| Models | UiT-ViCTSD AC | UiT-ViCTSD F1 | UiT-ViCTSD AR | UiT-ViCTSD ECE | UiT-ViCTSD A@10 | UiT-ViHSD AC | UiT-ViHSD F1 | UiT-ViHSD AR | UiT-ViHSD ECE | UiT-ViHSD A@10 |
|---|---|---|---|---|---|---|---|---|---|---|
| URA-LLaMa 70B | 0.61 ± 0.01 | 0.52 ± 0.01 | 0.77 ± 0.01 | 0.17 ± 0.01 | 0.97 ± 0.01 | 0.38 ± 0.01 | 0.34 ± 0.01 | 0.74 ± 0.01 | 0.25 ± 0.01 | 0.91 ± 0.01 |
| URA-LLaMa 13B | 0.46 ± 0.01 | 0.28 ± 0.03 | 0.53 ± 0.02 | 0.22 ± 0.01 | 0.48 ± 0.03 | 0.33 ± 0.01 | 0.18 ± 0.00 | 0.60 ± 0.01 | 0.35 ± 0.01 | 0.54 ± 0.02 |
| URA-LLaMa 7B | 0.25 ± 0.01 | 0.19 ± 0.01 | 0.53 ± 0.01 | 0.38 ± 0.01 | 0.13 ± 0.02 | 0.19 ± 0.00 | 0.13 ± 0.00 | 0.55 ± 0.01 | 0.46 ± 0.01 | 0.13 ± 0.01 |
| LLaMa-2 13B | 0.16 ± 0.01 | 0.14 ± 0.00 | 0.40 ± 0.01 | 0.50 ± 0.01 | 0.24 ± 0.02 | 0.09 ± 0.00 | 0.13 ± 0.00 | 0.38 ± 0.01 | 0.63 ± 0.00 | 0.10 ± 0.01 |
| LLaMa-2 7B | 0.13 ± 0.01 | 0.14 ± 0.01 | 0.45 ± 0.02 | 0.69 ± 0.01 | 0.09 ± 0.01 | 0.03 ± 0.00 | 0.05 ± 0.01 | 0.56 ± 0.01 | 0.75 ± 0.00 | 0.00 ± 0.00 |
| Vietcuna 7B | 0.09 ± 0.00 | 0.07 ± 0.00 | 0.50 ± 0.00 | 0.41 ± 0.00 | 0.10 ± 0.03 | 0.07 ± 0.00 | 0.04 ± 0.00 | 0.50 ± 0.00 | 0.26 ± 0.00 | 0.07 ± 0.01 |
| GPT-3.5 | 0.75 ± 0.01 | 0.61 ± 0.02 | - | 0.25 ± 0.01 | 0.80 ± 0.04 | 0.55 ± 0.01 | 0.42 ± 0.01 | - | 0.22 ± 0.01 | 0.55 ± 0.02 |
GPT-40.89 ± 0.010.69 ± 0.01-0.39 ± 0.010.89 ± 0.030.75 ± 0.010.53 ± 0.01-0.42 ± 0.010.75 ± 0.02 + {{ model }} + + {% if dataset[1][model].AC %} + {{ dataset[1][model].AC | round: 2 }} ± {{ dataset[1][model].AC_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].F1 %} + {{ dataset[1][model].F1 | round: 2 }} ± {{ dataset[1][model].F1_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].AR %} + {{ dataset[1][model].AR | round: 2 }} ± {{ dataset[1][model].AR_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model].ECE %} + {{ dataset[1][model].ECE | round: 2 }} ± {{ dataset[1][model].ECE_std | round: 2 }} + {% else %} + - + {% endif %} + + {% if dataset[1][model]["A@10"] %} + {{ dataset[1][model]["A@10"] | round: 2 }} ± {{ dataset[1][model]["A@10_std"] | round: 2 }} + {% else %} + - + {% endif %} +
+ \ No newline at end of file diff --git a/_site/contact/index.html b/_site/contact/index.html index e8a1d62..dc222e4 100644 --- a/_site/contact/index.html +++ b/_site/contact/index.html @@ -12,7 +12,7 @@ MELT - + @@ -20,7 +20,7 @@