Code to run sets of models across all datasets, seeds and heading styles + Readme updated #2

Open · wants to merge 13 commits into base: main
14 changes: 14 additions & 0 deletions LICENSE.md
@@ -0,0 +1,14 @@
MIT License

Copyright (c) 2023 Bonaventure F. P. Dossou

<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Licence Creative Commons" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is made available under the terms of the <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Licence Creative Commons Attribution - No Commercial Use - Sharing under the Same Conditions 4.0 International</a>.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

**The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. The notice has to mention "Built on top of #C", where C is the proper citation of this repository.**

**The authors retain all rights to the datasets and code created. Any third party wishing to access and use our dataset must contact the authors and obtain their approval. In any case, the contents of this repository must not be used for commercial purposes without the authors' approval.**

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
5 changes: 5 additions & 0 deletions README.md
@@ -1,2 +1,7 @@
# masakhane-news
MasakhaNEWS: News Topic Classification for African Languages

- Clone this branch
- Create a folder called `results`
- If needed, adjust the models and related settings in `/home/mila/b/bonaventure.dossou/masakhane-news/code/sample.sh`; otherwise the defaults will train all models, for all languages, across 5 seeds and the two heading styles (`0` considers only the article text; `1` prepends the article's headline to its text, for cases where the text alone does not give enough context to label the article; see the sketch below)
- Run `bash /home/mila/b/bonaventure.dossou/masakhane-news/code/sample.sh` (or submit it with `sbatch`)
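
For reference, a minimal Python sketch of what the two heading styles do, mirroring `read_instances_from_file` in `code/util_textclass.py` (the `data/pcm/train.tsv` path and its `text`/`headline` columns are assumptions based on the repository's data layout):

```python
import pandas as pd

def build_input(text: str, headline: str, header_style: int) -> str:
    """Return the model input for one article under a given heading style."""
    if header_style == 1:
        # Style 1: prepend the headline to the article body for extra context.
        return headline.strip() + ". " + text.strip()
    # Style 0: use the article body alone.
    return text.strip()

# Assumed data layout: TSV with `text`, `headline`, and `category` columns.
df = pd.read_csv("data/pcm/train.tsv", sep="\t")
row = df.iloc[0]
print(build_input(row["text"], row["headline"], header_style=1))
```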
1 change: 1 addition & 0 deletions code/requirements.txt
@@ -10,3 +10,4 @@ scikit-learn
accelerate
sentencepiece!=0.1.92
datasets>=1.8.0
tensorboard
103 changes: 73 additions & 30 deletions code/sample.sh
@@ -1,33 +1,76 @@
export MAX_LENGTH=164
export OUTPUT_FILE=test_result
export OUTPUT_PREDICTION=test_predictions
export BATCH_SIZE=16
#!/bin/bash
#SBATCH --job-name=masakhanews-experiments
#SBATCH --gres=gpu:a100l:2
#SBATCH --cpus-per-gpu=12
#SBATCH --mem=96G
#SBATCH --time=168:00:00
#SBATCH --partition=long
#SBATCH --error=/home/mila/b/bonaventure.dossou/masakhane-news/comErrorXLMRBase.txt
#SBATCH --output=/home/mila/b/bonaventure.dossou/masakhane-news/comOutputXLMRBase.txt

###########cluster information above this line
module load python/3.9 cuda/10.2/cudnn/7.6
source /home/mila/b/bonaventure.dossou/afrispeech/bin/activate
pip install -r requirements.txt
export PYTHONPATH=$PYTHONPATH:~/masakhane-news
export MAX_LENGTH=256
export BATCH_SIZE=64
export NUM_EPOCHS=10
export SAVE_STEPS=500000
export SEED=1
export LANG=pcm
export DATA_DIR=../data/${LANG}

export SAVE_STEPS=1500

export BERT_MODEL=Davlan/afro-xlmr-base
export OUTPUT_DIR=${LANG}_afroxlmrbase
for model in xlm-roberta-base xlm-roberta-large google/rembert microsoft/deberta-v3-base
do
export BERT_MODEL=${model}
if [ ${model} == xlm-roberta-base ]
then
declare -a langs=(eng fra hau ibo lin pcm run swa yor orm sna)
else
declare -a langs=(amh eng fra hau ibo lin pcm run swa yor orm sna)
fi
for lang in "${langs[@]}"
do
export OUTPUT_DIR=/home/mila/b/bonaventure.dossou/masakhane-news/results/${lang}_${model}
export DATA_DIR=/home/mila/b/bonaventure.dossou/masakhane-news/data/${lang}
export LABELS_FILE=${DATA_DIR}/labels.txt
export LANG=${lang}
export OUTPUT_DIR=/home/mila/b/bonaventure.dossou/masakhane-news/results/${lang}_${model}
if [[ ${lang} == eng && ${model} == xlm-roberta-base ]]
then
declare -a seeds=(4 5)
else
declare -a seeds=(1 2 3 4 5)
fi
for seed in "${seeds[@]}"
do
for header_style in 0 1
do
export HEADER_STYLE=${header_style}
export SEED=${seed}
export OUTPUT_FILE=${OUTPUT_DIR}/test_result_${lang}_${seed}_${header_style}
export OUTPUT_PREDICTION=${OUTPUT_DIR}/test_predictions_${lang}_${seed}_${header_style}

CUDA_VISIBLE_DEVICES=3 python3 train_textclass.py --data_dir $DATA_DIR \
--model_type xlmroberta \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--output_result $OUTPUT_FILE \
--output_prediction_file $OUTPUT_PREDICTION \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--learning_rate 5e-5 \
--per_gpu_train_batch_size $BATCH_SIZE \
--per_gpu_eval_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--labels ../data/${LANG}/labels.txt \
--gradient_accumulation_steps 2 \
--do_train \
--do_eval \
--do_predict \
--overwrite_output_dir
CUDA_VISIBLE_DEVICES=2 python train_textclass.py --data_dir $DATA_DIR \
--model_type xlmroberta \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--output_result $OUTPUT_FILE \
--output_prediction_file $OUTPUT_PREDICTION \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--learning_rate 5e-5 \
--per_gpu_train_batch_size $BATCH_SIZE \
--per_gpu_eval_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--labels $LABELS_FILE \
--save_total_limit 1 \
--gradient_accumulation_steps 2 \
--do_train \
--do_eval \
--do_predict \
--overwrite_output_dir \
--header $HEADER_STYLE
done
done
done
done
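
For orientation, the nested loops above sweep a model × language × seed × heading-style grid. A sketch of the same sweep in Python (the lists are copied from the script; the seed and language restrictions presumably skip runs that already completed):

```python
from itertools import product

models = ["xlm-roberta-base", "xlm-roberta-large", "google/rembert",
          "microsoft/deberta-v3-base"]
all_langs = ["amh", "eng", "fra", "hau", "ibo", "lin", "pcm", "run",
             "swa", "yor", "orm", "sna"]

runs = []
for model in models:
    # The script drops amh for xlm-roberta-base.
    langs = [l for l in all_langs
             if not (model == "xlm-roberta-base" and l == "amh")]
    for lang in langs:
        # eng with xlm-roberta-base only runs seeds 4 and 5 in the script.
        seeds = [4, 5] if (lang == "eng" and model == "xlm-roberta-base") \
            else [1, 2, 3, 4, 5]
        for seed, header in product(seeds, [0, 1]):
            runs.append((model, lang, seed, header))

print(len(runs))  # total training runs the script launches
```

Each run writes its results to `results/${lang}_${model}/test_result_${lang}_${seed}_${header_style}`, following the `OUTPUT_FILE` naming set inside the loop.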
45 changes: 19 additions & 26 deletions code/train_textclass.py
Expand Up @@ -22,9 +22,8 @@
import logging
import os
import random

import numpy as np
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
@@ -303,8 +302,8 @@ def evaluate(args, model, tokenizer, labels, mode, prefix="", display_res=False)
"loss": eval_loss,
"precision": eval_report["weighted avg"]["precision"],
"recall": eval_report["weighted avg"]["recall"],
"f1": eval_report["weighted avg"]["f1-score"],
"acc": sklearn.metrics.accuracy_score(out_label_ids, preds),
"f1": eval_report["weighted avg"]["f1-score"]
}

if not display_res:
@@ -331,7 +330,7 @@ def load_and_cache_examples(args, tokenizer, labels, mode='train'):
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
instances = read_instances_from_file(args.data_dir, mode)
instances = read_instances_from_file(args, args.data_dir, mode)
features = convert_instances_to_features_and_labels(instances, tokenizer, labels, args.max_seq_length)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
@@ -481,6 +480,9 @@ def main():
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
parser.add_argument("--header", type=int, default=0, help="with header")
parser.add_argument('--save_total_limit', type=int, default=1, help="Number of checkpoints to save")

args = parser.parse_args()

if (
@@ -548,6 +550,7 @@ def main():
id2label={str(i): label for i, label in enumerate(labels)},
label2id={label: i for i, label in enumerate(labels)},
cache_dir=args.cache_dir if args.cache_dir else None,
save_total_limit=int(args.save_total_limit),
)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
@@ -638,31 +641,21 @@ def main():

output_test_predictions_file = os.path.join(args.output_dir, args.output_prediction_file+".txt")
with open(output_test_predictions_file, "w", encoding='utf-8') as writer:
df = pd.read_csv(os.path.join(args.data_dir, "test.tsv"), sep='\t')
N = df.shape[0]

texts = list(df['headline'].values)
for i in range(N):
output_line = texts[i] + "\t" + id2label[str(predictions[i])] + "\n"
test_path = os.path.join(args.data_dir, "test.tsv")
test_set = pd.read_csv(test_path, delimiter = "\t")

texts = test_set['text'].values
labels = test_set['category'].values
headlines = test_set['headline'].values

for idx, (text_, headline_, label_) in enumerate(zip(texts, headlines, labels)):
if int(args.header) == 1:
text_ = headline_.strip() + ". " + text_.strip()
output_line = text_ + "\t" + id2label[str(predictions[idx])] + "\n"
writer.write(output_line)
'''
with open(os.path.join(args.data_dir, "test.tsv"), "r", encoding='utf-8') as f:
line_data = f.read()
line_data = line_data.splitlines()
for l, line in enumerate(line_data):
if l == 0:
continue
else:
text_vals = line.strip().split("\t")
if len(text_vals) < 2: text_vals += [7]
text, label = text_vals
output_line = text + "\t" + id2label[str(predictions[l-1])] + "\n"
writer.write(output_line)
'''

return results


if __name__ == "__main__":
main()

main()
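
The `evaluate` hunk above reads its precision/recall/F1 from sklearn's `classification_report`; a minimal standalone sketch of that computation (the labels here are dummy values for illustration):

```python
from sklearn.metrics import classification_report

y_true = ["sports", "health", "sports", "politics"]
y_pred = ["sports", "health", "politics", "politics"]

# output_dict=True yields the same nested dict the script indexes into.
report = classification_report(y_true, y_pred, output_dict=True)
results = {
    "precision": report["weighted avg"]["precision"],
    "recall": report["weighted avg"]["recall"],
    "f1": report["weighted avg"]["f1-score"],
}
print(results)
```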
29 changes: 10 additions & 19 deletions code/util_textclass.py
@@ -2,7 +2,6 @@
import torch
import logging
import pandas as pd

import numpy as np
from torch.utils.data import TensorDataset

@@ -24,28 +23,20 @@ def __init__(self, input_ids, attention_mask, token_type_ids, label):
self.token_type_ids = token_type_ids
self.label = label

def read_instances_from_file(data_dir, mode, delimiter="\t"):
def read_instances_from_file(args, data_dir, mode, delimiter="\t"):
file_path = os.path.join(data_dir, "{}.tsv".format(mode))
instances = []

df = pd.read_csv(file_path, sep='\t')
N = df.shape[0]
line_data = pd.read_csv(file_path, sep=delimiter)

for i in range(N):
instances.append(Instance(df['headline'].iloc[i], df['category'].iloc[i]))
'''
with open(file_path, "r", encoding='utf-8') as input_file:
line_data = input_file.read()
texts = line_data['text'].values
labels = line_data['category'].values
headlines = line_data['headline'].values

line_data = line_data.splitlines()
for l, line in enumerate(line_data):
if l==0:
continue
else:
text_vals = line.strip().split(delimiter)
text, label = ' '.join(text_vals[:-1]), text_vals[-1]
instances.append(Instance(text, label))
'''
for text_, headline_, label_ in zip(texts, headlines, labels):
if int(args.header) == 1:
text_ = headline_.strip() + ". " + text_.strip()
instances.append(Instance(text_, label_))

return instances

@@ -92,4 +83,4 @@ def get_labels(path):
labels = f.read().splitlines()
return labels
else:
return ['sports', 'health', 'technology', 'business', 'politics', 'entertainment', 'religion', 'uncategorized']
return ['sports', 'health', 'technology', 'business', 'politics', 'entertainment', 'religion', 'uncategorized']
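
To exercise the refactored loader end to end, a hedged sketch (the `Namespace` stands in for the script's parsed CLI arguments, and `data/pcm` is an assumed dataset path; run from inside `code/` so `util_textclass` is importable):

```python
from argparse import Namespace
from util_textclass import read_instances_from_file, get_labels

# Minimal stand-in for parsed args; the loader only reads args.header.
args = Namespace(header=1)

# With header=1, each instance's text becomes "<headline>. <article text>".
instances = read_instances_from_file(args, "data/pcm", mode="train")
print(vars(instances[0]))

# get_labels falls back to the default MasakhaNEWS label set
# when labels.txt is missing at the given path.
print(get_labels("data/pcm/labels.txt"))
```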