diff --git a/inspire_classifier/api.py b/inspire_classifier/api.py index e746054..f22c7a5 100644 --- a/inspire_classifier/api.py +++ b/inspire_classifier/api.py @@ -190,18 +190,18 @@ def validate(validation_df): except IOError as error: raise IOError("There was a problem loading the classifier model") from error predictions = [] - true_labels = [] validation_df = validation_df.sample(frac=1, random_state=42) for _, row in tqdm( - validation_df.iterrows(), total=len(validation_df.labels.values) + validation_df.iterrows(), total=len(validation_df.label.values) ): predicted_value = classifier.predict( row.text, temperature=current_app.config["CLASSIFIER_SOFTMAX_TEMPERATUR"] ) predicted_class = np.argmax(predicted_value) predictions.append(predicted_class) - true_labels.append(row.labels) - print("f1 score ", f1_score(true_labels, predictions, average="micro")) - pprint(classification_report(true_labels, predictions)) - pprint(confusion_matrix(true_labels, predictions)) + validation_df.insert(2, 'predicted_label', predictions) + validation_df.to_csv(f"{path_for('data')}/validation_results.csv", index=False) + print("f1 score ", f1_score(validation_df["label"], validation_df["predicted_label"], average="micro")) + pprint(classification_report(validation_df["label"], validation_df["predicted_label"])) + pprint(confusion_matrix(validation_df["label"], validation_df["predicted_label"])) diff --git a/inspire_classifier/config.py b/inspire_classifier/config.py index 2d6d532..22d2fff 100644 --- a/inspire_classifier/config.py +++ b/inspire_classifier/config.py @@ -27,16 +27,16 @@ CLASSIFIER_MINIMUM_WORD_FREQUENCY = 2 CLASSIFIER_VALIDATION_DATA_FRACTION = 0.1 CLASSIFIER_LANGUAGE_MODEL_CYCLE_LENGTH = 15 -CLASSIFIER_CLASSIFIER_CYCLE_LENGTH = 14 -CLASSIFIER_LANGUAGE_MODEL_BATCH_SIZE = 32 -CLASSIFIER_CLASSIFIER_BATCH_SIZE = 10 +CLASSIFIER_CLASSIFIER_CYCLE_LENGTH = 15 +CLASSIFIER_LANGUAGE_MODEL_BATCH_SIZE = 64 +CLASSIFIER_CLASSIFIER_BATCH_SIZE = 128 CLASSIFIER_SOFTMAX_TEMPERATUR = 0.25 CLASSIFIER_CUDA_DEVICE_ID = 0 # set to 0 to use a GPU CLASSIFIER_DATA_PATH = "data" CLASSIFIER_LANGUAGE_MODEL_PATH = "models/language_model" CLASSIFIER_CLASSIFIER_MODEL_PATH = "models/classifier_model" -CLASSIFIER_DATAFRAME_PATH = "data/inspire_data.df" +CLASSIFIER_DATAFRAME_PATH = "data/train_valid_data.df" CLASSIFIER_TRAIN_VALID_DATA_PATH = "data/train_valid_data.csv" CLASSIFIER_FINETUNED_LANGUAGE_MODEL_ENCODER_PATH = ( "models/language_model/finetuned_language_model_encoder.h5" @@ -44,5 +44,5 @@ CLASSIFIER_TRAINED_CLASSIFIER_PATH = ( "models/classifier_model/trained_classifier_model.h5" ) -CLASSIFIER_DATA_ITOS_PATH = "data/inspire_data_itos.pkl" +CLASSIFIER_DATA_ITOS_PATH = "data/train_valid_data_itos.pkl" PROMETHEUS_ENABLE_EXPORTER_FLASK = False diff --git a/inspire_classifier/domain/models.py b/inspire_classifier/domain/models.py index 9fb2a13..1514e79 100644 --- a/inspire_classifier/domain/models.py +++ b/inspire_classifier/domain/models.py @@ -46,7 +46,7 @@ def __init__( else: default_device(False) - number_of_backpropagation_through_time_steps = 70 + number_of_backpropagation_through_time_steps = 100 train_valid_data = pd.read_csv(train_valid_data_dir) @@ -64,7 +64,7 @@ def __init__( dls_lm = dblock_lm.dataloaders( train_valid_data, - bs=batch_size // 2, + bs=batch_size, num_workers=multiprocessing.cpu_count() // 2, pin_memory=True, ) @@ -88,7 +88,9 @@ def train(self, finetuned_language_model_encoder_save_path, cycle_length=15): self.learner.fit_one_cycle(1, 1e-2) self.learner.unfreeze() self.learner.fit_one_cycle(cycle_length, 1e-3) + print("language model training finished, saving encoder") save_encoder_path(self.learner, finetuned_language_model_encoder_save_path) + print("encoder saved") class Classifier: @@ -108,11 +110,11 @@ def load_training_and_validation_data( train_valid_data = pd.read_csv(train_valid_data_dir) self.dataloader = TextDataLoaders.from_df( train_valid_data, - label_col="labels", + label_col="label", text_col="text", valid_col="is_valid", is_lm=False, - bs=batch_size // 2, + bs=batch_size, num_workers=multiprocessing.cpu_count() // 2, pin_memory=True, text_vocab=self.dls_lm_vocab, @@ -152,7 +154,7 @@ def train(self, trained_classifier_save_path, cycle_length=14): self.learner.fit_one_cycle(1, slice(5e-3 / (2.6**4), 5e-3)) self.learner.unfreeze() self.learner.fit_one_cycle(cycle_length, slice(1e-3 / (2.6**4), 1e-3)) - + print("Core classifier model training finished") export_classifier_path(self.learner, trained_classifier_save_path) self.calculate_f1_for_validation_dataset() diff --git a/scripts/create_dataset.py b/scripts/create_dataset.py index ff430b1..29c7ada 100644 --- a/scripts/create_dataset.py +++ b/scripts/create_dataset.py @@ -63,10 +63,12 @@ def __init__(self, index, query_filters, year_from, year_to): if index == "holdingpen-hep": self.source_fields = [ + "id", "metadata.abstracts", "metadata.titles", "metadata.inspire_categories", ] + self.id_field = "id" self.title_field = "metadata.titles[0].title" self.abstract_field = "metadata.abstracts[0].value" self.inspire_categories_field = "metadata.inspire_categories.term" @@ -81,7 +83,8 @@ def __init__(self, index, query_filters, year_from, year_to): ), ] else: - self.source_fields = ["abstracts", "titles", "inspire_categories"] + self.source_fields = ["id", "abstracts", "titles", "inspire_categories"] + self.id_field = "id" self.title_field = "titles[0].title" self.abstract_field = "abstracts[0].value" self.inspire_categories_field = "inspire_categories.term" @@ -91,10 +94,12 @@ def __init__(self, index, query_filters, year_from, year_to): ] def _postprocess_record_data(self, record_data): + id = get_value(record_data, self.id_field) title = get_value(record_data, self.title_field) abstract = get_value(record_data, self.abstract_field) inspire_categories = get_value(record_data, self.inspire_categories_field, []) return { + "id": int(id), "title": title, "abstract": abstract, "inspire_categories": inspire_categories, @@ -121,7 +126,7 @@ def get_data_for_decisions(year_from, year_to): record_classifier_data = inspire_search._postprocess_record_data( record_es_data.to_dict() ) - record_classifier_data["labels"] = DECISIONS_MAPPING[decision]["label"] + record_classifier_data["label"] = DECISIONS_MAPPING[decision]["label"] yield record_classifier_data @@ -133,7 +138,7 @@ def prepare_inspire_classifier_dataset(data, save_data_path): inspire_data_df["text"] = ( inspire_data_df["title"] + " " + inspire_data_df["abstract"] ) - inspire_classifier_data_df = inspire_data_df[["labels", "text"]] + inspire_classifier_data_df = inspire_data_df[["id", "inspire_categories", "label", "text"]] inspire_classifier_data_df.to_pickle(save_data_path) diff --git a/scripts/train_classifier.py b/scripts/train_classifier.py index 3754076..996b1d0 100644 --- a/scripts/train_classifier.py +++ b/scripts/train_classifier.py @@ -13,35 +13,36 @@ def train_classifier( os.makedirs(os.path.join(os.getcwd(), "classifier", "data"), exist_ok=True) df = pd.read_pickle(text_path) - print(df["labels"].value_counts()) - train_size = round(min(df["labels"].value_counts()) * train_test_split) - test_size = round(min(df["labels"].value_counts()) * (1 - train_test_split)) + df = df.sample(frac=1, random_state=42).reset_index(drop=True) + print(df["label"].value_counts()) + train_size = round(min(df["label"].value_counts()) * train_test_split) + test_size = round(min(df["label"].value_counts()) * (1 - train_test_split)) print(train_size) print(test_size) - grouped_df = df.groupby("labels", as_index=False).sample( + grouped_df = df.groupby("label", as_index=False).sample( n=train_size, random_state=42 ) test_df = df.drop(grouped_df.index) - grouped_test_df = test_df.groupby("labels", as_index=False).sample( + grouped_test_df = test_df.groupby("label", as_index=False).sample( n=test_size, random_state=42 ) test_df = grouped_test_df.reset_index(drop=True) df = grouped_df.reset_index(drop=True) - df.to_pickle(os.path.join("classifier/data", "inspire_data.df")) + df.to_pickle(os.path.join("classifier/data", "train_valid_data.df")) test_df.to_pickle(os.path.join("classifier/data", "test_data.df")) print("-----------------") print("Inspire Data:") print(f"dataframe size: {df.shape}") print("categories: ") - print(df["labels"].value_counts()) + print(df["label"].value_counts()) print("-----------------") print("Test Data:") print(f"dataframe size: {test_df.shape}") print("categories: ") - print(test_df["labels"].value_counts()) + print(test_df["label"].value_counts()) print("-----------------") os.system(