Skip to content

Commit

Permalink
improve df structure
Browse files Browse the repository at this point in the history
  • Loading branch information
PascalEgn committed Jul 15, 2024
1 parent 017167b commit 19d1b36
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 27 deletions.
12 changes: 6 additions & 6 deletions inspire_classifier/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,18 @@ def validate(validation_df):
except IOError as error:
raise IOError("There was a problem loading the classifier model") from error
predictions = []
true_labels = []
validation_df = validation_df.sample(frac=1, random_state=42)
for _, row in tqdm(
validation_df.iterrows(), total=len(validation_df.labels.values)
validation_df.iterrows(), total=len(validation_df.label.values)
):
predicted_value = classifier.predict(
row.text, temperature=current_app.config["CLASSIFIER_SOFTMAX_TEMPERATUR"]
)
predicted_class = np.argmax(predicted_value)
predictions.append(predicted_class)
true_labels.append(row.labels)

print("f1 score ", f1_score(true_labels, predictions, average="micro"))
pprint(classification_report(true_labels, predictions))
pprint(confusion_matrix(true_labels, predictions))
validation_df.insert(2, 'predicted_label', predictions)
validation_df.to_csv(f"{path_for('data')}/validation_results.csv", index=False)
print("f1 score ", f1_score(validation_df["label"], validation_df["predicted_label"], average="micro"))
pprint(classification_report(validation_df["label"], validation_df["predicted_label"]))
pprint(confusion_matrix(validation_df["label"], validation_df["predicted_label"]))
10 changes: 5 additions & 5 deletions inspire_classifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,22 @@
CLASSIFIER_MINIMUM_WORD_FREQUENCY = 2
CLASSIFIER_VALIDATION_DATA_FRACTION = 0.1
CLASSIFIER_LANGUAGE_MODEL_CYCLE_LENGTH = 15
CLASSIFIER_CLASSIFIER_CYCLE_LENGTH = 14
CLASSIFIER_LANGUAGE_MODEL_BATCH_SIZE = 32
CLASSIFIER_CLASSIFIER_BATCH_SIZE = 10
CLASSIFIER_CLASSIFIER_CYCLE_LENGTH = 15
CLASSIFIER_LANGUAGE_MODEL_BATCH_SIZE = 64
CLASSIFIER_CLASSIFIER_BATCH_SIZE = 128
CLASSIFIER_SOFTMAX_TEMPERATUR = 0.25
CLASSIFIER_CUDA_DEVICE_ID = 0 # CUDA device id; 0 selects a GPU (the current setting)

CLASSIFIER_DATA_PATH = "data"
CLASSIFIER_LANGUAGE_MODEL_PATH = "models/language_model"
CLASSIFIER_CLASSIFIER_MODEL_PATH = "models/classifier_model"
CLASSIFIER_DATAFRAME_PATH = "data/inspire_data.df"
CLASSIFIER_DATAFRAME_PATH = "data/train_valid_data.df"
CLASSIFIER_TRAIN_VALID_DATA_PATH = "data/train_valid_data.csv"
CLASSIFIER_FINETUNED_LANGUAGE_MODEL_ENCODER_PATH = (
"models/language_model/finetuned_language_model_encoder.h5"
)
CLASSIFIER_TRAINED_CLASSIFIER_PATH = (
"models/classifier_model/trained_classifier_model.h5"
)
CLASSIFIER_DATA_ITOS_PATH = "data/inspire_data_itos.pkl"
CLASSIFIER_DATA_ITOS_PATH = "data/train_valid_data_itos.pkl"
PROMETHEUS_ENABLE_EXPORTER_FLASK = False
12 changes: 7 additions & 5 deletions inspire_classifier/domain/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(
else:
default_device(False)

number_of_backpropagation_through_time_steps = 70
number_of_backpropagation_through_time_steps = 100

train_valid_data = pd.read_csv(train_valid_data_dir)

Expand All @@ -64,7 +64,7 @@ def __init__(

dls_lm = dblock_lm.dataloaders(
train_valid_data,
bs=batch_size // 2,
bs=batch_size,
num_workers=multiprocessing.cpu_count() // 2,
pin_memory=True,
)
Expand All @@ -88,7 +88,9 @@ def train(self, finetuned_language_model_encoder_save_path, cycle_length=15):
self.learner.fit_one_cycle(1, 1e-2)
self.learner.unfreeze()
self.learner.fit_one_cycle(cycle_length, 1e-3)
print("language model training finished, saving encoder")
save_encoder_path(self.learner, finetuned_language_model_encoder_save_path)
print("encoder saved")


class Classifier:
Expand All @@ -108,11 +110,11 @@ def load_training_and_validation_data(
train_valid_data = pd.read_csv(train_valid_data_dir)
self.dataloader = TextDataLoaders.from_df(
train_valid_data,
label_col="labels",
label_col="label",
text_col="text",
valid_col="is_valid",
is_lm=False,
bs=batch_size // 2,
bs=batch_size,
num_workers=multiprocessing.cpu_count() // 2,
pin_memory=True,
text_vocab=self.dls_lm_vocab,
Expand Down Expand Up @@ -152,7 +154,7 @@ def train(self, trained_classifier_save_path, cycle_length=14):
self.learner.fit_one_cycle(1, slice(5e-3 / (2.6**4), 5e-3))
self.learner.unfreeze()
self.learner.fit_one_cycle(cycle_length, slice(1e-3 / (2.6**4), 1e-3))

print("Core classifier model training finished")
export_classifier_path(self.learner, trained_classifier_save_path)
self.calculate_f1_for_validation_dataset()

Expand Down
11 changes: 8 additions & 3 deletions scripts/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,12 @@ def __init__(self, index, query_filters, year_from, year_to):

if index == "holdingpen-hep":
self.source_fields = [
"id",
"metadata.abstracts",
"metadata.titles",
"metadata.inspire_categories",
]
self.id_field = "id"
self.title_field = "metadata.titles[0].title"
self.abstract_field = "metadata.abstracts[0].value"
self.inspire_categories_field = "metadata.inspire_categories.term"
Expand All @@ -81,7 +83,8 @@ def __init__(self, index, query_filters, year_from, year_to):
),
]
else:
self.source_fields = ["abstracts", "titles", "inspire_categories"]
self.source_fields = ["id", "abstracts", "titles", "inspire_categories"]
self.id_field = "id"
self.title_field = "titles[0].title"
self.abstract_field = "abstracts[0].value"
self.inspire_categories_field = "inspire_categories.term"
Expand All @@ -91,10 +94,12 @@ def __init__(self, index, query_filters, year_from, year_to):
]

def _postprocess_record_data(self, record_data):
id = get_value(record_data, self.id_field)
title = get_value(record_data, self.title_field)
abstract = get_value(record_data, self.abstract_field)
inspire_categories = get_value(record_data, self.inspire_categories_field, [])
return {
"id": int(id),
"title": title,
"abstract": abstract,
"inspire_categories": inspire_categories,
Expand All @@ -121,7 +126,7 @@ def get_data_for_decisions(year_from, year_to):
record_classifier_data = inspire_search._postprocess_record_data(
record_es_data.to_dict()
)
record_classifier_data["labels"] = DECISIONS_MAPPING[decision]["label"]
record_classifier_data["label"] = DECISIONS_MAPPING[decision]["label"]
yield record_classifier_data


Expand All @@ -133,7 +138,7 @@ def prepare_inspire_classifier_dataset(data, save_data_path):
inspire_data_df["text"] = (
inspire_data_df["title"] + " <ENDTITLE> " + inspire_data_df["abstract"]
)
inspire_classifier_data_df = inspire_data_df[["labels", "text"]]
inspire_classifier_data_df = inspire_data_df[["id", "inspire_categories", "label", "text"]]
inspire_classifier_data_df.to_pickle(save_data_path)


Expand Down
17 changes: 9 additions & 8 deletions scripts/train_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,35 +13,36 @@ def train_classifier(
os.makedirs(os.path.join(os.getcwd(), "classifier", "data"), exist_ok=True)

df = pd.read_pickle(text_path)
print(df["labels"].value_counts())
train_size = round(min(df["labels"].value_counts()) * train_test_split)
test_size = round(min(df["labels"].value_counts()) * (1 - train_test_split))
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df["label"].value_counts())
train_size = round(min(df["label"].value_counts()) * train_test_split)
test_size = round(min(df["label"].value_counts()) * (1 - train_test_split))

print(train_size)
print(test_size)
grouped_df = df.groupby("labels", as_index=False).sample(
grouped_df = df.groupby("label", as_index=False).sample(
n=train_size, random_state=42
)
test_df = df.drop(grouped_df.index)
grouped_test_df = test_df.groupby("labels", as_index=False).sample(
grouped_test_df = test_df.groupby("label", as_index=False).sample(
n=test_size, random_state=42
)
test_df = grouped_test_df.reset_index(drop=True)
df = grouped_df.reset_index(drop=True)

df.to_pickle(os.path.join("classifier/data", "inspire_data.df"))
df.to_pickle(os.path.join("classifier/data", "train_valid_data.df"))
test_df.to_pickle(os.path.join("classifier/data", "test_data.df"))

print("-----------------")
print("Inspire Data:")
print(f"dataframe size: {df.shape}")
print("categories: ")
print(df["labels"].value_counts())
print(df["label"].value_counts())
print("-----------------")
print("Test Data:")
print(f"dataframe size: {test_df.shape}")
print("categories: ")
print(test_df["labels"].value_counts())
print(test_df["label"].value_counts())
print("-----------------")

os.system(
Expand Down

0 comments on commit 19d1b36

Please sign in to comment.