diff --git a/src/osc_transformer_based_extractor/README.md b/src/osc_transformer_based_extractor/README.md
index 6a64ea6..a5abcd6 100644
--- a/src/osc_transformer_based_extractor/README.md
+++ b/src/osc_transformer_based_extractor/README.md
@@ -1,4 +1,3 @@
-
 ---
 
 # Relevance Detector
@@ -11,7 +10,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
 - One must have data from the curator module, which is used for training of the model. The data from the curator module is a CSV file as follows:
 
 ### Example Snippet
-   
+
 | question | context | company | source_file | source_page | kpi_id | year | answer | data_type | relevant_paragraphs | annotator | Index | label |
 |-------------------------------|----------------------------------------------------------------------------------------------------------------------------|---------|-----------------------------------|-------------|--------|------|--------------|-----------|------------------------------------------------|---------------------|-------|-------|
 | What is the company name? | The Company is exposed to a risk of by losses counterparties their contractual financial obligations when due, and in particular depends on the reliability of banks the Company deposits its available cash. | NOVATEK | 04_NOVATEK_AR_2016_ENG_11.pdf | ['0'] | 0 | 2016 | PAO NOVATEK | TEXT | ["PAO NOVATEK ANNUAL REPORT 2016"] | train_anno_large.xlsx | 1022 | 0 |
@@ -22,7 +21,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
 2. **Train the Model**:
    - Use `train_sentence_transformer.ipynb` or `train_sentence_transformer.py` to train a sentence transformer model with the processed data from the `Data` folder and save it locally. Follow the steps in the notebook or script to configure and start the training process.
 
-     
+
    - To train the model using function calling
     ```python
     from train_sentence_transformer import fine_tune_model
@@ -46,7 +45,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
    - `batch_size (int)`: Batch size for training.
    - `output_dir (str)`: Directory to save the trained models.
    - `save_steps (int)`: Number of steps between saving checkpoints.
-     
+
    - To train the model from the command line, run `fine_tune.py` with the required arguments:
     ```bash
     python fine_tune.py \
@@ -76,7 +75,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
 1. **`inference.py`**
    - This script contains the function to make inferences using the trained model.
    - **Usage**: Import this script and use the provided function to predict the relevance of new data.
-   - **Example**: 
+   - **Example**:
     ```python
     from inference import get_inference
     result = get_inference(question="What is the relevance?", paragraph="This is a sample paragraph.", model_path="path/to/model", tokenizer_path="path/to/tokenizer")
@@ -94,7 +93,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
 3. **`train_sentence_transformer.py`**
    - This script defines a function to train a sentence transformer model, which can be called from other scripts or notebooks.
    - **Usage**: Import and call the `fine_tune_model` function to train your model.
-   - **Example**: 
+   - **Example**:
     ```python
     from train_sentence_transformer import fine_tune_model
     fine_tune_model(
@@ -121,7 +120,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
 4. **`fine_tune.py`**
    - This script allows you to train a sentence transformer model from the command line.
    - **Usage**: Run this script from the command line with the necessary arguments.
-   - **Example**: 
+   - **Example**:
     ```bash
     python fine_tune.py \
       --data_path "data/train_data.csv" \
@@ -198,4 +197,4 @@ Contributions are welcome! Please fork the repository and submit a pull request
 
 ---
 
-For further details and documentation, please refer to the individual scripts and notebooks provided in this repository.
\ No newline at end of file
+For further details and documentation, please refer to the individual scripts and notebooks provided in this repository.
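The README hunks above describe the curator CSV that feeds training. As a quick illustration of the expected shape, here is a minimal sketch that loads such a file and verifies the three columns the trainer requires (the file path is a hypothetical placeholder; the column names come from `check_csv_columns` in `fine_tune.py` below):

```python
# Sketch: validate a curator export before training.
# "Data/train_data.csv" is a hypothetical path; the required column
# names mirror check_csv_columns in fine_tune.py.
import pandas as pd

df = pd.read_csv("Data/train_data.csv")
required = ["question", "context", "label"]
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"Curator CSV is missing columns: {missing}")
print(df[required].head())
```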
diff --git a/src/osc_transformer_based_extractor/fine_tune.py b/src/osc_transformer_based_extractor/fine_tune.py
index 3d7d3f3..bfa70c4 100644
--- a/src/osc_transformer_based_extractor/fine_tune.py
+++ b/src/osc_transformer_based_extractor/fine_tune.py
@@ -18,12 +18,12 @@ def check_csv_columns(file_path):
 
     Raises:
         ValueError: If the file does not exist or does not contain the required columns.
-    """ 
+    """
     if not os.path.exists(file_path):
         raise ValueError(f"Data path {file_path} does not exist.")
-    
+
     required_columns = ["question", "context", "label"]
-    
+
     try:
         df = pd.read_csv(file_path)
         missing_columns = [col for col in required_columns if col not in df.columns]
@@ -180,7 +180,7 @@ def fine_tune_model(data_path, model_name, num_labels, max_length, epochs, batch
 
     print(f"Model- {args.model_name} Trained and Saved Successfully at {args.output_dir}")
 
-''' 
+'''
 To run the file in CMD
 
 python fine_tune.py \
@@ -194,4 +194,3 @@ def fine_tune_model(data_path, model_name, num_labels, max_length, epochs, batch
     --save_steps 500
 
 '''
-
diff --git a/src/osc_transformer_based_extractor/inference.py b/src/osc_transformer_based_extractor/inference.py
index c670c27..781a77c 100644
--- a/src/osc_transformer_based_extractor/inference.py
+++ b/src/osc_transformer_based_extractor/inference.py
@@ -12,11 +12,11 @@ def check_model_and_tokenizer_path(model_path, tokenizer_path):
     """
     Check if the model and tokenizer paths are valid.
-    
+
     Args:
         model_path (str): Path to the model file.
         tokenizer_path (str): Path to the tokenizer file.
-    
+
     Raises:
         ValueError: If the model or tokenizer path does not exist.
     """
 
@@ -25,7 +25,7 @@ def check_model_and_tokenizer_path(model_path, tokenizer_path):
     if not os.path.exists(tokenizer_path):
         raise ValueError(f"Tokenizer path {tokenizer_path} does not exist.")
 
-    
+
 
 def check_question_context(question, context):
     if not isinstance(question, str):
@@ -93,8 +93,8 @@ def get_inference(question: str, context: str, model_path: str, tokenizer_path:
 
 
 
-'''python inference.py 
-    --question "What is the capital of France?" 
-    --context "Paris is the capital of France." 
-    --model_path /path/to/model 
+'''python inference.py
+    --question "What is the capital of France?"
+    --context "Paris is the capital of France."
+    --model_path /path/to/model
     --tokenizer_path /path/to/tokenizer'''
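The CLI snippet in `inference.py`'s trailing docstring has a programmatic counterpart. Below is a hedged sketch of the same call through the Python API; the checkpoint paths are hypothetical placeholders. Note that the signature shown in the hunk above takes a `context` keyword, not the `paragraph` keyword used in the README example.

```python
# Sketch: one relevance prediction through the Python API.
# The checkpoint paths below are hypothetical placeholders.
from inference import check_model_and_tokenizer_path, get_inference

model_path = "saved_models/checkpoint-500"      # hypothetical
tokenizer_path = "saved_models/checkpoint-500"  # hypothetical

check_model_and_tokenizer_path(model_path, tokenizer_path)  # raises ValueError on a bad path
result = get_inference(
    question="What is the capital of France?",
    context="Paris is the capital of France.",
    model_path=model_path,
    tokenizer_path=tokenizer_path,
)
print(result)
```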
diff --git a/src/osc_transformer_based_extractor/make_training_data_from_curator.py b/src/osc_transformer_based_extractor/make_training_data_from_curator.py
index 97c70bf..a612631 100644
--- a/src/osc_transformer_based_extractor/make_training_data_from_curator.py
+++ b/src/osc_transformer_based_extractor/make_training_data_from_curator.py
@@ -77,7 +77,7 @@ def make_training_data(curator_data_path: str, kpi_mapping_path: str, output_pat
         if isinstance(kpi_id, float) and kpi_id.is_integer():
             kpi_id = int(kpi_id)
         kpi_dict[kpi_id] = {
-            "question": row["question"], 
+            "question": row["question"],
             "sectors": row["sectors"],
             "add_year": row["add_year"],
             "kpi_category": row["kpi_category"],
diff --git a/src/pytests/tess.py b/src/pytests/tess.py
index 74659c9..2d653f3 100644
--- a/src/pytests/tess.py
+++ b/src/pytests/tess.py
@@ -9,7 +9,7 @@ class MockTrainer:
     def train(self):
         pass
-    
+
     def evaluate(self, dataset):
         return {"eval_loss": 0.1, "eval_accuracy": 0.95}
 
 
@@ -84,7 +84,7 @@ def test_fine_tune_model(mock_trainer, tmp_path):
         max_length=512,
         epochs=2,
         batch_size=4,
-        output_dir=str(tmp_path / "saved_models"), 
+        output_dir=str(tmp_path / "saved_models"),
         save_steps=500
     )
 
diff --git a/src/pytests/test_fine_tune.py b/src/pytests/test_fine_tune.py
index 39e3ab8..39b2f4a 100644
--- a/src/pytests/test_fine_tune.py
+++ b/src/pytests/test_fine_tune.py
@@ -50,23 +50,23 @@ def test_fine_tune_model(mock_trainer, mock_tokenizer, mock_model, mock_data, mo
     # Mock the model and tokenizer
     model_instance = MagicMock(spec=AutoModelForSequenceClassification)
     model_instance.to = MagicMock(return_value=model_instance)
-    
+
     # Set the attributes on the model class to mimic a PyTorch model
     mock_model_class = MagicMock(spec=AutoModelForSequenceClassification)
     mock_model_class.module = "torch"
     mock_model_class.name = "PreTrainedModel"
     mock_model_class.return_value = model_instance
-    
+
     mock_model.return_value = mock_model_class.return_value
     mock_tokenizer.return_value = MagicMock(spec=AutoTokenizer)
-    
+
     # Mock the Trainer's train, evaluate, and predict methods
     mock_trainer_instance = MagicMock(spec=Trainer)
     mock_trainer_instance.train.return_value = None
     mock_trainer_instance.evaluate.return_value = {"eval_loss": 0.5}
     mock_trainer_instance.predict.return_value = MagicMock(predictions=torch.tensor([[0.5, 0.5], [0.6, 0.4]]))
     mock_trainer.return_value = mock_trainer_instance
-    
+
     # Run the fine_tune_model function with mock arguments
     fine_tune_model(
         data_path=mock_args.data_path,
@@ -78,11 +78,11 @@ def test_fine_tune_model(mock_trainer, mock_tokenizer, mock_model, mock_data, mo
         output_dir=mock_args.output_dir,
         save_steps=mock_args.save_steps
     )
-    
+
     # Assert that the model and tokenizer were loaded correctly
     mock_model.assert_called_once_with(mock_args.model_name, num_labels=mock_args.num_labels)
     mock_tokenizer.assert_called_once_with(mock_args.model_name)
-    
+
     # Assert that the Trainer's train, evaluate, and predict methods were called
     mock_trainer_instance.train.assert_called_once()
     mock_trainer_instance.evaluate.assert_called_once()
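The `test_fine_tune.py` hunks above only touch whitespace, but the mocking pattern they reformat is worth seeing in one piece. Here is a condensed, self-contained sketch of that pattern, not the test's exact fixtures; the patch target `fine_tune.Trainer` is an assumption about where `Trainer` is imported.

```python
# Condensed sketch of the Trainer-mocking pattern used in test_fine_tune.py.
# Patching "fine_tune.Trainer" assumes fine_tune.py imports Trainer directly.
from unittest.mock import MagicMock, patch

import torch
from transformers import Trainer

with patch("fine_tune.Trainer") as mock_trainer:
    mock_trainer_instance = MagicMock(spec=Trainer)
    mock_trainer_instance.train.return_value = None
    mock_trainer_instance.evaluate.return_value = {"eval_loss": 0.5}
    mock_trainer_instance.predict.return_value = MagicMock(
        predictions=torch.tensor([[0.5, 0.5], [0.6, 0.4]])
    )
    mock_trainer.return_value = mock_trainer_instance

    trainer = mock_trainer()  # the code under test would build a real Trainer here
    trainer.train()
    assert trainer.evaluate() == {"eval_loss": 0.5}
```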
question?"], - "sectors": ["Sector"], - "add_year": [2022], + "kpi_id": [1], + "question": ["What is the question?"], + "sectors": ["Sector"], + "add_year": [2022], "kpi_category": ["Category"] }) dummy_kpi_mapping.to_csv(kpi_mapping_path_valid, index=False)