Chore: pre-commit autoupdate
pre-commit-ci[bot] committed Jun 18, 2024
1 parent 08dbabc commit d8adb77
Showing 7 changed files with 31 additions and 33 deletions.
15 changes: 7 additions & 8 deletions src/osc_transformer_based_extractor/README.md
@@ -1,4 +1,3 @@

---

# Relevance Detector
@@ -11,7 +10,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
- You must have data from the curator module, which is used to train the model. The curator module outputs a CSV file structured as follows:

### Example Snippet

| question | context | company | source_file | source_page | kpi_id | year | answer | data_type | relevant_paragraphs | annotator | Index | label |
|-------------------------------|----------------------------------------------------------------------------------------------------------------------------|---------|-----------------------------------|-------------|--------|------|--------------|-----------|------------------------------------------------|---------------------|-------|-------|
| What is the company name? | The Company is exposed to a risk of losses if counterparties fail to meet their contractual financial obligations when due, and in particular depends on the reliability of banks where the Company deposits its available cash. | NOVATEK | 04_NOVATEK_AR_2016_ENG_11.pdf | ['0'] | 0 | 2016 | PAO NOVATEK | TEXT | ["PAO NOVATEK ANNUAL REPORT 2016"] | train_anno_large.xlsx | 1022 | 0 |
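
As a quick sanity check on such a file, the sketch below (an illustrative addition, with an assumed file path) loads the curator CSV with pandas and verifies the columns the training scripts require:

```python
import pandas as pd

# Columns the fine-tuning pipeline requires; the other curator columns
# (company, source_file, kpi_id, ...) are carried along but not checked here.
REQUIRED_COLUMNS = ["question", "context", "label"]

df = pd.read_csv("curator_output.csv")  # path is illustrative
missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing:
    raise ValueError(f"Curator CSV is missing required columns: {missing}")
print(f"Loaded {len(df)} rows with columns: {list(df.columns)}")
```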
@@ -22,7 +21,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr

2. **Train the Model**:
- Use `train_sentence_transformer.ipynb` or `train_sentence_transformer.py` to train a sentence transformer model with the processed data from the `Data` folder and save it locally. Follow the steps in the notebook or script to configure and start the training process.

- To train the model via a function call:
```python
from train_sentence_transformer import fine_tune_model
@@ -46,7 +45,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
- `batch_size (int)`: Batch size for training.
- `output_dir (str)`: Directory to save the trained models.
- `save_steps (int)`: Number of steps between saving checkpoints.

- To train the model from the command line, run `fine_tune.py` with the required arguments:
```bash
python fine_tune.py \
@@ -76,7 +75,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
1. **`inference.py`**
- This script contains the function to make inferences using the trained model.
- **Usage**: Import this script and use the provided function to predict the relevance of new data.
- **Example**:
```python
from inference import get_inference
result = get_inference(question="What is the relevance?", paragraph="This is a sample paragraph.", model_path="path/to/model", tokenizer_path="path/to/tokenizer")
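# Illustrative follow-up, not part of the original example: the exact
# return format of `result` is an assumption, so inspect it before use.
print(f"Predicted relevance: {result}")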
@@ -94,7 +93,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
3. **`train_sentence_transformer.py`**
- This script defines a function to train a sentence transformer model, which can be called from other scripts or notebooks.
- **Usage**: Import and call the `fine_tune_model` function to train your model.
- **Example**:
```python
from train_sentence_transformer import fine_tune_model
fine_tune_model(
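    # A sketch completing the call: parameter names follow the argument
    # list documented in this README; the values are illustrative, and the
    # model name is an assumed example.
    data_path="data/train_data.csv",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    num_labels=2,
    max_length=512,
    epochs=2,
    batch_size=4,
    output_dir="./saved_models",
    save_steps=500,
)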
@@ -121,7 +120,7 @@ This folder contains a set of scripts and notebooks designed to process data, tr
4. **`fine_tune.py`**
- This script allows you to train a sentence transformer model from the command line.
- **Usage**: Run this script from the command line with the necessary arguments.
- **Example**:
```bash
python fine_tune.py \
--data_path "data/train_data.csv" \
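--model_name "sentence-transformers/all-MiniLM-L6-v2" \
--num_labels 2 \
--max_length 512 \
--epochs 2 \
--batch_size 4 \
--output_dir "./saved_models" \
--save_steps 500
# A sketch completing the truncated command: flag names mirror the
# fine_tune_model parameters; values (and the model name) are illustrative.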
@@ -198,4 +197,4 @@ Contributions are welcome! Please fork the repository and submit a pull request

---

For further details and documentation, please refer to the individual scripts and notebooks provided in this repository.
9 changes: 4 additions & 5 deletions src/osc_transformer_based_extractor/fine_tune.py
@@ -18,12 +18,12 @@ def check_csv_columns(file_path):
Raises:
ValueError: If the file does not exist or does not contain the required columns.
"""
"""
if not os.path.exists(file_path):
raise ValueError(f"Data path {file_path} does not exist.")

required_columns = ["question", "context", "label"]

try:
df = pd.read_csv(file_path)
missing_columns = [col for col in required_columns if col not in df.columns]
@@ -180,7 +180,7 @@ def fine_tune_model(data_path, model_name, num_labels, max_length, epochs, batch

print(f"Model- {args.model_name} Trained and Saved Successfully at {args.output_dir}")

'''
To run the file in CMD
python fine_tune.py \
@@ -194,4 +194,3 @@ def fine_tune_model(data_path, model_name, num_labels, max_length, epochs, batch
--save_steps 500
'''

14 changes: 7 additions & 7 deletions src/osc_transformer_based_extractor/inference.py
@@ -12,11 +12,11 @@
def check_model_and_tokenizer_path(model_path, tokenizer_path):
"""
Check if the model and tokenizer paths are valid.
Args:
model_path (str): Path to the model file.
tokenizer_path (str): Path to the tokenizer file.
Raises:
ValueError: If the model or tokenizer path does not exist.
"""
@@ -25,7 +25,7 @@ def check_model_and_tokenizer_path(model_path, tokenizer_path):

if not os.path.exists(tokenizer_path):
raise ValueError(f"Tokenizer path {tokenizer_path} does not exist.")


def check_question_context(question, context):
if not isinstance(question, str):
@@ -93,8 +93,8 @@ def get_inference(question: str, context: str, model_path: str, tokenizer_path:



'''python inference.py
--question "What is the capital of France?"
--context "Paris is the capital of France."
--model_path /path/to/model
--tokenizer_path /path/to/tokenizer'''
2 changes: 1 addition & 1 deletion src/osc_transformer_based_extractor/make_training_data_from_curator.py
@@ -77,7 +77,7 @@ def make_training_data(curator_data_path: str, kpi_mapping_path: str, output_pat
if isinstance(kpi_id, float) and kpi_id.is_integer():
kpi_id = int(kpi_id)
kpi_dict[kpi_id] = {
"question": row["question"],
"question": row["question"],
"sectors": row["sectors"],
"add_year": row["add_year"],
"kpi_category": row["kpi_category"],
4 changes: 2 additions & 2 deletions src/pytests/tess.py
@@ -9,7 +9,7 @@
class MockTrainer:
def train(self):
pass

def evaluate(self, dataset):
return {"eval_loss": 0.1, "eval_accuracy": 0.95}

@@ -84,7 +84,7 @@ def test_fine_tune_model(mock_trainer, tmp_path):
max_length=512,
epochs=2,
batch_size=4,
output_dir=str(tmp_path / "saved_models"),
save_steps=500
)

12 changes: 6 additions & 6 deletions src/pytests/test_fine_tune.py
@@ -50,23 +50,23 @@ def test_fine_tune_model(mock_trainer, mock_tokenizer, mock_model, mock_data, mo
# Mock the model and tokenizer
model_instance = MagicMock(spec=AutoModelForSequenceClassification)
model_instance.to = MagicMock(return_value=model_instance)

# Set the attributes on the model class to mimic a PyTorch model
mock_model_class = MagicMock(spec=AutoModelForSequenceClassification)
mock_model_class.module = "torch"
mock_model_class.name = "PreTrainedModel"
mock_model_class.return_value = model_instance

mock_model.return_value = mock_model_class.return_value
mock_tokenizer.return_value = MagicMock(spec=AutoTokenizer)

# Mock the Trainer's train, evaluate, and predict methods
mock_trainer_instance = MagicMock(spec=Trainer)
mock_trainer_instance.train.return_value = None
mock_trainer_instance.evaluate.return_value = {"eval_loss": 0.5}
mock_trainer_instance.predict.return_value = MagicMock(predictions=torch.tensor([[0.5, 0.5], [0.6, 0.4]]))
mock_trainer.return_value = mock_trainer_instance

# Run the fine_tune_model function with mock arguments
fine_tune_model(
data_path=mock_args.data_path,
Expand All @@ -78,11 +78,11 @@ def test_fine_tune_model(mock_trainer, mock_tokenizer, mock_model, mock_data, mo
output_dir=mock_args.output_dir,
save_steps=mock_args.save_steps
)

# Assert that the model and tokenizer were loaded correctly
mock_model.assert_called_once_with(mock_args.model_name, num_labels=mock_args.num_labels)
mock_tokenizer.assert_called_once_with(mock_args.model_name)

# Assert that the Trainer's train, evaluate, and predict methods were called
mock_trainer_instance.train.assert_called_once()
mock_trainer_instance.evaluate.assert_called_once()
8 changes: 4 additions & 4 deletions src/pytests/test_make_training_data_from_curator.py
@@ -25,10 +25,10 @@
dummy_curator_data.to_csv(curator_data_path_valid, index=False)

dummy_kpi_mapping = pd.DataFrame({
"kpi_id": [1],
"question": ["What is the question?"],
"sectors": ["Sector"],
"add_year": [2022],
"kpi_id": [1],
"question": ["What is the question?"],
"sectors": ["Sector"],
"add_year": [2022],
"kpi_category": ["Category"]
})
dummy_kpi_mapping.to_csv(kpi_mapping_path_valid, index=False)
