Add comprehensive error handling across ML-CaPsule #1207

Open · wants to merge 2 commits into base: master
48 changes: 48 additions & 0 deletions Diabetes_Prediction/diabetes_prediction.py
@@ -0,0 +1,48 @@
from utils.error_handler import error_handler, DataValidationError, ModelError
import logging
import pandas as pd
import numpy as np

@error_handler
def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        if df.empty:
            raise DataValidationError("Empty dataset loaded")
        return df
    except FileNotFoundError:
        raise DataValidationError(f"Dataset not found at {file_path}")

@error_handler
def preprocess_data(df):
    if not isinstance(df, pd.DataFrame):
        raise DataValidationError("Input must be a pandas DataFrame")

    # Impute missing values with column means (numeric columns only)
    if df.isnull().sum().any():
        logging.warning("Missing values detected in the dataset")
        df = df.fillna(df.mean(numeric_only=True))

    # Check for invalid values
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        if (df[col] < 0).any():
            raise DataValidationError(f"Negative values found in column {col}")

    return df

@error_handler
def train_model(X_train, y_train, model_type='random_forest'):
    if len(X_train) != len(y_train):
        raise DataValidationError("Feature and target dimensions do not match")

    try:
        if model_type == 'random_forest':
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier(random_state=42)
        else:
            raise ValueError(f"Unsupported model type: {model_type}")

        model.fit(X_train, y_train)
        return model
    except Exception as e:
        raise ModelError(f"Model training failed: {str(e)}")
46 changes: 46 additions & 0 deletions Heart_Disease_Prediction/heart_disease_prediction.py
@@ -0,0 +1,46 @@
from utils.error_handler import error_handler, DataValidationError, ModelError
import logging
import pandas as pd
import numpy as np

@error_handler
def validate_heart_data(df):
    required_columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
        'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
        'ca', 'thal', 'target'
    ]

    # Check required columns
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise DataValidationError(f"Missing required columns: {missing_cols}")

    # Validate value ranges
    if not (df['age'] >= 0).all():
        raise DataValidationError("Age cannot be negative")
    if not df['sex'].isin([0, 1]).all():
        raise DataValidationError("Sex must be binary (0 or 1)")
    if not (df['trestbps'] > 0).all():
        raise DataValidationError("Blood pressure must be positive")

    return True

@error_handler
def prepare_heart_data(df):
    try:
        validate_heart_data(df)

        # Impute missing values with column means (numeric columns only)
        if df.isnull().sum().any():
            logging.warning("Missing values found - applying mean imputation")
            df = df.fillna(df.mean(numeric_only=True))

        # Feature scaling (keep the binary target column unscaled)
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != 'target']
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

        return df
    except Exception as e:
        raise DataValidationError(f"Data preparation failed: {str(e)}")
38 changes: 38 additions & 0 deletions utils/error_handler.py
@@ -0,0 +1,38 @@
import logging
import sys
from functools import wraps

# Configure logging to both a file and stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('ml_capsule.log'),
        logging.StreamHandler(sys.stdout)
    ]
)

class MLCapsuleError(Exception):
    """Base exception class for ML-CaPsule"""
    pass

class DataValidationError(MLCapsuleError):
    """Raised when data validation fails"""
    pass

class ModelError(MLCapsuleError):
    """Raised when model operations fail"""
    pass

def error_handler(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except MLCapsuleError as e:
            # Project-specific errors are logged and re-raised unchanged
            logging.error(f"ML-CaPsule error in {func.__name__}: {str(e)}")
            raise
        except Exception as e:
            # Anything else is wrapped so callers only need to handle MLCapsuleError
            logging.error(f"Unexpected error in {func.__name__}: {str(e)}")
            raise MLCapsuleError(f"Function {func.__name__} failed: {str(e)}") from e
    return wrapper
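
Since DataValidationError and ModelError both derive from MLCapsuleError, callers can catch a single type for every failure the decorator reports. A minimal sketch of that pattern; demo_divide is a hypothetical function, not part of this PR:

# Hypothetical demo of the decorator's wrapping behavior.
@error_handler
def demo_divide(a, b):
    return a / b

try:
    demo_divide(1, 0)  # ZeroDivisionError is logged, then re-raised as MLCapsuleError
except MLCapsuleError as e:
    print(f"Caught: {e}")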
159 changes: 159 additions & 0 deletions utils/model_evaluation.ipynb
@@ -0,0 +1,159 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Evaluation and Cross-Validation\n",
    "\n",
    "This notebook contains functions for evaluating classification models and performing cross-validation. It includes error handling to ensure robustness.\n",
    "\n",
    "## 1. Importing Libraries\n",
    "\n",
    "We will start by importing the necessary libraries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "from utils.error_handler import error_handler, ModelError\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "from sklearn.model_selection import cross_val_score\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Evaluating Classification Models\n",
    "\n",
    "The `evaluate_classification_model` function computes various evaluation metrics for classification models, including accuracy, precision, recall, and F1 score. It also includes error handling to manage mismatched dimensions between predictions and true labels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@error_handler\n",
    "def evaluate_classification_model(y_true, y_pred):\n",
    "    \"\"\"\n",
    "    Evaluate the classification model using various metrics.\n",
    "\n",
    "    Parameters:\n",
    "    - y_true: array-like, true labels\n",
    "    - y_pred: array-like, predicted labels\n",
    "\n",
    "    Returns:\n",
    "    - metrics: dict, containing accuracy, precision, recall, and F1 score\n",
    "    \"\"\"\n",
    "    if len(y_true) != len(y_pred):\n",
    "        raise ModelError(\"Prediction and ground truth dimensions do not match\")\n",
    "\n",
    "    metrics = {\n",
    "        'accuracy': accuracy_score(y_true, y_pred),\n",
    "        'precision': precision_score(y_true, y_pred, average='weighted'),\n",
    "        'recall': recall_score(y_true, y_pred, average='weighted'),\n",
    "        'f1': f1_score(y_true, y_pred, average='weighted')\n",
    "    }\n",
    "\n",
    "    return metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Cross-Validation\n",
    "\n",
    "The `cross_validate_model` function performs cross-validation on a given model and dataset. It returns the mean and standard deviation of the scores obtained during cross-validation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@error_handler\n",
    "def cross_validate_model(model, X, y, cv=5):\n",
    "    \"\"\"\n",
    "    Perform cross-validation on the given model.\n",
    "\n",
    "    Parameters:\n",
    "    - model: the model to evaluate\n",
    "    - X: array-like, feature data\n",
    "    - y: array-like, target labels\n",
    "    - cv: int, number of cross-validation folds\n",
    "\n",
    "    Returns:\n",
    "    - dict: containing mean score, standard deviation, and individual scores\n",
    "    \"\"\"\n",
    "    scores = cross_val_score(model, X, y, cv=cv)\n",
    "    return {\n",
    "        'mean_score': np.mean(scores),\n",
    "        'std_score': np.std(scores),\n",
    "        'scores': scores\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Example Usage\n",
    "\n",
    "Below is an example of how to use the above functions with a sample dataset and model. Note that the model is evaluated on its own training data here, so the reported metrics are optimistic."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example usage\n",
    "if __name__ == \"__main__\":\n",
    "    from sklearn.datasets import load_iris\n",
    "    from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "    # Load sample data\n",
    "    data = load_iris()\n",
    "    X, y = data.data, data.target\n",
    "\n",
    "    # Train a model\n",
    "    model = RandomForestClassifier(random_state=42)\n",
    "    model.fit(X, y)\n",
    "\n",
    "    # Make predictions\n",
    "    y_pred = model.predict(X)\n",
    "\n",
    "    # Evaluate the model\n",
    "    metrics = evaluate_classification_model(y, y_pred)\n",
    "    print(\"Evaluation Metrics:\", metrics)\n",
    "\n",
    "    # Perform cross-validation\n",
    "    cv_results = cross_validate_model(model, X, y, cv=5)\n",
    "    print(\"Cross-Validation Results:\", cv_results)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
34 changes: 34 additions & 0 deletions utils/model_evaluation.py
@@ -0,0 +1,34 @@
from utils.error_handler import error_handler, ModelError
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

@error_handler
def evaluate_classification_model(y_true, y_pred):
    if len(y_true) != len(y_pred):
        raise ModelError("Prediction and ground truth dimensions do not match")

    try:
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1': f1_score(y_true, y_pred, average='weighted')
        }

        return metrics
    except Exception as e:
        raise ModelError(f"Model evaluation failed: {str(e)}")

@error_handler
def cross_validate_model(model, X, y, cv=5):
    from sklearn.model_selection import cross_val_score

    try:
        scores = cross_val_score(model, X, y, cv=cv)
        return {
            'mean_score': np.mean(scores),
            'std_score': np.std(scores),
            'scores': scores
        }
    except Exception as e:
        raise ModelError(f"Cross-validation failed: {str(e)}")