diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3fdfb3d
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+Yes
\ No newline at end of file
diff --git a/documentation/docs/Documentation/API-Reference.md b/documentation/docs/Documentation/API-Reference.md
new file mode 100644
index 0000000..c945c13
--- /dev/null
+++ b/documentation/docs/Documentation/API-Reference.md
@@ -0,0 +1,72 @@
+
+
+# nn.loss
+
+
+
+## MeanSquaredError Objects
+
+```python
+class MeanSquaredError()
+```
+
+Class to compute the Mean Squared Error (MSE) and its gradient.
+
+
+
+#### forward
+
+```python
+def forward(y_true: np.ndarray, y_pred: np.ndarray) -> float
+```
+
+Compute the Mean Squared Error between true and predicted values.
+
+**Arguments**:
+
+- `y_true` _np.ndarray_ - True values.
+- `y_pred` _np.ndarray_ - Predicted values.
+
+
+**Returns**:
+
+- `float` - The mean squared error.
+
+
+**Raises**:
+
+- `ValueError` - If y_true and y_pred do not have the same shape.
+
+
+
+#### backward
+
+```python
+def backward(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray
+```
+
+Compute the gradient of the Mean Squared Error with respect to the predicted values.
+
+**Arguments**:
+
+- `y_true` _np.ndarray_ - True values.
+- `y_pred` _np.ndarray_ - Predicted values.
+
+
+**Returns**:
+
+- `np.ndarray` - The gradient of the loss with respect to y_pred.
+
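+**Example** (a minimal usage sketch; the values are illustrative):
+
+```python
+import numpy as np
+from liltorch.nn.loss import MeanSquaredError
+
+criterion = MeanSquaredError()
+y_true = np.array([[0.0, 1.0], [1.0, 0.0]])
+y_pred = np.array([[0.1, 0.9], [0.8, 0.2]])
+
+loss = criterion.forward(y_true, y_pred)   # scalar: mean over all elements
+grad = criterion.backward(y_true, y_pred)  # array with the same shape as y_pred
+```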
diff --git a/documentation/docs/Documentation/Install.md b/documentation/docs/Documentation/Install.md
new file mode 100644
index 0000000..0e4c6b3
--- /dev/null
+++ b/documentation/docs/Documentation/Install.md
@@ -0,0 +1,31 @@
+# Installation
+
+## Introduction
+
+`liltorch` is a lightweight library for building and training neural networks, with NumPy as its only dependency. This guide walks you through installing `liltorch` from PyPI, the Python Package Index.
+
+## Prerequisites
+
+Before installing `liltorch`, ensure that you have the following prerequisites:
+
+- Python 3.6 or higher
+- pip (Python package installer)
+
+You can verify your Python and pip versions by running the following commands in your terminal or command prompt:
+
+```sh
+python --version
+pip --version
+```
+
+## Installation Steps
+Open the terminal and type:
+```sh
+pip install --upgrade pip
+pip install liltorch
+```
+Verify the installation from a Python shell:
+```python
+import liltorch
+print("liltorch version:", liltorch.__version__)
+```
diff --git a/documentation/docs/index.md b/documentation/docs/index.md
index 96bfa09..b18e405 100644
--- a/documentation/docs/index.md
+++ b/documentation/docs/index.md
@@ -2,4 +2,4 @@
LilTorch is a lightweight library for Deep Learning, created entirely in Python with Numpy as its sole dependency. This library allows you to design and understand the internals of Neural Networks without the need for C/C++ imported code and binaries. Everything is as understandable as Python itself.
-**Note**: This library is intended for educational purposes only. It is not recommended for production use, and execution speed is not a primary focus.
+**Note**: This library is intended for educational purposes only. It is not recommended for production use, and execution speed is not a primary focus.
\ No newline at end of file
diff --git a/documentation/mkdocs.yml b/documentation/mkdocs.yml
index 64e04b8..314d061 100644
--- a/documentation/mkdocs.yml
+++ b/documentation/mkdocs.yml
@@ -1 +1 @@
-site_name: LilTorch
+site_name: LilTorch
\ No newline at end of file
diff --git a/examples/mnist.py b/examples/mnist.py
index a36bcff..a0ac8ba 100644
--- a/examples/mnist.py
+++ b/examples/mnist.py
@@ -7,16 +7,16 @@
from keras.utils import to_categorical
from liltorch.nn.fully_connected import FullyConnectedLayer
-from liltorch.nn.activation import ActivationLayerTanh
+from liltorch.nn.activation import Tanh
from liltorch.nn.network import Network
-from liltorch.nn.loss import mse_loss, mse_grad
+from liltorch.nn.loss import MeanSquaredError
import numpy as np
# load MNIST from server
lr = 0.1
-epochs = 5
-batch_size = 16
+epochs = 20
+batch_size = 4
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], 28*28)
@@ -30,55 +30,46 @@
y_test = to_categorical(y_test)
 print(f'shapes: x_train {x_train.shape} y_train {y_train.shape} x_test {x_test.shape} y_test {y_test.shape}')
+
# build model
model = Network(lr=lr)
model.add(FullyConnectedLayer(28*28, 100))
-model.add(ActivationLayerTanh())
+model.add(Tanh())
model.add(FullyConnectedLayer(100, 50))
-model.add(ActivationLayerTanh())
+model.add(Tanh())
model.add(FullyConnectedLayer(50, 10))
-model.add(ActivationLayerTanh())
+model.add(Tanh())
# training
for epoch in range(epochs):
epoch_loss = 0
dataset_size = len(x_train)
- batch_begin = 0
- batch_end = batch_size - 1
- while batch_begin < dataset_size:
+ correct = 0
+ criterion = MeanSquaredError()
+ for batch_begin in range(0, dataset_size, batch_size):
+ batch_end = min(batch_begin + batch_size, dataset_size)
data = x_train[batch_begin:batch_end]
target = y_train[batch_begin:batch_end]
- #print(f"batch from {batch_begin} to {batch_end}")
# forward
output = model.forward(data)
- epoch_loss += mse_loss(target, output)
+        epoch_loss += criterion.forward(target, output) * len(data)  # scale batch mean so the epoch average is per-sample
# backward pass
- error = mse_grad(target, output)
+ error = criterion.backward(target, output)
model.backward(error)
-
- batch_begin = batch_end
- batch_end += batch_size
- batch_end = min(dataset_size, batch_end)
-
- print(f'Epoch {epoch} -> Average loss {epoch_loss/len(x_train)}')
+ correct += np.sum((np.argmax(output, axis=1) == np.argmax(target, axis=1)))
+    print(f'Epoch {epoch} -> Average loss {(epoch_loss/dataset_size):.4f} / Accuracy {(correct/dataset_size):.4f}')
# testing
correct = 0
dataset_size = len(x_test)
-batch_begin = 0
-batch_end = batch_size - 1
-
-while batch_begin < dataset_size -1:
+for batch_begin in range(0, dataset_size, batch_size):
+ batch_end = min(batch_begin + batch_size, dataset_size)
data = x_test[batch_begin:batch_end]
target = y_test[batch_begin:batch_end]
- #print(f"batch from {batch_begin} to {batch_end}")
- output = model.forward(data)
- correct += np.sum((np.argmax(output, axis=1) == np.argmax(target)))
- batch_begin = batch_end
- batch_end += batch_size
- batch_end = min(dataset_size, batch_end)
+ output = model.forward(data)
+ correct += np.sum((np.argmax(output, axis=1) == np.argmax(target, axis=1)))
-print(f'Test Accuracy: {correct/dataset_size}')
+print(f'Test Accuracy: {(correct/dataset_size):.4f}')
diff --git a/liltorch/nn/activation.py b/liltorch/nn/activation.py
index a6dfc47..1e5f186 100644
--- a/liltorch/nn/activation.py
+++ b/liltorch/nn/activation.py
@@ -3,7 +3,7 @@
from liltorch.nn.layer import Layer
-class ActivationLayerTanh(Layer):
+class Tanh(Layer):
def forward(self, input_data):
         '''forward pass using tanh activation'''
diff --git a/liltorch/nn/fully_connected.py b/liltorch/nn/fully_connected.py
index dd89170..43d7fd9 100644
--- a/liltorch/nn/fully_connected.py
+++ b/liltorch/nn/fully_connected.py
@@ -1,5 +1,3 @@
-import random
-
import numpy as np
from liltorch.nn.layer import Layer
@@ -17,10 +15,21 @@ def forward(self, input_data):
self.output = np.dot(self.input, self.weights) + self.bias
return self.output
- def backward(self, output_error, lr):
- input_error = np.dot(output_error, self.weights.T)
- weights_error = np.dot(self.input.T, output_error)
-
- self.weights -= lr * weights_error
- self.bias -= lr * np.sum(output_error, axis=0, keepdims=True)
- return input_error
+ def backward(self, upstream_gradients, lr):
+        # Propagate gradients to the previous layer (dL/dz[i]), given the upstream
+        # gradient dL/dz[i+1] from the layer that follows it in the forward pass
+ downstream_gradients = np.dot(upstream_gradients, self.weights.T)
+
+        # Calculate local gradients for the weights and bias (dL/dW and dL/db)
+ local_gradients_w = np.dot(self.input.T, upstream_gradients)
+ local_gradients_b = np.sum(upstream_gradients, axis=0, keepdims=True)
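+
+        # Shapes, for reference (batch B, fan-in I, fan-out O): upstream_gradients
+        # is (B, O), downstream_gradients is (B, I), local_gradients_w is (I, O),
+        # and local_gradients_b is (1, O), matching self.weights and self.bias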
+
+ # Update weights and biases using the gradients and learning rate
+ self.weights -= lr * local_gradients_w
+ self.bias -= lr * local_gradients_b
+
+ return downstream_gradients
diff --git a/liltorch/nn/loss.py b/liltorch/nn/loss.py
index 4a5aee3..54777f0 100644
--- a/liltorch/nn/loss.py
+++ b/liltorch/nn/loss.py
@@ -1,10 +1,37 @@
import numpy as np
+class MeanSquaredError:
+ """
+ Class to compute the Mean Squared Error (MSE) and its gradient.
+ """
-def mse_loss(y_true: np.ndarray, y_pred: np.ndarray):
- if y_true.shape != y_pred.shape:
- raise ValueError("y_true and y_pred must have the same length.")
- return np.mean(np.power(y_true-y_pred, 2));
+    def forward(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+ """
+ Compute the Mean Squared Error between true and predicted values.
-def mse_grad(y_true, y_pred):
- return 2*(y_pred-y_true)/y_true.size;
\ No newline at end of file
+ Parameters:
+ y_true (np.ndarray): True values.
+ y_pred (np.ndarray): Predicted values.
+
+ Returns:
+            float: The mean squared error.
+
+ Raises:
+ ValueError: If y_true and y_pred do not have the same shape.
+ """
+ if y_true.shape != y_pred.shape:
+ raise ValueError("y_true and y_pred must have the same length.")
+ return np.mean(np.power(y_true - y_pred, 2))
+
+ def backward(self, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
+ """
+ Compute the gradient of the Mean Squared Error with respect to the predicted values.
+
+ Parameters:
+ y_true (np.ndarray): True values.
+ y_pred (np.ndarray): Predicted values.
+
+ Returns:
+ np.ndarray: The gradient of the loss with respect to y_pred.
+ """
+ return 2 * (y_pred - y_true) / y_true.size
\ No newline at end of file
diff --git a/liltorch/nn/perceptron.py b/liltorch/nn/perceptron.py
deleted file mode 100644
index 2647810..0000000
--- a/liltorch/nn/perceptron.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import numpy as np
-
-
-class Perceptron:
-
- def __init__(self, x_dim: int, learning_rate:float = 0.1):
- self.lr = learning_rate
- self.weights = np.zeros(x_dim + 1)
-
- def forward(self, x):
- output = np.dot(x, self.weights[1:]) + self.weights[0]
- if output > 0:
- return True
- return False
-
- def backward(self, x, y, y_hat):
- self.weights[1:] = self.lr * (y-y_hat)*x
- self.weights[0] = self.lr * (y-y_hat)
diff --git a/requirements.dev.txt b/requirements.dev.txt
index c34b612..233f889 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,3 +1,4 @@
pytest==8.2.2
-mkdocs==1.6.0
-coverage==7.5.4
+Sphinx==7.3.7
+sphinx-autodoc-typehints==2.2.2
+sphinx-markdown-builder==0.6.6
diff --git a/setup.py b/setup.py
index 8f8f8ef..09a1646 100644
--- a/setup.py
+++ b/setup.py
@@ -6,4 +6,6 @@
description='Small neural network library made only with raw python',
author='Mateus Souza',
packages=find_packages(),
+ long_description=open('README.md').read(),
+ long_description_content_type='text/markdown',
)