diff --git a/.vscode/launch.json b/.vscode/launch.json
index 3dfbdda..380375f 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -8,7 +8,7 @@
             "name": "Auto - Debbug kimchima",
             "type": "python",
             "request": "launch",
-            "program": "${workspaceFolder}/kimchima.py",
+            "program": "${workspaceFolder}/src/kimchima/cmds/kimchima_cli.py",
             "console": "integratedTerminal",
             "purpose": [
                 "debug-in-terminal"
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 184c4eb..d73198f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -2,7 +2,7 @@
     "python.testing.unittestArgs": [
         "-v",
         "-s",
-        ".",
+        "./src",
        "-p",
         "test_*.py"
     ],
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index b175f76..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,6 +0,0 @@
-exclude kimchi.yml
-exclude Makefile
-exclude .vscode
-exclude .gitignore
-exclude LICENSE
-recursive-exclude params *
diff --git a/Makefile b/Makefile
index 0136e33..66466cd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,7 @@
+#################################################################
+TESTDIR:=src/kimchima/tests/
+
+##############################Legacy#############################
 .PHONY: setup
 setup:
 	python setup.py sdist bdist_wheel
@@ -10,6 +14,7 @@ upload:
 
 ################################Poetry################################
 .PHONY: poetry
 poetry:
+	@poetry config virtualenvs.in-project true
 	@pipx install poetry==1.8.2
 
@@ -25,7 +30,7 @@ install:
 
 .PHONY: test
 test:
-	@poetry run python -m unittest discover -v
+	@poetry run python -m unittest discover ${TESTDIR} -v
 
 
 # build and publish
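The new `make test` target above is equivalent to programmatic unittest discovery. A minimal sketch, not part of the patch, assuming the `src/kimchima/tests/` path that `TESTDIR` points at:

```python
# Sketch: programmatic equivalent of the new `make test` target,
# using the TESTDIR path defined in the Makefile above.
import unittest

if __name__ == "__main__":
    # Discover test_*.py files under the new src-layout test directory.
    suite = unittest.defaultTestLoader.discover("src/kimchima/tests/")
    unittest.TextTestRunner(verbosity=2).run(suite)
```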
diff --git a/README.md b/README.md
index 1c64423..d11cf69 100644
--- a/README.md
+++ b/README.md
@@ -10,13 +10,6 @@ The collections of tools for ML model development.
 
 You can use it as a command line tool if you like. And you can also use it as a library. Or you can run it in VSCode with [`launch.json`](.vscode/launch.json).
 
-## Command Line Tool
-
-```bash
-$ python -m kimchima auto sentence-transformers/all-MiniLM-L6-v2 Melbourne
-
-```
-
 # Acknowledgement
 
diff --git a/cmds/__init__.py b/cmds/__init__.py
deleted file mode 100644
index 23f8d4a..0000000
--- a/cmds/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from cmds.auto import CommandAuto
\ No newline at end of file
diff --git a/examples/examples.py b/examples/examples.py
new file mode 100644
index 0000000..b3c0077
--- /dev/null
+++ b/examples/examples.py
@@ -0,0 +1,20 @@
+from kimchima import Auto, get_device, get_capability
+
+model = Auto(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
+
+# Compute embeddings for a single text
+embeddings = model.get_embeddings(text="Melbourne")
+print(embeddings.shape)
+
+# Compute embeddings for multiple texts
+embeddings = model.get_embeddings(text=["Melbourne", "Sydney"])
+print(embeddings.shape)
+
+# Check which device is available: CUDA GPU, Apple Silicon (mps), or CPU
+device = get_device()
+print(device)
+
+
+# Get the compute capability of the GPU (NVIDIA only)
+capability = get_capability()
+print(capability)
\ No newline at end of file
diff --git a/pkg/__init__.py b/pkg/__init__.py
deleted file mode 100644
index 8d23440..0000000
--- a/pkg/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from pkg.devices import *
-from pkg.auto import *
diff --git a/pkg/auto/__init__.py b/pkg/auto/__init__.py
deleted file mode 100644
index f26097b..0000000
--- a/pkg/auto/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from pkg.auto.auto import Auto
diff --git a/pkg/devices/__init__.py b/pkg/devices/__init__.py
deleted file mode 100644
index 3691d75..0000000
--- a/pkg/devices/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from pkg.devices.devices import *
\ No newline at end of file
diff --git a/pkg/dump/__init__.py b/pkg/dump/__init__.py
deleted file mode 100644
index 1b88290..0000000
--- a/pkg/dump/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from pkg.dump.dump import Dump
\ No newline at end of file
- """ - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_tensor(linear.weight.t(), 'weight', path) # PyTorch and Tinygrad strangely transpose linear weights so reverse that - if linear.bias is not None: - Dump.save_tensor(linear.bias, 'bias', path) - - @staticmethod - def save_rmsnorm(norm, path): - """ - Saves the weight and epsilon value of a RMSNorm layer to numpy files. - - Args: - - norm: The RMSNorm layer to be saved. - - path: The path where the files will be saved. - """ - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_tensor(norm.weight, 'weight', path) - Dump.save_scalar(norm.eps, 'eps', path) - - @staticmethod - def save_attention(attention, path): - """ - Saves the weight of the query, key, value and output linear layers of an attention layer to numpy files. - - Args: - - attention: The attention layer to be saved. - - path: The path where the files will be saved. - """ - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_linear(attention.wq, pathlib.Path(path, 'wq')) - Dump.save_linear(attention.wk, pathlib.Path(path, 'wk')) - Dump.save_linear(attention.wv, pathlib.Path(path, 'wv')) - Dump.save_linear(attention.wo, pathlib.Path(path, 'wo')) - n_kv_head = attention.n_kv_heads - n_head = n_kv_head * attention.n_rep - Dump.save_scalar(n_head, "n_head", path) - Dump.save_scalar(n_kv_head, "n_kv_head", path) - - @staticmethod - def save_feedforward(feed_forward, path): - """ - Saves the weight of the three linear layers of a feedforward layer to numpy files. - - Args: - - feed_forward: The feedforward layer to be saved. - - path: The path where the files will be saved. - """ - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_linear(feed_forward.w1, pathlib.Path(path, 'w1')) - Dump.save_linear(feed_forward.w2, pathlib.Path(path, 'w2')) - Dump.save_linear(feed_forward.w3, pathlib.Path(path, 'w3')) - - @staticmethod - def save_embedding(embedding, path): - """ - Saves the weight of an embedding layer to a numpy file. - - Args: - - embedding: The embedding layer to be saved. - - path: The path where the file will be saved. - """ - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_tensor(embedding.weight, 'weight', path) - - @staticmethod - def save_transformer_block(transformer_block, path): - """ - Saves the components of a transformer block to numpy files. - - Args: - - transformer_block: The transformer block to be saved. - - path: The path where the files will be saved. - """ - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_attention(transformer_block.attention, pathlib.Path(path, 'attention')) - Dump.save_feedforward(transformer_block.feed_forward, pathlib.Path(path, 'feedforward')) - Dump.save_rmsnorm(transformer_block.attention_norm, pathlib.Path(path, 'attention_norm')) - Dump.save_rmsnorm(transformer_block.ffn_norm, pathlib.Path(path, 'ffn_norm')) - - @staticmethod - def save_transformer(transformer, path): - """ - Saves the components of a transformer model to numpy files. - - Args: - - transformer: The transformer model to be saved. - - path: The path where the files will be saved. 
- """ - with torch.no_grad(): - pathlib.Path(path).mkdir(parents=True, exist_ok=True) - Dump.save_scalar(len(transformer.layers), 'n_layer', path) - for idx, layer in enumerate(transformer.layers): - Dump.save_transformer_block(layer, pathlib.Path(path, f'layer{idx}')) - Dump.save_rmsnorm(transformer.norm, pathlib.Path(path, 'norm')) - Dump.save_embedding(transformer.tok_embeddings, pathlib.Path(path, 'tok_embeddings')) - Dump.save_linear(transformer.output, pathlib.Path(path, 'output')) - Dump.save_scalar(10000.0, 'theta', path) - Dump.save_scalar(transformer.params.max_seq_len, 'n_ctx', path) - Dump.save_scalar(transformer.params.multiple_of, 'multiple_of', path) - if transformer.params.ffn_dim_multiplier is not None: - Dump.save_scalar(transformer.params.ffn_dim_multiplier, 'ffn_dim_multiplier', path) diff --git a/pkg/model/__init__.py b/pkg/model/__init__.py deleted file mode 100644 index 5770728..0000000 --- a/pkg/model/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from pkg.model.model import * \ No newline at end of file diff --git a/pkg/model/model.py b/pkg/model/model.py deleted file mode 100644 index fa61b32..0000000 --- a/pkg/model/model.py +++ /dev/null @@ -1,453 +0,0 @@ -# This file is adapted from the LLama project: -# https://github.com/facebookresearch/llama/blob/main/llama/model.py - -# Original LLama code by Facebook AI Research -# Adapted by Aisuko - -import math -from dataclasses import dataclass -from typing import Optional, Tuple - -import torch -import torch.nn.functional as F -import torch.nn as nn -from torch import nn -from torch.nn import Embedding, Linear - - -@dataclass -class ModelArgs: - """ - A class to store the arguments for the model. - - Attributes: - ----------- - dim : int - The dimension of the model. - n_layers : int - The number of layers in the model. - n_heads : int - The number of heads in the model. - n_kv_heads : Optional[int] - The number of key-value heads in the model. - vocab_size : int - The size of the vocabulary. - multiple_of : int - The multiple of the SwiGLU hidden layer size. - ffn_dim_multiplier : Optional[float] - The multiplier for the feedforward network dimension. - norm_eps : float - The epsilon value for normalization. - - max_batch_size : int - The maximum batch size for the model. - max_seq_len : int - The maximum sequence length for the model. - """ - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - - max_batch_size: int = 32 - max_seq_len: int = 2048 - - -class RMSNorm(torch.nn.Module): - """ - Root Mean Square Normalization (RMSNorm) layer. - - Args: - dim (int): The dimension of the input tensor. - eps (float): A small value added to the denominator for numerical stability. Default: 1e-6. - """ - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - output = self._norm(x.float()).type_as(x) - return output * self.weight - - -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): - """ - Precomputes the frequency of cosine and sine functions for a given dimension and end value. - - Args: - dim (int): The dimension of the frequency tensor. 
- end (int): The end value of the frequency tensor. - theta (float, optional): The scaling factor for the frequency tensor. Defaults to 10000.0. - - Returns: - freqs_cis (torch.Tensor): A complex tensor of shape (end, dim/2) containing the precomputed frequency values. - """ - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - t = torch.arange(end, device=freqs.device) # type: ignore - freqs = torch.outer(t, freqs).float() # type: ignore - freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 - return freqs_cis - - -def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): - """ - Reshapes the input tensor `freqs_cis` to match the shape of `x` for broadcasting. - - Args: - freqs_cis (torch.Tensor): A tensor of shape (x.shape[1], x.shape[-1]). - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The reshaped tensor of the same shape as `x` except for the second and second-to-last dimensions. - """ - ndim = x.ndim - assert 0 <= 1 < ndim - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(*shape) - - -def apply_rotary_emb( - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Applies rotary embeddings to the input tensors xq and xk using the given frequency tensor freqs_cis. - - Args: - xq (torch.Tensor): The query tensor. - xk (torch.Tensor): The key tensor. - freqs_cis (torch.Tensor): The frequency tensor. - - Returns: - Tuple[torch.Tensor, torch.Tensor]: A tuple of the output tensors xq_out and xk_out. - """ - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - freqs_cis = reshape_for_broadcast(freqs_cis, xq_) - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) - return xq_out.type_as(xq), xk_out.type_as(xk) - - -def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: - """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" - bs, slen, n_kv_heads, head_dim = x.shape - if n_rep == 1: - return x - return ( - x[:, :, :, None, :] - .expand(bs, slen, n_kv_heads, n_rep, head_dim) - .reshape(bs, slen, n_kv_heads * n_rep, head_dim) - ) - - -class Attention(nn.Module): - def __init__(self, args: ModelArgs): - """ - Initializes the Attention module. 
- - Args: - - args: ModelArgs object containing the following attributes: - - n_heads: int, number of attention heads - - n_kv_heads: int, number of key-value attention heads - - dim: int, dimension of the model - - max_batch_size: int, maximum batch size - - max_seq_len: int, maximum sequence length - """ - super().__init__() - self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads - model_parallel_size = 1#fs_init.get_model_parallel_world_size() - self.n_local_heads = args.n_heads // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // model_parallel_size - self.n_rep = self.n_local_heads // self.n_local_kv_heads - self.head_dim = args.dim // args.n_heads - - self.wq = nn.Linear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - ) - self.wk = nn.Linear( - args.dim, - self.n_kv_heads * self.head_dim, - bias=False, - ) - self.wv = nn.Linear( - args.dim, - self.n_kv_heads * self.head_dim, - bias=False, - ) - self.wo = nn.Linear( - args.n_heads * self.head_dim, - args.dim, - bias=False, - ) - - self.cache_k = torch.zeros( - ( - args.max_batch_size, - args.max_seq_len, - self.n_local_kv_heads, - self.head_dim, - ) - ) - self.cache_v = torch.zeros( - ( - args.max_batch_size, - args.max_seq_len, - self.n_local_kv_heads, - self.head_dim, - ) - ) - - def forward( - self, - x: torch.Tensor, - start_pos: int, - freqs_cis: torch.Tensor, - mask: Optional[torch.Tensor], - ) -> torch.Tensor: - """ - Performs a forward pass of the Attention module. - - Args: - - x: torch.Tensor of shape (batch_size, sequence_length, model_dimension), input tensor - - start_pos: int, starting position of the sequence - - freqs_cis: torch.Tensor of shape (sequence_length, model_dimension), cosine and sine frequencies for rotary embeddings - - mask: Optional[torch.Tensor] of shape (batch_size, sequence_length, sequence_length), mask tensor - - Returns: - - output: torch.Tensor of shape (batch_size, sequence_length, model_dimension), output tensor - """ - bsz, seqlen, _ = x.shape - xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - - xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - - xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) - - self.cache_k = self.cache_k.to(xq) - self.cache_v = self.cache_v.to(xq) - - self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk - self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv - - keys = self.cache_k[:bsz, : start_pos + seqlen] - values = self.cache_v[:bsz, : start_pos + seqlen] - - # repeat k/v heads if n_kv_heads < n_heads - keys = repeat_kv(keys, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - values = repeat_kv(values, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - - xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - keys = keys.transpose(1, 2) - values = values.transpose(1, 2) - scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) - if mask is not None: - scores = scores + mask # (bs, n_local_heads, seqlen, cache_len + seqlen) - scores = F.softmax(scores.float(), dim=-1).type_as(xq) - output = torch.matmul(scores, values) # (bs, n_local_heads, seqlen, head_dim) - output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) - return self.wo(output) - - -class FeedForward(nn.Module): - """ - A feedforward neural network module. - - Args: - dim (int): The input dimension. - hidden_dim (int): The hidden dimension. 
- multiple_of (int): The output dimension is a multiple of this value. - ffn_dim_multiplier (Optional[float]): A multiplier for the hidden dimension. - - Attributes: - w1 (nn.Linear): The first linear layer. - w2 (nn.Linear): The second linear layer. - w3 (nn.Linear): The third linear layer. - """ - - def __init__( - self, - dim: int, - hidden_dim: int, - multiple_of: int, - ffn_dim_multiplier: Optional[float], - ): - super().__init__() - hidden_dim = int(2 * hidden_dim / 3) - # custom dim factor multiplier - if ffn_dim_multiplier is not None: - hidden_dim = int(ffn_dim_multiplier * hidden_dim) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - self.w1 = Linear( - dim, hidden_dim, bias=False - ) - self.w2 = Linear( - hidden_dim, dim, bias=False - ) - self.w3 = Linear( - dim, hidden_dim, bias=False - ) - - def forward(self, x): - """ - Forward pass of the feedforward neural network. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The output tensor. - """ - return self.w2(F.silu(self.w1(x)) * self.w3(x)) - - -class TransformerBlock(nn.Module): - """ - A transformer block that consists of a self-attention layer and a feedforward layer. - - Args: - layer_id (int): The ID of the layer. - args (ModelArgs): The arguments for the model. - - Attributes: - n_heads (int): The number of attention heads. - dim (int): The dimension of the model. - head_dim (int): The dimension of each attention head. - attention (Attention): The self-attention layer. - feed_forward (FeedForward): The feedforward layer. - layer_id (int): The ID of the layer. - attention_norm (RMSNorm): The normalization layer for the attention output. - ffn_norm (RMSNorm): The normalization layer for the feedforward output. - """ - - def __init__(self, layer_id: int, args: ModelArgs): - super().__init__() - self.n_heads = args.n_heads - self.dim = args.dim - self.head_dim = args.dim // args.n_heads - self.attention = Attention(args) - self.feed_forward = FeedForward( - dim=args.dim, - hidden_dim=4 * args.dim, - multiple_of=args.multiple_of, - ffn_dim_multiplier=args.ffn_dim_multiplier, - ) - self.layer_id = layer_id - self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) - self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) - - def forward( - self, - x: torch.Tensor, - start_pos: int, - freqs_cis: torch.Tensor, - mask: Optional[torch.Tensor], - ) -> torch.Tensor: - """ - Forward pass of the transformer block. - - Args: - x (torch.Tensor): The input tensor. - start_pos (int): The starting position for the input sequence. - freqs_cis (torch.Tensor): The frequency tensor for the input sequence. - mask (Optional[torch.Tensor]): The mask tensor for the input sequence. - - Returns: - torch.Tensor: The output tensor. - """ - h = x + self.attention.forward( - self.attention_norm(x), start_pos, freqs_cis, mask - ) - out = h + self.feed_forward.forward(self.ffn_norm(h)) - return out - - -class Transformer(nn.Module): - """ - A Transformer model for sequence-to-sequence tasks. - - Args: - params (ModelArgs): A dataclass containing the model hyperparameters. - - Attributes: - params (ModelArgs): The model hyperparameters. - vocab_size (int): The size of the vocabulary. - n_layers (int): The number of layers in the Transformer. - tok_embeddings (Embedding): The token embeddings layer. - layers (ModuleList): The list of TransformerBlock layers. - norm (RMSNorm): The normalization layer. - output (Linear): The output layer. - freqs_cis (torch.Tensor): The precomputed cosine frequencies. 
- - Methods: - forward(tokens, start_pos): The forward pass of the model. - """ - - def __init__(self, params: ModelArgs): - super().__init__() - self.params = params - self.vocab_size = params.vocab_size - self.n_layers = params.n_layers - - self.tok_embeddings = Embedding( - params.vocab_size, params.dim - ) - - self.layers = torch.nn.ModuleList() - for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) - - self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = Linear( - params.dim, params.vocab_size, bias=False - ) - - self.freqs_cis = precompute_freqs_cis( - self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 - ) - - @torch.inference_mode() - def forward(self, tokens: torch.Tensor, start_pos: int): - """ - The forward pass of the model. - - Args: - tokens (torch.Tensor): The input sequence of tokens. - start_pos (int): The starting position of the sequence. - - Returns: - output (torch.Tensor): The output sequence of logits. - """ - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens) - self.freqs_cis = self.freqs_cis.to(h.device) - freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] - - mask = None - if seqlen > 1: - mask = torch.full( - (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device - ) - mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) - - for layer in self.layers: - h = layer(h, start_pos, freqs_cis, mask) - h = self.norm(h) - output = self.output(h).float() - return output - \ No newline at end of file diff --git a/pkg/tokenizer/__init__.py b/pkg/tokenizer/__init__.py deleted file mode 100644 index b1eb644..0000000 --- a/pkg/tokenizer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from pkg.tokenizer.tokenizer import Tokenizer \ No newline at end of file diff --git a/pkg/tokenizer/tokenizer.py b/pkg/tokenizer/tokenizer.py deleted file mode 100644 index 77f3ce7..0000000 --- a/pkg/tokenizer/tokenizer.py +++ /dev/null @@ -1,85 +0,0 @@ -# This file is adapted from the LLama project: -# https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py - -# Original LLama code by Facebook AI Research -# Adapted by Aisuko - -from typing import List -import sentencepiece as spm - -class Tokenizer: - """ - A class for encoding and decoding text using SentencePiece tokenizer. - - Attributes: - ----------- - sp_model : sentencepiece.SentencePieceProcessor - The SentencePiece model used for encoding and decoding. - n_words : int - The size of the vocabulary. - bos_id : int - The ID of the beginning-of-sentence token. - eos_id : int - The ID of the end-of-sentence token. - pad_id : int - The ID of the padding token. - """ - - def __init__(self, model_path: str): - """ - Initializes a Tokenizer object. - - Parameters: - ----------- - model_path : str - The path to the SentencePiece model file. - """ - self.sp_model = spm.SentencePieceProcessor(model_file=model_path) - - # BOS / EOS token IDs - self.n_words: int = self.sp_model.vocab_size() - self.bos_id: int = self.sp_model.bos_id() - self.eos_id: int = self.sp_model.eos_id() - self.pad_id: int = self.sp_model.pad_id() - - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: - """ - Encodes a string using the SentencePiece model. - - Parameters: - ----------- - s : str - The string to be encoded. - bos : bool - Whether to add a beginning-of-sentence token to the beginning of the encoded sequence. - eos : bool - Whether to add an end-of-sentence token to the end of the encoded sequence. 
diff --git a/pyproject.toml b/pyproject.toml
index 67a0a14..e747053 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "kimchima"
-version = "0.2.1"
-description = ""
+version = "0.2.2"
+description = "The collections of tools for ML model development."
 authors = ["Aisuko "]
 license = "Apache-2.0"
 readme = "README.md"
@@ -9,11 +9,7 @@ homepage = "https://github.com/Aisuko/kimchi"
 repository = "https://github.com/Aisuko/kimchi"
 keywords = ["ai", "llm"]
 
-packages = [
-    { include = "pkg" },
-]
-exclude = ["pkg/dump", "pkg/model", "pkg/tokenizer"]
 
 [tool.poetry.dependencies]
 python = "^3.11"
diff --git a/src/kimchima/__init__.py b/src/kimchima/__init__.py
new file mode 100644
index 0000000..20915a2
--- /dev/null
+++ b/src/kimchima/__init__.py
@@ -0,0 +1,22 @@
+# coding=utf-8
+# Copyright [2024] [Aisuko]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__="0.2.2"
+
+from .pkg import (
+    Auto,
+    Devices,
+    get_device,
+    get_capability
+)
\ No newline at end of file
diff --git a/src/kimchima/cmds/__init__.py b/src/kimchima/cmds/__init__.py
new file mode 100644
index 0000000..6e45d93
--- /dev/null
+++ b/src/kimchima/cmds/__init__.py
@@ -0,0 +1 @@
+from .auto_cli import CommandAuto
\ No newline at end of file
diff --git a/cmds/auto.py b/src/kimchima/cmds/auto_cli.py
similarity index 98%
rename from cmds/auto.py
rename to src/kimchima/cmds/auto_cli.py
index 91ab21d..43d35b8 100644
--- a/cmds/auto.py
+++ b/src/kimchima/cmds/auto_cli.py
@@ -19,7 +19,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from pkg import Auto
+from kimchima.pkg import Auto
 
 
 class CommandAuto:
diff --git a/kimchima.py b/src/kimchima/cmds/kimchima_cli.py
similarity index 96%
rename from kimchima.py
rename to src/kimchima/cmds/kimchima_cli.py
index 0b27aaf..3145842 100644
--- a/kimchima.py
+++ b/src/kimchima/cmds/kimchima_cli.py
@@ -14,7 +14,7 @@
 
 import argparse
 
-from cmds.auto import CommandAuto
+from kimchima.cmds.auto_cli import CommandAuto
 
 
 def main():
diff --git a/src/kimchima/pkg/__init__.py b/src/kimchima/pkg/__init__.py
new file mode 100644
index 0000000..d031dff
--- /dev/null
+++ b/src/kimchima/pkg/__init__.py
@@ -0,0 +1,6 @@
+from .auto import Auto
+from .devices import (
+    Devices,
+    get_device,
+    get_capability
+)
\ No newline at end of file
diff --git a/pkg/auto/auto.py b/src/kimchima/pkg/auto.py
similarity index 100%
rename from pkg/auto/auto.py
rename to src/kimchima/pkg/auto.py
diff --git a/pkg/devices/devices.py b/src/kimchima/pkg/devices.py
similarity index 65%
rename from pkg/devices/devices.py
rename to src/kimchima/pkg/devices.py
index 0688da5..d82e5b7 100644
--- a/pkg/devices/devices.py
+++ b/src/kimchima/pkg/devices.py
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from enum import Enum
 import torch
 import platform
 
+from typing import Tuple
+
+
 class Devices(Enum):
     Silicon = 'mps'
     CPU = 'cpu'
@@ -23,7 +28,7 @@ class Devices(Enum):
     GPU = 'cuda'
 
 
-def get_device():
+def get_device() -> Devices:
     """
     Only support Single GPU for now
     """
@@ -32,3 +37,17 @@
     elif torch.cuda.is_available():
         return Devices.GPU
     return Devices.CPU
+
+
+def get_capability() -> Tuple[int, int]:
+    """
+    Get the compute capability of the current GPU; useful for checking
+    support for recent quantization techniques such as Marlin.
+
+    Returns:
+        tuple: The compute capability of the GPU in the current environment.
+        For non-GPU environments, returns (0, 0).
+    """
+    if get_device() == Devices.GPU:
+        return torch.cuda.get_device_capability()
+    return (0, 0)
\ No newline at end of file
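The `get_capability` helper added above exists to gate capability-dependent kernels. A minimal sketch of that call pattern, assuming (this is not encoded anywhere in the patch) that Marlin-style kernels require compute capability 8.0 or newer:

```python
# Sketch of gating a capability-dependent kernel on get_capability().
# Assumption: Marlin-style kernels need compute capability >= 8.0 (Ampere);
# verify against the kernel's own documentation before relying on this.
from kimchima import get_capability

major, minor = get_capability()  # (0, 0) on non-GPU environments
if (major, minor) >= (8, 0):
    print(f"compute capability {major}.{minor}: Marlin-style kernels usable")
else:
    print(f"compute capability {major}.{minor}: falling back to standard kernels")
```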
diff --git a/tests/__init__.py b/src/kimchima/tests/__init__.py
similarity index 100%
rename from tests/__init__.py
rename to src/kimchima/tests/__init__.py
diff --git a/tests/test_auto.py b/src/kimchima/tests/test_auto.py
similarity index 87%
rename from tests/test_auto.py
rename to src/kimchima/tests/test_auto.py
index ee69585..bae9d04 100644
--- a/tests/test_auto.py
+++ b/src/kimchima/tests/test_auto.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from pkg.auto import Auto
+from kimchima.pkg import Auto
 
 
 class TestAuto(unittest.TestCase):
@@ -22,13 +22,21 @@ class TestAuto(unittest.TestCase):
     model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 
     def test_get_embeddings(self):
+        """
+        Test the get_embeddings method with a single text
+        """
         model = Auto(model_name_or_path=self.model_name)
         embeddings = model.get_embeddings(text='Melbourne')
         self.assertIsNotNone(embeddings)
         self.assertEqual(embeddings.shape, (1, 384))
+
 
     def test_get_embeddings_with_list(self):
+        """
+        Test the get_embeddings method with a list of texts
+        """
         model = Auto(model_name_or_path=self.model_name)
         embeddings = model.get_embeddings(text=['Melbourne', 'Sydney'])
         self.assertIsNotNone(embeddings)
         self.assertEqual(embeddings.shape, (2, 384))
+
diff --git a/tests/test_devices.py b/src/kimchima/tests/test_devices.py
similarity index 68%
rename from tests/test_devices.py
rename to src/kimchima/tests/test_devices.py
index 42032c5..000eb5a 100644
--- a/tests/test_devices.py
+++ b/src/kimchima/tests/test_devices.py
@@ -16,7 +16,7 @@
 import platform
 import torch
 
-from pkg.devices import Devices, get_device
+from kimchima.pkg import Devices, get_device, get_capability
 
 
 class TestDevices(unittest.TestCase):
@@ -34,3 +34,16 @@ def test_get_device(self):
 
         if platform.system() != 'Darwin' and not torch.cuda.is_available():
             self.assertEqual(get_device(), Devices.CPU)
+
+    def test_get_capability(self):
+
+        # Test if the device is a GPU
+        if get_device() == Devices.GPU:
+            self.assertIsInstance(get_capability(), tuple)
+            self.assertEqual(len(get_capability()), 2)
+            self.assertIsInstance(get_capability()[0], int)
+            self.assertIsInstance(get_capability()[1], int)
+
+        # Test if the device is not a GPU
+        if get_device() != Devices.GPU:
+            self.assertEqual(get_capability(), (0, 0))
diff --git a/tests/test_dump.py b/tests/test_dump.py
deleted file mode 100644
index 7b022c6..0000000
--- a/tests/test_dump.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import unittest
-import torch
-from pkg.model import Transformer, ModelArgs
-
-from pkg.dump import Dump
-
-
-class TestDump(unittest.TestCase):
-
-    @unittest.skip("Pass testing for no useful features")
-    def test_dump(self):
-        n_vocab = 10
-        n_state = 8
-        multiple_of = 3
-        n_head = 4
-        n_kv_head = 2
-        n_layer = 3
-        norm_eps = 1e-6
-        max_batch_size = 1
-
-
-        model_args = ModelArgs(
-            dim=n_state,
-            n_layers=n_layer,
-            n_heads=n_head,
-            n_kv_heads=n_kv_head,
-            vocab_size=n_vocab,
-            multiple_of=multiple_of,
-            norm_eps=norm_eps,
-            max_batch_size=max_batch_size,
-        )
-
-        llama = Transformer(model_args)
-
-        with torch.no_grad():
-            tokens=torch.tensor([0,2,1], dtype=torch.int32).unsqueeze(0)
-            output=llama(tokens,0)
-            self.assertIsNotNone(tokens.numpy())
-            self.assertIsNotNone(output.numpy())
-
-        Dump.save_transformer(llama, 'params')
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
deleted file mode 100644
index d55c7e2..0000000
--- a/tests/test_tokenizer.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import unittest
-import os
-
-from pkg.tokenizer import tokenizer
-
-class TestTokenizer(unittest.TestCase):
-
-    test_prompt="This is a test prompt."
-
-    @unittest.skip("Pass testing for no useful features")
-    def test_tokenize(self):
-        # get home path from environment variable
-        home = os.environ['HOME']
-
-        tok=tokenizer.Tokenizer(home+"/Downloads/workspace/llama/tokenizer.model")
-
-        # tok should not be none
-        self.assertIsNotNone(tok)
-
-        encoded = tok.encode(self.test_prompt, True, True)
-        self.assertIsNotNone(encoded)
-
-        decoded = tok.decode(encoded)
-
-        self.assertEqual(decoded, self.test_prompt)
-
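The body of the renamed `kimchima_cli.py` is mostly elided by the rename diff above. A hypothetical reconstruction of its argparse wiring, inferred only from the `python -m kimchima auto <model> <text>` usage removed from README.md; the real argument names and the `CommandAuto` interface are not shown in this diff:

```python
# Hypothetical sketch of the CLI wiring in kimchima_cli.py; the real argument
# names and the CommandAuto dispatch are not visible in this diff.
import argparse

from kimchima.cmds.auto_cli import CommandAuto


def main():
    parser = argparse.ArgumentParser(prog="kimchima")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # `auto <model_name_or_path> <text>` mirrors the usage removed from README.md
    auto = subparsers.add_parser("auto", help="compute embeddings with an auto model")
    auto.add_argument("model_name_or_path")
    auto.add_argument("text")

    args = parser.parse_args()
    # Dispatch to CommandAuto here; its interface is not shown in this diff.


if __name__ == "__main__":
    main()
```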