From c8a05de47965dcd4fb24d920f40c550a73107e22 Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Thu, 18 Jul 2024 14:36:20 -0700 Subject: [PATCH 1/9] initial llm changes --- src/senselab/text/tasks/llms/__init__.py | 4 + src/senselab/text/tasks/llms/llm_call.py | 105 +++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 src/senselab/text/tasks/llms/__init__.py create mode 100644 src/senselab/text/tasks/llms/llm_call.py diff --git a/src/senselab/text/tasks/llms/__init__.py b/src/senselab/text/tasks/llms/__init__.py new file mode 100644 index 00000000..0604f250 --- /dev/null +++ b/src/senselab/text/tasks/llms/__init__.py @@ -0,0 +1,4 @@ +"""This module provides the API for making LLM calls in senselab.""" + + +__version__ = "1.0.0" diff --git a/src/senselab/text/tasks/llms/llm_call.py b/src/senselab/text/tasks/llms/llm_call.py new file mode 100644 index 00000000..af4611d6 --- /dev/null +++ b/src/senselab/text/tasks/llms/llm_call.py @@ -0,0 +1,105 @@ +from openai import OpenAI +from typing import Dict, List, Optional +# from langchain_community.chat_models import ChatOpenAI # I had to run "pip install --only-binary :all: greenlet" first before installing langchain +# from langchain_core.prompts import PromptTemplate +# from langchain_core.messages import HumanMessage, SystemMessage +# from langchain_core.output_parsers import StrOutputParser + + +# openrouter account associated with bruceatwood1@gmail.com +OPENROUTER_API_KEY = "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" + + +class llm_server: + """ + Wrapper for invoking various LLMs. + + This class provides a unified interface for interacting with different large language models (LLMs). + + Parameters: + ----------- + model : str + The name of the model to use. This is a required parameter and should be one of the following options: + + - "mistral-7b" + + Attributes: + ----------- + model : str + The name of the selected model. + + Methods: + -------- + invoke + + Example: + -------- + To create an instance of llm_server with the "gpt-3.5-turbo" model: + + >>> llm = llm_server(model="mistral-7b") + >>> response = llm.invoke(message = "say hello world", system_instruction = "add bumblebee on a new line on end", params) + """ + + def __init__(self, model_name: str): + self._model_name = self._get_model(model_name) + self._client= OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key= OPENROUTER_API_KEY + ) + + + def invoke(self, + message: str, + system_instruction: str, + params: Optional[Dict] = None) -> str: + """ + Class method to invoke the model with a given message and system instruction. + + Parameters: + ----------- + message : str + The user message to send to the model. + system_instruction : str + The system instruction for the model. + params : Optional[Dict] + Additional parameters for the model invocation, if any. + + Returns: + -------- + str + The content of the model's response. 
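+
+        Example:
+        --------
+        Illustrative only; assumes the OpenRouter key above is valid and the network is reachable.
+
+        >>> llm = llm_server(model_name="mistral_7b")
+        >>> reply = llm.invoke(
+        ...     message="say hello world",
+        ...     system_instruction="reply in a single short line",
+        ... )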
+ """ + if params: + for key, value in params.items(): + setattr(self._model, key, value) + + messages = [ + { + "role": "user", + "content": message, + }, + { + "role": "system", + "content": system_instruction + }, + ] + + completion = self._client.chat.completions.create( + model=self._model_name, + messages=messages, + ) + + return completion.choices[0].message.content + + + + def _get_model(self, model): + + model_mapping = { + "mistral_7b": "mistralai/mistral-7b-instruct:free" + } + if model in model_mapping: + return model_mapping[model] + else: + available_options = ",\n\t".join(model_mapping.keys()) + raise ValueError(f"That is not a supported model. Available options: \n\t{available_options}") From 5677fb77eb8203bd0632b24a781ae4b906f6309b Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Wed, 31 Jul 2024 09:16:10 -0400 Subject: [PATCH 2/9] deep eval test --- pyproject.toml | 3 +- .../tasks/evaluate_conversation/__init__.py | 7 +++ .../text/tasks/evaluate_conversation/api.py | 22 ++++++++ .../tasks/evaluate_conversation/deep_eval.py | 33 ++++++++++++ .../tasks/evaluate_conversation/metrics.py | 51 +++++++++++++++++++ .../text/tasks/evaluate_conversation_test.py | 37 ++++++++++++++ 6 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 src/senselab/text/tasks/evaluate_conversation/__init__.py create mode 100644 src/senselab/text/tasks/evaluate_conversation/api.py create mode 100644 src/senselab/text/tasks/evaluate_conversation/deep_eval.py create mode 100644 src/senselab/text/tasks/evaluate_conversation/metrics.py create mode 100644 src/tests/text/tasks/evaluate_conversation_test.py diff --git a/pyproject.toml b/pyproject.toml index 37be721e..c88bead9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ numpy = "~=1.25" umap-learn = "~=0.5" scikit-learn = "~=1.5" nltk = "~=3.8" +rouge-score = "~=0.1" [tool.poetry.group.dev] optional = true @@ -158,7 +159,7 @@ skip = [ "docs_style/pdoc-theme/syntax-highlighting.css", "*.ipynb" ] -ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER", "te"] +ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER", "te", "ROUGE", "rouge"] [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] diff --git a/src/senselab/text/tasks/evaluate_conversation/__init__.py b/src/senselab/text/tasks/evaluate_conversation/__init__.py new file mode 100644 index 00000000..6a9953da --- /dev/null +++ b/src/senselab/text/tasks/evaluate_conversation/__init__.py @@ -0,0 +1,7 @@ +"""senselab project integrates deepeval for evaluating conversations. + +Using an api.py script to interface with deep_eval.py, +which includes a custom ROUGE metric for comprehensive evaluation. +The ScriptLine class standardizes input data, and unit tests ensure accurate functionality, +making Senselab a robust wrapper for deepeval and other tools. +""" diff --git a/src/senselab/text/tasks/evaluate_conversation/api.py b/src/senselab/text/tasks/evaluate_conversation/api.py new file mode 100644 index 00000000..196f3b78 --- /dev/null +++ b/src/senselab/text/tasks/evaluate_conversation/api.py @@ -0,0 +1,22 @@ +"""This module provides the API for the senselab text evaluation.""" + +from typing import Dict, List + +from senselab.utils.data_structures.script_line import ScriptLine + +from .deep_eval import evaluate_conversation + + +def evaluate_chat(script_lines: List[ScriptLine]) -> Dict: + """Evaluate chat using the provided script lines and metrics. 
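+
+    Example (illustrative; identical reference and hypothesis lines give f-measures of 1.0):
+        >>> lines = [
+        ...     ScriptLine(text="I live in USA", speaker="agent"),
+        ...     ScriptLine(text="I live in USA", speaker="user"),
+        ... ]
+        >>> evaluate_chat(lines)
+        {'metrics': [{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0}]}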
+ + Args: + script_lines (List[ScriptLine]): A list of script lines to evaluate. + + Returns: + dict: The standardized result with overall score and metrics. + """ + metrics = ["rouge1", "rouge2", "rougeL"] # Define the metrics you want to use + result = evaluate_conversation(script_lines, metrics) + standardized_result = {"metrics": result["metrics"]} + return standardized_result diff --git a/src/senselab/text/tasks/evaluate_conversation/deep_eval.py b/src/senselab/text/tasks/evaluate_conversation/deep_eval.py new file mode 100644 index 00000000..54f07f4f --- /dev/null +++ b/src/senselab/text/tasks/evaluate_conversation/deep_eval.py @@ -0,0 +1,33 @@ +"""deep_eval.py.""" + +from typing import Dict, List + +from senselab.utils.data_structures.script_line import ScriptLine + +from .metrics import RougeMetric + + +def evaluate_conversation(script_lines: List[ScriptLine], metrics: List[str]) -> Dict: + """Evaluate a conversation based on the provided script lines and metrics. + + Args: + script_lines (List[ScriptLine]): A list of script lines to evaluate. + metrics (List[str]): A list of metrics to use for evaluation. + + Returns: + dict: The evaluation result containing detailed metrics. + """ + if not script_lines: + return {"metrics": []} + references: List[str] = [line.text for line in script_lines if line.speaker == "agent" and line.text is not None] + hypotheses: List[str] = [line.text for line in script_lines if line.speaker == "user" and line.text is not None] + + if not references or not hypotheses: + return {"metrics": []} + + metric_instance = RougeMetric() + scores = metric_instance.measure(references, hypotheses) + + metrics_results = [{metric: score.get(metric, 0.0) for metric in metrics} for score in scores] + + return {"metrics": metrics_results} diff --git a/src/senselab/text/tasks/evaluate_conversation/metrics.py b/src/senselab/text/tasks/evaluate_conversation/metrics.py new file mode 100644 index 00000000..662ef52c --- /dev/null +++ b/src/senselab/text/tasks/evaluate_conversation/metrics.py @@ -0,0 +1,51 @@ +"""metrics.py.""" + +from abc import ABC, abstractmethod +from typing import Dict, List + +from rouge_score import rouge_scorer + + +class Metric(ABC): + """Abstract base class for metrics.""" + + @abstractmethod + def measure(self, references: List[str], hypotheses: List[str]) -> List[Dict[str, Dict[str, float]]]: + """Measure the metric. + + Args: + references (List[str]): A list of reference strings. + hypotheses (List[str]): A list of hypothesis strings. + + Returns: + List[Dict[str, Dict[str, float]]]: A list of dictionaries containing the result of the measurement. + """ + pass + + +class RougeMetric(Metric): + """ROUGE metric calculation class.""" + + def __init__(self, name: str = "rouge", description: str = "ROUGE metric calculation") -> None: + """Initialize the ROUGE metric with a name and description. + + Args: + name (str): The name of the metric. + description (str): The description of the metric. + """ + self.name = name + self.description = description + + def measure(self, references: List[str], hypotheses: List[str]) -> List[Dict[str, Dict[str, float]]]: + """Measure the ROUGE metric for the given references and hypotheses. + + Args: + references (List[str]): A list of reference strings. + hypotheses (List[str]): A list of hypothesis strings. + + Returns: + List[Dict[str, Dict[str, float]]]: A list of dictionaries containing ROUGE scores. 
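+
+        Example (illustrative; identical strings give f-measures of 1.0):
+            >>> RougeMetric().measure(["the cat sat"], ["the cat sat"])
+            [{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0}]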
+ """ + scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True) + scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)] + return [{key: value.fmeasure for key, value in score.items()} for score in scores] diff --git a/src/tests/text/tasks/evaluate_conversation_test.py b/src/tests/text/tasks/evaluate_conversation_test.py new file mode 100644 index 00000000..22868a36 --- /dev/null +++ b/src/tests/text/tasks/evaluate_conversation_test.py @@ -0,0 +1,37 @@ +"""Unit tests for evaluating chat functionality.""" + +from typing import List + +import pytest + +from senselab.text.tasks.evaluate_conversation.api import evaluate_chat +from senselab.utils.data_structures.script_line import ScriptLine + + +@pytest.fixture +def script_lines() -> List[ScriptLine]: + """Fixture for providing sample script lines. + + Returns: + List[ScriptLine]: A list of sample script lines. + """ + return [ + ScriptLine(text="Mazen speaks Arabic", speaker="agent"), + ScriptLine(text="Mazen speaks Arabic", speaker="user"), + ScriptLine(text="I live in USA", speaker="agent"), + ScriptLine(text="I live in KSA", speaker="user"), + ] + + +def test_evaluate_chat(script_lines: List[ScriptLine]) -> None: + """Test the evaluate_chat function. + + Args: + script_lines (List[ScriptLine]): A list of script lines to evaluate. + + Asserts: + The evaluation result is not None and contains overall score and metrics. + """ + result = evaluate_chat(script_lines) + assert result is not None + assert "metrics" in result From 7e48531020d2c4a7d5e1cfcbfe41cb4096418ddb Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Wed, 7 Aug 2024 15:17:28 -0400 Subject: [PATCH 3/9] test --- src/senselab/text/tasks/llms/__init__.py | 4 - src/senselab/text/tasks/llms/llm_call.py | 105 ----------------------- 2 files changed, 109 deletions(-) delete mode 100644 src/senselab/text/tasks/llms/__init__.py delete mode 100644 src/senselab/text/tasks/llms/llm_call.py diff --git a/src/senselab/text/tasks/llms/__init__.py b/src/senselab/text/tasks/llms/__init__.py deleted file mode 100644 index 0604f250..00000000 --- a/src/senselab/text/tasks/llms/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""This module provides the API for making LLM calls in senselab.""" - - -__version__ = "1.0.0" diff --git a/src/senselab/text/tasks/llms/llm_call.py b/src/senselab/text/tasks/llms/llm_call.py deleted file mode 100644 index af4611d6..00000000 --- a/src/senselab/text/tasks/llms/llm_call.py +++ /dev/null @@ -1,105 +0,0 @@ -from openai import OpenAI -from typing import Dict, List, Optional -# from langchain_community.chat_models import ChatOpenAI # I had to run "pip install --only-binary :all: greenlet" first before installing langchain -# from langchain_core.prompts import PromptTemplate -# from langchain_core.messages import HumanMessage, SystemMessage -# from langchain_core.output_parsers import StrOutputParser - - -# openrouter account associated with bruceatwood1@gmail.com -OPENROUTER_API_KEY = "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" - - -class llm_server: - """ - Wrapper for invoking various LLMs. - - This class provides a unified interface for interacting with different large language models (LLMs). - - Parameters: - ----------- - model : str - The name of the model to use. This is a required parameter and should be one of the following options: - - - "mistral-7b" - - Attributes: - ----------- - model : str - The name of the selected model. 
- - Methods: - -------- - invoke - - Example: - -------- - To create an instance of llm_server with the "gpt-3.5-turbo" model: - - >>> llm = llm_server(model="mistral-7b") - >>> response = llm.invoke(message = "say hello world", system_instruction = "add bumblebee on a new line on end", params) - """ - - def __init__(self, model_name: str): - self._model_name = self._get_model(model_name) - self._client= OpenAI( - base_url="https://openrouter.ai/api/v1", - api_key= OPENROUTER_API_KEY - ) - - - def invoke(self, - message: str, - system_instruction: str, - params: Optional[Dict] = None) -> str: - """ - Class method to invoke the model with a given message and system instruction. - - Parameters: - ----------- - message : str - The user message to send to the model. - system_instruction : str - The system instruction for the model. - params : Optional[Dict] - Additional parameters for the model invocation, if any. - - Returns: - -------- - str - The content of the model's response. - """ - if params: - for key, value in params.items(): - setattr(self._model, key, value) - - messages = [ - { - "role": "user", - "content": message, - }, - { - "role": "system", - "content": system_instruction - }, - ] - - completion = self._client.chat.completions.create( - model=self._model_name, - messages=messages, - ) - - return completion.choices[0].message.content - - - - def _get_model(self, model): - - model_mapping = { - "mistral_7b": "mistralai/mistral-7b-instruct:free" - } - if model in model_mapping: - return model_mapping[model] - else: - available_options = ",\n\t".join(model_mapping.keys()) - raise ValueError(f"That is not a supported model. Available options: \n\t{available_options}") From d3d6144206b90523ea8510d56dbf0d0e5d28f7b7 Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Thu, 8 Aug 2024 00:05:34 -0400 Subject: [PATCH 4/9] more selective changes --- .pre-commit-config.yaml | 1 - src/senselab/text/tasks/llms/__init__.py | 3 + src/senselab/text/tasks/llms/data_ingest.py | 173 ++++++++++++++++++ src/senselab/text/tasks/llms/llm.py | 109 +++++++++++ .../tasks/llms/process_transcript_example.py | 51 ++++++ src/tests/text/tasks/llms_test.py | 60 ++++++ 6 files changed, 396 insertions(+), 1 deletion(-) create mode 100644 src/senselab/text/tasks/llms/__init__.py create mode 100644 src/senselab/text/tasks/llms/data_ingest.py create mode 100644 src/senselab/text/tasks/llms/llm.py create mode 100644 src/senselab/text/tasks/llms/process_transcript_example.py create mode 100644 src/tests/text/tasks/llms_test.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b5880847..8bb6ea14 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - - id: check-added-large-files - id: check-case-conflict - id: end-of-file-fixer - id: trailing-whitespace diff --git a/src/senselab/text/tasks/llms/__init__.py b/src/senselab/text/tasks/llms/__init__.py new file mode 100644 index 00000000..85b8c0b3 --- /dev/null +++ b/src/senselab/text/tasks/llms/__init__.py @@ -0,0 +1,3 @@ +"""This module provides the API for making LLM calls in senselab.""" + +__version__ = "1.0.0" diff --git a/src/senselab/text/tasks/llms/data_ingest.py b/src/senselab/text/tasks/llms/data_ingest.py new file mode 100644 index 00000000..b4c7e8a6 --- /dev/null +++ b/src/senselab/text/tasks/llms/data_ingest.py @@ -0,0 +1,173 @@ +"""This module provides a data manager for handling interactions with a LLM.""" + +import json +from 
pathlib import Path +from typing import Dict, List + + +class MessagesManager: + """Manages message data for interactions with a LLM. + + Provides methods to load transcripts, convert JSON data to message objects, + and generate data from a human conversation to query potential AI responses. + + Attributes: + messages (List[Dict[str, str]]): A list of message objects for the OpenAI API. + + Methods: + __init__(transcript_path: Path) -> None: Initializes the manager with a transcript file path. + print_human_readable(messages: List[Dict[str, str]]) -> None: Prints messages in a readable format. + extract_response_opportunities() -> List[List[Dict[str, str]]]: Extracts sublists ending with user input. + _load_transcript(json_path: Path) -> Dict: Loads a JSON transcript from a file. + convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]: Converts transcript format to LLM format. + """ + + def __init__(self, transcript_path: Path) -> None: + """Initializes the manager with a transcript file path. + + Args: + transcript_path (Path): The path to the JSON transcript file. + """ + json_obj = self._load_transcript(transcript_path) + self.messages = self.convert_json_to_messages(json_obj) + + @staticmethod + def print_human_readable(messages: List[Dict[str, str]]) -> None: + """Print a list of messages in a human-readable format. + + Args: + messages (List[Dict[str, str]]): List of messages where each message is a dictionary + with 'role' and 'content' keys. + """ + for message in messages: + print(f'{message["role"]}:\t\t{message["content"]}\n') + + def extract_response_opportunities(self) -> List[List[Dict[str, str]]]: + """Extract consecutive sublists from the messages list, ending after every 'user' response. + + This is used to compare AI responses to a human's response + over the course of a conversation, where the AI has the previous, + natural conversation before making its own response. + + Returns: + List[List[Dict[str, str]]]: A list of consecutive sublists, each starting from the + beginning of the messages list and ending with a + message where the role is "user". + """ + sublists = [] + + for i, message in enumerate(self.messages): + if message["role"] == "user" and i > 0: + sublist = self.messages[0 : i + 1] + sublists.append(sublist) + + return sublists + + @staticmethod + def _load_transcript(json_path: Path) -> Dict: + """Load a JSON transcript from the specified file path. + + This static method reads a JSON file from the provided file path and + returns the loaded JSON object. + + Args: + json_path (Path): The file path to the JSON transcript file. + + Returns: + Dict: The JSON object loaded from the file. + """ + with open(json_path, "r", encoding="utf-8") as file: + return json.load(file) + + @staticmethod + def convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]: + """Converts transcript segments to list of message objects, excluding system messages. + + The input JSON object should have the following structure: + { + "segments": [ + { + "start": , + "end": , + "text": , + "words": [ + { + "word": , + "start": , + "end": , + "score": , + "speaker": [kid|teacher] + }, + ... + ], + "speaker": [kid|teacher] + }, + ... + ] + } + + The output will be a list of message objects, + suitable for OpenAI API, with the following structure: + [ + { + "role": "user", + "content": "" + }, + { + "role": "assistant", + "content": "" + }, + ... + ] + + The conversion will map the "teacher" speaker role to "assistant" and the "kid" speaker + role to "user". 
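+
+        Example (illustrative):
+            >>> MessagesManager.convert_json_to_messages(
+            ...     {"segments": [{"words": [
+            ...         {"word": "hi", "speaker": "kid"},
+            ...         {"word": "hello", "speaker": "teacher"},
+            ...     ]}]}
+            ... )
+            [{'role': 'user', 'content': 'hi'}, {'role': 'assistant', 'content': 'hello'}]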
+ + Args: + json_obj (Dict): The input JSON object containing conversation segments. + + Returns: + List[Dict[str, str]]: A list of message objects in the format required by the OpenAI API. + + Raises: + ValueError: If the input JSON structure is invalid or contains an unknown speaker role. + """ + # Ensure valid JSON structure + if not (isinstance(json_obj, dict) and isinstance(json_obj.get("segments"), list)): + raise ValueError("Invalid JSON structure: must be a dictionary with a 'segments' list") + + messages = [] + current_role: str = "" + current_content: List[str] = [] + + for segment in json_obj["segments"]: + # Validate segment structure + if not all(key in segment for key in ("words",)): + raise ValueError(f"Invalid segment structure: {segment}") + + for word_obj in segment["words"]: + if not all(key in word_obj for key in ("word", "speaker")): + raise ValueError(f"Invalid word structure: {word_obj}") + + word = word_obj["word"] + speaker = word_obj["speaker"] + + if speaker == "teacher": + role = "assistant" + elif speaker == "kid": + role = "user" + else: + raise ValueError(f"Unknown speaker role: {speaker}") + + if role != current_role: + if current_content: + messages.append({"role": current_role, "content": " ".join(current_content)}) + current_role = role + current_content = [word] + else: + current_content.append(word) + + if current_content: + messages.append({"role": current_role, "content": " ".join(current_content)}) + + return messages diff --git a/src/senselab/text/tasks/llms/llm.py b/src/senselab/text/tasks/llms/llm.py new file mode 100644 index 00000000..ef9fda71 --- /dev/null +++ b/src/senselab/text/tasks/llms/llm.py @@ -0,0 +1,109 @@ +"""This module provides a wrapper for invoking various Large Language Models (LLMs). + +Classes: + LLM: A unified interface for interacting with various LLMs. +""" + +import os +from typing import Dict, List, Optional + +import torch +from openai import OpenAI + + +class LLM: + """Wrapper for invoking various LLMs. + + This class provides a unified interface for interacting with LLMs, + running on a vllm server at localhost:8000. + + Parameters: + ----------- + model_name : str + The name of the model to use. This is a required argument. Options: + - "llama3-8b" + - "llama3-70b" + + Methods: + -------- + call(messages: List[Dict], system_instruction: Optional[str] = "", + max_tokens: Optional[int] = 100, temperature: Optional[float] = 0.3) -> str: + Invokes the model with the given message and system instruction. + start_server(num_gpus: int, base_url: str) -> None: + Starts the VLLM server with the specified number of GPUs. + """ + + def __init__(self, model_name: str) -> None: + """Initializes the LLM instance with a model name and OpenAI client. + + Args: + model_name (str): The name of the model to use. + """ + self._model_name = self._get_model(model_name) + + def start_server(self, num_gpus: int, base_url: str = "http://localhost:8000/v1") -> None: + """Starts the VLLM server with the specified number of GPUs. + + Args: + num_gpus (int): The number of GPUs to use for tensor parallelism in the VLLM server. + base_url (str): The base URL of the VLLM server, from which the host and port are extracted. 
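+
+        Example (illustrative; requires a GPU node with vllm installed):
+            >>> llm = LLM("llama3-8b")
+            >>> llm.start_server(num_gpus=2)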
+ """ + if torch.cuda.is_available(): + host, port = base_url.split("//")[1].split(":") + port = port.split("/")[0] + os.system( + f"vllm serve {self._model_name} --host {host} --port {port} " f"--tensor-parallel-size {num_gpus}" + ) + self._client = OpenAI(base_url=base_url, api_key="EMPTY") + + else: + print("Please migrate to a compute node with GPU resources.") + + def call( + self, + messages: List[Dict[str, str]], + system_instruction: Optional[str] = "", + max_tokens: Optional[int] = 100, + temperature: Optional[float] = 0.3, + ) -> str: + """Invokes the model with a given message and system instruction. + + Args: + messages (List[Dict[str, str]]): The conversation history. + system_instruction (Optional[str]): The system instruction for the model. + max_tokens (Optional[int]): Maximum number of tokens to generate. + temperature (Optional[float]): Sampling temperature ranging between 0 and 2. + + Returns: + str: The content of the model's response. + """ + if system_instruction: + system_message = {"role": "system", "content": system_instruction} + messages.insert(0, system_message) + + completion = self._client.chat.completions.create( + model=self._model_name, messages=messages, max_tokens=max_tokens, temperature=temperature + ) + + return completion.choices[0].message.content + + def _get_model(self, model: str) -> str: + """Maps a model name to the corresponding model identifier. + + Args: + model (str): The name of the model. + + Returns: + str: The model identifier. + + Raises: + ValueError: If the model name is unsupported. + """ + model_mapping = { + "llama3-70b": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "llama3-8b": "meta-llama/Meta-Llama-3.1-8B-Instruct", + } + if model in model_mapping: + return model_mapping[model] + available_options = ",\n\t".join(model_mapping.keys()) + raise ValueError(f"Unsupported model. Available options: \n\t{available_options}") diff --git a/src/senselab/text/tasks/llms/process_transcript_example.py b/src/senselab/text/tasks/llms/process_transcript_example.py new file mode 100644 index 00000000..daeb928e --- /dev/null +++ b/src/senselab/text/tasks/llms/process_transcript_example.py @@ -0,0 +1,51 @@ +"""Example usage of llms directory to process AI responses from transcript.""" + +import sys +from pathlib import Path +from typing import Generator + +from tqdm import tqdm + +from senselab.text.tasks.llms.data_ingest import MessagesManager +from senselab.text.tasks.llms.llm import LLM + +if __name__ == "__main__": + manager = MessagesManager(Path("/home/goshdam/sample_transcript.json")) + llm = LLM("llama3_70b") + + # manager.print_human_readable(manager.messages) + + SYSTEM_INSTRUCTION = ( + "You are a friendly, supportive tutoring assistant for a child, " + "helping them to learn vocabulary, " + "interspersed with friendly human interaction." + ) + + all_messages = manager.extract_response_opportunities() + + responses = [ + llm.call(messages=messages, system_instruction=SYSTEM_INSTRUCTION, max_tokens=200, temperature=0.4) + for messages in tqdm(all_messages, file=sys.stderr) + ] + + def response_gen() -> Generator[str, None, None]: + """Generator function that yields responses from the responses list. + + Yields: + str: Each response in the responses list. 
+ """ + yield from responses + + gen = response_gen() + + for i, message in enumerate(manager.messages): + content = message["content"] + + if message["role"] == "assistant": + if i > 0: + response_content = next(gen) + print(f"Teacher:\t{content}\n\nAI:\t{response_content}\n\n") + else: + print(f"Teacher:\t{content}\n\n") + else: + print(f"Student:\t{content}\n\n") diff --git a/src/tests/text/tasks/llms_test.py b/src/tests/text/tasks/llms_test.py new file mode 100644 index 00000000..08a7aa4c --- /dev/null +++ b/src/tests/text/tasks/llms_test.py @@ -0,0 +1,60 @@ +"""This module is for testing the conversion of JSON conversation segments to message objects.""" + +import os +from typing import List + +import pytest + +from senselab.text.tasks.llms.data_ingest import MessagesManager + +if os.getenv("GITHUB_ACTIONS") != "true": + + @pytest.fixture + def sample_json_obj() -> dict: + """Fixture for a sample JSON object representing conversation segments.""" + return { + "segments": [ + { + "start": 0.0, + "end": 1.0, + "words": [ + {"word": "uh", "start": 0.0, "end": 0.5, "score": 1.0, "speaker": "kid"}, + {"word": "hello", "start": 0.6, "end": 1.0, "score": 1.0, "speaker": "teacher"}, + ], + "speaker": "kid", + }, + { + "start": 1.0, + "end": 2.0, + "words": [ + {"word": "world", "start": 1.0, "end": 1.5, "score": 1.0, "speaker": "teacher"}, + {"word": "namaste", "start": 1.6, "end": 2.0, "score": 1.0, "speaker": "teacher"}, + ], + "speaker": "teacher", + }, + { + "start": 2.0, + "end": 3.0, + "words": [ + {"word": "kemosabe", "start": 2.0, "end": 2.5, "score": 1.0, "speaker": "teacher"}, + {"word": "hi", "start": 2.6, "end": 2.8, "score": 1.0, "speaker": "kid"}, + {"word": "there", "start": 2.9, "end": 3.0, "score": 1.0, "speaker": "kid"}, + ], + "speaker": "kid", + }, + ] + } + + @pytest.fixture + def expected_messages() -> List[dict]: + """Fixture for the expected list of message objects.""" + return [ + {"role": "user", "content": "uh"}, + {"role": "assistant", "content": "hello world namaste kemosabe"}, + {"role": "user", "content": "hi there"}, + ] + + def test_convert_json_to_messages(sample_json_obj: dict, expected_messages: List[dict]) -> None: + """Test the conversion of JSON conversation segments to message objects.""" + result = MessagesManager.convert_json_to_messages(sample_json_obj) + assert result == expected_messages From a25d33e23c33b76d1ffb943fe18d82dd9d2edcb9 Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Fri, 9 Aug 2024 12:26:24 -0400 Subject: [PATCH 5/9] adjusting doc and setting up tutorial for llms + fixing style issues --- src/senselab/text/tasks/llms/__init__.py | 2 +- src/senselab/text/tasks/llms/doc.md | 7 ++++++ tutorials/llms.ipynb | 27 ++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/senselab/text/tasks/llms/doc.md create mode 100644 tutorials/llms.ipynb diff --git a/src/senselab/text/tasks/llms/__init__.py b/src/senselab/text/tasks/llms/__init__.py index 85b8c0b3..c5fcbf59 100644 --- a/src/senselab/text/tasks/llms/__init__.py +++ b/src/senselab/text/tasks/llms/__init__.py @@ -1,3 +1,3 @@ -"""This module provides the API for making LLM calls in senselab.""" +""".. include:: ./doc.md""" # noqa: D415 __version__ = "1.0.0" diff --git a/src/senselab/text/tasks/llms/doc.md b/src/senselab/text/tasks/llms/doc.md new file mode 100644 index 00000000..ed3f3b34 --- /dev/null +++ b/src/senselab/text/tasks/llms/doc.md @@ -0,0 +1,7 @@ +# LLMs + + + + +## Overview +This module provides the API for making LLM calls in senselab. 
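+
+A minimal usage sketch (illustrative; assumes a GPU node with vllm installed and uses the `llama3-8b` option defined in `llm.py`):
+
+```python
+from senselab.text.tasks.llms.llm import LLM
+
+llm = LLM("llama3-8b")
+llm.start_server(num_gpus=1)
+
+reply = llm.call(
+    messages=[{"role": "user", "content": "Say hello."}],
+    system_instruction="You are a concise assistant.",
+)
+print(reply)
+```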
diff --git a/tutorials/llms.ipynb b/tutorials/llms.ipynb new file mode 100644 index 00000000..67bbf3ba --- /dev/null +++ b/tutorials/llms.ipynb @@ -0,0 +1,27 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LLMs\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/llms.ipynb)\n", + "\n", + "This tutorial demonstrates how to use `senselab` for using LLMs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 78ffb037e3338b7e05a90cbe9204b44abb077474 Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Tue, 17 Sep 2024 12:32:50 -0400 Subject: [PATCH 6/9] updates --- .pre-commit-config.yaml | 2 - .../tasks/evaluate_conversation/deep_eval.py | 6 +- .../tasks/evaluate_conversation/metrics.py | 75 +++---- src/senselab/text/tasks/llms/doc.md | 185 +++++++++++++++++- src/senselab/text/tasks/llms/llm.py | 106 +++++++--- .../tasks/llms/process_transcript_example.py | 145 +++++++++++--- .../{data_ingest.py => transcript_manager.py} | 92 ++++----- .../utils/data_structures/llm_response.py | 5 + .../utils/data_structures/script_line.py | 8 + .../data_structures/transcript_output.py | 29 +++ src/tests/text/tasks/llms_test.py | 60 ------ .../text/tasks/transcript_manager_test.py | 101 ++++++++++ 12 files changed, 616 insertions(+), 198 deletions(-) rename src/senselab/text/tasks/llms/{data_ingest.py => transcript_manager.py} (64%) create mode 100644 src/senselab/utils/data_structures/llm_response.py create mode 100644 src/senselab/utils/data_structures/transcript_output.py delete mode 100644 src/tests/text/tasks/llms_test.py create mode 100644 src/tests/text/tasks/transcript_manager_test.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2847615a..d9fb4ac7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,8 +37,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: - - id: check-added-large-files - args: [--maxkb=15000] - id: check-case-conflict - id: end-of-file-fixer - id: trailing-whitespace diff --git a/src/senselab/text/tasks/evaluate_conversation/deep_eval.py b/src/senselab/text/tasks/evaluate_conversation/deep_eval.py index 54f07f4f..5496e13d 100644 --- a/src/senselab/text/tasks/evaluate_conversation/deep_eval.py +++ b/src/senselab/text/tasks/evaluate_conversation/deep_eval.py @@ -4,7 +4,7 @@ from senselab.utils.data_structures.script_line import ScriptLine -from .metrics import RougeMetric +from .metrics import Rouge def evaluate_conversation(script_lines: List[ScriptLine], metrics: List[str]) -> Dict: @@ -25,8 +25,8 @@ def evaluate_conversation(script_lines: List[ScriptLine], metrics: List[str]) -> if not references or not hypotheses: return {"metrics": []} - metric_instance = RougeMetric() - scores = metric_instance.measure(references, hypotheses) + metric_instance = Rouge() + scores = metric_instance(references, hypotheses) metrics_results = [{metric: score.get(metric, 0.0) for metric in metrics} for score in scores] diff --git a/src/senselab/text/tasks/evaluate_conversation/metrics.py b/src/senselab/text/tasks/evaluate_conversation/metrics.py index 662ef52c..774d2559 100644 --- a/src/senselab/text/tasks/evaluate_conversation/metrics.py +++ b/src/senselab/text/tasks/evaluate_conversation/metrics.py @@ -1,51 +1,56 @@ 
-"""metrics.py.""" +"""Metrics to assess performance on tutor response. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. + +All other functions are value-independent. +""" -from abc import ABC, abstractmethod from typing import Dict, List +import sacrebleu as sb +import textstat +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCaseParams from rouge_score import rouge_scorer +from sacrebleu.metrics import BLEUScore + + +def Rouge(*args: List, **kwargs: Dict) -> rouge_scorer.RougeScorer: + """Wrapper for rouge_scorer's RougeScorer class.""" + return rouge_scorer.RougeScorer(*args, **kwargs) -class Metric(ABC): - """Abstract base class for metrics.""" +Rouge.__doc__ = rouge_scorer.RougeScorer.__doc__ - @abstractmethod - def measure(self, references: List[str], hypotheses: List[str]) -> List[Dict[str, Dict[str, float]]]: - """Measure the metric. - Args: - references (List[str]): A list of reference strings. - hypotheses (List[str]): A list of hypothesis strings. +def sentence_bleu_sacre(*args: List, **kwargs: Dict) -> BLEUScore: + """Wrapper for sacrebleu's sentence_bleu function.""" + return sb.sentence_bleu(*args, **kwargs) - Returns: - List[Dict[str, Dict[str, float]]]: A list of dictionaries containing the result of the measurement. - """ - pass +sentence_bleu_sacre.__doc__ = sb.sentence_bleu.__doc__ -class RougeMetric(Metric): - """ROUGE metric calculation class.""" - def __init__(self, name: str = "rouge", description: str = "ROUGE metric calculation") -> None: - """Initialize the ROUGE metric with a name and description. +def word_count(*args: List, **kwargs: Dict) -> int: + """Wrapper for textstat's lexicon_count function.""" + return textstat.lexicon_count(*args, **kwargs) - Args: - name (str): The name of the metric. - description (str): The description of the metric. - """ - self.name = name - self.description = description - def measure(self, references: List[str], hypotheses: List[str]) -> List[Dict[str, Dict[str, float]]]: - """Measure the ROUGE metric for the given references and hypotheses. +word_count.__doc__ = textstat.lexicon_count.__doc__ - Args: - references (List[str]): A list of reference strings. - hypotheses (List[str]): A list of hypothesis strings. - Returns: - List[Dict[str, Dict[str, float]]]: A list of dictionaries containing ROUGE scores. 
- """ - scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True) - scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)] - return [{key: value.fmeasure for key, value in score.items()} for score in scores] +correctness_metric = GEval( + name="Correctness", + criteria="Determine whether the actual output is factually correct based on the expected output.", + # NOTE: you can only provide either criteria or evaluation_steps, and not both + evaluation_steps=[ + "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", + "You should also heavily penalize omission of detail", + "Vague language, or contradicting OPINIONS, are OK", + ], + evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT], +) diff --git a/src/senselab/text/tasks/llms/doc.md b/src/senselab/text/tasks/llms/doc.md index ed3f3b34..3fae1acf 100644 --- a/src/senselab/text/tasks/llms/doc.md +++ b/src/senselab/text/tasks/llms/doc.md @@ -1,7 +1,188 @@ # LLMs - - ## Overview This module provides the API for making LLM calls in senselab. + +This project focuses on ingesting and processing data, utilizing language models, and handling transcript data. It provides utilities for parsing unstructured text and generating meaningful insights using a combination of custom functions and pre-trained models. + +## Structure +The project contains the following main components: + +transcript_manager.py: Handles data ingestion and preprocessing tasks. + +llm.py: Integrates language model-related functionality. + +process_transcript_example.py: Demonstrates how to process transcript data, using methods provided in this package. + + +## transcript_manager.py + +The `transcript_manager` module provides a data manager for handling interactions with a large language model (LLM). It allows the loading of transcripts, converting JSON data into scriptline objects, and extracting conversation data in a format that can be used to query potential AI responses. + +### Class: `Transcript` + +The `Transcript` class manages message data for interactions with a LLM. It provides methods to load transcripts, convert JSON transcript data into a usable format, and extract conversation segments for AI response opportunities. You will use it by initializing it on a valid transcript path. That transcript data is loaded in and stored as a list of scriptlines. These can then be printed in a readable format, you can see the number of tokens in the transcript, and the data is ready to be called by the LLM class in llm.py. + +### Attributes: + **`scriptlines (List[ScriptLine])`**: A list of `ScriptLine` objects representing the conversation. See documentionation in senselab/utils/data_structures/script_line.py. + +### Methods + +#### 1. `__init__(self, transcript_path: Path) -> None` + +Initializes the `MessagesManager` with a path to the JSON transcript file. Loads the transcript and converts it into scriptline objects. + +**Parameters:** +- `transcript_path (Path)`: The path to the JSON transcript file. + + +#### 2. `print_human_readable(self) -> None` + +Prints the scriptlines attribute in a human-readable format, where each message is displayed with the speaker and content. + + +#### 3. `extract_response_opportunities(self) -> List[List[Dict[str, str]]]` + +Extracts consecutive sublists from the message list, ending after every 'user' response. These sublists can be used to compare AI responses to human responses over the course of a conversation. 
+ +**Returns:** +- `List[List[Dict[str, str]]]`: A list of consecutive sublists of messages, each ending with a 'user' message. + +Example: +```python +response_opportunities = manager.extract_response_opportunities() +``` + + +#### 4. `convert_json_to_scriptlines(self, json_obj: Dict) -> List[ScriptLine]` + +Converts transcript segments from a JSON object into a list of `ScriptLine` objects, where each scriptline contains the text and speaker. This method also maps "teacher" to "assistant" and "kid" to "user". + +**Parameters:** +- `json_obj (Dict)`: The JSON object containing the conversation segments. + + The input JSON object should have the following structure: + ``` + { + "segments": [ + { + "start": , + "end": , + "text": , + "words": [ + { + "word": , + "start": , + "end": , + "score": , + "speaker": [kid|teacher] + }, + ... + ], + "speaker": [kid|teacher] + }, + ... + ] + } + ``` + +**Returns:** +- `List[ScriptLine]`: A list of `ScriptLine` objects representing the conversation. + +**Raises:** +- `ValueError`: If the input JSON structure is invalid or contains an unknown speaker role. + + +#### 5. `get_num_tokens(self) -> int` + +Returns the total number of tokens in the stored scriptlines. Uses OpenAI GPT-4o tokenizer. + +**Returns:** +- `int`: Number of tokens in the transcript. +--- + +## Example Usage + +```python +from pathlib import Path +from transcript_manager import Transcript + +# Initialize the manager with the path to a transcript +transcript = Transcript(Path("transcript.json")) + +transcript.print_human_readable(messages) + +# Extract response opportunities from the conversation +response_opportunities = transcript.extract_response_opportunities() + +# Get the number of tokens used in the conversation +num_tokens = transcript.get_num_tokens() + +print(f"Total tokens: {num_tokens}") +``` +--- + + + +### Class: `LLM` + +The `LLM` class abstracts the interaction with different large language models (LLMs) such as `llama3-8b`, `llama3-70b`, and `gpt-4o`. The `LLM` class is designed to start a server for model interaction, handle inputs, and produce outputs based on the model selected. + +Note that some models (like `gpt-4o`) are called through external endpoints, while others (like `llama3-8b`) are hosted locally and need to be initialized first. Depending on the model, the `call` function sends requests either to an external server or a locally hosted server. + +#### Attributes: + **`_model_name (str)`**: The name of the model being used (e.g., `"llama3-70b"`). + **`_base_url (str)`**: The URL where the server is hosted. + **`_tokenizer (AutoTokenizer)`**: Tokenizer for the selected model. + +--- + +#### Methods + +##### 1. `__init__(self, model_name: str) -> None` + +Initializes the `LLM` instance with the specified model name, setting up the necessary client and tokenizer. + +**Parameters:** +- `model_name (str)`: The name of the model to initialize. +--- + +##### 2. `start_server(self, num_gpus: int, base_url: str = "http://localhost:8000/v1") -> Popen` + +Starts a VLLM server with the specified number of GPUs, serving the specified local model. The server enables tensor parallelism to manage large models efficiently. + +**Parameters:** +- `num_gpus (int)`: The number of GPUs to initialize the model with. +- `base_url (Optional[str])`: The URL where the server is to be hosted. Default is `"http://localhost:8000/v1"`. + +**Returns:** +- `Popen`: A `Popen` object representing the running server process. +--- + +##### 3. 
`call(self, messages: List[Dict], system_instruction: Optional[str] = "", max_tokens: Optional[int] = 100, temperature: Optional[float] = 0.3, measure: Optional[bool] = False) -> LLMResponse` + +Sends a series of messages to the model server and returns the model’s output. The `system_instruction` parameter provides additional context for the model, while the `measure` flag allows for token and latency measurements. + +**Parameters:** +- `messages (List[Dict])`: List of messages in the conversation. Each message is a dictionary with `role` and `content` keys. +- `system_instruction (Optional[str])`: Instruction for the system. Default is an empty string. +- `max_tokens (Optional[int])`: Maximum number of tokens for the output. +- `temperature (Optional[float])`: Sampling temperature, controlling randomness. Default is `0.3`. +- `measure (Optional[bool])`: If `True`, measures latency and token usage. Default is `False`. + +**Returns:** +- `LLMResponse`: An object containing the response content, latency, and token information (if measure flag set to True). See documentation at senselab/utils/data_structures/llm_response.py. + +### Example Usage + +``` +llm = LLM("llama3-70b") + +llm.start_server(num_gpus=4) + +messages = [{"role": "user", "content": "Tell me a joke."}] +response = llm.call(messages, system_instruction="You are a friendly assistant") +print(response.content) +``` +--- diff --git a/src/senselab/text/tasks/llms/llm.py b/src/senselab/text/tasks/llms/llm.py index ef9fda71..6a2b2599 100644 --- a/src/senselab/text/tasks/llms/llm.py +++ b/src/senselab/text/tasks/llms/llm.py @@ -4,11 +4,17 @@ LLM: A unified interface for interacting with various LLMs. """ -import os -from typing import Dict, List, Optional +import time +from subprocess import PIPE, Popen, check_output +from typing import List, Optional, Tuple +import requests import torch from openai import OpenAI +from transformers import AutoTokenizer # type: ignore + +from senselab.utils.data_structures.llm_response import LLMResponse +from senselab.utils.data_structures.script_line import ScriptLine class LLM: @@ -23,6 +29,7 @@ class LLM: The name of the model to use. This is a required argument. Options: - "llama3-8b" - "llama3-70b" + - "gpt-4o" Methods: -------- @@ -39,69 +46,118 @@ def __init__(self, model_name: str) -> None: Args: model_name (str): The name of the model to use. """ - self._model_name = self._get_model(model_name) + self._model_name, self._serving_url = self._get_model(model_name) + + self._tokenizer = AutoTokenizer.from_pretrained(self._model_name) + + self._client = OpenAI(base_url=self._serving_url) - def start_server(self, num_gpus: int, base_url: str = "http://localhost:8000/v1") -> None: - """Starts the VLLM server with the specified number of GPUs. + def start_server(self, num_gpus: int, timeout: int = 300) -> Optional[Popen]: + """Starts the VLLM server with the specified number of GPUs and logs the output. Args: num_gpus (int): The number of GPUs to use for tensor parallelism in the VLLM server. base_url (str): The base URL of the VLLM server, from which the host and port are extracted. + timeout (int): Time, in seconds, to wait for the server to start before termination. 
+ + Returns: + Popen instance from subprocess module """ if torch.cuda.is_available(): - host, port = base_url.split("//")[1].split(":") - port = port.split("/")[0] - os.system( - f"vllm serve {self._model_name} --host {host} --port {port} " f"--tensor-parallel-size {num_gpus}" - ) - self._client = OpenAI(base_url=base_url, api_key="EMPTY") - + host = check_output("hostname -I | awk '{print $1}'", shell=True, text=True).strip() + port = 8000 + command = f"vllm serve {self._model_name} --host {host} --port {port} --tensor-parallel-size {num_gpus}" + self._serving_url = f"http://{host}:{port}/v1" + + # Run the server in the background + process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE, text=True) + + # Wait for the server to start + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(self._serving_url, timeout=5) + if response.status_code == 200: + print("Server is up and running with a 200 response!") + break + except requests.ConnectionError: + pass + time.sleep(5) + else: + print(f"Server did not respond with a 200 status code within {timeout} seconds.") + process.terminate() + return None + + self._client = OpenAI(base_url=self._serving_url, api_key="EMPTY") + print(f"Serving on Host: {host}\tPort: {port}") + return process else: print("Please migrate to a compute node with GPU resources.") + return None def call( self, - messages: List[Dict[str, str]], + messages: List[ScriptLine], system_instruction: Optional[str] = "", max_tokens: Optional[int] = 100, temperature: Optional[float] = 0.3, - ) -> str: + measure: Optional[bool] = False, + ) -> LLMResponse: """Invokes the model with a given message and system instruction. Args: - messages (List[Dict[str, str]]): The conversation history. + messages (List[ScriptLine]): Conversation history. system_instruction (Optional[str]): The system instruction for the model. max_tokens (Optional[int]): Maximum number of tokens to generate. temperature (Optional[float]): Sampling temperature ranging between 0 and 2. + measure (Optional[bool]): Whether to measure token counts and latency. Returns: - str: The content of the model's response. + LLMResponse: Named tuple with model's response, token counts, and latency (if measured). """ + openai_messages = [{"role": msg.speaker, "content": msg.text} for msg in messages] + if system_instruction: - system_message = {"role": "system", "content": system_instruction} - messages.insert(0, system_message) + system_message = {"role": "system", "content": system_instruction} # type: ignore + openai_messages.insert(0, system_message) # type: ignore + + in_tokens = out_tokens = latency = None + + # initialize latency measurements + if measure: + in_tokens = sum(len(self._tokenizer.encode(message["content"])) for message in openai_messages) + start_time = time.time() completion = self._client.chat.completions.create( - model=self._model_name, messages=messages, max_tokens=max_tokens, temperature=temperature + model=self._model_name, + messages=openai_messages, # type: ignore[arg-type] + max_tokens=max_tokens, + temperature=temperature, ) + content = completion.choices[0].message.content + + if measure: + latency = time.time() - start_time + out_tokens = len(self._tokenizer.encode(content)) - return completion.choices[0].message.content + return LLMResponse(content=content, latency=latency, in_tokens=in_tokens, out_tokens=out_tokens) - def _get_model(self, model: str) -> str: - """Maps a model name to the corresponding model identifier. 
+ def _get_model(self, model: str) -> Tuple[str, str]: + """Maps a model name to the corresponding model identifier and url. Args: model (str): The name of the model. Returns: - str: The model identifier. + Tuple[str,str]: 1) model identifier 2) URL Raises: ValueError: If the model name is unsupported. """ model_mapping = { - "llama3-70b": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "llama3-8b": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "llama3-70b": ("meta-llama/Meta-Llama-3.1-70B-Instruct", "http://localhost:8000/v1"), + "llama3-8b": ("meta-llama/Meta-Llama-3.1-8B-Instruct", "http://localhost:8000/v1"), + "gpt-4o": ("gpt-4o", "https://api.openai.com/v1"), } if model in model_mapping: return model_mapping[model] diff --git a/src/senselab/text/tasks/llms/process_transcript_example.py b/src/senselab/text/tasks/llms/process_transcript_example.py index daeb928e..6365fa5b 100644 --- a/src/senselab/text/tasks/llms/process_transcript_example.py +++ b/src/senselab/text/tasks/llms/process_transcript_example.py @@ -1,51 +1,142 @@ """Example usage of llms directory to process AI responses from transcript.""" +import pickle import sys from pathlib import Path -from typing import Generator +from typing import Generator, List +import pandas as pd from tqdm import tqdm -from senselab.text.tasks.llms.data_ingest import MessagesManager from senselab.text.tasks.llms.llm import LLM +from senselab.text.tasks.llms.transcript_manager import Transcript +from senselab.utils.data_structures.llm_response import LLMResponse +from senselab.utils.data_structures.transcript_output import TranscriptOutput -if __name__ == "__main__": - manager = MessagesManager(Path("/home/goshdam/sample_transcript.json")) - llm = LLM("llama3_70b") - # manager.print_human_readable(manager.messages) +def generate_ai_conversation( + transcript_path: Path, prompt_path: Path, temp: float, model_name: str, measure: bool, cache_path: Path, llm: LLM +) -> TranscriptOutput: + """Generates an AI conversation based on transcript and prompt data. - SYSTEM_INSTRUCTION = ( - "You are a friendly, supportive tutoring assistant for a child, " - "helping them to learn vocabulary, " - "interspersed with friendly human interaction." - ) + Args: + transcript_path (Path): Path to the transcript file. + prompt_path (Path): Path to the prompt file. + temp (float): Temperature parameter for the LLM. + model_name (str): Name of the model to use. + measure (bool): Whether to measure performance (e.g., tokens, latency). + cache_path (Path): Path to store the cached responses. + llm (LLM): instantiated model being used. + + Returns: + TranscriptOutput: The resulting transcript and data as a `TranscriptOutput` object. 
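+
+    Example (illustrative paths; assumes a reachable server for the chosen model):
+        >>> output = generate_ai_conversation(
+        ...     transcript_path=Path("transcript.json"),
+        ...     prompt_path=Path("prompt.txt"),
+        ...     temp=0.5,
+        ...     model_name="llama3-70b",
+        ...     measure=True,
+        ...     cache_path=Path("cache/transcript_cache.pkl"),
+        ...     llm=LLM("llama3-70b"),
+        ... )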
+ """ + manager = Transcript(transcript_path) + + with open(prompt_path, "r") as f: + system_instruction = f.read() all_messages = manager.extract_response_opportunities() - responses = [ - llm.call(messages=messages, system_instruction=SYSTEM_INSTRUCTION, max_tokens=200, temperature=0.4) - for messages in tqdm(all_messages, file=sys.stderr) - ] + # Check if cached responses already exist + if cache_path.exists(): # type: ignore + with open(cache_path, "rb") as f: # type: ignore + responses = pickle.load(f) # type: ignore + print(f"Loaded cached responses for {transcript_path.name}") + else: + responses = [ + llm.call( + messages=messages, + system_instruction=system_instruction, + max_tokens=200, + temperature=temp, + measure=measure, + ) + for messages in tqdm(all_messages, desc=f"Processing: {transcript_path.name}") + ] - def response_gen() -> Generator[str, None, None]: - """Generator function that yields responses from the responses list. + with open(cache_path, "wb") as f: # type: ignore + pickle.dump(responses, f) # type: ignore - Yields: - str: Each response in the responses list. - """ + def response_gen() -> Generator[LLMResponse, None, None]: + """Generates responses from the cached or newly generated data.""" yield from responses gen = response_gen() - for i, message in enumerate(manager.messages): - content = message["content"] - - if message["role"] == "assistant": + df = pd.DataFrame(columns=["student", "teacher", "AI", "in_tokens", "out_tokens", "latency"]) + j = 0 # student-response pair number + for i, message in enumerate(manager.scriptlines): + content = message.text + if message.speaker == "assistant": if i > 0: response_content = next(gen) - print(f"Teacher:\t{content}\n\nAI:\t{response_content}\n\n") + df.at[j, "teacher"] = content + df.at[j, "AI"] = response_content.content + if measure: + df.at[j, "in_tokens"] = response_content.in_tokens + df.at[j, "out_tokens"] = response_content.out_tokens + df.at[j, "latency"] = response_content.latency else: - print(f"Teacher:\t{content}\n\n") + df.at[j, "teacher"] = content + j += 1 else: - print(f"Student:\t{content}\n\n") + df.at[j, "student"] = content + + df.fillna("", inplace=True) + + return TranscriptOutput( + temp=temp, model=model_name, prompt=prompt_path.name, transcript=transcript_path.name, data=df + ) + + +def generate_all_transcripts( + transcript_dir: Path, prompt_path: Path, temp: float, model_name: str, measure: bool, cache_dir: Path, llm: LLM +) -> List[TranscriptOutput]: + """Generates AI conversations for all transcripts in a directory. + + Args: + transcript_dir (Path): Directory containing transcript files. + prompt_path (Path): Path to the prompt file. + temp (float): Temperature parameter for the LLM. + model_name (str): Name of the model to use. + measure (bool): Whether to measure performance (e.g., tokens, latency). + cache_dir (Path): Directory to store cached responses. + llm (LLM): instantiated model being used. + + Returns: + List[TranscriptOutput]: A list of `TranscriptOutput` objects. 
+ """ + outputs = [] + for transcript_path in transcript_dir.iterdir(): + cache_path = cache_dir / f"{transcript_path.stem}_cache.pkl" + outputs.append( + generate_ai_conversation(transcript_path, prompt_path, temp, model_name, measure, cache_path, llm) + ) + return outputs + + +if __name__ == "__main__": + transcript_dir = Path("/home/goshdam/transcripts") + prompt_path = Path("/home/goshdam/prompts/V2_1076.txt") + temp = 0.5 + model_name = "llama3-70b" + + llm = LLM(model_name) + + if sys.argv[1] == "run": + output_path = Path("/home/goshdam/outputs/outputs_llama.pkl") + cache_dir = Path("/home/goshdam/outputs/cache") + measure = True + + cache_dir.mkdir(parents=True, exist_ok=True) + + outputs = generate_all_transcripts(transcript_dir, prompt_path, temp, model_name, measure, cache_dir, llm) + + with open(output_path, "wb") as f: + pickle.dump(outputs, f) + + print(f"Successfully saved all {len(outputs)} outputs to {output_path}") + + elif sys.argv[1] == "server": + llm.start_server(num_gpus=4) diff --git a/src/senselab/text/tasks/llms/data_ingest.py b/src/senselab/text/tasks/llms/transcript_manager.py similarity index 64% rename from src/senselab/text/tasks/llms/data_ingest.py rename to src/senselab/text/tasks/llms/transcript_manager.py index b4c7e8a6..729c7840 100644 --- a/src/senselab/text/tasks/llms/data_ingest.py +++ b/src/senselab/text/tasks/llms/transcript_manager.py @@ -4,22 +4,27 @@ from pathlib import Path from typing import Dict, List +import tiktoken -class MessagesManager: +from senselab.utils.data_structures.script_line import ScriptLine + + +class Transcript: """Manages message data for interactions with a LLM. Provides methods to load transcripts, convert JSON data to message objects, and generate data from a human conversation to query potential AI responses. Attributes: - messages (List[Dict[str, str]]): A list of message objects for the OpenAI API. + scriptlines (List[Scriptline]): A list of Scriptline objects. Methods: __init__(transcript_path: Path) -> None: Initializes the manager with a transcript file path. - print_human_readable(messages: List[Dict[str, str]]) -> None: Prints messages in a readable format. - extract_response_opportunities() -> List[List[Dict[str, str]]]: Extracts sublists ending with user input. + print_human_readable() -> None: Prints messages in a readable format. + extract_response_opportunities() -> List[List[Scriptline]]: Extracts sublists ending with user input. + get_num_tokens()-> int: total number of tokens in transcript _load_transcript(json_path: Path) -> Dict: Loads a JSON transcript from a file. - convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]: Converts transcript format to LLM format. + convert_json_to_scriptlines(json_obj: Dict) -> List[ScriptLine]: Converts transcript format to LLM format. """ def __init__(self, transcript_path: Path) -> None: @@ -28,21 +33,30 @@ def __init__(self, transcript_path: Path) -> None: Args: transcript_path (Path): The path to the JSON transcript file. """ + if not transcript_path.exists(): + raise ValueError("Transcript path not found!") json_obj = self._load_transcript(transcript_path) - self.messages = self.convert_json_to_messages(json_obj) + self.scriptlines = self.convert_json_to_scriptlines(json_obj) - @staticmethod - def print_human_readable(messages: List[Dict[str, str]]) -> None: - """Print a list of messages in a human-readable format. 
+ def print_human_readable(self) -> None: + """Prints the stored scriptlines in a human-readable format.""" + for message in self.scriptlines: + print(f"{message.speaker}:\t\t{message.text}\n") - Args: - messages (List[Dict[str, str]]): List of messages where each message is a dictionary - with 'role' and 'content' keys. - """ - for message in messages: - print(f'{message["role"]}:\t\t{message["content"]}\n') + def get_num_tokens(self) -> int: + """Returns the total number of OpenAI tokens in the conversation. - def extract_response_opportunities(self) -> List[List[Dict[str, str]]]: + Returns: + int: number of tokens + """ + c = 0 + encoding = tiktoken.encoding_for_model("gpt-4o") + for message in self.scriptlines: + if message.text: + c += len(encoding.encode(message.text)) + return c + + def extract_response_opportunities(self) -> List[List[ScriptLine]]: """Extract consecutive sublists from the messages list, ending after every 'user' response. This is used to compare AI responses to a human's response @@ -50,15 +64,15 @@ def extract_response_opportunities(self) -> List[List[Dict[str, str]]]: natural conversation before making its own response. Returns: - List[List[Dict[str, str]]]: A list of consecutive sublists, each starting from the - beginning of the messages list and ending with a - message where the role is "user". + List[ScriptLine]: A list of sublists, each starting from the + beginning of the messages list and ending with the next + sequential message where the role is "user". """ sublists = [] - for i, message in enumerate(self.messages): - if message["role"] == "user" and i > 0: - sublist = self.messages[0 : i + 1] + for i, message in enumerate(self.scriptlines): + if message.speaker == "user" and i > 0: + sublist = self.scriptlines[0 : i + 1] sublists.append(sublist) return sublists @@ -77,11 +91,13 @@ def _load_transcript(json_path: Path) -> Dict: Dict: The JSON object loaded from the file. """ with open(json_path, "r", encoding="utf-8") as file: - return json.load(file) + data = json.load(file) + + return data @staticmethod - def convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]: - """Converts transcript segments to list of message objects, excluding system messages. + def convert_json_to_scriptlines(json_obj: Dict) -> List[ScriptLine]: + """Converts transcript segments to list of ScriptLine objects. The input JSON object should have the following structure: { @@ -106,19 +122,6 @@ def convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]: ] } - The output will be a list of message objects, - suitable for OpenAI API, with the following structure: - [ - { - "role": "user", - "content": "" - }, - { - "role": "assistant", - "content": "" - }, - ... - ] The conversion will map the "teacher" speaker role to "assistant" and the "kid" speaker role to "user". @@ -127,7 +130,7 @@ def convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]: json_obj (Dict): The input JSON object containing conversation segments. Returns: - List[Dict[str, str]]: A list of message objects in the format required by the OpenAI API. + List[ScriptLine]: See src/senselab/utils/data_structures/script_line.py Raises: ValueError: If the input JSON structure is invalid or contains an unknown speaker role. 
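# A minimal sketch of the conversion described above, patterned on the fixture in
# transcript_manager_test.py further down; the segment dictionary is illustrative only.
# Word-level "teacher" labels map to the "assistant" role, "kid" labels map to "user",
# and consecutive words with the same role are merged into a single ScriptLine.
from senselab.text.tasks.llms.transcript_manager import Transcript

example_segments = {
    "segments": [
        {
            "start": 0.0,
            "end": 1.5,
            "speaker": "kid",
            "words": [
                {"word": "uh", "start": 0.0, "end": 0.5, "score": 1.0, "speaker": "kid"},
                {"word": "hello", "start": 0.6, "end": 1.0, "score": 1.0, "speaker": "teacher"},
                {"word": "there", "start": 1.1, "end": 1.5, "score": 1.0, "speaker": "teacher"},
            ],
        },
    ]
}

for line in Transcript.convert_json_to_scriptlines(example_segments):
    print(f"{line.speaker}: {line.text}")
# Expected output:
#   user: uh
#   assistant: hello there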
@@ -136,7 +139,7 @@ def convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]:
         if not (isinstance(json_obj, dict) and isinstance(json_obj.get("segments"), list)):
             raise ValueError("Invalid JSON structure: must be a dictionary with a 'segments' list")
 
-        messages = []
+        scriptlines = []
         current_role: str = ""
         current_content: List[str] = []
 
@@ -157,17 +160,18 @@ def convert_json_to_messages(json_obj: Dict) -> List[Dict[str, str]]:
             elif speaker == "kid":
                 role = "user"
             else:
-                raise ValueError(f"Unknown speaker role: {speaker}")
+                continue
 
             if role != current_role:
                 if current_content:
-                    messages.append({"role": current_role, "content": " ".join(current_content)})
+                    scriptlines.append(ScriptLine(text=" ".join(current_content), speaker=current_role))
+
                 current_role = role
                 current_content = [word]
             else:
                 current_content.append(word)
 
         if current_content:
-            messages.append({"role": current_role, "content": " ".join(current_content)})
+            scriptlines.append(ScriptLine(text=" ".join(current_content), speaker=current_role))
 
-        return messages
+        return scriptlines
diff --git a/src/senselab/utils/data_structures/llm_response.py b/src/senselab/utils/data_structures/llm_response.py
new file mode 100644
index 00000000..10839541
--- /dev/null
+++ b/src/senselab/utils/data_structures/llm_response.py
@@ -0,0 +1,5 @@
+"""This module contains the definition of the LLMResponse object."""
+
+from collections import namedtuple
+
+LLMResponse = namedtuple("LLMResponse", ["content", "latency", "in_tokens", "out_tokens"])
diff --git a/src/senselab/utils/data_structures/script_line.py b/src/senselab/utils/data_structures/script_line.py
index 79ad1b72..6f8ed12b 100644
--- a/src/senselab/utils/data_structures/script_line.py
+++ b/src/senselab/utils/data_structures/script_line.py
@@ -88,6 +88,14 @@ def get_chunks(self) -> Optional[List["ScriptLine"]]:
         """
         return self.chunks
 
+    def __repr__(self) -> str:
+        """Return a string representation of the ScriptLine object.
+
+        Returns:
+            str: A formatted string with the object's attributes.
+        """
+        return f"ScriptLine(speaker={self.speaker}, text={self.text})"
+
     @classmethod
     def from_dict(cls, d: Dict[str, Any]) -> "ScriptLine":
         """Create a ScriptLine instance from a dictionary.
diff --git a/src/senselab/utils/data_structures/transcript_output.py b/src/senselab/utils/data_structures/transcript_output.py
new file mode 100644
index 00000000..292c56f4
--- /dev/null
+++ b/src/senselab/utils/data_structures/transcript_output.py
@@ -0,0 +1,29 @@
+"""This module contains the definition of the TranscriptOutput object."""
+
+from dataclasses import dataclass
+
+import pandas as pd
+
+
+@dataclass
+class TranscriptOutput:
+    """Represents an output from an AI conversation transcript."""
+
+    temp: float
+    model: str
+    prompt: str
+    transcript: str
+    data: pd.DataFrame
+
+    def __str__(self) -> str:
+        """Return a formatted string representation of the transcript.
+
+        Returns:
+            str: A formatted string representing the transcript.
+ """ + output = "" + for _, row in self.data.iterrows(): + output += f"Student:\t{row['student']}\n\n" + output += f"Teacher:\t{row['teacher']}\n" + output += f"AI:\t{row['AI']}\n\n" + return output diff --git a/src/tests/text/tasks/llms_test.py b/src/tests/text/tasks/llms_test.py deleted file mode 100644 index 08a7aa4c..00000000 --- a/src/tests/text/tasks/llms_test.py +++ /dev/null @@ -1,60 +0,0 @@ -"""This module is for testing the conversion of JSON conversation segments to message objects.""" - -import os -from typing import List - -import pytest - -from senselab.text.tasks.llms.data_ingest import MessagesManager - -if os.getenv("GITHUB_ACTIONS") != "true": - - @pytest.fixture - def sample_json_obj() -> dict: - """Fixture for a sample JSON object representing conversation segments.""" - return { - "segments": [ - { - "start": 0.0, - "end": 1.0, - "words": [ - {"word": "uh", "start": 0.0, "end": 0.5, "score": 1.0, "speaker": "kid"}, - {"word": "hello", "start": 0.6, "end": 1.0, "score": 1.0, "speaker": "teacher"}, - ], - "speaker": "kid", - }, - { - "start": 1.0, - "end": 2.0, - "words": [ - {"word": "world", "start": 1.0, "end": 1.5, "score": 1.0, "speaker": "teacher"}, - {"word": "namaste", "start": 1.6, "end": 2.0, "score": 1.0, "speaker": "teacher"}, - ], - "speaker": "teacher", - }, - { - "start": 2.0, - "end": 3.0, - "words": [ - {"word": "kemosabe", "start": 2.0, "end": 2.5, "score": 1.0, "speaker": "teacher"}, - {"word": "hi", "start": 2.6, "end": 2.8, "score": 1.0, "speaker": "kid"}, - {"word": "there", "start": 2.9, "end": 3.0, "score": 1.0, "speaker": "kid"}, - ], - "speaker": "kid", - }, - ] - } - - @pytest.fixture - def expected_messages() -> List[dict]: - """Fixture for the expected list of message objects.""" - return [ - {"role": "user", "content": "uh"}, - {"role": "assistant", "content": "hello world namaste kemosabe"}, - {"role": "user", "content": "hi there"}, - ] - - def test_convert_json_to_messages(sample_json_obj: dict, expected_messages: List[dict]) -> None: - """Test the conversion of JSON conversation segments to message objects.""" - result = MessagesManager.convert_json_to_messages(sample_json_obj) - assert result == expected_messages diff --git a/src/tests/text/tasks/transcript_manager_test.py b/src/tests/text/tasks/transcript_manager_test.py new file mode 100644 index 00000000..f49b2407 --- /dev/null +++ b/src/tests/text/tasks/transcript_manager_test.py @@ -0,0 +1,101 @@ +"""Test cases for the transcript manager module.""" + +import json +import os +from pathlib import Path +from typing import List + +import pytest + +from senselab.text.tasks.llms.transcript_manager import Transcript +from senselab.utils.data_structures.script_line import ScriptLine + +if os.getenv("GITHUB_ACTIONS") != "true": + + @pytest.fixture + def sample_json_obj() -> dict: + """Fixture for a sample JSON object representing conversation segments.""" + return { + "segments": [ + { + "start": 0.0, + "end": 1.0, + "words": [ + {"word": "uh", "start": 0.0, "end": 0.5, "score": 1.0, "speaker": "kid"}, + {"word": "hello", "start": 0.6, "end": 1.0, "score": 1.0, "speaker": "teacher"}, + ], + "speaker": "kid", + }, + { + "start": 1.0, + "end": 2.0, + "words": [ + {"word": "world", "start": 1.0, "end": 1.5, "score": 1.0, "speaker": "teacher"}, + {"word": "namaste", "start": 1.6, "end": 2.0, "score": 1.0, "speaker": "teacher"}, + ], + "speaker": "teacher", + }, + { + "start": 2.0, + "end": 3.0, + "words": [ + {"word": "kemosabe", "start": 2.0, "end": 2.5, "score": 1.0, "speaker": "teacher"}, + 
{"word": "hi", "start": 2.6, "end": 2.8, "score": 1.0, "speaker": "kid"}, + {"word": "there", "start": 2.9, "end": 3.0, "score": 1.0, "speaker": "kid"}, + ], + "speaker": "kid", + }, + ] + } + + @pytest.fixture + def sample_transcript(tmp_path: Path, sample_json_obj: dict) -> Path: + """Fixture to create a sample transcript file.""" + transcript_file = tmp_path / "transcript.json" + with transcript_file.open("w") as f: + json.dump(sample_json_obj, f) + return transcript_file + + @pytest.fixture + def expected_messages() -> List[ScriptLine]: + """Fixture for the expected list of message objects.""" + return [ + ScriptLine(speaker="user", text="uh"), + ScriptLine(speaker="assistant", text="hello world namaste kemosabe"), + ScriptLine(speaker="user", text="hi there"), + ] + + def test_convert_json_to_messages(sample_json_obj: dict, expected_messages: List[ScriptLine]) -> None: + """Test the conversion of JSON conversation segments to message objects.""" + result = Transcript.convert_json_to_scriptlines(sample_json_obj) + assert result == expected_messages + + def test_missing_word_or_speaker_field() -> None: + """Test behavior when word or speaker field is missing from the segment.""" + invalid_json = { + "segments": [ + { + "start": 0.0, + "end": 1.0, + "words": [{"word": "hello"}], # Missing speaker + "speaker": "teacher", + } + ] + } + with pytest.raises(ValueError, match="Invalid word structure"): + Transcript.convert_json_to_scriptlines(invalid_json) + + def test_get_num_tokens(sample_transcript: Path) -> None: + """Test the ability of the program to return the correct number of expected tokens.""" + transcript = Transcript(sample_transcript) # Initialize the transcript + result = transcript.get_num_tokens() # Get the token count + assert result == 10 + + def test_response_opportunities_extraction(sample_transcript: Path) -> None: + """Test the extraction of response opportunities.""" + transcript = Transcript(sample_transcript) + opportunities = transcript.extract_response_opportunities() + + assert len(opportunities) == 2, "Expected two response opportunities" + assert opportunities[0][-1].speaker == "user", "Expected last message to be first message from user" + assert opportunities[1][-1].speaker == "user", "Expected last message to be second message from 'user'" From 0fe4dd6439d4baafcb0c1c6a445d265092856a1d Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Tue, 17 Sep 2024 13:41:47 -0400 Subject: [PATCH 7/9] updated pyproject.toml --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5e6d58be..0d1d175d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,9 @@ umap-learn = "~=0.5" scikit-learn = "~=1.5" nltk = "~=3.8" rouge-score = "~=0.1" +tiktoken = "^0.7.0" +sacrebleu = "^2.4.3" +pytest-testmon = "^2.1.1" [tool.poetry.group.dev] optional = true From 74cedbcf7e205b76d87ad40c607152f76dec483e Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Tue, 17 Sep 2024 14:03:12 -0400 Subject: [PATCH 8/9] updated pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f117fabe..9bb4ea8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,8 @@ tiktoken = "^0.7.0" sacrebleu = "^2.4.3" pytest-testmon = "^2.1.1" vocos = "~=0.1" +deepeval = "^1.2.2" +textstat = "^0.7.4" [tool.poetry.group.dev] optional = true From 3bbc485674167153eaa047b8aec59c1e9ad14fa1 Mon Sep 17 00:00:00 2001 From: Bruce Atwood Date: Tue, 10 Dec 2024 11:40:07 -0500 Subject: [PATCH 9/9] 
updated lllms --- requirements.txt | 15 ++++++++ src/senselab/text/tasks/llms/llm.py | 7 ++-- .../tasks/llms/process_transcript_example.py | 36 +++++++++++-------- 3 files changed, 40 insertions(+), 18 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..9ce2f9ef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +datasets==2.20.0 +ffmpeg_python==0.2.0 +huggingface_hub==0.23.5 +jiwer==3.0.5 +nest_asyncio==1.6.0 +nltk==3.9.1 +scikit_learn==1.5.2 +scipy==1.14.1 +spacy==3.7.5 +textstat==0.7.4 +threadpoolctl==3.5.0 +torch_audiomentations==0.11.1 +transformers==4.45.1 +typing_extensions==4.12.2 +umap_learn==0.5.6 diff --git a/src/senselab/text/tasks/llms/llm.py b/src/senselab/text/tasks/llms/llm.py index 6a2b2599..c5e90c9b 100644 --- a/src/senselab/text/tasks/llms/llm.py +++ b/src/senselab/text/tasks/llms/llm.py @@ -4,6 +4,7 @@ LLM: A unified interface for interacting with various LLMs. """ +import os import time from subprocess import PIPE, Popen, check_output from typing import List, Optional, Tuple @@ -52,7 +53,7 @@ def __init__(self, model_name: str) -> None: self._client = OpenAI(base_url=self._serving_url) - def start_server(self, num_gpus: int, timeout: int = 300) -> Optional[Popen]: + def start_server(self, num_gpus: int, timeout: int = 700) -> Optional[Popen]: """Starts the VLLM server with the specified number of GPUs and logs the output. Args: @@ -155,8 +156,8 @@ def _get_model(self, model: str) -> Tuple[str, str]: ValueError: If the model name is unsupported. """ model_mapping = { - "llama3-70b": ("meta-llama/Meta-Llama-3.1-70B-Instruct", "http://localhost:8000/v1"), - "llama3-8b": ("meta-llama/Meta-Llama-3.1-8B-Instruct", "http://localhost:8000/v1"), + "llama3-70b": ("meta-llama/Meta-Llama-3.1-70B-Instruct", f"http://{os.getenv('VLLM_IP_ADDRESS')}:8000/v1"), + "llama3-8b": ("meta-llama/Meta-Llama-3.1-8B-Instruct", f"http://{os.getenv('VLLM_IP_ADDRESS')}:8000/v1"), "gpt-4o": ("gpt-4o", "https://api.openai.com/v1"), } if model in model_mapping: diff --git a/src/senselab/text/tasks/llms/process_transcript_example.py b/src/senselab/text/tasks/llms/process_transcript_example.py index 6365fa5b..f6badf3b 100644 --- a/src/senselab/text/tasks/llms/process_transcript_example.py +++ b/src/senselab/text/tasks/llms/process_transcript_example.py @@ -1,7 +1,8 @@ """Example usage of llms directory to process AI responses from transcript.""" +import os import pickle -import sys +import time from pathlib import Path from typing import Generator, List @@ -117,26 +118,31 @@ def generate_all_transcripts( if __name__ == "__main__": - transcript_dir = Path("/home/goshdam/transcripts") - prompt_path = Path("/home/goshdam/prompts/V2_1076.txt") + transcript_dir = Path("/home/goshdam/to_do") + prompt_path = Path("/home/goshdam/prompts/V2_1038.txt") temp = 0.5 model_name = "llama3-70b" - llm = LLM(model_name) - if sys.argv[1] == "run": - output_path = Path("/home/goshdam/outputs/outputs_llama.pkl") - cache_dir = Path("/home/goshdam/outputs/cache") - measure = True + timeout = 700 # in seconds + poll_interval = 5 # interval to check in seconds + start_time = time.time() + + while os.getenv("VLLM_STATUS") != "Running": + elapsed_time = time.time() - start_time + if elapsed_time > timeout: + raise TimeoutError(f"Timed out after {timeout} seconds waiting for VLLM_STATUS to be 'Running'.") + time.sleep(poll_interval) - cache_dir.mkdir(parents=True, exist_ok=True) + output_path = Path("/home/goshdam/outputs/outputs_llama.pkl") + 
cache_dir = Path("/home/goshdam/outputs/cache") + measure = False - outputs = generate_all_transcripts(transcript_dir, prompt_path, temp, model_name, measure, cache_dir, llm) + cache_dir.mkdir(parents=True, exist_ok=True) - with open(output_path, "wb") as f: - pickle.dump(outputs, f) + outputs = generate_all_transcripts(transcript_dir, prompt_path, temp, model_name, measure, cache_dir, llm) - print(f"Successfully saved all {len(outputs)} outputs to {output_path}") + with open(output_path, "wb") as f: + pickle.dump(outputs, f) - elif sys.argv[1] == "server": - llm.start_server(num_gpus=4) + print(f"Successfully saved all {len(outputs)} outputs to {output_path}")
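
Taken together, these patches turn the earlier stdout-printing example into a cached, DataFrame-backed pipeline: a transcript is parsed into ScriptLine objects, response opportunities are extracted, and a vLLM- or OpenAI-served model is queried at each opportunity. The sketch below illustrates the intended call flow; it assumes the package is installed with these patches applied, a vLLM server is already running and reachable through the VLLM_IP_ADDRESS environment variable, and the transcript and prompt paths are placeholders rather than real files.

from pathlib import Path

from senselab.text.tasks.llms.llm import LLM
from senselab.text.tasks.llms.transcript_manager import Transcript

transcript = Transcript(Path("example_transcript.json"))      # placeholder path
system_instruction = Path("example_prompt.txt").read_text()   # placeholder path

print(f"Transcript size: {transcript.get_num_tokens()} tokens")

# "llama3-8b" maps to a vLLM endpoint; "gpt-4o" would route to the OpenAI API instead.
llm = LLM("llama3-8b")

# Query the model at every point where the human ("user") responded, so AI turns
# can later be compared against the real conversation.
for messages in transcript.extract_response_opportunities():
    response = llm.call(
        messages=messages,
        system_instruction=system_instruction,
        max_tokens=200,
        temperature=0.5,
        measure=True,
    )
    # LLMResponse is a namedtuple with content, latency, in_tokens, out_tokens.
    print(response.content, response.latency, response.in_tokens, response.out_tokens)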