From 5daf69f774027c7571c593940569b48d0a4cdbbd Mon Sep 17 00:00:00 2001
From: fahad_shaikh
Date: Tue, 3 Dec 2024 16:48:26 +0530
Subject: [PATCH 1/5] added llm classifier

---
 kairon/nlu/__init__.py                   |   1 +
 .../classifiers}/__init__.py             |   0
 .../openai.py => nlu/classifiers/llm.py} | 171 ++++++++++--------
 kairon/shared/actions/utils.py           |   3 -
 kairon/shared/nlu/featurizer/openai.py   |  22 ++-
 requirements/prod.txt                    |   3 +-
 6 files changed, 111 insertions(+), 89 deletions(-)
 create mode 100644 kairon/nlu/__init__.py
 rename kairon/{shared/nlu/classifier => nlu/classifiers}/__init__.py (100%)
 rename kairon/{shared/nlu/classifier/openai.py => nlu/classifiers/llm.py} (57%)

diff --git a/kairon/nlu/__init__.py b/kairon/nlu/__init__.py
new file mode 100644
index 000000000..e3b801c51
--- /dev/null
+++ b/kairon/nlu/__init__.py
@@ -0,0 +1 @@
+from .classifiers.llm import LLMClassifier
\ No newline at end of file
diff --git a/kairon/shared/nlu/classifier/__init__.py b/kairon/nlu/classifiers/__init__.py
similarity index 100%
rename from kairon/shared/nlu/classifier/__init__.py
rename to kairon/nlu/classifiers/__init__.py
diff --git a/kairon/shared/nlu/classifier/openai.py b/kairon/nlu/classifiers/llm.py
similarity index 57%
rename from kairon/shared/nlu/classifier/openai.py
rename to kairon/nlu/classifiers/llm.py
index 2bf8abbb7..7160a012b 100644
--- a/kairon/shared/nlu/classifier/openai.py
+++ b/kairon/nlu/classifiers/llm.py
@@ -3,6 +3,8 @@
 import typing
 from typing import Any, Dict, List, Optional, Text
 from abc import ABC
+
+from pydantic import BaseModel
 from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.shared.nlu.training_data.message import Message
 from rasa.shared.nlu.training_data.training_data import TrainingData
@@ -11,24 +13,39 @@
 import faiss
 import rasa.utils.io as io_utils
 import os
-from rasa.shared.nlu.constants import TEXT, INTENT
-import openai
+from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES
 import numpy as np
+from tensorflow.python.ops.gen_batch_ops import batch
 from tqdm import tqdm
 from rasa.engine.graph import GraphComponent, ExecutionContext
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
+import litellm
+from rasa.shared.utils.io import create_directory_for_file
+from more_itertools import chunked
+
+litellm.drop_params = True
 
 logger = logging.getLogger(__name__)
 
 if typing.TYPE_CHECKING:
     pass
 
+class Entities(BaseModel):
+    value: str
+    start: int
+    end: int
+    entity: str
+
+
+class ClassifierOutput(BaseModel):
+    intent: str
+    entities: List[Entities] = None
 
 @DefaultV1Recipe.register(
-    DefaultV1Recipe.ComponentType.INTENT_CLASSIFIER, is_trainable=False
+    DefaultV1Recipe.ComponentType.INTENT_CLASSIFIER, is_trainable=True
 )
-class OpenAIClassifier(IntentClassifier, GraphComponent, ABC):
-    """Intent and Entity classifier using the OpenAI Completion framework"""
+class LLMClassifier(IntentClassifier, GraphComponent, ABC):
+    """Intent and Entity classifier using an LLM completion framework"""
 
     system_prompt = "You will be provided with a text, and your task is to classify its intent as {0}. Provide output in json format with the following keys intent, explanation, text."
@@ -41,7 +58,7 @@ def __init__(
         vector: Optional[faiss.IndexFlatIP] = None,
         data: Optional[Dict[Text, Any]] = None,
     ) -> None:
-        """Construct a new intent classifier using the OpenAI Completion framework."""
+        """Construct a new intent classifier using an LLM completion framework."""
         self.component_config = config
         self._model_storage = model_storage
         self._resource = resource
@@ -55,15 +72,16 @@ def __init__(
 
         self.data = data
 
+    @classmethod
     def required_packages(cls) -> List[Text]:
-        return ["openai", "faiss", "numpy"]
+        return ["litellm", "numpy"]
 
     @staticmethod
     def get_default_config() -> Dict[Text, Any]:
         return {
             "bot_id": None,
-            "prediction_model": "gpt-4",
+            "prediction_model": "gpt-4o-mini",
             "embedding_model": "text-embedding-3-small",
             "embedding_size": 1536,
             "top_k": 5,
@@ -77,103 +95,105 @@ def load_api_key(self, bot_id: Text):
             from kairon.shared.admin.processor import Sysadmin
             llm_secret = Sysadmin.get_llm_secret("openai", bot_id)
             self.api_key = llm_secret.get('api_key')
-        elif os.environ.get("OPENAI_API_KEY"):
-            self.api_key = os.environ.get("OPENAI_API_KEY")
+        elif os.environ.get("LLM_API_KEY"):
+            self.api_key = os.environ.get("LLM_API_KEY")
         else:
             raise KeyError(
-                f"either set bot_id'in OpenAIClassifier config or set OPENAI_API_KEY in environment variables"
+                "either set bot_id in LLMClassifier config or set LLM_API_KEY in environment variables"
             )
 
     def get_embeddings(self, text):
-        embedding = openai.Embedding.create(
+        embeddings = litellm.embedding(
             model="text-embedding-3-small", input=text, api_key=self.api_key
-        )["data"][0]["embedding"]
-        return embedding
+        )
+        return [embedding['embedding'] for embedding in embeddings['data']]
 
-    def process_training_data(self, training_data: TrainingData) -> TrainingData:
-        """Train the intent classifier on a data set."""
+    def train(self, training_data: TrainingData) -> Resource:
+        """Train the intent classifier on a data set."""
         data_map = []
         vector_map = []
-        for example in tqdm(training_data.intent_examples):
-            vector_map.append(self.get_embeddings(example.get(TEXT)))
-            data_map.append({"text": example.get(TEXT), "intent": example.get(INTENT)})
+        batch_size = 100
+        with tqdm(total=len(training_data.intent_examples)) as pbar:
+            counter = 1
+            for chunks in chunked(training_data.intent_examples, batch_size):
+                data = [{"text": example.get(TEXT), INTENT: example.get(INTENT), ENTITIES: example.get(ENTITIES)} for example in chunks]
+                vector_data = [example.get(TEXT) for example in chunks]
+                vector_map.extend(self.get_embeddings(vector_data))
+                data_map.extend(data)
+                pbar.update(batch_size*counter)
+                counter += 1
+
         np_vector = np.asarray(vector_map, dtype=np.float32)
         faiss.normalize_L2(np_vector)
         self.vector.add(np_vector)
         self.data = data_map
-        return training_data
+        self.persist()
+        return self._resource
 
     def prepare_context(self, embeddings, text):
         dist, indx = self.vector.search(
             np.asarray([embeddings], dtype=np.float32),
             k=self.component_config.get("top_k", 5),
         )
-        labels = ",".join(set(self.data[i]["intent"] for i in indx[0]))
+        labels = []
+        context = ""
+        for i in indx[0]:
+            labels.append(self.data[i]["intent"])
+            context += "text: "+self.data[i]["intent"]+"\nclassifier: {'intent': "+self.data[i][INTENT]+", 'entities': "+self.data[i][ENTITIES]+"}"
+
         messages = [
             {"role": "system", "content": self.system_prompt.format(labels)},
+            {"role": "user", "content": f"""##{self.system_prompt}\n##Based on the samples below, generate the intent. If the text does not belong to {labels} then classify it as nlu_fallback\n\n{context}\n\ntext: {text}\nclassifier"""}
         ]
-        context = "\n\n".join(
-            f"\n\ntext: {self.data[i]['text']}\nintent: {self.data[i]['intent']}"
-            for i in indx[0]
-        )
-        messages.append(
-            {
-                "role": "user",
-                "content": f"##{self.system_prompt}\n\n##Based on the below sample generate the intent.If text does not belongs to the labels then classify it as nlu_fallback\n\n{context}\n\ntext: {text}",
-            }
-        )
+        return messages
 
     def predict(self, text):
         embedding = self.get_embeddings(text)
         messages = self.prepare_context(embedding, text)
-        retry = 0
         intent = None
         explanation = None
-        while retry < self.component_config.get("retry", 3):
-            try:
-                response = openai.ChatCompletion.create(
-                    model=self.component_config.get("prediction_model", "gpt-3.5-turbo"),
-                    messages=messages,
-                    temperature=self.component_config.get("temperature", 0.0),
-                    max_tokens=self.component_config.get("max_tokens", 50),
-                    top_p=1,
-                    frequency_penalty=0,
-                    presence_penalty=0,
-                    stop=["\n\n"],
-                    api_key=self.api_key,
-                )
-                logger.debug(response)
-                responses = json.loads(response.choices[0]["message"]["content"])
-                intent = responses["intent"] if "intent" in responses.keys() else None
-                explanation = (
-                    responses["explanation"]
-                    if "explanation" in responses.keys()
-                    else None
-                )
-                break
-            except TimeoutError as e:
-                logger.error(e)
-                retry += 1
-                if retry == 3:
-                    raise e
+        try:
+            response = litellm.completion(
+                model=self.component_config.get("prediction_model", "gpt-3.5-turbo"),
+                messages=messages,
+                response_format=ClassifierOutput,
+                temperature=self.component_config.get("temperature", 0.0),
+                max_tokens=self.component_config.get("max_tokens", 50),
+                top_p=1,
+                frequency_penalty=0,
+                presence_penalty=0,
+                api_key=self.api_key,
+                retry=3
+            )
+            logger.debug(response)
+            responses = json.loads(response.choices[0]["message"]["content"])
+            intent = responses["intent"] if "intent" in responses.keys() else None
+            explanation = (
+                responses["explanation"]
                if "explanation" in responses.keys()
                else None
            )
+        except Exception as e:
+            logger.error(e)
         return intent, explanation
 
-    def process(self, message: Message) -> None:
+    def process(self, messages: List[Message]) -> List[Message]:
         """Return the most likely intent and its probability for a message."""
-
-        if not self.vector and not self.data:
-            # component is either not trained or didn't
-            # receive enough training data
-            intent = None
-            intent_ranking = []
-        else:
-            label, reason = self.predict(message.get(TEXT))
-            intent = {"name": label, "confidence": 1, "reason": reason}
-            intent_ranking = []
-
-        message.set("intent", intent, add_to_output=True)
-        message.set("intent_ranking", intent_ranking, add_to_output=True)
+        for message in messages:
+            if not self.vector and not self.data:
+                # component is either not trained or didn't
+                # receive enough training data
+                intent = None
+                intent_ranking = []
+            else:
+                label, reason = self.predict(message.get(TEXT))
+                intent = {"name": label, "confidence": 1, "reason": reason}
+                intent_ranking = []
 
+            message.set("intent", intent, add_to_output=True)
+            message.set("intent_ranking", intent_ranking, add_to_output=True)
+        return messages
 
     @classmethod
     def create(
@@ -182,7 +202,7 @@ def create(
         model_storage: ModelStorage,
         resource: Resource,
         execution_context: ExecutionContext,
-    ) -> "OpenAIClassifier":
+    ) -> "LLMClassifier":
         """Creates a new untrained component (see parent class for full docstring)."""
         return cls(config, model_storage, resource, execution_context)
 
@@ -194,7 +214,7 @@ def load(
         resource: Resource,
         execution_context: ExecutionContext,
         **kwargs: Any,
-    ) -> "OpenAIClassifier":
+    ) -> "LLMClassifier":
         """Loads a policy from the storage (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_path:
@@ -223,6 +243,7 @@ def persist(self) -> None:
         vector_file_name = file_name + "_vector.db"
         data_file_name = file_name + "_data.pkl"
         if self.vector and self.data:
+            create_directory_for_file(model_path)
             faiss.write_index(
                 self.vector, os.path.join(model_path, vector_file_name)
             )
diff --git a/kairon/shared/actions/utils.py b/kairon/shared/actions/utils.py
index f9cbcb621..c2a31396a 100644
--- a/kairon/shared/actions/utils.py
+++ b/kairon/shared/actions/utils.py
@@ -1,6 +1,3 @@
-import time
-
-import ujson as json
 import logging
 import re
 import time
diff --git a/kairon/shared/nlu/featurizer/openai.py b/kairon/shared/nlu/featurizer/openai.py
index 3c0d07d4b..19853a2b9 100644
--- a/kairon/shared/nlu/featurizer/openai.py
+++ b/kairon/shared/nlu/featurizer/openai.py
@@ -190,7 +190,7 @@ def process_training_data(self, training_data: TrainingData) -> TrainingData:
                 batch_start_index += batch_size
         return training_data
 
-    def process(self, message: Message) -> None:
+    def process(self, messages: List[Message]) -> List[Message]:
         """Process an incoming message by computing its tokens and dense features.
 
         Args:
@@ -199,15 +199,17 @@ def process(self, messages: List[Message]) -> List[Message]:
         # process of all featurizers operates only on TEXT and ACTION_TEXT attributes,
         # because all other attributes are labels which are featurized during training
         # and their features are stored by the model itself.
-        for attribute in {TEXT, ACTION_TEXT}:
-            if message.get(attribute):
-                self._set_lm_features(
-                    self._get_docs_for_batch(
-                        [message], attribute=attribute
-                    )[0],
-                    message,
-                    attribute,
-                )
+        for message in messages:
+            for attribute in {TEXT, ACTION_TEXT}:
+                if message.get(attribute):
+                    self._set_lm_features(
+                        self._get_docs_for_batch(
+                            [message], attribute=attribute
+                        )[0],
+                        message,
+                        attribute,
+                    )
+        return messages
 
     def _set_lm_features(
         self, doc: Dict[Text, Any], message: Message, attribute: Text = TEXT
diff --git a/requirements/prod.txt b/requirements/prod.txt
index 1e8ad7217..671749292 100644
--- a/requirements/prod.txt
+++ b/requirements/prod.txt
@@ -67,4 +67,5 @@ jsonschema_rs==0.18.1
 mongoengine-jsonschema==0.1.3
 fernet==1.0.1
 google-generativeai
-huggingface-hub==0.25.2
\ No newline at end of file
+huggingface-hub==0.25.2
+more-itertools
\ No newline at end of file

From aa0cf2086623c648b4172e8956d3c24294f6cfad Mon Sep 17 00:00:00 2001
From: fahad_shaikh
Date: Tue, 3 Dec 2024 16:58:59 +0530
Subject: [PATCH 2/5] corrected litellm logging and progress bar update

---
 kairon/nlu/classifiers/llm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kairon/nlu/classifiers/llm.py b/kairon/nlu/classifiers/llm.py
index 7160a012b..1745f4e09 100644
--- a/kairon/nlu/classifiers/llm.py
+++ b/kairon/nlu/classifiers/llm.py
@@ -15,15 +15,16 @@ import os
 from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES
 import numpy as np
-from tensorflow.python.ops.gen_batch_ops import batch
 from tqdm import tqdm
 from rasa.engine.graph import GraphComponent, ExecutionContext
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 import litellm
 from rasa.shared.utils.io import create_directory_for_file
 from more_itertools import chunked
+import os
 
 litellm.drop_params = True
+os.environ["LITELLM_LOG"] = "ERROR"
 
 logger = logging.getLogger(__name__)
 
@@ -120,7 +121,7 @@ def train(self, training_data: TrainingData) -> Resource:
                 vector_data = [example.get(TEXT) for example in chunks]
                 vector_map.extend(self.get_embeddings(vector_data))
                 data_map.extend(data)
-                pbar.update(batch_size*counter)
+                pbar.update(batch_size)
                 counter += 1
 
         np_vector = np.asarray(vector_map, dtype=np.float32)

From f0ceb08ee8d98dec8db063414dd9d95a0c9bae34 Mon Sep 17 00:00:00 2001
From: fahad_shaikh
Date: Thu, 5 Dec 2024 17:20:43 +0530
Subject: [PATCH 3/5] prompt corrected

---
 kairon/nlu/classifiers/llm.py | 109 +++++++++++++++++++++-------------
 1 file changed, 68 insertions(+), 41 deletions(-)

diff --git a/kairon/nlu/classifiers/llm.py b/kairon/nlu/classifiers/llm.py
index 1745f4e09..22740ddfb 100644
--- a/kairon/nlu/classifiers/llm.py
+++ b/kairon/nlu/classifiers/llm.py
@@ -12,8 +12,7 @@ from rasa.engine.storage.storage import ModelStorage
 import faiss
 import rasa.utils.io as io_utils
-import os
-from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES
+from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ENTITY_ATTRIBUTE_TYPE
 import numpy as np
 from tqdm import tqdm
 from rasa.engine.graph import GraphComponent, ExecutionContext
@@ -22,6 +21,7 @@
 from rasa.shared.utils.io import create_directory_for_file
 from more_itertools import chunked
 import os
+from rasa.nlu.extractors.extractor import EntityExtractorMixin
 
 litellm.drop_params = True
 os.environ["LITELLM_LOG"] = "ERROR"
@@ -31,24 +31,13 @@
 if typing.TYPE_CHECKING:
     pass
 
-class Entities(BaseModel):
-    value: str
-    start: int
-    end: int
-    entity: str
-
-
-class ClassifierOutput(BaseModel):
-    intent: str
-    entities: List[Entities] = None
 
 @DefaultV1Recipe.register(
     DefaultV1Recipe.ComponentType.INTENT_CLASSIFIER, is_trainable=True
 )
-class LLMClassifier(IntentClassifier, GraphComponent, ABC):
+class LLMClassifier(IntentClassifier, GraphComponent, EntityExtractorMixin, ABC):
     """Intent and Entity classifier using an LLM completion framework"""
 
-    system_prompt = "You will be provided with a text, and your task is to classify its intent as {0}. Provide output in json format with the following keys intent, explanation, text."
+    system_prompt = "You will be provided with a text, and your task is to classify its intent and entities. Provide output in json format with the following keys intent, explanation, text and entities."
     def __init__(
         self,
@@ -87,7 +76,6 @@ def get_default_config() -> Dict[Text, Any]:
             "embedding_size": 1536,
             "top_k": 5,
             "temperature": 0.0,
-            "max_tokens": 50,
             "retry": 3,
         }
 
@@ -105,7 +93,7 @@ def load_api_key(self, bot_id: Text):
 
     def get_embeddings(self, text):
         embeddings = litellm.embedding(
-            model="text-embedding-3-small", input=text, api_key=self.api_key
+            model="text-embedding-3-small", input=text, api_key=self.api_key, max_retries=3
         )
         return [embedding['embedding'] for embedding in embeddings['data']]
 
@@ -117,10 +105,11 @@ def train(self, training_data: TrainingData) -> Resource:
         with tqdm(total=len(training_data.intent_examples)) as pbar:
             counter = 1
             for chunks in chunked(training_data.intent_examples, batch_size):
-                data = [{"text": example.get(TEXT), INTENT: example.get(INTENT), ENTITIES: example.get(ENTITIES)} for example in chunks]
-                vector_data = [example.get(TEXT) for example in chunks]
-                vector_map.extend(self.get_embeddings(vector_data))
-                data_map.extend(data)
+                data = [{"text": example.get(TEXT).strip(), INTENT: example.get(INTENT).strip(), ENTITIES: example.get(ENTITIES)} for example in chunks if example.get(INTENT) and example.get(TEXT)]
+                vector_data = [example.get(TEXT).strip() for example in chunks if example.get(INTENT) and example.get(TEXT)]
+                if data and vector_data:
+                    vector_map.extend(self.get_embeddings(vector_data))
+                    data_map.extend(data)
                 pbar.update(batch_size)
                 counter += 1
 
@@ -136,48 +125,81 @@ def prepare_context(self, embeddings, text):
             np.asarray([embeddings], dtype=np.float32),
             k=self.component_config.get("top_k", 5),
         )
-        labels = []
-        context = ""
+        intents = set()
+        entities = set()
+        data = []
         for i in indx[0]:
-            labels.append(self.data[i]["intent"])
-            context += "text: "+self.data[i]["intent"]+"\nclassifier: {'intent': "+self.data[i][INTENT]+", 'entities': "+self.data[i][ENTITIES]+"}"
+            if self.data[i].get(INTENT):
+                intents.add(self.data[i][INTENT])
+                entities_obj = self.data[i][ENTITIES] if self.data[i][ENTITIES] else []
+                entities.update(entity[ENTITY_ATTRIBUTE_TYPE] for entity in entities_obj)
+                data.append({
+                    'text': self.data[i][TEXT],
+                    'intent': self.data[i][INTENT],
+                    'entities': entities_obj
+                })
 
         messages = [
-            {"role": "system", "content": self.system_prompt.format(labels)},
+            {"role": "user", "content": f"""You will be provided with a text, and your task is to classify its intent and extract any relevant entities. Provide the output in JSON format with the following keys: `intent`, `explanation`, `text`, and `entities`.
+
+### Intents
+The possible intents are:
+{intents}
+
+### Entities
+The entity types that can be extracted are:
+{entities}
+
+Ensure to only extract entities that are relevant to the classification.
+
+---
+
+**Example:**
+
+```json
+{json.dumps(data)}
+```
+
+### Task
+Classify the intent and extract entities for the given text:
+
+**Text**: `"{text}"`
+
+Please provide your answer in the specified JSON format."""
+            }
         ]
 
         return messages
 
     def predict(self, text):
-        embedding = self.get_embeddings(text)
+        embedding = self.get_embeddings(text)[0]
         messages = self.prepare_context(embedding, text)
         intent = None
         explanation = None
+        entities = []
         try:
             response = litellm.completion(
                 model=self.component_config.get("prediction_model", "gpt-3.5-turbo"),
                 messages=messages,
-                response_format=ClassifierOutput,
+                response_format={ "type": "json_object" },
                 temperature=self.component_config.get("temperature", 0.0),
-                max_tokens=self.component_config.get("max_tokens", 50),
                 top_p=1,
                 frequency_penalty=0,
                 presence_penalty=0,
                 api_key=self.api_key,
-                retry=3
+                max_retries=3
             )
             logger.debug(response)
             responses = json.loads(response.choices[0]["message"]["content"])
-            intent = responses["intent"] if "intent" in responses.keys() else None
-            explanation = (
-                responses["explanation"]
-                if "explanation" in responses.keys()
-                else None
-            )
+            intent = responses["intent"] if "intent" in responses.keys() else "nlu_fallback"
+            explanation = responses["explanation"] if "explanation" in responses.keys() else None
+            entities = responses["entities"] if "entities" in responses.keys() else []
         except Exception as e:
             logger.error(e)
-        return intent, explanation
+        return intent, explanation, entities
 
     def process(self, messages: List[Message]) -> List[Message]:
         """Return the most likely intent and its probability for a message."""
@@ -187,13 +209,16 @@ def process(self, messages: List[Message]) -> List[Message]:
                 # receive enough training data
                 intent = None
                 intent_ranking = []
+                entities = []
             else:
-                label, reason = self.predict(message.get(TEXT))
+                label, reason, entities = self.predict(message.get(TEXT))
                 intent = {"name": label, "confidence": 1, "reason": reason}
                 intent_ranking = []
+                entities = self.add_extractor_name(entities)
 
             message.set("intent", intent, add_to_output=True)
             message.set("intent_ranking", intent_ranking, add_to_output=True)
+            message.set(ENTITIES, entities, add_to_output=True)
         return messages
 
     @classmethod
@@ -219,8 +244,10 @@ def load(
         """Loads a policy from the storage (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_path:
-                vector_file = os.path.join(model_path, config.get("vector"))
-                data_file = os.path.join(model_path, config.get("data"))
+                file_name = cls.__name__
+
+                vector_file = os.path.join(model_path, file_name + "_vector.db")
+                data_file = os.path.join(model_path, file_name + "_data.pkl")
 
                 if os.path.exists(vector_file):
                     vector = faiss.read_index(vector_file)

From bcdce9509909daf660e872b3f763099daf887b01 Mon Sep 17 00:00:00 2001
From: fahad_shaikh
Date: Thu, 5 Dec 2024 17:21:59 +0530
Subject: [PATCH 4/5] removed unused imports

---
 kairon/nlu/classifiers/llm.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/kairon/nlu/classifiers/llm.py b/kairon/nlu/classifiers/llm.py
index 22740ddfb..f438c36de 100644
--- a/kairon/nlu/classifiers/llm.py
+++ b/kairon/nlu/classifiers/llm.py
@@ -1,27 +1,26 @@
-import ujson as json
 import logging
+import os
 import typing
-from typing import Any, Dict, List, Optional, Text
 from abc import ABC
+from typing import Any, Dict, List, Optional, Text
 
-from pydantic import BaseModel
-from rasa.nlu.classifiers.classifier import IntentClassifier
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.engine.storage.resource import Resource
-from rasa.engine.storage.storage import ModelStorage
 import faiss
-import rasa.utils.io as io_utils
-from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ENTITY_ATTRIBUTE_TYPE
+import litellm
 import numpy as np
-from tqdm import tqdm
+import rasa.utils.io as io_utils
+import ujson as json
+from more_itertools import chunked
 from rasa.engine.graph import GraphComponent, ExecutionContext
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
-import litellm
-from rasa.shared.utils.io import create_directory_for_file
-from more_itertools import chunked
-import os
+from rasa.engine.storage.resource import Resource
+from rasa.engine.storage.storage import ModelStorage
+from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.nlu.extractors.extractor import EntityExtractorMixin
+from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ENTITY_ATTRIBUTE_TYPE
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
+from rasa.shared.utils.io import create_directory_for_file
+from tqdm import tqdm
 
 litellm.drop_params = True
 os.environ["LITELLM_LOG"] = "ERROR"

From a2cced780bb7a7ec3fa94b88978b357956b6cb59 Mon Sep 17 00:00:00 2001
From: fahad_shaikh
Date: Thu, 5 Dec 2024 18:29:21 +0530
Subject: [PATCH 5/5] removed unused OpenAI featurizer and updated pipeline
 components

---
 kairon/shared/nlu/featurizer/openai.py | 234 -------------------------
 system.yaml                            |   5 +-
 tests/testing_data/system.yaml         |   3 +-
 3 files changed, 2 insertions(+), 240 deletions(-)
 delete mode 100644 kairon/shared/nlu/featurizer/openai.py

diff --git a/kairon/shared/nlu/featurizer/openai.py b/kairon/shared/nlu/featurizer/openai.py
deleted file mode 100644
index 19853a2b9..000000000
--- a/kairon/shared/nlu/featurizer/openai.py
+++ /dev/null
@@ -1,234 +0,0 @@
-import logging
-import os
-from abc import ABC
-from typing import Any, Optional, Text, List, Dict, Tuple, Type
-
-import numpy as np
-import openai
-from rasa.engine.graph import GraphComponent, ExecutionContext
-from rasa.engine.recipes.default_recipe import DefaultV1Recipe
-from rasa.nlu.constants import (
-    DENSE_FEATURIZABLE_ATTRIBUTES,
-    SEQUENCE_FEATURES,
-    SENTENCE_FEATURES,
-    FEATURIZER_CLASS_ALIAS,
-    TOKENS_NAMES
-)
-from rasa.nlu.featurizers.dense_featurizer.dense_featurizer import DenseFeaturizer
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
-from rasa.shared.nlu.constants import (
-    TEXT,
-    FEATURE_TYPE_SENTENCE,
-    FEATURE_TYPE_SEQUENCE,
-    ACTION_TEXT,
-)
-from rasa.shared.nlu.training_data.features import Features
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from tqdm import tqdm
-
-logger = logging.getLogger(__name__)
-
-@DefaultV1Recipe.register(
-    DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=False
-)
-class OpenAIFeaturizer(DenseFeaturizer, GraphComponent, ABC):
-    """Featurizer using openai language models."""
-
-
-    def __init__(
-        self, config: Dict[Text, Any], execution_context: ExecutionContext
-    ) -> None:
-        """Initializes OpenAIFeaturizer with the specified model.
-
-        Args:
-            component_config: Configuration for the component.
- """ - super(OpenAIFeaturizer, self).__init__(execution_context.node_name, config) - self.load_api_key(config.get("bot_id")) - - def load_api_key(self, bot_id: Text): - if bot_id: - from kairon.shared.admin.processor import Sysadmin - llm_secret = Sysadmin.get_llm_secret("openai", bot_id) - self.api_key = llm_secret.get('api_key') - elif os.environ.get("OPENAI_API_KEY"): - self.api_key = os.environ.get("OPENAI_API_KEY") - else: - raise KeyError( - f"either set bot_id'in OpenAIFeaturizer config or set OPENAI_API_KEY in environment variables" - ) - - @classmethod - def required_components(cls) -> List[Type]: - """Packages needed to be installed.""" - return [Tokenizer] - - @classmethod - def required_packages(cls) -> List[Text]: - """Packages needed to be installed.""" - return ["openai"] - - @staticmethod - def get_default_config() -> Dict[Text, Any]: - """Returns OpenAIFeaturizer's default config.""" - return { - **DenseFeaturizer.get_default_config(), - "bot_id": None, - } - - def get_tokens_embeddings(self, tokens): - embeddings = [] - for token in tokens: - embeddings.append(self.get_embeddings(token.text)) - return embeddings - - def get_embeddings(self, text): - embedding = openai.Embedding.create( - model="text-embedding-3-small", - input=text, - api_key=self.api_key - )['data'][0]['embedding'] - return embedding - - def _get_model_features_for_batch( - self, - batch_examples: List[Message], - attribute: Text, - ) -> Tuple[np.ndarray, np.ndarray]: - """Compute dense features of each example in the batch. - - Args: - batch_examples: List of examples in the batch. - attribute: attribute of the Message object to be processed. - - Returns: - Sentence and token level dense representations. - """ - sentence_embeddings = [] - sequence_embeddings = [] - for example in batch_examples: - text = example.get(attribute) - tokens = example.get(TOKENS_NAMES[attribute]) - - sequence_embeddings.append(np.array(self.get_tokens_embeddings(tokens))) - sentence_embeddings.append(np.array([self.get_embeddings(text)])) - - return np.array(sentence_embeddings), np.array(sequence_embeddings) - - def _get_docs_for_batch( - self, - batch_examples: List[Message], - attribute: Text, - ) -> List[Dict[Text, Any]]: - """Compute language model docs for all examples in the batch. - - Args: - batch_examples: Batch of message objects for which language model docs - need to be computed. - attribute: Property of message to be processed, one of ``TEXT`` or - ``RESPONSE``. - - Returns: - List of language model docs for each message in batch. - """ - - ( - batch_sentence_features, - batch_sequence_features, - ) = self._get_model_features_for_batch( - batch_examples, attribute - ) - - # A doc consists of - # {'sequence_features': ..., 'sentence_features': ...} - batch_docs = [] - for index in range(len(batch_examples)): - doc = { - SEQUENCE_FEATURES: batch_sequence_features[index], - SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)), - } - batch_docs.append(doc) - - return batch_docs - - def process_training_data(self, training_data: TrainingData) -> TrainingData: - """Compute tokens and dense features for each message in training data. - - Args: - training_data: NLU training data to be tokenized and featurized - config: NLU pipeline config consisting of all components. 
- """ - batch_size = 64 - - for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: - - non_empty_examples = list( - filter(lambda x: x.get(attribute), training_data.training_examples) - ) - - batch_start_index = 0 - with tqdm( - total=len(non_empty_examples), - desc=f"Computing language model features for attribute '{attribute}'", - ) as pbar: - while batch_start_index < len(non_empty_examples): - - batch_end_index = min( - batch_start_index + batch_size, len(non_empty_examples) - ) - # Collect batch examples - batch_messages = non_empty_examples[batch_start_index:batch_end_index] - - # Construct a doc with relevant features - # extracted(tokens, dense_features) - batch_docs = self._get_docs_for_batch(batch_messages, attribute) - - for index, ex in enumerate(batch_messages): - self._set_lm_features(batch_docs[index], ex, attribute) - pbar.update(1) - batch_start_index += batch_size - return training_data - - def process(self, messages: List[Message]) -> List[Message]: - """Process an incoming message by computing its tokens and dense features. - - Args: - message: Incoming message object - """ - # process of all featurizers operates only on TEXT and ACTION_TEXT attributes, - # because all other attributes are labels which are featurized during training - # and their features are stored by the model itself. - for message in messages: - for attribute in {TEXT, ACTION_TEXT}: - if message.get(attribute): - self._set_lm_features( - self._get_docs_for_batch( - [message], attribute=attribute - )[0], - message, - attribute, - ) - return messages - - def _set_lm_features( - self, doc: Dict[Text, Any], message: Message, attribute: Text = TEXT - ) -> None: - """Adds the precomputed word vectors to the messages features.""" - sequence_features = doc[SEQUENCE_FEATURES] - sentence_features = doc[SENTENCE_FEATURES] - - final_sequence_features = Features( - sequence_features, - FEATURE_TYPE_SEQUENCE, - attribute, - self.component_config[FEATURIZER_CLASS_ALIAS], - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - sentence_features, - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[FEATURIZER_CLASS_ALIAS], - ) - message.add_features(final_sentence_features) diff --git a/system.yaml b/system.yaml index 5c98fb7a5..eec7ce823 100644 --- a/system.yaml +++ b/system.yaml @@ -236,11 +236,8 @@ properties: core: components: - - custom.ner.SpacyPatternNER - - custom.fallback.FallbackIntentFilter - kairon.shared.nlu.featurizer.lm_featurizer.LanguageModelFeaturizer - - kairon.shared.nlu.classifier.openai.OpenAIClassifier - - kairon.shared.nlu.featurizer.openai.OpenAIFeaturizer + - kairon.nlu.LLMClassifier policies: - kairon.shared.rule_policy.RulePolicy deprecated-components: diff --git a/tests/testing_data/system.yaml b/tests/testing_data/system.yaml index d9dffe97b..25ad50573 100644 --- a/tests/testing_data/system.yaml +++ b/tests/testing_data/system.yaml @@ -232,9 +232,8 @@ properties: core: components: - - custom.ner.SpacyPatternNER - - custom.fallback.FallbackIntentFilter - kairon.shared.nlu.featurizer.lm_featurizer.LanguageModelFeaturizer + - kairon.nlu.LLMClassifier policies: - kairon.shared.rule_policy.RulePolicy deprecated-components: