From dce9960977e52cc03ae07115e858bdbe308773ed Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 29 Oct 2024 18:51:23 -0700 Subject: [PATCH 1/2] make sure cache is writable --- docs/Dockerfile.delta2 | 6 ++++-- src/version.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/Dockerfile.delta2 b/docs/Dockerfile.delta2 index def13cab7..8d947140a 100644 --- a/docs/Dockerfile.delta2 +++ b/docs/Dockerfile.delta2 @@ -27,7 +27,9 @@ RUN wget https://fastdl.mongodb.org/linux/mongodb-linux-x86_64-ubuntu2204-7.0.4. cp -r mongodb-linux-x86_64-ubuntu2204-7.0.4/bin /usr/lib/python3.10/site-packages/fiftyone/db/ && \ chmod -R a+rwx /usr/lib/python3.10/site-packages/fiftyone/db +RUN chmod a+rwx /workspace/.cache + USER h2ogpt -# docker build -f docs/Dockerfile.delta2 -t gcr.io/vorvan/h2oai/h2oai-h2ogpt-runtime:0.2.1-1286-patch1 . -# docker push gcr.io/vorvan/h2oai/h2oai-h2ogpt-runtime:0.2.1-1286-patch1 \ No newline at end of file +# docker build -f docs/Dockerfile.delta2 -t gcr.io/vorvan/h2oai/h2oai-h2ogpt-runtime:0.2.1-1286-patch2 . +# docker push gcr.io/vorvan/h2oai/h2oai-h2ogpt-runtime:0.2.1-1286-patch2 \ No newline at end of file diff --git a/src/version.py b/src/version.py index dc6c550a0..4ccf9d17c 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "69ab64742ced476fbe54d4eb91e228c0ada54290" +__version__ = "9e71f30a01ef47e0f9333f5580a55382b4cd15e2" From e77f54aa6d4f2b1b31a4f1b2cc27b9b0c0033ad6 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 29 Oct 2024 20:18:52 -0700 Subject: [PATCH 2/2] Allow llava_model to be openai model with specific model name for now --- src/gpt_langchain.py | 134 ++++++++++++++++++++++++++++++++----------- src/version.py | 2 +- 2 files changed, 100 insertions(+), 36 deletions(-) diff --git a/src/gpt_langchain.py b/src/gpt_langchain.py index 1f8d8432b..07afebd64 100644 --- a/src/gpt_langchain.py +++ b/src/gpt_langchain.py @@ -2464,12 +2464,15 @@ def get_num_tokens(self, text: str) -> int: class GenerateStream: def get_count_output_tokens(self, ret): - if hasattr(ret, 'llm_output') and 'model_name' in ret.llm_output and ret.llm_output['model_name'] in ['o1-mini', 'o1-preview']: + if hasattr(ret, 'llm_output') and 'model_name' in ret.llm_output and ret.llm_output['model_name'] in ['o1-mini', + 'o1-preview']: usage_dict = ret.llm_output['token_usage'] if 'completion_tokens' in usage_dict: self.count_output_tokens += usage_dict['completion_tokens'] - if 'completion_tokens_details' in usage_dict and 'reasoning_tokens' in usage_dict['completion_tokens_details']: - print("reasoning tokens for %s: %s" % (ret.llm_output['model_name'], usage_dict['completion_tokens_details']['reasoning_tokens'])) + if 'completion_tokens_details' in usage_dict and 'reasoning_tokens' in usage_dict[ + 'completion_tokens_details']: + print("reasoning tokens for %s: %s" % ( + ret.llm_output['model_name'], usage_dict['completion_tokens_details']['reasoning_tokens'])) def generate_prompt( self, @@ -2608,12 +2611,15 @@ async def _agenerate( class GenerateNormal: def get_count_output_tokens(self, ret): - if hasattr(ret, 'llm_output') and 'model_name' in ret.llm_output and ret.llm_output['model_name'] in ['o1-mini', 'o1-preview']: + if hasattr(ret, 'llm_output') and 'model_name' in ret.llm_output and ret.llm_output['model_name'] in ['o1-mini', + 'o1-preview']: usage_dict = ret.llm_output['token_usage'] if 'completion_tokens' in usage_dict: self.count_output_tokens += usage_dict['completion_tokens'] - if 'completion_tokens_details' in 
usage_dict and 'reasoning_tokens' in usage_dict['completion_tokens_details']: - print("reasoning tokens for %s: %s" % (ret.llm_output['model_name'], usage_dict['completion_tokens_details']['reasoning_tokens'])) + if 'completion_tokens_details' in usage_dict and 'reasoning_tokens' in usage_dict[ + 'completion_tokens_details']: + print("reasoning tokens for %s: %s" % ( + ret.llm_output['model_name'], usage_dict['completion_tokens_details']['reasoning_tokens'])) def generate_prompt( self, @@ -3292,7 +3298,7 @@ def get_llm(use_openai_model=False, if json_vllm: response_format_real = response_format if not ( - guided_json or guided_regex or guided_choice or guided_grammar) else 'text' + guided_json or guided_regex or guided_choice or guided_grammar) else 'text' vllm_extra_dict = get_vllm_extra_dict(tokenizer, stop_sequences=prompter.stop_sequences if prompter else [], # repetition_penalty=repetition_penalty, # could pass @@ -3437,7 +3443,8 @@ def get_llm(use_openai_model=False, if model_name in ['o1-mini', 'o1-preview']: gen_server_kwargs['max_completion_tokens'] = gen_server_kwargs.pop('max_tokens') max_reasoning_tokens = int(os.getenv("MAX_REASONING_TOKENS", 25000)) - gen_server_kwargs['max_completion_tokens'] = max_reasoning_tokens + max(100, gen_server_kwargs['max_completion_tokens']) + gen_server_kwargs['max_completion_tokens'] = max_reasoning_tokens + max(100, gen_server_kwargs[ + 'max_completion_tokens']) gen_server_kwargs['temperature'] = 1.0 model_kwargs.pop('presence_penalty', None) model_kwargs.pop('n', None) @@ -5111,33 +5118,90 @@ def file_to_doc(file, print("END: Pix2Struct", flush=True) if llava_model and enable_llava and 'vllm' not in llava_model: file_llava = fix_image_file(file, do_align=True, do_rotate=True, do_pad=False) - # LLaVa - if verbose: - print("BEGIN: LLaVa", flush=True) - try: - from vision.utils_vision import get_llava_response - res, llava_prompt = get_llava_response(file_llava, llava_model, - prompt=llava_prompt, - allow_prompt_auto=True, - max_time=60, # not too much time for docQA - verbose=verbose, - ) - metadata = dict(source=file, date=str(datetime.now()), input_type='LLaVa') - docs1c = [Document(page_content=res, metadata=metadata)] - docs1c = [x for x in docs1c if x.page_content] - add_meta(docs1c, file, parser='LLaVa: %s' % llava_model, file_as_source=True) - # caption didn't set source, so fix-up meta - hash_of_file = hash_file(file) - [doci.metadata.update(source=file, source_true=file_llava, hashid=hash_of_file, - llava_prompt=llava_prompt or '') for doci in - docs1c] - docs1.extend(docs1c) - except BaseException as e0: - print("LLaVa: %s: %s" % (str(e0), traceback.print_exception(e0)), flush=True) - e = e0 - handled |= len(docs1) > 0 - if verbose: - print("END: LLaVa", flush=True) + + if llava_model.startswith('openai:'): + if verbose: + print("BEGIN: OpenAI docAI", flush=True) + try: + from openai import OpenAI + openai_client = OpenAI(base_url=os.getenv('H2OGPT_OPENAI_BASE_URL', 'https://api.openai.com'), + api_key=os.getenv('H2OGPT_OPENAI_API_KEY', 'EMPTY'), timeout=60) + if llava_prompt in ['auto', None]: + llava_prompt = "Describe the image and what does the image say?" 
+ from vision.utils_vision import img_to_base64 + file_llava_url = img_to_base64(file_llava) + content = [{ + 'type': 'text', + 'text': llava_prompt, + }, { + 'type': 'image_url', + 'image_url': { + 'url': + file_llava_url, + }, + }] + messages = [dict(role='system', + content='You are a keen document vision model that can understand complex images and text and respond to queries or convert text inside images to text.'), + dict(role='user', content=content)] + stream_output = False + gen_server_kwargs = dict() + model_name = llava_model.split('openai:')[1] + responses = openai_client.chat.completions.create( + model=model_name, + messages=messages, + stream=stream_output, + **gen_server_kwargs, + ) + if responses.choices is None and responses.model_extra: + raise RuntimeError("OpenAI Chat failed: %s" % responses.model_extra) + res = responses.choices[0].message.content + if not res: + raise RuntimeError("OpenAI Chat had no response") + + metadata = dict(source=file, date=str(datetime.now()), input_type='OpenAI DocAI') + docs1c = [Document(page_content=res, metadata=metadata)] + docs1c = [x for x in docs1c if x.page_content] + add_meta(docs1c, file, parser='LLaVa: %s' % llava_model, file_as_source=True) + # caption didn't set source, so fix-up meta + hash_of_file = hash_file(file) + [doci.metadata.update(source=file, source_true=file_llava, hashid=hash_of_file, + llava_prompt=llava_prompt or '') for doci in + docs1c] + docs1.extend(docs1c) + except BaseException as e0: + print("LLaVa: %s: %s" % (str(e0), traceback.print_exception(e0)), flush=True) + e = e0 + handled |= len(docs1) > 0 + if verbose: + print("END: OpenAI docAI", flush=True) + else: + # LLaVa + if verbose: + print("BEGIN: LLaVa", flush=True) + try: + from vision.utils_vision import get_llava_response + res, llava_prompt = get_llava_response(file_llava, llava_model, + prompt=llava_prompt, + allow_prompt_auto=True, + max_time=60, # not too much time for docQA + verbose=verbose, + ) + metadata = dict(source=file, date=str(datetime.now()), input_type='LLaVa') + docs1c = [Document(page_content=res, metadata=metadata)] + docs1c = [x for x in docs1c if x.page_content] + add_meta(docs1c, file, parser='LLaVa: %s' % llava_model, file_as_source=True) + # caption didn't set source, so fix-up meta + hash_of_file = hash_file(file) + [doci.metadata.update(source=file, source_true=file_llava, hashid=hash_of_file, + llava_prompt=llava_prompt or '') for doci in + docs1c] + docs1.extend(docs1c) + except BaseException as e0: + print("LLaVa: %s: %s" % (str(e0), traceback.print_exception(e0)), flush=True) + e = e0 + handled |= len(docs1) > 0 + if verbose: + print("END: LLaVa", flush=True) doc1 = chunk_sources(docs1) if len(doc1) == 0: diff --git a/src/version.py b/src/version.py index 4ccf9d17c..aecb4d6db 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "9e71f30a01ef47e0f9333f5580a55382b4cd15e2" +__version__ = "dce9960977e52cc03ae07115e858bdbe308773ed"
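
For reference, the heart of PATCH 2/2 is the new `openai:`-prefixed `llava_model` branch in `file_to_doc`: the image is base64-encoded and sent as an `image_url` content part to an OpenAI-compatible chat completions endpoint, with the real model name taken from everything after the `openai:` prefix. Below is a minimal standalone sketch of that flow. The `describe_image` helper name, the inline base64 data-URL construction (standing in for `vision.utils_vision.img_to_base64`), and the `/v1` suffix on the default base URL are illustrative assumptions, not part of the patch:

    import base64
    import os

    from openai import OpenAI


    def describe_image(image_path,
                       llava_model="openai:gpt-4o",  # illustrative default; any openai:-prefixed vision model name
                       llava_prompt="Describe the image and what does the image say?"):
        # The patch takes this path only when llava_model starts with 'openai:';
        # the OpenAI model name is whatever follows the prefix.
        assert llava_model.startswith('openai:')
        model_name = llava_model.split('openai:')[1]

        openai_client = OpenAI(base_url=os.getenv('H2OGPT_OPENAI_BASE_URL', 'https://api.openai.com/v1'),
                               api_key=os.getenv('H2OGPT_OPENAI_API_KEY', 'EMPTY'),
                               timeout=60)

        # Stand-in for vision.utils_vision.img_to_base64: a data URL the chat API accepts as image_url.
        with open(image_path, 'rb') as f:
            file_llava_url = 'data:image/jpeg;base64,' + base64.b64encode(f.read()).decode('utf-8')

        content = [{'type': 'text', 'text': llava_prompt},
                   {'type': 'image_url', 'image_url': {'url': file_llava_url}}]
        messages = [dict(role='system',
                         content='You are a keen document vision model that can understand complex images'
                                 ' and text and respond to queries or convert text inside images to text.'),
                    dict(role='user', content=content)]

        responses = openai_client.chat.completions.create(model=model_name, messages=messages, stream=False)
        if responses.choices is None and responses.model_extra:
            raise RuntimeError("OpenAI Chat failed: %s" % responses.model_extra)
        res = responses.choices[0].message.content
        if not res:
            raise RuntimeError("OpenAI Chat had no response")
        return res

In the patch itself the response is wrapped into a Document and given the same metadata fix-up (source, source_true, hashid, llava_prompt) as the existing LLaVa branch, so downstream chunking is unchanged.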
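
PATCH 2/2 also adjusts get_llm for OpenAI's o1-mini / o1-preview models: max_tokens becomes max_completion_tokens, extra budget is reserved for hidden reasoning tokens, and temperature is pinned to 1.0, while get_count_output_tokens reports completion_tokens_details.reasoning_tokens from the usage block. A hedged sketch of the request-side adjustment (the 25000 default for MAX_REASONING_TOKENS comes from the diff; the helper name is illustrative):

    import os


    def adjust_kwargs_for_o1(gen_server_kwargs, model_name):
        # Hypothetical helper; mirrors the o1-specific branch added to get_llm in PATCH 2/2.
        if model_name in ['o1-mini', 'o1-preview']:
            # o1 models take max_completion_tokens rather than max_tokens ...
            visible_budget = gen_server_kwargs.pop('max_tokens')
            max_reasoning_tokens = int(os.getenv("MAX_REASONING_TOKENS", 25000))
            # ... and need headroom for hidden reasoning tokens on top of the visible output.
            gen_server_kwargs['max_completion_tokens'] = max_reasoning_tokens + max(100, visible_budget)
            # o1 models only accept the default temperature.
            gen_server_kwargs['temperature'] = 1.0
        return gen_server_kwargs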