diff --git a/.gitignore b/.gitignore index 6e36090d..f23fd690 100644 --- a/.gitignore +++ b/.gitignore @@ -162,13 +162,5 @@ cython_debug/ .DS_Store # Avoid sending testing screenshots up -screenshot.png -screenshot_with_grid.png -screenshot_with_labeled_grid.png -screenshot_mini.png -screenshot_mini_with_grid.png -grid_screenshot.png -grid_reflection_screenshot.png -reflection_screenshot.png -summary_screenshot.png -operate/screenshots/ \ No newline at end of file +*.png +operate/screenshots/ diff --git a/operate/actions/api_interactions.py b/operate/actions.py similarity index 54% rename from operate/actions/api_interactions.py rename to operate/actions.py index cae3da1e..45013c13 100644 --- a/operate/actions/api_interactions.py +++ b/operate/actions.py @@ -3,36 +3,65 @@ import json import base64 import re +import io +import asyncio +import aiohttp + from PIL import Image +from ultralytics import YOLO import google.generativeai as genai -from operate.config.settings import Config -from operate.exceptions.exceptions import ModelNotRecognizedException -from operate.utils.screenshot_util import capture_screen_with_cursor, add_grid_to_image, capture_mini_screenshot_with_cursor -from operate.utils.action_util import get_last_assistant_message -from operate.utils.prompt_util import format_vision_prompt, format_accurate_mode_vision_prompt,format_summary_prompt +from operate.settings import Config +from operate.exceptions import ModelNotRecognizedException +from operate.utils.screenshot import ( + capture_screen_with_cursor, + add_grid_to_image, + capture_mini_screenshot_with_cursor, +) +from operate.utils.os import get_last_assistant_message +from operate.prompts import ( + format_vision_prompt, + format_accurate_mode_vision_prompt, + format_summary_prompt, + format_decision_prompt, + format_label_prompt, +) + + +from operate.utils.label import ( + add_labels, + parse_click_content, + get_click_position_in_percent, + get_label_coordinates, +) +from operate.utils.style import ( + ANSI_GREEN, + ANSI_RED, + ANSI_RESET, +) + # Load configuration config = Config() + client = config.initialize_openai_client() +yolo_model = YOLO("./operate/model/weights/best.pt") # Load your trained model -def get_next_action(model, messages, objective, accurate_mode): - if model == "gpt-4-vision-preview": - content = get_next_action_from_openai( - messages, objective, accurate_mode) - return content + +async def get_next_action(model, messages, objective): + if model == "gpt-4": + return call_gpt_4_v(messages, objective) + if model == "gpt-4-with-som": + return await call_gpt_4_v_labeled(messages, objective) elif model == "agent-1": return "coming soon" elif model == "gemini-pro-vision": - content = get_next_action_from_gemini_pro_vision( - messages, objective - ) - return content + return call_gemini_pro_vision(messages, objective) raise ModelNotRecognizedException(model) -def get_next_action_from_openai(messages, objective, accurate_mode): +def call_gpt_4_v(messages, objective): """ Get the next action for Self-Operating Computer """ @@ -95,24 +124,6 @@ def get_next_action_from_openai(messages, objective, accurate_mode): content = response.choices[0].message.content - if accurate_mode: - if content.startswith("CLICK"): - # Adjust pseudo_messages to include the accurate_mode_message - - click_data = re.search(r"CLICK \{ (.+) \}", content).group(1) - click_data_json = json.loads(f"{{{click_data}}}") - prev_x = click_data_json["x"] - prev_y = click_data_json["y"] - - if config.debug: - print( - f"Previous coords before 
accurate tuning: prev_x {prev_x} prev_y {prev_y}" - ) - content = accurate_mode_double_check( - "gpt-4-vision-preview", pseudo_messages, prev_x, prev_y - ) - assert content != "ERROR", "ERROR: accurate_mode_double_check failed" - return content except Exception as e: @@ -120,7 +131,7 @@ def get_next_action_from_openai(messages, objective, accurate_mode): return "Failed take action after looking at the screenshot" -def get_next_action_from_gemini_pro_vision(messages, objective): +def call_gemini_pro_vision(messages, objective): """ Get the next action for Self-Operating Computer using Gemini Pro Vision """ @@ -172,14 +183,13 @@ def get_next_action_from_gemini_pro_vision(messages, objective): return "Failed take action after looking at the screenshot" +# This function is not used. `-accurate` mode was removed for now until a new PR fixes it. def accurate_mode_double_check(model, pseudo_messages, prev_x, prev_y): """ Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location """ - print("[get_next_action_from_gemini_pro_vision] accurate_mode_double_check") try: - screenshot_filename = os.path.join( - "screenshots", "screenshot_mini.png") + screenshot_filename = os.path.join("screenshots", "screenshot_mini.png") capture_mini_screenshot_with_cursor( file_path=screenshot_filename, x=prev_x, y=prev_y ) @@ -191,8 +201,7 @@ def accurate_mode_double_check(model, pseudo_messages, prev_x, prev_y): with open(new_screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - accurate_vision_prompt = format_accurate_mode_vision_prompt( - prev_x, prev_y) + accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y) accurate_mode_message = { "role": "user", @@ -234,7 +243,7 @@ def summarize(model, messages, objective): capture_screen_with_cursor(screenshot_filename) summary_prompt = format_summary_prompt(objective) - + if model == "gpt-4-vision-preview": with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") @@ -266,7 +275,135 @@ def summarize(model, messages, objective): ) content = summary_message.text return content - + except Exception as e: print(f"Error in summarize: {e}") - return "Failed to summarize the workflow" \ No newline at end of file + return "Failed to summarize the workflow" + + +async def call_gpt_4_v_labeled(messages, objective): + time.sleep(1) + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + previous_action = get_last_assistant_message(messages) + + img_base64_labeled, img_base64_original, label_coordinates = add_labels( + img_base64, yolo_model + ) + + decision_prompt = format_decision_prompt(objective, previous_action) + labeled_click_prompt = format_label_prompt(objective) + + click_message = { + "role": "user", + "content": [ + {"type": "text", "text": labeled_click_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_base64_labeled}" + }, + }, + ], + } + decision_message = { + "role": "user", + "content": [ + {"type": "text", "text": decision_prompt}, + { + "type": "image_url", + "image_url": 
{ + "url": f"data:image/jpeg;base64,{img_base64_original}" + }, + }, + ], + } + + click_messages = messages.copy() + click_messages.append(click_message) + decision_messages = messages.copy() + decision_messages.append(decision_message) + + click_future = fetch_openai_response_async(click_messages) + decision_future = fetch_openai_response_async(decision_messages) + + click_response, decision_response = await asyncio.gather( + click_future, decision_future + ) + + # Extracting the message content from the ChatCompletionMessage object + click_content = click_response.get("choices")[0].get("message").get("content") + + decision_content = ( + decision_response.get("choices")[0].get("message").get("content") + ) + + if not decision_content.startswith("CLICK"): + return decision_content + + label_data = parse_click_content(click_content) + + if label_data and "label" in label_data: + coordinates = get_label_coordinates(label_data["label"], label_coordinates) + image = Image.open( + io.BytesIO(base64.b64decode(img_base64)) + ) # Load the image to get its size + image_size = image.size # Get the size of the image (width, height) + click_position_percent = get_click_position_in_percent( + coordinates, image_size + ) + if not click_position_percent: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}" + ) + return call_gpt_4_v(messages, objective) + + x_percent = f"{click_position_percent[0]:.2f}%" + y_percent = f"{click_position_percent[1]:.2f}%" + click_action = f'CLICK {{ "x": "{x_percent}", "y": "{y_percent}", "description": "{label_data["decision"]}", "reason": "{label_data["reason"]}" }}' + + else: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] No label found. Trying another method {ANSI_RESET}" + ) + return call_gpt_4_v(messages, objective) + + return click_action + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}" + ) + return call_gpt_4_v(messages, objective) + + +async def fetch_openai_response_async(messages): + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {config.openai_api_key}", + } + data = { + "model": "gpt-4-vision-preview", + "messages": messages, + "frequency_penalty": 1, + "presence_penalty": 1, + "temperature": 0.7, + "max_tokens": 300, + } + + async with aiohttp.ClientSession() as session: + async with session.post( + url, headers=headers, data=json.dumps(data) + ) as response: + return await response.json() diff --git a/operate/actions/__init__.py b/operate/actions/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/operate/config/__init__.py b/operate/config/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/operate/dialogs/dialog.py b/operate/dialog.py similarity index 82% rename from operate/dialogs/dialog.py rename to operate/dialog.py index 9247d6cd..6c95085b 100644 --- a/operate/dialogs/dialog.py +++ b/operate/dialog.py @@ -1,38 +1,39 @@ -import sys +import sys import os import platform +import asyncio from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt -from operate.utils.prompt_util import style -from operate.exceptions.exceptions import ModelNotRecognizedException -from operate.prompts.prompt import USER_QUESTION -from operate.config.settings import Config -from operate.utils.ansi_colors import ( +from operate.exceptions import ModelNotRecognizedException +from operate.prompts import USER_QUESTION +from operate.settings import Config +from operate.utils.style import ( ANSI_GREEN, ANSI_RESET, ANSI_BLUE, ANSI_YELLOW, ANSI_RED, ANSI_BRIGHT_MAGENTA, + style, ) -from operate.utils.action_util import ( +from operate.utils.os import ( keyboard_type, search, - mouse_click, + click, ) -from operate.actions.api_interactions import get_next_action,summarize -from operate.utils.utils import parse_response +from operate.actions import get_next_action, summarize +from operate.utils.misc import parse_response # Load configuration config = Config() -def main(model, accurate_mode, terminal_prompt, voice_mode=False): + +def main(model, terminal_prompt, voice_mode=False): """ Main function for the Self-Operating Computer. Parameters: - model: The model used for generating responses. - - accurate_mode: A boolean indicating whether to use accurate mode for response generation. - terminal_prompt: A string representing the prompt provided in the terminal. - voice_mode: A boolean indicating whether to enable voice mode. 
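
`get_next_action` is now a coroutine (the set-of-mark path awaits two API calls), so the otherwise synchronous dialog loop has to bridge into asyncio. Below is a minimal sketch of that bridge, using only names that appear in this diff (`get_next_action`, `parse_response`); `run_single_step` is an illustrative wrapper, not a function from the codebase, and the real loop in `operate/dialog.py` also tracks message history and error handling.

```python
import asyncio

from operate.actions import get_next_action
from operate.utils.misc import parse_response


def run_single_step(model, messages, objective):
    # asyncio.run spins up an event loop, awaits the coroutine, then closes the
    # loop -- this is how the synchronous loop in dialog.py drives the async call.
    response = asyncio.run(get_next_action(model, messages, objective))
    action = parse_response(response)
    return action.get("type"), action.get("data")
```
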
@@ -40,9 +41,9 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False): None """ mic = None - # Initialize `WhisperMic`, if `voice_mode` is True + # Initialize `WhisperMic`, if `voice_mode` is True - validation(model, accurate_mode, voice_mode) + validation(model, voice_mode) if voice_mode: try: @@ -102,7 +103,7 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False): if config.debug: print("[loop] messages before next action:\n\n\n", messages[1:]) try: - response = get_next_action(model, messages, objective, accurate_mode) + response = asyncio.run(get_next_action(model, messages, objective)) action = parse_response(response) action_type = action.get("type") @@ -140,7 +141,7 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False): elif action_type == "TYPE": function_response = keyboard_type(action_detail) elif action_type == "CLICK": - function_response = mouse_click(action_detail) + function_response = click(action_detail) else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}" @@ -165,27 +166,18 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False): break - -def validation( - model, - accurate_mode, - voice_mode, -): +def validation(model, voice_mode): """ Validate the input parameters for the dialog operation. Args: model (str): The model to be used for the dialog operation. - accurate_mode (bool): Flag indicating whether to use accuracy mode. voice_mode (bool): Flag indicating whether to use voice mode. Raises: SystemExit: If the input parameters are invalid. """ - if accurate_mode and model != "gpt-4-vision-preview": - print("To use accuracy mode, please use gpt-4-vision-preview") - sys.exit(1) if voice_mode and not config.openai_api_key: print("To use voice mode, please add an OpenAI API key") diff --git a/operate/dialogs/__init__.py b/operate/dialogs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/operate/exceptions/exceptions.py b/operate/exceptions.py similarity index 100% rename from operate/exceptions/exceptions.py rename to operate/exceptions.py diff --git a/operate/exceptions/__init__.py b/operate/exceptions/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/operate/main.py b/operate/main.py index 9894f17a..8b2df0c9 100644 --- a/operate/main.py +++ b/operate/main.py @@ -2,8 +2,9 @@ Self-Operating Computer """ import argparse -from operate.utils.ansi_colors import ANSI_BRIGHT_MAGENTA -from operate.dialogs.dialog import main +from operate.utils.style import ANSI_BRIGHT_MAGENTA +from operate.dialog import main + def main_entry(): parser = argparse.ArgumentParser( @@ -14,7 +15,7 @@ def main_entry(): "--model", help="Specify the model to use", required=False, - default="gpt-4-vision-preview", + default="gpt-4", ) # Add a voice flag @@ -23,14 +24,6 @@ def main_entry(): help="Use voice input mode", action="store_true", ) - - parser.add_argument( - "-accurate", - help="Activate Reflective Mouse Click Mode", - action="store_true", - required=False, - ) - # Allow for direct input of prompt parser.add_argument( "--prompt", @@ -43,7 +36,6 @@ def main_entry(): args = parser.parse_args() main( args.model, - accurate_mode=args.accurate, terminal_prompt=args.prompt, voice_mode=args.voice, ) diff --git a/operate/model/weights/best.pt b/operate/model/weights/best.pt new file mode 100644 index 00000000..c7aa7c59 Binary files /dev/null and b/operate/model/weights/best.pt differ diff --git a/operate/prompts.py b/operate/prompts.py new file 
mode 100644 index 00000000..0e6b88ce --- /dev/null +++ b/operate/prompts.py @@ -0,0 +1,252 @@ +from operate.settings import Config + +config = Config() +monitor_size = config.monitor_size + +# General user Prompts +USER_QUESTION = "Hello, I can help you with anything. What would you like done?" + +# constants for the vision prompt +ACCURATE_PIXEL_COUNT = ( + 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big +) + +# ------------------------- +# VISION PROMPT +# ------------------------- +VISION_PROMPT = """ +You are a Self-Operating Computer. You use the same operating system as a human. + +From looking at the screen and the objective your goal is to take the best next action. + +To operate the computer you have the four options below. + +1. CLICK - Move mouse and click +2. TYPE - Type on the keyboard +3. SEARCH - Search for a program on Mac and open it +4. DONE - When you completed the task respond with the exact following phrase content + +Here are the response formats below. + +1. CLICK +Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }} +Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%" + +2. TYPE +Response: TYPE + +2. SEARCH +Response: SEARCH + +3. DONE +Response: DONE + +Here are examples of how to respond. +__ +Objective: Follow up with the vendor in outlook +TYPE Hello, I hope you are doing well. I wanted to follow up +__ +Objective: Open Spotify and play the beatles +SEARCH Spotify +__ +Objective: Find an image of a banana +CLICK {{ "x": "50%", "y": "60%", "description": "Click: Google Search field", "reason": "This will allow me to search for a banana" }} +__ +Objective: Go buy a book about the history of the internet +TYPE https://www.amazon.com/ +__ + +A few important notes: + +- Default to opening Google Chrome with SEARCH to find things that are on the internet. +- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- When opening Chrome, if you see a profile icon click that to open chrome fully, it is located at: {{ "x": "50%", "y": "55%" }} +- The Chrome address bar is generally at: {{ "x": "50%", "y": "9%" }} +- After you click to enter a field you can go ahead and start typing! +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. + +{previous_action} + +IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. + +Objective: {objective} +""" + + +# ---------------------------------- +# ACCURATE MODE VISION PROMPT +# ---------------------------------- +ACCURATE_MODE_VISION_PROMPT = """ +It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot. +As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. +This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess. 
+ +If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the "x" and subtract {height}% in the "y" to your previous answer. +Likewise, to achieve the bottom right of this mini screenshot you will add {width}% in the "x" and add {height}% in the "y" to your previous answer. + +There are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer. + +Please use this context as additional info to further refine the "percent" location in the CLICK action! +""" + +DECISION_PROMPT = """ +You are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective. + +Here are your methods you can use to operating the computer. + +1. CLICK - Move mouse and click +2. TYPE - Type on the keyboard +3. SEARCH - Search for a program that is installed on Mac locally and open it +4. DONE - When you completed the task respond with the exact following phrase content + +Here are the response formats below. + +1. CLICK +Response: CLICK + +2. TYPE +Response: TYPE "value you want to type" + +2. SEARCH +Response: SEARCH "app you want to search for on Mac" + +3. DONE +Response: DONE + +Here are examples of how to respond. +__ +Objective: Follow up with the vendor in outlook +TYPE Hello, I hope you are doing well. I wanted to follow up +__ +Objective: Open Spotify and play the beatles +SEARCH Spotify +__ +Objective: Find an image of a banana +CLICK +__ +Objective: Go buy a book about the history of the internet +TYPE https://www.amazon.com/ +__ + +A few important notes: + +- Default to opening Google Chrome with SEARCH to find things that are on the Web. +- After you open Google Chrome you need to click on the address bar to find a website. +- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer. +- After you click to enter a field you can go ahead and start typing! +- If you can see the field is active, go ahead and type! +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. + +{previous_action} + +IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. + +{objective} +""" + +LABELED_IMAGE_PROMPT = """ +Your job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs. + +Important to remember, you can only click on labeled elements. + +Label IDs are in the following format with `x` being a number: `~x` + +The labels are placed just above the bounding boxes so that they can be read clearly. + +Response formats below. + +1. CLICK - If there is a label that gets you closer to the objective, go ahead and click it. +Response: {{ "decision": "~decision here~", "reason": "~reason here~", "label": "~x" }} + +Here are examples of how to respond. 
+__ +Objective: Follow up with the vendor in outlook +{{ "decision": "Click the Outlook send button", "reason": "I can see the email is already written and now I just need to send it.", "label": "~27" }} +__ +Objective: Play the Holiday music on YouTube +{{ "decision": "Click on the Play button", "reason": "It appears there is a row with a holiday song available in the Spotify UI", "label": "~3" }} +__ + +A few important notes: +- When navigating the web you'll need to click on the address bar first. Look closely to find the address bar's label it could be any number. +- The IDs number has NO SIGNIFICANCE. For instance if ID is ~0 or ~1 it does not mean it is first or on top. CHOOSE THE ID BASED ON THE CONTEXT OF THE IMAGE AND IF IT HELPS REACH THE OBJECTIVE. +- Do not preappend with ```json, just return the JSON object. + +{objective} +""" + + +# ------------------------- +# SUMMARY PROMPT +# ------------------------- +SUMMARY_PROMPT = """ +You are a Self-Operating Computer. A user request has been executed. Present the results succinctly. + +Include the following key contexts of the completed request: + +1. State the original objective. +2. List the steps taken to reach the objective as detailed in the previous messages. +3. Reference the screenshot that was used. + +Summarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Address directly any question posed by the user. + +Remember: The user will not interact with this summary. You are solely reporting the outcomes. + +Original objective: {objective} + +Display the results clearly: +""" + + +def format_summary_prompt(objective): + """ + Format the summary prompt + """ + prompt = SUMMARY_PROMPT.format(objective=objective) + return prompt + + +def format_vision_prompt(objective, previous_action): + """ + Format the vision prompt + """ + if previous_action: + previous_action = f"Here was the previous action you took: {previous_action}" + else: + previous_action = "" + prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action) + return prompt + + +def format_accurate_mode_vision_prompt(prev_x, prev_y): + """ + Format the accurate mode vision prompt + """ + width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100 + height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100 + prompt = ACCURATE_MODE_VISION_PROMPT.format( + prev_x=prev_x, prev_y=prev_y, width=width, height=height + ) + return prompt + + +def format_decision_prompt(objective, previous_action): + """ + Format the vision prompt + """ + if previous_action: + previous_action = f"Here was the previous action you took: {previous_action}" + else: + previous_action = "" + prompt = DECISION_PROMPT.format( + objective=objective, previous_action=previous_action + ) + return prompt + + +def format_label_prompt(objective): + """ + Format the vision prompt + """ + prompt = LABELED_IMAGE_PROMPT.format(objective=objective) + return prompt diff --git a/operate/prompts/__init__.py b/operate/prompts/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/operate/prompts/prompt.py b/operate/prompts/prompt.py deleted file mode 100644 index 7a4758e8..00000000 --- a/operate/prompts/prompt.py +++ /dev/null @@ -1,107 +0,0 @@ -# General user Prompts -USER_QUESTION = "Hello, I can help you with anything. What would you like done?" 
- -# constants for the vision prompt -ACCURATE_PIXEL_COUNT = ( - 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big -) - -# ------------------------- -# VISION PROMPT -# ------------------------- -VISION_PROMPT = """ -You are a Self-Operating Computer. You use the same operating system as a human. - -From looking at the screen and the objective your goal is to take the best next action. - -To operate the computer you have the four options below. - -1. CLICK - Move mouse and click -2. TYPE - Type on the keyboard -3. SEARCH - Search for a program on Mac and open it -4. DONE - When you completed the task respond with the exact following phrase content - -Here are the response formats below. - -1. CLICK -Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }} -Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%" - -2. TYPE -Response: TYPE - -2. SEARCH -Response: SEARCH - -3. DONE -Response: DONE - -Here are examples of how to respond. -__ -Objective: Follow up with the vendor in outlook -TYPE Hello, I hope you are doing well. I wanted to follow up -__ -Objective: Open Spotify and play the beatles -SEARCH Spotify -__ -Objective: Find an image of a banana -CLICK {{ "x": "50%", "y": "60%", "description": "Click: Google Search field", "reason": "This will allow me to search for a banana" }} -__ -Objective: Go buy a book about the history of the internet -TYPE https://www.amazon.com/ -__ - -A few important notes: - -- Default to opening Google Chrome with SEARCH to find things that are on the internet. -- Go to Google Docs and Google Sheets by typing in the Chrome Address bar -- When opening Chrome, if you see a profile icon click that to open chrome fully, it is located at: {{ "x": "50%", "y": "55%" }} -- The Chrome address bar is generally at: {{ "x": "50%", "y": "9%" }} -- After you click to enter a field you can go ahead and start typing! -- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. - -{previous_action} - -IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. - -Objective: {objective} -""" - - -# ---------------------------------- -# ACCURATE MODE VISION PROMPT -# ---------------------------------- -ACCURATE_MODE_VISION_PROMPT = """ -It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot. -As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. -This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess. - -If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the "x" and subtract {height}% in the "y" to your previous answer. -Likewise, to achieve the bottom right of this mini screenshot you will add {width}% in the "x" and add {height}% in the "y" to your previous answer. - -There are four segmenting lines across each dimension, divided evenly. 
This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer. - -Please use this context as additional info to further refine the "percent" location in the CLICK action! -""" - - -# ------------------------- -# SUMMARY PROMPT -# ------------------------- -SUMMARY_PROMPT = """ -You are a Self-Operating Computer. A user request has been executed. Present the results succinctly. - -Include the following key contexts of the completed request: - -1. State the original objective. -2. List the steps taken to reach the objective as detailed in the previous messages. -3. Reference the screenshot that was used. - -Summarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Address directly any question posed by the user. - -Remember: The user will not interact with this summary. You are solely reporting the outcomes. - -Original objective: {objective} - -Display the results clearly: -""" diff --git a/operate/config/settings.py b/operate/settings.py similarity index 100% rename from operate/config/settings.py rename to operate/settings.py diff --git a/operate/utils/ansi_colors.py b/operate/utils/ansi_colors.py deleted file mode 100644 index eb724150..00000000 --- a/operate/utils/ansi_colors.py +++ /dev/null @@ -1,22 +0,0 @@ -import sys -import platform -import os -# Check if on a windows terminal that supports ANSI escape codes -def supports_ansi(): - """ - Check if the terminal supports ANSI escape codes - """ - plat = platform.system() - supported_platform = plat != "Windows" or "ANSICON" in os.environ - is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() - return supported_platform and is_a_tty - - -# Define ANSI color codes -ANSI_GREEN = "\033[32m" if supports_ansi() else "" # Standard green text -ANSI_BRIGHT_GREEN = "\033[92m" if supports_ansi() else "" # Bright/bold green text -ANSI_RESET = "\033[0m" if supports_ansi() else "" # Reset to default text color -ANSI_BLUE = "\033[94m" if supports_ansi() else "" # Bright blue -ANSI_YELLOW = "\033[33m" if supports_ansi() else "" # Standard yellow text -ANSI_RED = "\033[31m" if supports_ansi() else "" -ANSI_BRIGHT_MAGENTA = "\033[95m" if supports_ansi() else "" # Bright magenta text \ No newline at end of file diff --git a/operate/utils/label.py b/operate/utils/label.py new file mode 100644 index 00000000..2d3674f4 --- /dev/null +++ b/operate/utils/label.py @@ -0,0 +1,182 @@ +import io +import base64 +import json +import os +import time +import asyncio +from PIL import Image, ImageDraw + + +def validate_and_extract_image_data(data): + if not data or "messages" not in data: + raise ValueError("Invalid request, no messages found") + + messages = data["messages"] + if ( + not messages + or not isinstance(messages, list) + or not messages[-1].get("image_url") + ): + raise ValueError("No image provided or incorrect format") + + image_data = messages[-1]["image_url"]["url"] + if not image_data.startswith("data:image"): + raise ValueError("Invalid image format") + + return image_data.split("base64,")[-1], messages + + +def get_label_coordinates(label, label_coordinates): + """ + Retrieves the coordinates for a given label. + + :param label: The label to find coordinates for (e.g., "~1"). + :param label_coordinates: Dictionary containing labels and their coordinates. + :return: Coordinates of the label or None if the label is not found. 
+ """ + return label_coordinates.get(label) + + +def is_overlapping(box1, box2): + x1_box1, y1_box1, x2_box1, y2_box1 = box1 + x1_box2, y1_box2, x2_box2, y2_box2 = box2 + + # Check if there is no overlap + if x1_box1 > x2_box2 or x1_box2 > x2_box1: + return False + if ( + y1_box1 > y2_box2 or y1_box2 > y2_box1 + ): # Adjusted to check 100px proximity above + return False + + return True + + +def add_labels(base64_data, yolo_model): + image_bytes = base64.b64decode(base64_data) + image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line + image_debug = image_labeled.copy() # Create a copy for the debug image + image_original = ( + image_labeled.copy() + ) # Copy of the original image for base64 return + + results = yolo_model(image_labeled) + + draw = ImageDraw.Draw(image_labeled) + debug_draw = ImageDraw.Draw( + image_debug + ) # Create a separate draw object for the debug image + font_size = 45 + + detections_dir = "detections" + label_coordinates = {} # Dictionary to store coordinates + + if not os.path.exists(detections_dir): + os.makedirs(detections_dir) + + counter = 0 + drawn_boxes = [] # List to keep track of boxes already drawn + for result in results: + if hasattr(result, "boxes"): + for det in result.boxes: + bbox = det.xyxy[0] + x1, y1, x2, y2 = bbox.tolist() + + debug_label = "D_" + str(counter) + debug_index_position = (x1, y1 - font_size) + debug_draw.rectangle([(x1, y1), (x2, y2)], outline="blue", width=1) + debug_draw.text( + debug_index_position, + debug_label, + fill="blue", + font_size=font_size, + ) + + overlap = any( + is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes + ) + + if not overlap: + draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=1) + label = "~" + str(counter) + index_position = (x1, y1 - font_size) + draw.text( + index_position, + label, + fill="red", + font_size=font_size, + ) + + # Add the non-overlapping box to the drawn_boxes list + drawn_boxes.append((x1, y1, x2, y2)) + label_coordinates[label] = (x1, y1, x2, y2) + + counter += 1 + + # Save the image + timestamp = time.strftime("%Y%m%d-%H%M%S") + + output_path = os.path.join(detections_dir, f"img_{timestamp}_labeled.png") + output_path_debug = os.path.join(detections_dir, f"img_{timestamp}_debug.png") + output_path_original = os.path.join(detections_dir, f"img_{timestamp}_original.png") + + image_labeled.save(output_path) + image_debug.save(output_path_debug) + image_original.save(output_path_original) + + buffered_original = io.BytesIO() + image_original.save(buffered_original, format="PNG") # I guess this is needed + img_base64_original = base64.b64encode(buffered_original.getvalue()).decode("utf-8") + + # Convert image to base64 for return + buffered_labeled = io.BytesIO() + image_labeled.save(buffered_labeled, format="PNG") # I guess this is needed + img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode("utf-8") + + return img_base64_labeled, img_base64_original, label_coordinates + + +def parse_click_content(message_content): + """ + Parses the response message to determine if it's a CLICK or NONE action and returns the appropriate data. + + :param message_content: The content of the response message. + :return: A dictionary with the relevant data or a message indicating a NONE action. 
+ """ + try: + # Check for and remove erroneous ```json at the start and ``` at the end + if message_content.startswith("```json"): + message_content = message_content[ + len("```json") : + ] # Remove starting ```json + if message_content.endswith("```"): + message_content = message_content[: -len("```")] # Remove ending ``` + + # Convert JSON string to dictionary + return json.loads(message_content.strip()) + except json.JSONDecodeError as e: + return {"error": "Invalid JSON format"} + + return {"error": "Invalid response format"} + + +def get_click_position_in_percent(coordinates, image_size): + """ + Calculates the click position at the center of the bounding box and converts it to percentages. + + :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2). + :param image_size: A tuple of the image dimensions (width, height). + :return: A tuple of the click position in percentages (x_percent, y_percent). + """ + if not coordinates or not image_size: + return None + + # Calculate the center of the bounding box + x_center = (coordinates[0] + coordinates[2]) / 2 + y_center = (coordinates[1] + coordinates[3]) / 2 + + # Convert to percentages + x_percent = (x_center / image_size[0]) * 100 + y_percent = (y_center / image_size[1]) * 100 + + return x_percent, y_percent diff --git a/operate/utils/utils.py b/operate/utils/misc.py similarity index 98% rename from operate/utils/utils.py rename to operate/utils/misc.py index e68db46e..6959d4d8 100644 --- a/operate/utils/utils.py +++ b/operate/utils/misc.py @@ -1,5 +1,7 @@ import json import re + + def convert_percent_to_decimal(percent_str): """ Converts a percentage string to a decimal value. @@ -26,7 +28,6 @@ def convert_percent_to_decimal(percent_str): except ValueError as e: print(f"Error converting percent to decimal: {e}") return None - def extract_json_from_string(s): @@ -52,7 +53,8 @@ def extract_json_from_string(s): except Exception as e: print(f"Error parsing JSON: {e}") return None - + + def parse_response(response): """ Parses the given response and returns a dictionary with the type and data. @@ -97,4 +99,4 @@ def parse_response(response): search_data = re.search(r"SEARCH (.+)", response).group(1) return {"type": "SEARCH", "data": search_data} - return {"type": "UNKNOWN", "data": response} \ No newline at end of file + return {"type": "UNKNOWN", "data": response} diff --git a/operate/utils/action_util.py b/operate/utils/os.py similarity index 97% rename from operate/utils/action_util.py rename to operate/utils/os.py index 47b21266..98d05c11 100644 --- a/operate/utils/action_util.py +++ b/operate/utils/os.py @@ -1,8 +1,10 @@ import pyautogui -import platform +import platform import time import math -from operate.utils.utils import convert_percent_to_decimal + +from operate.utils.misc import convert_percent_to_decimal + def keyboard_type(text): """ @@ -20,6 +22,7 @@ def keyboard_type(text): pyautogui.press("enter") return "Type: " + text + def search(text): """ Searches for a program or file by typing the given text in the search bar and pressing Enter. @@ -49,6 +52,32 @@ def search(text): pyautogui.press("enter") return "Open program: " + text + +def click(click_detail): + """ + Perform a mouse click at the specified coordinates. + + Args: + click_detail (dict): A dictionary containing the coordinates of the click. + + Returns: + str: The description of the click if successful, otherwise "We failed to click". 
+ """ + try: + x = convert_percent_to_decimal(click_detail["x"]) + y = convert_percent_to_decimal(click_detail["y"]) + + if click_detail and isinstance(x, float) and isinstance(y, float): + click_at_percentage(x, y) + return click_detail["description"] + else: + return "We failed to click" + + except Exception as e: + print(f"Error parsing JSON: {e}") + return "We failed to click" + + def click_at_percentage( x_percentage, y_percentage, duration=0.2, circle_radius=50, circle_duration=0.5 ): @@ -88,31 +117,6 @@ def click_at_percentage( return "Successfully clicked" -def mouse_click(click_detail): - """ - Perform a mouse click at the specified coordinates. - - Args: - click_detail (dict): A dictionary containing the coordinates of the click. - - Returns: - str: The description of the click if successful, otherwise "We failed to click". - """ - try: - x = convert_percent_to_decimal(click_detail["x"]) - y = convert_percent_to_decimal(click_detail["y"]) - - if click_detail and isinstance(x, float) and isinstance(y, float): - click_at_percentage(x, y) - return click_detail["description"] - else: - return "We failed to click" - - except Exception as e: - print(f"Error parsing JSON: {e}") - return "We failed to click" - - def get_last_assistant_message(messages): """ Retrieve the last message from the assistant in the messages array. @@ -125,4 +129,3 @@ def get_last_assistant_message(messages): else: return messages[index] return None # Return None if no assistant message is found - diff --git a/operate/utils/prompt_util.py b/operate/utils/prompt_util.py deleted file mode 100644 index a2f87996..00000000 --- a/operate/utils/prompt_util.py +++ /dev/null @@ -1,49 +0,0 @@ -from prompt_toolkit.styles import Style as PromptStyle -from operate.prompts.prompt import VISION_PROMPT, ACCURATE_PIXEL_COUNT, ACCURATE_MODE_VISION_PROMPT, SUMMARY_PROMPT -from operate.config.settings import Config - -# Load settings -config = Config() -monitor_size = config.monitor_size - -# Define style -style = PromptStyle.from_dict( - { - "dialog": "bg:#88ff88", - "button": "bg:#ffffff #000000", - "dialog.body": "bg:#44cc44 #ffffff", - "dialog shadow": "bg:#003800", - } -) - - -def format_summary_prompt(objective): - """ - Format the summary prompt - """ - prompt = SUMMARY_PROMPT.format(objective=objective) - return prompt - - -def format_vision_prompt(objective, previous_action): - """ - Format the vision prompt - """ - if previous_action: - previous_action = f"Here was the previous action you took: {previous_action}" - else: - previous_action = "" - prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action) - return prompt - - -def format_accurate_mode_vision_prompt(prev_x, prev_y): - """ - Format the accurate mode vision prompt - """ - width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100 - height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100 - prompt = ACCURATE_MODE_VISION_PROMPT.format( - prev_x=prev_x, prev_y=prev_y, width=width, height=height - ) - return prompt \ No newline at end of file diff --git a/operate/utils/screenshot_util.py b/operate/utils/screenshot.py similarity index 87% rename from operate/utils/screenshot_util.py rename to operate/utils/screenshot.py index 4b362a90..087416ba 100644 --- a/operate/utils/screenshot_util.py +++ b/operate/utils/screenshot.py @@ -6,13 +6,14 @@ import Xlib.display import Xlib.X import Xlib.Xutil # not sure if Xutil is necessary -from operate.config.settings import Config -from operate.prompts.prompt import 
ACCURATE_PIXEL_COUNT +from operate.settings import Config +from operate.prompts import ACCURATE_PIXEL_COUNT # Load configuration config = Config() monitor_size = config.monitor_size + def add_grid_to_image(original_image_path, new_image_path, grid_interval): """ Add a grid to an image. @@ -46,17 +47,14 @@ def draw_label_with_background( position, text, draw, font_size, bg_width, bg_height ): # Adjust the position based on the background size - text_position = (position[0] + bg_width // 2, - position[1] + bg_height // 2) + text_position = (position[0] + bg_width // 2, position[1] + bg_height // 2) # Draw the text background draw.rectangle( - [position[0], position[1], position[0] + - bg_width, position[1] + bg_height], + [position[0], position[1], position[0] + bg_width, position[1] + bg_height], fill="white", ) # Draw the text - draw.text(text_position, text, fill="black", - font_size=font_size, anchor="mm") + draw.text(text_position, text, fill="black", font_size=font_size, anchor="mm") # Draw vertical lines and labels at every `grid_interval` pixels for x in range(grid_interval, width, grid_interval): @@ -107,10 +105,8 @@ def capture_mini_screenshot_with_cursor( y = (y / 100) * monitor_size["height"] # Define the coordinates for the rectangle - x1, y1 = int(x - ACCURATE_PIXEL_COUNT / - 2), int(y - ACCURATE_PIXEL_COUNT / 2) - x2, y2 = int(x + ACCURATE_PIXEL_COUNT / - 2), int(y + ACCURATE_PIXEL_COUNT / 2) + x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2) + x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2) screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2)) screenshot = screenshot.resize( @@ -135,8 +131,7 @@ def capture_mini_screenshot_with_cursor( ] # convert x from 50 to 0.5 * monitor_width y = (y / 100) * monitor_size["height"] - x1, y1 = int(x - ACCURATE_PIXEL_COUNT / - 2), int(y - ACCURATE_PIXEL_COUNT / 2) + x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2) width = ACCURATE_PIXEL_COUNT height = ACCURATE_PIXEL_COUNT @@ -184,5 +179,4 @@ def capture_screen_with_cursor(file_path): # Use the screencapture utility to capture the screen with the cursor subprocess.run(["screencapture", "-C", file_path]) else: - print( - f"The platform you're using ({user_platform}) is not currently supported") + print(f"The platform you're using ({user_platform}) is not currently supported") diff --git a/operate/utils/style.py b/operate/utils/style.py new file mode 100644 index 00000000..2948582f --- /dev/null +++ b/operate/utils/style.py @@ -0,0 +1,36 @@ +import sys +import platform +import os +from prompt_toolkit.styles import Style as PromptStyle + + +# Define style +style = PromptStyle.from_dict( + { + "dialog": "bg:#88ff88", + "button": "bg:#ffffff #000000", + "dialog.body": "bg:#44cc44 #ffffff", + "dialog shadow": "bg:#003800", + } +) + + +# Check if on a windows terminal that supports ANSI escape codes +def supports_ansi(): + """ + Check if the terminal supports ANSI escape codes + """ + plat = platform.system() + supported_platform = plat != "Windows" or "ANSICON" in os.environ + is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + return supported_platform and is_a_tty + + +# Define ANSI color codes +ANSI_GREEN = "\033[32m" if supports_ansi() else "" # Standard green text +ANSI_BRIGHT_GREEN = "\033[92m" if supports_ansi() else "" # Bright/bold green text +ANSI_RESET = "\033[0m" if supports_ansi() else "" # Reset to default text color +ANSI_BLUE = "\033[94m" if supports_ansi() else "" # Bright blue 
+ANSI_YELLOW = "\033[33m" if supports_ansi() else "" # Standard yellow text +ANSI_RED = "\033[31m" if supports_ansi() else "" +ANSI_BRIGHT_MAGENTA = "\033[95m" if supports_ansi() else "" # Bright magenta text diff --git a/requirements.txt b/requirements.txt index 558510ec..2c796cd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,4 +47,6 @@ typing_extensions==4.8.0 urllib3==2.0.7 wcwidth==0.2.9 zipp==3.17.0 -google-generativeai==0.3.0 \ No newline at end of file +google-generativeai==0.3.0 +aiohttp==3.9.1 +ultralytics==8.0.227 \ No newline at end of file
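
The new `gpt-4-with-som` path is built on `operate/utils/label.py`: `add_labels` runs the bundled YOLO weights over a screenshot, draws red `~N` labels on non-overlapping detections, and returns the labeled image, the original image, and a label-to-bounding-box map; `get_label_coordinates` and `get_click_position_in_percent` then convert the model's chosen label back into the percentage coordinates the `CLICK` action expects. A self-contained sketch of that conversion follows, reusing the weights path from `operate/actions.py`; the screenshot location and the `~3` label are illustrative assumptions.

```python
import base64
import io

from PIL import Image
from ultralytics import YOLO

from operate.utils.label import (
    add_labels,
    get_click_position_in_percent,
    get_label_coordinates,
)

yolo_model = YOLO("./operate/model/weights/best.pt")  # path used in operate/actions.py

with open("screenshots/screenshot.png", "rb") as f:  # assumed screenshot location
    img_base64 = base64.b64encode(f.read()).decode("utf-8")

labeled_b64, original_b64, label_coordinates = add_labels(img_base64, yolo_model)

# Suppose the vision model picked label "~3"; map its bounding box to screen percentages.
coords = get_label_coordinates("~3", label_coordinates)  # None if "~3" was not detected
image_size = Image.open(io.BytesIO(base64.b64decode(original_b64))).size
click_position = get_click_position_in_percent(coords, image_size)
if click_position:
    x_percent, y_percent = click_position
    print(f'CLICK {{ "x": "{x_percent:.2f}%", "y": "{y_percent:.2f}%" }}')
```
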
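
`call_gpt_4_v_labeled` fans out two requests in parallel — the labeled-click prompt against the annotated screenshot and the decision prompt against the untouched one — using `asyncio.gather` over the `aiohttp` helper, and it only acts on the click answer when the decision answer starts with `CLICK`. A minimal sketch of that fan-out is shown below, assuming an OpenAI key is configured as in `operate/settings.py`; `ask_both` is an illustrative wrapper, not part of the codebase.

```python
import asyncio

from operate.actions import fetch_openai_response_async


async def ask_both(click_messages, decision_messages):
    # Issue both chat-completion requests concurrently instead of sequentially.
    click_response, decision_response = await asyncio.gather(
        fetch_openai_response_async(click_messages),
        fetch_openai_response_async(decision_messages),
    )
    # Each response is the raw JSON dict returned by the API; index into it the
    # same way call_gpt_4_v_labeled does.
    click_content = click_response["choices"][0]["message"]["content"]
    decision_content = decision_response["choices"][0]["message"]["content"]
    return click_content, decision_content
```
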