From 94115a03486e322cf94b176354f5109db1debee9 Mon Sep 17 00:00:00 2001 From: Lance Martin Date: Sat, 30 Nov 2024 10:18:22 -0800 Subject: [PATCH] Test ntbk --- company_maistro.ipynb | 898 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 898 insertions(+) create mode 100644 company_maistro.ipynb diff --git a/company_maistro.ipynb b/company_maistro.ipynb new file mode 100644 index 0000000..6c916c9 --- /dev/null +++ b/company_maistro.ipynb @@ -0,0 +1,898 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import asyncio\n", + "import operator\n", + "import json\n", + "\n", + "from tavily import TavilyClient, AsyncTavilyClient\n", + "\n", + "from langchain_anthropic import ChatAnthropic\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from langchain_core.runnables import RunnableConfig\n", + "from langsmith import traceable\n", + "\n", + "from langgraph.constants import Send\n", + "from langgraph.graph import START, END, StateGraph\n", + "\n", + "from pydantic import BaseModel, Field\n", + "from typing_extensions import Annotated, Any, List, Optional, Literal\n", + "from dataclasses import dataclass, field\n", + "\n", + "import configuration\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# LLMs\n", + "gpt_4o = ChatOpenAI(model=\"gpt-4o\", temperature=0)\n", + "claude_3_5_sonnet = ChatAnthropic(model=\"claude-3-5-sonnet-20240620\", temperature=0)\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Search\n", + "tavily_client = TavilyClient()\n", + "tavily_async_client = AsyncTavilyClient()\n", + "\n", + "# 
-----------------------------------------------------------------------------\n", + "# Utils\n", + "@traceable\n", + "async def tavily_search_async(search_queries, tavily_topic, tavily_days):\n", + " \"\"\"\n", + " Performs concurrent web searches using the Tavily API.\n", + "\n", + " Args:\n", + " search_queries (List[SearchQuery]): List of search queries to process\n", + " tavily_topic (str): Type of search to perform ('news' or 'general')\n", + " tavily_days (int): Number of days to look back for news articles (only used when tavily_topic='news')\n", + "\n", + " Returns:\n", + " List[dict]: List of search results from Tavily API, one per query\n", + "\n", + " Note:\n", + " For news searches, each result will include articles from the last `tavily_days` days.\n", + " For general searches, the time range is unrestricted.\n", + " \"\"\"\n", + "\n", + " search_tasks = []\n", + " for query in search_queries:\n", + " if tavily_topic == \"news\":\n", + " search_tasks.append(\n", + " tavily_async_client.search(\n", + " query,\n", + " max_results=5,\n", + " include_raw_content=True,\n", + " topic=\"news\",\n", + " days=tavily_days,\n", + " )\n", + " )\n", + " else:\n", + " search_tasks.append(\n", + " tavily_async_client.search(\n", + " query, max_results=5, include_raw_content=True, topic=\"general\"\n", + " )\n", + " )\n", + "\n", + " # Execute all searches concurrently\n", + " search_docs = await asyncio.gather(*search_tasks)\n", + "\n", + " return search_docs\n", + "\n", + "\n", + "def deduplicate_and_format_sources(\n", + " search_response, max_tokens_per_source, include_raw_content=True\n", + "):\n", + " \"\"\"\n", + " Takes either a single search response or list of responses from Tavily API and formats them.\n", + " Limits the raw_content to approximately max_tokens_per_source.\n", + " include_raw_content specifies whether to include the raw_content from Tavily in the formatted string.\n", + "\n", + " Args:\n", + " search_response: Either:\n", + " - A dict with a 
'results' key containing a list of search results\n", + " - A list of dicts, each containing search results\n", + "\n", + " Returns:\n", + " str: Formatted string with deduplicated sources\n", + " \"\"\"\n", + " # Convert input to list of results\n", + " if isinstance(search_response, dict):\n", + " sources_list = search_response[\"results\"]\n", + " elif isinstance(search_response, list):\n", + " sources_list = []\n", + " for response in search_response:\n", + " if isinstance(response, dict) and \"results\" in response:\n", + " sources_list.extend(response[\"results\"])\n", + " else:\n", + " sources_list.extend(response)\n", + " else:\n", + " raise ValueError(\n", + " \"Input must be either a dict with 'results' or a list of search results\"\n", + " )\n", + "\n", + " # Deduplicate by URL\n", + " unique_sources = {}\n", + " for source in sources_list:\n", + " if source[\"url\"] not in unique_sources:\n", + " unique_sources[source[\"url\"]] = source\n", + "\n", + " # Format output\n", + " formatted_text = \"Sources:\\n\\n\"\n", + " for i, source in enumerate(unique_sources.values(), 1):\n", + " formatted_text += f\"Source {source['title']}:\\n===\\n\"\n", + " formatted_text += f\"URL: {source['url']}\\n===\\n\"\n", + " formatted_text += (\n", + " f\"Most relevant content from source: {source['content']}\\n===\\n\"\n", + " )\n", + " if include_raw_content:\n", + " # Using rough estimate of 4 characters per token\n", + " char_limit = max_tokens_per_source * 4\n", + " # Handle None raw_content\n", + " raw_content = source.get(\"raw_content\", \"\")\n", + " if raw_content is None:\n", + " raw_content = \"\"\n", + " print(f\"Warning: No raw_content found for source {source['url']}\")\n", + " if len(raw_content) > char_limit:\n", + " raw_content = raw_content[:char_limit] + \"... 
[truncated]\"\n", + " formatted_text += f\"Full source content limited to {max_tokens_per_source} tokens: {raw_content}\\n\\n\"\n", + "\n", + " return formatted_text.strip()\n", + "\n", + "\n", + "def format_all_notes(completed_notes: list[str]) -> str:\n", + " \"\"\"Format a list of notes into a string\"\"\"\n", + " formatted_str = \"\"\n", + " for idx, company_notes in enumerate(completed_notes, 1):\n", + " formatted_str += f\"\"\"\n", + "{'='*60}\n", + "Note: {idx}:\n", + "{'='*60}\n", + "Notes from research:\n", + "{company_notes}\"\"\"\n", + " return formatted_str\n", + "\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Schema\n", + "class SearchQuery(BaseModel):\n", + " search_query: str = Field(None, description=\"Query for web search.\")\n", + "\n", + "\n", + "class Queries(BaseModel):\n", + " queries: List[SearchQuery] = Field(\n", + " description=\"List of search queries.\",\n", + " )\n", + "\n", + "\n", + "DEFAULT_EXTRACTION_SCHEMA = {\n", + " \"title\": \"CompanyInfo\",\n", + " \"description\": \"Basic information about a company\",\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"company_name\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Official name of the company\"\n", + " },\n", + " \"founding_year\": {\n", + " \"type\": \"integer\",\n", + " \"description\": \"Year the company was founded\"\n", + " },\n", + " \"founder_names\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\"type\": \"string\"},\n", + " \"description\": \"Names of the founding team members\"\n", + " },\n", + " \"product_description\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Brief description of the company's main product or service\"\n", + " },\n", + " \"funding_summary\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Summary of the company's funding history\"\n", + " }\n", + " },\n", + " \"required\": [\"company_name\"]\n", + "}\n", + "\n", + 
"@dataclass(kw_only=True)\n", + "class InputState:\n", + " \"\"\"Input state defines the interface between the graph and the user (external API).\"\"\"\n", + "\n", + " company: str\n", + " \"Company to research provided by the user.\"\n", + "\n", + " extraction_schema: dict[str, Any] = field(\n", + " default_factory=lambda: DEFAULT_EXTRACTION_SCHEMA\n", + " )\n", + " \"The json schema defines the information the agent is tasked with filling out.\"\n", + "\n", + " user_notes: Optional[dict[str, Any]] = field(default=None)\n", + " \"Any notes from the user to start the research process.\"\n", + "\n", + "\n", + "@dataclass(kw_only=True)\n", + "class OverallState:\n", + " \"\"\"Input state defines the interface between the graph and the user (external API).\"\"\"\n", + "\n", + " company: str\n", + " \"Company to research provided by the user.\"\n", + "\n", + " extraction_schema: dict[str, Any] = field(\n", + " default_factory=lambda: DEFAULT_EXTRACTION_SCHEMA\n", + " )\n", + " \"The json schema defines the information the agent is tasked with filling out.\"\n", + "\n", + " user_notes: str = field(default=None)\n", + " \"Any notes from the user to start the research process.\"\n", + "\n", + " completed_notes: Annotated[list, operator.add] = field(default_factory=list)\n", + " \"Notes from completed research related to the schema\"\n", + "\n", + " info: dict[str, Any] = field(default=None)\n", + " \"\"\"\n", + " A dictionary containing the extracted and processed information\n", + " based on the user's query and the graph's execution.\n", + " This is the primary output of the enrichment process.\n", + " \"\"\"\n", + "\n", + " is_satisfactory: bool = field(default=None)\n", + " \"True if all required fields are well populated, False otherwise\"\n", + "\n", + " reflection_search_queries: list[str] = field(default=None)\n", + " \"If is_satisfactory is False, provide targeted search queries to find the missing information\"\n", + "\n", + " reflection_steps_taken: int = 
field(default=0)\n", + " \"Number of times the reflection node has been executed\"\n", + "\n", + "@dataclass(kw_only=True)\n", + "class OutputState:\n", + " \"\"\"The response object for the end user.\n", + "\n", + " This class defines the structure of the output that will be provided\n", + " to the user after the graph's execution is complete.\n", + " \"\"\"\n", + "\n", + " info: dict[str, Any]\n", + " \"\"\"\n", + " A dictionary containing the extracted and processed information\n", + " based on the user's query and the graph's execution.\n", + " This is the primary output of the enrichment process.\n", + " \"\"\"\n", + "\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Prompts\n", + "\n", + "extraction_prompt = \"\"\"Your task is to take notes gather from web research\n", + "\n", + "and extract them into the following schema. \n", + "\n", + "\n", + "{info}\n", + "\n", + "\n", + "Here are all the notes from research:\n", + "\n", + "\n", + "{notes}\n", + "\n", + " \"\"\"\n", + "\n", + "query_writer_instructions = \"\"\"You are a search query generator tasked with creating targeted search queries to gather specific company information.\n", + "\n", + "Here is the company you are researching: {company}\n", + "\n", + "Generate at most {max_search_queries} search queries that will help gather the following information:\n", + "\n", + "\n", + "{info}\n", + "\n", + "\n", + "Your query should:\n", + "1. Focus on finding factual, up-to-date company information\n", + "2. Target official sources, news, and reliable business databases\n", + "3. Prioritize finding information that matches the schema requirements\n", + "4. Include the company name and relevant business terms\n", + "5. 
Be specific enough to avoid irrelevant results\n", + "\n", + "Create a focused query that will maximize the chances of finding schema-relevant information.\"\"\"\n", + "\n", + "_INFO_PROMPT = \"\"\"You are doing web research on a company, {company}. \n", + "\n", + "The following schema shows the type of information we're interested in:\n", + "\n", + "\n", + "{info}\n", + "\n", + "\n", + "You have just scraped website content. Your task is to take clear, organized notes about the company, focusing on topics relevant to our interests.\n", + "\n", + "\n", + "{content}\n", + "\n", + "\n", + "Here are any additional notes from the user:\n", + "\n", + "{user_notes}\n", + "\n", + "\n", + "Please provide detailed research notes that:\n", + "1. Are well-organized and easy to read\n", + "2. Focus on topics mentioned in the schema\n", + "3. Include specific facts, dates, and figures when available\n", + "4. Maintain accuracy of the original content\n", + "5. Note when important information appears to be missing or unclear\n", + "\n", + "Remember: Don't try to format the output to match the schema - just take clear notes that capture all relevant information.\"\"\"\n", + "\n", + "REFLECTION_PROMPT = \"\"\"You are a research analyst tasked with reviewing the quality and completeness of extracted company information.\n", + "\n", + "Compare the extracted information with the required schema:\n", + "\n", + "\n", + "{schema}\n", + "\n", + "\n", + "Here is the extracted information:\n", + "\n", + "{info}\n", + "\n", + "\n", + "Analyze if all required fields are present and sufficiently populated. Consider:\n", + "1. Are any required fields missing?\n", + "2. Are any fields incomplete or containing uncertain information?\n", + "3. 
Are there fields with placeholder values or \"unknown\" markers?\n", + "\n", + "Return a structured response that has the following fields:\n", + "- \"is_satisfactory\": boolean, # True if all required fields are well populated, False otherwise\n", + "- \"missing_fields\": [string], # List of field names that are missing or incomplete\n", + "- \"reflection_search_queries\": [string], # If is_satisfactory is False, provide {max_search_queries} targeted search queries to find the missing information\n", + "- \"reasoning\": string # Brief explanation of your assessment\n", + "\n", + "\"\"\"\n", + "\n", + "class ReflectionOutput(BaseModel):\n", + " is_satisfactory: bool = Field(\n", + " description=\"True if all required fields are well populated, False otherwise\"\n", + " )\n", + " missing_fields: List[str] = Field(\n", + " description=\"List of field names that are missing or incomplete\"\n", + " )\n", + " reflection_search_queries: List[str] = Field(\n", + " description=\"If is_satisfactory is False, provide 1-3 targeted search queries to find the missing information\"\n", + " )\n", + " reasoning: str = Field(\n", + " description=\"Brief explanation of the assessment\"\n", + " )\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Nodes\n", + "\n", + "async def research_company(state: OverallState, config: RunnableConfig) -> str:\n", + " \"\"\"Execute a multi-step web search and information extraction process.\n", + "\n", + " This function performs the following steps:\n", + " 1. Generates multiple search queries based on the input query\n", + " 2. Executes concurrent web searches using the Tavily API\n", + " 3. Deduplicates and formats the search results\n", + " 4. 
Extracts structured information based on the provided schema\n", + "\n", + " Args:\n", + " state: Graph state containing the company, the extraction schema, user notes,\n", + " and any search queries produced by a previous reflection step\n", + " config: Runtime configuration for the search process\n", + "\n", + " Returns:\n", + " dict: State update whose 'completed_notes' list holds the structured notes\n", + " from the search results that are relevant to state.extraction_schema\n", + "\n", + " Note:\n", + " The function uses concurrent execution for multiple search queries to improve\n", + " performance and combines results from various sources for comprehensive coverage.\n", + " \"\"\"\n", + "\n", + " # Get configuration\n", + " configurable = configuration.Configuration.from_runnable_config(config)\n", + " max_search_queries = configurable.max_search_queries\n", + " max_search_results = configurable.max_search_results\n", + "\n", + " # Initialize search client\n", + " tavily_async_client = AsyncTavilyClient()\n", + "\n", + " # Generate search queries\n", + " structured_llm = claude_3_5_sonnet.with_structured_output(Queries)\n", + "\n", + " # Check reflection output - access attribute directly\n", + " reflection_output = getattr(state, \"is_satisfactory\", None)\n", + " reflection_queries = getattr(state, \"reflection_search_queries\", None)\n", + " \n", + " # If we have performed reflection and have new search queries \n", + " if reflection_output is not None and reflection_queries:\n", + " # Get generated search queries\n", + " query_list = reflection_queries\n", + " else:\n", + " # Format system instructions\n", + " query_instructions = query_writer_instructions.format(\n", + " company=state.company,\n", + " info=json.dumps(state.extraction_schema, indent=2),\n", + " max_search_queries=max_search_queries,\n", + " )\n", + "\n", + " # Generate queries\n", + " results = structured_llm.invoke(\n", + " [SystemMessage(content=query_instructions)]\n", + " + [\n", + " HumanMessage(\n", + " content=f\"Please generate a list of 
search queries related to the schema that you want to populate.\"\n", + " )\n", + " ]\n", + " )\n", + "\n", + " # Queries\n", + " query_list = [query.search_query for query in results.queries]\n", + "\n", + " # Search tasks\n", + " search_tasks = []\n", + " for query in query_list:\n", + " search_tasks.append(\n", + " tavily_async_client.search(\n", + " query,\n", + " max_results=max_search_results,\n", + " include_raw_content=True,\n", + " topic=\"general\",\n", + " )\n", + " )\n", + "\n", + " # Execute all searches concurrently\n", + " search_docs = await asyncio.gather(*search_tasks)\n", + "\n", + " # Deduplicate and format sources\n", + " source_str = deduplicate_and_format_sources(\n", + " search_docs, max_tokens_per_source=1000, include_raw_content=True\n", + " )\n", + "\n", + " # Generate structured notes relevant to the extraction schema\n", + " p = _INFO_PROMPT.format(\n", + " info=json.dumps(state.extraction_schema, indent=2),\n", + " content=source_str,\n", + " company=state.company,\n", + " user_notes=state.user_notes,\n", + " )\n", + " result = await claude_3_5_sonnet.ainvoke(p)\n", + " return {\"completed_notes\": [str(result.content)]}\n", + "\n", + "def gather_notes_extract_schema(state: OverallState) -> dict[str, Any]:\n", + " \"\"\"Gather notes from the web search and extract the schema fields.\"\"\"\n", + "\n", + " # Format all notes\n", + " notes = format_all_notes(state.completed_notes)\n", + "\n", + " # Extract schema fields\n", + " system_prompt = extraction_prompt.format(\n", + " info=json.dumps(state.extraction_schema, indent=2), notes=notes\n", + " )\n", + " structured_llm = claude_3_5_sonnet.with_structured_output(state.extraction_schema)\n", + " result = structured_llm.invoke(\n", + " [\n", + " SystemMessage(content=system_prompt),\n", + " HumanMessage(content=f\"Produce a structured output from these notes.\"),\n", + " ]\n", + " )\n", + " return {\"info\": result}\n", + "\n", + "def reflection(state: OverallState, config: 
RunnableConfig) -> dict[str, Any]:\n", + " \"\"\"Reflect on the extracted information and generate search queries to find missing information.\"\"\"\n", + "\n", + " # Get configuration\n", + " configurable = configuration.Configuration.from_runnable_config(config)\n", + "\n", + " # Generate search queries\n", + " structured_llm = claude_3_5_sonnet.with_structured_output(ReflectionOutput)\n", + "\n", + " # Format reflection prompt\n", + " system_prompt = REFLECTION_PROMPT.format(schema=json.dumps(state.extraction_schema, indent=2), \n", + " info=state.info, \n", + " max_search_queries=configurable.max_search_queries)\n", + "\n", + " # Invoke\n", + " result = structured_llm.invoke(\n", + " [\n", + " SystemMessage(content=system_prompt),\n", + " HumanMessage(content=f\"Produce a structured reflection output.\"),\n", + " ]\n", + " )\n", + "\n", + " if result.is_satisfactory:\n", + " return {\"is_satisfactory\":result.is_satisfactory}\n", + " else:\n", + " return {\"is_satisfactory\":result.is_satisfactory, \n", + " \"reflection_search_queries\": result.reflection_search_queries,\n", + " \"reflection_steps_taken\": state.reflection_steps_taken + 1}\n", + "\n", + "def route_from_reflection(state: OverallState, config: RunnableConfig) -> Literal[\"__end__\", \"research_company\"]:\n", + " \"\"\"Route the graph based on the reflection output.\"\"\"\n", + "\n", + " # Get configuration\n", + " configurable = configuration.Configuration.from_runnable_config(config)\n", + " \n", + " # If we have satisfactory results, end the process\n", + " if state.is_satisfactory:\n", + " return END\n", + " \n", + " # If results aren't satisfactory but we haven't hit max steps, continue research\n", + " if state.reflection_steps_taken <= configurable.max_reflection_steps:\n", + " return \"research_company\"\n", + " \n", + " # If we've exceeded max steps, end even if not satisfactory\n", + " return END\n", + "\n", + "# Add nodes and edges\n", + "builder = StateGraph(\n", + " OverallState,\n", 
+ " input=InputState,\n", + " output=OutputState,\n", + " config_schema=configuration.Configuration,\n", + ")\n", + "builder.add_node(\"gather_notes_extract_schema\", gather_notes_extract_schema)\n", + "builder.add_node(\"research_company\", research_company)\n", + "builder.add_node(\"reflection\", reflection)\n", + "\n", + "builder.add_edge(START, \"research_company\")\n", + "builder.add_edge(\"research_company\", \"gather_notes_extract_schema\")\n", + "builder.add_edge(\"gather_notes_extract_schema\", \"reflection\")\n", + "builder.add_conditional_edges(\"reflection\", route_from_reflection)\n", + "\n", + "# Compile\n", + "graph = builder.compile()\n", + "\n", + "# View\n", + "from IPython.display import Image, display\n", + "display(Image(graph.get_graph(xray=1).draw_mermaid_png()))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: No raw_content found for source https://blogs.oracle.com/ateam/post/oci-generative-ai-integration-with-langchain-usecases\n" + ] + }, + { + "data": { + "text/plain": [ + "{'info': {'company_name': 'LangChain',\n", + " 'founding_year': 2022,\n", + " 'founder_names': ['Harrison Chase', 'Ankush Gola'],\n", + " 'product_description': 'LangChain is an open-source framework for developing applications using large language models (LLMs). It provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents. The framework offers flexible abstractions and an AI-first toolkit for building context-aware reasoning applications, serving as a generic interface for nearly any LLM. It allows integration with external data sources and software workflows, and its modular approach enables developers to compare different prompts and foundation models with minimal code rewriting.',\n", + " 'funding_summary': 'LangChain has raised a total of $35 million. 
This includes a $25 million Series A round led by Sequoia Capital on February 15, 2024, and a $10 million Seed round led by Benchmark on April 4, 2023. The company\\'s valuation was reported to be \"at least $200 million\" as of April 2023.'}}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Test default schema \n", + "\n", + "# Create proper InputState instance\n", + "input_state = InputState(\n", + " company=\"LangChain\",\n", + ")\n", + "\n", + "# Invoke with the proper input state\n", + "result = await graph.ainvoke(input_state)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'company': 'langchain',\n", + " 'completed_notes': [\"Research Notes: LangChain\\n\\nCompany Name:\\n- LangChain\\n\\nFounding Year:\\n- 2022 (mentioned in multiple sources)\\n\\nFounders:\\n- Harrison Chase (Co-Founder & CEO)\\n- Ankush Gola (Co-Founder)\\n\\nProduct Description:\\n- Open source framework for developing applications using large language models (LLMs)\\n- Provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents\\n- Offers flexible abstractions and an AI-first toolkit\\n- Helps connect LLMs to private data sources and APIs to create context-aware, reasoning applications\\n- Includes integrations with various cloud storage, APIs, databases, and LLM providers\\n\\nFunding Summary:\\n- Total raised: $35 million (as of July 2023)\\n- Latest round: $25 million Series A (April 2023)\\n- Led by Sequoia Capital\\n- Previous round: $10 million seed round led by Benchmark (announced April 4, 2023)\\n- Valuation: At least $200 million (as of April 2023)\\n\\nAdditional Notes:\\n- Headquarters: San Francisco, California\\n- Launched in October 2022, quickly gained popularity among developers\\n- As of June 2023, had 20,000+ developers in its Discord community\\n- GitHub stats (as 
of July 2023): 55K stars, 7.1K forks, 13.9K users, 1.19K contributors\\n- In October 2023, introduced LangServe, a deployment tool\\n- Recently launched LangSmith, a paid LLMOps product for the entire LLM application lifecycle\\n- LangSmith had over 70,000 signups since closed beta launch in July 2023\\n- Used by over 5,000 companies monthly, including Rakuten, Elastic, Moody's, and Retool\\n\\nMissing/Unclear Information:\\n- Exact founding date not specified\\n- Complete list of founding team members not provided\\n- Detailed breakdown of funding rounds prior to the $25 million Series A is not clear\"],\n", + " 'info': {'company_name': 'LangChain',\n", + " 'founding_year': 2022,\n", + " 'founder_names': ['Harrison Chase', 'Ankush Gola'],\n", + " 'product_description': 'LangChain is an open source framework for developing applications using large language models (LLMs). It provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents. The framework offers flexible abstractions and an AI-first toolkit, helping connect LLMs to private data sources and APIs to create context-aware, reasoning applications. It includes integrations with various cloud storage, APIs, databases, and LLM providers.',\n", + " 'funding_summary': \"LangChain has raised a total of $35 million as of July 2023. Their latest round was a $25 million Series A in April 2023, led by Sequoia Capital. This was preceded by a $10 million seed round led by Benchmark, announced on April 4, 2023. 
The company's valuation was at least $200 million as of April 2023.\"},\n", + " 'is_satisfactory': True}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Test remote graph connected w/ Studio \n", + "\n", + "from langgraph.pregel.remote import RemoteGraph\n", + "\n", + "url = \"http://localhost:60827\"\n", + "graph_id = \"company_maistro\"\n", + "\n", + "graph = RemoteGraph(graph_id, url=url)\n", + "\n", + "# Some input to the graph\n", + "input = {\"company\":\"langchain\"}\n", + "\n", + "# Can also be a subgraph in an existing graph\n", + "response = await graph.ainvoke(input)\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'company': 'langchain', 'completed_notes': []}\n", + "{'company': 'langchain', 'completed_notes': [\"Research Notes: LangChain\\n\\nCompany Name:\\n- LangChain\\n\\nFounding Year:\\n- 2022 (mentioned in multiple sources)\\n\\nFounders:\\n- Harrison Chase (Co-Founder & CEO)\\n- Ankush Gola (Co-Founder)\\n\\nProduct Description:\\n- Open source framework for developing applications using large language models (LLMs)\\n- Provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents\\n- Offers flexible abstractions and an AI-first toolkit\\n- Helps connect LLMs to private data sources and APIs to create context-aware, reasoning applications\\n- Includes integrations with various cloud storage, APIs, databases, and LLM providers\\n\\nFunding Summary:\\n- Total raised: $35 million (as of July 2023)\\n- Latest round: $25 million Series A (April 2023)\\n- Led by Sequoia Capital\\n- Previous round: $10 million seed round led by Benchmark (announced April 4, 2023)\\n- Valuation: At least $200 million (as of April 2023)\\n\\nAdditional Notes:\\n- Headquarters: San Francisco, California\\n- Launched in October 2022, quickly 
gained popularity among developers\\n- As of June 2023, had 20,000+ developers in its Discord community\\n- GitHub stats (as of July 2023): 55K stars, 7.1K forks, 13.9K users, 1.19K contributors\\n- In October 2023, introduced LangServe, a deployment tool\\n- Recently launched LangSmith, a paid LLMOps product for the entire LLM application lifecycle\\n- LangSmith had over 70,000 signups since closed beta launch in July 2023\\n- Used by over 5,000 companies monthly, including Rakuten, Elastic, Moody's, and Retool\\n\\nMissing/Unclear Information:\\n- Exact founding date not specified\\n- Complete list of founding team members not provided\\n- Detailed breakdown of funding rounds prior to the $25 million Series A is not clear\"]}\n", + "{'company': 'langchain', 'completed_notes': [\"Research Notes: LangChain\\n\\nCompany Name:\\n- LangChain\\n\\nFounding Year:\\n- 2022 (mentioned in multiple sources)\\n\\nFounders:\\n- Harrison Chase (Co-Founder & CEO)\\n- Ankush Gola (Co-Founder)\\n\\nProduct Description:\\n- Open source framework for developing applications using large language models (LLMs)\\n- Provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents\\n- Offers flexible abstractions and an AI-first toolkit\\n- Helps connect LLMs to private data sources and APIs to create context-aware, reasoning applications\\n- Includes integrations with various cloud storage, APIs, databases, and LLM providers\\n\\nFunding Summary:\\n- Total raised: $35 million (as of July 2023)\\n- Latest round: $25 million Series A (April 2023)\\n- Led by Sequoia Capital\\n- Previous round: $10 million seed round led by Benchmark (announced April 4, 2023)\\n- Valuation: At least $200 million (as of April 2023)\\n\\nAdditional Notes:\\n- Headquarters: San Francisco, California\\n- Launched in October 2022, quickly gained popularity among developers\\n- As of June 2023, had 20,000+ developers in its Discord community\\n- GitHub stats (as of July 
2023): 55K stars, 7.1K forks, 13.9K users, 1.19K contributors\\n- In October 2023, introduced LangServe, a deployment tool\\n- Recently launched LangSmith, a paid LLMOps product for the entire LLM application lifecycle\\n- LangSmith had over 70,000 signups since closed beta launch in July 2023\\n- Used by over 5,000 companies monthly, including Rakuten, Elastic, Moody's, and Retool\\n\\nMissing/Unclear Information:\\n- Exact founding date not specified\\n- Complete list of founding team members not provided\\n- Detailed breakdown of funding rounds prior to the $25 million Series A is not clear\"], 'info': {'company_name': 'LangChain', 'founding_year': 2022, 'founder_names': ['Harrison Chase', 'Ankush Gola'], 'product_description': 'LangChain is an open source framework for developing applications using large language models (LLMs). It provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents. The framework offers flexible abstractions and an AI-first toolkit, helping connect LLMs to private data sources and APIs to create context-aware, reasoning applications. It includes integrations with various cloud storage, APIs, databases, and LLM providers.', 'funding_summary': \"LangChain has raised a total of $35 million as of July 2023. Their latest round was a $25 million Series A in April 2023, led by Sequoia Capital. This was preceded by a $10 million seed round led by Benchmark, announced on April 4, 2023. 
The company's valuation was at least $200 million as of April 2023.\"}}\n", + "{'company': 'langchain', 'completed_notes': [\"Research Notes: LangChain\\n\\nCompany Name:\\n- LangChain\\n\\nFounding Year:\\n- 2022 (mentioned in multiple sources)\\n\\nFounders:\\n- Harrison Chase (Co-Founder & CEO)\\n- Ankush Gola (Co-Founder)\\n\\nProduct Description:\\n- Open source framework for developing applications using large language models (LLMs)\\n- Provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents\\n- Offers flexible abstractions and an AI-first toolkit\\n- Helps connect LLMs to private data sources and APIs to create context-aware, reasoning applications\\n- Includes integrations with various cloud storage, APIs, databases, and LLM providers\\n\\nFunding Summary:\\n- Total raised: $35 million (as of July 2023)\\n- Latest round: $25 million Series A (April 2023)\\n- Led by Sequoia Capital\\n- Previous round: $10 million seed round led by Benchmark (announced April 4, 2023)\\n- Valuation: At least $200 million (as of April 2023)\\n\\nAdditional Notes:\\n- Headquarters: San Francisco, California\\n- Launched in October 2022, quickly gained popularity among developers\\n- As of June 2023, had 20,000+ developers in its Discord community\\n- GitHub stats (as of July 2023): 55K stars, 7.1K forks, 13.9K users, 1.19K contributors\\n- In October 2023, introduced LangServe, a deployment tool\\n- Recently launched LangSmith, a paid LLMOps product for the entire LLM application lifecycle\\n- LangSmith had over 70,000 signups since closed beta launch in July 2023\\n- Used by over 5,000 companies monthly, including Rakuten, Elastic, Moody's, and Retool\\n\\nMissing/Unclear Information:\\n- Exact founding date not specified\\n- Complete list of founding team members not provided\\n- Detailed breakdown of funding rounds prior to the $25 million Series A is not clear\"], 'info': {'company_name': 'LangChain', 'founding_year': 2022, 
'founder_names': ['Harrison Chase', 'Ankush Gola'], 'product_description': 'LangChain is an open source framework for developing applications using large language models (LLMs). It provides tools and APIs to simplify building LLM-driven applications like chatbots and virtual agents. The framework offers flexible abstractions and an AI-first toolkit, helping connect LLMs to private data sources and APIs to create context-aware, reasoning applications. It includes integrations with various cloud storage, APIs, databases, and LLM providers.', 'funding_summary': \"LangChain has raised a total of $35 million as of July 2023. Their latest round was a $25 million Series A in April 2023, led by Sequoia Capital. This was preceded by a $10 million seed round led by Benchmark, announced on April 4, 2023. The company's valuation was at least $200 million as of April 2023.\"}, 'is_satisfactory': True}\n" + ] + } + ], + "source": [ + "## Test SDK connected w/ Studio \n", + "\n", + "from langgraph_sdk import get_client\n", + "client = get_client(url=\"http://localhost:60827\")\n", + "new_thread = await client.threads.create()\n", + "async for chunk in client.runs.stream(new_thread[\"thread_id\"], \n", + " assistant_id=\"25f608e5-2c8c-565f-a5f2-39604c6d4ff4\",\n", + " input={\"company\":\"langchain\"},\n", + " stream_mode=\"values\"):\n", + "\n", + " if chunk.event == 'values':\n", + " print(chunk.data)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'info': {'company_name': 'LangChain',\n", + " 'verified_company': True,\n", + " 'company_summary': \"LangChain is a San Francisco-based technology company founded in 2022 that develops an AI-powered large language model (LLM) framework. The company has raised approximately $30-35 million in funding and was valued at over $200 million as of April 2023. 
LangChain's main products include the LangChain Framework, LangGraph Cloud, LangSmith, and LangServe, all designed to simplify the creation and deployment of LLM-based applications. With a strong focus on community engagement and rapid product evolution, LangChain has gained significant traction among developers and continues to expand its offerings in the AI industry.\",\n", + " 'key_executives': [{'name': 'Harrison Chase',\n", + " 'title': 'Co-Founder & CEO',\n", + " 'verification_date': '2023-07-01',\n", + " 'confidence_level': 'high',\n", + " 'source': 'https://craft.co/langchain'},\n", + " {'name': 'Ankush Gola',\n", + " 'title': 'Co-Founder',\n", + " 'verification_date': '2023-07-01',\n", + " 'confidence_level': 'high',\n", + " 'source': 'https://craft.co/langchain'},\n", + " {'name': 'Brie Wolfson',\n", + " 'title': 'Marketing',\n", + " 'verification_date': '2023-07-01',\n", + " 'confidence_level': 'medium',\n", + " 'source': 'https://craft.co/langchain'},\n", + " {'name': 'Nuno Campos',\n", + " 'title': 'Founding Engineer',\n", + " 'verification_date': '2023-07-01',\n", + " 'confidence_level': 'medium',\n", + " 'source': 'https://craft.co/langchain'}],\n", + " 'org_chart_summary': 'Detailed information about the organizational structure is not provided in the sources, but the company appears to have a relatively flat structure with key executives and founding members leading different aspects of the business.',\n", + " 'main_products': [{'name': 'LangChain Framework',\n", + " 'description': 'Core framework designed to simplify creation of applications using large language models (LLMs). It provides tools for connecting LLMs to other data sources and environments, enables chaining commands, and offers memory components for managing chat history.',\n", + " 'launch_date': '2022-10',\n", + " 'current_status': 'Active and continuously updated'},\n", + " {'name': 'LangGraph Cloud',\n", + " 'description': 'Infrastructure for running agents at scale. 
Used for building full-stack generative UI apps, Discord bots, and self-corrective RAG applications.',\n", + " 'launch_date': '2023',\n", + " 'current_status': 'Active'},\n", + " {'name': 'LangSmith',\n", + " 'description': 'Tool for testing, evaluating, and optimizing LLM applications. Features self-improving evaluators with human feedback integration.',\n", + " 'launch_date': '2023',\n", + " 'current_status': 'Active'},\n", + " {'name': 'LangServe',\n", + " 'description': 'Deployment tool for transitioning from LCEL prototypes to production-ready applications.',\n", + " 'launch_date': '2023-10',\n", + " 'current_status': 'Active'}]}}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Test hard schema \n", + "\n", + "HARD_EXTRACTION_SCHEMA = {\n", + " \"title\": \"CompanyInfo\",\n", + " \"description\": \"Comprehensive information about a company with confidence tracking\",\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"company_name\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Official name of the company\"\n", + " },\n", + " \"verified_company\": {\n", + " \"type\": \"boolean\",\n", + " \"description\": \"Confirmation this is the intended company, not a similarly named one\"\n", + " },\n", + " \"similar_companies\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\"type\": \"string\"},\n", + " \"description\": \"List of similarly named companies that could be confused with the target\"\n", + " },\n", + " \"distinguishing_features\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Key features that distinguish this company from similarly named ones\"\n", + " },\n", + " \"key_executives\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\"},\n", + " \"title\": {\"type\": \"string\"},\n", + " \"verification_date\": {\"type\": \"string\"},\n", + " 
\"confidence_level\": {\n", + " \"type\": \"string\",\n", + " \"enum\": [\"high\", \"medium\", \"low\", \"uncertain\"]\n", + " },\n", + " \"source\": {\"type\": \"string\"}\n", + " }\n", + " }\n", + " },\n", + " \"org_chart_summary\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Brief description of organizational structure\"\n", + " },\n", + " \"leadership_caveats\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Any uncertainties or caveats about leadership information\"\n", + " },\n", + " \"main_products\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\"},\n", + " \"description\": {\"type\": \"string\"},\n", + " \"launch_date\": {\"type\": \"string\"},\n", + " \"current_status\": {\"type\": \"string\"}\n", + " }\n", + " }\n", + " },\n", + " \"services\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\"},\n", + " \"description\": {\"type\": \"string\"},\n", + " \"target_market\": {\"type\": \"string\"}\n", + " }\n", + " }\n", + " },\n", + " \"recent_developments\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"date\": {\"type\": \"string\"},\n", + " \"title\": {\"type\": \"string\"},\n", + " \"summary\": {\"type\": \"string\"},\n", + " \"source_url\": {\"type\": \"string\"},\n", + " \"significance\": {\"type\": \"string\"}\n", + " }\n", + " },\n", + " \"description\": \"Major news and developments from the last 6 months\"\n", + " },\n", + " \"historical_challenges\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"issue_type\": {\"type\": \"string\"},\n", + " \"description\": {\"type\": \"string\"},\n", + " \"date_period\": {\"type\": \"string\"},\n", + " \"resolution\": {\"type\": \"string\"},\n", + " 
\"current_status\": {\"type\": \"string\"}\n", + " }\n", + " },\n", + " \"description\": \"Past challenges, issues, or controversies\"\n", + " },\n", + " \"sources\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"url\": {\"type\": \"string\"},\n", + " \"title\": {\"type\": \"string\"},\n", + " \"date_accessed\": {\"type\": \"string\"},\n", + " \"information_type\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\"type\": \"string\"},\n", + " \"description\": \"Types of information sourced from this link (e.g., leadership, products, news)\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"company_summary\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Concise, dense summary of the most important company information (max 250 words)\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"company_name\",\n", + " \"verified_company\",\n", + " \"company_summary\",\n", + " \"key_executives\",\n", + " \"main_products\",\n", + " \"sources\"\n", + " ]\n", + "}\n", + "\n", + "# Create proper InputState instance\n", + "input_state = InputState(\n", + " company=\"LangChain\",\n", + " extraction_schema=HARD_EXTRACTION_SCHEMA,\n", + ")\n", + "\n", + "# Invoke with the proper input state\n", + "result = await graph.ainvoke(input_state)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "company_maistro_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}