From f0ae7f3c3ee46b5cabd663acdb3a4bb754fd06b7 Mon Sep 17 00:00:00 2001 From: robin Date: Mon, 10 Feb 2025 18:43:44 -0500 Subject: [PATCH] Update linkedin logic --- aa.html | 5 + .../reports/founder_and_team.py | 154 ++++++++---------- .../cf_analysis_agent/reports/valuation.py | 3 +- .../cf_analysis_agent/utils/env_variables.py | 3 +- .../cf_analysis_agent/utils/linkedin_utls.py | 124 ++++++++++++++ .../cf_analysis_agent/utils/llm_utils.py | 8 +- ai-agents/crowd-fund-analysis/pyproject.toml | 12 +- .../crowd-fund-analysis/terraform/main.tf | 2 + 8 files changed, 216 insertions(+), 95 deletions(-) create mode 100644 aa.html create mode 100644 ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/linkedin_utls.py diff --git a/aa.html b/aa.html new file mode 100644 index 000000000..16800439e --- /dev/null +++ b/aa.html @@ -0,0 +1,5 @@ +COMPOUND Academy Site
\ No newline at end of file diff --git a/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/founder_and_team.py b/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/founder_and_team.py index 189660a2c..c60767343 100644 --- a/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/founder_and_team.py +++ b/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/founder_and_team.py @@ -1,38 +1,25 @@ import json import traceback -from typing import List, Dict, Any +from typing import List from dotenv import load_dotenv from langchain_community.utilities import GoogleSerperAPIWrapper from langchain_core.messages import HumanMessage -from linkedin_scraper import Person, actions -from selenium import webdriver -from selenium.webdriver.chrome.options import Options from typing_extensions import TypedDict from cf_analysis_agent.agent_state import AgentState, Config, ReportType from cf_analysis_agent.structures.report_structures import StartupAndTeamInfoStructure -from cf_analysis_agent.utils.env_variables import LINKEDIN_EMAIL, LINKEDIN_PASSWORD +from cf_analysis_agent.utils.linkedin_utls import scrape_linkedin_with_proxycurl, TeamMemberLinkedinUrl, \ + RawLinkedinProfile from cf_analysis_agent.utils.llm_utils import get_llm, structured_report_response from cf_analysis_agent.utils.prompt_utils import create_prompt_for_checklist -from cf_analysis_agent.utils.report_utils import create_report_file_and_upload_to_s3, update_report_status_failed, \ +from cf_analysis_agent.utils.report_utils import update_report_status_failed, \ update_report_status_in_progress, update_report_with_structured_output load_dotenv() search = GoogleSerperAPIWrapper() -class TeamMemberLinkedinUrl(TypedDict): - id: str - name: str - url: str - - -class RawLinkedinProfile(TypedDict): - id: str - name: str - profile: Dict[str, Any] - class AnalyzedTeamProfile(TypedDict): id: str @@ -50,18 +37,71 @@ class AnalyzedTeamProfile(TypedDict): def find_startup_info(config: Config, page_content: str) -> StartupAndTeamInfoStructure: prompt = ( - "From the scraped content, extract the following project info as JSON:\n\n" - " - startup_name: str (The name of the project or startup being discussed)\n" - " - startup_details: str (A single sentence explaining what the startup does)\n" - " - industry: str (A brief overview of the industry, including how it has grown in the last 3-5 years, its expected growth in the next 3-5 years, challenges, and unique benefits for startups in this space)\n" - " - team_members: list of objects {id: str (Unique ID for each team member, formatted as firstname_lastname), name: str (The name of the team member), title: str (The position of the team member in the startup), info: str (Details or additional information about the team member as mentioned on the startup page)}\n\n" - "Return ONLY a raw JSON object. Do not include any code fences or additional text." 
+ """From the scraped content, extract the following project info as JSON: + + - startup_name: str (The name of the project or startup being discussed) + - startup_details: str (A single sentence explaining what the startup does) + - industry: str (A brief overview of the industry, including how it has grown in the last 3-5 years, its expected growth in the next 3-5 years, challenges, and unique benefits for startups in this space) + - team_members: list of objects {id: str (Unique ID for each team member, formatted as firstname_lastname), name: str (The name of the team member), title: str (The position of the team member in the startup), info: str (Details or additional information about the team member as mentioned on the startup page)} + + Return the extracted information as a JSON object. + + { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "StartupAndTeamInfoStructure", + "description": "Information about the startup, industry, and team", + "type": "object", + "properties": { + "startup_name": { + "type": "string", + "description": "The name of the project or startup being discussed" + }, + "startup_details": { + "type": "string", + "description": "A single sentence explaining what the startup does" + }, + "industry": { + "type": "string", + "description": "A brief overview of the industry, including how it has grown in the last 3-5 years, its expected growth in the next 3-5 years, challenges, and unique benefits for startups in this space" + }, + "team_members": { + "type": "array", + "description": "A list of team members with their details", + "items": { + "type": "object", + "title": "TeamMemberStructure", + "description": "Information about the team members", + "properties": { + "id": { + "type": "string", + "description": "Unique ID for each team member, formatted as firstname_lastname" + }, + "name": { + "type": "string", + "description": "The name of the team member" + }, + "title": { + "type": "string", + "description": "The position of the team member in the startup" + }, + "info": { + "type": "string", + "description": "Details or additional information about the team member as mentioned on the startup page" + } + }, + "required": ["id", "name", "title", "info"] + } + } + }, + "required": ["startup_name", "startup_details", "industry", "team_members"] + } - f"Startup Info:\n{page_content}" + """ + f"Startup Info: \n{page_content}" ) print("Fetching Team Info") structured_llm = get_llm(config).with_structured_output(StartupAndTeamInfoStructure) response = structured_llm.invoke([HumanMessage(content=prompt)]) + print("Team Info Fetched", response.model_dump_json(indent=4)) return response @@ -99,59 +139,9 @@ def find_linkedin_urls(startup_info: StartupAndTeamInfoStructure): return linkedin_urls -def scrape_linkedin_profiles(linkedin_urls: list): - """ - Uses linkedin_scraper to retrieve LinkedIn profile data based on the URLs - - If a team member has no LinkedIn URL (empty string), their profile will be empty. 
- """ - - # Setup Selenium WebDriver - chrome_options = Options() - chrome_options.add_argument("--headless") # Run in headless mode - chrome_options.add_argument("--disable-gpu") # Disable GPU for better compatibility - chrome_options.add_argument("--window-size=1920,1080") # Optional: Set window size - chrome_options.add_argument("--disable-dev-shm-usage") # Overcome limited resources in containerized environments - chrome_options.add_argument("--no-sandbox") # Bypass OS security model (use with caution) - - driver = webdriver.Chrome(options=chrome_options) - actions.login(driver, LINKEDIN_EMAIL, LINKEDIN_PASSWORD) - - def scrape_linkedin_profile(url: str) -> Dict[str, Any]: - if not url: - return {} - try: - # Scrape the LinkedIn profile using linkedin_scraper - # Login to LinkedIn - person = Person(url, driver=driver, scrape=False) - person.scrape(close_on_complete=False) - - # Collect profile details - return { - "name": person.name, - "experiences": person.experiences, - "educations": person.educations, - } - except Exception as e: - print(traceback.format_exc()) - return {} - - # Iterate through the LinkedIn URLs and scrape profiles - raw_profiles: List[RawLinkedinProfile] = [] - for member in linkedin_urls: - profile_data = scrape_linkedin_profile(member["url"]) - raw_profiles.append({ - "id": member.id, - "name": member.name, - "profile": profile_data - }) - # Close the driver - driver.quit() - - return raw_profiles -def evaluate_profiles(config: Config, raw_profiles: list, startup_info: StartupAndTeamInfoStructure): +def evaluate_profiles(config: Config, raw_profiles: list[RawLinkedinProfile], startup_info: StartupAndTeamInfoStructure): analyzed_profiles = [] team_members = startup_info.team_members for member_data in raw_profiles: @@ -161,14 +151,14 @@ def evaluate_profiles(config: Config, raw_profiles: list, startup_info: StartupA team_member_info = {} for tm in team_members: - if tm["id"] == member_id: + if tm.id == member_id: team_member_info = tm break - id_ = team_member_info.get("id", "") - name_ = team_member_info.get("name", "") - title_ = team_member_info.get("title", "") - info_ = team_member_info.get("info", "") + id_ = team_member_info.id + name_ = team_member_info.name + title_ = team_member_info.title + info_ = team_member_info.info filtered_profile = { "firstName": member_profile.get("firstName", ""), @@ -255,7 +245,7 @@ def evaluate_profiles(config: Config, raw_profiles: list, startup_info: StartupA .""" return structured_report_response( config, - "detailed_execution_speed_report", + "evaluate_profiles", table_prompt ) @@ -270,7 +260,7 @@ def create_founder_and_team_report(state: AgentState) -> None: update_report_status_in_progress(project_id, ReportType.FOUNDER_AND_TEAM) startup_info: StartupAndTeamInfoStructure = find_startup_info(state.get("config"), combined_text) linkedin_urls = find_linkedin_urls(startup_info) - raw_profiles = scrape_linkedin_profiles(linkedin_urls) + raw_profiles = scrape_linkedin_with_proxycurl(linkedin_urls) team_info_report = evaluate_profiles(state.get("config"), raw_profiles, startup_info) update_report_with_structured_output(project_id, ReportType.FOUNDER_AND_TEAM, team_info_report) except Exception as e: diff --git a/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py b/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py index 4c547477c..4d6b71f96 100644 --- a/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py +++ 
diff --git a/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py b/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py
index 4c547477c..4d6b71f96 100644
--- a/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py
+++ b/ai-agents/crowd-fund-analysis/cf_analysis_agent/reports/valuation.py
@@ -3,8 +3,7 @@
 from cf_analysis_agent.agent_state import AgentState, get_combined_content, ReportType
 from cf_analysis_agent.structures.report_structures import StructuredReportResponse
 from cf_analysis_agent.utils.llm_utils import structured_report_response
-from cf_analysis_agent.utils.prompt_utils import create_prompt_for_checklist
-from cf_analysis_agent.utils.report_utils import create_report_file_and_upload_to_s3, update_report_status_failed, \
+from cf_analysis_agent.utils.report_utils import update_report_status_failed, \
     update_report_status_in_progress, update_report_with_structured_output
 
diff --git a/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/env_variables.py b/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/env_variables.py
index 470d519f1..2c97fabcc 100644
--- a/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/env_variables.py
+++ b/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/env_variables.py
@@ -11,4 +11,5 @@
 SERPER_API_KEY = os.getenv("SERPER_API_KEY")
 LINKEDIN_EMAIL = os.getenv("LINKEDIN_EMAIL")
 LINKEDIN_PASSWORD = os.getenv("LINKEDIN_PASSWORD")
-ADMIN_CODES = set(code.strip() for code in os.getenv("ADMIN_CODES", "").split(","))
\ No newline at end of file
+ADMIN_CODES = set(code.strip() for code in os.getenv("ADMIN_CODES", "").split(","))
+PROXYCURL_API_KEY = os.getenv("PROXYCURL_API_KEY")
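# PROXYCURL_API_KEY follows the existing pattern in this module: read once at import
# time via os.getenv, so it is None rather than a hard error when unset. A minimal
# local-run sketch (the key value is a placeholder, not part of this patch):
import os

os.environ.setdefault("PROXYCURL_API_KEY", "<your-proxycurl-api-key>")

from cf_analysis_agent.utils.env_variables import PROXYCURL_API_KEY

assert PROXYCURL_API_KEY, "expected the key to be visible after import"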
diff --git a/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/linkedin_utls.py b/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/linkedin_utls.py
new file mode 100644
index 000000000..e3d0441d8
--- /dev/null
+++ b/ai-agents/crowd-fund-analysis/cf_analysis_agent/utils/linkedin_utls.py
@@ -0,0 +1,124 @@
+import traceback
+from typing import List, Dict, Any
+
+import requests
+from linkedin_scraper import Person, actions
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from typing_extensions import TypedDict
+
+from cf_analysis_agent.utils.env_variables import LINKEDIN_EMAIL, LINKEDIN_PASSWORD, PROXYCURL_API_KEY
+
+
+class TeamMemberLinkedinUrl(TypedDict):
+    id: str
+    name: str
+    url: str
+
+
+class RawLinkedinProfile(TypedDict):
+    id: str
+    name: str
+    profile: Dict[str, Any]
+
+
+
+def scrape_linkedin_with_proxycurl(members: list[TeamMemberLinkedinUrl]) -> list[RawLinkedinProfile]:
+    raw_profiles: List[RawLinkedinProfile] = []
+    for member in members:
+        # Set the authorization header using your API key
+        headers = {'Authorization': f'Bearer {PROXYCURL_API_KEY}'}
+
+        # API endpoint for retrieving a LinkedIn person profile
+        api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
+        linkedin_url = member.get('url')
+
+        # Set up the parameters. Note that you should include only one of:
+        # 'linkedin_profile_url', 'twitter_profile_url', or 'facebook_profile_url'
+        params = {
+            'linkedin_profile_url': linkedin_url,
+            'extra': 'include',
+            'github_profile_id': 'include',
+            'facebook_profile_id': 'include',
+            'twitter_profile_id': 'include',
+            'personal_contact_number': 'include',
+            'personal_email': 'include',
+            'inferred_salary': 'include',
+            'skills': 'include',
+            'use_cache': 'if-present',
+            'fallback_to_cache': 'on-error',
+        }
+
+        # Make the GET request to the API
+        response = requests.get(api_endpoint, headers=headers, params=params)
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            profile = response.json()
+            print(f"Downloaded profile from url: {linkedin_url} : {profile}")
+            # see - https://nubela.co/proxycurl/docs#people-api-person-profile-endpoint
+            raw_profiles.append({
+                "id": member.get('id'),
+                "name": member.get('name'),
+                "profile": profile
+            })
+
+        else:
+            # Print error details and raise an exception if needed
+            print(f"Error fetching profile: {linkedin_url}: ", response.status_code, response.text)
+            # response.raise_for_status()
+
+
+    return raw_profiles
+
+
+def scrape_linkedin_with_linkedin_scraper(linkedin_urls: list[TeamMemberLinkedinUrl]) -> list[RawLinkedinProfile]:
+    """
+    Uses linkedin_scraper to retrieve LinkedIn profile data based on the URLs
+
+    If a team member has no LinkedIn URL (empty string), their profile will be empty.
+    """
+
+    # Setup Selenium WebDriver
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode
+    chrome_options.add_argument("--disable-gpu")  # Disable GPU for better compatibility
+    chrome_options.add_argument("--window-size=1920,1080")  # Optional: Set window size
+    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resources in containerized environments
+    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model (use with caution)
+
+    driver = webdriver.Chrome(options=chrome_options)
+    actions.login(driver, LINKEDIN_EMAIL, LINKEDIN_PASSWORD)
+
+    def scrape_linkedin_profile(url: str) -> Dict[str, Any]:
+        if not url:
+            return {}
+        try:
+            # Scrape the LinkedIn profile using linkedin_scraper
+            # (the driver is already logged in above)
+            person = Person(url, driver=driver, scrape=False)
+            person.scrape(close_on_complete=False)
+
+            # Collect profile details
+            return {
+                "name": person.name,
+                "experiences": person.experiences,
+                "educations": person.educations,
+            }
+        except Exception as e:
+            print(traceback.format_exc())
+            return {}
+
+    # Iterate through the LinkedIn URLs and scrape profiles
+    raw_profiles: List[RawLinkedinProfile] = []
+    for member in linkedin_urls:
+        profile_data = scrape_linkedin_profile(member["url"])
+        raw_profiles.append({
+            "id": member.get('id'),
+            "name": member.get('name'),
+            "profile": profile_data
+        })
+    # Close the driver
+    driver.quit()
+
+    return raw_profiles
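# A minimal sketch of exercising the new Proxycurl path on its own, assuming
# PROXYCURL_API_KEY is set; the member entry mirrors TeamMemberLinkedinUrl and the
# profile URL is a placeholder. Failed requests are only logged (raise_for_status
# is commented out above), so the returned list can be shorter than the input.
from cf_analysis_agent.utils.linkedin_utls import scrape_linkedin_with_proxycurl

members = [
    {"id": "jane_doe", "name": "Jane Doe", "url": "https://www.linkedin.com/in/janedoe/"},
]
for item in scrape_linkedin_with_proxycurl(members):
    print(item["id"], sorted(item["profile"].keys()))  # top-level Proxycurl fields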
"o3-mini"}} NORMAL_4_0_CONFIG: Config = {"configurable": {"model": "gpt-4o"}} DEEP_SEEK_R1_CONFIG: Config = {"configurable": {"model": "deepseek-r1-distill-llama-70b"}} @@ -30,11 +30,11 @@ def get_llm(config: Config) -> BaseChatModel: if model in _llm_cache: return _llm_cache[model] else: - if model == "gpt-4o-mini" or model == "gpt-o3-mini" or model == "gpt-4o": - _llm_cache[model] = ChatOpenAI(model_name=model, temperature=0, max_tokens=4000) + if model == "gpt-4o-mini" or model == "o3-mini" or model == "gpt-4o": + _llm_cache[model] = ChatOpenAI(model=model, temperature=0, max_tokens=4000) return _llm_cache[model] elif model == "deepseek-r1-distill-llama-70b": - _llm_cache[model] = ChatGroq(temperature=0, model_name=model, max_tokens=4000) + _llm_cache[model] = ChatGroq(temperature=0, model=model, max_tokens=4000) return _llm_cache[model] else: raise Exception(f"Model {model} not supported") diff --git a/ai-agents/crowd-fund-analysis/pyproject.toml b/ai-agents/crowd-fund-analysis/pyproject.toml index d8c05ebe7..f727d07c7 100644 --- a/ai-agents/crowd-fund-analysis/pyproject.toml +++ b/ai-agents/crowd-fund-analysis/pyproject.toml @@ -7,16 +7,16 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.10" -langgraph = "^0.2.56" -langchain = "^0.3.10" -langchain-core = "^0.3.23" -langchain-community = "^0.3.10" -langchain-openai = "^0.2.12" +langgraph = "0.2.70" +langchain = "0.3.18" +langchain-core = "0.3.34" +langchain-community = "0.3.17" +langchain-openai = "0.3.4" scrapingant-client = "^2.1.0" python-dotenv = "^1.0.1" requests = "^2.32.3" typing-extensions = "^4.12.2" -langchain-google-community = "^2.0.3" +langchain-google-community = "2.0.4" asyncio = "^3.4.3" flask = "^3.1.0" markdown = "^3.7" diff --git a/ai-agents/crowd-fund-analysis/terraform/main.tf b/ai-agents/crowd-fund-analysis/terraform/main.tf index 0f6462634..e864aba64 100644 --- a/ai-agents/crowd-fund-analysis/terraform/main.tf +++ b/ai-agents/crowd-fund-analysis/terraform/main.tf @@ -62,6 +62,7 @@ resource "aws_lightsail_container_service_deployment_version" "cf_deployment" { S3_BUCKET_NAME = var.s3_bucket_name LINKEDIN_EMAIL = var.linkedin_email LINKEDIN_PASSWORD = var.linkedin_password + PROXYCURL_API_KEY = var.proxycurl_api_key } ports = { @@ -125,6 +126,7 @@ variable "aws_default_region" {} variable "s3_bucket_name" {} variable "linkedin_email" {} variable "linkedin_password" {} +variable "proxycurl_api_key" {} output "public_url" { value = aws_lightsail_container_service.cf_service.url