# app.py
import json
import requests
import streamlit as st
import nltk
from typing import Type
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.prompts import MessagesPlaceholder
from langchain.memory import ConversationSummaryBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import SystemMessage
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
# Secrets
browserless_api_key = st.secrets["BROWSERLESS_API_KEY"]
serper_api_key = st.secrets["SERP_API_KEY"]
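# The two keys above are read from Streamlit's secrets store. A minimal
# .streamlit/secrets.toml might look like this (placeholder values):
#
#   BROWSERLESS_API_KEY = "your-browserless-key"
#   SERP_API_KEY = "your-serper-key"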
# Download the Punkt tokenizer models that NLTK's word_tokenize relies on.
nltk.download('punkt')
# Download the list of stopwords from NLTK, used to filter out common words.
nltk.download('stopwords')
# Search Tool
def search(query):
url = "https://google.serper.dev/search"
payload = json.dumps({"q": query})
headers = {"X-API-KEY": serper_api_key, "Content-Type": "application/json"}
response = requests.request("POST", url, headers=headers, data=payload)
return response.text
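# Illustrative shape of the Serper response consumed later in this app; the code
# below relies on the "organic" list and its "title"/"link"/"snippet" fields:
#
#   {"organic": [{"title": "...", "link": "https://...", "snippet": "...", "position": 1}, ...]}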
# Scraping Tool
def scrape_website(objective, url):
if not url:
print("URL is empty, skipping scrape.")
return None
print("Scraping website...")
headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json",
}
data = {"url": url}
data_json = json.dumps(data)
post_url = f"https://chrome.browserless.io/content?token={browserless_api_key}&stealth=true"
response = requests.post(post_url, headers=headers, data=data_json)
# Check the response status code
if response.status_code == 200:
soup = BeautifulSoup(response.content, "html.parser")
text = soup.get_text()
print("Found Content")
if len(text) > 10000:
print ("Summarizing Content")
output = summary(objective, text)
return output
else:
return text
else:
print(f"HTTP request failed with status code {response.status_code}")
print("Response text:", response.text)
return None
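# Illustrative usage (hypothetical URL): returns the page text, an objective-focused
# summary when the text exceeds 10,000 characters, or None if the request fails.
#
#   text = scrape_website("Interview Questions Research", "https://example.com/interview-experience")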
# Summary Tool
def summary(objective, content):
# Initialize the ChatOpenAI model with specific parameters
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
# Create an instance of RecursiveCharacterTextSplitter, which breaks the content
# into smaller chunks suitable for processing, based on the provided separators
text_splitter = RecursiveCharacterTextSplitter(
separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500
)
# Split the content into smaller documents using the text_splitter
docs = text_splitter.create_documents([content])
# Define the prompt template for summarization, where the model is asked
# to summarize the provided text for a specific objective
map_prompt = """
Write a summary of the following text for {objective}:
"{text}"
SUMMARY:
"""
map_prompt_template = PromptTemplate(
template=map_prompt, input_variables=["text", "objective"]
)
# Load the summarization chain with the defined prompt and the ChatOpenAI model
summary_chain = load_summarize_chain(
llm=llm,
chain_type="map_reduce",
map_prompt=map_prompt_template,
combine_prompt=map_prompt_template,
verbose=True,
)
# Run the summary chain to generate the summary of the content based on the objective
output = summary_chain.run(input_documents=docs, objective=objective)
return output
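# Rough sizing note: with chunk_size=10000 and chunk_overlap=500, a ~25,000-character
# page becomes roughly three overlapping chunks; each chunk is summarized with the map
# prompt and the partial summaries are combined using the same prompt (map_reduce).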
# Define a data model for the input of the scrape_website function
class ScrapeWebsiteInput(BaseModel):
"""Inputs for scrape_website"""
# Field to specify the objective or task for the scraping
objective: str = Field(
description="The objective & task that users give to the agent"
)
# Field to specify the URL of the website to be scraped
url: str = Field(description="The url of the website to be scraped")
# Define a class to handle website scraping operations
class ScrapeWebsiteTool(BaseTool):
# Name of the tool
name = "scrape_website"
# A brief description of the tool
description = ("useful when you need to get data from a website url, "
"passing both url and objective to the function; DO NOT "
"make up any url, the url should only be from the search results")
# Define the expected input schema for the tool
args_schema: Type[BaseModel] = ScrapeWebsiteInput
# Define the main function that runs the tool
def _run(self, objective: str, url: str):
return scrape_website(objective, url)
    # Async execution path; not implemented for this tool
    def _arun(self, objective: str, url: str):
        raise NotImplementedError("scrape_website does not support async execution")
# Create the LangChain agent with the tools defined above
tools = [
Tool(
name="Search",
func=search,
description="useful for when you need to answer questions about current events, data. You should ask targeted questions",
),
ScrapeWebsiteTool(),
]
system_message = SystemMessage(
    content="""You are a world-class researcher who can do detailed research on any topic and produce fact-based results;
you do not make things up, and you will try as hard as possible to gather facts & data to back up the research.
Please make sure you complete the objective above with the following rules:
1/ You should do enough research to gather as much information as possible about the objective
2/ If there are urls of relevant links & articles, you will scrape them to gather more information
3/ After scraping & searching, you should think "is there anything new I should search & scrape based on the data I collected to increase research quality?" If the answer is yes, continue; but don't do this more than 3 iterations
4/ You should not make things up; you should only write facts & data that you have gathered
5/ In the final output, you should include all reference data & links to back up your research"""
)
# Define additional arguments for initializing the ChatGPT agent
agent_kwargs = {
# Include extra prompt messages, specifically a placeholder for 'memory' which stores past interactions
"extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
# Define a system message that may be used to set the behavior or context for the agent
"system_message": system_message,
}
# Initialize the ChatOpenAI (language model) instance with specific parameters
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
# Set up a buffer memory to store a summary of the conversation, which helps in retaining context across interactions
memory = ConversationSummaryBufferMemory(
memory_key="memory", # Key to identify the memory buffer
return_messages=True, # Indicate whether to return past messages
llm=llm, # Associate the language model instance
max_token_limit=1000 # Set a maximum token limit for the stored messages
)
# Initialize the agent using specified tools, language model, and other configurations
agent = initialize_agent(
tools, # List of tools or functions available for the agent to use
llm, # Language model instance
agent=AgentType.OPENAI_FUNCTIONS, # Type of agent being initialized
verbose=True, # Set to True to enable detailed logging
agent_kwargs=agent_kwargs, # Additional agent-specific arguments
memory=memory, # The buffer memory instance
)
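# Illustrative direct invocation of the agent (the app itself drives it through
# generate_interview_questions_from_content below):
#
#   agent.run("Research common system design interview topics for a backend engineering role")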
# Extract relevant URLs from search results
def extract_relevant_urls_from_search(search_results, company_name, role):
relevant_urls = []
for result in search_results.get('organic', []):
url = result.get('link', '')
        content = result.get('snippet', '').lower()  # Serper returns the result text under 'snippet'
title = result.get('title', '').lower()
if company_name.lower() in content or role.lower() in content or company_name.lower() in title or role.lower() in title:
relevant_urls.append(url)
return relevant_urls
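# Illustrative behaviour: for company_name="Acme" and role="Data Engineer", a result is
# kept only if its title or snippet mentions "acme" or "data engineer" (case-insensitive).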
def generate_interview_questions_from_content(role, content):
# Define a prompt for the agent to generate questions based on the content
prompt = f"Based on the following content related to {role}, generate potential interview questions:\n\n{content}\n\nQuestions:"
# Use the agent to generate questions
response = agent.run(prompt)
# Extract questions from the agent's response
questions = [q.strip() for q in response.split('\n') if q.strip()]
return questions
def extract_keywords_from_description(description):
# Tokenize the job description
words = word_tokenize(description)
# Filter out stopwords and non-alphabetic words
stop_words = set(stopwords.words('english'))
keywords = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
return keywords
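# Illustrative example: "Design scalable APIs in Python" ->
# ["design", "scalable", "apis", "python"] ("in" is dropped as a stopword).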
def filter_irrelevant_questions(questions):
# We can add more conditions based on feedback to improve filtering.
return [q for q in questions if len(q.split()) > 6 and not q.startswith("Can you describe")]
def is_relevant_question(question, job_description_keywords):
# List of irrelevant question starters
irrelevant_starts = ["Can you share", "Were there", "How would you approach answering", "Can you explain how"]
# Check if the question starts with any of the irrelevant starts
if any(question.startswith(start) for start in irrelevant_starts):
return False
# Check if the question contains any keyword from the job description
if any(keyword in question.lower() for keyword in job_description_keywords):
return True
    # Otherwise the question does not match any job-description keyword
    return False
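# Illustrative behaviour: "How would you approach answering behavioural questions?" is
# rejected by the starter check, while "How do you design scalable APIs?" is kept only
# if a job-description keyword such as "scalable" or "apis" matches.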
def research_interview_questions(company_name, role, job_description):
# Feedback to the user
print("Searching for interview experiences online...")
query = f"interview questions for {role} at {company_name}"
search_results = json.loads(search(query))
# Feedback to the user
print("Extracting relevant URLs from the search results...")
relevant_urls = extract_relevant_urls_from_search(search_results, company_name, role)
all_questions = []
loop_limit = 3
loop_count = 0
for url in relevant_urls:
if loop_count >= loop_limit:
print("Reached the maximum number of scraping iterations. Moving on...")
break
# Feedback to the user
print(f"Scraping content from {url}...")
content = scrape_website("Interview Questions Research", url)
if content:
# Feedback to the user
print("Generating interview questions based on the scraped content...")
questions = generate_interview_questions_from_content(role, content)
all_questions.extend(questions)
loop_count += 1
# Extract keywords from job description
job_description_keywords = extract_keywords_from_description(job_description)
# Filter out irrelevant questions
relevant_questions = [q for q in all_questions if is_relevant_question(q, job_description_keywords)]
if not relevant_questions:
return ["I couldn't find specific interview questions for this role."]
return relevant_questions
def clean_question_numbering(question):
    # Drop a leading numbering token such as "1." and rejoin the remaining words
    words = question.split()
    return ' '.join(words[1:]) if words and words[0].endswith('.') else question
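# Illustrative example: "1. What is your approach to testing?" ->
# "What is your approach to testing?"; questions without a leading "N." pass through unchanged.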
def is_experience_question(question):
# Keywords that typically indicate past experience questions
experience_keywords = ["did you", "have you", "were you", "can you describe"]
return any(keyword in question.lower() for keyword in experience_keywords)
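# Illustrative example: "Have you led a migration to microservices?" counts as an
# experience question (and is filtered out in main), while
# "How would you migrate a monolith to microservices?" does not.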
# Streamlit web app
def main():
st.set_page_config(page_title="Interview Prep Bot", page_icon=":briefcase:")
st.header("Interview Prep Bot :briefcase:")
company_name = st.text_input("Company Name", "Capital One")
role = st.text_input("Role", "Lead Software Engineer")
job_description = st.text_area("Paste the Job Description Here", "Enter the detailed job description provided by the company.")
if st.button("Research Interview Questions"):
st.info(f"Researching interview experiences for the role of {role} at {company_name}...")
interview_questions = research_interview_questions(company_name, role, job_description)
# Clean the questions
cleaned_questions = [clean_question_numbering(question) for question in interview_questions]
# Filter out experience-related questions
filtered_questions = [q for q in cleaned_questions if not is_experience_question(q)]
# Display relevant questions
st.header("Potential Interview Questions:")
for idx, question in enumerate(filtered_questions, 1):
st.write(f"{idx}. {question}")
if __name__ == '__main__':
main()
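# To try the app locally (assuming Streamlit and the other dependencies are installed):
#
#   streamlit run app.py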