Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create run_storm_wiki_gpt_with_DuckDuckGoRM.py #292

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 258 additions & 0 deletions examples/storm_examples/run_storm_wiki_gpt_with_DuckDuckGoRM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
"""
This STORM Wiki pipeline powered by GPT-4o-mini and GPT-4l and DuckDuckGo retrieval mode.

Installation instructions:

git clone https://github.com/stanford-oval/storm.git
cd storm
python -m venv .
.\Scripts\activate
pip install -r requirements.txt
pip install duckduckgo_search

**Since I wanted to latest updates I intentionally did NOT pip install knowledge_storm. Thus, this script is geared towards that
installation procedure only.

Create secrets.toml in the installation directory with the following:
OPENAI_API_KEY="[INSERT KEY HERE]"
OPENAI_API_TYPE="openai"

Put this script in the directory and run it:
python run_storm_wiki_gpt_with_DuckDuckGo_RM.py

The command prompt will ask you for a search topic and will eventually run to completion.

Output will be structured as follows:
args.output_dir/
├── topic_name/
├── conversation_log.json # Log of information-seeking conversation
├── raw_search_results.json # Raw search results from search engine
├── direct_gen_outline.txt # Outline directly generated with LLM's parametric knowledge
├── storm_gen_outline.txt # Outline refined with collected information
├── url_to_info.json # Sources that are used in the final article
├── storm_gen_article.txt # Final article generated
└── storm_gen_article_polished.txt # Polished final article (if args.do_polish_article is True)

PLEASE NOTE:

(1) This script DOES NOT accept command line argument flags. Rather, the "do_research," "do_generate_outline," "do_generate_article,"
and "do_polish_article" are hardcoded into the script. I found that this more easily allows users to, for example, pull updated
settings from a config.yaml file or pass them from a graphical user interface.

(2) DuckDuckGo has extreme rate limiting. This script deals with that through various strategies while still complying with robots.txt.
Just be aware, it'll take approximately 20 minutes to finish due to all of the backoffs and retries, but it will in-fact complete.

(3) In my test, researching "floating point numbers" cost:

+------------------------+--------+--------+
| Model | Type | Cost |
+------------------------+--------+--------+
| gpt-4o-mini-2024-07-18 | input | $0.01 |
| gpt-4o-mini-2024-07-18 | output | $0.02 |
| chatgpt-4o-latest | input | $0.16 |
| chatgpt-4o-latest | output | $0.13 |
+------------------------+--------+--------+
"""

import os
from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
from knowledge_storm.rm import DuckDuckGoSearchRM, backoff_hdlr, giveup_hdlr
from knowledge_storm.lm import OpenAIModel
from knowledge_storm.utils import load_api_key
from duckduckgo_search.exceptions import DuckDuckGoSearchException
import random
import time
import backoff
from typing import Dict, List

# This class inherits from knowledge_storm's and builds upon it. This is absolutely necessary due to DuckDuckGo's extreme rate limiting.
class EnhancedDuckDuckGoSearchRM(DuckDuckGoSearchRM):
# NOTE, these are windows-specific.
WINDOWS_PLATFORMS = [
'Windows NT 10.0; Win64; x64',
'Windows NT 10.0; WOW64',
'Windows NT 10.0',
'Windows NT 6.3; Win64; x64',
'Windows NT 6.2; Win64; x64',
'Windows NT 6.1; Win64; x64'
]

MODERN_BROWSERS = [
'Chrome/120.0.0.0',
'Chrome/119.0.0.0',
'Chrome/118.0.0.0',
'Edge/120.0.0.0',
'Edge/119.0.0.0',
'Firefox/120.0',
'Firefox/119.0'
]

def __init__(
self,
k: int = 3,
safe_search: str = "On",
region: str = "us-en",
min_char_count: int = 150,
snippet_chunk_size: int = 1000,
webpage_helper_max_threads: int = 10,
request_delay: float = 0.2,
max_retries: int = 8,
max_time: int = 1000,
backend: str = "auto"
):
super().__init__(
k=k,
safe_search=safe_search,
region=region,
min_char_count=min_char_count,
snippet_chunk_size=snippet_chunk_size,
webpage_helper_max_threads=webpage_helper_max_threads
)
self.request_delay = request_delay
self.current_delay = request_delay
self.max_retries = max_retries
self.max_time = max_time
self.request_count = 0
self.duck_duck_go_backend = backend

def get_random_headers(self) -> Dict[str, str]:
"""Generate random headers to mimic different browsers"""
platform = random.choice(self.WINDOWS_PLATFORMS)
browser = random.choice(self.MODERN_BROWSERS)
user_agent = f'Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}'

return {
'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1'
}

def custom_backoff_handler(self, details):
pass

def custom_giveup_handler(self, exc):
return False

def get_backoff_decorator(self):
return backoff.on_exception(
backoff.expo,
Exception,
max_time=self.max_time,
max_tries=self.max_retries,
on_backoff=self.custom_backoff_handler,
giveup=self.custom_giveup_handler
)

@property
def request(self):
@self.get_backoff_decorator()
def _request(query: str):
try:
self.request_count += 1
random_delay = random.uniform(0.1, 5.0)
time.sleep(random_delay)

self.ddgs.headers = self.get_random_headers()

results = self.ddgs.text(
query,
max_results=self.k,
backend=self.duck_duck_go_backend,
region=self.duck_duck_go_region,
safesearch=self.duck_duck_go_safe_search
)

self.current_delay = self.request_delay

return results

except DuckDuckGoSearchException as e:
if "202 Ratelimit" in str(e):
self.current_delay *= 2
time.sleep(self.current_delay)

raise

return _request

def forward(self, query_or_queries, exclude_urls=[]):
return super().forward(query_or_queries, exclude_urls)

def main():
load_api_key(toml_file_path='secrets.toml')

lm_configs = STORMWikiLMConfigs()
openai_kwargs = {
'api_key': os.getenv("OPENAI_API_KEY"),
# feel free to change these two
'temperature': 1.0,
'top_p': 0.9,
}

gpt_35 = OpenAIModel(model='gpt-4o-mini', max_tokens=500, **openai_kwargs)
gpt_4 = OpenAIModel(model='chatgpt-4o-latest', max_tokens=3000, **openai_kwargs)

lm_configs.set_conv_simulator_lm(gpt_35)
lm_configs.set_question_asker_lm(gpt_35)
lm_configs.set_outline_gen_lm(gpt_4)
lm_configs.set_article_gen_lm(gpt_4)
lm_configs.set_article_polish_lm(gpt_4)

engine_args = STORMWikiRunnerArguments(
output_dir='./results',
max_conv_turn=3,
max_perspective=3,
search_top_k=3,
max_thread_num=3
)

# parameters set here will supersede the default values, which is sometime necessary with DuckDuckGo
rm = EnhancedDuckDuckGoSearchRM(
k=engine_args.search_top_k,
# don't change these two parameters
safe_search='On',
backend='auto', # "api" has been deprecated in the duckduckgo_search library
# but feel free to experiment with the rest
region='us-en',
min_char_count=100,
snippet_chunk_size=800,
webpage_helper_max_threads=1,
request_delay=2.0,
max_retries=12,
max_time=2000
)

runner = STORMWikiRunner(engine_args, lm_configs, rm)

topic = input('Topic: ')
try:
runner.run(
topic=topic,
do_research=True,
do_generate_outline=True,
do_generate_article=True,
do_polish_article=True,
remove_duplicate=True
)
runner.post_run()
runner.summary()
except Exception as e:
print("\nFull exception details:")
print("Type:", type(e))
print("String:", str(e))
print("Repr:", repr(e))
print("Dir:", dir(e))
if hasattr(e, 'args'):
print("Args:", e.args)
raise

if __name__ == "__main__":
main()