diff --git a/examples/storm_examples/run_storm_wiki_gpt_with_DuckDuckGoRM.py b/examples/storm_examples/run_storm_wiki_gpt_with_DuckDuckGoRM.py new file mode 100644 index 00000000..fe953eac --- /dev/null +++ b/examples/storm_examples/run_storm_wiki_gpt_with_DuckDuckGoRM.py @@ -0,0 +1,258 @@ +""" +This STORM Wiki pipeline powered by GPT-4o-mini and GPT-4l and DuckDuckGo retrieval mode. + +Installation instructions: + +git clone https://github.com/stanford-oval/storm.git +cd storm +python -m venv . +.\Scripts\activate +pip install -r requirements.txt +pip install duckduckgo_search + +**Since I wanted to latest updates I intentionally did NOT pip install knowledge_storm. Thus, this script is geared towards that +installation procedure only. + +Create secrets.toml in the installation directory with the following: +OPENAI_API_KEY="[INSERT KEY HERE]" +OPENAI_API_TYPE="openai" + +Put this script in the directory and run it: +python run_storm_wiki_gpt_with_DuckDuckGo_RM.py + +The command prompt will ask you for a search topic and will eventually run to completion. + +Output will be structured as follows: +args.output_dir/ +├── topic_name/ + ├── conversation_log.json # Log of information-seeking conversation + ├── raw_search_results.json # Raw search results from search engine + ├── direct_gen_outline.txt # Outline directly generated with LLM's parametric knowledge + ├── storm_gen_outline.txt # Outline refined with collected information + ├── url_to_info.json # Sources that are used in the final article + ├── storm_gen_article.txt # Final article generated + └── storm_gen_article_polished.txt # Polished final article (if args.do_polish_article is True) + +PLEASE NOTE: + +(1) This script DOES NOT accept command line argument flags. Rather, the "do_research," "do_generate_outline," "do_generate_article," +and "do_polish_article" are hardcoded into the script. I found that this more easily allows users to, for example, pull updated +settings from a config.yaml file or pass them from a graphical user interface. + +(2) DuckDuckGo has extreme rate limiting. This script deals with that through various strategies while still complying with robots.txt. +Just be aware, it'll take approximately 20 minutes to finish due to all of the backoffs and retries, but it will in-fact complete. + +(3) In my test, researching "floating point numbers" cost: + ++------------------------+--------+--------+ +| Model | Type | Cost | ++------------------------+--------+--------+ +| gpt-4o-mini-2024-07-18 | input | $0.01 | +| gpt-4o-mini-2024-07-18 | output | $0.02 | +| chatgpt-4o-latest | input | $0.16 | +| chatgpt-4o-latest | output | $0.13 | ++------------------------+--------+--------+ +""" + +import os +from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs +from knowledge_storm.rm import DuckDuckGoSearchRM, backoff_hdlr, giveup_hdlr +from knowledge_storm.lm import OpenAIModel +from knowledge_storm.utils import load_api_key +from duckduckgo_search.exceptions import DuckDuckGoSearchException +import random +import time +import backoff +from typing import Dict, List + +# This class inherits from knowledge_storm's and builds upon it. This is absolutely necessary due to DuckDuckGo's extreme rate limiting. +class EnhancedDuckDuckGoSearchRM(DuckDuckGoSearchRM): + # NOTE, these are windows-specific. + WINDOWS_PLATFORMS = [ + 'Windows NT 10.0; Win64; x64', + 'Windows NT 10.0; WOW64', + 'Windows NT 10.0', + 'Windows NT 6.3; Win64; x64', + 'Windows NT 6.2; Win64; x64', + 'Windows NT 6.1; Win64; x64' + ] + + MODERN_BROWSERS = [ + 'Chrome/120.0.0.0', + 'Chrome/119.0.0.0', + 'Chrome/118.0.0.0', + 'Edge/120.0.0.0', + 'Edge/119.0.0.0', + 'Firefox/120.0', + 'Firefox/119.0' + ] + + def __init__( + self, + k: int = 3, + safe_search: str = "On", + region: str = "us-en", + min_char_count: int = 150, + snippet_chunk_size: int = 1000, + webpage_helper_max_threads: int = 10, + request_delay: float = 0.2, + max_retries: int = 8, + max_time: int = 1000, + backend: str = "auto" + ): + super().__init__( + k=k, + safe_search=safe_search, + region=region, + min_char_count=min_char_count, + snippet_chunk_size=snippet_chunk_size, + webpage_helper_max_threads=webpage_helper_max_threads + ) + self.request_delay = request_delay + self.current_delay = request_delay + self.max_retries = max_retries + self.max_time = max_time + self.request_count = 0 + self.duck_duck_go_backend = backend + + def get_random_headers(self) -> Dict[str, str]: + """Generate random headers to mimic different browsers""" + platform = random.choice(self.WINDOWS_PLATFORMS) + browser = random.choice(self.MODERN_BROWSERS) + user_agent = f'Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}' + + return { + 'User-Agent': user_agent, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1' + } + + def custom_backoff_handler(self, details): + pass + + def custom_giveup_handler(self, exc): + return False + + def get_backoff_decorator(self): + return backoff.on_exception( + backoff.expo, + Exception, + max_time=self.max_time, + max_tries=self.max_retries, + on_backoff=self.custom_backoff_handler, + giveup=self.custom_giveup_handler + ) + + @property + def request(self): + @self.get_backoff_decorator() + def _request(query: str): + try: + self.request_count += 1 + random_delay = random.uniform(0.1, 5.0) + time.sleep(random_delay) + + self.ddgs.headers = self.get_random_headers() + + results = self.ddgs.text( + query, + max_results=self.k, + backend=self.duck_duck_go_backend, + region=self.duck_duck_go_region, + safesearch=self.duck_duck_go_safe_search + ) + + self.current_delay = self.request_delay + + return results + + except DuckDuckGoSearchException as e: + if "202 Ratelimit" in str(e): + self.current_delay *= 2 + time.sleep(self.current_delay) + + raise + + return _request + + def forward(self, query_or_queries, exclude_urls=[]): + return super().forward(query_or_queries, exclude_urls) + +def main(): + load_api_key(toml_file_path='secrets.toml') + + lm_configs = STORMWikiLMConfigs() + openai_kwargs = { + 'api_key': os.getenv("OPENAI_API_KEY"), + # feel free to change these two + 'temperature': 1.0, + 'top_p': 0.9, + } + + gpt_35 = OpenAIModel(model='gpt-4o-mini', max_tokens=500, **openai_kwargs) + gpt_4 = OpenAIModel(model='chatgpt-4o-latest', max_tokens=3000, **openai_kwargs) + + lm_configs.set_conv_simulator_lm(gpt_35) + lm_configs.set_question_asker_lm(gpt_35) + lm_configs.set_outline_gen_lm(gpt_4) + lm_configs.set_article_gen_lm(gpt_4) + lm_configs.set_article_polish_lm(gpt_4) + + engine_args = STORMWikiRunnerArguments( + output_dir='./results', + max_conv_turn=3, + max_perspective=3, + search_top_k=3, + max_thread_num=3 + ) + + # parameters set here will supersede the default values, which is sometime necessary with DuckDuckGo + rm = EnhancedDuckDuckGoSearchRM( + k=engine_args.search_top_k, + # don't change these two parameters + safe_search='On', + backend='auto', # "api" has been deprecated in the duckduckgo_search library + # but feel free to experiment with the rest + region='us-en', + min_char_count=100, + snippet_chunk_size=800, + webpage_helper_max_threads=1, + request_delay=2.0, + max_retries=12, + max_time=2000 + ) + + runner = STORMWikiRunner(engine_args, lm_configs, rm) + + topic = input('Topic: ') + try: + runner.run( + topic=topic, + do_research=True, + do_generate_outline=True, + do_generate_article=True, + do_polish_article=True, + remove_duplicate=True + ) + runner.post_run() + runner.summary() + except Exception as e: + print("\nFull exception details:") + print("Type:", type(e)) + print("String:", str(e)) + print("Repr:", repr(e)) + print("Dir:", dir(e)) + if hasattr(e, 'args'): + print("Args:", e.args) + raise + +if __name__ == "__main__": + main()