Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add paper analysis workflow & pdf loader #19

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions autoresearcher/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .workflows.literature_review.literature_review import literature_review
from .workflows.paper_analysis.analyze_paper import analyze_paper
19 changes: 19 additions & 0 deletions autoresearcher/agents/data_analysis_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from langchain.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
assert OPENAI_API_KEY, "OPENAI_API_KEY environment variable is missing from .env"


class DataAnalysisAgent:
    """Wraps a LangChain pandas-dataframe agent around a CSV file.

    The CSV is read eagerly at construction time; natural-language
    commands are then delegated to the underlying agent.
    """

    def __init__(self, csv_file_path):
        # Load the data once up front so repeated commands reuse it.
        dataframe = pd.read_csv(csv_file_path)
        self.df = dataframe
        # temperature=0 for deterministic completions; verbose for tracing.
        self.agent = create_pandas_dataframe_agent(
            OpenAI(temperature=0), dataframe, verbose=True
        )

    def run(self, command):
        """Execute a natural-language *command* against the dataframe."""
        return self.agent.run(command)
Empty file.
6 changes: 6 additions & 0 deletions autoresearcher/data_sources/file_loaders/pdf_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from langchain.document_loaders import PyMuPDFLoader

def load_pdf(pdf_path):
    """Read the PDF at *pdf_path* and return its pages as LangChain documents."""
    return PyMuPDFLoader(pdf_path).load()
18 changes: 18 additions & 0 deletions autoresearcher/utils/analyze_section.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from autoresearcher.llms.openai import openai_call
from termcolor import colored


def analyze_section(faiss_index, section_name, search_term, k=1):
    """Summarize one aspect of a paper via similarity search plus an LLM call.

    Args:
        faiss_index: FAISS vector store built over the paper's pages.
        section_name: Human-readable label (e.g. "findings") used in the prompt.
        search_term: Query text for the similarity search.
        k: Number of nearest documents to retrieve (default 1).

    Returns:
        The LLM's answer as a string.
    """
    print(colored(f"Analyzing {section_name}...", "cyan"))

    matches = faiss_index.similarity_search(search_term, k=k)
    # Concatenate the retrieved documents (newline-terminated) into one
    # context string appended to the prompt.
    context = "".join(str(match) + "\n" for match in matches)

    prompt = f"What are the main {section_name} in the paper:" + context
    return openai_call(prompt, use_gpt4=False, temperature=0, max_tokens=500)
Empty file.
36 changes: 36 additions & 0 deletions autoresearcher/workflows/paper_analysis/analyze_paper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
from dotenv import load_dotenv
from termcolor import colored
from autoresearcher.data_sources.file_loaders.pdf_loader import load_pdf
from autoresearcher.utils.analyze_section import analyze_section

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
assert OPENAI_API_KEY, "OPENAI_API_KEY environment variable is missing from .env"

def analyze_paper(pdf_path):
    """Analyze an academic paper PDF and print a structured summary.

    Loads the PDF, builds a FAISS index over its pages, and extracts the
    main findings, methodology, and limitations via LLM-backed similarity
    search. Results are printed with color highlighting.

    Args:
        pdf_path: Path to the PDF file to analyze.

    Returns:
        dict with keys "findings", "methodology", and "limitations", each
        mapping to the LLM's summary string. (Previously returned None;
        callers that ignore the return value are unaffected.)
    """
    # Plain string: the message has no placeholders, so no f-string needed.
    print(colored("Paper analysis initiated", "yellow", attrs=["bold", "blink"]))

    pages = load_pdf(pdf_path)

    # NOTE(review): retrieval quality might improve by splitting pages into
    # smaller chunks with overlap before indexing — confirm before changing.
    faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())

    findings = analyze_section(faiss_index, "findings", "Conclusion, findings, results")
    methodology = analyze_section(faiss_index, "methodology", "Methodology")
    limitations = analyze_section(faiss_index, "limitations", "Limitations")

    merged_findings = (
        f"{colored('Main findings:', 'green')} {findings}\n"
        f"{colored('Methodology:', 'yellow')} {methodology}\n"
        f"{colored('Limitations:', 'red')} {limitations}"
    )

    print("\n" + colored("Analysis results:", "magenta", attrs=["bold"]))
    print(merged_findings)

    # Return the raw summaries so callers can consume them programmatically.
    return {
        "findings": findings,
        "methodology": methodology,
        "limitations": limitations,
    }
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
"termcolor==1.1.0",
"jellyfish==0.11.2",
"tiktoken==0.3.3",
"faiss-cpu==1.7.3",
"PyMuPDF==1.22.0",
"langchain>=0.0.141",
"setuptools>=42",
"wheel"
],
Expand Down