Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add paper analysis workflow & pdf loader #19

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions autoresearcher/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .workflows.literature_review.literature_review import literature_review
from .workflows.paper_analysis.analyze_paper import analyze_paper
19 changes: 19 additions & 0 deletions autoresearcher/agents/data_analysis_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from langchain.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
assert OPENAI_API_KEY, "OPENAI_API_KEY environment variable is missing from .env"


class DataAnalysisAgent:
    """Wraps a LangChain pandas-dataframe agent around a CSV file.

    The CSV is read eagerly at construction time; natural-language
    commands are then delegated to the underlying agent.
    """

    def __init__(self, csv_file_path):
        # Load the data once up front so repeated commands reuse it.
        dataframe = pd.read_csv(csv_file_path)
        self.df = dataframe
        # temperature=0 for deterministic completions; verbose for tracing.
        self.agent = create_pandas_dataframe_agent(
            OpenAI(temperature=0), dataframe, verbose=True
        )

    def run(self, command):
        """Execute a natural-language *command* against the dataframe."""
        return self.agent.run(command)
Empty file.
6 changes: 6 additions & 0 deletions autoresearcher/data_sources/file_loaders/pdf_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from langchain.document_loaders import PyMuPDFLoader

def load_pdf(pdf_path):
    """Read the PDF at *pdf_path* and return its pages as LangChain documents."""
    return PyMuPDFLoader(pdf_path).load()
18 changes: 18 additions & 0 deletions autoresearcher/utils/analyze_section.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from autoresearcher.llms.openai import openai_call
from termcolor import colored


def analyze_section(faiss_index, section_name, search_term, k=1):
    """Summarize one aspect of a paper via similarity search plus an LLM call.

    Args:
        faiss_index: FAISS vector store built over the paper's pages.
        section_name: Human-readable label (e.g. "findings") used in the prompt.
        search_term: Query text for the similarity search.
        k: Number of nearest documents to retrieve (default 1).

    Returns:
        The LLM's answer as a string.
    """
    print(colored(f"Analyzing {section_name}...", "cyan"))

    matches = faiss_index.similarity_search(search_term, k=k)
    # Concatenate the retrieved documents (newline-terminated) into one
    # context string appended to the prompt.
    context = "".join(str(match) + "\n" for match in matches)

    prompt = f"What are the main {section_name} in the paper:" + context
    return openai_call(prompt, use_gpt4=False, temperature=0, max_tokens=500)
Empty file.
36 changes: 36 additions & 0 deletions autoresearcher/workflows/paper_analysis/analyze_paper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
from dotenv import load_dotenv
from termcolor import colored
from autoresearcher.data_sources.file_loaders.pdf_loader import load_pdf
from autoresearcher.utils.analyze_section import analyze_section

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
assert OPENAI_API_KEY, "OPENAI_API_KEY environment variable is missing from .env"

def analyze_paper(pdf_path):
    """Analyze an academic paper PDF and print a structured summary.

    Loads the PDF, builds a FAISS index over its pages, and extracts the
    main findings, methodology, and limitations via LLM-backed similarity
    search. Results are printed with color highlighting.

    Args:
        pdf_path: Path to the PDF file to analyze.

    Returns:
        dict with keys "findings", "methodology", and "limitations", each
        mapping to the LLM's summary string. (Previously returned None;
        callers that ignore the return value are unaffected.)
    """
    # Plain string: the message has no placeholders, so no f-string needed.
    print(colored("Paper analysis initiated", "yellow", attrs=["bold", "blink"]))

    pages = load_pdf(pdf_path)

    # NOTE(review): retrieval quality might improve by splitting pages into
    # smaller chunks with overlap before indexing — confirm before changing.
    faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())

    findings = analyze_section(faiss_index, "findings", "Conclusion, findings, results")
    methodology = analyze_section(faiss_index, "methodology", "Methodology")
    limitations = analyze_section(faiss_index, "limitations", "Limitations")

    merged_findings = (
        f"{colored('Main findings:', 'green')} {findings}\n"
        f"{colored('Methodology:', 'yellow')} {methodology}\n"
        f"{colored('Limitations:', 'red')} {limitations}"
    )

    print("\n" + colored("Analysis results:", "magenta", attrs=["bold"]))
    print(merged_findings)

    # Return the raw summaries so callers can consume them programmatically.
    return {
        "findings": findings,
        "methodology": methodology,
        "limitations": limitations,
    }
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
"termcolor==1.1.0",
"jellyfish==0.11.2",
"tiktoken==0.3.3",
"faiss-cpu==1.7.3",
"PyMuPDF==1.22.0",
"langchain>=0.0.141",
"setuptools>=42",
"wheel"
],
Expand Down