# using_gpt_api_to_automate_data_entry.py
# Extract text from a PDF invoice, pull fields out with the GPT API, and export them to Excel.
import os
import sys

import openai
import pandas as pd
import PyPDF2
# Set your GPT API key.
# Prefer the OPENAI_API_KEY environment variable so real credentials never
# live in source control; the placeholder remains only as a fallback.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'your-api-key')
# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order. A page for which
        ``extract_text()`` yields nothing contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        # PyPDF2 3.x removed PdfFileReader/getNumPages/getPage, so use the
        # current PdfReader API when present and fall back to the legacy
        # class only on older installations.
        if hasattr(PyPDF2, 'PdfReader'):
            pages = PyPDF2.PdfReader(file).pages
        else:
            legacy_reader = PyPDF2.PdfFileReader(file)
            pages = [
                legacy_reader.getPage(page_num)
                for page_num in range(legacy_reader.getNumPages())
            ]
        # Extract inside the `with` block so the reader can still read from
        # the open file; join once instead of repeated string concatenation.
        return ''.join(page.extract_text() or '' for page in pages)
# Step 2: Extract structured data using GPT API
def extract_data_with_gpt(pdf_text, model="text-davinci-003", max_tokens=300, temperature=0):
    """Ask the GPT completion endpoint to pull invoice fields out of *pdf_text*.

    NOTE: ``openai.Completion`` is the legacy (pre-1.0) interface of the
    ``openai`` package; this function keeps using it so behavior matches the
    installed SDK the script was written for.

    Args:
        pdf_text: Raw invoice text to embed in the prompt.
        model: Completion engine name; defaults to the original hard-coded one.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature. Defaults to 0 so that repeated runs
            on the same invoice give the same answer for the line-based parser.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    prompt = (
        f"Extract the following information from this invoice:\n\n{pdf_text}\n\n"
        "Extract the following fields:\n"
        "- Customer Name\n"
        "- Invoice Number\n"
        "- Date\n"
        "- Line Items (Description, Quantity, Unit Price, Total)\n"
        "- Total Amount"
    )
    response = openai.Completion.create(
        engine=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].text.strip()
# Step 3: Parse the GPT output into structured data
def parse_gpt_output(gpt_output):
    """Parse ``Field: value`` lines from the GPT answer into a dict.

    Only 'Customer Name', 'Invoice Number' and 'Total Amount' are collected;
    line items are not parsed here. A line is matched when the field name
    appears anywhere in it. If a field appears on several lines, the last
    occurrence wins.

    Args:
        gpt_output: The text returned by the model, one field per line.

    Returns:
        A dict mapping each field name that was found to its stripped value.
        Fields that are absent from the text are absent from the dict.
    """
    wanted_fields = ('Customer Name', 'Invoice Number', 'Total Amount')
    data = {}
    for line in gpt_output.splitlines():
        # partition() keeps everything after the FIRST colon, so values that
        # themselves contain ':' are not truncated.
        _label, colon, value = line.partition(':')
        if not colon:
            # A line mentioning a field without any colon carries no value;
            # skip it instead of raising IndexError.
            continue
        for field in wanted_fields:
            if field in line:
                data[field] = value.strip()
    return data
# Step 4: Export structured data to Excel
def export_to_excel(data, output_path):
    """Write *data* to an Excel file at *output_path* as a single row.

    The dict is wrapped in a one-element list, so its keys become the column
    headers and its values the only row. The DataFrame index is not written.
    """
    single_row = [data]
    pd.DataFrame(single_row).to_excel(output_path, index=False)
# Main function
def main(pdf_path, output_path):
    """Run the whole pipeline: PDF text -> GPT answer -> parsed fields -> Excel.

    Args:
        pdf_path: Path of the invoice PDF to read.
        output_path: Path of the Excel file to write.
    """
    # Stages 1-2: read the PDF and have GPT describe its fields.
    gpt_answer = extract_data_with_gpt(extract_text_from_pdf(pdf_path))
    # Stages 3-4: turn the answer into a dict and write it out.
    export_to_excel(parse_gpt_output(gpt_answer), output_path)
if __name__ == "__main__":
    # Optional overrides: script.py [input.pdf] [output.xlsx].
    # With no arguments the original default paths are used.
    cli_args = sys.argv[1:]
    input_pdf = cli_args[0] if len(cli_args) > 0 else 'invoice.pdf'
    output_xlsx = cli_args[1] if len(cli_args) > 1 else 'invoice_output.xlsx'
    main(input_pdf, output_xlsx)