-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparser.py
24 lines (18 loc) · 897 Bytes
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from typing import Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
import pandas as pd
import json
class DataSetParser(BaseBlobParser):
    """Parser for dataset blobs (XLSX or CSV) that emits one JSON document."""

    # MIME types this parser knows how to load into a DataFrame.
    _XLSX_MIMETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    _CSV_MIMETYPE = "text/csv"

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob into a single ``Document``.

        The blob is loaded into a pandas DataFrame chosen by ``blob.mimetype``
        (XLSX is read with its first column as the index, CSV with the default
        index), converted with ``DataFrame.to_dict()`` and serialized with
        ``json.dumps``.

        Args:
            blob: The blob to parse; its ``mimetype`` selects the reader and
                its ``path`` (if any) is used to derive the document name.

        Yields:
            One Document whose ``page_content`` is the JSON-encoded table and
            whose metadata holds ``source`` ("analytibot") and ``name`` (the
            blob path with its last ``.suffix`` removed, or ``""`` when the
            blob has no path).

        Raises:
            ValueError: If ``blob.mimetype`` is neither XLSX nor CSV.
        """
        mimetype = blob.mimetype
        if mimetype not in (self._XLSX_MIMETYPE, self._CSV_MIMETYPE):
            # Fail early and explicitly instead of falling through to a
            # confusing AttributeError from calling .to_dict() on a str.
            raise ValueError(
                f"DataSetParser cannot parse blob with mimetype {mimetype!r}"
            )

        with blob.as_bytes_io() as file:
            if mimetype == self._XLSX_MIMETYPE:
                frame = pd.read_excel(file, index_col=0)
            else:
                frame = pd.read_csv(file)
        content = frame.to_dict()

        # blob.path may be a str, a path object, or None (in-memory blob), so
        # coerce to str before splitting and tolerate a missing path.
        path = blob.path
        name = "" if path is None else str(path).rsplit(".", 1)[0]

        yield Document(
            page_content=json.dumps(content),
            metadata={"source": "analytibot", "name": name},
        )