-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
130 lines (114 loc) · 3.66 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import json
import PyPDF2
from typing import Union
from tqdm import tqdm
import csv
def load_txt(document_name: str) -> Union[str, None]:
    """
    Load a plain-text file and return its content as a single string.

    Args:
        document_name: Path of the text file to read.

    Returns:
        The complete file content, or None when the file does not exist
        or any other error occurs while reading. A message is printed in
        both failure cases.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale encoding of the machine running the code.
        with open(document_name, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"'{document_name}' not found")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(f"Error happened : {e}")
        return None
def load_jsonl(document_name: str) -> Union[str, None]:
    """
    Load a JSON Lines file and return its records as one string.

    Every non-blank line is parsed as a JSON value. Records that are
    already strings are kept unchanged; any other value (object, array,
    number, ...) is serialised back to JSON text. The resulting pieces
    are joined with a single space.

    Args:
        document_name: Path of the .jsonl file to read.

    Returns:
        The space-joined records, or None when the file does not exist,
        a line is not valid JSON, or any other error occurs. A message is
        printed in the failure cases.
    """
    pieces = []
    try:
        # JSON text is UTF-8, so decode it as such regardless of locale.
        with open(document_name, "r", encoding="utf-8") as file:
            for line in tqdm(file):
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no record; json.loads would reject them.
                if not line.strip():
                    continue
                record = json.loads(line)
                # str.join only accepts strings, so parsed objects must be
                # turned back into text before they can be joined.
                if isinstance(record, str):
                    pieces.append(record)
                else:
                    pieces.append(json.dumps(record, ensure_ascii=False))
        return " ".join(pieces)
    except FileNotFoundError:
        print(f"'{document_name}' not found")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(f"Error happened : {e}")
        return None
def load_json(document_name: str) -> Union[dict, list, str, int, float, bool, None]:
    """
    Load a JSON file and return its parsed content.

    The value returned is the Python object produced by ``json.load``
    (dict, list, str, number or bool depending on the top-level JSON
    value); it is not converted to a string.

    Args:
        document_name: Path of the .json file to read.

    Returns:
        The parsed JSON value, or None when the file does not exist, is
        not valid JSON, or any other error occurs. A message is printed
        in the failure cases. A file whose top-level value is ``null``
        also yields None.
    """
    try:
        # JSON text is UTF-8, so decode it as such regardless of locale.
        with open(document_name, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"'{document_name}' not found")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(f"Error happened : {e}")
        return None
def load_pdf(document_name: str) -> Union[str, None]:
    """
    Extract the text of every page of a PDF and return it as one string.

    Args:
        document_name: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or None when
        the file does not exist or any other error occurs. A message is
        printed in the failure cases.
    """
    try:
        with open(document_name, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Page texts are glued back to back, with no separator added
            # between consecutive pages.
            return "".join(page.extract_text() for page in reader.pages)
    except FileNotFoundError:
        print(f"'{document_name}' not found")
        return None
    except Exception as e:
        # Any parsing or extraction failure is reported and mapped to None.
        print(f"Error happened : {e}")
        return None
def load_csv(document_name: str) -> Union[str, None]:
    """
    Load a CSV file and return its rows as one newline-terminated string.

    The file is parsed with ``csv.reader``; each row's fields are then
    re-joined with plain commas and followed by a newline. Quoting from
    the input is therefore not preserved: a quoted field that itself
    contains a comma appears unquoted in the result.

    Args:
        document_name: Path of the CSV file to read.

    Returns:
        The flattened rows (an empty string for an empty file), or None
        when the file does not exist or any other error occurs. A message
        is printed in the failure cases.
    """
    try:
        # newline="" lets the csv module handle line endings itself;
        # UTF-8 is set explicitly so decoding does not depend on locale.
        with open(document_name, newline="", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile)
            # Build the text in one pass instead of growing a string with
            # repeated "+=" concatenation.
            return "".join(",".join(row) + "\n" for row in reader)
    except FileNotFoundError:
        print(f"'{document_name}' not found")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(f"Error happened : {e}")
        return None
def load_document(document_name: str) -> Union[str, None]:
    """
    Load content from a document (path), choosing the loader by extension.

    Currently supported: txt, json, jsonl, pdf & csv. The extension is
    matched case-insensitively, so "REPORT.PDF" is handled like
    "report.pdf".

    Args:
        document_name: Path of the document to load.

    Returns:
        Whatever the format-specific loader returned, or None when the
        extension is not supported or the loader returned None. Progress
        and error messages are printed along the way.
    """
    print(f"Loading document {document_name}")
    # Only the comparison uses the lower-cased name; the loaders receive
    # the path exactly as it was given.
    lowered = document_name.lower()
    if lowered.endswith(".txt"):
        content = load_txt(document_name=document_name)
    elif lowered.endswith(".json"):
        content = load_json(document_name=document_name)
    elif lowered.endswith(".jsonl"):
        content = load_jsonl(document_name=document_name)
    elif lowered.endswith(".pdf"):
        content = load_pdf(document_name=document_name)
    elif lowered.endswith(".csv"):
        content = load_csv(document_name=document_name)
    else:
        print("Document format not supported.")
        return None
    # The loaders signal failure by returning None.
    if content is None:
        print("Error: no loaded content detected")
        return None
    print("Content loaded.")
    return content