-
Notifications
You must be signed in to change notification settings - Fork 140
/
Copy pathhelper_functions.py
245 lines (183 loc) · 8.32 KB
/
helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import tiktoken
import re
from langchain.docstore.document import Document
import PyPDF2
import pylcs
import pandas as pd
import textwrap
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Calculates the number of tokens in a given string using a specified encoding.

    Args:
        string: The input string to tokenize.
        encoding_name: Either a model name that tiktoken can map to an encoding
            (e.g., 'gpt-4') or the name of an encoding (e.g., 'cl100k_base').

    Returns:
        The number of tokens in the string according to the resolved encoding.

    Raises:
        KeyError: If encoding_name is neither a model name nor an encoding name
            that tiktoken recognises.
    """
    try:
        # Model names keep resolving exactly as they did before.
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError as model_lookup_error:
        # Not a model name: honour the parameter's documented meaning and look
        # it up as an encoding name instead.
        try:
            encoding = tiktoken.get_encoding(encoding_name)
        except ValueError:
            # Keep raising the same exception type callers already saw for
            # unknown names.
            raise model_lookup_error from None
    return len(encoding.encode(string))
def replace_t_with_space(list_of_documents):
    """
    Swaps every tab character in each document's page content for a single space.

    The documents are updated in place; no copies are made.

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The same list that was passed in, with its documents cleaned.
    """
    tab, space = '\t', ' '
    for document in list_of_documents:
        original_text = document.page_content
        document.page_content = original_text.replace(tab, space)
    return list_of_documents
def replace_double_lines_with_one_line(text):
    """
    Collapses each pair of adjacent newline characters into a single newline.

    Pairs are consumed left to right without overlap, so a run of three
    newlines becomes two and a run of four becomes two.

    Args:
        text: The input text string.

    Returns:
        The text string with every newline pair replaced by one newline.
    """
    double_newline = re.compile(r'\n\n')
    return double_newline.sub('\n', text)
def split_into_chapters(book_path):
    """
    Reads a PDF book and cuts its text into chapters at chapter headings.

    A heading is the word CHAPTER followed by one or more upper-case words.
    Any text before the first heading is discarded.

    Args:
        book_path (str): The path to the PDF book file.

    Returns:
        list: One Document per chapter, holding the heading plus the chapter text,
        with the 1-based chapter number stored under the "chapter" metadata key.
    """
    # Pull the text out of every page while the file is still open.
    with open(book_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in reader.pages]
    full_text = " ".join(page_texts)

    # Because the heading pattern is a capturing group, re.split keeps the
    # headings: the result alternates preamble, heading, body, heading, body...
    pieces = re.split(r'(CHAPTER\s[A-Z]+(?:\s[A-Z]+)*)', full_text)
    headings = pieces[1::2]
    bodies = pieces[2::2]

    return [
        Document(page_content=heading + body, metadata={"chapter": number})
        for number, (heading, body) in enumerate(zip(headings, bodies), start=1)
    ]
def extract_book_quotes_as_documents(documents, min_length=50):
    """
    Extracts long quotations (text between curly double quotes) from documents.

    Args:
        documents: An iterable of document objects, each with a 'page_content' attribute.
        min_length: The minimum number of characters a quote must contain to be kept.

    Returns:
        A list of Document objects, one per extracted quote, whose page content is
        the quote text without the surrounding quotation marks.
    """
    # The quote body must not contain a quotation mark itself. Matching "any
    # character" here would let a quote shorter than min_length swallow the
    # narration after it and end at the closing mark of a later quote.
    quote_pattern = re.compile(rf'“([^“”]{{{min_length},}})”')

    quotes_as_documents = []
    for doc in documents:
        # Quotes may span line breaks; flatten them so each quote is one line.
        content = doc.page_content.replace('\n', ' ')
        quotes_as_documents.extend(
            Document(page_content=quote) for quote in quote_pattern.findall(content)
        )
    return quotes_as_documents
def escape_quotes(text):
    """Puts a backslash in front of every single and double quote in a string.

    Args:
        text: The string to escape.

    Returns:
        A copy of the string in which each quote character is preceded by a backslash.
    """
    escape_table = str.maketrans({'"': '\\"', "'": "\\'"})
    return text.translate(escape_table)
def text_wrap(text, width=120):
    """
    Re-flows the input text so that no line is longer than the given width.

    Args:
        text (str): The input text to wrap.
        width (int): The maximum line length of the result.

    Returns:
        str: The wrapped text, with lines joined by newline characters.
    """
    wrapper = textwrap.TextWrapper(width=width)
    return wrapper.fill(text)
def is_similarity_ratio_lower_than_th(large_string, short_string, th):
    """
    Tells whether two strings are less similar than a given threshold.

    The similarity ratio is the value returned by pylcs.lcs_sequence_length for
    the two strings, divided by the length of the shorter string.

    Args:
        large_string: The larger string to compare.
        short_string: The shorter string to compare; its length is the divisor.
        th: The similarity threshold.

    Returns:
        True if the similarity ratio is strictly lower than the threshold, False otherwise.

    Raises:
        ZeroDivisionError: If short_string is empty.
    """
    common_length = pylcs.lcs_sequence_length(large_string, short_string)
    ratio = common_length / len(short_string)
    return True if ratio < th else False
def analyse_metric_results(results_df):
    """
    Prints a short report for every metric in the results.

    For each metric a header with the upper-cased metric name is printed. Metrics
    that have an explanation in the table below also get that explanation and
    their score formatted to four decimals; other metrics get only the header.

    Args:
        results_df: A pandas DataFrame containing the metric results. When a
            column is a Series, its first value is reported as the score.
    """
    # For every metric listed here, a higher score is better.
    explanations = {
        "faithfulness":
            "Measures how well the generated answer is supported by the retrieved documents.",
        "answer_relevancy":
            "Measures how relevant the generated answer is to the question.",
        "context_precision":
            "Measures the proportion of retrieved documents that are actually relevant.",
        "context_relevancy":
            "Measures how relevant the retrieved documents are to the question.",
        "context_recall":
            "Measures the proportion of relevant documents that are successfully retrieved.",
        "context_entity_recall":
            "Measures the proportion of relevant entities mentioned in the question that are also found in the retrieved documents.",
        "answer_similarity":
            "Measures the semantic similarity between the generated answer and the ground truth answer.",
        "answer_correctness":
            "Measures whether the generated answer is factually correct.",
    }

    for metric_name, metric_value in results_df.items():
        print(f"\n**{metric_name.upper()}**")

        # A DataFrame column arrives as a Series; report its first entry.
        if isinstance(metric_value, pd.Series):
            metric_value = metric_value.values[0]

        explanation = explanations.get(metric_name)
        if explanation is None:
            continue
        print(explanation)
        print(f"Score: {metric_value:.4f}")
import dill
def save_object(obj, filename):
    """
    Serialize a Python object with dill and write it to a file.

    An existing file with the same name is overwritten. A confirmation
    message is printed once the object has been written.

    Args:
    - obj: The Python object to save.
    - filename: The name of the file where the object will be saved.
    """
    with open(filename, 'wb') as output_file:
        dill.dump(obj, output_file)
    print(f"Object has been saved to '{filename}'.")
def load_object(filename):
    """
    Read a file written with dill and rebuild the Python object stored in it.

    A confirmation message is printed once the object has been read.

    Args:
    - filename: The name of the file from which the object will be loaded.

    Returns:
    - The loaded Python object.
    """
    # NOTE: like pickle, dill can run arbitrary code while loading, so only
    # load files that come from a trusted source.
    with open(filename, 'rb') as input_file:
        restored = dill.load(input_file)
    print(f"Object has been loaded from '{filename}'.")
    return restored
# Example usage:
# save_object(plan_and_execute_app, 'plan_and_execute_app.pkl')
# plan_and_execute_app = load_object('plan_and_execute_app.pkl')