From 3e18a0b5f48f6f481c6406fcf90aa85c9413d0fd Mon Sep 17 00:00:00 2001
From: Yassine Souissi <74144843+yassinsws@users.noreply.github.com>
Date: Wed, 27 Nov 2024 20:14:10 +0100
Subject: [PATCH] Add links to citations (#139)
---
app/domain/data/lecture_unit_dto.py | 1 +
app/pipeline/chat/lecture_chat_pipeline.py | 6 ++++--
app/pipeline/lecture_ingestion_pipeline.py | 1 +
app/pipeline/prompts/citation_prompt.txt | 16 ++++++++------
app/pipeline/shared/citation_pipeline.py | 5 +++--
app/retrieval/lecture_retrieval.py | 8 +++----
app/vector_database/lecture_schema.py | 25 +++++++++++++++++++++-
7 files changed, 46 insertions(+), 16 deletions(-)
diff --git a/app/domain/data/lecture_unit_dto.py b/app/domain/data/lecture_unit_dto.py
index 26ef1785..1396b823 100644
--- a/app/domain/data/lecture_unit_dto.py
+++ b/app/domain/data/lecture_unit_dto.py
@@ -7,6 +7,7 @@ class LectureUnitDTO(BaseModel):
pdf_file_base64: str = Field(default="", alias="pdfFile")
lecture_unit_id: int = Field(alias="lectureUnitId")
lecture_unit_name: str = Field(default="", alias="lectureUnitName")
+ lecture_unit_link: str = Field(default="", alias="lectureUnitLink")
lecture_id: int = Field(alias="lectureId")
lecture_name: str = Field(default="", alias="lectureName")
course_id: int = Field(alias="courseId")
diff --git a/app/pipeline/chat/lecture_chat_pipeline.py b/app/pipeline/chat/lecture_chat_pipeline.py
index cd17c9cb..7d3d5312 100644
--- a/app/pipeline/chat/lecture_chat_pipeline.py
+++ b/app/pipeline/chat/lecture_chat_pipeline.py
@@ -47,7 +47,9 @@ def lecture_initial_prompt():
questions about the lectures. To answer them the best way, relevant lecture content is provided to you with the
student's question. If the context provided to you is not enough to formulate an answer to the student question
you can simply ask the student to elaborate more on his question. Use only the parts of the context provided for
- you that is relevant to the student's question. If the user greets you greet him back, and ask him how you can help
+ you that is relevant to the student's question. If the user greets you greet him back,
+ and ask him how you can help.
+ Always formulate your answer in the same language as the user's language.
"""
@@ -105,7 +107,7 @@ def __call__(self, dto: LectureChatPipelineExecutionDTO):
retrieved_lecture_chunks = self.retriever(
chat_history=history,
student_query=query.contents[0].text_content,
- result_limit=10,
+ result_limit=5,
course_name=dto.course.name,
course_id=dto.course.id,
base_url=dto.settings.artemis_base_url,
diff --git a/app/pipeline/lecture_ingestion_pipeline.py b/app/pipeline/lecture_ingestion_pipeline.py
index 73d6371b..fe22cd4a 100644
--- a/app/pipeline/lecture_ingestion_pipeline.py
+++ b/app/pipeline/lecture_ingestion_pipeline.py
@@ -74,6 +74,7 @@ def create_page_data(
LectureSchema.LECTURE_NAME.value: lecture_unit_dto.lecture_name,
LectureSchema.LECTURE_UNIT_ID.value: lecture_unit_dto.lecture_unit_id,
LectureSchema.LECTURE_UNIT_NAME.value: lecture_unit_dto.lecture_unit_name,
+ LectureSchema.LECTURE_UNIT_LINK.value: lecture_unit_dto.lecture_unit_link,
LectureSchema.COURSE_ID.value: lecture_unit_dto.course_id,
LectureSchema.COURSE_NAME.value: lecture_unit_dto.course_name,
LectureSchema.COURSE_DESCRIPTION.value: lecture_unit_dto.course_description,
diff --git a/app/pipeline/prompts/citation_prompt.txt b/app/pipeline/prompts/citation_prompt.txt
index 4c143351..f69e885c 100644
--- a/app/pipeline/prompts/citation_prompt.txt
+++ b/app/pipeline/prompts/citation_prompt.txt
@@ -1,17 +1,18 @@
In the paragraphs below you are provided with an answer to a question. Underneath the answer you will find the paragraphs that the answer was based on.
Add citations of the paragraphs to the answer. Cite the paragraphs in brackets after the sentence where the information is used in the answer.
-At the end of the answer list each source with its corresponding number and provide the Lecture Title, Unit Name, as well as the page number in this format "[1] Lecture title, Unit name, page number".
-Do not Include the Actual paragraphs, only the citations at the end.
-Only include the paragraphs that are relevant to the answer.
+At the end of the answer, list each source with its corresponding number and provide the Lecture Title, page number, and a clickable link in this format: [1] "Lecture title", "Lecture unit title", "page number".
+If the answer uses multiple pages from the same lecture, list the page numbers on the same line, separated by commas, in this format: [1] "Lecture title", "Lecture unit title", "page number1,number2,number3".
+Do not include the actual paragraphs, only the citations at the end.
+Only include the citations of the paragraphs that are relevant to the answer.
If the answer actually does not contain any information from the paragraphs, please do not include any citations and return '!NONE!'.
But if the answer contains information from the paragraphs, ALWAYS include citations.
-Here is an example how to rewrite the answer with citations (note the empty line between the message and the citiations):
+Here is an example of how to rewrite the answer with citations (ONLY ADD CITATIONS IF THE PROVIDED PARAGRAPHS ARE RELEVANT TO THE ANSWER):
"
Lorem ipsum dolor sit amet, consectetur adipiscing elit [1]. Ded do eiusmod tempor incididunt ut labore et dolore magna aliqua [2].
-[1] Lecture 1, Unit A, page 2.
-[2] Lecture 2, Unit B, page 5.
+[1] Lecture 1, Unit A, page 2,3,4.
+[2] Lecture 2, Unit B, page 5,25.
"
Here are the answer and the paragraphs:
@@ -19,7 +20,8 @@ Here are the answer and the paragraphs:
Answer without citations:
{Answer}
-Paragraphs:
+Paragraphs with their Lecture Names, Unit Names, Links and Page Numbers:
{Paragraphs}
Answer with citations (ensure empty line between the message and the citations):
+If the answer actually does not contain any information from the paragraphs, please do not include any citations and return '!NONE!'.
diff --git a/app/pipeline/shared/citation_pipeline.py b/app/pipeline/shared/citation_pipeline.py
index b630bd4d..01e96e2b 100644
--- a/app/pipeline/shared/citation_pipeline.py
+++ b/app/pipeline/shared/citation_pipeline.py
@@ -26,7 +26,7 @@ def __init__(self):
super().__init__(implementation_id="citation_pipeline")
request_handler = CapabilityRequestHandler(
requirements=RequirementList(
- gpt_version_equivalent=4.5,
+ gpt_version_equivalent=4.25,
context_length=16385,
)
)
@@ -53,10 +53,11 @@ def create_formatted_string(self, paragraphs):
"""
formatted_string = ""
for i, paragraph in enumerate(paragraphs):
- lct = "Lecture: {}, Unit: {}, Page: {}\nContent:\n---{}---\n\n".format(
+ lct = "Lecture: {}, Unit: {}, Page: {}, Link: {},\nContent:\n---{}---\n\n".format(
paragraph.get(LectureSchema.LECTURE_NAME.value),
paragraph.get(LectureSchema.LECTURE_UNIT_NAME.value),
paragraph.get(LectureSchema.PAGE_NUMBER.value),
+ paragraph.get(LectureSchema.LECTURE_UNIT_LINK.value),
paragraph.get(LectureSchema.PAGE_TEXT_CONTENT.value),
)
formatted_string += lct
diff --git a/app/retrieval/lecture_retrieval.py b/app/retrieval/lecture_retrieval.py
index b681b589..75a5f278 100644
--- a/app/retrieval/lecture_retrieval.py
+++ b/app/retrieval/lecture_retrieval.py
@@ -404,11 +404,11 @@ def search_in_db(
alpha=hybrid_factor,
vector=vec,
return_properties=[
- LectureSchema.PAGE_TEXT_CONTENT.value,
- LectureSchema.COURSE_NAME.value,
- LectureSchema.LECTURE_NAME.value,
- LectureSchema.PAGE_NUMBER.value,
LectureSchema.COURSE_ID.value,
+ LectureSchema.LECTURE_UNIT_NAME.value,
+ LectureSchema.LECTURE_UNIT_LINK.value,
+ LectureSchema.PAGE_NUMBER.value,
+ LectureSchema.PAGE_TEXT_CONTENT.value,
],
limit=result_limit,
filters=filter_weaviate,
diff --git a/app/vector_database/lecture_schema.py b/app/vector_database/lecture_schema.py
index 912abed7..8c998c85 100644
--- a/app/vector_database/lecture_schema.py
+++ b/app/vector_database/lecture_schema.py
@@ -20,6 +20,7 @@ class LectureSchema(Enum):
LECTURE_NAME = "lecture_name"
LECTURE_UNIT_ID = "lecture_unit_id"
LECTURE_UNIT_NAME = "lecture_unit_name"
+ LECTURE_UNIT_LINK = "lecture_unit_link"
PAGE_TEXT_CONTENT = "page_text_content"
PAGE_NUMBER = "page_number"
BASE_URL = "base_url"
@@ -30,7 +31,23 @@ def init_lecture_schema(client: WeaviateClient) -> Collection:
Initialize the schema for the lecture slides
"""
if client.collections.exists(LectureSchema.COLLECTION_NAME.value):
- return client.collections.get(LectureSchema.COLLECTION_NAME.value)
+ collection = client.collections.get(LectureSchema.COLLECTION_NAME.value)
+ properties = collection.config.get(simple=True).properties
+ if any(
+ property_found.name == LectureSchema.LECTURE_UNIT_LINK.value
+ for property_found in properties
+ ):
+ return collection  # schema already has the link property
+ else:  # collection predates the link property: add it in place
+ collection.config.add_property(
+ Property(
+ name=LectureSchema.LECTURE_UNIT_LINK.value,
+ description="The link to the Lecture Unit",
+ data_type=DataType.TEXT,
+ index_searchable=False,
+ )
+ )
+ return collection
return client.collections.create(
name=LectureSchema.COLLECTION_NAME.value,
vectorizer_config=Configure.Vectorizer.none(),
@@ -78,6 +95,12 @@ def init_lecture_schema(client: WeaviateClient) -> Collection:
description="The name of the lecture unit",
data_type=DataType.TEXT,
),
+ Property(
+ name=LectureSchema.LECTURE_UNIT_LINK.value,
+ description="The link to the Lecture Unit",
+ data_type=DataType.TEXT,
+ index_searchable=False,
+ ),
Property(
name=LectureSchema.PAGE_TEXT_CONTENT.value,
description="The original text content from the slide",