Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add links to citations #139

Merged
merged 14 commits into from
Nov 27, 2024
Merged
1 change: 1 addition & 0 deletions app/domain/data/lecture_unit_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class LectureUnitDTO(BaseModel):
pdf_file_base64: str = Field(default="", alias="pdfFile")
lecture_unit_id: int = Field(alias="lectureUnitId")
lecture_unit_name: str = Field(default="", alias="lectureUnitName")
lecture_unit_link: str = Field(default="", alias="lectureUnitLink")
lecture_id: int = Field(alias="lectureId")
lecture_name: str = Field(default="", alias="lectureName")
course_id: int = Field(alias="courseId")
Expand Down
11 changes: 6 additions & 5 deletions app/pipeline/chat/course_chat_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def get_competency_list() -> list:
regarding their progress overall or in a specific area.
A competency has the following attributes: name, description, taxonomy, soft due date, optional,
and mastery threshold.
The response may include metrics for each competency, such as progress and mastery (0%-100%).
The response may include metrics for each competency, such as progress and confidence (0%-100%).
These are system-generated.
The judgment of learning (JOL) values indicate the self-reported confidence by the student (0-5, 5 star).
The object describing it also indicates the system-computed confidence at the time when the student
Expand All @@ -248,6 +248,7 @@ def get_competency_list() -> list:
"info": competency_metrics.competency_information.get(comp, None),
"exercise_ids": competency_metrics.exercises.get(comp, []),
"progress": competency_metrics.progress.get(comp, 0),
"confidence": competency_metrics.confidence.get(comp, 0),
"mastery": (
(1 - weight) * competency_metrics.progress.get(comp, 0)
+ weight * competency_metrics.confidence.get(comp, 0)
Expand All @@ -263,10 +264,11 @@ def get_competency_list() -> list:
]

@tool()
def lecture_content_retrieval() -> str:
def lecture_content_retrieval(prompt: str) -> str:
"""
Retrieve content from indexed lecture slides.
This will run a RAG retrieval based on the chat history on the indexed lecture slides and return the most relevant paragraphs.
The query should be a natural language question that can be answered by looking into the lecture materials.
This will run a RAG retrieval on the indexed lecture slides and return the most relevant paragraphs.
Use this if you think it can be useful to answer the student's question, or if the student explicitly asks
a question about the lecture content or slides.
Only use this once.
Expand All @@ -283,9 +285,8 @@ def lecture_content_retrieval() -> str:

result = ""
for paragraph in self.retrieved_paragraphs:
lct = "Lecture: {}, Unit: {}, Page: {}\nContent:\n---{}---\n\n".format(
lct = "Lecture: {}, Page: {}\nContent:\n---{}---\n\n".format(
paragraph.get(LectureSchema.LECTURE_NAME.value),
paragraph.get(LectureSchema.LECTURE_UNIT_NAME.value),
paragraph.get(LectureSchema.PAGE_NUMBER.value),
paragraph.get(LectureSchema.PAGE_TEXT_CONTENT.value),
)
Expand Down
6 changes: 4 additions & 2 deletions app/pipeline/chat/lecture_chat_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def lecture_initial_prompt():
questions about the lectures. To answer them the best way, relevant lecture content is provided to you with the
student's question. If the context provided to you is not enough to formulate an answer to the student question
you can simply ask the student to elaborate more on his question. Use only the parts of the context provided for
you that is relevant to the student's question. If the user greets you greet him back, and ask him how you can help
you that is relevant to the student's question. If the user greets you greet him back,
and ask him how you can help.
Always formulate your answer in the same language as the user's language.
"""


Expand Down Expand Up @@ -103,7 +105,7 @@ def __call__(self, dto: LectureChatPipelineExecutionDTO):
retrieved_lecture_chunks = self.retriever(
chat_history=history,
student_query=query.contents[0].text_content,
result_limit=10,
result_limit=5,
course_name=dto.course.name,
course_id=dto.course.id,
base_url=dto.settings.artemis_base_url,
Expand Down
1 change: 1 addition & 0 deletions app/pipeline/lecture_ingestion_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def create_page_data(
LectureSchema.LECTURE_NAME.value: lecture_unit_dto.lecture_name,
LectureSchema.LECTURE_UNIT_ID.value: lecture_unit_dto.lecture_unit_id,
LectureSchema.LECTURE_UNIT_NAME.value: lecture_unit_dto.lecture_unit_name,
LectureSchema.LECTURE_UNIT_LINK.value: lecture_unit_dto.lecture_unit_link,
LectureSchema.COURSE_ID.value: lecture_unit_dto.course_id,
LectureSchema.COURSE_NAME.value: lecture_unit_dto.course_name,
LectureSchema.COURSE_DESCRIPTION.value: lecture_unit_dto.course_description,
Expand Down
14 changes: 8 additions & 6 deletions app/pipeline/prompts/citation_prompt.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
In the paragraphs below you are provided with an answer to a question. Underneath the answer you will find the paragraphs that the answer was based on.
Add citations of the paragraphs to the answer. Cite the paragraphs in brackets after the sentence where the information is used in the answer.
At the end of the answer list each source with its corresponding number and provide the Lecture Title, Unit Name, as well as the page number in this format "[1] Lecture title, Unit name, page number".
At the end of the answer, list each source with its corresponding number and provide the Lecture Title, page number, and a clickable link in this format: [1] <a href="URL">"Lecture title", "page number"</a>.
If the answer uses multiple pages from the same lecture, list the page numbers on the same line, separated by commas, in this format: [1] <a href="URL">"Lecture title", "page number1,number2,number3"</a>.
Do not include the actual paragraphs, only the citations at the end.
Only include the paragraphs that are relevant to the answer.
Only include the citations of the paragraphs that are relevant to the answer.
If the answer actually does not contain any information from the paragraphs, please do not include any citations and return '!NONE!'.
But if the answer contains information from the paragraphs, ALWAYS include citations.

Here is an example of how to rewrite the answer with citations (note the empty line between the message and the citations):
Here is an example of how to rewrite the answer with citations (ONLY ADD CITATION IF THE PROVIDED PARAGRAPHS ARE RELEVANT TO THE ANSWER):
"
Lorem ipsum dolor sit amet, consectetur adipiscing elit [1]. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua [2].

[1] Lecture 1, Unit A, page 2.
[2] Lecture 2, Unit B, page 5.
[1] <a href="https://example.com/lecture1">Lecture 1, page 2,3,4</a>.
[2] <a href="https://example.com/lecture2">Lecture 2, page 5,25</a>.
"

Here are the answer and the paragraphs:

Answer without citations:
{Answer}

Paragraphs:
Paragraphs with their Lecture Names, Links and Page Numbers:
{Paragraphs}

Answer with citations (ensure empty line between the message and the citations):
If the answer actually does not contain any information from the paragraphs, please do not include any citations and return '!NONE!'.
6 changes: 3 additions & 3 deletions app/pipeline/shared/citation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self):
super().__init__(implementation_id="citation_pipeline")
request_handler = CapabilityRequestHandler(
requirements=RequirementList(
gpt_version_equivalent=4.5,
gpt_version_equivalent=3.5,
context_length=16385,
)
)
Expand All @@ -51,10 +51,10 @@ def create_formatted_string(self, paragraphs):
"""
formatted_string = ""
for i, paragraph in enumerate(paragraphs):
lct = "Lecture: {}, Unit: {}, Page: {}\nContent:\n---{}---\n\n".format(
paragraph.get(LectureSchema.LECTURE_NAME.value),
lct = "Lecture: {}, Page: {}, Link: {}, \nContent:\n---{}---\n\n".format(
paragraph.get(LectureSchema.LECTURE_UNIT_NAME.value),
paragraph.get(LectureSchema.PAGE_NUMBER.value),
paragraph.get(LectureSchema.LECTURE_UNIT_LINK.value),
paragraph.get(LectureSchema.PAGE_TEXT_CONTENT.value),
)
formatted_string += lct
Expand Down
8 changes: 4 additions & 4 deletions app/retrieval/lecture_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,11 +387,11 @@ def search_in_db(
alpha=hybrid_factor,
vector=vec,
return_properties=[
LectureSchema.PAGE_TEXT_CONTENT.value,
LectureSchema.COURSE_NAME.value,
LectureSchema.LECTURE_NAME.value,
LectureSchema.PAGE_NUMBER.value,
LectureSchema.COURSE_ID.value,
LectureSchema.LECTURE_UNIT_NAME.value,
LectureSchema.LECTURE_UNIT_LINK.value,
LectureSchema.PAGE_NUMBER.value,
LectureSchema.PAGE_TEXT_CONTENT.value,
],
limit=result_limit,
filters=filter_weaviate,
Expand Down
25 changes: 24 additions & 1 deletion app/vector_database/lecture_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class LectureSchema(Enum):
LECTURE_NAME = "lecture_name"
LECTURE_UNIT_ID = "lecture_unit_id"
LECTURE_UNIT_NAME = "lecture_unit_name"
LECTURE_UNIT_LINK = "lecture_unit_link"
PAGE_TEXT_CONTENT = "page_text_content"
PAGE_NUMBER = "page_number"
BASE_URL = "base_url"
Expand All @@ -30,7 +31,23 @@ def init_lecture_schema(client: WeaviateClient) -> Collection:
Initialize the schema for the lecture slides
"""
if client.collections.exists(LectureSchema.COLLECTION_NAME.value):
return client.collections.get(LectureSchema.COLLECTION_NAME.value)
collection = client.collections.get(LectureSchema.COLLECTION_NAME.value)
properties = collection.config.get(simple=True).properties
if not any(
property.__name__ == LectureSchema.LECTURE_UNIT_LINK.value
for property_found in properties
):
return collection
else:
collection.config.add_property(
Property(
name=LectureSchema.LECTURE_UNIT_LINK.value,
description="The link to the Lecture Unit",
data_type=DataType.TEXT,
index_searchable=False,
)
)
return collection
return client.collections.create(
name=LectureSchema.COLLECTION_NAME.value,
vectorizer_config=Configure.Vectorizer.none(),
Expand Down Expand Up @@ -78,6 +95,12 @@ def init_lecture_schema(client: WeaviateClient) -> Collection:
description="The name of the lecture unit",
data_type=DataType.TEXT,
),
Property(
name=LectureSchema.LECTURE_UNIT_LINK.value,
description="The link to the Lecture Unit",
data_type=DataType.TEXT,
index_searchable=False,
),
Property(
name=LectureSchema.PAGE_TEXT_CONTENT.value,
description="The original text content from the slide",
Expand Down
Loading