Allow timestamping paragraphs (#59)
* WIP

* Add cline code

* Fix

* Add unit tests

* More complicated unit test

* Fix

* Fix

* Decimal point in test case

* Simplify requirement to only track starting time of a sentence

* Use ParagraphsSegmenter in MarkdownFormatter

* Add paragraph counting in integ test

* Fix

* Check ytdlp response with Pydantic

* WIP

* Add paragraph timestamping cli

* Fix tests

* Add integration test case
shun-liang authored Nov 7, 2024
1 parent f1e5a90 commit 413e8da
Showing 17 changed files with 491 additions and 119 deletions.
6 changes: 6 additions & 0 deletions src/yt2doc/cli.py
@@ -74,6 +74,11 @@ def main(
"--llm-api-key",
help="API key for the LLM server; No need to set if using local Ollama server",
),
to_timestamp_paragraphs: bool = typer.Option(
False,
"--timestamp-paragraphs",
help="Prepend timestamp to paragraphs",
),
skip_cache: typing.Annotated[
bool,
typer.Option("--skip-cache", help="If should skip reading from cache"),
@@ -145,6 +150,7 @@ def main(
sat_model=sat_model,
segment_unchaptered=segment_unchaptered,
ignore_source_chapters=ignore_source_chapters,
to_timestamp_paragraphs=to_timestamp_paragraphs,
llm_model=llm_model,
llm_server=llm_server,
llm_api_key=llm_api_key,
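For reference, the new flag can be reproduced in a minimal standalone Typer app. The sketch below mirrors only the option added above; the real yt2doc CLI defines many more options, so treat it as an illustration rather than the actual command definition.

```python
# Minimal sketch mirroring the new --timestamp-paragraphs option added above.
# The real yt2doc CLI has many more options; only this flag is shown here.
import typer

app = typer.Typer()


@app.command()
def main(
    to_timestamp_paragraphs: bool = typer.Option(
        False,
        "--timestamp-paragraphs",
        help="Prepend timestamp to paragraphs",
    ),
) -> None:
    # In yt2doc this value is passed through get_yt2doc() to the MarkdownFormatter.
    typer.echo(f"timestamp paragraphs: {to_timestamp_paragraphs}")


if __name__ == "__main__":
    app()
```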
20 changes: 11 additions & 9 deletions src/yt2doc/extraction/extractor.py
@@ -11,12 +11,12 @@
class Extractor:
def __init__(
self,
video_info_extractor: youtube_interfaces.IYtVideoInfoExtractor,
media_info_extractor: youtube_interfaces.IYtMediaInfoExtractor,
transcriber: transcription_interfaces.ITranscriber,
file_cache: interfaces.IFileCache,
ignore_source_chapters: bool,
) -> None:
self.yt_dlp_adapter = video_info_extractor
self.yt_dlp_adapter = media_info_extractor
self.transcriber = transcriber
self.file_cache = file_cache
self.ignore_source_chapters = ignore_source_chapters
@@ -28,16 +28,16 @@ def extract_by_chapter(
) -> interfaces.ChapteredTranscript:
logger.info(f"Extracting video {video_url} by chapter.")

video_info = self.yt_dlp_adapter.extract_video_info(video_url=video_url)
media_info = self.yt_dlp_adapter.extract_media_info(video_url=video_url)

if self.ignore_source_chapters:
video_info.chapters = []
media_info.chapters = []

if (
not skip_cache
and (
cached_chaptered_transcript := self.file_cache.get_chaptered_transcript(
video_id=video_info.video_id
video_id=media_info.video_id
)
)
is not None
@@ -52,7 +52,7 @@ def extract_by_chapter(
with Timer() as transcribe_timer:
transcript = self.transcriber.transcribe(
audio_path=audio_path,
video_info=video_info,
media_info=media_info,
)
transcripts_by_chapter = [
interfaces.TranscriptChapter(
@@ -65,14 +65,16 @@

chaptered_transcript = interfaces.ChapteredTranscript(
url=video_url,
title=video_info.title,
video_id=media_info.video_id,
title=media_info.title,
webpage_url_domain=media_info.webpage_url_domain,
chapters=transcripts_by_chapter,
chaptered_at_source=len(video_info.chapters) > 0,
chaptered_at_source=len(media_info.chapters) > 0,
language=transcript.language,
)

self.file_cache.cache_chaptered_transcript(
video_id=video_info.video_id,
video_id=media_info.video_id,
transcript=chaptered_transcript,
)

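The media_info object used above comes from the yt-dlp response that this commit validates with Pydantic (see the commit message). Its exact definition is not part of this view; the sketch below is inferred only from the attributes the extractor touches, and the class and field names are assumptions.

```python
# Hedged sketch of the Pydantic-validated yt-dlp media info. Only video_id, title,
# webpage_url_domain and chapters are referenced in extractor.py above; everything
# else here, including the class names, is hypothetical.
import typing

from pydantic import BaseModel, Field


class MediaChapter(BaseModel):
    # Hypothetical name; fields follow yt-dlp's chapter entries but are not
    # confirmed by this diff.
    title: str
    start_time: float
    end_time: float


class MediaInfo(BaseModel):
    # Hypothetical name; only these attributes appear in the extractor code above.
    video_id: str
    title: str
    webpage_url_domain: str
    chapters: typing.List[MediaChapter] = Field(default_factory=list)
```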
2 changes: 2 additions & 0 deletions src/yt2doc/extraction/interfaces.py
@@ -13,6 +13,8 @@ class TranscriptChapter(BaseModel):
class ChapteredTranscript(BaseModel):
url: str
title: str
video_id: str
webpage_url_domain: str
language: str
chapters: typing.Sequence[TranscriptChapter]
chaptered_at_source: bool
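A minimal construction of the extended model, to show the two fields added here. Values are placeholders, and chapters is left empty because TranscriptChapter's fields are not fully visible in this view.

```python
# Illustrative only: the two new required fields on ChapteredTranscript.
from yt2doc.extraction.interfaces import ChapteredTranscript

transcript = ChapteredTranscript(
    url="https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
    title="Some talk",
    video_id="VIDEO_ID",               # new required field in this commit
    webpage_url_domain="youtube.com",  # new required field in this commit
    language="en",
    chapters=[],  # TranscriptChapter fields are not fully shown, so left empty
    chaptered_at_source=False,
)
```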
18 changes: 14 additions & 4 deletions src/yt2doc/factories.py
@@ -15,6 +15,7 @@
from yt2doc.formatting.formatter import MarkdownFormatter
from yt2doc.formatting.llm_topic_segmenter import LLMTopicSegmenter
from yt2doc.formatting.llm_adapter import LLMAdapter
from yt2doc.formatting.paragraphs_segmenter import ParagraphsSegmenter
from yt2doc.yt2doc import Yt2Doc


@@ -31,6 +32,7 @@ def get_yt2doc(
sat_model: str,
segment_unchaptered: bool,
ignore_source_chapters: bool,
to_timestamp_paragraphs: bool,
llm_model: typing.Optional[str],
llm_server: str,
llm_api_key: str,
@@ -43,6 +45,7 @@
)

sat = SaT(sat_model)
paragraphs_segmenter = ParagraphsSegmenter(sat=sat)
if segment_unchaptered is True:
if llm_model is None:
raise LLMModelNotSpecified(
@@ -57,17 +60,24 @@
)
llm_adapter = LLMAdapter(llm_client=llm_client, llm_model=llm_model)
llm_topic_segmenter = LLMTopicSegmenter(llm_adapter=llm_adapter)
formatter = MarkdownFormatter(sat=sat, topic_segmenter=llm_topic_segmenter)
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
topic_segmenter=llm_topic_segmenter,
)
else:
formatter = MarkdownFormatter(sat=sat)
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
)

video_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
transcriber = Transcriber(
temp_dir=temp_dir,
whisper_adapter=whisper_adapter,
)
extractor = Extractor(
video_info_extractor=video_info_extractor,
media_info_extractor=media_info_extractor,
transcriber=transcriber,
file_cache=file_cache,
ignore_source_chapters=ignore_source_chapters,
73 changes: 50 additions & 23 deletions src/yt2doc/formatting/formatter.py
@@ -1,7 +1,7 @@
import typing
import logging

from wtpsplit import SaT
from datetime import timedelta

from yt2doc.extraction import interfaces as extraction_interfaces
from yt2doc.formatting import interfaces
@@ -12,24 +12,38 @@
class MarkdownFormatter:
def __init__(
self,
sat: SaT,
paragraphs_segmenter: interfaces.IParagraphsSegmenter,
to_timestamp_paragraphs: bool,
topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None,
) -> None:
self.sat = sat
self.paragraphs_segmenter = paragraphs_segmenter
self.topic_segmenter = topic_segmenter
self.video_title_template = "# {name}"
self.chapter_title_template = "## {name}"
self.to_timestamp_paragraphs = to_timestamp_paragraphs

def _paragraph_text(self, text: str) -> str:
if len(text) < 15:
return text
logger.info("Splitting text into paragraphs with Segment Any Text.")
paragraphed_sentences: typing.List[typing.List[str]] = self.sat.split(
text, do_paragraph_segmentation=True, verbose=True
)
paragraphs = ["".join(sentences) for sentences in paragraphed_sentences]
paragraphed_text = "\n\n".join(paragraphs)
return paragraphed_text
@staticmethod
def _paragraphs_to_text(
paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]],
video_id: str,
webpage_url_domain: str,
to_timestamp_paragraphs: bool,
) -> str:
paragraph_texts = []
for paragraph in paragraphs:
first_sentence = paragraph[0]
paragraph_text = "".join(sentence.text for sentence in paragraph)
paragraph_text = paragraph_text.strip()
if to_timestamp_paragraphs:
paragraph_start_second = round(first_sentence.start_second)
paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
if webpage_url_domain == "youtube.com":
timestamp_prefix = f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
else:
timestamp_prefix = f"({paragraph_start_h_m_s})"
paragraph_text = f"{timestamp_prefix} {paragraph_text}"
paragraph_texts.append(paragraph_text)
return "\n\n".join(paragraph_texts)

def format_chaptered_transcript(
self, chaptered_transcript: extraction_interfaces.ChapteredTranscript
@@ -42,24 +56,37 @@ def format_chaptered_transcript(
and len(chaptered_transcript.chapters) == 1
):
transcript_segments = chaptered_transcript.chapters[0].segments
full_text = "".join([segment.text for segment in transcript_segments])
logger.info(
"Splitting text into paragraphs with Segment Any Text for topic segmentation."
paragraphed_sentences = self.paragraphs_segmenter.segment(
transcription_segments=transcript_segments
)
paragraphed_sentences: typing.List[typing.List[str]] = self.sat.split(
full_text, do_paragraph_segmentation=True, verbose=True
chapters = self.topic_segmenter.segment(
sentences_in_paragraphs=paragraphed_sentences
)
chapters = self.topic_segmenter.segment(paragraphs=paragraphed_sentences)
chapter_and_text_list = [
(chapter.title, chapter.text) for chapter in chapters
(
chapter.title,
self._paragraphs_to_text(
paragraphs=chapter.paragraphs,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
),
)
for chapter in chapters
]

else:
for chapter in chaptered_transcript.chapters:
chapter_text = self._paragraph_text(
"".join(s.text for s in chapter.segments)
paragraphed_sentences = self.paragraphs_segmenter.segment(
transcription_segments=chapter.segments
)
chapter_full_text = self._paragraphs_to_text(
paragraphs=paragraphed_sentences,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
)
chapter_and_text_list.append((chapter.title, chapter_text.strip()))
chapter_and_text_list.append((chapter.title, chapter_full_text.strip()))

transcript_text = "\n\n".join(
[
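The timestamp prefix built in _paragraphs_to_text can be illustrated in isolation. This is a simplified restatement of the logic above, not the method itself:

```python
# Simplified illustration of the timestamp prefix logic in _paragraphs_to_text.
from datetime import timedelta


def timestamp_prefix(start_second: float, video_id: str, webpage_url_domain: str) -> str:
    paragraph_start_second = round(start_second)
    paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
    if webpage_url_domain == "youtube.com":
        # YouTube paragraphs get a clickable deep link into the video.
        return f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
    return f"({paragraph_start_h_m_s})"


print(timestamp_prefix(754.3, "VIDEO_ID", "youtube.com"))
# [(0:12:34)](https://youtu.be/VIDEO_ID?t=754)
```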
18 changes: 15 additions & 3 deletions src/yt2doc/formatting/interfaces.py
@@ -3,11 +3,17 @@
from pydantic import BaseModel

from yt2doc.extraction import interfaces as extraction_interfaces
from yt2doc.transcription import interfaces as transcription_interfaces


class Sentence(BaseModel):
start_second: float
text: str


class Chapter(BaseModel):
title: str
text: str
paragraphs: typing.Sequence[typing.Sequence[Sentence]]


class FormattedTranscript(BaseModel):
@@ -20,10 +26,16 @@ class FormattedPlaylist(BaseModel):
transcripts: typing.Sequence[FormattedTranscript]


class IParagraphsSegmenter(typing.Protocol):
def segment(
self, transcription_segments: typing.Sequence[transcription_interfaces.Segment]
) -> typing.List[typing.List[Sentence]]: ...


class ILLMAdapter(typing.Protocol):
def get_topic_changing_paragraph_indexes(
self, paragraphs: typing.List[typing.List[str]]
) -> typing.List[int]: ...
) -> typing.Sequence[int]: ...

def generate_title_for_paragraphs(
self, paragraphs: typing.List[typing.List[str]]
@@ -32,7 +44,7 @@ def generate_title_for_paragraphs(

class ITopicSegmenter(typing.Protocol):
def segment(
self, paragraphs: typing.List[typing.List[str]]
self, sentences_in_paragraphs: typing.List[typing.List[Sentence]]
) -> typing.Sequence[Chapter]: ...


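The ParagraphsSegmenter that the factory now constructs is part of this commit but not shown in this view. As a rough idea of what an IParagraphsSegmenter-conforming class has to do — split the transcript into paragraphs of sentences and carry each sentence's start time along — here is a heavily simplified sketch. It assumes the transcription Segment exposes .text and .start_second, and the real implementation almost certainly aligns SaT output to segment timings more carefully.

```python
# Simplified stand-in for yt2doc's ParagraphsSegmenter (not the real class).
import typing

from wtpsplit import SaT

from yt2doc.formatting.interfaces import Sentence
from yt2doc.transcription import interfaces as transcription_interfaces


class NaiveParagraphsSegmenter:
    def __init__(self, sat: SaT) -> None:
        self.sat = sat

    def segment(
        self, transcription_segments: typing.Sequence[transcription_interfaces.Segment]
    ) -> typing.List[typing.List[Sentence]]:
        full_text = "".join(segment.text for segment in transcription_segments)
        # SaT splits the full text into paragraphs of sentences; this call mirrors
        # the pre-existing formatter code replaced in this commit.
        paragraphed: typing.List[typing.List[str]] = self.sat.split(
            full_text, do_paragraph_segmentation=True
        )

        # Record the character offset at which each transcription segment starts,
        # so each sentence can be mapped back to a start time.
        boundaries: typing.List[typing.Tuple[int, float]] = []
        offset = 0
        for segment in transcription_segments:
            boundaries.append((offset, segment.start_second))  # start_second is assumed
            offset += len(segment.text)

        paragraphs: typing.List[typing.List[Sentence]] = []
        cursor = 0
        for paragraph in paragraphed:
            sentences: typing.List[Sentence] = []
            for sentence_text in paragraph:
                # Take the start time of the last segment beginning at or before
                # this sentence's first character.
                start_second = 0.0
                for boundary_offset, segment_start in boundaries:
                    if boundary_offset <= cursor:
                        start_second = segment_start
                    else:
                        break
                sentences.append(Sentence(start_second=start_second, text=sentence_text))
                cursor += len(sentence_text)
            paragraphs.append(sentences)
        return paragraphs
```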
2 changes: 1 addition & 1 deletion src/yt2doc/formatting/llm_adapter.py
@@ -11,7 +11,7 @@ def __init__(self, llm_client: Instructor, llm_model: str) -> None:

def get_topic_changing_paragraph_indexes(
self, paragraphs: typing.List[typing.List[str]]
) -> typing.List[int]:
) -> typing.Sequence[int]:
def validate_paragraph_indexes(v: typing.List[int]) -> typing.List[int]:
n = len(paragraphs)
unique_values = set(v)
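The only change in this file is widening the declared return type from List[int] to Sequence[int]. As a general typing note rather than anything specific to this commit: Sequence is read-only from the caller's point of view, so implementations remain free to return a list or a tuple without callers mutating the result through the protocol type.

```python
# General typing illustration for the List -> Sequence return type change.
import typing


def get_topic_changing_paragraph_indexes_stub() -> typing.Sequence[int]:
    # A tuple (immutable) also satisfies Sequence[int]; a list would too.
    return (2, 5, 9)


indexes: typing.Sequence[int] = get_topic_changing_paragraph_indexes_stub()
print(list(indexes))  # [2, 5, 9]
```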