Allow timestamping paragraphs (#59)
* WIP

* Add cline code

* Fix

* Add unit tests

* More complicated unit test

* Fix

* Fix

* Decimal point in test case

* Simplify requirement to only track starting time of a sentence

* Use ParagraphsSegmenter in MarkdownFormatter

* Add paragraph counting in integ test

* Fix

* Check ytdlp response with Pydantic

* WIP

* Add paragraph timestamping cli

* Fix tests

* Add integration test case
shun-liang authored Nov 7, 2024
1 parent f1e5a90 commit 413e8da
Showing 17 changed files with 491 additions and 119 deletions.
6 changes: 6 additions & 0 deletions src/yt2doc/cli.py
@@ -74,6 +74,11 @@ def main(
"--llm-api-key",
help="API key for the LLM server; No need to set if using local Ollama server",
),
to_timestamp_paragraphs: bool = typer.Option(
False,
"--timestamp-paragraphs",
help="Prepend timestamp to paragraphs",
),
skip_cache: typing.Annotated[
bool,
typer.Option("--skip-cache", help="If should skip reading from cache"),
@@ -145,6 +150,7 @@ def main(
sat_model=sat_model,
segment_unchaptered=segment_unchaptered,
ignore_source_chapters=ignore_source_chapters,
to_timestamp_paragraphs=to_timestamp_paragraphs,
llm_model=llm_model,
llm_server=llm_server,
llm_api_key=llm_api_key,
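For reference, the new flag can be reproduced in a minimal standalone Typer app. The sketch below mirrors only the option added above; the real yt2doc CLI defines many more options, so treat it as an illustration rather than the actual command definition.

```python
# Minimal sketch mirroring the new --timestamp-paragraphs option added above.
# The real yt2doc CLI has many more options; only this flag is shown here.
import typer

app = typer.Typer()


@app.command()
def main(
    to_timestamp_paragraphs: bool = typer.Option(
        False,
        "--timestamp-paragraphs",
        help="Prepend timestamp to paragraphs",
    ),
) -> None:
    # In yt2doc this value is passed through get_yt2doc() to the MarkdownFormatter.
    typer.echo(f"timestamp paragraphs: {to_timestamp_paragraphs}")


if __name__ == "__main__":
    app()
```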
20 changes: 11 additions & 9 deletions src/yt2doc/extraction/extractor.py
@@ -11,12 +11,12 @@
class Extractor:
def __init__(
self,
video_info_extractor: youtube_interfaces.IYtVideoInfoExtractor,
media_info_extractor: youtube_interfaces.IYtMediaInfoExtractor,
transcriber: transcription_interfaces.ITranscriber,
file_cache: interfaces.IFileCache,
ignore_source_chapters: bool,
) -> None:
self.yt_dlp_adapter = video_info_extractor
self.yt_dlp_adapter = media_info_extractor
self.transcriber = transcriber
self.file_cache = file_cache
self.ignore_source_chapters = ignore_source_chapters
@@ -28,16 +28,16 @@ def extract_by_chapter(
) -> interfaces.ChapteredTranscript:
logger.info(f"Extracting video {video_url} by chapter.")

video_info = self.yt_dlp_adapter.extract_video_info(video_url=video_url)
media_info = self.yt_dlp_adapter.extract_media_info(video_url=video_url)

if self.ignore_source_chapters:
video_info.chapters = []
media_info.chapters = []

if (
not skip_cache
and (
cached_chaptered_transcript := self.file_cache.get_chaptered_transcript(
video_id=video_info.video_id
video_id=media_info.video_id
)
)
is not None
@@ -52,7 +52,7 @@ def extract_by_chapter(
with Timer() as transcribe_timer:
transcript = self.transcriber.transcribe(
audio_path=audio_path,
video_info=video_info,
media_info=media_info,
)
transcripts_by_chapter = [
interfaces.TranscriptChapter(
@@ -65,14 +65,16 @@

chaptered_transcript = interfaces.ChapteredTranscript(
url=video_url,
title=video_info.title,
video_id=media_info.video_id,
title=media_info.title,
webpage_url_domain=media_info.webpage_url_domain,
chapters=transcripts_by_chapter,
chaptered_at_source=len(video_info.chapters) > 0,
chaptered_at_source=len(media_info.chapters) > 0,
language=transcript.language,
)

self.file_cache.cache_chaptered_transcript(
video_id=video_info.video_id,
video_id=media_info.video_id,
transcript=chaptered_transcript,
)

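The media_info object used above comes from the yt-dlp response that this commit validates with Pydantic (see the commit message). Its exact definition is not part of this view; the sketch below is inferred only from the attributes the extractor touches, and the class and field names are assumptions.

```python
# Hedged sketch of the Pydantic-validated yt-dlp media info. Only video_id, title,
# webpage_url_domain and chapters are referenced in extractor.py above; everything
# else here, including the class names, is hypothetical.
import typing

from pydantic import BaseModel, Field


class MediaChapter(BaseModel):
    # Hypothetical name; fields follow yt-dlp's chapter entries but are not
    # confirmed by this diff.
    title: str
    start_time: float
    end_time: float


class MediaInfo(BaseModel):
    # Hypothetical name; only these attributes appear in the extractor code above.
    video_id: str
    title: str
    webpage_url_domain: str
    chapters: typing.List[MediaChapter] = Field(default_factory=list)
```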
2 changes: 2 additions & 0 deletions src/yt2doc/extraction/interfaces.py
@@ -13,6 +13,8 @@ class TranscriptChapter(BaseModel):
class ChapteredTranscript(BaseModel):
url: str
title: str
video_id: str
webpage_url_domain: str
language: str
chapters: typing.Sequence[TranscriptChapter]
chaptered_at_source: bool
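A minimal construction of the extended model, to show the two fields added here. Values are placeholders, and chapters is left empty because TranscriptChapter's fields are not fully visible in this view.

```python
# Illustrative only: the two new required fields on ChapteredTranscript.
from yt2doc.extraction.interfaces import ChapteredTranscript

transcript = ChapteredTranscript(
    url="https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
    title="Some talk",
    video_id="VIDEO_ID",               # new required field in this commit
    webpage_url_domain="youtube.com",  # new required field in this commit
    language="en",
    chapters=[],  # TranscriptChapter fields are not fully shown, so left empty
    chaptered_at_source=False,
)
```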
18 changes: 14 additions & 4 deletions src/yt2doc/factories.py
@@ -15,6 +15,7 @@
from yt2doc.formatting.formatter import MarkdownFormatter
from yt2doc.formatting.llm_topic_segmenter import LLMTopicSegmenter
from yt2doc.formatting.llm_adapter import LLMAdapter
from yt2doc.formatting.paragraphs_segmenter import ParagraphsSegmenter
from yt2doc.yt2doc import Yt2Doc


@@ -31,6 +32,7 @@ def get_yt2doc(
sat_model: str,
segment_unchaptered: bool,
ignore_source_chapters: bool,
to_timestamp_paragraphs: bool,
llm_model: typing.Optional[str],
llm_server: str,
llm_api_key: str,
@@ -43,6 +45,7 @@
)

sat = SaT(sat_model)
paragraphs_segmenter = ParagraphsSegmenter(sat=sat)
if segment_unchaptered is True:
if llm_model is None:
raise LLMModelNotSpecified(
@@ -57,17 +60,24 @@
)
llm_adapter = LLMAdapter(llm_client=llm_client, llm_model=llm_model)
llm_topic_segmenter = LLMTopicSegmenter(llm_adapter=llm_adapter)
formatter = MarkdownFormatter(sat=sat, topic_segmenter=llm_topic_segmenter)
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
topic_segmenter=llm_topic_segmenter,
)
else:
formatter = MarkdownFormatter(sat=sat)
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
)

video_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
transcriber = Transcriber(
temp_dir=temp_dir,
whisper_adapter=whisper_adapter,
)
extractor = Extractor(
video_info_extractor=video_info_extractor,
media_info_extractor=media_info_extractor,
transcriber=transcriber,
file_cache=file_cache,
ignore_source_chapters=ignore_source_chapters,
73 changes: 50 additions & 23 deletions src/yt2doc/formatting/formatter.py
@@ -1,7 +1,7 @@
import typing
import logging

from wtpsplit import SaT
from datetime import timedelta

from yt2doc.extraction import interfaces as extraction_interfaces
from yt2doc.formatting import interfaces
@@ -12,24 +12,38 @@
class MarkdownFormatter:
def __init__(
self,
sat: SaT,
paragraphs_segmenter: interfaces.IParagraphsSegmenter,
to_timestamp_paragraphs: bool,
topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None,
) -> None:
self.sat = sat
self.paragraphs_segmenter = paragraphs_segmenter
self.topic_segmenter = topic_segmenter
self.video_title_template = "# {name}"
self.chapter_title_template = "## {name}"
self.to_timestamp_paragraphs = to_timestamp_paragraphs

def _paragraph_text(self, text: str) -> str:
if len(text) < 15:
return text
logger.info("Splitting text into paragraphs with Segment Any Text.")
paragraphed_sentences: typing.List[typing.List[str]] = self.sat.split(
text, do_paragraph_segmentation=True, verbose=True
)
paragraphs = ["".join(sentences) for sentences in paragraphed_sentences]
paragraphed_text = "\n\n".join(paragraphs)
return paragraphed_text
@staticmethod
def _paragraphs_to_text(
paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]],
video_id: str,
webpage_url_domain: str,
to_timestamp_paragraphs: bool,
) -> str:
paragraph_texts = []
for paragraph in paragraphs:
first_sentence = paragraph[0]
paragraph_text = "".join(sentence.text for sentence in paragraph)
paragraph_text = paragraph_text.strip()
if to_timestamp_paragraphs:
paragraph_start_second = round(first_sentence.start_second)
paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
if webpage_url_domain == "youtube.com":
timestamp_prefix = f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
else:
timestamp_prefix = f"({paragraph_start_h_m_s})"
paragraph_text = f"{timestamp_prefix} {paragraph_text}"
paragraph_texts.append(paragraph_text)
return "\n\n".join(paragraph_texts)

def format_chaptered_transcript(
self, chaptered_transcript: extraction_interfaces.ChapteredTranscript
@@ -42,24 +56,37 @@ def format_chaptered_transcript(
and len(chaptered_transcript.chapters) == 1
):
transcript_segments = chaptered_transcript.chapters[0].segments
full_text = "".join([segment.text for segment in transcript_segments])
logger.info(
"Splitting text into paragraphs with Segment Any Text for topic segmentation."
paragraphed_sentences = self.paragraphs_segmenter.segment(
transcription_segments=transcript_segments
)
paragraphed_sentences: typing.List[typing.List[str]] = self.sat.split(
full_text, do_paragraph_segmentation=True, verbose=True
chapters = self.topic_segmenter.segment(
sentences_in_paragraphs=paragraphed_sentences
)
chapters = self.topic_segmenter.segment(paragraphs=paragraphed_sentences)
chapter_and_text_list = [
(chapter.title, chapter.text) for chapter in chapters
(
chapter.title,
self._paragraphs_to_text(
paragraphs=chapter.paragraphs,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
),
)
for chapter in chapters
]

else:
for chapter in chaptered_transcript.chapters:
chapter_text = self._paragraph_text(
"".join(s.text for s in chapter.segments)
paragraphed_sentences = self.paragraphs_segmenter.segment(
transcription_segments=chapter.segments
)
chapter_full_text = self._paragraphs_to_text(
paragraphs=paragraphed_sentences,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
)
chapter_and_text_list.append((chapter.title, chapter_text.strip()))
chapter_and_text_list.append((chapter.title, chapter_full_text.strip()))

transcript_text = "\n\n".join(
[
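The timestamp prefix built in _paragraphs_to_text can be illustrated in isolation. This is a simplified restatement of the logic above, not the method itself:

```python
# Simplified illustration of the timestamp prefix logic in _paragraphs_to_text.
from datetime import timedelta


def timestamp_prefix(start_second: float, video_id: str, webpage_url_domain: str) -> str:
    paragraph_start_second = round(start_second)
    paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
    if webpage_url_domain == "youtube.com":
        # YouTube paragraphs get a clickable deep link into the video.
        return f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
    return f"({paragraph_start_h_m_s})"


print(timestamp_prefix(754.3, "VIDEO_ID", "youtube.com"))
# [(0:12:34)](https://youtu.be/VIDEO_ID?t=754)
```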
18 changes: 15 additions & 3 deletions src/yt2doc/formatting/interfaces.py
@@ -3,11 +3,17 @@
from pydantic import BaseModel

from yt2doc.extraction import interfaces as extraction_interfaces
from yt2doc.transcription import interfaces as transcription_interfaces


class Sentence(BaseModel):
start_second: float
text: str


class Chapter(BaseModel):
title: str
text: str
paragraphs: typing.Sequence[typing.Sequence[Sentence]]


class FormattedTranscript(BaseModel):
@@ -20,10 +26,16 @@ class FormattedPlaylist(BaseModel):
transcripts: typing.Sequence[FormattedTranscript]


class IParagraphsSegmenter(typing.Protocol):
def segment(
self, transcription_segments: typing.Sequence[transcription_interfaces.Segment]
) -> typing.List[typing.List[Sentence]]: ...


class ILLMAdapter(typing.Protocol):
def get_topic_changing_paragraph_indexes(
self, paragraphs: typing.List[typing.List[str]]
) -> typing.List[int]: ...
) -> typing.Sequence[int]: ...

def generate_title_for_paragraphs(
self, paragraphs: typing.List[typing.List[str]]
@@ -32,7 +44,7 @@ def generate_title_for_paragraphs(

class ITopicSegmenter(typing.Protocol):
def segment(
self, paragraphs: typing.List[typing.List[str]]
self, sentences_in_paragraphs: typing.List[typing.List[Sentence]]
) -> typing.Sequence[Chapter]: ...


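The ParagraphsSegmenter that the factory now constructs is part of this commit but not shown in this view. As a rough idea of what an IParagraphsSegmenter-conforming class has to do — split the transcript into paragraphs of sentences and carry each sentence's start time along — here is a heavily simplified sketch. It assumes the transcription Segment exposes .text and .start_second, and the real implementation almost certainly aligns SaT output to segment timings more carefully.

```python
# Simplified stand-in for yt2doc's ParagraphsSegmenter (not the real class).
import typing

from wtpsplit import SaT

from yt2doc.formatting.interfaces import Sentence
from yt2doc.transcription import interfaces as transcription_interfaces


class NaiveParagraphsSegmenter:
    def __init__(self, sat: SaT) -> None:
        self.sat = sat

    def segment(
        self, transcription_segments: typing.Sequence[transcription_interfaces.Segment]
    ) -> typing.List[typing.List[Sentence]]:
        full_text = "".join(segment.text for segment in transcription_segments)
        # SaT splits the full text into paragraphs of sentences; this call mirrors
        # the pre-existing formatter code replaced in this commit.
        paragraphed: typing.List[typing.List[str]] = self.sat.split(
            full_text, do_paragraph_segmentation=True
        )

        # Record the character offset at which each transcription segment starts,
        # so each sentence can be mapped back to a start time.
        boundaries: typing.List[typing.Tuple[int, float]] = []
        offset = 0
        for segment in transcription_segments:
            boundaries.append((offset, segment.start_second))  # start_second is assumed
            offset += len(segment.text)

        paragraphs: typing.List[typing.List[Sentence]] = []
        cursor = 0
        for paragraph in paragraphed:
            sentences: typing.List[Sentence] = []
            for sentence_text in paragraph:
                # Take the start time of the last segment beginning at or before
                # this sentence's first character.
                start_second = 0.0
                for boundary_offset, segment_start in boundaries:
                    if boundary_offset <= cursor:
                        start_second = segment_start
                    else:
                        break
                sentences.append(Sentence(start_second=start_second, text=sentence_text))
                cursor += len(sentence_text)
            paragraphs.append(sentences)
        return paragraphs
```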
2 changes: 1 addition & 1 deletion src/yt2doc/formatting/llm_adapter.py
@@ -11,7 +11,7 @@ def __init__(self, llm_client: Instructor, llm_model: str) -> None:

def get_topic_changing_paragraph_indexes(
self, paragraphs: typing.List[typing.List[str]]
) -> typing.List[int]:
) -> typing.Sequence[int]:
def validate_paragraph_indexes(v: typing.List[int]) -> typing.List[int]:
n = len(paragraphs)
unique_values = set(v)
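The only change in this file is widening the declared return type from List[int] to Sequence[int]. As a general typing note rather than anything specific to this commit: Sequence is read-only from the caller's point of view, so implementations remain free to return a list or a tuple without callers mutating the result through the protocol type.

```python
# General typing illustration for the List -> Sequence return type change.
import typing


def get_topic_changing_paragraph_indexes_stub() -> typing.Sequence[int]:
    # A tuple (immutable) also satisfies Sequence[int]; a list would too.
    return (2, 5, 9)


indexes: typing.Sequence[int] = get_topic_changing_paragraph_indexes_stub()
print(list(indexes))  # [2, 5, 9]
```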