Skip to content

Commit

Permalink
Merge pull request #46 from bscholer/master
Browse files Browse the repository at this point in the history
Enhancements to Whisper ASR Integration
  • Loading branch information
djmango authored Feb 11, 2024
2 parents 06fbe83 + ae1fbf0 commit 6cf075b
Show file tree
Hide file tree
Showing 4 changed files with 322 additions and 40 deletions.
148 changes: 116 additions & 32 deletions src/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ interface TranscriptionSettings {
swiftink_access_token: string | null;
swiftink_refresh_token: string | null;
lineSpacing: string;
encode: boolean;
initialPrompt: string;
vadFilter: boolean;
wordTimestamps: boolean;
}

const SWIFTINK_AUTH_CALLBACK =
Expand All @@ -43,6 +47,10 @@ const DEFAULT_SETTINGS: TranscriptionSettings = {
swiftink_access_token: null,
swiftink_refresh_token: null,
lineSpacing: "multi",
encode: true,
initialPrompt: "",
vadFilter: false, // this doesn't seem to do anything in the current version of the Whisper ASR server
wordTimestamps: false,
};

const LANGUAGES = {
Expand Down Expand Up @@ -192,6 +200,15 @@ class TranscriptionSettingTab extends PluginSettingTab {
element.style.display = "block";
});
}
// Hide the settings that depend on timestamps based on whether timestamps are enabled.
// Just here to keep the UI up to date, TODO move this, and the rest to a separate function
containerEl
.findAll(".depends-on-timestamps")
.forEach((element) => {
element.style.display = this.plugin.settings.timestamps
? "block"
: "none";
});
}),
);

Expand Down Expand Up @@ -248,6 +265,41 @@ class TranscriptionSettingTab extends PluginSettingTab {
await this.plugin.saveSettings();
});
});

new Setting(containerEl)
.setName("Enable timestamps")
.setDesc("Add timestamps to the beginning of each line")
.addToggle((toggle) =>
toggle
.setValue(this.plugin.settings.timestamps)
.onChange(async (value) => {
this.plugin.settings.timestamps = value;
await this.plugin.saveSettings();
containerEl
.findAll(".depends-on-timestamps")
.forEach((element) => {
element.style.display = value ? "block" : "none";
});
}),
);

new Setting(containerEl)
.setName("Timestamp format")
.setDesc(
"Your choice of hours, minutes, and/or seconds in the timestamp",
)
.setClass("depends-on-timestamps")
.addDropdown((dropdown) =>
dropdown
.addOption("HH:mm:ss", "HH:mm:ss")
.addOption("mm:ss", "mm:ss")
.addOption("ss", "ss")
.setValue(this.plugin.settings.timestampFormat)
.onChange(async (value) => {
this.plugin.settings.timestampFormat = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl)
.setName("Swiftink Settings")
Expand Down Expand Up @@ -318,37 +370,6 @@ class TranscriptionSettingTab extends PluginSettingTab {
});
});

new Setting(containerEl)
.setName("Enable timestamps")
.setDesc("Add timestamps to the beginning of each line")
.setClass("swiftink-settings")
.addToggle((toggle) =>
toggle
.setValue(this.plugin.settings.timestamps)
.onChange(async (value) => {
this.plugin.settings.timestamps = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl)
.setName("Timestamp format")
.setDesc(
"Your choice of hours, minutes, and/or seconds in the timestamp",
)
.setClass("swiftink-settings")
.addDropdown((dropdown) =>
dropdown
.addOption("HH:mm:ss", "HH:mm:ss")
.addOption("mm:ss", "mm:ss")
.addOption("ss", "ss")
.setValue(this.plugin.settings.timestampFormat)
.onChange(async (value) => {
this.plugin.settings.timestampFormat = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl)
.setName("Embed summary")
.setDesc("Embed the generated transcription summary in the note")
Expand All @@ -367,7 +388,7 @@ class TranscriptionSettingTab extends PluginSettingTab {

new Setting(containerEl)
.setName("Embed outline")
.setDesc("Embed the generated trancription outline in the note")
.setDesc("Embed the generated transcription outline in the note")
.setTooltip(
"This will only work if you have a Swiftink Pro account",
)
Expand Down Expand Up @@ -437,6 +458,60 @@ class TranscriptionSettingTab extends PluginSettingTab {
}),
);

new Setting(containerEl)
.setName("Encode")
.setDesc("Encode audio first through ffmpeg")
.setClass("whisper-asr-settings")
.addToggle((toggle) =>
toggle
.setValue(this.plugin.settings.encode)
.onChange(async (value) => {
this.plugin.settings.encode = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl)
.setName("Initial prompt")
.setDesc("Model follows the style of the prompt, rather than any instructions contained within. 224 tokens max. More info at https://cookbook.openai.com/examples/whisper_prompting_guide")
.setClass("whisper-asr-settings")
.addTextArea((text) =>
text
.setPlaceholder(DEFAULT_SETTINGS.initialPrompt)
.setValue(this.plugin.settings.initialPrompt)
.onChange(async (value) => {
this.plugin.settings.initialPrompt = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl)
.setName("Word timestamps")
.setDesc("Include timestamps for each word, can get very verbose! Only works if timestamps are enabled.")
.setClass("whisper-asr-settings")
.setClass("depends-on-timestamps")
.addToggle((toggle) =>
toggle
.setValue(this.plugin.settings.wordTimestamps)
.onChange(async (value) => {
this.plugin.settings.wordTimestamps = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl)
.setName("VAD filter")
.setDesc("Filter out silence from the audio")
.setClass("whisper-asr-settings")
.addToggle((toggle) =>
toggle
.setValue(this.plugin.settings.vadFilter)
.onChange(async (value) => {
this.plugin.settings.vadFilter = value;
await this.plugin.saveSettings();
}),
);

new Setting(containerEl).setName("Advanced Settings").setHeading();

new Setting(containerEl)
Expand Down Expand Up @@ -505,6 +580,15 @@ class TranscriptionSettingTab extends PluginSettingTab {
});
}

// Initially hide the settings that depend on timestamps based on whether timestamps are enabled
if (!this.plugin.settings.timestamps) {
containerEl
.findAll(".depends-on-timestamps")
.forEach((element) => {
element.style.display = "none";
});
}

// Initially hide the settings for user auth/unauth based on whether the user is signed in
if (this.plugin.user == null) {
containerEl
Expand Down
56 changes: 48 additions & 8 deletions src/transcribe.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { TranscriptionSettings, /*SWIFTINK_AUTH_CALLBACK*/ API_BASE } from "src/settings";
import { TranscriptionSettings, /*SWIFTINK_AUTH_CALLBACK*/ API_BASE, DEFAULT_SETTINGS } from "src/settings";
import { Notice, requestUrl, RequestUrlParam, TFile, Vault, App } from "obsidian";
import { format } from "date-fns";
import { paths, components } from "./types/swiftink";
import { payloadGenerator, PayloadData } from "src/utils";
import { PayloadData, payloadGenerator, preprocessWhisperASRResponse } from "src/utils";
import { StatusBar } from "./status";
import { SupabaseClient } from "@supabase/supabase-js";
import * as tus from "tus-js-client";
import { WhisperASRSegment } from "./types/whisper-asr";

type TranscriptionBackend = (file: TFile) => Promise<string>;

Check warning on line 11 in src/transcribe.ts

View workflow job for this annotation

GitHub Actions / build

'file' is defined but never used

Expand Down Expand Up @@ -90,9 +91,14 @@ export class TranscriptionEngine {
const [request_body, boundary_string] =
await payloadGenerator(payload_data);

let args = "task=transcribe";
if (this.settings.language != "auto")
args += `&language=${this.settings.language}`;
// Build the request arguments as `key=value` pairs joined with `&`.
// JSON output is always requested so timestamps are available if we need them.
let args = "output=json";
const { translate, encode, vadFilter, timestamps, wordTimestamps, language, initialPrompt } = this.settings;
if (translate) args += `&task=translate`;
// These options are only sent when they differ from the plugin defaults.
if (encode !== DEFAULT_SETTINGS.encode) args += `&encode=${encode}`;
if (vadFilter !== DEFAULT_SETTINGS.vadFilter) args += `&vad_filter=${vadFilter}`;
// Word timestamps are only requested when timestamps are enabled at all.
if (timestamps && wordTimestamps !== DEFAULT_SETTINGS.wordTimestamps) args += `&word_timestamps=${wordTimestamps}`;
// User-controlled strings are percent-encoded: a prompt containing spaces, `&`, `=`, `#`
// or non-ASCII characters would otherwise corrupt the arguments or inject extra parameters.
if (language !== DEFAULT_SETTINGS.language) args += `&language=${encodeURIComponent(language)}`;
if (initialPrompt) args += `&initial_prompt=${encodeURIComponent(initialPrompt)}`;

const urls = this.settings.whisperASRUrls
.split(";")
Expand All @@ -113,9 +119,43 @@ export class TranscriptionEngine {

try {
const response = await requestUrl(options);
if (this.settings.debug) console.log(response);
if (typeof response.text === "string") return response.text;
else return response.json.text;
if (this.settings.debug) console.log("Raw response:", response);

const preprocessed = preprocessWhisperASRResponse(response.json);
if (this.settings.debug) console.log("Preprocessed response:", preprocessed);

// Fallback to the full text if no segments are there. This has to be checked first:
// the timestamp branches below dereference `segments`, and a TypeError thrown here
// would be swallowed by the surrounding catch and make a usable response look like
// a failed URL.
if (!preprocessed.segments) {
    return preprocessed.text;
}
const segments: WhisperASRSegment[] = preprocessed.segments;

if (
    this.settings.wordTimestamps
    && segments.some((segment: WhisperASRSegment) => segment.wordTimestamps)
) {
    // Create one output segment per word when word timestamps are enabled and
    // at least one segment in the response carries them.
    const wordSegments = segments
        .reduce((acc: components["schemas"]["TimestampedTextSegment"][], segment: WhisperASRSegment) => {
            if (segment.wordTimestamps) {
                acc.push(...segment.wordTimestamps.map(wordTimestamp => ({
                    start: wordTimestamp.start,
                    end: wordTimestamp.end,
                    text: wordTimestamp.word
                } as components["schemas"]["TimestampedTextSegment"])));
            }
            return acc;
        }, []);
    return this.segmentsToTimestampedString(wordSegments, this.settings.timestampFormat);
} else if (this.settings.timestamps) {
    // Use the existing segment-to-string functionality if only segment timestamps are needed.
    const timestampedSegments = segments.map((segment: WhisperASRSegment) => ({
        start: segment.start,
        end: segment.end,
        text: segment.text
    }));
    return this.segmentsToTimestampedString(timestampedSegments, this.settings.timestampFormat);
} else {
    // No timestamps required: emit the text of every segment, one segment per line.
    return segments.map((segment: WhisperASRSegment) => segment.text).join("\n");
}
} catch (error) {
if (this.settings.debug) console.error("Error with URL:", url, error);
// Don't return or throw yet, try the next URL
Expand Down
Loading

0 comments on commit 6cf075b

Please sign in to comment.