-
Notifications
You must be signed in to change notification settings - Fork 41
/
Copy pathtranscribe.ts
140 lines (117 loc) · 3.63 KB
/
transcribe.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import path from "path";
import fs from "fs";
import os from "os";
import {
downloadWhisperModel,
installWhisperCpp,
transcribe,
toCaptions,
type WhisperModel,
type Language,
} from "@remotion/install-whisper-cpp";
import { execSync } from "child_process";
import * as readline from "readline";
/** Speech start offset, in seconds, used when the user leaves the speech-start prompt empty. */
const SPEECH_START_SECONDS_DEFAULT = 0;
/** Audio file path used when the user leaves the audio-path prompt empty. */
const DEFAULT_AUDIO_PATH = "./public/audio.wav";
/** Settings that control a single transcription run. */
interface TranscriptionOptions {
/** Path of the audio file to transcribe; it is handed to ffmpeg as the input file. */
audioPath: string;
/**
 * Offset, in seconds, at which transcription starts. Audio before this point
 * is cut away before transcription, and the resulting caption timestamps are
 * shifted forward by the same amount.
 */
speechStartsAtSecond: number;
/** Language passed to the transcriber; "English" is used when omitted. */
language?: Language;
}
/**
 * Interactively collects the transcription settings from the user.
 *
 * Two questions are asked: the path of the audio file and the second at which
 * the actual speech begins. An empty or whitespace-only answer selects the
 * default for that question.
 *
 * @param rl - Readline interface used to ask the questions. This function
 *   does not close it.
 * @returns The chosen audio path and speech start offset. `language` is not
 *   asked for and is left unset.
 * @throws Error if the speech start answer is not a finite, non-negative
 *   number of seconds.
 */
async function askQuestions(
  rl: readline.Interface,
): Promise<TranscriptionOptions> {
  // Wrap the callback-based rl.question so the prompts can be awaited in order.
  const question = (query: string): Promise<string> => {
    return new Promise((resolve) => {
      rl.question(query, resolve);
    });
  };

  // Ask for audio file path
  const audioPathAnswer = (
    await question(`❓ Path to audio file (default: ${DEFAULT_AUDIO_PATH}): `)
  ).trim();
  const audioPath =
    audioPathAnswer === "" ? DEFAULT_AUDIO_PATH : audioPathAnswer;

  // Ask for speech start time - this helps avoid false triggers from background music/noise
  const speechStartAnswer = (
    await question(
      `❓ At what second does the actual speech begin? (default: ${SPEECH_START_SECONDS_DEFAULT}): `,
    )
  ).trim();

  if (speechStartAnswer === "") {
    return {
      audioPath,
      speechStartsAtSecond: SPEECH_START_SECONDS_DEFAULT,
    };
  }

  // parseFloat yields NaN for non-numeric input; reject it here instead of
  // letting it flow into the ffmpeg command and the timestamp arithmetic.
  const speechStartsAtSecond = parseFloat(speechStartAnswer);
  if (!Number.isFinite(speechStartsAtSecond) || speechStartsAtSecond < 0) {
    throw new Error(
      `Invalid speech start time "${speechStartAnswer}": expected a non-negative number of seconds.`,
    );
  }

  return {
    audioPath,
    speechStartsAtSecond,
  };
}
/**
 * Transcribes an audio file and writes the captions to `public/captions.json`
 * under the current working directory.
 *
 * The function sets up Whisper.cpp and its model in `./whisper.cpp` (relative
 * to the working directory), cuts the audio at `speechStartsAtSecond` while
 * resampling it to a 16 kHz mono WAV in the OS temp directory, transcribes
 * that file, and finally shifts every caption timestamp forward by the cut
 * offset so the captions line up with the original audio again.
 *
 * @param options - Audio path, speech start offset (seconds) and optional
 *   language (defaults to "English").
 * @returns The captions with timestamps relative to the original audio.
 * @throws Error if `options.speechStartsAtSecond` is not a finite,
 *   non-negative number. Errors from ffmpeg, the transcriber and the file
 *   system propagate unchanged.
 */
async function transcribeAudio(options: TranscriptionOptions) {
  // Fail fast: a bad offset would otherwise only surface inside ffmpeg, after
  // the (slow) Whisper install and model download have already run.
  if (
    !Number.isFinite(options.speechStartsAtSecond) ||
    options.speechStartsAtSecond < 0
  ) {
    throw new Error(
      `speechStartsAtSecond must be a non-negative number, got ${options.speechStartsAtSecond}`,
    );
  }

  // A different Whisper.cpp version is pinned on Windows than elsewhere.
  const WHISPER_VERSION = process.platform === "win32" ? "1.6.0" : "1.7.4";
  const WHISPER_MODEL: WhisperModel = "medium";
  const WHISPER_PATH = path.join(process.cwd(), "whisper.cpp");

  await installWhisperCpp({
    to: WHISPER_PATH,
    version: WHISPER_VERSION,
  });
  await downloadWhisperModel({
    model: WHISPER_MODEL,
    folder: WHISPER_PATH,
  });

  // Temporary file that receives the trimmed, resampled audio for Whisper.
  const tempAudioForWhisper = path.join(
    os.tmpdir(),
    `whisper-${Date.now()}.wav`,
  );

  try {
    // Cut the audio starting from speech start time and convert it to a
    // 16 kHz mono WAV.
    // NOTE(review): audioPath is interpolated into a shell command inside
    // double quotes, so a path that itself contains a double quote will break
    // the command.
    execSync(
      `npx remotion ffmpeg -i "${options.audioPath}" -ss ${options.speechStartsAtSecond} -ar 16000 -ac 1 "${tempAudioForWhisper}" -y`,
    );

    const whisperCppOutput = await transcribe({
      model: WHISPER_MODEL,
      whisperPath: WHISPER_PATH,
      inputPath: tempAudioForWhisper,
      tokenLevelTimestamps: true,
      language: options.language ?? "English",
      whisperCppVersion: WHISPER_VERSION,
    });

    // Optional: Apply our recommended postprocessing
    const { captions } = toCaptions({
      whisperCppOutput,
    });

    const offsetInMillis = options.speechStartsAtSecond * 1000;

    // Shift all timestamps forward by speech start time
    const adjustedCaptions = captions.map((caption) => ({
      ...caption,
      startMs: caption.startMs + offsetInMillis,
      endMs: caption.endMs + offsetInMillis,
      // Compare against null/undefined explicitly: a timestamp of 0 ms is
      // valid and must be shifted, not replaced by null.
      timestampMs:
        caption.timestampMs == null
          ? null
          : caption.timestampMs + offsetInMillis,
    }));

    const captionsPath = path.join(process.cwd(), "./public/captions.json");
    // Make sure the output directory exists so the finished transcription is
    // not lost to a missing "public" folder.
    fs.mkdirSync(path.dirname(captionsPath), { recursive: true });
    fs.writeFileSync(captionsPath, JSON.stringify(adjustedCaptions, null, 2));

    console.info(
      "Transcription complete. Check the captions.json file for the results.",
    );

    return adjustedCaptions;
  } finally {
    // Clean up the temporary file on success and on failure alike. `force`
    // keeps this from throwing when ffmpeg failed before creating the file.
    fs.rmSync(tempAudioForWhisper, { force: true });
  }
}
/**
 * CLI flow: opens a readline prompt on stdin/stdout, gathers the
 * transcription options from the user and runs the transcription with them.
 * The prompt is closed afterwards whether or not the run succeeded.
 */
async function startTranscribe() {
  const prompt = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    const answers = await askQuestions(prompt);
    await transcribeAudio(answers);
  } finally {
    // Release stdin so the process is able to exit.
    prompt.close();
  }
}
// Only run the CLI if this file is run directly
if (require.main === module) {
  // Handle a failed run explicitly instead of leaving a floating promise:
  // report the error and signal failure through the exit code.
  startTranscribe().catch((error: unknown) => {
    console.error(error);
    process.exitCode = 1;
  });
}
export { transcribeAudio, type TranscriptionOptions };