From da551ea9d5a5d0836483caee1bc4726fef85dbf8 Mon Sep 17 00:00:00 2001 From: Tom Sightler Date: Mon, 23 Dec 2024 16:31:17 -0500 Subject: [PATCH] Use re-packetized Opus audio This removes the ffmpeg dependency for outbound audio to Homekit. The incoming Opus audio from Ring is simply repacketized into a format acceptable to Homekit with no additional transcoding. Revert "Use re-packetized Opus audio" This reverts commit 5c59b423e5fabad8c7f2ca16a121fdb949d77c27. Improve Opus audio quality --- packages/homebridge-ring/camera-source.ts | 99 +++------- packages/homebridge-ring/opus-repacketizer.ts | 183 ------------------ 2 files changed, 32 insertions(+), 250 deletions(-) delete mode 100644 packages/homebridge-ring/opus-repacketizer.ts diff --git a/packages/homebridge-ring/camera-source.ts b/packages/homebridge-ring/camera-source.ts index 66a0ec80..81c8b5d4 100644 --- a/packages/homebridge-ring/camera-source.ts +++ b/packages/homebridge-ring/camera-source.ts @@ -2,7 +2,6 @@ import type { RingCamera } from 'ring-client-api' import { hap } from './hap.ts' import type { SrtpOptions } from '@homebridge/camera-utils' import { - doesFfmpegSupportCodec, generateSrtpOptions, ReturnAudioTranscoder, RtpSplitter, @@ -38,7 +37,6 @@ import { SrtcpSession, } from 'werift' import type { StreamingSession } from 'ring-client-api/streaming/streaming-session' -import { OpusRepacketizer } from './opus-repacketizer.ts' import path from 'node:path' const __dirname = new URL('.', import.meta.url).pathname, @@ -70,7 +68,7 @@ class StreamingSessionWrapper { videoSrtp = generateSrtpOptions() audioSplitter = new RtpSplitter() videoSplitter = new RtpSplitter() - repacketizeAudioSplitter = new RtpSplitter() + transcodedAudioSplitter = new RtpSplitter() constructor( public streamingSession: StreamingSession, @@ -142,57 +140,26 @@ class StreamingSessionWrapper { targetAddress, audio: { port: audioPort }, } = this.prepareStreamRequest, - { - audio: { - codec: audioCodec, - sample_rate: 
audioSampleRate, - packet_time: audioPacketTime, - }, - } = startStreamRequest, - // Repacketize the audio stream after it's been transcoded - opusRepacketizer = new OpusRepacketizer(audioPacketTime / 20), - audioIntervalScale = ((audioSampleRate / 8) * audioPacketTime) / 20, + timestampIncrement = + startStreamRequest.audio.sample_rate * + startStreamRequest.audio.packet_time, audioSrtpSession = new SrtpSession(getSessionConfig(this.audioSrtp)) - let firstTimestamp: number, - audioPacketCount = 0 - - this.repacketizeAudioSplitter.addMessageHandler(({ message }) => { - let rtp: RtpPacket | undefined = RtpPacket.deSerialize(message) + let runningTimestamp: number - if (audioCodec === AudioStreamingCodecType.OPUS) { - // borrowed from scrypted - // Original source: https://github.com/koush/scrypted/blob/c13ba09889c3e0d9d3724cb7d49253c9d787fb97/plugins/homekit/src/types/camera/camera-streaming-srtp-sender.ts#L124-L143 - rtp = opusRepacketizer.repacketize(rtp) + this.transcodedAudioSplitter.addMessageHandler(({ message }) => { + const rtp: RtpPacket | undefined = RtpPacket.deSerialize(message) - if (!rtp) { - return null - } - - if (!firstTimestamp) { - firstTimestamp = rtp.header.timestamp - } - - // from HAP spec: - // RTP Payload Format for Opus Speech and Audio Codec RFC 7587 with an exception - // that Opus audio RTP Timestamp shall be based on RFC 3550. - // RFC 3550 indicates that PCM audio based with a sample rate of 8k and a packet - // time of 20ms would have a monotonic interval of 8k / (1000 / 20) = 160. - // So 24k audio would have a monotonic interval of (24k / 8k) * 160 = 480. - // HAP spec also states that it may request packet times of 20, 30, 40, or 60. - // In practice, HAP has been seen to request 20 on LAN and 60 over LTE. - // So the RTP timestamp must scale accordingly. - // Further investigation indicates that HAP doesn't care about the actual sample rate at all, - // that's merely a suggestion. 
When encoding Opus, it can seemingly be an arbitrary sample rate, - // audio will work so long as the rtp timestamps are created properly: which is a construct of the sample rate - // HAP requests, and the packet time is respected, - // opus 48khz will work just fine. - rtp.header.timestamp = - (firstTimestamp + audioPacketCount * 160 * audioIntervalScale) % - 0xffffffff - audioPacketCount++ + // For some reason HAP uses RFC 3550 timestamps instead of following RTP Payload + // Format for Opus Speech and Audio Codec from RFC 7587 like everyone else. + // This calculates and replaces the timestamps before forwarding to Homekit. + if (!runningTimestamp) { + runningTimestamp = rtp.header.timestamp } + rtp.header.timestamp = runningTimestamp % 0xffffffff + runningTimestamp += timestampIncrement + // encrypt the packet const encryptedPacket = audioSrtpSession.encrypt(rtp.payload, rtp.header) @@ -246,36 +213,29 @@ class StreamingSessionWrapper { const transcodingPromise = this.streamingSession.startTranscoding({ input: ['-vn'], audio: [ - '-map', - '0:a', - - // OPUS specific - it works, but audio is very choppy '-acodec', 'libopus', - '-frame_duration', - request.audio.packet_time, '-application', 'lowdelay', - - // Shared options + '-frame_duration', + request.audio.packet_time.toString(), '-flags', '+global_header', - '-ac', - `${request.audio.channel}`, '-ar', `${request.audio.sample_rate}k`, '-b:a', `${request.audio.max_bit_rate}k`, '-bufsize', `${request.audio.max_bit_rate * 4}k`, + '-ac', + `${request.audio.channel}`, '-payload_type', request.audio.pt, '-ssrc', this.audioSsrc, '-f', 'rtp', - `rtp://127.0.0.1:${await this.repacketizeAudioSplitter - .portPromise}?pkt_size=376`, + `rtp://127.0.0.1:${await this.transcodedAudioSplitter.portPromise}`, ], video: false, output: [], @@ -309,16 +269,20 @@ class StreamingSessionWrapper { outputArgs: [ '-acodec', 'libopus', - '-ac', - '1', - '-ar', - '24k', - '-b:a', - '24k', '-application', 'lowdelay', + '-frame_duration', +
request.audio.packet_time.toString(), '-flags', '+global_header', + '-ar', + '48k', + '-b:a', + '48k', + '-bufsize', + '192k', + '-ac', + '2', '-f', 'rtp', `rtp://127.0.0.1:${await returnAudioTranscodedSplitter.portPromise}`, @@ -344,7 +308,7 @@ class StreamingSessionWrapper { stop() { this.audioSplitter.close() - this.repacketizeAudioSplitter.close() + this.transcodedAudioSplitter.close() this.videoSplitter.close() this.streamingSession.stop() } @@ -363,6 +327,7 @@ export class CameraSource implements CameraStreamingDelegate { supportedCryptoSuites: [SRTPCryptoSuites.AES_CM_128_HMAC_SHA1_80], video: { resolutions: [ + [1920, 1024, 30], [1280, 720, 30], [1024, 768, 30], [640, 480, 30], diff --git a/packages/homebridge-ring/opus-repacketizer.ts b/packages/homebridge-ring/opus-repacketizer.ts deleted file mode 100644 index d5cea034..00000000 --- a/packages/homebridge-ring/opus-repacketizer.ts +++ /dev/null @@ -1,183 +0,0 @@ -// OpusRepacketizer is borrowed from scrypted -// Original source: https://github.com/koush/scrypted/blob/3150a3033515a3886af1e6b35a0ba7432b63e02b/plugins/homekit/src/types/camera/opus-repacketizer.ts - -import type { RtpPacket } from 'werift' - -// https://datatracker.ietf.org/doc/html/rfc6716 - -// INPUT (for single frame sample, see RFC for other 4 code values) - -// 0 1 2 3 -// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | config |s|0|0| | -// +-+-+-+-+-+-+-+-+ | -// | Compressed frame 1 (N-1 bytes)... : -// : | -// | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - -// OUTPUT - -// 0 1 2 3 -// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | config |s|1|1|1|p| M | Padding length (Optional) : -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// : N1 (1-2 bytes): N2 (1-2 bytes): ... 
: N[M-1] | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | | -// : Compressed frame 1 (N1 bytes)... : -// | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | | -// : Compressed frame 2 (N2 bytes)... : -// | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | | -// : ... : -// | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | | -// : Compressed frame M... : -// | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// : Opus Padding (Optional)... | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - -// Figure 6: A CBR Code 3 Packet - -// In the VBR case, the (optional) padding length is followed by M-1 -// frame lengths (indicated by "N1" to "N[M-1]" in Figure 7), each -// encoded in a one- or two-byte sequence as described above. The -// packet MUST contain enough data for the M-1 lengths after removing -// the (optional) padding, and the sum of these lengths MUST be no -// larger than the number of bytes remaining in the packet after -// decoding them [R7]. The compressed data for all M frames follows, -// each frame consisting of the indicated number of bytes, with the -// final frame consuming any remaining bytes before the final padding, -// as illustrated in Figure 6. The number of header bytes (TOC byte, -// frame count byte, padding length bytes, and frame length bytes), plus -// the signaled length of the first M-1 frames themselves, plus the -// signaled length of the padding MUST be no larger than N, the total -// size of the packet. - -export class OpusRepacketizer { - depacketized: Buffer[] = [] - - constructor(public framesPerPacket: number) {} - - // repacketize a packet with a single frame into a packet with multiple frames. 
- repacketize(packet: RtpPacket): RtpPacket | undefined { - const code = packet.payload[0] & 0b00000011 - let offset: number - - // see Frame Length Coding in RFC - const decodeFrameLength = () => { - let frameLength = packet.payload.readUInt8(offset) - if (frameLength >= 252) { - offset++ - frameLength += packet.payload.readUInt8(offset) * 4 - } - return frameLength - } - // code 0: cbr, 1 packet - // code 1: cbr, 2 packets - // code 2: vbr, 2 packets - // code 3: cbr/vbr signaled, variable packets - - if (code === 0) { - if (this.framesPerPacket === 1 && !this.depacketized.length) return packet - // depacketize by stripping off the config byte - this.depacketized.push(packet.payload.subarray(1)) - } else if (code === 1) { - if (this.framesPerPacket === 2 && !this.depacketized.length) return packet - // depacketize by dividing the remaining payload into two equal sized frames - const remaining = packet.payload.length - 1 - if (remaining % 2) { - throw new Error('expected equal sized opus packets (code 1)') - } - const frameLength = remaining / 2 - this.depacketized.push(packet.payload.subarray(1, 1 + frameLength)) - this.depacketized.push(packet.payload.subarray(1 + frameLength)) - } else if (code === 2) { - if (this.framesPerPacket === 2 && !this.depacketized.length) return packet - offset = 1 - // depacketize by dividing the remaining payload into two inequal sized frames - const frameLength = decodeFrameLength() - this.depacketized.push( - packet.payload.subarray(offset, offset + frameLength), - ) - this.depacketized.push(packet.payload.subarray(offset + frameLength)) - } else if (code === 3) { - // code 3 packet will have a frame count and padding indicator, and whether the packets - // are equal size or not. 
- const frameCountByte = packet.payload[1], - packetFrameCount = frameCountByte & 0b00111111, - vbr = frameCountByte & 0b10000000 - if ( - this.framesPerPacket === packetFrameCount && - !this.depacketized.length - ) { - return packet - } - const paddingIndicator = frameCountByte & 0b01000000 - offset = 2 - let padding = 0 - if (paddingIndicator) { - padding = packet.payload.readUInt8(offset) - offset++ - if (padding === 255) { - padding = 254 + packet.payload.readUInt8(offset) - offset++ - } - } - - if (!vbr) { - const remaining = packet.payload.length - offset - padding - if (remaining % packetFrameCount) { - throw new Error('expected equal sized opus packets (code 3)') - } - const frameLength = remaining / packetFrameCount - for (let i = 0; i < packetFrameCount; i++) { - const start = offset + i * frameLength, - end = start + frameLength - this.depacketized.push(packet.payload.subarray(start, end)) - } - } else { - const frameLengths: number[] = [] - for (let i = 0; i < packetFrameCount; i++) { - const frameLength = decodeFrameLength() - frameLengths.push(frameLength) - } - for (let i = 0; i < packetFrameCount; i++) { - const frameLength = frameLengths[i], - start = offset - offset += frameLength - this.depacketized.push(packet.payload.subarray(start, offset)) - } - } - } - - if (this.depacketized.length < this.framesPerPacket) return - - const depacketized = this.depacketized.slice(0, this.framesPerPacket) - this.depacketized = this.depacketized.slice(this.framesPerPacket) - - // reuse the config and stereo indicator, but change the code to 3. 
- let toc = packet.payload[0] - toc |= 0b00000011 - // vbr | padding indicator | packet count - const frameCountByte = 0b10000000 | this.framesPerPacket, - newHeader: number[] = [toc, frameCountByte] - - // M-1 length bytes - newHeader.push(...depacketized.slice(0, -1).map((data) => data.length)) - - const headerBuffer = Buffer.from(newHeader), - payload = Buffer.concat([headerBuffer, ...depacketized]) - - packet.payload = payload - return packet - } -}