From da551ea9d5a5d0836483caee1bc4726fef85dbf8 Mon Sep 17 00:00:00 2001
From: Tom Sightler <tom.sightler@veeam.com>
Date: Mon, 23 Dec 2024 16:31:17 -0500
Subject: [PATCH] Use re-packetized Opus audio

This removes the ffmpeg dependency for outbound audio to HomeKit. The incoming Opus audio from Ring is simply repacketized into a format acceptable to HomeKit with no additional transcoding.

Revert "Use re-packetized Opus audio"

This reverts commit 5c59b423e5fabad8c7f2ca16a121fdb949d77c27.

Improve Opus audio quality
---
 packages/homebridge-ring/camera-source.ts     |  99 +++-------
 packages/homebridge-ring/opus-repacketizer.ts | 183 ------------------
 2 files changed, 32 insertions(+), 250 deletions(-)
 delete mode 100644 packages/homebridge-ring/opus-repacketizer.ts

diff --git a/packages/homebridge-ring/camera-source.ts b/packages/homebridge-ring/camera-source.ts
index 66a0ec80..81c8b5d4 100644
--- a/packages/homebridge-ring/camera-source.ts
+++ b/packages/homebridge-ring/camera-source.ts
@@ -2,7 +2,6 @@ import type { RingCamera } from 'ring-client-api'
 import { hap } from './hap.ts'
 import type { SrtpOptions } from '@homebridge/camera-utils'
 import {
-  doesFfmpegSupportCodec,
   generateSrtpOptions,
   ReturnAudioTranscoder,
   RtpSplitter,
@@ -38,7 +37,6 @@ import {
   SrtcpSession,
 } from 'werift'
 import type { StreamingSession } from 'ring-client-api/streaming/streaming-session'
-import { OpusRepacketizer } from './opus-repacketizer.ts'
 import path from 'node:path'
 
 const __dirname = new URL('.', import.meta.url).pathname,
@@ -70,7 +68,7 @@ class StreamingSessionWrapper {
   videoSrtp = generateSrtpOptions()
   audioSplitter = new RtpSplitter()
   videoSplitter = new RtpSplitter()
-  repacketizeAudioSplitter = new RtpSplitter()
+  transcodedAudioSplitter = new RtpSplitter()
 
   constructor(
     public streamingSession: StreamingSession,
@@ -142,57 +140,26 @@ class StreamingSessionWrapper {
         targetAddress,
         audio: { port: audioPort },
       } = this.prepareStreamRequest,
-      {
-        audio: {
-          codec: audioCodec,
-          sample_rate: audioSampleRate,
-          packet_time: audioPacketTime,
-        },
-      } = startStreamRequest,
-      // Repacketize the audio stream after it's been transcoded
-      opusRepacketizer = new OpusRepacketizer(audioPacketTime / 20),
-      audioIntervalScale = ((audioSampleRate / 8) * audioPacketTime) / 20,
+      timestampIncrement =
+        startStreamRequest.audio.sample_rate *
+        startStreamRequest.audio.packet_time,
       audioSrtpSession = new SrtpSession(getSessionConfig(this.audioSrtp))
 
-    let firstTimestamp: number,
-      audioPacketCount = 0
-
-    this.repacketizeAudioSplitter.addMessageHandler(({ message }) => {
-      let rtp: RtpPacket | undefined = RtpPacket.deSerialize(message)
+    let runningTimestamp: number
 
-      if (audioCodec === AudioStreamingCodecType.OPUS) {
-        // borrowed from scrypted
-        // Original source: https://github.com/koush/scrypted/blob/c13ba09889c3e0d9d3724cb7d49253c9d787fb97/plugins/homekit/src/types/camera/camera-streaming-srtp-sender.ts#L124-L143
-        rtp = opusRepacketizer.repacketize(rtp)
+    this.transcodedAudioSplitter.addMessageHandler(({ message }) => {
+      const rtp: RtpPacket | undefined = RtpPacket.deSerialize(message)
 
-        if (!rtp) {
-          return null
-        }
-
-        if (!firstTimestamp) {
-          firstTimestamp = rtp.header.timestamp
-        }
-
-        // from HAP spec:
-        // RTP Payload Format for Opus Speech and Audio Codec RFC 7587 with an exception
-        // that Opus audio RTP Timestamp shall be based on RFC 3550.
-        // RFC 3550 indicates that PCM audio based with a sample rate of 8k and a packet
-        // time of 20ms would have a monotonic interval of 8k / (1000 / 20) = 160.
-        // So 24k audio would have a monotonic interval of (24k / 8k) * 160 = 480.
-        // HAP spec also states that it may request packet times of 20, 30, 40, or 60.
-        // In practice, HAP has been seen to request 20 on LAN and 60 over LTE.
-        // So the RTP timestamp must scale accordingly.
-        // Further investigation indicates that HAP doesn't care about the actual sample rate at all,
-        // that's merely a suggestion. When encoding Opus, it can seemingly be an arbitrary sample rate,
-        // audio will work so long as the rtp timestamps are created properly: which is a construct of the sample rate
-        // HAP requests, and the packet time is respected,
-        // opus 48khz will work just fine.
-        rtp.header.timestamp =
-          (firstTimestamp + audioPacketCount * 160 * audioIntervalScale) %
-          0xffffffff
-        audioPacketCount++
+      // For some reason HAP uses RFC 3550 timestamps instead of following RTP Payload
+      // Format for Opus Speech and Audio Codec from RFC 7587 like everyone else.
+      // This calculates and replaces the timestamps before forwarding to HomeKit.
+      if (!runningTimestamp) {
+        runningTimestamp = rtp.header.timestamp
       }
 
+      rtp.header.timestamp = runningTimestamp % 0xffffffff
+      runningTimestamp += timestampIncrement
+
       // encrypt the packet
       const encryptedPacket = audioSrtpSession.encrypt(rtp.payload, rtp.header)
 
@@ -246,36 +213,29 @@ class StreamingSessionWrapper {
     const transcodingPromise = this.streamingSession.startTranscoding({
       input: ['-vn'],
       audio: [
-        '-map',
-        '0:a',
-
-        // OPUS specific - it works, but audio is very choppy
         '-acodec',
         'libopus',
-        '-frame_duration',
-        request.audio.packet_time,
         '-application',
         'lowdelay',
-
-        // Shared options
+        '-frame_duration',
+        request.audio.packet_time.toString(),
         '-flags',
         '+global_header',
-        '-ac',
-        `${request.audio.channel}`,
         '-ar',
         `${request.audio.sample_rate}k`,
         '-b:a',
         `${request.audio.max_bit_rate}k`,
         '-bufsize',
         `${request.audio.max_bit_rate * 4}k`,
+        '-ac',
+        `${request.audio.channel}`,
         '-payload_type',
         request.audio.pt,
         '-ssrc',
         this.audioSsrc,
         '-f',
         'rtp',
-        `rtp://127.0.0.1:${await this.repacketizeAudioSplitter
-          .portPromise}?pkt_size=376`,
+        `rtp://127.0.0.1:${await this.transcodedAudioSplitter.portPromise}`,
       ],
       video: false,
       output: [],
@@ -309,16 +269,20 @@ class StreamingSessionWrapper {
         outputArgs: [
           '-acodec',
           'libopus',
-          '-ac',
-          '1',
-          '-ar',
-          '24k',
-          '-b:a',
-          '24k',
           '-application',
           'lowdelay',
+          '-frame_duration',
+          request.audio.packet_time.toString(),
           '-flags',
           '+global_header',
+          '-ar',
+          '48k',
+          '-b:a',
+          '48k',
+          '-bufsize',
+          '192k',
+          '-ac',
+          '2',
           '-f',
           'rtp',
           `rtp://127.0.0.1:${await returnAudioTranscodedSplitter.portPromise}`,
@@ -344,7 +308,7 @@ class StreamingSessionWrapper {
 
   stop() {
     this.audioSplitter.close()
-    this.repacketizeAudioSplitter.close()
+    this.transcodedAudioSplitter.close()
     this.videoSplitter.close()
     this.streamingSession.stop()
   }
@@ -363,6 +327,7 @@ export class CameraSource implements CameraStreamingDelegate {
         supportedCryptoSuites: [SRTPCryptoSuites.AES_CM_128_HMAC_SHA1_80],
         video: {
           resolutions: [
+            [1920, 1024, 30],
             [1280, 720, 30],
             [1024, 768, 30],
             [640, 480, 30],
diff --git a/packages/homebridge-ring/opus-repacketizer.ts b/packages/homebridge-ring/opus-repacketizer.ts
deleted file mode 100644
index d5cea034..00000000
--- a/packages/homebridge-ring/opus-repacketizer.ts
+++ /dev/null
@@ -1,183 +0,0 @@
-// OpusRepacketizer is borrowed from scrypted
-// Original source: https://github.com/koush/scrypted/blob/3150a3033515a3886af1e6b35a0ba7432b63e02b/plugins/homekit/src/types/camera/opus-repacketizer.ts
-
-import type { RtpPacket } from 'werift'
-
-// https://datatracker.ietf.org/doc/html/rfc6716
-
-// INPUT (for single frame sample, see RFC for other 4 code values)
-
-// 0                   1                   2                   3
-// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// | config  |s|0|0|                                               |
-// +-+-+-+-+-+-+-+-+                                               |
-// |                    Compressed frame 1 (N-1 bytes)...          :
-// :                                                               |
-// |                                                               |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-// OUTPUT
-
-// 0                   1                   2                   3
-// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// | config  |s|1|1|1|p|     M     | Padding length (Optional)     :
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// : N1 (1-2 bytes): N2 (1-2 bytes):     ...       :     N[M-1]    |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// |                                                               |
-// :               Compressed frame 1 (N1 bytes)...                :
-// |                                                               |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// |                                                               |
-// :               Compressed frame 2 (N2 bytes)...                :
-// |                                                               |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// |                                                               |
-// :                              ...                              :
-// |                                                               |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// |                                                               |
-// :                     Compressed frame M...                     :
-// |                                                               |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// :                  Opus Padding (Optional)...                   |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-//                  Figure 6: A CBR Code 3 Packet
-
-// In the VBR case, the (optional) padding length is followed by M-1
-// frame lengths (indicated by "N1" to "N[M-1]" in Figure 7), each
-// encoded in a one- or two-byte sequence as described above.  The
-// packet MUST contain enough data for the M-1 lengths after removing
-// the (optional) padding, and the sum of these lengths MUST be no
-// larger than the number of bytes remaining in the packet after
-// decoding them [R7].  The compressed data for all M frames follows,
-// each frame consisting of the indicated number of bytes, with the
-// final frame consuming any remaining bytes before the final padding,
-// as illustrated in Figure 6.  The number of header bytes (TOC byte,
-// frame count byte, padding length bytes, and frame length bytes), plus
-// the signaled length of the first M-1 frames themselves, plus the
-// signaled length of the padding MUST be no larger than N, the total
-// size of the packet.
-
-export class OpusRepacketizer {
-  depacketized: Buffer[] = []
-
-  constructor(public framesPerPacket: number) {}
-
-  // repacketize a packet with a single frame into a packet with multiple frames.
-  repacketize(packet: RtpPacket): RtpPacket | undefined {
-    const code = packet.payload[0] & 0b00000011
-    let offset: number
-
-    // see Frame Length Coding in RFC
-    const decodeFrameLength = () => {
-      let frameLength = packet.payload.readUInt8(offset)
-      if (frameLength >= 252) {
-        offset++
-        frameLength += packet.payload.readUInt8(offset) * 4
-      }
-      return frameLength
-    }
-    // code 0: cbr, 1 packet
-    // code 1: cbr, 2 packets
-    // code 2: vbr, 2 packets
-    // code 3: cbr/vbr signaled, variable packets
-
-    if (code === 0) {
-      if (this.framesPerPacket === 1 && !this.depacketized.length) return packet
-      // depacketize by stripping off the config byte
-      this.depacketized.push(packet.payload.subarray(1))
-    } else if (code === 1) {
-      if (this.framesPerPacket === 2 && !this.depacketized.length) return packet
-      // depacketize by dividing the remaining payload into two equal sized frames
-      const remaining = packet.payload.length - 1
-      if (remaining % 2) {
-        throw new Error('expected equal sized opus packets (code 1)')
-      }
-      const frameLength = remaining / 2
-      this.depacketized.push(packet.payload.subarray(1, 1 + frameLength))
-      this.depacketized.push(packet.payload.subarray(1 + frameLength))
-    } else if (code === 2) {
-      if (this.framesPerPacket === 2 && !this.depacketized.length) return packet
-      offset = 1
-      // depacketize by dividing the remaining payload into two inequal sized frames
-      const frameLength = decodeFrameLength()
-      this.depacketized.push(
-        packet.payload.subarray(offset, offset + frameLength),
-      )
-      this.depacketized.push(packet.payload.subarray(offset + frameLength))
-    } else if (code === 3) {
-      // code 3 packet will have a frame count and padding indicator, and whether the packets
-      // are equal size or not.
-      const frameCountByte = packet.payload[1],
-        packetFrameCount = frameCountByte & 0b00111111,
-        vbr = frameCountByte & 0b10000000
-      if (
-        this.framesPerPacket === packetFrameCount &&
-        !this.depacketized.length
-      ) {
-        return packet
-      }
-      const paddingIndicator = frameCountByte & 0b01000000
-      offset = 2
-      let padding = 0
-      if (paddingIndicator) {
-        padding = packet.payload.readUInt8(offset)
-        offset++
-        if (padding === 255) {
-          padding = 254 + packet.payload.readUInt8(offset)
-          offset++
-        }
-      }
-
-      if (!vbr) {
-        const remaining = packet.payload.length - offset - padding
-        if (remaining % packetFrameCount) {
-          throw new Error('expected equal sized opus packets (code 3)')
-        }
-        const frameLength = remaining / packetFrameCount
-        for (let i = 0; i < packetFrameCount; i++) {
-          const start = offset + i * frameLength,
-            end = start + frameLength
-          this.depacketized.push(packet.payload.subarray(start, end))
-        }
-      } else {
-        const frameLengths: number[] = []
-        for (let i = 0; i < packetFrameCount; i++) {
-          const frameLength = decodeFrameLength()
-          frameLengths.push(frameLength)
-        }
-        for (let i = 0; i < packetFrameCount; i++) {
-          const frameLength = frameLengths[i],
-            start = offset
-          offset += frameLength
-          this.depacketized.push(packet.payload.subarray(start, offset))
-        }
-      }
-    }
-
-    if (this.depacketized.length < this.framesPerPacket) return
-
-    const depacketized = this.depacketized.slice(0, this.framesPerPacket)
-    this.depacketized = this.depacketized.slice(this.framesPerPacket)
-
-    // reuse the config and stereo indicator, but change the code to 3.
-    let toc = packet.payload[0]
-    toc |= 0b00000011
-    // vbr | padding indicator | packet count
-    const frameCountByte = 0b10000000 | this.framesPerPacket,
-      newHeader: number[] = [toc, frameCountByte]
-
-    // M-1 length bytes
-    newHeader.push(...depacketized.slice(0, -1).map((data) => data.length))
-
-    const headerBuffer = Buffer.from(newHeader),
-      payload = Buffer.concat([headerBuffer, ...depacketized])
-
-    packet.payload = payload
-    return packet
-  }
-}