From 69bf3bd9a1036a76ea0d5f66c81d96176a1f9785 Mon Sep 17 00:00:00 2001
From: Robin <robin@robin.town>
Date: Wed, 20 Sep 2023 13:21:45 -0400
Subject: [PATCH] Fix double audio tracks

See comments. I'm not very happy with how this code bounces state in and out of different hooks and useEffect blocks, but as a quick fix this should work.
---
 src/livekit/useECConnectionState.ts | 62 +++++++++++++++++------------
 src/livekit/useLiveKit.ts           | 14 ++++++-
 2 files changed, 48 insertions(+), 28 deletions(-)
diff --git a/src/livekit/useECConnectionState.ts b/src/livekit/useECConnectionState.ts
index 1b881729..e9298eb7 100644
--- a/src/livekit/useECConnectionState.ts
+++ b/src/livekit/useECConnectionState.ts
@@ -17,10 +17,8 @@ limitations under the License.
 import {
   AudioCaptureOptions,
   ConnectionState,
-  LocalTrackPublication,
   Room,
   RoomEvent,
-  Track,
 } from "livekit-client";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { logger } from "matrix-js-sdk/src/logger";
@@ -56,24 +54,22 @@ async function doConnect(
   audioOptions: AudioCaptureOptions
 ): Promise<void> {
   await livekitRoom!.connect(sfuConfig!.url, sfuConfig!.jwt);
-  const hasMicrophoneTrack = Array.from(
-    livekitRoom?.localParticipant.audioTracks.values()
-  ).some((track: LocalTrackPublication) => {
-    return track.source == Track.Source.Microphone;
-  });
-  // We create a track in case there isn't any.
-  if (!hasMicrophoneTrack) {
-    const audioTracks = await livekitRoom!.localParticipant.createTracks({
-      audio: audioOptions,
-    });
-    if (audioTracks.length < 1) {
-      logger.info("Tried to pre-create local audio track but got no tracks");
-      return;
-    }
-    if (!audioEnabled) await audioTracks[0].mute();
 
-    await livekitRoom?.localParticipant.publishTrack(audioTracks[0]);
+  // Always create an audio track manually.
+  // LiveKit (by default) keeps the mic track open when you mute, but if you start muted,
+  // it doesn't publish the track until you unmute. We want to publish it from the start so
+  // we're always capturing audio: this helps keep Bluetooth headsets in the right mode and
+  // lets mobile browsers know that we're in a call.
+  const audioTracks = await livekitRoom!.localParticipant.createTracks({
+    audio: audioOptions,
+  });
+  if (audioTracks.length < 1) {
+    logger.info("Tried to pre-create local audio track but got no tracks");
+    return;
   }
+  if (!audioEnabled) await audioTracks[0].mute();
+
+  await livekitRoom?.localParticipant.publishTrack(audioTracks[0]);
 }
 
 export function useECConnectionState(
@@ -89,6 +85,7 @@ export function useECConnectionState(
   );
 
   const [isSwitchingFocus, setSwitchingFocus] = useState(false);
+  const [isInDoConnect, setIsInDoConnect] = useState(false);
 
   const onConnStateChanged = useCallback((state: ConnectionState) => {
     if (state == ConnectionState.Connected) setSwitchingFocus(false);
@@ -125,12 +122,17 @@ export function useECConnectionState(
       (async () => {
         setSwitchingFocus(true);
         await livekitRoom?.disconnect();
-        await doConnect(
-          livekitRoom!,
-          sfuConfig!,
-          initialAudioEnabled,
-          initialAudioOptions
-        );
+        setIsInDoConnect(true);
+        try {
+          await doConnect(
+            livekitRoom!,
+            sfuConfig!,
+            initialAudioEnabled,
+            initialAudioOptions
+          );
+        } finally {
+          setIsInDoConnect(false);
+        }
       })();
     } else if (
       !sfuConfigValid(currentSFUConfig.current) &&
@@ -142,16 +144,24 @@ export function useECConnectionState(
       // doesn't publish it until you unmute. We want to publish it from the start so we're
       // always capturing audio: it helps keep bluetooth headsets in the right mode and
       // mobile browsers to know we're doing a call.
+      setIsInDoConnect(true);
       doConnect(
         livekitRoom!,
         sfuConfig!,
         initialAudioEnabled,
         initialAudioOptions
-      );
+      ).finally(() => setIsInDoConnect(false));
     }
 
     currentSFUConfig.current = Object.assign({}, sfuConfig);
   }, [sfuConfig, livekitRoom, initialAudioOptions, initialAudioEnabled]);
 
-  return isSwitchingFocus ? ECAddonConnectionState.ECSwitchingFocus : connState;
+  // Because we create audio tracks by hand, connecting involves more than
+  // what LiveKit does in room.connect, so we keep returning
+  // ConnectionState.Connecting until the doConnect promise settles.
+  return isSwitchingFocus
+    ? ECAddonConnectionState.ECSwitchingFocus
+    : isInDoConnect
+    ? ConnectionState.Connecting
+    : connState;
 }
diff --git a/src/livekit/useLiveKit.ts b/src/livekit/useLiveKit.ts
index 21119c22..fe1bec20 100644
--- a/src/livekit/useLiveKit.ts
+++ b/src/livekit/useLiveKit.ts
@@ -23,7 +23,7 @@ import {
   setLogLevel,
 } from "livekit-client";
 import { useLiveKitRoom } from "@livekit/components-react";
-import { useEffect, useMemo, useRef } from "react";
+import { useEffect, useMemo, useRef, useState } from "react";
 import E2EEWorker from "livekit-client/e2ee-worker?worker";
 import { logger } from "matrix-js-sdk/src/logger";
 
@@ -98,6 +98,11 @@ export function useLiveKit(
     [e2eeOptions]
   );
 
+  // useECConnectionState creates and publishes an audio track by hand. To keep
+  // this from racing with LiveKit's automatic creation of the audio track, we
+  // block audio from being enabled until the connection is finished.
+  const [blockAudio, setBlockAudio] = useState(true);
+
   // We have to create the room manually here due to a bug inside
   // @livekit/components-react. JSON.stringify() is used in deps of a
   // useEffect() with an argument that references itself, if E2EE is enabled
@@ -105,7 +110,7 @@ export function useLiveKit(
   const { room } = useLiveKitRoom({
     token: sfuConfig?.jwt,
     serverUrl: sfuConfig?.url,
-    audio: initialMuteStates.current.audio.enabled,
+    audio: initialMuteStates.current.audio.enabled && !blockAudio,
     video: initialMuteStates.current.video.enabled,
     room: roomWithoutProps,
     connect: false,
@@ -120,6 +125,11 @@ export function useLiveKit(
     sfuConfig
   );
 
+  // Unblock audio once the connection is finished
+  useEffect(() => {
+    if (connectionState === ConnectionState.Connected) setBlockAudio(false);
+  }, [connectionState, setBlockAudio]);
+
   useEffect(() => {
     // Sync the requested mute states with LiveKit's mute states. We do it this
     // way around rather than using LiveKit as the source of truth, so that the