feat: voice input support in both search and chat modes (#732)

ayangweb · web-flow · commit a4f4a247309e · 2025-07-02T09:35:16.000+08:00
* feat: voice input support in both search and chat modes

* docs: update changelog

* update

* update

* update

* update
diff --git a/docs/content.en/docs/release-notes/_index.md b/docs/content.en/docs/release-notes/_index.md
@@ -10,9 +10,14 @@ Information about release notes of Coco Server is provided here.
 ## Latest (In development)
 
 ### ❌ Breaking changes
+
 ### 🚀 Features
+
 - feat: file search using spotlight #705
+- feat: voice input support in both search and chat modes #732
+
 ### 🐛 Bug fix
+
 ### ✈️ Improvements
 
 ## 0.6.0 (2025-06-29)
@@ -302,4 +307,4 @@ Information about release notes of Coco Server is provided here.
 
 ### Bug fix
 
-### Improvements
+### Improvements
diff --git a/src-tauri/src/server/transcription.rs b/src-tauri/src/server/transcription.rs
@@ -1,41 +1,96 @@
 use crate::common::http::get_response_body_text;
 use crate::server::http_client::HttpClient;
 use serde::{Deserialize, Serialize};
+use serde_json::{from_str, Value};
 use tauri::command;
 
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TranscriptionResponse {
-    pub text: String,
+    task_id: String,
+    results: Vec<Value>,
 }
 
 #[command]
 pub async fn transcription(
     server_id: String,
-    _audio_type: String,
-    _audio_content: String,
+    audio_content: String,
 ) -> Result<TranscriptionResponse, String> {
-    // let mut query_params = HashMap::new();
-    // query_params.insert("type".to_string(), JsonValue::String(audio_type));
-    // query_params.insert("content".to_string(), JsonValue::String(audio_content));
-
-    // Send the HTTP POST request
-    let response = HttpClient::post(
+    // Send request to initiate transcription task
+    let init_response = HttpClient::post(
         &server_id,
         "/services/audio/transcription",
         None,
-        None,
+        Some(audio_content.into()),
     )
+    .await
+    .map_err(|e| format!("Failed to initiate transcription: {}", e))?;
+
+    // Extract response body as text
+    let init_response_text = get_response_body_text(init_response)
         .await
-        .map_err(|e| format!("Error sending transcription request: {}", e))?;
+        .map_err(|e| format!("Failed to read initial response body: {}", e))?;
+
+    // Parse response JSON to extract task ID
+    let init_response_json: Value = from_str(&init_response_text).map_err(|e| {
+        format!(
+            "Failed to parse initial response JSON: {}. Raw response: {}",
+            e, init_response_text
+        )
+    })?;
+
+    let transcription_task_id = init_response_json["task_id"]
+        .as_str()
+        .ok_or_else(|| {
+            format!(
+                "Missing or invalid task_id in initial response: {}",
+                init_response_text
+            )
+        })?
+        .to_string();
+
+    // Set up polling with timeout
+    let polling_start = std::time::Instant::now();
+    const POLLING_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
+    const POLLING_INTERVAL: std::time::Duration = std::time::Duration::from_millis(200);
 
-    // Use get_response_body_text to extract the response body as text
-    let response_body = get_response_body_text(response)
+    let mut transcription_response: TranscriptionResponse;
+
+    loop {
+        // Poll for transcription results
+        let poll_response = HttpClient::get(
+            &server_id,
+            &format!("/services/audio/task/{}", transcription_task_id),
+            None,
+        )
         .await
-        .map_err(|e| format!("Failed to read response body: {}", e))?;
+        .map_err(|e| format!("Failed to poll transcription task: {}", e))?;
+
+        // Extract poll response body
+        let poll_response_text = get_response_body_text(poll_response)
+            .await
+            .map_err(|e| format!("Failed to read poll response body: {}", e))?;
+
+        // Parse poll response JSON
+        transcription_response = from_str(&poll_response_text).map_err(|e| {
+            format!(
+                "Failed to parse poll response JSON: {}. Raw response: {}",
+                e, poll_response_text
+            )
+        })?;
+
+        // Check if transcription results are available
+        if !transcription_response.results.is_empty() {
+            break;
+        }
+
+        // Check for timeout
+        if polling_start.elapsed() >= POLLING_TIMEOUT {
+            return Err("Transcription task timed out after 30 seconds".to_string());
+        }
 
-    // Deserialize the response body into TranscriptionResponse
-    let transcription_response: TranscriptionResponse = serde_json::from_str(&response_body)
-        .map_err(|e| format!("Failed to parse transcription response: {}", e))?;
+        // Wait before next poll
+        tokio::time::sleep(POLLING_INTERVAL).await;
+    }
 
     Ok(transcription_response)
 }
diff --git a/src/components/AudioRecording/index.tsx b/src/components/AudioRecording/index.tsx
@@ -40,9 +40,9 @@ const AudioRecording: FC<AudioRecordingProps> = (props) => {
   const state = useReactive({ ...INITIAL_STATE });
   const containerRef = useRef<HTMLDivElement>(null);
   const recordRef = useRef<RecordPlugin>();
-  const withVisibility = useAppStore((state) => state.withVisibility);
-  const currentService = useConnectStore((state) => state.currentService);
-  const voiceInput = useShortcutsStore((state) => state.voiceInput);
+  const { withVisibility, addError } = useAppStore();
+  const { currentService } = useConnectStore();
+  const { voiceInput } = useShortcutsStore();
 
   const { wavesurfer } = useWavesurfer({
     container: containerRef,
@@ -75,23 +75,34 @@ const AudioRecording: FC<AudioRecordingProps> = (props) => {
 
       const reader = new FileReader();
 
+      reader.readAsDataURL(blob);
+
       reader.onloadend = async () => {
         const base64Audio = (reader.result as string).split(",")[1];
 
-        const response: any = await platformAdapter.commands("transcription", {
-          serverId: currentService.id,
-          audioType: "mp3",
-          audioContent: base64Audio,
-        });
+        try {
+          const response: any = await platformAdapter.commands(
+            "transcription",
+            {
+              serverId: currentService.id,
+              audioContent: JSON.stringify({ content: base64Audio }),
+            }
+          );
 
-        if (!response) return;
+          console.log("response", response);
 
-        onChange?.(response.text);
+          const text = response?.results
+            .flatMap((item: any) => item?.transcription?.transcripts)
+            .map((item: any) => item?.text?.replace(/<\|[\/\w]+\|>/g, ""))
+            .join(" ");
 
-        resetState();
+          onChange?.(text);
+        } catch (error) {
+          addError(String(error));
+        } finally {
+          resetState();
+        }
       };
-
-      reader.readAsDataURL(blob);
     });
 
     recordRef.current = record;
@@ -157,20 +168,21 @@ const AudioRecording: FC<AudioRecordingProps> = (props) => {
     <>
       <div
         className={clsx(
-          "p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition cursor-pointer",
+          "size-6 flex items-center justify-center hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition cursor-pointer",
           {
             hidden: state.audioDevices.length === 0,
           }
         )}
+        onClick={startRecording}
       >
         <VisibleKey shortcut={voiceInput} onKeyPress={startRecording}>
-          <Mic className="size-4 text-[#999]" onClick={startRecording} />
+          <Mic className="size-4 text-[#999]" />
         </VisibleKey>
       </div>
 
       <div
         className={clsx(
-          "absolute inset-0 flex items-center gap-1 px-1 rounded translate-x-full transition-all bg-[#ededed] dark:bg-[#202126]",
+          "absolute -inset-2 flex items-center gap-1 px-1 rounded translate-x-full transition-all bg-[#ededed] dark:bg-[#202126]",
           {
             "!translate-x-0": state.isRecording || state.converting,
           }
diff --git a/src/components/Search/InputBox.tsx b/src/components/Search/InputBox.tsx
@@ -17,6 +17,8 @@ import { useKeyboardHandlers } from "@/hooks/useKeyboardHandlers";
 import { useAssistantManager } from "./AssistantManager";
 import InputControls from "./InputControls";
 import { useExtensionsStore } from "@/stores/extensionsStore";
+import AudioRecording from "../AudioRecording";
+import { isDefaultServer } from "@/utils";
 
 interface ChatInputProps {
   onSend: (message: string) => void;
@@ -199,6 +201,13 @@ export default function ChatInput({
     return "Ask";
   }, [language, askAI]);
 
+  const { currentService } = useConnectStore();
+  const [visibleAudioInput, setVisibleAudioInput] = useState(false);
+
+  useEffect(() => {
+    setVisibleAudioInput(isDefaultServer());
+  }, [currentService]);
+
   const renderSearchIcon = () => (
     <SearchIcons
       lineCount={lineCount}
@@ -262,12 +271,17 @@ export default function ChatInput({
           </div>
         )}
 
-      {/* <AudioRecording
-      key={isChatMode ? "chat" : "search"}
-      onChange={(text) => {
-        changeInput(inputValue + text);
-      }}
-    /> */}
+      {visibleAudioInput && (
+        <AudioRecording
+          key={isChatMode ? "chat" : "search"}
+          onChange={(text) => {
+            const nextValue = inputValue + text;
+
+            changeInput(nextValue);
+            setSearchValue(nextValue);
+          }}
+        />
+      )}
 
       {isChatMode && curChatEnd && (
         <div
diff --git a/src/components/Settings/Advanced/components/Shortcuts/index.tsx b/src/components/Settings/Advanced/components/Shortcuts/index.tsx
@@ -8,7 +8,7 @@ import { isMac } from "@/utils/platform";
 import {
   INITIAL_MODE_SWITCH,
   INITIAL_RETURN_TO_INPUT,
-  // INITIAL_VOICE_INPUT,
+  INITIAL_VOICE_INPUT,
   // INITIAL_ADD_FILE,
   INITIAL_DEEP_THINKING,
   INITIAL_INTERNET_SEARCH,
@@ -37,61 +37,44 @@ export const modifierKeys: ModifierKey[] = isMac
 
 const Shortcuts = () => {
   const { t } = useTranslation();
-  const modifierKey = useShortcutsStore((state) => state.modifierKey);
-  const setModifierKey = useShortcutsStore((state) => state.setModifierKey);
-  const modeSwitch = useShortcutsStore((state) => state.modeSwitch);
-  const setModeSwitch = useShortcutsStore((state) => state.setModeSwitch);
-  const returnToInput = useShortcutsStore((state) => state.returnToInput);
-  const setReturnToInput = useShortcutsStore((state) => state.setReturnToInput);
-  // const voiceInput = useShortcutsStore((state) => state.voiceInput);
-  // const setVoiceInput = useShortcutsStore((state) => state.setVoiceInput);
-  // const addFile = useShortcutsStore((state) => state.addFile);
-  // const setAddFile = useShortcutsStore((state) => state.setAddFile);
-  const deepThinking = useShortcutsStore((state) => state.deepThinking);
-  const setDeepThinking = useShortcutsStore((state) => state.setDeepThinking);
-  const internetSearch = useShortcutsStore((state) => state.internetSearch);
-  const setInternetSearch = useShortcutsStore((state) => {
-    return state.setInternetSearch;
-  });
-  const internetSearchScope = useShortcutsStore((state) => {
-    return state.internetSearchScope;
-  });
-  const setInternetSearchScope = useShortcutsStore((state) => {
-    return state.setInternetSearchScope;
-  });
-  const mcpSearch = useShortcutsStore((state) => state.mcpSearch);
-  const setMcpSearch = useShortcutsStore((state) => {
-    return state.setMcpSearch;
-  });
-  const mcpSearchScope = useShortcutsStore((state) => {
-    return state.mcpSearchScope;
-  });
-  const setMcpSearchScope = useShortcutsStore((state) => {
-    return state.setMcpSearchScope;
-  });
-  const historicalRecords = useShortcutsStore((state) => {
-    return state.historicalRecords;
-  });
-  const setHistoricalRecords = useShortcutsStore((state) => {
-    return state.setHistoricalRecords;
-  });
-  const aiAssistant = useShortcutsStore((state) => {
-    return state.aiAssistant;
-  });
-  const setAiAssistant = useShortcutsStore((state) => {
-    return state.setAiAssistant;
-  });
-  const newSession = useShortcutsStore((state) => state.newSession);
-  const setNewSession = useShortcutsStore((state) => state.setNewSession);
-  const fixedWindow = useShortcutsStore((state) => state.fixedWindow);
-  const setFixedWindow = useShortcutsStore((state) => state.setFixedWindow);
-  const serviceList = useShortcutsStore((state) => state.serviceList);
-  const setServiceList = useShortcutsStore((state) => state.setServiceList);
-  const external = useShortcutsStore((state) => state.external);
-  const setExternal = useShortcutsStore((state) => state.setExternal);
-  const addError = useAppStore((state) => state.addError);
-  const aiOverview = useShortcutsStore((state) => state.aiOverview);
-  const setAiOverview = useShortcutsStore((state) => state.setAiOverview);
+  const {
+    modifierKey,
+    setModifierKey,
+    modeSwitch,
+    setModeSwitch,
+    returnToInput,
+    setReturnToInput,
+    voiceInput,
+    setVoiceInput,
+    // addFile,
+    // setAddFile,
+    deepThinking,
+    setDeepThinking,
+    internetSearch,
+    setInternetSearch,
+    internetSearchScope,
+    setInternetSearchScope,
+    mcpSearch,
+    setMcpSearch,
+    mcpSearchScope,
+    setMcpSearchScope,
+    historicalRecords,
+    setHistoricalRecords,
+    aiAssistant,
+    setAiAssistant,
+    newSession,
+    setNewSession,
+    fixedWindow,
+    setFixedWindow,
+    serviceList,
+    setServiceList,
+    external,
+    setExternal,
+    aiOverview,
+    setAiOverview,
+  } = useShortcutsStore();
+
+  const { addError } = useAppStore();
 
   useEffect(() => {
     const unlisten = useShortcutsStore.subscribe((state) => {
@@ -116,15 +99,13 @@ const Shortcuts = () => {
       value: returnToInput,
       setValue: setReturnToInput,
     },
-    // {
-    //   title: "settings.advanced.shortcuts.voiceInput.title",
-    //   description: "settings.advanced.shortcuts.voiceInput.description",
-    //   value: voiceInput,
-    //   setValue: setVoiceInput,
-    //   reset: () => {
-    //     handleChange(INITIAL_VOICE_INPUT, setVoiceInput);
-    //   },
-    // },
+    {
+      title: "settings.advanced.shortcuts.voiceInput.title",
+      description: "settings.advanced.shortcuts.voiceInput.description",
+      initialValue: INITIAL_VOICE_INPUT,
+      value: voiceInput,
+      setValue: setVoiceInput,
+    },
     // {
     //   title: "settings.advanced.shortcuts.addFile.title",
     //   description: "settings.advanced.shortcuts.addFile.description",
diff --git a/src/constants/index.ts b/src/constants/index.ts
diff --git a/src/utils/index.ts b/src/utils/index.ts