Skip to content

Commit a4f4a24

Browse files
authored
feat: voice input support in both search and chat modes (#732)
* feat: voice input support in both search and chat modes
* docs: update changelog
* update
* update
* update
* update
1 parent 87bd3d0 commit a4f4a24

7 files changed

Lines changed: 194 additions & 106 deletions

File tree

docs/content.en/docs/release-notes/_index.md

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,14 @@ Information about release notes of Coco Server is provided here.
1010
## Latest (In development)
1111

1212
### ❌ Breaking changes
13+
1314
### 🚀 Features
15+
1416
- feat: file search using spotlight #705
17+
- feat: voice input support in both search and chat modes #732
18+
1519
### 🐛 Bug fix
20+
1621
### ✈️ Improvements
1722

1823
## 0.6.0 (2025-06-29)
@@ -302,4 +307,4 @@ Information about release notes of Coco Server is provided here.
302307

303308
### Bug fix
304309

305-
### Improvements
310+
### Improvements
Lines changed: 72 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,41 +1,96 @@
11
use crate::common::http::get_response_body_text;
22
use crate::server::http_client::HttpClient;
33
use serde::{Deserialize, Serialize};
4+
use serde_json::{from_str, Value};
45
use tauri::command;
56

67
#[derive(Debug, Serialize, Deserialize)]
78
pub struct TranscriptionResponse {
8-
pub text: String,
9+
task_id: String,
10+
results: Vec<Value>,
911
}
1012

1113
#[command]
1214
pub async fn transcription(
1315
server_id: String,
14-
_audio_type: String,
15-
_audio_content: String,
16+
audio_content: String,
1617
) -> Result<TranscriptionResponse, String> {
17-
// let mut query_params = HashMap::new();
18-
// query_params.insert("type".to_string(), JsonValue::String(audio_type));
19-
// query_params.insert("content".to_string(), JsonValue::String(audio_content));
20-
21-
// Send the HTTP POST request
22-
let response = HttpClient::post(
18+
// Send request to initiate transcription task
19+
let init_response = HttpClient::post(
2320
&server_id,
2421
"/services/audio/transcription",
2522
None,
26-
None,
23+
Some(audio_content.into()),
2724
)
25+
.await
26+
.map_err(|e| format!("Failed to initiate transcription: {}", e))?;
27+
28+
// Extract response body as text
29+
let init_response_text = get_response_body_text(init_response)
2830
.await
29-
.map_err(|e| format!("Error sending transcription request: {}", e))?;
31+
.map_err(|e| format!("Failed to read initial response body: {}", e))?;
32+
33+
// Parse response JSON to extract task ID
34+
let init_response_json: Value = from_str(&init_response_text).map_err(|e| {
35+
format!(
36+
"Failed to parse initial response JSON: {}. Raw response: {}",
37+
e, init_response_text
38+
)
39+
})?;
40+
41+
let transcription_task_id = init_response_json["task_id"]
42+
.as_str()
43+
.ok_or_else(|| {
44+
format!(
45+
"Missing or invalid task_id in initial response: {}",
46+
init_response_text
47+
)
48+
})?
49+
.to_string();
50+
51+
// Set up polling with timeout
52+
let polling_start = std::time::Instant::now();
53+
const POLLING_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
54+
const POLLING_INTERVAL: std::time::Duration = std::time::Duration::from_millis(200);
3055

31-
// Use get_response_body_text to extract the response body as text
32-
let response_body = get_response_body_text(response)
56+
let mut transcription_response: TranscriptionResponse;
57+
58+
loop {
59+
// Poll for transcription results
60+
let poll_response = HttpClient::get(
61+
&server_id,
62+
&format!("/services/audio/task/{}", transcription_task_id),
63+
None,
64+
)
3365
.await
34-
.map_err(|e| format!("Failed to read response body: {}", e))?;
66+
.map_err(|e| format!("Failed to poll transcription task: {}", e))?;
67+
68+
// Extract poll response body
69+
let poll_response_text = get_response_body_text(poll_response)
70+
.await
71+
.map_err(|e| format!("Failed to read poll response body: {}", e))?;
72+
73+
// Parse poll response JSON
74+
transcription_response = from_str(&poll_response_text).map_err(|e| {
75+
format!(
76+
"Failed to parse poll response JSON: {}. Raw response: {}",
77+
e, poll_response_text
78+
)
79+
})?;
80+
81+
// Check if transcription results are available
82+
if !transcription_response.results.is_empty() {
83+
break;
84+
}
85+
86+
// Check for timeout
87+
if polling_start.elapsed() >= POLLING_TIMEOUT {
88+
return Err("Transcription task timed out after 30 seconds".to_string());
89+
}
3590

36-
// Deserialize the response body into TranscriptionResponse
37-
let transcription_response: TranscriptionResponse = serde_json::from_str(&response_body)
38-
.map_err(|e| format!("Failed to parse transcription response: {}", e))?;
91+
// Wait before next poll
92+
tokio::time::sleep(POLLING_INTERVAL).await;
93+
}
3994

4095
Ok(transcription_response)
4196
}

src/components/AudioRecording/index.tsx

Lines changed: 28 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -40,9 +40,9 @@ const AudioRecording: FC<AudioRecordingProps> = (props) => {
4040
const state = useReactive({ ...INITIAL_STATE });
4141
const containerRef = useRef<HTMLDivElement>(null);
4242
const recordRef = useRef<RecordPlugin>();
43-
const withVisibility = useAppStore((state) => state.withVisibility);
44-
const currentService = useConnectStore((state) => state.currentService);
45-
const voiceInput = useShortcutsStore((state) => state.voiceInput);
43+
const { withVisibility, addError } = useAppStore();
44+
const { currentService } = useConnectStore();
45+
const { voiceInput } = useShortcutsStore();
4646

4747
const { wavesurfer } = useWavesurfer({
4848
container: containerRef,
@@ -75,23 +75,34 @@ const AudioRecording: FC<AudioRecordingProps> = (props) => {
7575

7676
const reader = new FileReader();
7777

78+
reader.readAsDataURL(blob);
79+
7880
reader.onloadend = async () => {
7981
const base64Audio = (reader.result as string).split(",")[1];
8082

81-
const response: any = await platformAdapter.commands("transcription", {
82-
serverId: currentService.id,
83-
audioType: "mp3",
84-
audioContent: base64Audio,
85-
});
83+
try {
84+
const response: any = await platformAdapter.commands(
85+
"transcription",
86+
{
87+
serverId: currentService.id,
88+
audioContent: JSON.stringify({ content: base64Audio }),
89+
}
90+
);
8691

87-
if (!response) return;
92+
console.log("response", response);
8893

89-
onChange?.(response.text);
94+
const text = response?.results
95+
.flatMap((item: any) => item?.transcription?.transcripts)
96+
.map((item: any) => item?.text?.replace(/<\|[\/\w]+\|>/g, ""))
97+
.join(" ");
9098

91-
resetState();
99+
onChange?.(text);
100+
} catch (error) {
101+
addError(String(error));
102+
} finally {
103+
resetState();
104+
}
92105
};
93-
94-
reader.readAsDataURL(blob);
95106
});
96107

97108
recordRef.current = record;
@@ -157,20 +168,21 @@ const AudioRecording: FC<AudioRecordingProps> = (props) => {
157168
<>
158169
<div
159170
className={clsx(
160-
"p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition cursor-pointer",
171+
"size-6 flex items-center justify-center hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition cursor-pointer",
161172
{
162173
hidden: state.audioDevices.length === 0,
163174
}
164175
)}
176+
onClick={startRecording}
165177
>
166178
<VisibleKey shortcut={voiceInput} onKeyPress={startRecording}>
167-
<Mic className="size-4 text-[#999]" onClick={startRecording} />
179+
<Mic className="size-4 text-[#999]" />
168180
</VisibleKey>
169181
</div>
170182

171183
<div
172184
className={clsx(
173-
"absolute inset-0 flex items-center gap-1 px-1 rounded translate-x-full transition-all bg-[#ededed] dark:bg-[#202126]",
185+
"absolute -inset-2 flex items-center gap-1 px-1 rounded translate-x-full transition-all bg-[#ededed] dark:bg-[#202126]",
174186
{
175187
"!translate-x-0": state.isRecording || state.converting,
176188
}

src/components/Search/InputBox.tsx

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@ import { useKeyboardHandlers } from "@/hooks/useKeyboardHandlers";
1717
import { useAssistantManager } from "./AssistantManager";
1818
import InputControls from "./InputControls";
1919
import { useExtensionsStore } from "@/stores/extensionsStore";
20+
import AudioRecording from "../AudioRecording";
21+
import { isDefaultServer } from "@/utils";
2022

2123
interface ChatInputProps {
2224
onSend: (message: string) => void;
@@ -199,6 +201,13 @@ export default function ChatInput({
199201
return "Ask";
200202
}, [language, askAI]);
201203

204+
const { currentService } = useConnectStore();
205+
const [visibleAudioInput, setVisibleAudioInput] = useState(false);
206+
207+
useEffect(() => {
208+
setVisibleAudioInput(isDefaultServer());
209+
}, [currentService]);
210+
202211
const renderSearchIcon = () => (
203212
<SearchIcons
204213
lineCount={lineCount}
@@ -262,12 +271,17 @@ export default function ChatInput({
262271
</div>
263272
)}
264273

265-
{/* <AudioRecording
266-
key={isChatMode ? "chat" : "search"}
267-
onChange={(text) => {
268-
changeInput(inputValue + text);
269-
}}
270-
/> */}
274+
{visibleAudioInput && (
275+
<AudioRecording
276+
key={isChatMode ? "chat" : "search"}
277+
onChange={(text) => {
278+
const nextValue = inputValue + text;
279+
280+
changeInput(nextValue);
281+
setSearchValue(nextValue);
282+
}}
283+
/>
284+
)}
271285

272286
{isChatMode && curChatEnd && (
273287
<div

src/components/Settings/Advanced/components/Shortcuts/index.tsx

Lines changed: 46 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ import { isMac } from "@/utils/platform";
88
import {
99
INITIAL_MODE_SWITCH,
1010
INITIAL_RETURN_TO_INPUT,
11-
// INITIAL_VOICE_INPUT,
11+
INITIAL_VOICE_INPUT,
1212
// INITIAL_ADD_FILE,
1313
INITIAL_DEEP_THINKING,
1414
INITIAL_INTERNET_SEARCH,
@@ -37,61 +37,44 @@ export const modifierKeys: ModifierKey[] = isMac
3737

3838
const Shortcuts = () => {
3939
const { t } = useTranslation();
40-
const modifierKey = useShortcutsStore((state) => state.modifierKey);
41-
const setModifierKey = useShortcutsStore((state) => state.setModifierKey);
42-
const modeSwitch = useShortcutsStore((state) => state.modeSwitch);
43-
const setModeSwitch = useShortcutsStore((state) => state.setModeSwitch);
44-
const returnToInput = useShortcutsStore((state) => state.returnToInput);
45-
const setReturnToInput = useShortcutsStore((state) => state.setReturnToInput);
46-
// const voiceInput = useShortcutsStore((state) => state.voiceInput);
47-
// const setVoiceInput = useShortcutsStore((state) => state.setVoiceInput);
48-
// const addFile = useShortcutsStore((state) => state.addFile);
49-
// const setAddFile = useShortcutsStore((state) => state.setAddFile);
50-
const deepThinking = useShortcutsStore((state) => state.deepThinking);
51-
const setDeepThinking = useShortcutsStore((state) => state.setDeepThinking);
52-
const internetSearch = useShortcutsStore((state) => state.internetSearch);
53-
const setInternetSearch = useShortcutsStore((state) => {
54-
return state.setInternetSearch;
55-
});
56-
const internetSearchScope = useShortcutsStore((state) => {
57-
return state.internetSearchScope;
58-
});
59-
const setInternetSearchScope = useShortcutsStore((state) => {
60-
return state.setInternetSearchScope;
61-
});
62-
const mcpSearch = useShortcutsStore((state) => state.mcpSearch);
63-
const setMcpSearch = useShortcutsStore((state) => {
64-
return state.setMcpSearch;
65-
});
66-
const mcpSearchScope = useShortcutsStore((state) => {
67-
return state.mcpSearchScope;
68-
});
69-
const setMcpSearchScope = useShortcutsStore((state) => {
70-
return state.setMcpSearchScope;
71-
});
72-
const historicalRecords = useShortcutsStore((state) => {
73-
return state.historicalRecords;
74-
});
75-
const setHistoricalRecords = useShortcutsStore((state) => {
76-
return state.setHistoricalRecords;
77-
});
78-
const aiAssistant = useShortcutsStore((state) => {
79-
return state.aiAssistant;
80-
});
81-
const setAiAssistant = useShortcutsStore((state) => {
82-
return state.setAiAssistant;
83-
});
84-
const newSession = useShortcutsStore((state) => state.newSession);
85-
const setNewSession = useShortcutsStore((state) => state.setNewSession);
86-
const fixedWindow = useShortcutsStore((state) => state.fixedWindow);
87-
const setFixedWindow = useShortcutsStore((state) => state.setFixedWindow);
88-
const serviceList = useShortcutsStore((state) => state.serviceList);
89-
const setServiceList = useShortcutsStore((state) => state.setServiceList);
90-
const external = useShortcutsStore((state) => state.external);
91-
const setExternal = useShortcutsStore((state) => state.setExternal);
92-
const addError = useAppStore((state) => state.addError);
93-
const aiOverview = useShortcutsStore((state) => state.aiOverview);
94-
const setAiOverview = useShortcutsStore((state) => state.setAiOverview);
40+
const {
41+
modifierKey,
42+
setModifierKey,
43+
modeSwitch,
44+
setModeSwitch,
45+
returnToInput,
46+
setReturnToInput,
47+
voiceInput,
48+
setVoiceInput,
49+
// addFile,
50+
// setAddFile,
51+
deepThinking,
52+
setDeepThinking,
53+
internetSearch,
54+
setInternetSearch,
55+
internetSearchScope,
56+
setInternetSearchScope,
57+
mcpSearch,
58+
setMcpSearch,
59+
mcpSearchScope,
60+
setMcpSearchScope,
61+
historicalRecords,
62+
setHistoricalRecords,
63+
aiAssistant,
64+
setAiAssistant,
65+
newSession,
66+
setNewSession,
67+
fixedWindow,
68+
setFixedWindow,
69+
serviceList,
70+
setServiceList,
71+
external,
72+
setExternal,
73+
aiOverview,
74+
setAiOverview,
75+
} = useShortcutsStore();
76+
77+
const { addError } = useAppStore();
9578

9679
useEffect(() => {
9780
const unlisten = useShortcutsStore.subscribe((state) => {
@@ -116,15 +99,13 @@ const Shortcuts = () => {
11699
value: returnToInput,
117100
setValue: setReturnToInput,
118101
},
119-
// {
120-
// title: "settings.advanced.shortcuts.voiceInput.title",
121-
// description: "settings.advanced.shortcuts.voiceInput.description",
122-
// value: voiceInput,
123-
// setValue: setVoiceInput,
124-
// reset: () => {
125-
// handleChange(INITIAL_VOICE_INPUT, setVoiceInput);
126-
// },
127-
// },
102+
{
103+
title: "settings.advanced.shortcuts.voiceInput.title",
104+
description: "settings.advanced.shortcuts.voiceInput.description",
105+
initialValue: INITIAL_VOICE_INPUT,
106+
value: voiceInput,
107+
setValue: setVoiceInput,
108+
},
128109
// {
129110
// title: "settings.advanced.shortcuts.addFile.title",
130111
// description: "settings.advanced.shortcuts.addFile.description",

0 commit comments

Comments
 (0)