feat: add mixed-mode input (text + images + files) during voice calls

Enable users to send text messages, images, and files to the Agent
while an active voice call is in progress. This addresses the case
where spoken instructions are unclear or screenshots/documents need
to be shared for analysis.

## Architecture

Data flows through LiveKit data channel (not direct HTTP):
  Flutter → publishData(topic='text_inject') → voice-agent
  → llm.inject_text_message() → POST /api/v1/agent/tasks (same session)
  → collect streamed response → session.say() → TTS playback

This preserves the constraint that voice-agent owns the agent-service
sessionId — Flutter never contacts agent-service directly.

## Flutter UI (agent_call_page.dart)
- Add keyboard toggle button to active call controls (4-button row)
- Collapsible text input area with attachment picker (+) and send button
- Attachment support: gallery multi-select, camera, file picker
  (images max 1024x1024 quality 80%, PDF supported, max 5 attachments)
- Horizontal scrolling attachment preview with delete buttons
- 200KB payload size check before LiveKit data channel send
- Layout adapts: Spacer flex 1/3 toggle, reduced bottom padding

## voice-agent (agent.py)
- Register data_received event listener after session.start()
- Filter for topic='text_inject', parse JSON payload
- Call llm.inject_text_message(text, attachments) and TTS via session.say()
- Use asyncio.ensure_future() wrapper for async handler (matches
  existing disconnect handler pattern for sync EventEmitter)

## AgentServiceLLM (agent_llm.py)
- New inject_text_message(text, attachments) method on AgentServiceLLM
- Reuses same _agent_session_id for conversation context continuity
- WS+HTTP streaming: connect, pre-subscribe, POST /tasks with
  attachments field, collect full text response, return string
- _injecting flag prevents concurrent _do_stream from clearing
  session ID on abort errors while inject is in progress
- Same systemPrompt/voiceMode/engineType as voice pipeline

No agent-service changes required — attachments already supported
end-to-end (JSONB storage → multimodal content blocks → Claude).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-02 05:38:04 -08:00
parent 02aaf40bb2
commit ce63ece340
3 changed files with 597 additions and 6 deletions

View File

@ -1,12 +1,17 @@
import 'dart:async';
import 'dart:convert';
import 'dart:io';
import 'dart:math';
import 'package:flutter/material.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:livekit_client/livekit_client.dart';
import 'package:image_picker/image_picker.dart';
import 'package:file_picker/file_picker.dart';
import '../../../../core/config/api_endpoints.dart';
import '../../../../core/config/app_config.dart';
import '../../../../core/network/dio_client.dart';
import '../../../../core/theme/app_colors.dart';
import '../../../chat/domain/entities/chat_message.dart';
import '../../../settings/presentation/providers/settings_providers.dart';
/// Tracks the current state of the voice call.
@ -49,6 +54,12 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
// Prevent double-actions
bool _userEndedCall = false;
// Text input (mixed-mode: text + attachments during voice call)
final _textController = TextEditingController();
bool _isInputExpanded = false;
bool _isSending = false;
final List<ChatAttachment> _pendingAttachments = [];
@override
void initState() {
super.initState();
@ -247,6 +258,238 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
setState(() {});
}
// ---------------------------------------------------------------------------
// Text input + attachments (mixed-mode during voice call)
// ---------------------------------------------------------------------------
static const _maxAttachments = 5;
/// Sends the typed text plus any pending attachments to the voice agent
/// over the LiveKit data channel (topic `text_inject`).
///
/// No-op when both the text field and the attachment list are empty, or
/// when the room has no local participant (call not active yet).
Future<void> _sendTextMessage() async {
  final text = _textController.text.trim();
  if (text.isEmpty && _pendingAttachments.isEmpty) return;
  if (_room?.localParticipant == null) return;
  setState(() => _isSending = true);
  try {
    final payload = {
      'type': 'text_inject',
      'text': text,
      if (_pendingAttachments.isNotEmpty)
        'attachments': _pendingAttachments.map((a) => a.toJson()).toList(),
    };
    final jsonStr = jsonEncode(payload);
    final bytes = utf8.encode(jsonStr);
    // Check size limit (200KB for LiveKit data channel) before publishing.
    if (bytes.length > 200 * 1024) {
      if (mounted) {
        ScaffoldMessenger.of(context).showSnackBar(
          const SnackBar(content: Text('消息太大,请减少附件数量或大小')),
        );
      }
      return;
    }
    await _room!.localParticipant!.publishData(
      bytes,
      reliable: true,
      topic: 'text_inject',
    );
    // FIX: the widget may have been disposed while awaiting publishData;
    // touching the controller or calling setState afterwards would throw.
    if (!mounted) return;
    _textController.clear();
    setState(() => _pendingAttachments.clear());
    ScaffoldMessenger.of(context).showSnackBar(
      const SnackBar(
        content: Text('已发送'),
        duration: Duration(seconds: 1),
      ),
    );
  } catch (e) {
    if (mounted) {
      ScaffoldMessenger.of(context).showSnackBar(
        SnackBar(content: Text('发送失败: $e')),
      );
    }
  } finally {
    if (mounted) setState(() => _isSending = false);
  }
}
/// Presents a bottom sheet listing the available attachment sources:
/// gallery (multi-select), camera, and file picker (images + PDF).
void _showAttachmentOptions() {
  showModalBottomSheet(
    context: context,
    builder: (sheetCtx) {
      // One tile per source; each closes the sheet before launching the
      // corresponding picker.
      final tiles = <Widget>[
        ListTile(
          leading: const Icon(Icons.photo_library),
          title: const Text('从相册选择'),
          subtitle: const Text('支持多选'),
          onTap: () {
            Navigator.pop(sheetCtx);
            _pickMultipleImages();
          },
        ),
        ListTile(
          leading: const Icon(Icons.camera_alt),
          title: const Text('拍照'),
          onTap: () {
            Navigator.pop(sheetCtx);
            _pickImage(ImageSource.camera);
          },
        ),
        ListTile(
          leading: const Icon(Icons.attach_file),
          title: const Text('选择文件'),
          subtitle: const Text('图片、PDF'),
          onTap: () {
            Navigator.pop(sheetCtx);
            _pickFile();
          },
        ),
      ];
      return SafeArea(
        child: Column(
          mainAxisSize: MainAxisSize.min,
          children: tiles,
        ),
      );
    },
  );
}
/// Picks a single image from [source] (camera or gallery), downscaled to
/// at most 1024x1024 at 80% quality, and appends it to the pending list.
///
/// Shows a snackbar and bails out when the attachment limit is reached.
Future<void> _pickImage(ImageSource source) async {
  if (_pendingAttachments.length >= _maxAttachments) {
    if (mounted) {
      ScaffoldMessenger.of(context).showSnackBar(
        const SnackBar(content: Text('最多添加 $_maxAttachments 个附件')),
      );
    }
    return;
  }
  final picker = ImagePicker();
  final picked = await picker.pickImage(
    source: source,
    maxWidth: 1024,
    maxHeight: 1024,
    imageQuality: 80,
  );
  if (picked == null) return;
  final bytes = await picked.readAsBytes();
  final ext = picked.path.split('.').last.toLowerCase();
  // Map common extensions to MIME types; anything unrecognized is treated
  // as JPEG (the picker re-encodes to JPEG in most cases anyway).
  final mediaType = switch (ext) {
    'png' => 'image/png',
    'webp' => 'image/webp',
    'gif' => 'image/gif',
    _ => 'image/jpeg',
  };
  // FIX: the widget may have been disposed while awaiting the picker or
  // reading bytes; calling setState then would throw.
  if (!mounted) return;
  setState(() {
    _pendingAttachments.add(ChatAttachment(
      base64Data: base64Encode(bytes),
      mediaType: mediaType,
      fileName: picked.name,
    ));
  });
}
/// Picks multiple images from the gallery (downscaled to at most
/// 1024x1024 at 80% quality) and appends up to the remaining attachment
/// slots. Warns when the selection exceeds the limit.
Future<void> _pickMultipleImages() async {
  final remaining = _maxAttachments - _pendingAttachments.length;
  if (remaining <= 0) {
    if (mounted) {
      ScaffoldMessenger.of(context).showSnackBar(
        const SnackBar(content: Text('最多添加 $_maxAttachments 个附件')),
      );
    }
    return;
  }
  final picker = ImagePicker();
  final pickedList = await picker.pickMultiImage(
    maxWidth: 1024,
    maxHeight: 1024,
    imageQuality: 80,
  );
  if (pickedList.isEmpty) return;
  // Silently truncate to the remaining slot count; the snackbar below
  // informs the user when anything was dropped.
  final toAdd = pickedList.take(remaining);
  for (final picked in toAdd) {
    final bytes = await picked.readAsBytes();
    final ext = picked.path.split('.').last.toLowerCase();
    final mediaType = switch (ext) {
      'png' => 'image/png',
      'webp' => 'image/webp',
      'gif' => 'image/gif',
      _ => 'image/jpeg',
    };
    _pendingAttachments.add(ChatAttachment(
      base64Data: base64Encode(bytes),
      mediaType: mediaType,
      fileName: picked.name,
    ));
  }
  // FIX: the widget may have been disposed while awaiting the picker or
  // reading files; setState/snackbar after dispose would throw.
  if (!mounted) return;
  setState(() {});
  if (pickedList.length > remaining) {
    ScaffoldMessenger.of(context).showSnackBar(
      SnackBar(content: Text('已选择 $remaining 张,最多 $_maxAttachments')),
    );
  }
}
/// Picks one or more files (images or PDF) and appends them, up to the
/// remaining attachment slots. Unsupported file types are skipped and
/// reported with a warning snackbar.
Future<void> _pickFile() async {
  final remaining = _maxAttachments - _pendingAttachments.length;
  if (remaining <= 0) {
    if (mounted) {
      ScaffoldMessenger.of(context).showSnackBar(
        const SnackBar(content: Text('最多添加 $_maxAttachments 个附件')),
      );
    }
    return;
  }
  final result = await FilePicker.platform.pickFiles(
    type: FileType.any,
    allowMultiple: true,
  );
  if (result == null || result.files.isEmpty) return;
  const allowedExts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'pdf'};
  int skipped = 0;
  final toAdd = result.files.take(remaining);
  for (final file in toAdd) {
    // path can be null on platforms without filesystem access (e.g. web);
    // such entries are skipped silently.
    if (file.path == null) continue;
    final ext = (file.extension ?? '').toLowerCase();
    if (!allowedExts.contains(ext)) {
      skipped++;
      continue;
    }
    final bytes = await File(file.path!).readAsBytes();
    final String mediaType;
    if (ext == 'pdf') {
      mediaType = 'application/pdf';
    } else {
      mediaType = switch (ext) {
        'png' => 'image/png',
        'webp' => 'image/webp',
        'gif' => 'image/gif',
        _ => 'image/jpeg',
      };
    }
    _pendingAttachments.add(ChatAttachment(
      base64Data: base64Encode(bytes),
      mediaType: mediaType,
      fileName: file.name,
    ));
  }
  // FIX: the widget may have been disposed during the awaits above;
  // setState/snackbar after dispose would throw.
  if (!mounted) return;
  setState(() {});
  if (skipped > 0) {
    ScaffoldMessenger.of(context).showSnackBar(
      const SnackBar(content: Text('仅支持图片(jpg/png/gif/webp)和PDF文件')),
    );
  } else if (result.files.length > remaining) {
    ScaffoldMessenger.of(context).showSnackBar(
      SnackBar(content: Text('已选择 $remaining 个,最多 $_maxAttachments')),
    );
  }
}
// ---------------------------------------------------------------------------
// Build
// ---------------------------------------------------------------------------
@ -345,9 +588,11 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
),
),
if (_phase == _CallPhase.active) _buildWaveform(),
const Spacer(flex: 3),
Spacer(flex: _isInputExpanded ? 1 : 3),
if (_isInputExpanded && _phase == _CallPhase.active)
_buildTextInputArea(),
_buildControls(),
const SizedBox(height: 48),
SizedBox(height: _isInputExpanded ? 12 : 48),
],
),
),
@ -464,6 +709,126 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
);
}
/// Builds the collapsible text-entry area shown during an active call:
/// an optional attachment preview strip on top, then a rounded input bar
/// with an attachment (+) button, the text field, and a send button that
/// turns into a spinner while sending.
Widget _buildTextInputArea() {
  final attachButton = Padding(
    padding: const EdgeInsets.only(left: 4),
    child: IconButton(
      icon: const Icon(Icons.add_circle_outline, size: 22),
      onPressed: _showAttachmentOptions,
    ),
  );
  final textField = Expanded(
    child: TextField(
      controller: _textController,
      decoration: const InputDecoration(
        hintText: '输入文字或添加附件...',
        hintStyle: TextStyle(color: AppColors.textMuted),
        border: InputBorder.none,
        contentPadding: EdgeInsets.symmetric(vertical: 12),
      ),
      textInputAction: TextInputAction.send,
      onSubmitted: (_) => _sendTextMessage(),
    ),
  );
  final sendButton = Padding(
    padding: const EdgeInsets.only(right: 4),
    child: IconButton(
      icon: _isSending
          ? const SizedBox(
              width: 20,
              height: 20,
              child: CircularProgressIndicator(strokeWidth: 2),
            )
          : const Icon(Icons.send, size: 20),
      // Disabled while a send is already in flight.
      onPressed: _isSending ? null : _sendTextMessage,
    ),
  );
  return Padding(
    padding: const EdgeInsets.symmetric(horizontal: 24),
    child: Column(
      mainAxisSize: MainAxisSize.min,
      children: [
        if (_pendingAttachments.isNotEmpty)
          Padding(
            padding: const EdgeInsets.only(bottom: 8),
            child: _buildAttachmentPreview(),
          ),
        Container(
          decoration: BoxDecoration(
            color: AppColors.surface.withOpacity(0.92),
            borderRadius: BorderRadius.circular(24),
            border: Border.all(color: AppColors.surfaceLight.withOpacity(0.6)),
          ),
          child: Row(children: [attachButton, textField, sendButton]),
        ),
      ],
    ),
  );
}
/// Builds the horizontally scrolling strip of pending-attachment
/// thumbnails, each overlaid with a tap-to-remove close badge.
Widget _buildAttachmentPreview() {
  // Renders either a decoded image thumbnail or a generic file tile
  // (icon + uppercase extension) for non-image attachments.
  Widget thumbnail(ChatAttachment att) {
    if (att.mediaType.startsWith('image/')) {
      return Image.memory(
        base64Decode(att.base64Data),
        width: 72,
        height: 72,
        fit: BoxFit.cover,
        // Decode at 2x display size to stay crisp on high-DPI screens
        // without caching the full-resolution bitmap.
        cacheWidth: 144,
        cacheHeight: 144,
      );
    }
    return Container(
      width: 72,
      height: 72,
      color: AppColors.surfaceLight,
      child: Column(
        mainAxisAlignment: MainAxisAlignment.center,
        children: [
          const Icon(Icons.description, size: 28, color: AppColors.textSecondary),
          const SizedBox(height: 2),
          Text(
            att.fileName?.split('.').last.toUpperCase() ?? 'FILE',
            style: const TextStyle(fontSize: 10, color: AppColors.textMuted),
            overflow: TextOverflow.ellipsis,
          ),
        ],
      ),
    );
  }

  return SizedBox(
    height: 80,
    child: ListView.builder(
      scrollDirection: Axis.horizontal,
      itemCount: _pendingAttachments.length,
      itemBuilder: (ctx, i) => Stack(
        children: [
          Padding(
            padding: const EdgeInsets.all(4),
            child: ClipRRect(
              borderRadius: BorderRadius.circular(8),
              child: thumbnail(_pendingAttachments[i]),
            ),
          ),
          Positioned(
            top: 0,
            right: 0,
            child: GestureDetector(
              onTap: () => setState(() => _pendingAttachments.removeAt(i)),
              child: Container(
                decoration: const BoxDecoration(
                  color: Colors.black54,
                  shape: BoxShape.circle,
                ),
                child: const Icon(Icons.close, size: 16, color: Colors.white),
              ),
            ),
          ),
        ],
      ),
    ),
  );
}
Widget _buildControls() {
switch (_phase) {
case _CallPhase.ringing:
@ -493,20 +858,27 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
return Row(
mainAxisAlignment: MainAxisAlignment.center,
children: [
_CircleButton(
icon: _isInputExpanded ? Icons.keyboard_hide : Icons.keyboard,
label: _isInputExpanded ? '收起' : '键盘',
isActive: _isInputExpanded,
onTap: () => setState(() => _isInputExpanded = !_isInputExpanded),
),
const SizedBox(width: 16),
_CircleButton(
icon: _isMuted ? Icons.mic_off : Icons.mic,
label: _isMuted ? '取消静音' : '静音',
isActive: _isMuted,
onTap: _toggleMute,
),
const SizedBox(width: 24),
const SizedBox(width: 16),
FloatingActionButton(
heroTag: 'end',
backgroundColor: AppColors.error,
onPressed: _endCall,
child: const Icon(Icons.call_end, color: Colors.white),
),
const SizedBox(width: 24),
const SizedBox(width: 16),
_CircleButton(
icon: _isSpeaker ? Icons.volume_up : Icons.hearing,
label: _isSpeaker ? '扬声器' : '听筒',
@ -525,6 +897,7 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
@override
void dispose() {
_userEndedCall = true;
_textController.dispose();
_durationTimer?.cancel();
_waveTimer?.cancel();
_waveController.dispose();

View File

@ -290,6 +290,49 @@ async def entrypoint(ctx: JobContext) -> None:
logger.info("Voice session started for room %s", ctx.room.name)
# ---------------------------------------------------------------------
# Data channel listener: receive text + attachments from Flutter client
# ---------------------------------------------------------------------
async def _on_data_received(data_packet) -> None:
    """Handle a `text_inject` data-channel packet from the Flutter client.

    Decodes the JSON payload, forwards text + attachments to the agent
    session via ``llm.inject_text_message`` (same session, so the
    conversation context is preserved), and plays the response via TTS.
    Any failure is logged rather than propagated — this runs as a
    fire-and-forget task on the room's event emitter.
    """
    try:
        if data_packet.topic != "text_inject":
            return
        msg = json.loads(data_packet.data.decode("utf-8"))
        text = msg.get("text", "")
        attachments = msg.get("attachments")
        logger.info(
            "text_inject received: text=%s attachments=%d",
            text[:80] if text else "(empty)",
            len(attachments) if attachments else 0,
        )
        if not (text or attachments):
            return
        response = await llm.inject_text_message(
            text=text,
            attachments=attachments,
        )
        if not response:
            logger.warning("inject_text_message returned empty response")
            return
        logger.info("inject response: %s", response[:100])
        session.say(response)
    except Exception as exc:
        logger.error(
            "text_inject handler error: %s: %s",
            type(exc).__name__, exc, exc_info=True,
        )
# Use ensure_future because ctx.room.on() uses a sync event emitter
# (same pattern as the "disconnected" handler above)
ctx.room.on("data_received", lambda dp: asyncio.ensure_future(_on_data_received(dp)))
except Exception as exc:
logger.error(
"Voice session failed for room %s: %s: %s",

View File

@ -48,6 +48,7 @@ class AgentServiceLLM(llm.LLM):
self._auth_header = auth_header
self._engine_type = engine_type
self._agent_session_id: str | None = None
self._injecting = False # guard: don't clear session during inject
@property
def model(self) -> str:
@ -74,6 +75,176 @@ class AgentServiceLLM(llm.LLM):
conn_options=conn_options,
)
async def inject_text_message(
    self,
    *,
    text: str = "",
    attachments: list[dict] | None = None,
) -> str:
    """Inject a text message (with optional attachments) into the agent session.

    Returns the complete response text for TTS playback via session.say(),
    or an empty string when there is nothing to send or the inject fails.
    Uses the same session ID so conversation context is preserved.

    While the request is in flight, ``_injecting`` is raised so the voice
    streaming path does not clear the shared session ID on abort errors.
    """
    if not (text or attachments):
        return ""
    self._injecting = True
    try:
        return await self._do_inject(text, attachments)
    except Exception as exc:
        logger.error("inject_text_message error: %s: %s", type(exc).__name__, exc)
        return ""
    finally:
        # Always drop the guard, even on success or unexpected errors.
        self._injecting = False
async def _do_inject(
    self,
    text: str,
    attachments: list[dict] | None,
) -> str:
    """Execute inject: WS+HTTP stream, collect full response text.

    Flow: open a WebSocket to agent-service, pre-subscribe to the existing
    session (if any), POST a new task over HTTP, re-subscribe with the
    returned session/task IDs, then drain ``stream_event`` messages until a
    ``completed`` event or the overall deadline, concatenating text chunks.

    Returns the collected response text ("" on failure or timeout).
    """
    # Local import: time is only needed for the streaming deadline below.
    import time
    agent_url = self._agent_service_url
    auth_header = self._auth_header
    headers: dict[str, str] = {"Content-Type": "application/json"}
    if auth_header:
        headers["Authorization"] = auth_header
    # Derive the WS endpoint from the HTTP base URL by scheme rewrite.
    ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
    ws_url = f"{ws_url}/ws/agent"
    # Overall budget for the whole streamed response.
    timeout_secs = 120
    engine_type = self._engine_type
    # NOTE(review): voiceMode appears tied to this one engine — confirm
    # other engines intentionally skip the voice system prompt.
    voice_mode = engine_type == "claude_agent_sdk"
    body: dict[str, Any] = {
        # agent-service requires a non-empty prompt; use a placeholder
        # when the user sent attachments only.
        "prompt": text if text else "(see attachments)",
        "engineType": engine_type,
        "voiceMode": voice_mode,
    }
    if voice_mode:
        # Same system prompt as the voice pipeline: concise, spoken-style
        # Chinese answers suitable for TTS, no markdown/tool noise.
        body["systemPrompt"] = (
            "你正在通过语音与用户实时对话。请严格遵守以下规则:\n"
            "1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n"
            "2. 用简洁自然的口语中文回答,像面对面对话一样\n"
            "3. 回复要简短精炼适合语音播报通常1-3句话即可\n"
            "4. 不要使用markdown格式、代码块、列表符号等文本格式"
        )
    # Reuse the voice pipeline's session so conversation context carries over.
    if self._agent_session_id:
        body["sessionId"] = self._agent_session_id
    if attachments:
        body["attachments"] = attachments
    logger.info(
        "inject POST /tasks engine=%s text=%s attachments=%d",
        engine_type,
        text[:80] if text else "(empty)",
        len(attachments) if attachments else 0,
    )
    collected_text = ""
    async with websockets.connect(
        ws_url,
        open_timeout=10,
        close_timeout=5,
        ping_interval=20,
        ping_timeout=10,
    ) as ws:
        # Pre-subscribe before creating the task so early events for the
        # existing session are not missed.
        if self._agent_session_id:
            await ws.send(json.dumps({
                "event": "subscribe_session",
                "data": {"sessionId": self._agent_session_id},
            }))
        # Create task via HTTP; the response carries the authoritative
        # session/task IDs for the subscription below.
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
        ) as client:
            resp = await client.post(
                f"{agent_url}/api/v1/agent/tasks",
                json=body,
                headers=headers,
            )
        if resp.status_code not in (200, 201):
            logger.error(
                "inject task creation failed: %d %s",
                resp.status_code, resp.text[:200],
            )
            return ""
        data = resp.json()
        session_id = data.get("sessionId", "")
        task_id = data.get("taskId", "")
        # Persist the session ID so the voice pipeline keeps using it.
        self._agent_session_id = session_id
        logger.info(
            "inject task created: session=%s, task=%s",
            session_id, task_id,
        )
        # Subscribe with actual IDs returned by the task creation.
        await ws.send(json.dumps({
            "event": "subscribe_session",
            "data": {"sessionId": session_id, "taskId": task_id},
        }))
        # Stream events → collect text until `completed` or the deadline.
        deadline = time.time() + timeout_secs
        while time.time() < deadline:
            remaining = deadline - time.time()
            try:
                # Cap each recv at 30s so keepalive-quiet streams still
                # get periodic deadline checks.
                raw = await asyncio.wait_for(
                    ws.recv(), timeout=min(30.0, remaining)
                )
            except asyncio.TimeoutError:
                # Loop condition terminates once the deadline has passed.
                if time.time() >= deadline:
                    logger.warning("inject stream timeout after %ds", timeout_secs)
                continue
            except websockets.exceptions.ConnectionClosed:
                # Server dropped the socket; return whatever we collected.
                logger.warning("inject WS connection closed")
                break
            try:
                msg = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                # Ignore non-JSON frames (e.g. keepalives).
                continue
            event_type = msg.get("event", "")
            if event_type == "stream_event":
                evt_data = msg.get("data", {})
                evt_type = evt_data.get("type", "")
                if evt_type == "text":
                    content = evt_data.get("content", "")
                    if content:
                        collected_text += content
                elif evt_type == "completed":
                    logger.info(
                        "inject stream completed, text length=%d",
                        len(collected_text),
                    )
                    return collected_text
                elif evt_type == "error":
                    err_msg = evt_data.get("message", "Unknown error")
                    logger.error("inject error: %s", err_msg)
                    # Aborted/exited sessions are dead — drop the ID so the
                    # next task starts a fresh session.
                    if "aborted" in err_msg.lower() or "exited" in err_msg.lower():
                        self._agent_session_id = None
                    # Return partial text if any was streamed before the error.
                    return collected_text if collected_text else ""
    # Deadline elapsed (or socket closed) without a `completed` event.
    return collected_text
class AgentServiceLLMStream(llm.LLMStream):
"""Streams text from agent-service via WebSocket."""
@ -336,9 +507,13 @@ class AgentServiceLLMStream(llm.LLMStream):
logger.error("Agent error: %s", err_msg)
# Clear session so next task starts fresh
# (don't try to resume a dead/aborted session)
# But skip if inject is in progress — it owns the session
if "aborted" in err_msg.lower() or "exited" in err_msg.lower():
logger.info("Clearing agent session after abort/exit")
self._llm_instance._agent_session_id = None
if not self._llm_instance._injecting:
logger.info("Clearing agent session after abort/exit")
self._llm_instance._agent_session_id = None
else:
logger.info("Skipping session clear — inject in progress")
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,