fix(dingtalk): 55s bridge timeout + batchSend fallback for expired webhooks
Root cause of the "Bridge call failed" errors: the bridge /task endpoint defaults to a 25s agent-reply timeout, but LLM calls through the iConsulting gateway can take 30-60s. Fix: pass timeoutSeconds=55 explicitly in the POST body. Also add a batchSend fallback in routeToAgent: if the sessionWebhook has expired by the time the LLM replies (i.e. the user sent a message, the LLM took >30s, and the webhook window closed in the meantime), the reply is now sent via proactive batchSend using senderStaffId instead of being silently dropped. If senderStaffId is not available, the reply still cannot be delivered, but a warning is logged. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5874907300
commit
440819add8
|
|
@ -29,6 +29,8 @@
|
|||
* - DingTalk API response capped at 256 KB (prevents memory spike on bad response)
|
||||
* - Bridge (OpenClaw) response also capped at 256 KB
|
||||
* - Dual routing: senderStaffId (OAuth binding) + senderId (code binding) both handled
|
||||
* - Bridge task timeout explicitly set to 55s (bridge default 25s is too short for LLM)
|
||||
* - sessionWebhook expiry fallback: if webhook expires before LLM replies, uses batchSend
|
||||
* - Periodic cleanup for all in-memory maps (5 min interval)
|
||||
*/
|
||||
|
||||
|
|
@ -78,7 +80,7 @@ const OAUTH_STATE_TTL_MS = 10 * 60 * 1000; // 10 min
|
|||
const TOKEN_REFRESH_BUFFER = 300; // seconds before expiry to proactively refresh
|
||||
const WS_RECONNECT_BASE_MS = 2_000;
|
||||
const WS_RECONNECT_MAX_MS = 60_000;
|
||||
const TASK_TIMEOUT_S = 30;
|
||||
const TASK_TIMEOUT_S = 55; // seconds — bridge default is 25s; must pass explicitly
|
||||
const DEDUP_TTL_MS = 10 * 60 * 1000;
|
||||
const RATE_LIMIT_PER_MIN = 10;
|
||||
const QUEUE_MAX_DEPTH = 5;
|
||||
|
|
@ -622,8 +624,10 @@ export class DingTalkRouterService implements OnModuleInit, OnModuleDestroy {
|
|||
prompt: text,
|
||||
sessionKey: `agent:main:dt-${userId}`,
|
||||
idempotencyKey: msg.msgId,
|
||||
// Pass explicit timeout to bridge — default is 25s which is too short for LLM calls.
|
||||
timeoutSeconds: TASK_TIMEOUT_S,
|
||||
},
|
||||
(TASK_TIMEOUT_S + 5) * 1000,
|
||||
(TASK_TIMEOUT_S + 10) * 1000,
|
||||
);
|
||||
|
||||
if (result.ok && result.result !== undefined) {
|
||||
|
|
@ -638,7 +642,41 @@ export class DingTalkRouterService implements OnModuleInit, OnModuleDestroy {
|
|||
reply = '与小龙虾通信时出现错误,请稍后重试。';
|
||||
}
|
||||
|
||||
this.reply(msg, reply);
|
||||
// Try sessionWebhook first; if it has expired by the time we have a reply (LLM took
|
||||
// longer than ~30s), fall back to proactive batchSend so the reply still reaches the user.
|
||||
const webhookExpiry = msg.sessionWebhookExpiredTime > 1e11
|
||||
? msg.sessionWebhookExpiredTime
|
||||
: msg.sessionWebhookExpiredTime * 1000;
|
||||
|
||||
if (Date.now() <= webhookExpiry) {
|
||||
this.reply(msg, reply);
|
||||
} else {
|
||||
this.logger.warn(
|
||||
`sessionWebhook expired for msgId=${msg.msgId} — falling back to batchSend for userId=${userId}`,
|
||||
);
|
||||
const staffId = msg.senderStaffId?.trim();
|
||||
if (staffId) {
|
||||
this.getToken()
|
||||
.then((token) =>
|
||||
this.httpsPost<unknown>(
|
||||
'api.dingtalk.com',
|
||||
'/v1.0/robot/oToMessages/batchSend',
|
||||
{
|
||||
robotCode: this.clientId,
|
||||
userIds: [staffId],
|
||||
msgKey: 'sampleText',
|
||||
msgParam: JSON.stringify({ content: reply }),
|
||||
},
|
||||
{ 'x-acs-dingtalk-access-token': token },
|
||||
),
|
||||
)
|
||||
.catch((e: Error) =>
|
||||
this.logger.error(`batchSend fallback failed for msgId=${msg.msgId}:`, e.message),
|
||||
);
|
||||
} else {
|
||||
this.logger.warn(`No staffId for batchSend fallback, reply lost for msgId=${msg.msgId}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Reply (chunked) ────────────────────────────────────────────────────────
|
||||
|
|
|
|||
Loading…
Reference in New Issue