fix(dingtalk): 55s bridge timeout + batchSend fallback for expired webhooks

Root cause of the "Bridge call failed" errors: the bridge /task endpoint defaults to a 25s agent-reply timeout, but LLM calls through the iConsulting gateway can take 30-60s. Fix: pass timeoutSeconds=55 explicitly in the POST body, and widen the client-side HTTP timeout margin from 5s to 10s above that value. Also add a batchSend fallback in routeToAgent: if the sessionWebhook has expired by the time the LLM replies (the user sent a message, the LLM took >30s, and the webhook window closed), the reply is now sent via proactive batchSend using senderStaffId instead of being silently dropped. If no senderStaffId is available, the reply is still lost, but a warning is logged.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-08 23:33:56 -07:00 · 2026-03-08 23:33:56 -07:00 · 440819add8
parent 5874907300
commit 440819add8
1 changed file with 41 additions and 3 deletions
--- a/packages/services/agent-service/src/infrastructure/dingtalk/dingtalk-router.service.ts
+++ b/packages/services/agent-service/src/infrastructure/dingtalk/dingtalk-router.service.ts
@@ -29,6 +29,8 @@
 *   - DingTalk API response capped at 256 KB (prevents memory spike on bad response)
 *   - Bridge (OpenClaw) response also capped at 256 KB
 *   - Dual routing: senderStaffId (OAuth binding) + senderId (code binding) both handled
 *   - Bridge task timeout explicitly set to 55s (bridge default 25s is too short for LLM)
 *   - sessionWebhook expiry fallback: if webhook expires before LLM replies, uses batchSend
 *   - Periodic cleanup for all in-memory maps (5 min interval)
 */
@@ -78,7 +80,7 @@ const OAUTH_STATE_TTL_MS    = 10 * 60 * 1000;  // 10 min
 const TOKEN_REFRESH_BUFFER  = 300;               // seconds before expiry to proactively refresh
 const WS_RECONNECT_BASE_MS  = 2_000;
 const WS_RECONNECT_MAX_MS   = 60_000;
-const TASK_TIMEOUT_S        = 30;
+const TASK_TIMEOUT_S        = 55; // seconds — bridge default is 25s; must pass explicitly
 const DEDUP_TTL_MS          = 10 * 60 * 1000;
 const RATE_LIMIT_PER_MIN    = 10;
 const QUEUE_MAX_DEPTH       = 5;
@@ -622,8 +624,10 @@ export class DingTalkRouterService implements OnModuleInit, OnModuleDestroy {
          prompt: text,
          sessionKey: `agent:main:dt-${userId}`,
          idempotencyKey: msg.msgId,
          // Pass explicit timeout to bridge — default is 25s which is too short for LLM calls.
          timeoutSeconds: TASK_TIMEOUT_S,
        },
-        (TASK_TIMEOUT_S + 5) * 1000,
+        (TASK_TIMEOUT_S + 10) * 1000,
      );
      if (result.ok && result.result !== undefined) {
@@ -638,7 +642,41 @@ export class DingTalkRouterService implements OnModuleInit, OnModuleDestroy {
      reply = '与小龙虾通信时出现错误，请稍后重试。';
    }
    // Try sessionWebhook first; if it has expired by the time we have a reply (LLM took
    // longer than ~30s), fall back to proactive batchSend so the reply still reaches the user.
    const webhookExpiry = msg.sessionWebhookExpiredTime > 1e11
      ? msg.sessionWebhookExpiredTime
      : msg.sessionWebhookExpiredTime * 1000;
    if (Date.now() <= webhookExpiry) {
      this.reply(msg, reply);
    } else {
      this.logger.warn(
        `sessionWebhook expired for msgId=${msg.msgId} — falling back to batchSend for userId=${userId}`,
      );
      const staffId = msg.senderStaffId?.trim();
      if (staffId) {
        this.getToken()
          .then((token) =>
            this.httpsPost<unknown>(
              'api.dingtalk.com',
              '/v1.0/robot/oToMessages/batchSend',
              {
                robotCode: this.clientId,
                userIds:   [staffId],
                msgKey:    'sampleText',
                msgParam:  JSON.stringify({ content: reply }),
              },
              { 'x-acs-dingtalk-access-token': token },
            ),
          )
          .catch((e: Error) =>
            this.logger.error(`batchSend fallback failed for msgId=${msg.msgId}:`, e.message),
          );
      } else {
        this.logger.warn(`No staffId for batchSend fallback, reply lost for msgId=${msg.msgId}`);
      }
    }
  }
  // ── Reply (chunked) ────────────────────────────────────────────────────────