fix(server-party): heartbeat during waitForAllParticipants

Problem:
- co_managed_keygen server-party waits for external party after joining
- No heartbeat sent during wait period (up to 5 minutes)
- session-coordinator has 120 second inactivity timeout
- Server-party marked as timed_out/failed while waiting

Fix:
- Send heartbeat in waitForAllParticipants polling loop
- Add Heartbeat method to MessageRouterClient interface
- Heartbeat is sent on every poll iteration (every 2 seconds, the poll interval)
- A heartbeat failure only logs a warning; it does not block or abort the wait

Generated with Claude Code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
hailin 2025-12-29 13:04:40 -08:00
parent c0e292535d
commit 576679ae30
1 changed files with 14 additions and 1 deletions

View File

@ -56,6 +56,7 @@ type SessionStatusInfo struct {
// MessageRouterClient is the message-router dependency of this use case. It
// routes MPC messages between parties, subscribes to incoming messages, and
// sends party heartbeats.
type MessageRouterClient interface {
// RouteMessage sends payload from fromParty to toParties for the given
// session and round number.
RouteMessage(ctx context.Context, sessionID uuid.UUID, fromParty string, toParties []string, roundNumber int, payload []byte) error
// SubscribeMessages returns a receive-only channel of MPC messages for
// partyID in the given session (presumably the messages addressed to that
// party; verify against the implementation).
SubscribeMessages(ctx context.Context, sessionID uuid.UUID, partyID string) (<-chan *MPCMessage, error)
// Heartbeat sends a heartbeat for partyID. It is called from the
// waitForAllParticipants polling loop so the party is not timed out while
// it waits for the other participants.
// NOTE(review): the meaning of the int32 result is not visible here; the
// caller in this file discards it. Confirm against the implementation.
Heartbeat(ctx context.Context, partyID string) (int32, error)
}
// SessionInfo contains session information from coordinator
@ -127,7 +128,7 @@ func (uc *ParticipateKeygenUseCase) Execute(
// For co_managed_keygen: wait for all N participants to join before proceeding
// This is necessary because server parties join immediately but external party joins later
if sessionInfo.SessionType == "co_managed_keygen" {
sessionInfo, err = uc.waitForAllParticipants(ctx, input.SessionID, sessionInfo)
sessionInfo, err = uc.waitForAllParticipants(ctx, input.SessionID, sessionInfo, input.PartyID)
if err != nil {
return nil, err
}
@ -395,6 +396,7 @@ func (uc *ParticipateKeygenUseCase) waitForAllParticipants(
ctx context.Context,
sessionID uuid.UUID,
initialSessionInfo *SessionInfo,
partyID string,
) (*SessionInfo, error) {
logger.Info("Waiting for all participants to join co_managed_keygen session",
zap.String("session_id", sessionID.String()),
@ -418,6 +420,17 @@ func (uc *ParticipateKeygenUseCase) waitForAllParticipants(
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(pollInterval):
// Send heartbeat to keep the party alive during wait
// This prevents the session-coordinator from timing out this party
_, heartbeatErr := uc.messageRouter.Heartbeat(ctx, partyID)
if heartbeatErr != nil {
logger.Warn("Failed to send heartbeat during wait",
zap.String("session_id", sessionID.String()),
zap.String("party_id", partyID),
zap.Error(heartbeatErr))
// Continue anyway - heartbeat failure is not fatal
}
// Get full session status including participants
statusInfo, err := uc.sessionClient.GetSessionStatusFull(ctx, sessionID)
if err != nil {