fix(server-party): heartbeat during waitForAllParticipants
Problem:
- In co_managed_keygen, the server-party waits for the external party after joining.
- No heartbeat is sent during the wait period (up to 5 minutes).
- The session-coordinator has a 120-second inactivity timeout.
- As a result, the server-party is marked as timed_out/failed while it is still waiting.

Fix:
- Send a heartbeat in the waitForAllParticipants polling loop.
- Add a Heartbeat method to the MessageRouterClient interface.
- The heartbeat is sent every 2 seconds, once per poll interval.
- A heartbeat failure only logs a warning; it does not block the wait.

Generated with Claude Code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
c0e292535d
commit
576679ae30
|
|
@ -56,6 +56,7 @@ type SessionStatusInfo struct {
|
|||
type MessageRouterClient interface {
|
||||
RouteMessage(ctx context.Context, sessionID uuid.UUID, fromParty string, toParties []string, roundNumber int, payload []byte) error
|
||||
SubscribeMessages(ctx context.Context, sessionID uuid.UUID, partyID string) (<-chan *MPCMessage, error)
|
||||
Heartbeat(ctx context.Context, partyID string) (int32, error)
|
||||
}
|
||||
|
||||
// SessionInfo contains session information from coordinator
|
||||
|
|
@ -127,7 +128,7 @@ func (uc *ParticipateKeygenUseCase) Execute(
|
|||
// For co_managed_keygen: wait for all N participants to join before proceeding
|
||||
// This is necessary because server parties join immediately but external party joins later
|
||||
if sessionInfo.SessionType == "co_managed_keygen" {
|
||||
- sessionInfo, err = uc.waitForAllParticipants(ctx, input.SessionID, sessionInfo)
|
||||
+ sessionInfo, err = uc.waitForAllParticipants(ctx, input.SessionID, sessionInfo, input.PartyID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -395,6 +396,7 @@ func (uc *ParticipateKeygenUseCase) waitForAllParticipants(
|
|||
ctx context.Context,
|
||||
sessionID uuid.UUID,
|
||||
initialSessionInfo *SessionInfo,
|
||||
partyID string,
|
||||
) (*SessionInfo, error) {
|
||||
logger.Info("Waiting for all participants to join co_managed_keygen session",
|
||||
zap.String("session_id", sessionID.String()),
|
||||
|
|
@ -418,6 +420,17 @@ func (uc *ParticipateKeygenUseCase) waitForAllParticipants(
|
|||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
case <-time.After(pollInterval):
|
||||
// Send heartbeat to keep the party alive during wait
|
||||
// This prevents the session-coordinator from timing out this party
|
||||
_, heartbeatErr := uc.messageRouter.Heartbeat(ctx, partyID)
|
||||
if heartbeatErr != nil {
|
||||
logger.Warn("Failed to send heartbeat during wait",
|
||||
zap.String("session_id", sessionID.String()),
|
||||
zap.String("party_id", partyID),
|
||||
zap.Error(heartbeatErr))
|
||||
// Continue anyway - heartbeat failure is not fatal
|
||||
}
|
||||
|
||||
// Get full session status including participants
|
||||
statusInfo, err := uc.sessionClient.GetSessionStatusFull(ctx, sessionID)
|
||||
if err != nil {
|
||||
|
|
|
|||
Loading…
Reference in New Issue