diff --git a/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/remote/GrpcClient.kt b/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/remote/GrpcClient.kt
index bb2427c5..dcbab22b 100644
--- a/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/remote/GrpcClient.kt
+++ b/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/remote/GrpcClient.kt
@@ -231,6 +231,27 @@ class GrpcClient @Inject constructor() {
 
     /**
      * Cleanup all connection resources
+     *
+     * [Safety fix - avoid leaking the channel and blocking the main thread]
+     *
+     * Problems with the previous implementation:
+     * 1. channel.awaitTermination() blocks; calling it on the main thread can cause an ANR
+     * 2. If an exception occurred, the channel might not be fully closed, which can lead to:
+     *    - leaked gRPC connections
+     *    - failing follow-up connections (ports/resources still held)
+     *    - steadily growing memory usage
+     * 3. Nothing waited for shutdownNow() to finish, so a forced shutdown might not take effect
+     *
+     * What this implementation does:
+     * 1. Clears the channel/stub/asyncStub references right away so a closed channel is never reused
+     * 2. Initiates shutdown() on the calling thread, then waits for termination on Dispatchers.IO
+     * 3. Graceful shutdown (3 s) -> forced shutdown (1 s), with exceptions logged
+     * 4. Exception paths of the background wait fall back to shutdownNow()
+     *
+     * Scenarios this is meant to prevent:
+     * - Activity.onDestroy() calls cleanup() -> main thread blocks -> ANR
+     * - Rapid disconnect/reconnect on network changes -> leaked channels -> out of memory
+     * - A channel left open after an interruption -> later connections fail
      */
     private fun cleanupConnection() {
         // Cancel reconnect job
@@ -248,22 +269,62 @@ class GrpcClient @Inject constructor() {
         messageStreamVersion.incrementAndGet()
         eventStreamVersion.incrementAndGet()
 
-        // Shutdown channel
-        channel?.let { ch ->
-            try {
-                ch.shutdown()
-                val terminated = ch.awaitTermination(2, TimeUnit.SECONDS)
-                if (!terminated) {
-                    ch.shutdownNow()
+        // Shut the channel down without blocking the caller (which may be the main thread)
+        val channelToShutdown = channel
+        if (channelToShutdown != null) {
+            // Immediately clear references to prevent reuse
+            channel = null
+            stub = null
+            asyncStub = null
+
+            // shutdown() only initiates the shutdown and does not wait, so call it right here:
+            // the channel is then released even if the coroutine below never gets to run
+            // (e.g. because `scope` has already been cancelled).
+            try {
+                channelToShutdown.shutdown()
+                Log.d(TAG, "Channel shutdown initiated, waiting for termination...")
+            } catch (e: Exception) {
+                Log.e(TAG, "Failed to initiate channel shutdown", e)
+            }
+
+            // Only the blocking awaitTermination() calls run on the IO dispatcher
+            scope.launch(Dispatchers.IO) {
+                try {
+                    // Wait up to 3 seconds for graceful shutdown
+                    val gracefullyTerminated = channelToShutdown.awaitTermination(3, TimeUnit.SECONDS)
+
+                    if (!gracefullyTerminated) {
+                        Log.w(TAG, "Channel did not terminate gracefully, forcing shutdown...")
+                        // Force shutdown if graceful shutdown times out
+                        channelToShutdown.shutdownNow()
+
+                        // Wait up to 1 second for forced shutdown
+                        val forcedTerminated = channelToShutdown.awaitTermination(1, TimeUnit.SECONDS)
+
+                        if (!forcedTerminated) {
+                            Log.e(TAG, "Channel failed to terminate after forced shutdown")
+                        } else {
+                            Log.d(TAG, "Channel terminated after forced shutdown")
+                        }
+                    } else {
+                        Log.d(TAG, "Channel terminated gracefully")
+                    }
+                } catch (e: InterruptedException) {
+                    Log.e(TAG, "Interrupted while shutting down channel", e)
+                    // Force shutdown on interruption
+                    channelToShutdown.shutdownNow()
+                    // Note: Don't restore interrupt status here as we're in a coroutine
+                } catch (e: Exception) {
+                    Log.e(TAG, "Unexpected error during channel shutdown", e)
+                    // Attempt force shutdown
+                    try {
+                        channelToShutdown.shutdownNow()
+                    } catch (shutdownError: Exception) {
+                        Log.e(TAG, "Failed to force shutdown channel", shutdownError)
+                    }
                 }
-            } catch (e: Exception) {
-                Log.e(TAG, "Error shutting down channel: ${e.message}")
             }
-            Unit
         }
-        channel = null
-        stub = null
-        asyncStub = null
     }
 
     /**
diff --git a/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/repository/TssRepository.kt b/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/repository/TssRepository.kt
index 35c26a47..95988b05 100644
--- a/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/repository/TssRepository.kt
+++ b/backend/mpc-system/services/service-party-android/app/src/main/java/com/durian/tssparty/data/repository/TssRepository.kt
@@ -105,6 +105,45 @@ class TssRepository @Inject constructor(
         return partyId
     }
 
+    /**
+     * Safe getter for partyId with fallback
+     * Used internally to prevent crashes in edge cases
+     */
+    private fun getPartyIdOrNull(): String? {
+        return if (::partyId.isInitialized) partyId else null
+    }
+
+    /**
+     * Ensure partyId is initialized, throw descriptive error if not
+     *
+     * [Safety fix - fail fast with a clear message instead of an opaque lateinit crash]
+     *
+     * Background:
+     * - partyId is a lateinit var; per the error message below it is expected to be set by registerParty()
+     * - Reading an uninitialized lateinit var throws UninitializedPropertyAccessException
+     * - Several key paths (startSessionEventSubscription, startMessageRouting, ...) read partyId
+     *
+     * Scenarios reported by the original author:
+     * 1. A subscription is triggered on network reconnect before registerParty() has completed
+     * 2. Initialization order is disturbed when an Activity is quickly destroyed and recreated
+     * 3. Repository state is inconsistent when resuming from the background
+     *
+     * Approach:
+     * - Call requirePartyId() wherever the device partyId is needed instead of reading partyId directly
+     * - Log a clear error to help locate the problem
+     * - NOTE: this still throws; callers that may run before registration must handle the exception
+     *
+     * @return partyId if initialized
+     * @throws IllegalStateException if partyId not initialized (with clear error message)
+     */
+    private fun requirePartyId(): String {
+        if (!::partyId.isInitialized) {
+            android.util.Log.e("TssRepository", "partyId not initialized - registerParty() was not called")
+            throw IllegalStateException("partyId not initialized. Call registerParty() first.")
+        }
+        return partyId
+    }
+
     // Track current message routing params for reconnection recovery
     private var currentMessageRoutingSessionId: String? = null
     private var currentMessageRoutingPartyIndex: Int? = null
@@ -260,10 +299,11 @@ class TssRepository @Inject constructor(
      */
     private fun startSessionEventSubscription(subscriptionPartyId: String? = null) {
         sessionEventJob?.cancel()
-        val effectivePartyId = subscriptionPartyId ?: partyId
+        val devicePartyId = requirePartyId() // Ensure partyId is initialized
+        val effectivePartyId = subscriptionPartyId ?: devicePartyId
         // Save for reconnection recovery
         currentSessionEventPartyId = effectivePartyId
-        android.util.Log.d("TssRepository", "Starting session event subscription for partyId: $effectivePartyId (device partyId: $partyId)")
+        android.util.Log.d("TssRepository", "Starting session event subscription for partyId: $effectivePartyId (device partyId: $devicePartyId)")
         sessionEventJob = repositoryScope.launch {
             grpcClient.subscribeSessionEvents(effectivePartyId).collect { event ->
                 android.util.Log.d("TssRepository", "=== Session event received ===")
@@ -367,7 +407,8 @@ class TssRepository @Inject constructor(
     private fun ensureSessionEventSubscriptionActive(signingPartyId: String? = null) {
         // Check if the session event job is still active
         val isActive = sessionEventJob?.isActive == true
-        val effectivePartyId = signingPartyId ?: currentSessionEventPartyId ?: partyId
+        // Resolve the device partyId lazily: the init check runs only when no explicit/saved id exists
+        val effectivePartyId = signingPartyId ?: currentSessionEventPartyId ?: requirePartyId()
         android.util.Log.d("TssRepository", "Checking session event subscription: isActive=$isActive, effectivePartyId=$effectivePartyId")
 
         if (!isActive) {
@@ -1794,7 +1835,8 @@ class TssRepository @Inject constructor(
         currentMessageRoutingPartyIndex = partyIndex
 
         // Use provided routingPartyId, or fall back to device partyId for keygen
-        val effectivePartyId = routingPartyId ?: partyId
+        // requirePartyId() is evaluated (and can throw) only when routingPartyId is null
+        val effectivePartyId = routingPartyId ?: requirePartyId()
 
         // Save for reconnection recovery
         currentMessageRoutingPartyId = effectivePartyId