diff --git a/src/Managing.Api/Controllers/BotController.cs b/src/Managing.Api/Controllers/BotController.cs index 78e89a3f..fc0b3911 100644 --- a/src/Managing.Api/Controllers/BotController.cs +++ b/src/Managing.Api/Controllers/BotController.cs @@ -7,6 +7,7 @@ using Managing.Application.ManageBot.Commands; using Managing.Application.Shared; using Managing.Common; using Managing.Core; +using Managing.Core.Exceptions; using Managing.Domain.Accounts; using Managing.Domain.Backtests; using Managing.Domain.Bots; @@ -245,9 +246,26 @@ public class BotController : BaseController return Ok(result); } + catch (ServiceUnavailableException ex) + { + // ServiceUnavailableException is already user-friendly (e.g., from Orleans exception conversion) + _logger.LogWarning(ex, "Service unavailable error stopping bot {Identifier}", identifier); + return StatusCode(503, ex.Message); + } catch (Exception ex) { - _logger.LogError(ex, "Error stopping bot"); + _logger.LogError(ex, "Error stopping bot {Identifier}", identifier); + + // Check if this is an Orleans exception that wasn't caught earlier + if (OrleansExceptionHelper.IsOrleansException(ex)) + { + var userMessage = OrleansExceptionHelper.GetUserFriendlyMessage(ex, "bot stop"); + _logger.LogWarning( + "Orleans exception detected in controller for bot {Identifier}: {ExceptionType}", + identifier, ex.GetType().Name); + return StatusCode(503, userMessage); + } + return StatusCode(500, $"Error stopping bot: {ex.Message}"); } } diff --git a/src/Managing.Application/Bots/Grains/LiveTradingBotGrain.cs b/src/Managing.Application/Bots/Grains/LiveTradingBotGrain.cs index 9de6b1dc..5c9c0060 100644 --- a/src/Managing.Application/Bots/Grains/LiveTradingBotGrain.cs +++ b/src/Managing.Application/Bots/Grains/LiveTradingBotGrain.cs @@ -320,6 +320,36 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable if (!isRestarting) { await CloseAllOpenPositionsAsync(); + + // Verify positions are actually closed (even if CloseAllOpenPositionsAsync had timeouts/exceptions) + // This ensures we don't report failure if positions were successfully closed despite timeouts + try + { + var botId = this.GetPrimaryKey(); + var positions = await ServiceScopeHelpers.WithScopedService>( + _scopeFactory, + async tradingService => await tradingService.GetPositionsByInitiatorIdentifierAsync(botId)); + + var stillOpenPositions = positions?.Where(p => p.IsOpen() || p.Status.Equals(PositionStatus.New)).ToList() ?? + new List(); + + if (stillOpenPositions.Any()) + { + _logger.LogWarning( + "Bot {GrainId} still has {Count} open positions after closure attempt: {Positions}", + botId, stillOpenPositions.Count, string.Join(", ", stillOpenPositions.Select(p => p.Identifier))); + } + else + { + _logger.LogInformation("Bot {GrainId} - all positions verified as closed", botId); + } + } + catch (Exception ex) + { + // Don't fail the stop operation if we can't verify positions + _logger.LogWarning(ex, "Could not verify position closure status for bot {GrainId}, continuing with stop", + this.GetPrimaryKey()); + } } // The check is now against the registry status @@ -332,6 +362,9 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable return; } + // Track if critical operations succeeded + var criticalOperationsSucceeded = false; + try { StopAndDisposeTimer(); @@ -355,17 +388,69 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable // Sync state from the volatile TradingBotBase before destroying it SyncStateFromBase(); - await _state.WriteStateAsync(); - await SaveBotAsync(BotStatus.Stopped); + + // Critical: Stop the bot and clear the trading bot instance await _tradingBot?.StopBot(reason)!; _tradingBot = null; + + // Mark critical operations as succeeded + criticalOperationsSucceeded = true; + + // Non-critical cleanup operations - continue even if these fail + try + { + await _state.WriteStateAsync(); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to write state for bot {GrainId}, but bot is stopped", this.GetPrimaryKey()); + } + + try + { + await SaveBotAsync(BotStatus.Stopped); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to save bot status for bot {GrainId}, but bot is stopped", this.GetPrimaryKey()); + } + + try + { + await UpdateBotRegistryStatus(BotStatus.Stopped); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to update bot registry for bot {GrainId}, but bot is stopped", this.GetPrimaryKey()); + } - await UpdateBotRegistryStatus(BotStatus.Stopped); _logger.LogInformation("LiveTradingBotGrain {GrainId} stopped successfully", this.GetPrimaryKey()); } catch (Exception ex) { + // If critical operations succeeded (bot stopped, positions closed), don't throw + // Only throw if critical operations failed + if (criticalOperationsSucceeded) + { + _logger.LogWarning(ex, + "Cleanup operations failed for bot {GrainId}, but bot was successfully stopped. " + + "This is non-critical and the stop operation is considered successful.", + this.GetPrimaryKey()); + // Don't throw - the bot is stopped, which is what matters + return; + } + _logger.LogError(ex, "Failed to stop LiveTradingBotGrain {GrainId}", this.GetPrimaryKey()); + + // Log Orleans-specific exceptions with additional context for debugging + if (Managing.Core.Exceptions.OrleansExceptionHelper.IsOrleansException(ex)) + { + _logger.LogWarning( + "Orleans exception detected during bot stop: {ExceptionType} - {Message}. " + + "This may indicate a timeout or deadlock when closing positions.", + ex.GetType().Name, ex.Message); + } + throw; } } @@ -1151,13 +1236,75 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable "Bot {GrainId} has {Count} open positions that will be closed before stopping: {Positions}", botId, openPositions.Count, string.Join(", ", openPositions.Select(p => p.Identifier))); + // Add timeout for closing positions (30 seconds per position, max 2 minutes total) + var maxTotalTime = TimeSpan.FromMinutes(2); + var perPositionTimeout = TimeSpan.FromSeconds(30); + var startTime = DateTime.UtcNow; + foreach (var position in openPositions) { + // Check if we've exceeded total timeout + if (DateTime.UtcNow - startTime > maxTotalTime) + { + _logger.LogWarning( + "Closing positions for bot {GrainId} exceeded total timeout of {Timeout} seconds. Stopping position closure.", + botId, maxTotalTime.TotalSeconds); + break; + } + try { _logger.LogInformation("Closing position {PositionId} for bot {GrainId}", position.Identifier, botId); - await ClosePositionAsync(position.Identifier); + + // Use timeout for individual position closure + using var cts = new CancellationTokenSource(perPositionTimeout); + var closeTask = ClosePositionAsync(position.Identifier); + var timeoutTask = Task.Delay(perPositionTimeout, cts.Token); + + var completedTask = await Task.WhenAny(closeTask, timeoutTask); + if (completedTask == timeoutTask) + { + _logger.LogWarning( + "Closing position {PositionId} for bot {GrainId} timed out after {Timeout} seconds. " + + "Verifying if position was actually closed...", + position.Identifier, botId, perPositionTimeout.TotalSeconds); + + // Verify if position was actually closed despite timeout + try + { + var updatedPositions = await ServiceScopeHelpers.WithScopedService>( + _scopeFactory, + async tradingService => await tradingService.GetPositionsByInitiatorIdentifierAsync(botId)); + + var updatedPosition = updatedPositions?.FirstOrDefault(p => p.Identifier == position.Identifier); + if (updatedPosition != null && !updatedPosition.IsOpen() && updatedPosition.Status != PositionStatus.New) + { + _logger.LogInformation( + "Position {PositionId} was actually closed despite timeout. Status: {Status}", + position.Identifier, updatedPosition.Status); + // Position is closed, continue + } + else + { + _logger.LogWarning( + "Position {PositionId} may not be closed. Status: {Status}", + position.Identifier, updatedPosition != null ? updatedPosition.Status.ToString() : "null"); + // Continue with other positions - we'll verify all positions at the end + } + } + catch (Exception verifyEx) + { + _logger.LogWarning(verifyEx, + "Could not verify position {PositionId} closure status after timeout", + position.Identifier); + } + + // Continue with other positions + continue; + } + + await closeTask; _logger.LogInformation("Successfully closed position {PositionId} for bot {GrainId}", position.Identifier, botId); } diff --git a/src/Managing.Application/ManageBot/BotService.cs b/src/Managing.Application/ManageBot/BotService.cs index 4ee77463..d4810354 100644 --- a/src/Managing.Application/ManageBot/BotService.cs +++ b/src/Managing.Application/ManageBot/BotService.cs @@ -6,6 +6,7 @@ using Managing.Application.Bots; using Managing.Application.Bots.Models; using Managing.Common; using Managing.Core; +using Managing.Core.Exceptions; using Managing.Domain.Accounts; using Managing.Domain.Bots; using Managing.Domain.Indicators; @@ -60,6 +61,17 @@ namespace Managing.Application.ManageBot catch (Exception e) { _tradingBotLogger.LogError(e, "Error stopping bot {Identifier}", identifier); + + // Check if this is an Orleans exception and convert to user-friendly message + if (OrleansExceptionHelper.IsOrleansException(e)) + { + var userMessage = OrleansExceptionHelper.GetUserFriendlyMessage(e, "bot stop"); + _tradingBotLogger.LogWarning( + "Orleans exception during bot stop {Identifier}: {ExceptionType}. Converting to user-friendly message.", + identifier, e.GetType().Name); + throw new ServiceUnavailableException(userMessage); + } + throw; } }