Enhance error handling and logging in BotController, LiveTradingBotGrain, and BotService
- Added specific handling for ServiceUnavailableException in BotController to return user-friendly messages. - Improved logging for Orleans exceptions in both BotController and BotService to provide clearer context during errors. - Implemented verification of position closure status in LiveTradingBotGrain, including timeout handling for closing positions. - Enhanced logging for critical and non-critical operations during bot stop processes to ensure better traceability.
This commit is contained in:
@@ -320,6 +320,36 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable
|
||||
if (!isRestarting)
|
||||
{
|
||||
await CloseAllOpenPositionsAsync();
|
||||
|
||||
// Verify positions are actually closed (even if CloseAllOpenPositionsAsync had timeouts/exceptions)
|
||||
// This ensures we don't report failure if positions were successfully closed despite timeouts
|
||||
try
|
||||
{
|
||||
var botId = this.GetPrimaryKey();
|
||||
var positions = await ServiceScopeHelpers.WithScopedService<ITradingService, IEnumerable<Position>>(
|
||||
_scopeFactory,
|
||||
async tradingService => await tradingService.GetPositionsByInitiatorIdentifierAsync(botId));
|
||||
|
||||
var stillOpenPositions = positions?.Where(p => p.IsOpen() || p.Status.Equals(PositionStatus.New)).ToList() ??
|
||||
new List<Position>();
|
||||
|
||||
if (stillOpenPositions.Any())
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Bot {GrainId} still has {Count} open positions after closure attempt: {Positions}",
|
||||
botId, stillOpenPositions.Count, string.Join(", ", stillOpenPositions.Select(p => p.Identifier)));
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogInformation("Bot {GrainId} - all positions verified as closed", botId);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Don't fail the stop operation if we can't verify positions
|
||||
_logger.LogWarning(ex, "Could not verify position closure status for bot {GrainId}, continuing with stop",
|
||||
this.GetPrimaryKey());
|
||||
}
|
||||
}
|
||||
|
||||
// The check is now against the registry status
|
||||
@@ -332,6 +362,9 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable
|
||||
return;
|
||||
}
|
||||
|
||||
// Track if critical operations succeeded
|
||||
var criticalOperationsSucceeded = false;
|
||||
|
||||
try
|
||||
{
|
||||
StopAndDisposeTimer();
|
||||
@@ -355,17 +388,69 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable
|
||||
|
||||
// Sync state from the volatile TradingBotBase before destroying it
|
||||
SyncStateFromBase();
|
||||
await _state.WriteStateAsync();
|
||||
await SaveBotAsync(BotStatus.Stopped);
|
||||
|
||||
// Critical: Stop the bot and clear the trading bot instance
|
||||
await _tradingBot?.StopBot(reason)!;
|
||||
_tradingBot = null;
|
||||
|
||||
// Mark critical operations as succeeded
|
||||
criticalOperationsSucceeded = true;
|
||||
|
||||
// Non-critical cleanup operations - continue even if these fail
|
||||
try
|
||||
{
|
||||
await _state.WriteStateAsync();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to write state for bot {GrainId}, but bot is stopped", this.GetPrimaryKey());
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await SaveBotAsync(BotStatus.Stopped);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to save bot status for bot {GrainId}, but bot is stopped", this.GetPrimaryKey());
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await UpdateBotRegistryStatus(BotStatus.Stopped);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to update bot registry for bot {GrainId}, but bot is stopped", this.GetPrimaryKey());
|
||||
}
|
||||
|
||||
await UpdateBotRegistryStatus(BotStatus.Stopped);
|
||||
_logger.LogInformation("LiveTradingBotGrain {GrainId} stopped successfully", this.GetPrimaryKey());
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// If critical operations succeeded (bot stopped, positions closed), don't throw
|
||||
// Only throw if critical operations failed
|
||||
if (criticalOperationsSucceeded)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"Cleanup operations failed for bot {GrainId}, but bot was successfully stopped. " +
|
||||
"This is non-critical and the stop operation is considered successful.",
|
||||
this.GetPrimaryKey());
|
||||
// Don't throw - the bot is stopped, which is what matters
|
||||
return;
|
||||
}
|
||||
|
||||
_logger.LogError(ex, "Failed to stop LiveTradingBotGrain {GrainId}", this.GetPrimaryKey());
|
||||
|
||||
// Log Orleans-specific exceptions with additional context for debugging
|
||||
if (Managing.Core.Exceptions.OrleansExceptionHelper.IsOrleansException(ex))
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Orleans exception detected during bot stop: {ExceptionType} - {Message}. " +
|
||||
"This may indicate a timeout or deadlock when closing positions.",
|
||||
ex.GetType().Name, ex.Message);
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
}
|
||||
@@ -1151,13 +1236,75 @@ public class LiveTradingBotGrain : Grain, ILiveTradingBotGrain, IRemindable
|
||||
"Bot {GrainId} has {Count} open positions that will be closed before stopping: {Positions}",
|
||||
botId, openPositions.Count, string.Join(", ", openPositions.Select(p => p.Identifier)));
|
||||
|
||||
// Add timeout for closing positions (30 seconds per position, max 2 minutes total)
|
||||
var maxTotalTime = TimeSpan.FromMinutes(2);
|
||||
var perPositionTimeout = TimeSpan.FromSeconds(30);
|
||||
var startTime = DateTime.UtcNow;
|
||||
|
||||
foreach (var position in openPositions)
|
||||
{
|
||||
// Check if we've exceeded total timeout
|
||||
if (DateTime.UtcNow - startTime > maxTotalTime)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Closing positions for bot {GrainId} exceeded total timeout of {Timeout} seconds. Stopping position closure.",
|
||||
botId, maxTotalTime.TotalSeconds);
|
||||
break;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
_logger.LogInformation("Closing position {PositionId} for bot {GrainId}", position.Identifier,
|
||||
botId);
|
||||
await ClosePositionAsync(position.Identifier);
|
||||
|
||||
// Use timeout for individual position closure
|
||||
using var cts = new CancellationTokenSource(perPositionTimeout);
|
||||
var closeTask = ClosePositionAsync(position.Identifier);
|
||||
var timeoutTask = Task.Delay(perPositionTimeout, cts.Token);
|
||||
|
||||
var completedTask = await Task.WhenAny(closeTask, timeoutTask);
|
||||
if (completedTask == timeoutTask)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Closing position {PositionId} for bot {GrainId} timed out after {Timeout} seconds. " +
|
||||
"Verifying if position was actually closed...",
|
||||
position.Identifier, botId, perPositionTimeout.TotalSeconds);
|
||||
|
||||
// Verify if position was actually closed despite timeout
|
||||
try
|
||||
{
|
||||
var updatedPositions = await ServiceScopeHelpers.WithScopedService<ITradingService, IEnumerable<Position>>(
|
||||
_scopeFactory,
|
||||
async tradingService => await tradingService.GetPositionsByInitiatorIdentifierAsync(botId));
|
||||
|
||||
var updatedPosition = updatedPositions?.FirstOrDefault(p => p.Identifier == position.Identifier);
|
||||
if (updatedPosition != null && !updatedPosition.IsOpen() && updatedPosition.Status != PositionStatus.New)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Position {PositionId} was actually closed despite timeout. Status: {Status}",
|
||||
position.Identifier, updatedPosition.Status);
|
||||
// Position is closed, continue
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Position {PositionId} may not be closed. Status: {Status}",
|
||||
position.Identifier, updatedPosition != null ? updatedPosition.Status.ToString() : "null");
|
||||
// Continue with other positions - we'll verify all positions at the end
|
||||
}
|
||||
}
|
||||
catch (Exception verifyEx)
|
||||
{
|
||||
_logger.LogWarning(verifyEx,
|
||||
"Could not verify position {PositionId} closure status after timeout",
|
||||
position.Identifier);
|
||||
}
|
||||
|
||||
// Continue with other positions
|
||||
continue;
|
||||
}
|
||||
|
||||
await closeTask;
|
||||
_logger.LogInformation("Successfully closed position {PositionId} for bot {GrainId}",
|
||||
position.Identifier, botId);
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ using Managing.Application.Bots;
|
||||
using Managing.Application.Bots.Models;
|
||||
using Managing.Common;
|
||||
using Managing.Core;
|
||||
using Managing.Core.Exceptions;
|
||||
using Managing.Domain.Accounts;
|
||||
using Managing.Domain.Bots;
|
||||
using Managing.Domain.Indicators;
|
||||
@@ -60,6 +61,17 @@ namespace Managing.Application.ManageBot
|
||||
catch (Exception e)
|
||||
{
|
||||
_tradingBotLogger.LogError(e, "Error stopping bot {Identifier}", identifier);
|
||||
|
||||
// Check if this is an Orleans exception and convert to user-friendly message
|
||||
if (OrleansExceptionHelper.IsOrleansException(e))
|
||||
{
|
||||
var userMessage = OrleansExceptionHelper.GetUserFriendlyMessage(e, "bot stop");
|
||||
_tradingBotLogger.LogWarning(
|
||||
"Orleans exception during bot stop {Identifier}: {ExceptionType}. Converting to user-friendly message.",
|
||||
identifier, e.GetType().Name);
|
||||
throw new ServiceUnavailableException(userMessage);
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user