Add admin endpoint to delete bundle backtest requests and implement related UI functionality + Add job resilience
This commit is contained in:
@@ -228,5 +228,40 @@ public class AdminController : BaseController
|
|||||||
|
|
||||||
return Ok(response);
|
return Ok(response);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Deletes a bundle backtest request by ID for admin users.
/// Also deletes all related backtests associated with this bundle request.
/// This endpoint does not require user ownership - admins can delete any bundle.
/// </summary>
/// <remarks>
/// The related backtests are deleted before the bundle request itself. The two deletions are
/// separate service calls, so if the first one fails the bundle request is still present and
/// the whole operation can simply be retried; deleting the bundle request first would leave
/// backtests behind that point at a bundle request which no longer exists.
/// The service call that deletes the bundle request returns no result, so
/// <c>BundleRequestDeleted</c> in the response only indicates that the call completed
/// without throwing.
/// </remarks>
/// <param name="id">The ID of the bundle backtest request to delete. Must parse as a GUID.</param>
/// <returns>
/// 403 with an <c>error</c> payload when the caller is not an admin, 400 when
/// <paramref name="id"/> is not a valid GUID, otherwise 200 with
/// <c>BundleRequestDeleted</c> and <c>RelatedBacktestsDeleted</c>.
/// </returns>
[HttpDelete]
[Route("BundleBacktestRequests/{id}")]
public async Task<ActionResult> DeleteBundleBacktestRequest(string id)
{
    if (!await IsUserAdmin())
    {
        _logger.LogWarning("Non-admin user attempted to delete bundle backtest request");
        return StatusCode(403, new { error = "Only admin users can delete bundle backtest requests" });
    }

    if (!Guid.TryParse(id, out var requestId))
    {
        return BadRequest("Invalid bundle request ID format. Must be a valid GUID.");
    }

    // Children first: a failure here leaves the bundle request in place so the delete can be retried.
    var backtestsDeleted = await _backtester.DeleteBacktestsByRequestIdAsync(requestId);

    // Then the bundle request itself.
    await _backtester.DeleteBundleBacktestRequestByIdAsync(requestId);

    // Destructive admin action: leave a trace of what was removed.
    _logger.LogInformation(
        "Admin deleted bundle backtest request {RequestId}; related backtests deleted: {BacktestsDeleted}",
        requestId, backtestsDeleted);

    return Ok(new
    {
        BundleRequestDeleted = true,
        RelatedBacktestsDeleted = backtestsDeleted
    });
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -75,4 +75,7 @@ public interface IBacktestRepository
|
|||||||
|
|
||||||
// Admin summary methods
|
// Admin summary methods
|
||||||
Task<BundleBacktestRequestSummary> GetBundleBacktestRequestsSummaryAsync();
|
Task<BundleBacktestRequestSummary> GetBundleBacktestRequestsSummaryAsync();
|
||||||
|
|
||||||
|
// Admin delete methods - no user filter
|
||||||
|
Task DeleteBundleBacktestRequestByIdAsync(Guid id);
|
||||||
}
|
}
|
||||||
@@ -115,5 +115,8 @@ namespace Managing.Application.Abstractions.Services
|
|||||||
BundleBacktestRequestsFilter? filter = null);
|
BundleBacktestRequestsFilter? filter = null);
|
||||||
|
|
||||||
Task<BundleBacktestRequestSummary> GetBundleBacktestRequestsSummaryAsync();
|
Task<BundleBacktestRequestSummary> GetBundleBacktestRequestsSummaryAsync();
|
||||||
|
|
||||||
|
// Admin delete methods - no user filter
|
||||||
|
Task DeleteBundleBacktestRequestByIdAsync(Guid id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -197,4 +197,7 @@ public class BacktestExecutorAdapter : IBacktester
|
|||||||
|
|
||||||
public Task<BundleBacktestRequestSummary> GetBundleBacktestRequestsSummaryAsync() =>
|
public Task<BundleBacktestRequestSummary> GetBundleBacktestRequestsSummaryAsync() =>
|
||||||
throw new NotImplementedException("Not available in compute worker");
|
throw new NotImplementedException("Not available in compute worker");
|
||||||
|
|
||||||
|
/// <summary>
/// Admin deletion of bundle backtest requests is not offered by this adapter; every call throws.
/// </summary>
/// <param name="id">Ignored.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public Task DeleteBundleBacktestRequestByIdAsync(Guid id)
{
    throw new NotImplementedException("Not available in compute worker");
}
|
||||||
}
|
}
|
||||||
@@ -590,6 +590,11 @@ namespace Managing.Application.Backtests
|
|||||||
return await _backtestRepository.GetBundleBacktestRequestsSummaryAsync();
|
return await _backtestRepository.GetBundleBacktestRequestsSummaryAsync();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Deletes the bundle backtest request with the given ID by delegating to the backtest repository.
/// Only the ID is passed on; no user is supplied to the repository call.
/// </summary>
/// <param name="id">Identifier of the bundle backtest request to delete.</param>
public async Task DeleteBundleBacktestRequestByIdAsync(Guid id) =>
    await _backtestRepository.DeleteBundleBacktestRequestByIdAsync(id);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Sends a LightBacktestResponse to all SignalR subscribers of a bundle request.
|
/// Sends a LightBacktestResponse to all SignalR subscribers of a bundle request.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
using System.Collections.Concurrent;
|
||||||
using System.Text.Json;
|
using System.Text.Json;
|
||||||
using Managing.Application.Abstractions.Repositories;
|
using Managing.Application.Abstractions.Repositories;
|
||||||
using Managing.Application.Abstractions.Services;
|
using Managing.Application.Abstractions.Services;
|
||||||
@@ -22,6 +23,8 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
private readonly ILogger<BacktestComputeWorker> _logger;
|
private readonly ILogger<BacktestComputeWorker> _logger;
|
||||||
private readonly BacktestComputeWorkerOptions _options;
|
private readonly BacktestComputeWorkerOptions _options;
|
||||||
private readonly SemaphoreSlim _instanceSemaphore;
|
private readonly SemaphoreSlim _instanceSemaphore;
|
||||||
|
private readonly ConcurrentDictionary<Guid, Task> _runningJobTasks = new();
|
||||||
|
private readonly CancellationTokenSource _shutdownCts = new();
|
||||||
|
|
||||||
public BacktestComputeWorker(
|
public BacktestComputeWorker(
|
||||||
IServiceScopeFactory scopeFactory,
|
IServiceScopeFactory scopeFactory,
|
||||||
@@ -37,32 +40,78 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||||
{
|
{
|
||||||
_logger.LogInformation(
|
_logger.LogInformation(
|
||||||
"BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrentPerUser: {MaxConcurrentPerUser}, MaxConcurrentPerInstance: {MaxConcurrentPerInstance}, PollInterval: {PollInterval}s",
|
"BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrentPerUser: {MaxConcurrentPerUser}, MaxConcurrentPerInstance: {MaxConcurrentPerInstance}, PollInterval: {PollInterval}s, JobTimeout: {JobTimeoutMinutes}min",
|
||||||
_options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance, _options.JobPollIntervalSeconds);
|
_options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance,
|
||||||
|
_options.JobPollIntervalSeconds, _options.JobTimeoutMinutes);
|
||||||
|
|
||||||
|
// Link cancellation tokens
|
||||||
|
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken, _shutdownCts.Token);
|
||||||
|
var cancellationToken = linkedCts.Token;
|
||||||
|
|
||||||
// Background task for stale job recovery
|
// Background task for stale job recovery
|
||||||
var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(stoppingToken), stoppingToken);
|
var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(cancellationToken), cancellationToken);
|
||||||
|
|
||||||
// Background task for heartbeat updates
|
// Background task for heartbeat updates
|
||||||
var heartbeatTask = Task.Run(() => HeartbeatLoop(stoppingToken), stoppingToken);
|
var heartbeatTask = Task.Run(() => HeartbeatLoop(cancellationToken), cancellationToken);
|
||||||
|
|
||||||
// Main job processing loop
|
// Main job processing loop
|
||||||
while (!stoppingToken.IsCancellationRequested)
|
try
|
||||||
{
|
{
|
||||||
try
|
while (!cancellationToken.IsCancellationRequested)
|
||||||
{
|
{
|
||||||
await ProcessJobsAsync(stoppingToken);
|
try
|
||||||
}
|
{
|
||||||
catch (Exception ex)
|
await ProcessJobsAsync(cancellationToken);
|
||||||
{
|
}
|
||||||
_logger.LogError(ex, "Error in BacktestComputeWorker main loop");
|
catch (OperationCanceledException)
|
||||||
SentrySdk.CaptureException(ex);
|
{
|
||||||
}
|
// Expected during shutdown
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_logger.LogError(ex, "Error in BacktestComputeWorker main loop");
|
||||||
|
SentrySdk.CaptureException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
await Task.Delay(TimeSpan.FromSeconds(_options.JobPollIntervalSeconds), stoppingToken);
|
try
|
||||||
|
{
|
||||||
|
await Task.Delay(TimeSpan.FromSeconds(_options.JobPollIntervalSeconds), cancellationToken);
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
_logger.LogInformation("BacktestComputeWorker stopping, waiting for {Count} running jobs to complete or timeout",
|
||||||
|
_runningJobTasks.Count);
|
||||||
|
|
||||||
_logger.LogInformation("BacktestComputeWorker stopping");
|
// Signal shutdown
|
||||||
|
_shutdownCts.Cancel();
|
||||||
|
|
||||||
|
// Wait for running jobs with timeout
|
||||||
|
var waitTasks = _runningJobTasks.Values.ToArray();
|
||||||
|
if (waitTasks.Length > 0)
|
||||||
|
{
|
||||||
|
var timeoutTask = Task.Delay(TimeSpan.FromMinutes(_options.GracefulShutdownTimeoutMinutes), CancellationToken.None);
|
||||||
|
var completedTask = await Task.WhenAny(Task.WhenAll(waitTasks), timeoutTask);
|
||||||
|
|
||||||
|
if (completedTask == timeoutTask)
|
||||||
|
{
|
||||||
|
_logger.LogWarning("Graceful shutdown timeout reached, {Count} jobs may still be running",
|
||||||
|
_runningJobTasks.Count);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_logger.LogInformation("All running jobs completed during graceful shutdown");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_logger.LogInformation("BacktestComputeWorker stopped");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async Task ProcessJobsAsync(CancellationToken cancellationToken)
|
private async Task ProcessJobsAsync(CancellationToken cancellationToken)
|
||||||
@@ -99,22 +148,32 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
|
|
||||||
// Process the job asynchronously (don't await, let it run in background)
|
// Process the job asynchronously (don't await, let it run in background)
|
||||||
// Create a new scope for the job processing to ensure proper lifetime management
|
// Create a new scope for the job processing to ensure proper lifetime management
|
||||||
_ = Task.Run(async () =>
|
var jobTask = Task.Run(async () =>
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
await ProcessJobAsync(job, cancellationToken);
|
await ProcessJobAsync(job, cancellationToken);
|
||||||
}
|
}
|
||||||
|
catch (OperationCanceledException)
|
||||||
|
{
|
||||||
|
// Handle cancellation gracefully
|
||||||
|
_logger.LogInformation("Job {JobId} was cancelled during processing", job.Id);
|
||||||
|
await HandleJobCancellation(job);
|
||||||
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Error processing job {JobId}", job.Id);
|
_logger.LogError(ex, "Error processing job {JobId}", job.Id);
|
||||||
throw;
|
// Error handling is done in ProcessJobAsync
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
{
|
{
|
||||||
|
_runningJobTasks.TryRemove(job.Id, out _);
|
||||||
_instanceSemaphore.Release();
|
_instanceSemaphore.Release();
|
||||||
}
|
}
|
||||||
}, cancellationToken);
|
}, cancellationToken);
|
||||||
|
|
||||||
|
// Track the running job task
|
||||||
|
_runningJobTasks.TryAdd(job.Id, jobTask);
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
@@ -135,6 +194,8 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
var exchangeService = scope.ServiceProvider.GetRequiredService<IExchangeService>();
|
var exchangeService = scope.ServiceProvider.GetRequiredService<IExchangeService>();
|
||||||
var agentSummaryRepository = scope.ServiceProvider.GetRequiredService<IAgentSummaryRepository>();
|
var agentSummaryRepository = scope.ServiceProvider.GetRequiredService<IAgentSummaryRepository>();
|
||||||
|
|
||||||
|
var jobStartTime = DateTime.UtcNow;
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
_logger.LogInformation(
|
_logger.LogInformation(
|
||||||
@@ -174,6 +235,16 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
// Check if job has been running too long
|
||||||
|
var elapsed = DateTime.UtcNow - jobStartTime;
|
||||||
|
if (elapsed.TotalMinutes > _options.JobTimeoutMinutes)
|
||||||
|
{
|
||||||
|
_logger.LogWarning(
|
||||||
|
"Job {JobId} has been running for {ElapsedMinutes} minutes, exceeding timeout of {TimeoutMinutes} minutes",
|
||||||
|
job.Id, elapsed.TotalMinutes, _options.JobTimeoutMinutes);
|
||||||
|
throw new TimeoutException($"Job exceeded timeout of {_options.JobTimeoutMinutes} minutes");
|
||||||
|
}
|
||||||
|
|
||||||
job.ProgressPercentage = percentage;
|
job.ProgressPercentage = percentage;
|
||||||
job.LastHeartbeat = DateTime.UtcNow;
|
job.LastHeartbeat = DateTime.UtcNow;
|
||||||
await jobRepository.UpdateAsync(job);
|
await jobRepository.UpdateAsync(job);
|
||||||
@@ -181,19 +252,37 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogWarning(ex, "Error updating job progress for job {JobId}", job.Id);
|
_logger.LogWarning(ex, "Error updating job progress for job {JobId}", job.Id);
|
||||||
|
throw; // Re-throw timeout exceptions
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Execute the backtest
|
// Execute the backtest with timeout
|
||||||
var result = await executor.ExecuteAsync(
|
var timeoutCts = new CancellationTokenSource(TimeSpan.FromMinutes(_options.JobTimeoutMinutes));
|
||||||
config,
|
var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);
|
||||||
candles,
|
|
||||||
user,
|
LightBacktest result;
|
||||||
save: true,
|
try
|
||||||
withCandles: false,
|
{
|
||||||
requestId: job.RequestId,
|
result = await executor.ExecuteAsync(
|
||||||
metadata: null,
|
config,
|
||||||
progressCallback: progressCallback);
|
candles,
|
||||||
|
user,
|
||||||
|
save: true,
|
||||||
|
withCandles: false,
|
||||||
|
requestId: job.RequestId,
|
||||||
|
metadata: null,
|
||||||
|
progressCallback: progressCallback);
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException) when (timeoutCts.Token.IsCancellationRequested && !cancellationToken.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
var elapsed = DateTime.UtcNow - jobStartTime;
|
||||||
|
throw new TimeoutException($"Job {job.Id} exceeded timeout of {_options.JobTimeoutMinutes} minutes (ran for {elapsed.TotalMinutes:F1} minutes)");
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
timeoutCts.Dispose();
|
||||||
|
linkedCts.Dispose();
|
||||||
|
}
|
||||||
|
|
||||||
// Update job with result
|
// Update job with result
|
||||||
job.Status = JobStatus.Completed;
|
job.Status = JobStatus.Completed;
|
||||||
@@ -216,9 +305,10 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
// Don't fail the job if this update fails
|
// Don't fail the job if this update fails
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var elapsedTime = DateTime.UtcNow - jobStartTime;
|
||||||
_logger.LogInformation(
|
_logger.LogInformation(
|
||||||
"Completed backtest job {JobId}. Score: {Score}, PnL: {PnL}",
|
"Completed backtest job {JobId}. Score: {Score}, PnL: {PnL}, Duration: {DurationMinutes:F1} minutes",
|
||||||
job.Id, result.Score, result.FinalPnl);
|
job.Id, result.Score, result.FinalPnl, elapsedTime.TotalMinutes);
|
||||||
|
|
||||||
// Update bundle request if this is part of a bundle
|
// Update bundle request if this is part of a bundle
|
||||||
if (job.BundleRequestId.HasValue)
|
if (job.BundleRequestId.HasValue)
|
||||||
@@ -226,6 +316,11 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider);
|
await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Job {JobId} was cancelled", job.Id);
|
||||||
|
throw; // Re-throw to be handled by the caller
|
||||||
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Error processing backtest job {JobId}", job.Id);
|
_logger.LogError(ex, "Error processing backtest job {JobId}", job.Id);
|
||||||
@@ -235,6 +330,62 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Records the outcome of a job whose processing was cancelled, so it does not stay in the
/// <c>Running</c> state.
/// </summary>
/// <remarks>
/// The job is reloaded in a fresh scope. If it is still <c>Running</c> it is either put back to
/// <c>Pending</c> with an incremented retry count and a capped exponential backoff
/// (2^RetryCount minutes, at most <c>MaxRetryDelayMinutes</c>), or - once
/// <c>MaxRetries</c> is reached - marked <c>Failed</c>.
/// For the failed case the new state is saved before anything else happens, then the bundle
/// request progress is refreshed (when the job belongs to a bundle) and the permanent failure
/// is notified. Saving first guarantees that an error in the follow-up steps cannot leave the
/// job stored as <c>Running</c>.
/// Any exception is logged and swallowed: this runs from the job task's cancellation handler
/// and must not fault it.
/// </remarks>
/// <param name="job">The job that was being processed when cancellation was observed.</param>
private async Task HandleJobCancellation(Job job)
{
    try
    {
        using var scope = _scopeFactory.CreateScope();
        var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

        // Reload job to get latest state
        var currentJob = await jobRepository.GetByIdAsync(job.Id);
        if (currentJob == null)
        {
            _logger.LogWarning("Job {JobId} not found when handling cancellation", job.Id);
            return;
        }

        // Only a job that is still marked as running needs to be rescheduled or failed.
        if (currentJob.Status != JobStatus.Running)
        {
            return;
        }

        if (currentJob.RetryCount < currentJob.MaxRetries)
        {
            currentJob.Status = JobStatus.Pending;
            currentJob.RetryCount++;
            var backoffMinutes = Math.Min(Math.Pow(2, currentJob.RetryCount), _options.MaxRetryDelayMinutes);
            currentJob.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
            currentJob.ErrorMessage = $"Worker shutdown - retry {currentJob.RetryCount}/{currentJob.MaxRetries}";
            currentJob.FailureCategory = FailureCategory.Transient;
            currentJob.AssignedWorkerId = null;
            currentJob.LastHeartbeat = null;

            _logger.LogWarning(
                "Job {JobId} cancelled during shutdown, will be retried (attempt {RetryCount}/{MaxRetries})",
                currentJob.Id, currentJob.RetryCount, currentJob.MaxRetries);

            await jobRepository.UpdateAsync(currentJob);
            return;
        }

        currentJob.Status = JobStatus.Failed;
        currentJob.ErrorMessage = "Worker shutdown - exceeded max retries";
        currentJob.FailureCategory = FailureCategory.SystemError;
        currentJob.IsRetryable = false;
        currentJob.CompletedAt = DateTime.UtcNow;
        currentJob.AssignedWorkerId = null;

        _logger.LogError("Job {JobId} cancelled during shutdown and exceeded max retries", currentJob.Id);

        // Persist the terminal state before the best-effort follow-up steps.
        await jobRepository.UpdateAsync(currentJob);

        // Keep the owning bundle request in step with the failed job, as the stale-job recovery does.
        if (currentJob.BundleRequestId.HasValue)
        {
            await UpdateBundleRequestProgress(currentJob.BundleRequestId.Value, scope.ServiceProvider);
        }

        await NotifyPermanentFailure(currentJob, new OperationCanceledException("Worker shutdown"), scope.ServiceProvider);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error handling job cancellation for job {JobId}", job.Id);
    }
}
|
||||||
|
|
||||||
private async Task UpdateBundleRequestProgress(Guid bundleRequestId, IServiceProvider serviceProvider)
|
private async Task UpdateBundleRequestProgress(Guid bundleRequestId, IServiceProvider serviceProvider)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
@@ -373,14 +524,35 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
|
|
||||||
// Get stale jobs for this worker
|
// Get stale jobs for this worker
|
||||||
var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
|
var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
|
||||||
|
var now = DateTime.UtcNow;
|
||||||
var staleJobs = runningJobs
|
var staleJobs = runningJobs
|
||||||
.Where(j => j.JobType == JobType.Backtest &&
|
.Where(j => j.JobType == JobType.Backtest &&
|
||||||
(j.LastHeartbeat == null ||
|
(
|
||||||
j.LastHeartbeat < DateTime.UtcNow.AddMinutes(-_options.StaleJobTimeoutMinutes)))
|
// Stale heartbeat (no heartbeat in timeout period)
|
||||||
|
j.LastHeartbeat == null ||
|
||||||
|
j.LastHeartbeat < now.AddMinutes(-_options.StaleJobTimeoutMinutes) ||
|
||||||
|
// Job running too long (even with recent heartbeat)
|
||||||
|
(j.StartedAt.HasValue &&
|
||||||
|
j.StartedAt.Value < now.AddMinutes(-_options.JobTimeoutMinutes))
|
||||||
|
))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
foreach (var job in staleJobs)
|
foreach (var job in staleJobs)
|
||||||
{
|
{
|
||||||
|
var elapsed = job.StartedAt.HasValue
|
||||||
|
? (DateTime.UtcNow - job.StartedAt.Value).TotalMinutes
|
||||||
|
: (double?)null;
|
||||||
|
var lastHeartbeatAge = job.LastHeartbeat.HasValue
|
||||||
|
? (DateTime.UtcNow - job.LastHeartbeat.Value).TotalMinutes
|
||||||
|
: (double?)null;
|
||||||
|
|
||||||
|
_logger.LogWarning(
|
||||||
|
"Detected stale job {JobId}: Started {StartedAt}, LastHeartbeat: {LastHeartbeat} ({HeartbeatAge} min ago), Elapsed: {Elapsed} min",
|
||||||
|
job.Id, job.StartedAt, job.LastHeartbeat, lastHeartbeatAge, elapsed);
|
||||||
|
|
||||||
|
// Remove from running tasks if still tracked
|
||||||
|
_runningJobTasks.TryRemove(job.Id, out _);
|
||||||
|
|
||||||
// If it's stale but retryable, reset to pending with retry count
|
// If it's stale but retryable, reset to pending with retry count
|
||||||
if (job.RetryCount < job.MaxRetries)
|
if (job.RetryCount < job.MaxRetries)
|
||||||
{
|
{
|
||||||
@@ -388,7 +560,7 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
job.RetryCount++;
|
job.RetryCount++;
|
||||||
var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
|
var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
|
||||||
job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
|
job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
|
||||||
job.ErrorMessage = $"Worker timeout - retry {job.RetryCount}/{job.MaxRetries}";
|
job.ErrorMessage = $"Job timeout/stale - retry {job.RetryCount}/{job.MaxRetries} (ran for {elapsed:F1} min)";
|
||||||
job.FailureCategory = FailureCategory.SystemError;
|
job.FailureCategory = FailureCategory.SystemError;
|
||||||
_logger.LogWarning(
|
_logger.LogWarning(
|
||||||
"Stale job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}",
|
"Stale job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}",
|
||||||
@@ -398,13 +570,17 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
{
|
{
|
||||||
// Exceeded retries - mark as failed
|
// Exceeded retries - mark as failed
|
||||||
job.Status = JobStatus.Failed;
|
job.Status = JobStatus.Failed;
|
||||||
job.ErrorMessage = "Worker timeout - exceeded max retries";
|
job.ErrorMessage = $"Job timeout/stale - exceeded max retries (ran for {elapsed:F1} min)";
|
||||||
job.FailureCategory = FailureCategory.SystemError;
|
job.FailureCategory = FailureCategory.SystemError;
|
||||||
job.IsRetryable = false;
|
job.IsRetryable = false;
|
||||||
job.CompletedAt = DateTime.UtcNow;
|
job.CompletedAt = DateTime.UtcNow;
|
||||||
|
|
||||||
|
_logger.LogError(
|
||||||
|
"Stale job {JobId} exceeded max retries after running for {Elapsed} minutes",
|
||||||
|
job.Id, elapsed);
|
||||||
|
|
||||||
// Notify permanent failure
|
// Notify permanent failure
|
||||||
await NotifyPermanentFailure(job, new TimeoutException("Worker timeout"), scope.ServiceProvider);
|
await NotifyPermanentFailure(job, new TimeoutException($"Job timeout after {elapsed:F1} minutes"), scope.ServiceProvider);
|
||||||
|
|
||||||
// Update bundle request if this is part of a bundle
|
// Update bundle request if this is part of a bundle
|
||||||
if (job.BundleRequestId.HasValue)
|
if (job.BundleRequestId.HasValue)
|
||||||
@@ -423,6 +599,11 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
_logger.LogInformation("Processed {Count} stale backtest jobs", staleJobs.Count);
|
_logger.LogInformation("Processed {Count} stale backtest jobs", staleJobs.Count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (OperationCanceledException)
|
||||||
|
{
|
||||||
|
// Expected during shutdown, don't log as error
|
||||||
|
break;
|
||||||
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Error in stale job recovery loop");
|
_logger.LogError(ex, "Error in stale job recovery loop");
|
||||||
@@ -450,6 +631,11 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
await jobRepository.UpdateAsync(job);
|
await jobRepository.UpdateAsync(job);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (OperationCanceledException)
|
||||||
|
{
|
||||||
|
// Expected during shutdown, don't log as error
|
||||||
|
break;
|
||||||
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Error in heartbeat loop");
|
_logger.LogError(ex, "Error in heartbeat loop");
|
||||||
@@ -571,6 +757,8 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
|
|
||||||
/// <summary>
/// Signals shutdown to the worker's loops and job tasks, then releases the shutdown token
/// source and the instance semaphore before disposing the base service.
/// </summary>
/// <remarks>
/// Safe to call more than once: cancelling a token source that was already disposed throws
/// <see cref="ObjectDisposedException"/>, which is ignored here so that a repeated call does
/// not throw from <c>Dispose</c>.
/// </remarks>
public override void Dispose()
{
    try
    {
        _shutdownCts.Cancel();
    }
    catch (ObjectDisposedException)
    {
        // An earlier Dispose call already released the source; nothing left to signal.
    }

    _shutdownCts.Dispose();
    _instanceSemaphore?.Dispose();
    base.Dispose();
}
|
||||||
@@ -622,5 +810,15 @@ public class BacktestComputeWorkerOptions
|
|||||||
/// Maximum retry delay in minutes (cap for exponential backoff)
|
/// Maximum retry delay in minutes (cap for exponential backoff)
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public int MaxRetryDelayMinutes { get; set; } = 60;
|
public int MaxRetryDelayMinutes { get; set; } = 60;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Maximum time in minutes a job can run before being considered timed out
|
||||||
|
/// </summary>
|
||||||
|
public int JobTimeoutMinutes { get; set; } = 60;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Timeout in minutes to wait for running jobs during graceful shutdown
|
||||||
|
/// </summary>
|
||||||
|
public int GracefulShutdownTimeoutMinutes { get; set; } = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -236,6 +236,11 @@ public class GeneticComputeWorker : BackgroundService
|
|||||||
_logger.LogInformation("Processed {Count} stale genetic jobs", staleJobs.Count);
|
_logger.LogInformation("Processed {Count} stale genetic jobs", staleJobs.Count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (OperationCanceledException)
|
||||||
|
{
|
||||||
|
// Expected during shutdown, don't log as error
|
||||||
|
break;
|
||||||
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Error in stale job recovery loop");
|
_logger.LogError(ex, "Error in stale job recovery loop");
|
||||||
@@ -264,6 +269,11 @@ public class GeneticComputeWorker : BackgroundService
|
|||||||
await jobRepository.UpdateAsync(job);
|
await jobRepository.UpdateAsync(job);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (OperationCanceledException)
|
||||||
|
{
|
||||||
|
// Expected during shutdown, don't log as error
|
||||||
|
break;
|
||||||
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Error in heartbeat loop");
|
_logger.LogError(ex, "Error in heartbeat loop");
|
||||||
|
|||||||
@@ -926,6 +926,20 @@ public class PostgreSqlBacktestRepository : IBacktestRepository
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Removes the bundle backtest request whose <c>RequestId</c> equals <paramref name="id"/>.
/// The lookup filters on the request ID only. When no matching row exists the method
/// completes without making any change.
/// </summary>
/// <param name="id">Request ID of the bundle backtest request to delete.</param>
public async Task DeleteBundleBacktestRequestByIdAsync(Guid id)
{
    // Tracked load so the entity can be handed straight to Remove.
    var bundleRequest = await _context.BundleBacktestRequests
        .AsTracking()
        .FirstOrDefaultAsync(b => b.RequestId == id)
        .ConfigureAwait(false);

    if (bundleRequest == null)
    {
        return;
    }

    _context.BundleBacktestRequests.Remove(bundleRequest);
    await _context.SaveChangesAsync().ConfigureAwait(false);
}
|
||||||
|
|
||||||
public IEnumerable<BundleBacktestRequest> GetBundleBacktestRequestsByStatus(BundleBacktestRequestStatus status)
|
public IEnumerable<BundleBacktestRequest> GetBundleBacktestRequestsByStatus(BundleBacktestRequestStatus status)
|
||||||
{
|
{
|
||||||
var entities = _context.BundleBacktestRequests
|
var entities = _context.BundleBacktestRequests
|
||||||
|
|||||||
@@ -486,6 +486,49 @@ export class AdminClient extends AuthorizedApiBase {
|
|||||||
}
|
}
|
||||||
return Promise.resolve<BundleBacktestRequestSummaryResponse>(null as any);
|
return Promise.resolve<BundleBacktestRequestSummaryResponse>(null as any);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Issues `DELETE {baseUrl}/Admin/BundleBacktestRequests/{id}` and hands the raw response to
 * `processAdmin_DeleteBundleBacktestRequest`.
 *
 * @param id Identifier placed (URI-encoded) into the route; must not be `undefined` or `null`.
 * @throws Error synchronously when `id` is `undefined` or `null`.
 */
admin_DeleteBundleBacktestRequest(id: string): Promise<FileResponse> {
    if (id === undefined || id === null)
        throw new Error("The parameter 'id' must be defined.");

    const endpoint = (this.baseUrl + "/Admin/BundleBacktestRequests/{id}")
        .replace("{id}", encodeURIComponent("" + id))
        .replace(/[?&]$/, "");

    const requestInit: RequestInit = {
        method: "DELETE",
        headers: {
            "Accept": "application/octet-stream"
        }
    };

    return this.transformOptions(requestInit)
        .then(transformed => this.http.fetch(endpoint, transformed))
        .then((httpResponse: Response) => this.processAdmin_DeleteBundleBacktestRequest(httpResponse));
}
|
||||||
|
|
||||||
|
/**
 * Turns the HTTP response of the delete call into a `FileResponse`.
 *
 * - 200 / 206: the body is read as a blob; the file name is taken from the
 *   `content-disposition` header (`filename*=` form first, plain `filename=` as fallback).
 * - 204: resolves with `null`.
 * - any other status: the body is read as text and passed to `throwException`.
 */
protected processAdmin_DeleteBundleBacktestRequest(response: Response): Promise<FileResponse> {
    const status = response.status;

    const _headers: any = {};
    if (response.headers && response.headers.forEach) {
        response.headers.forEach((v: any, k: any) => _headers[k] = v);
    }

    if (status === 204) {
        return Promise.resolve<FileResponse>(null as any);
    }

    if (status !== 200 && status !== 206) {
        return response.text().then((_responseText) => {
            return throwException("An unexpected server error occurred.", status, _responseText, _headers);
        });
    }

    const contentDisposition = response.headers ? response.headers.get("content-disposition") : undefined;

    // Prefer the extended (filename*=) form, which may be percent-encoded.
    const extendedMatch = contentDisposition ? /filename\*=(?:(\\?['"])(.*?)\1|(?:[^\s]+'.*?')?([^;\n]*))/g.exec(contentDisposition) : undefined;
    let fileName = extendedMatch && extendedMatch.length > 1 ? extendedMatch[3] || extendedMatch[2] : undefined;
    if (fileName) {
        fileName = decodeURIComponent(fileName);
    } else {
        const plainMatch = contentDisposition ? /filename="?([^"]*?)"?(;|$)/g.exec(contentDisposition) : undefined;
        fileName = plainMatch && plainMatch.length > 1 ? plainMatch[1] : undefined;
    }

    return response.blob().then(blob => ({ fileName: fileName, data: blob, status: status, headers: _headers }));
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export class BacktestClient extends AuthorizedApiBase {
|
export class BacktestClient extends AuthorizedApiBase {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import {useState} from 'react'
|
import {useState} from 'react'
|
||||||
import {useQuery} from '@tanstack/react-query'
|
import {useMutation, useQuery, useQueryClient} from '@tanstack/react-query'
|
||||||
|
|
||||||
import useApiUrlStore from '../../../app/store/apiStore'
|
import useApiUrlStore from '../../../app/store/apiStore'
|
||||||
import {
|
import {
|
||||||
@@ -28,8 +28,10 @@ const BundleBacktestRequestsSettings: React.FC = () => {
|
|||||||
const [progressPercentageMax, setProgressPercentageMax] = useState<string>('')
|
const [progressPercentageMax, setProgressPercentageMax] = useState<string>('')
|
||||||
const [filtersOpen, setFiltersOpen] = useState<boolean>(false)
|
const [filtersOpen, setFiltersOpen] = useState<boolean>(false)
|
||||||
const [showTable, setShowTable] = useState<boolean>(true)
|
const [showTable, setShowTable] = useState<boolean>(true)
|
||||||
|
const [deleteConfirmRequestId, setDeleteConfirmRequestId] = useState<string | null>(null)
|
||||||
|
|
||||||
const adminClient = new AdminClient({}, apiUrl)
|
const adminClient = new AdminClient({}, apiUrl)
|
||||||
|
const queryClient = useQueryClient()
|
||||||
|
|
||||||
// Fetch bundle backtest requests summary statistics
|
// Fetch bundle backtest requests summary statistics
|
||||||
const {
|
const {
|
||||||
@@ -83,6 +85,33 @@ const BundleBacktestRequestsSettings: React.FC = () => {
|
|||||||
const totalPages = bundleRequestsData?.totalPages || 0
|
const totalPages = bundleRequestsData?.totalPages || 0
|
||||||
const currentPage = bundleRequestsData?.currentPage || 1
|
const currentPage = bundleRequestsData?.currentPage || 1
|
||||||
|
|
||||||
|
// Delete mutation
|
||||||
|
// Mutation that deletes one bundle backtest request through the admin API client.
const deleteMutation = useMutation({
  // Forwards the request id unchanged to the generated admin client call.
  mutationFn: async (requestId: string) => {
    return await adminClient.admin_DeleteBundleBacktestRequest(requestId)
  },
  onSuccess: () => {
    // Invalidate both the paginated list and the summary statistics so they refetch.
    queryClient.invalidateQueries({ queryKey: ['bundleBacktestRequests'] })
    queryClient.invalidateQueries({ queryKey: ['bundleBacktestRequestsSummary'] })
    // Clearing the pending id hides the confirmation modal, which renders only while it is set.
    setDeleteConfirmRequestId(null)
  },
  onError: (error: unknown) => {
    console.error('Failed to delete bundle request:', error)
    // The rejection value is not guaranteed to be an Error (it may even be null/undefined),
    // so read `message` defensively instead of dereferencing it directly.
    const message = (error as { message?: unknown } | null | undefined)?.message
    alert(typeof message === 'string' && message ? message : 'Failed to delete bundle request')
  }
})
|
||||||
|
|
||||||
|
// Row-level delete handler: remembers which request is awaiting confirmation.
// Nothing is deleted here; the id is only stored in component state.
const handleDelete = (requestId: string): void => setDeleteConfirmRequestId(requestId)
|
||||||
|
|
||||||
|
// Runs the delete mutation for the request currently awaiting confirmation.
const confirmDelete = () => {
  // No request selected: nothing to delete.
  if (!deleteConfirmRequestId) {
    return
  }

  deleteMutation.mutate(deleteConfirmRequestId)
}
|
||||||
|
|
||||||
// Pagination callback: stores the newly requested page number in component state.
const handlePageChange = (newPage: number): void => setPage(newPage)
|
||||||
@@ -519,6 +548,7 @@ const BundleBacktestRequestsSettings: React.FC = () => {
|
|||||||
sortOrder={sortOrder}
|
sortOrder={sortOrder}
|
||||||
onPageChange={handlePageChange}
|
onPageChange={handlePageChange}
|
||||||
onSortChange={handleSortChange}
|
onSortChange={handleSortChange}
|
||||||
|
onDelete={handleDelete}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
{error && (
|
{error && (
|
||||||
@@ -526,6 +556,34 @@ const BundleBacktestRequestsSettings: React.FC = () => {
|
|||||||
<span>Failed to load bundle backtest requests. Please try again.</span>
|
<span>Failed to load bundle backtest requests. Please try again.</span>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* Delete Confirmation Modal */}
|
||||||
|
{deleteConfirmRequestId && (
|
||||||
|
<div className="fixed inset-0 bg-black/50 flex items-center justify-center z-50">
|
||||||
|
<div className="bg-base-100 p-6 rounded-lg shadow-xl max-w-md w-full mx-4">
|
||||||
|
<h3 className="text-lg font-semibold mb-4">Confirm Delete</h3>
|
||||||
|
<p className="text-base-content/70 mb-6">
|
||||||
|
Are you sure you want to delete this bundle backtest request? This action cannot be undone and will also delete all related backtests.
|
||||||
|
</p>
|
||||||
|
<div className="flex gap-3 justify-end">
|
||||||
|
<button
|
||||||
|
className="btn btn-ghost"
|
||||||
|
onClick={() => setDeleteConfirmRequestId(null)}
|
||||||
|
disabled={deleteMutation.isPending}
|
||||||
|
>
|
||||||
|
Cancel
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
className="btn btn-error"
|
||||||
|
onClick={confirmDelete}
|
||||||
|
disabled={deleteMutation.isPending}
|
||||||
|
>
|
||||||
|
{deleteMutation.isPending ? 'Deleting...' : 'Delete'}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ interface IBundleBacktestRequestsTable {
|
|||||||
sortOrder: string
|
sortOrder: string
|
||||||
onPageChange: (page: number) => void
|
onPageChange: (page: number) => void
|
||||||
onSortChange: (sortBy: BundleBacktestRequestSortableColumn) => void
|
onSortChange: (sortBy: BundleBacktestRequestSortableColumn) => void
|
||||||
|
onDelete?: (requestId: string) => void
|
||||||
}
|
}
|
||||||
|
|
||||||
const BundleBacktestRequestsTable: React.FC<IBundleBacktestRequestsTable> = ({
|
const BundleBacktestRequestsTable: React.FC<IBundleBacktestRequestsTable> = ({
|
||||||
@@ -28,7 +29,8 @@ const BundleBacktestRequestsTable: React.FC<IBundleBacktestRequestsTable> = ({
|
|||||||
sortBy,
|
sortBy,
|
||||||
sortOrder,
|
sortOrder,
|
||||||
onPageChange,
|
onPageChange,
|
||||||
onSortChange
|
onSortChange,
|
||||||
|
onDelete
|
||||||
}) => {
|
}) => {
|
||||||
const getStatusBadge = (status: string | null | undefined) => {
|
const getStatusBadge = (status: string | null | undefined) => {
|
||||||
if (!status) return <span className="badge badge-sm">-</span>
|
if (!status) return <span className="badge badge-sm">-</span>
|
||||||
@@ -180,8 +182,24 @@ const BundleBacktestRequestsTable: React.FC<IBundleBacktestRequestsTable> = ({
|
|||||||
accessor: (row: BundleBacktestRequestListItemResponse) => (
|
accessor: (row: BundleBacktestRequestListItemResponse) => (
|
||||||
<span className="font-mono text-xs">{row.requestId?.substring(0, 8)}...</span>
|
<span className="font-mono text-xs">{row.requestId?.substring(0, 8)}...</span>
|
||||||
)
|
)
|
||||||
}
|
},
|
||||||
], [sortBy, sortOrder, onSortChange])
|
...(onDelete ? [{
|
||||||
|
id: 'actions',
|
||||||
|
Header: 'Actions',
|
||||||
|
accessor: (row: BundleBacktestRequestListItemResponse) => (
|
||||||
|
<button
|
||||||
|
className="btn btn-sm btn-error btn-outline"
|
||||||
|
onClick={() => onDelete(row.requestId || '')}
|
||||||
|
disabled={!row.requestId}
|
||||||
|
>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" strokeWidth="1.5" stroke="currentColor" className="w-4 h-4">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" d="M14.74 9l-.346 9m-4.788 0L9.26 9m-4.788 0L3.74 9m4.788-4.788L9.26 4.51m4.788 0L14.74 4.51M9.26 4.51l.346-1.5M14.74 4.51l-.346-1.5M3.74 9l4.788 0M14.74 9l4.788 0" />
|
||||||
|
</svg>
|
||||||
|
Delete
|
||||||
|
</button>
|
||||||
|
)
|
||||||
|
}] : [])
|
||||||
|
], [sortBy, sortOrder, onSortChange, onDelete])
|
||||||
|
|
||||||
const tableData = useMemo(() => {
|
const tableData = useMemo(() => {
|
||||||
return bundleRequests.map((request) => ({
|
return bundleRequests.map((request) => ({
|
||||||
|
|||||||
Reference in New Issue
Block a user