Add timeout and memory monitoring to BacktestExecutor and BacktestComputeWorker. Auto-complete backtest jobs that are stuck at 100% progress, fail jobs that run past their configured timeout, and add an overall-runtime check for bundles in BundleBacktestHealthCheckWorker to improve job management and recovery.
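
The hunks below cover only the stuck-job and long-running-job handling in BacktestComputeWorker; the memory monitoring mentioned above is not part of this excerpt. As a rough illustration of the kind of guard the message describes (a sketch only, not this commit's implementation; `CheckMemoryPressure` and `MaxWorkingSetMb` are hypothetical names), such a check might sample the worker process's working set and warn when it crosses a configured ceiling:

```csharp
// Hypothetical sketch only - not taken from this commit's diff.
// Samples the current process's working set and warns when it exceeds
// a configured limit (MaxWorkingSetMb is an assumed option name).
private void CheckMemoryPressure()
{
    using var process = System.Diagnostics.Process.GetCurrentProcess();
    var workingSetMb = process.WorkingSet64 / (1024 * 1024);

    if (workingSetMb > _options.MaxWorkingSetMb)
    {
        _logger.LogWarning(
            "Worker {WorkerId} working set {WorkingSetMb} MB exceeds configured limit {LimitMb} MB",
            _options.WorkerId, workingSetMb, _options.MaxWorkingSetMb);
    }
}
```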
```diff
@@ -637,7 +637,7 @@ public class BacktestComputeWorker : BackgroundService
 var stuckCompletedJobs = runningJobs
     .Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100)
     .ToList();
 
 if (stuckCompletedJobs.Any())
 {
     _logger.LogWarning(
```
```diff
@@ -646,42 +646,27 @@ public class BacktestComputeWorker : BackgroundService
 
 foreach (var stuckJob in stuckCompletedJobs)
 {
-    _logger.LogWarning(
-        "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
-        stuckJob.Id, stuckJob.StartedAt);
-
-    stuckJob.Status = JobStatus.Completed;
-    stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
-    stuckJob.LastHeartbeat = DateTime.UtcNow;
-
-    // Add note to error message if not already set
-    if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
-    {
-        stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
-    }
-
-    await jobRepository.UpdateAsync(stuckJob);
-
-    // Clean up progress tracker if still present
-    _jobProgressTrackers.TryRemove(stuckJob.Id, out _);
-    _runningJobTasks.TryRemove(stuckJob.Id, out _);
-
-    // Update bundle request if this is part of a bundle
-    if (stuckJob.BundleRequestId.HasValue)
-    {
-        try
-        {
-            await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, scope.ServiceProvider, stuckJob);
-        }
-        catch (Exception ex)
-        {
-            _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
-        }
-    }
-
-    _logger.LogInformation(
-        "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
-        stuckJob.Id);
+    await AutoCompleteStuckJobAsync(stuckJob, jobRepository, scope.ServiceProvider);
 }
 }
 
+// Also check for jobs that have been running for too long but haven't reached 100%
+var longRunningJobs = runningJobs
+    .Where(j => j.JobType == JobType.Backtest &&
+                j.ProgressPercentage < 100 &&
+                j.StartedAt.HasValue &&
+                (DateTime.UtcNow - j.StartedAt.Value) > TimeSpan.FromMinutes(_options.JobTimeoutMinutes + 10)) // Extra 10 min grace
+    .ToList();
+
+if (longRunningJobs.Any())
+{
+    _logger.LogWarning(
+        "🔧 Found {Count} jobs running longer than timeout for worker {WorkerId}. Marking as failed.",
+        longRunningJobs.Count, _options.WorkerId);
+
+    foreach (var longJob in longRunningJobs)
+    {
+        await HandleLongRunningJobAsync(longJob, jobRepository, scope.ServiceProvider);
+    }
+}
+
```
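One note on the long-running check above: a job is only treated as long-running once it has exceeded the configured timeout plus a 10-minute grace period. A tiny standalone illustration (the value 60 for `JobTimeoutMinutes` is assumed here, not taken from the commit):

```csharp
// Illustration only; 60 is an assumed JobTimeoutMinutes value.
var startedAt = DateTime.UtcNow.AddMinutes(-75);            // hypothetical job start, 75 minutes ago
var cutoff = TimeSpan.FromMinutes(60 + 10);                 // timeout + 10-minute grace
var isLongRunning = (DateTime.UtcNow - startedAt) > cutoff; // true: 75 min > 70 min
```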
```diff
@@ -947,6 +932,102 @@ public class BacktestComputeWorker : BackgroundService
     return category == FailureCategory.Transient || category == FailureCategory.SystemError;
 }
 
+private async Task AutoCompleteStuckJobAsync(Job stuckJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
+{
+    try
+    {
+        _logger.LogWarning(
+            "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
+            stuckJob.Id, stuckJob.StartedAt);
+
+        stuckJob.Status = JobStatus.Completed;
+        stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
+        stuckJob.LastHeartbeat = DateTime.UtcNow;
+
+        // Add note to error message if not already set
+        if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
+        {
+            stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
+        }
+
+        await jobRepository.UpdateAsync(stuckJob);
+
+        // Clean up progress tracker if still present
+        _jobProgressTrackers.TryRemove(stuckJob.Id, out _);
+        _runningJobTasks.TryRemove(stuckJob.Id, out _);
+
+        // Update bundle request if this is part of a bundle
+        if (stuckJob.BundleRequestId.HasValue)
+        {
+            try
+            {
+                await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, serviceProvider, stuckJob);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
+            }
+        }
+
+        _logger.LogInformation(
+            "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
+            stuckJob.Id);
+    }
+    catch (Exception ex)
+    {
+        _logger.LogError(ex, "Error auto-completing stuck job {JobId}", stuckJob.Id);
+    }
+}
+
+private async Task HandleLongRunningJobAsync(Job longJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
+{
+    try
+    {
+        var elapsed = longJob.StartedAt.HasValue
+            ? (DateTime.UtcNow - longJob.StartedAt.Value).TotalMinutes
+            : 0;
+
+        _logger.LogWarning(
+            "🔧 Job {JobId} has been running for {Elapsed:F1} minutes (timeout: {TimeoutMinutes}). Failing job.",
+            longJob.Id, elapsed, _options.JobTimeoutMinutes);
+
+        // Mark as failed
+        longJob.Status = JobStatus.Failed;
+        longJob.ErrorMessage = $"Job exceeded maximum runtime of {_options.JobTimeoutMinutes} minutes";
+        longJob.FailureCategory = FailureCategory.SystemError;
+        longJob.IsRetryable = false;
+        longJob.CompletedAt = DateTime.UtcNow;
+        longJob.AssignedWorkerId = null;
+        longJob.LastHeartbeat = DateTime.UtcNow;
+
+        await jobRepository.UpdateAsync(longJob);
+
+        // Clean up
+        _jobProgressTrackers.TryRemove(longJob.Id, out _);
+        _runningJobTasks.TryRemove(longJob.Id, out _);
+
+        // Update bundle request if this is part of a bundle
+        if (longJob.BundleRequestId.HasValue)
+        {
+            try
+            {
+                await UpdateBundleRequestProgress(longJob.BundleRequestId.Value, serviceProvider, longJob);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Error updating bundle request progress for long-running job {JobId}", longJob.Id);
+            }
+        }
+
+        // Notify about permanent failure
+        await NotifyPermanentFailure(longJob, new TimeoutException($"Job exceeded {elapsed:F1} minutes"), serviceProvider);
+    }
+    catch (Exception ex)
+    {
+        _logger.LogError(ex, "Error handling long-running job {JobId}", longJob.Id);
+    }
+}
+
 private async Task NotifyPermanentFailure(
     Job job,
     Exception ex,
```
```diff
@@ -956,7 +1037,7 @@ public class BacktestComputeWorker : BackgroundService
 {
     var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
     const string alertsChannel = "2676086723";
 
     var jobTypeName = job.JobType == JobType.Genetic ? "Genetic" : "Backtest";
     var message = $"🚨 **{jobTypeName} Job Failed Permanently**\n" +
                   $"Job ID: `{job.Id}`\n" +
```
```diff
@@ -965,7 +1046,7 @@ public class BacktestComputeWorker : BackgroundService
                   $"Failure Category: {job.FailureCategory}\n" +
                   $"Error: {ex.Message}\n" +
                   $"Time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss} UTC";
 
     await webhookService.SendMessage(message, alertsChannel);
 }
 catch (Exception notifyEx)
```
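
The BundleBacktestHealthCheckWorker change referenced in the commit message does not appear in the excerpt above. A minimal sketch of an overall-runtime guard for a bundle, assuming the bundle exposes a `CreatedAt` timestamp and a `MaxBundleRuntimeMinutes` setting exists (both names are hypothetical here), could look like:

```csharp
// Hypothetical sketch only - the actual BundleBacktestHealthCheckWorker change is not shown above.
// Returns true when a bundle has been running longer than the configured overall limit,
// so the health check can mark it for recovery.
private static bool ExceedsOverallRuntime(DateTime bundleCreatedAtUtc, int maxBundleRuntimeMinutes)
{
    var elapsed = DateTime.UtcNow - bundleCreatedAtUtc;
    return elapsed > TimeSpan.FromMinutes(maxBundleRuntimeMinutes);
}
```

A bundle flagged this way could then be failed or re-queued in the same style as the per-job handling shown in the diff.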