Enhance BacktestExecutor and BacktestComputeWorker with timeout and memory monitoring. Auto-complete jobs that are stuck at 100% progress in Running status, and fail jobs that run past their timeout. Add overall runtime checks for bundles in BundleBacktestHealthCheckWorker to improve job management and recovery.

2025-12-28 18:56:33 +07:00
parent f84524f93a
commit d1924d9030
3 changed files with 272 additions and 87 deletions
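The BacktestExecutor and BundleBacktestHealthCheckWorker changes referenced in the commit message are not part of the hunks shown below. As a rough sketch of the kind of timeout and memory guard the message describes (the type, method, and limit names here are illustrative, not taken from the repository):

// Hypothetical sketch only: ExecutionGuard, MonitorAsync and the limits are invented names,
// shown to illustrate a runtime/memory watchdog of the sort the commit message describes.
using System;
using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;

public sealed class ExecutionGuard
{
    private readonly TimeSpan _timeout;
    private readonly long _maxMemoryBytes;

    public ExecutionGuard(int timeoutMinutes, int maxMemoryMb)
    {
        _timeout = TimeSpan.FromMinutes(timeoutMinutes);
        _maxMemoryBytes = (long)maxMemoryMb * 1024 * 1024;
    }

    // Polls elapsed time and the process working set, cancelling the backtest's token
    // when either limit is exceeded.
    public async Task MonitorAsync(CancellationTokenSource backtestCts)
    {
        var started = Stopwatch.StartNew();
        using var timer = new PeriodicTimer(TimeSpan.FromSeconds(15));

        while (await timer.WaitForNextTickAsync(backtestCts.Token))
        {
            var memoryExceeded = Process.GetCurrentProcess().WorkingSet64 > _maxMemoryBytes;
            if (started.Elapsed > _timeout || memoryExceeded)
            {
                backtestCts.Cancel();
                break;
            }
        }
    }
}

In the executor, a guard like this would typically run alongside the backtest task, so whichever finishes first decides whether the run completed normally or was aborted.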


@@ -637,7 +637,7 @@ public class BacktestComputeWorker : BackgroundService
            var stuckCompletedJobs = runningJobs
                .Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100)
                .ToList();
            if (stuckCompletedJobs.Any())
            {
                _logger.LogWarning(
@@ -646,42 +646,27 @@ public class BacktestComputeWorker : BackgroundService
                foreach (var stuckJob in stuckCompletedJobs)
                {
-                   _logger.LogWarning(
-                       "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
-                       stuckJob.Id, stuckJob.StartedAt);
-                   stuckJob.Status = JobStatus.Completed;
-                   stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
-                   stuckJob.LastHeartbeat = DateTime.UtcNow;
-                   // Add note to error message if not already set
-                   if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
-                   {
-                       stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
-                   }
-                   await jobRepository.UpdateAsync(stuckJob);
-                   // Clean up progress tracker if still present
-                   _jobProgressTrackers.TryRemove(stuckJob.Id, out _);
-                   _runningJobTasks.TryRemove(stuckJob.Id, out _);
-                   // Update bundle request if this is part of a bundle
-                   if (stuckJob.BundleRequestId.HasValue)
-                   {
-                       try
-                       {
-                           await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, scope.ServiceProvider, stuckJob);
-                       }
-                       catch (Exception ex)
-                       {
-                           _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
-                       }
-                   }
-                   _logger.LogInformation(
-                       "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
-                       stuckJob.Id);
+                   await AutoCompleteStuckJobAsync(stuckJob, jobRepository, scope.ServiceProvider);
                }
            }
+
+           // Also check for jobs that have been running for too long but haven't reached 100%
+           var longRunningJobs = runningJobs
+               .Where(j => j.JobType == JobType.Backtest &&
+                           j.ProgressPercentage < 100 &&
+                           j.StartedAt.HasValue &&
+                           (DateTime.UtcNow - j.StartedAt.Value) > TimeSpan.FromMinutes(_options.JobTimeoutMinutes + 10)) // Extra 10 min grace
+               .ToList();
+
+           if (longRunningJobs.Any())
+           {
+               _logger.LogWarning(
+                   "🔧 Found {Count} jobs running longer than timeout for worker {WorkerId}. Marking as failed.",
+                   longRunningJobs.Count, _options.WorkerId);
+
+               foreach (var longJob in longRunningJobs)
+               {
+                   await HandleLongRunningJobAsync(longJob, jobRepository, scope.ServiceProvider);
+               }
+           }
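The watchdog above only force-fails a job once it has run past _options.JobTimeoutMinutes plus a fixed 10-minute grace period. The options type itself is not part of this diff; a minimal guess at its shape, keeping only the two properties the hunk actually reads (section name and defaults are assumed):

// Assumed options shape: only WorkerId and JobTimeoutMinutes appear in the diff above;
// the section name and default values are illustrative.
public class BacktestComputeWorkerOptions
{
    public const string SectionName = "BacktestComputeWorker";

    // Identity used when claiming jobs and in log messages.
    public string WorkerId { get; set; } = Environment.MachineName;

    // Hard per-job runtime limit; the recovery loop adds a 10-minute grace period on top
    // before force-failing a job (JobTimeoutMinutes + 10).
    public int JobTimeoutMinutes { get; set; } = 60;
}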
@@ -947,6 +932,102 @@ public class BacktestComputeWorker : BackgroundService
        return category == FailureCategory.Transient || category == FailureCategory.SystemError;
    }

    private async Task AutoCompleteStuckJobAsync(Job stuckJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
    {
        try
        {
            _logger.LogWarning(
                "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
                stuckJob.Id, stuckJob.StartedAt);

            stuckJob.Status = JobStatus.Completed;
            stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
            stuckJob.LastHeartbeat = DateTime.UtcNow;

            // Add note to error message if not already set
            if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
            {
                stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
            }

            await jobRepository.UpdateAsync(stuckJob);

            // Clean up progress tracker if still present
            _jobProgressTrackers.TryRemove(stuckJob.Id, out _);
            _runningJobTasks.TryRemove(stuckJob.Id, out _);

            // Update bundle request if this is part of a bundle
            if (stuckJob.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, serviceProvider, stuckJob);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
                }
            }

            _logger.LogInformation(
                "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
                stuckJob.Id);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error auto-completing stuck job {JobId}", stuckJob.Id);
        }
    }

    private async Task HandleLongRunningJobAsync(Job longJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
    {
        try
        {
            var elapsed = longJob.StartedAt.HasValue
                ? (DateTime.UtcNow - longJob.StartedAt.Value).TotalMinutes
                : 0;

            _logger.LogWarning(
                "🔧 Job {JobId} has been running for {Elapsed:F1} minutes (timeout: {TimeoutMinutes}). Failing job.",
                longJob.Id, elapsed, _options.JobTimeoutMinutes);

            // Mark as failed
            longJob.Status = JobStatus.Failed;
            longJob.ErrorMessage = $"Job exceeded maximum runtime of {_options.JobTimeoutMinutes} minutes";
            longJob.FailureCategory = FailureCategory.SystemError;
            longJob.IsRetryable = false;
            longJob.CompletedAt = DateTime.UtcNow;
            longJob.AssignedWorkerId = null;
            longJob.LastHeartbeat = DateTime.UtcNow;

            await jobRepository.UpdateAsync(longJob);

            // Clean up
            _jobProgressTrackers.TryRemove(longJob.Id, out _);
            _runningJobTasks.TryRemove(longJob.Id, out _);

            // Update bundle request if this is part of a bundle
            if (longJob.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(longJob.BundleRequestId.Value, serviceProvider, longJob);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for long-running job {JobId}", longJob.Id);
                }
            }

            // Notify about permanent failure
            await NotifyPermanentFailure(longJob, new TimeoutException($"Job exceeded {elapsed:F1} minutes"), serviceProvider);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error handling long-running job {JobId}", longJob.Id);
        }
    }

    private async Task NotifyPermanentFailure(
        Job job,
        Exception ex,
@@ -956,7 +1037,7 @@ public class BacktestComputeWorker : BackgroundService
        {
            var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
            const string alertsChannel = "2676086723";

            var jobTypeName = job.JobType == JobType.Genetic ? "Genetic" : "Backtest";
            var message = $"🚨 **{jobTypeName} Job Failed Permanently**\n" +
                          $"Job ID: `{job.Id}`\n" +
@@ -965,7 +1046,7 @@ public class BacktestComputeWorker : BackgroundService
                          $"Failure Category: {job.FailureCategory}\n" +
                          $"Error: {ex.Message}\n" +
                          $"Time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss} UTC";

            await webhookService.SendMessage(message, alertsChannel);
        }
        catch (Exception notifyEx)
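The BundleBacktestHealthCheckWorker change mentioned in the commit message (overall runtime checks for bundles) is outside the hunks shown here. A minimal sketch of what such a check could look like, assuming a bundle repository and status enum along the lines of the job entities in this file (all names below are assumptions, not the actual implementation):

// Illustrative only; IBundleRequestRepository, BundleStatus and MaxBundleRuntime are assumed names.
private static readonly TimeSpan MaxBundleRuntime = TimeSpan.FromHours(6);

private async Task FailOverdueBundlesAsync(IBundleRequestRepository bundles)
{
    var running = await bundles.GetByStatusAsync(BundleStatus.Running);

    foreach (var bundle in running)
    {
        if (!bundle.StartedAt.HasValue)
            continue;

        var elapsed = DateTime.UtcNow - bundle.StartedAt.Value;
        if (elapsed <= MaxBundleRuntime)
            continue;

        // The bundle as a whole has exceeded its runtime budget: mark it failed so
        // callers stop waiting on it, mirroring the per-job handling above.
        bundle.Status = BundleStatus.Failed;
        bundle.ErrorMessage = $"Bundle exceeded maximum runtime of {MaxBundleRuntime.TotalHours:F0} hours";
        bundle.CompletedAt = DateTime.UtcNow;
        await bundles.UpdateAsync(bundle);

        _logger.LogWarning("Bundle {BundleId} failed after {Elapsed:F1} minutes of overall runtime",
            bundle.Id, elapsed.TotalMinutes);
    }
}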