Improve a bit workers. bug : Bundle reset after all backtest finish
This commit is contained in:
@@ -457,6 +457,30 @@ public class BacktestComputeWorker : BackgroundService
|
||||
}
|
||||
|
||||
var previousStatus = bundleRequest.Status;
|
||||
|
||||
// CRITICAL: If bundle is already in a final state (Completed/Failed with CompletedAt set),
|
||||
// don't overwrite it unless we're detecting a legitimate change
|
||||
if (bundleRequest.CompletedAt.HasValue &&
|
||||
(bundleRequest.Status == BundleBacktestRequestStatus.Completed ||
|
||||
bundleRequest.Status == BundleBacktestRequestStatus.Failed))
|
||||
{
|
||||
// Bundle already finalized, only update if job counts indicate it should be re-opened
|
||||
// (This shouldn't happen in normal flow, but guards against race conditions)
|
||||
if (completedJobs + failedJobs == totalJobs)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Bundle {BundleRequestId} already completed/failed. Skipping status update.",
|
||||
bundleRequestId);
|
||||
return; // Don't modify a completed bundle
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Bundle {BundleRequestId} was marked as completed/failed but has incomplete jobs ({Completed}+{Failed}/{Total}). Reopening.",
|
||||
bundleRequestId, completedJobs, failedJobs, totalJobs);
|
||||
// Allow the update to proceed to fix inconsistent state
|
||||
}
|
||||
}
|
||||
|
||||
// Update bundle request progress
|
||||
bundleRequest.CompletedBacktests = completedJobs;
|
||||
@@ -483,11 +507,14 @@ public class BacktestComputeWorker : BackgroundService
|
||||
bundleRequest.CompletedAt = DateTime.UtcNow;
|
||||
bundleRequest.CurrentBacktest = null;
|
||||
}
|
||||
else if (runningJobs > 0)
|
||||
else if (runningJobs > 0 || completedJobs > 0 || failedJobs > 0)
|
||||
{
|
||||
// Some jobs still running
|
||||
// Some jobs are running, or some have completed/failed (meaning work has started)
|
||||
// Once a bundle has started processing, it should stay "Running" until all jobs are done
|
||||
bundleRequest.Status = BundleBacktestRequestStatus.Running;
|
||||
}
|
||||
// If all jobs are still pending (completedJobs = 0, failedJobs = 0, runningJobs = 0),
|
||||
// keep the current status (likely Pending)
|
||||
|
||||
// Update results list from completed jobs
|
||||
var completedJobResults = jobs
|
||||
@@ -554,11 +581,68 @@ public class BacktestComputeWorker : BackgroundService
|
||||
using var scope = _scopeFactory.CreateScope();
|
||||
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
|
||||
|
||||
// Get stale jobs for this worker
|
||||
// Get running jobs for this worker
|
||||
var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
|
||||
|
||||
// CRITICAL FIX: Check for jobs stuck at 100% progress
|
||||
// These jobs completed execution but their status wasn't updated to Completed
|
||||
// This causes the worker to think it's at max capacity
|
||||
var stuckCompletedJobs = runningJobs
|
||||
.Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100)
|
||||
.ToList();
|
||||
|
||||
if (stuckCompletedJobs.Any())
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"🔧 Found {Count} jobs stuck at 100% progress for worker {WorkerId}. Auto-completing them.",
|
||||
stuckCompletedJobs.Count, _options.WorkerId);
|
||||
|
||||
foreach (var stuckJob in stuckCompletedJobs)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
|
||||
stuckJob.Id, stuckJob.StartedAt);
|
||||
|
||||
stuckJob.Status = JobStatus.Completed;
|
||||
stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
|
||||
stuckJob.LastHeartbeat = DateTime.UtcNow;
|
||||
|
||||
// Add note to error message if not already set
|
||||
if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
|
||||
{
|
||||
stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
|
||||
}
|
||||
|
||||
await jobRepository.UpdateAsync(stuckJob);
|
||||
|
||||
// Clean up progress tracker if still present
|
||||
_jobProgressTrackers.TryRemove(stuckJob.Id, out _);
|
||||
_runningJobTasks.TryRemove(stuckJob.Id, out _);
|
||||
|
||||
// Update bundle request if this is part of a bundle
|
||||
if (stuckJob.BundleRequestId.HasValue)
|
||||
{
|
||||
try
|
||||
{
|
||||
await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, scope.ServiceProvider);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
|
||||
stuckJob.Id);
|
||||
}
|
||||
}
|
||||
|
||||
// Get stale jobs for this worker
|
||||
var now = DateTime.UtcNow;
|
||||
var staleJobs = runningJobs
|
||||
.Where(j => j.JobType == JobType.Backtest &&
|
||||
j.ProgressPercentage < 100 && // Don't mark stuck-at-100% jobs as stale
|
||||
(
|
||||
// Stale heartbeat (no heartbeat in timeout period)
|
||||
j.LastHeartbeat == null ||
|
||||
|
||||
Reference in New Issue
Block a user