Improve a bit workers. bug : Bundle reset after all backtest finish

This commit is contained in:
2025-11-11 05:30:40 +07:00
parent c6becb032b
commit 8a27155418
11 changed files with 198 additions and 18 deletions

View File

@@ -457,6 +457,30 @@ public class BacktestComputeWorker : BackgroundService
}
var previousStatus = bundleRequest.Status;
// CRITICAL: If bundle is already in a final state (Completed/Failed with CompletedAt set),
// don't overwrite it unless we're detecting a legitimate change
if (bundleRequest.CompletedAt.HasValue &&
(bundleRequest.Status == BundleBacktestRequestStatus.Completed ||
bundleRequest.Status == BundleBacktestRequestStatus.Failed))
{
// Bundle already finalized, only update if job counts indicate it should be re-opened
// (This shouldn't happen in normal flow, but guards against race conditions)
if (completedJobs + failedJobs == totalJobs)
{
_logger.LogDebug(
"Bundle {BundleRequestId} already completed/failed. Skipping status update.",
bundleRequestId);
return; // Don't modify a completed bundle
}
else
{
_logger.LogWarning(
"Bundle {BundleRequestId} was marked as completed/failed but has incomplete jobs ({Completed}+{Failed}/{Total}). Reopening.",
bundleRequestId, completedJobs, failedJobs, totalJobs);
// Allow the update to proceed to fix inconsistent state
}
}
// Update bundle request progress
bundleRequest.CompletedBacktests = completedJobs;
@@ -483,11 +507,14 @@ public class BacktestComputeWorker : BackgroundService
bundleRequest.CompletedAt = DateTime.UtcNow;
bundleRequest.CurrentBacktest = null;
}
else if (runningJobs > 0)
else if (runningJobs > 0 || completedJobs > 0 || failedJobs > 0)
{
// Some jobs still running
// Some jobs are running, or some have completed/failed (meaning work has started)
// Once a bundle has started processing, it should stay "Running" until all jobs are done
bundleRequest.Status = BundleBacktestRequestStatus.Running;
}
// If all jobs are still pending (completedJobs = 0, failedJobs = 0, runningJobs = 0),
// keep the current status (likely Pending)
// Update results list from completed jobs
var completedJobResults = jobs
@@ -554,11 +581,68 @@ public class BacktestComputeWorker : BackgroundService
using var scope = _scopeFactory.CreateScope();
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
// Get stale jobs for this worker
// Get running jobs for this worker
var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
// CRITICAL FIX: Check for jobs stuck at 100% progress
// These jobs completed execution but their status wasn't updated to Completed
// This causes the worker to think it's at max capacity
var stuckCompletedJobs = runningJobs
.Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100)
.ToList();
if (stuckCompletedJobs.Any())
{
_logger.LogWarning(
"🔧 Found {Count} jobs stuck at 100% progress for worker {WorkerId}. Auto-completing them.",
stuckCompletedJobs.Count, _options.WorkerId);
foreach (var stuckJob in stuckCompletedJobs)
{
_logger.LogWarning(
"🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
stuckJob.Id, stuckJob.StartedAt);
stuckJob.Status = JobStatus.Completed;
stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
stuckJob.LastHeartbeat = DateTime.UtcNow;
// Add note to error message if not already set
if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
{
stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
}
await jobRepository.UpdateAsync(stuckJob);
// Clean up progress tracker if still present
_jobProgressTrackers.TryRemove(stuckJob.Id, out _);
_runningJobTasks.TryRemove(stuckJob.Id, out _);
// Update bundle request if this is part of a bundle
if (stuckJob.BundleRequestId.HasValue)
{
try
{
await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, scope.ServiceProvider);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
}
}
_logger.LogInformation(
"✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
stuckJob.Id);
}
}
// Get stale jobs for this worker
var now = DateTime.UtcNow;
var staleJobs = runningJobs
.Where(j => j.JobType == JobType.Backtest &&
j.ProgressPercentage < 100 && // Don't mark stuck-at-100% jobs as stale
(
// Stale heartbeat (no heartbeat in timeout period)
j.LastHeartbeat == null ||