Update and fix worker
This commit is contained in:
@@ -52,6 +52,12 @@ public interface IJobRepository
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
Task<int> ResetStaleJobsAsync(int timeoutMinutes = 5);
|
Task<int> ResetStaleJobsAsync(int timeoutMinutes = 5);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Resets all jobs (Running or Pending) assigned to a specific worker back to Pending status.
|
||||||
|
/// Used when a worker restarts to clear jobs from the previous instance.
|
||||||
|
/// </summary>
|
||||||
|
Task<int> ResetJobsByWorkerIdAsync(string workerId);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets all running jobs assigned to a specific worker
|
/// Gets all running jobs assigned to a specific worker
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -45,6 +45,34 @@ public class BacktestComputeWorker : BackgroundService
|
|||||||
_options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance,
|
_options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance,
|
||||||
_options.JobPollIntervalSeconds, _options.JobTimeoutMinutes);
|
_options.JobPollIntervalSeconds, _options.JobTimeoutMinutes);
|
||||||
|
|
||||||
|
// Reset any jobs assigned to this WorkerId from previous worker instances at startup
|
||||||
|
// This is critical when restarting with the same WorkerId (e.g., Environment.MachineName)
|
||||||
|
try
|
||||||
|
{
|
||||||
|
using var scope = _scopeFactory.CreateScope();
|
||||||
|
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
|
||||||
|
|
||||||
|
// First, reset all jobs assigned to this WorkerId (from previous instance)
|
||||||
|
var workerResetCount = await jobRepository.ResetJobsByWorkerIdAsync(_options.WorkerId);
|
||||||
|
if (workerResetCount > 0)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Reset {Count} jobs assigned to worker {WorkerId} from previous instance",
|
||||||
|
workerResetCount, _options.WorkerId);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then, reset any other stale jobs (from other workers or orphaned jobs)
|
||||||
|
var staleResetCount = await jobRepository.ResetStaleJobsAsync(_options.StaleJobTimeoutMinutes);
|
||||||
|
if (staleResetCount > 0)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Reset {Count} stale jobs to Pending status at startup", staleResetCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_logger.LogError(ex, "Error resetting jobs at startup");
|
||||||
|
// Don't fail startup if this fails, but log it
|
||||||
|
}
|
||||||
|
|
||||||
// Link cancellation tokens
|
// Link cancellation tokens
|
||||||
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken, _shutdownCts.Token);
|
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken, _shutdownCts.Token);
|
||||||
var cancellationToken = linkedCts.Token;
|
var cancellationToken = linkedCts.Token;
|
||||||
|
|||||||
@@ -39,6 +39,34 @@ public class GeneticComputeWorker : BackgroundService
|
|||||||
"GeneticComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrent: {MaxConcurrent}, PollInterval: {PollInterval}s",
|
"GeneticComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrent: {MaxConcurrent}, PollInterval: {PollInterval}s",
|
||||||
_options.WorkerId, _options.MaxConcurrentGenetics, _options.JobPollIntervalSeconds);
|
_options.WorkerId, _options.MaxConcurrentGenetics, _options.JobPollIntervalSeconds);
|
||||||
|
|
||||||
|
// Reset any jobs assigned to this WorkerId from previous worker instances at startup
|
||||||
|
// This is critical when restarting with the same WorkerId (e.g., Environment.MachineName)
|
||||||
|
try
|
||||||
|
{
|
||||||
|
using var scope = _scopeFactory.CreateScope();
|
||||||
|
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
|
||||||
|
|
||||||
|
// First, reset all jobs assigned to this WorkerId (from previous instance)
|
||||||
|
var workerResetCount = await jobRepository.ResetJobsByWorkerIdAsync(_options.WorkerId);
|
||||||
|
if (workerResetCount > 0)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Reset {Count} jobs assigned to worker {WorkerId} from previous instance",
|
||||||
|
workerResetCount, _options.WorkerId);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then, reset any other stale jobs (from other workers or orphaned jobs)
|
||||||
|
var staleResetCount = await jobRepository.ResetStaleJobsAsync(_options.StaleJobTimeoutMinutes);
|
||||||
|
if (staleResetCount > 0)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Reset {Count} stale jobs to Pending status at startup", staleResetCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_logger.LogError(ex, "Error resetting jobs at startup");
|
||||||
|
// Don't fail startup if this fails, but log it
|
||||||
|
}
|
||||||
|
|
||||||
// Background task for stale job recovery
|
// Background task for stale job recovery
|
||||||
var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(stoppingToken), stoppingToken);
|
var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(stoppingToken), stoppingToken);
|
||||||
|
|
||||||
|
|||||||
@@ -52,11 +52,13 @@ public class PostgreSqlJobRepository : IJobRepository
|
|||||||
""GeneticRequestId"", ""RetryCount"", ""MaxRetries"", ""RetryAfter"",
|
""GeneticRequestId"", ""RetryCount"", ""MaxRetries"", ""RetryAfter"",
|
||||||
""IsRetryable"", ""FailureCategory""
|
""IsRetryable"", ""FailureCategory""
|
||||||
FROM ""Jobs""
|
FROM ""Jobs""
|
||||||
WHERE ""Status"" = @status";
|
WHERE ""Status"" = @status
|
||||||
|
AND (""RetryAfter"" IS NULL OR ""RetryAfter"" <= @now)";
|
||||||
|
|
||||||
var parameters = new List<NpgsqlParameter>
|
var parameters = new List<NpgsqlParameter>
|
||||||
{
|
{
|
||||||
new NpgsqlParameter("status", NpgsqlDbType.Integer) { Value = (int)JobStatus.Pending }
|
new NpgsqlParameter("status", NpgsqlDbType.Integer) { Value = (int)JobStatus.Pending },
|
||||||
|
new NpgsqlParameter("now", NpgsqlDbType.TimestampTz) { Value = DateTime.UtcNow }
|
||||||
};
|
};
|
||||||
|
|
||||||
if (jobType.HasValue)
|
if (jobType.HasValue)
|
||||||
@@ -255,6 +257,7 @@ public class PostgreSqlJobRepository : IJobRepository
|
|||||||
FROM ""Jobs"" j
|
FROM ""Jobs"" j
|
||||||
WHERE j.""Status"" = @status
|
WHERE j.""Status"" = @status
|
||||||
AND j.""JobType"" = @jobType
|
AND j.""JobType"" = @jobType
|
||||||
|
AND (j.""RetryAfter"" IS NULL OR j.""RetryAfter"" <= @now)
|
||||||
AND (
|
AND (
|
||||||
SELECT COUNT(*)
|
SELECT COUNT(*)
|
||||||
FROM ""Jobs"" running
|
FROM ""Jobs"" running
|
||||||
@@ -271,7 +274,8 @@ public class PostgreSqlJobRepository : IJobRepository
|
|||||||
new NpgsqlParameter("status", NpgsqlDbType.Integer) { Value = (int)JobStatus.Pending },
|
new NpgsqlParameter("status", NpgsqlDbType.Integer) { Value = (int)JobStatus.Pending },
|
||||||
new NpgsqlParameter("jobType", NpgsqlDbType.Integer) { Value = (int)jobType },
|
new NpgsqlParameter("jobType", NpgsqlDbType.Integer) { Value = (int)jobType },
|
||||||
new NpgsqlParameter("runningStatus", NpgsqlDbType.Integer) { Value = (int)JobStatus.Running },
|
new NpgsqlParameter("runningStatus", NpgsqlDbType.Integer) { Value = (int)JobStatus.Running },
|
||||||
new NpgsqlParameter("maxConcurrentPerUser", NpgsqlDbType.Integer) { Value = maxConcurrentPerUser }
|
new NpgsqlParameter("maxConcurrentPerUser", NpgsqlDbType.Integer) { Value = maxConcurrentPerUser },
|
||||||
|
new NpgsqlParameter("now", NpgsqlDbType.TimestampTz) { Value = DateTime.UtcNow }
|
||||||
};
|
};
|
||||||
|
|
||||||
_logger.LogDebug("Claiming random job atomically (maxConcurrentPerUser: {MaxConcurrent})", maxConcurrentPerUser);
|
_logger.LogDebug("Claiming random job atomically (maxConcurrentPerUser: {MaxConcurrent})", maxConcurrentPerUser);
|
||||||
@@ -453,8 +457,14 @@ public class PostgreSqlJobRepository : IJobRepository
|
|||||||
// Use AsTracking() to enable change tracking since DbContext uses NoTracking by default
|
// Use AsTracking() to enable change tracking since DbContext uses NoTracking by default
|
||||||
var staleJobs = await _context.Jobs
|
var staleJobs = await _context.Jobs
|
||||||
.AsTracking()
|
.AsTracking()
|
||||||
.Where(j => j.Status == (int)JobStatus.Running &&
|
.Where(j =>
|
||||||
(j.LastHeartbeat == null || j.LastHeartbeat < timeoutThreshold))
|
// Running jobs with stale or missing heartbeats
|
||||||
|
(j.Status == (int)JobStatus.Running &&
|
||||||
|
(j.LastHeartbeat == null || j.LastHeartbeat < timeoutThreshold)) ||
|
||||||
|
// Pending jobs that were assigned to a worker but never started (stale assignment)
|
||||||
|
(j.Status == (int)JobStatus.Pending &&
|
||||||
|
j.AssignedWorkerId != null &&
|
||||||
|
(j.LastHeartbeat == null || j.LastHeartbeat < timeoutThreshold)))
|
||||||
.ToListAsync();
|
.ToListAsync();
|
||||||
|
|
||||||
foreach (var job in staleJobs)
|
foreach (var job in staleJobs)
|
||||||
@@ -474,6 +484,33 @@ public class PostgreSqlJobRepository : IJobRepository
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Resets every Running or Pending job whose AssignedWorkerId equals
/// <paramref name="workerId"/> back to Pending, clearing its worker assignment and heartbeat.
/// </summary>
/// <param name="workerId">Identifier of the worker whose jobs should be released.</param>
/// <returns>
/// The number of jobs that were reset; 0 when no job matched or when
/// <paramref name="workerId"/> is null, empty, or whitespace.
/// </returns>
public async Task<int> ResetJobsByWorkerIdAsync(string workerId)
{
    // A blank id cannot identify a worker. Without this guard a null id would be translated
    // by EF Core into an IS NULL comparison and match jobs that have no worker assigned at
    // all, loading and "resetting" them and reporting a misleading count.
    if (string.IsNullOrWhiteSpace(workerId))
    {
        _logger.LogWarning("ResetJobsByWorkerIdAsync called with an empty worker id; no jobs were reset");
        return 0;
    }

    // Use AsTracking() to enable change tracking since DbContext uses NoTracking by default
    // Reset ALL jobs (Running or Pending) assigned to this worker - they belong to a previous instance
    var jobsToReset = await _context.Jobs
        .AsTracking()
        .Where(j => j.AssignedWorkerId == workerId &&
                    (j.Status == (int)JobStatus.Running || j.Status == (int)JobStatus.Pending))
        .ToListAsync();

    var count = jobsToReset.Count;
    if (count == 0)
    {
        // Nothing matched, so there is nothing to persist.
        return 0;
    }

    foreach (var job in jobsToReset)
    {
        job.Status = (int)JobStatus.Pending;
        job.AssignedWorkerId = null;
        job.LastHeartbeat = null;
    }

    await _context.SaveChangesAsync();
    _logger.LogInformation("Reset {Count} jobs assigned to worker {WorkerId} back to Pending status", count, workerId);

    return count;
}
|
||||||
|
|
||||||
public async Task<JobSummary> GetSummaryAsync()
|
public async Task<JobSummary> GetSummaryAsync()
|
||||||
{
|
{
|
||||||
// Use ADO.NET directly for aggregation queries to avoid EF Core mapping issues
|
// Use ADO.NET directly for aggregation queries to avoid EF Core mapping issues
|
||||||
|
|||||||
Reference in New Issue
Block a user