using System.Collections.Concurrent;
using System.Text.Json;
using Managing.Application.Abstractions.Repositories;
using Managing.Application.Abstractions.Services;
using Managing.Application.Backtests;
using Managing.Domain.Backtests;
using Managing.Domain.Bots;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using static Managing.Common.Enums;

namespace Managing.Application.Workers;

/// <summary>
/// Background worker that processes backtest jobs from the queue.
/// Polls for pending jobs, claims them using advisory locks, and processes them.
/// </summary>
public class BacktestComputeWorker : BackgroundService
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly ILogger<BacktestComputeWorker> _logger;
    private readonly BacktestComputeWorkerOptions _options;
    private readonly SemaphoreSlim _instanceSemaphore;
    private readonly ConcurrentDictionary<Guid, Task> _runningJobTasks = new();
    private readonly ConcurrentDictionary<Guid, JobProgressTracker> _jobProgressTrackers = new();
    private readonly CancellationTokenSource _shutdownCts = new();

    public BacktestComputeWorker(
        IServiceScopeFactory scopeFactory,
        ILogger<BacktestComputeWorker> logger,
        IOptions<BacktestComputeWorkerOptions> options)
    {
        _scopeFactory = scopeFactory;
        _logger = logger;
        _options = options.Value;
        _instanceSemaphore = new SemaphoreSlim(_options.MaxConcurrentPerInstance, _options.MaxConcurrentPerInstance);
    }
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrentPerUser: {MaxConcurrentPerUser}, MaxConcurrentPerInstance: {MaxConcurrentPerInstance}, PollInterval: {PollInterval}s, JobTimeout: {JobTimeoutMinutes}min",
            _options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance,
            _options.JobPollIntervalSeconds, _options.JobTimeoutMinutes);

        // Reset any jobs assigned to this WorkerId from previous worker instances at startup
        // This is critical when restarting with the same WorkerId (e.g., Environment.MachineName)
        try
        {
            using var scope = _scopeFactory.CreateScope();
            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

            // First, reset all jobs assigned to this WorkerId (from previous instance)
            var workerResetCount = await jobRepository.ResetJobsByWorkerIdAsync(_options.WorkerId);
            if (workerResetCount > 0)
            {
                _logger.LogInformation("Reset {Count} jobs assigned to worker {WorkerId} from previous instance",
                    workerResetCount, _options.WorkerId);
            }

            // Then, reset any other stale jobs (from other workers or orphaned jobs)
            var staleResetCount = await jobRepository.ResetStaleJobsAsync(_options.StaleJobTimeoutMinutes);
            if (staleResetCount > 0)
            {
                _logger.LogInformation("Reset {Count} stale jobs to Pending status at startup", staleResetCount);
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error resetting jobs at startup");
            // Don't fail startup if this fails, but log it
        }

        // Link cancellation tokens
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken, _shutdownCts.Token);
        var cancellationToken = linkedCts.Token;

        // Background task for stale job recovery
        var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(cancellationToken), cancellationToken);

        // Background task for heartbeat updates
        var heartbeatTask = Task.Run(() => HeartbeatLoop(cancellationToken), cancellationToken);

        // Background task for progress persistence
        var progressPersistenceTask = Task.Run(() => ProgressPersistenceLoop(cancellationToken), cancellationToken);

        // Main job processing loop
        try
        {
            while (!cancellationToken.IsCancellationRequested)
            {
                try
                {
                    await ProcessJobsAsync(cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // Expected during shutdown
                    break;
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error in BacktestComputeWorker main loop");
                    SentrySdk.CaptureException(ex);
                }

                try
                {
                    await Task.Delay(TimeSpan.FromSeconds(_options.JobPollIntervalSeconds), cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    break;
                }
            }
        }
        finally
        {
            _logger.LogInformation("BacktestComputeWorker stopping, waiting for {Count} running jobs to complete or timeout",
                _runningJobTasks.Count);

            // Signal shutdown
            _shutdownCts.Cancel();

            // Wait for running jobs with timeout
            var waitTasks = _runningJobTasks.Values.ToArray();
            if (waitTasks.Length > 0)
            {
                var timeoutTask = Task.Delay(TimeSpan.FromMinutes(_options.GracefulShutdownTimeoutMinutes), CancellationToken.None);
                var completedTask = await Task.WhenAny(Task.WhenAll(waitTasks), timeoutTask);

                if (completedTask == timeoutTask)
                {
                    _logger.LogWarning("Graceful shutdown timeout reached, {Count} jobs may still be running",
                        _runningJobTasks.Count);
                }
                else
                {
                    _logger.LogInformation("All running jobs completed during graceful shutdown");
                }
            }

            _logger.LogInformation("BacktestComputeWorker stopped");
        }
    }
    private async Task ProcessJobsAsync(CancellationToken cancellationToken)
    {
        // Check if this instance has capacity
        if (!await _instanceSemaphore.WaitAsync(0, cancellationToken))
        {
            // Instance at capacity, skip this iteration
            return;
        }

        try
        {
            using var scope = _scopeFactory.CreateScope();
            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

            // Claim a random backtest job atomically, excluding users at capacity
            // The SQL query checks running job counts within the transaction, ensuring thread-safety
            var job = await jobRepository.ClaimRandomJobAsync(
                _options.WorkerId,
                JobType.Backtest,
                _options.MaxConcurrentPerUser);

            if (job == null)
            {
                // No jobs available for users not at capacity, release semaphore
                _instanceSemaphore.Release();
                return;
            }

            _logger.LogInformation(
                "Claimed random backtest job {JobId} (UserId: {UserId}) for worker {WorkerId}",
                job.Id, job.UserId, _options.WorkerId);

            // Process the job asynchronously (don't await, let it run in background)
            // Create a new scope for the job processing to ensure proper lifetime management
            var jobTask = Task.Run(async () =>
            {
                try
                {
                    await ProcessJobAsync(job, cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // Handle cancellation gracefully
                    _logger.LogInformation("Job {JobId} was cancelled during processing", job.Id);
                    await HandleJobCancellation(job);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error processing job {JobId}", job.Id);
                    // Error handling is done in ProcessJobAsync
                }
                finally
                {
                    _runningJobTasks.TryRemove(job.Id, out _);
                    _instanceSemaphore.Release();
                }
            }, cancellationToken);

            // Track the running job task
            _runningJobTasks.TryAdd(job.Id, jobTask);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error claiming or processing job");
            _instanceSemaphore.Release();
            throw;
        }
    }
    private async Task ProcessJobAsync(
        Job job,
        CancellationToken cancellationToken)
    {
        using var scope = _scopeFactory.CreateScope();
        var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
        var executor = scope.ServiceProvider.GetRequiredService<BacktestExecutor>();
        var userService = scope.ServiceProvider.GetRequiredService<IUserService>();
        var exchangeService = scope.ServiceProvider.GetRequiredService<IExchangeService>();
        var agentSummaryRepository = scope.ServiceProvider.GetRequiredService<IAgentSummaryRepository>();

        var jobStartTime = DateTime.UtcNow;

        try
        {
            _logger.LogInformation(
                "Processing backtest job {JobId} (BundleRequestId: {BundleRequestId}, UserId: {UserId})",
                job.Id, job.BundleRequestId, job.UserId);

            // Deserialize config
            var config = JsonSerializer.Deserialize<TradingBotConfig>(job.ConfigJson);
            if (config == null)
            {
                throw new InvalidOperationException("Failed to deserialize TradingBotConfig from job");
            }

            // Load user
            var user = await userService.GetUserByIdAsync(job.UserId);
            if (user == null)
            {
                throw new InvalidOperationException($"User {job.UserId} not found");
            }

            // Load candles
            var candles = await exchangeService.GetCandlesInflux(
                TradingExchanges.Evm,
                config.Ticker,
                job.StartDate,
                config.Timeframe,
                job.EndDate);

            if (candles == null || candles.Count == 0)
            {
                throw new InvalidOperationException(
                    $"No candles found for {config.Ticker} on {config.Timeframe} from {job.StartDate} to {job.EndDate}");
            }

            // Create progress tracker for this job
            var progressTracker = new JobProgressTracker(job.Id, _logger);
            _jobProgressTrackers.TryAdd(job.Id, progressTracker);

            // Progress callback that only updates in-memory progress (non-blocking)
            // Timeout is now enforced via CancellationToken, not by throwing in callback
            Func<int, Task> progressCallback = (percentage) =>
            {
                // Update progress in memory only - persistence happens in background
                progressTracker.UpdateProgress(percentage);

                return Task.CompletedTask; // Non-blocking
            };

            // Execute the backtest with timeout
            var timeoutCts = new CancellationTokenSource(TimeSpan.FromMinutes(_options.JobTimeoutMinutes));
            var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

            // Convert HashSet to List - candles are already ordered from repository
            var candlesList = candles.ToList();

            LightBacktest result;
            try
            {
                result = await executor.ExecuteAsync(
                    config,
                    candlesList,
                    user,
                    save: true,
                    withCandles: false,
                    requestId: job.RequestId,
                    bundleRequestId: job.BundleRequestId,
                    metadata: null,
                    progressCallback: progressCallback,
                    cancellationToken: linkedCts.Token);
            }
            catch (OperationCanceledException) when (timeoutCts.Token.IsCancellationRequested && !cancellationToken.IsCancellationRequested)
            {
                var elapsed = DateTime.UtcNow - jobStartTime;
                throw new TimeoutException($"Job {job.Id} exceeded timeout of {_options.JobTimeoutMinutes} minutes (ran for {elapsed.TotalMinutes:F1} minutes)");
            }
            finally
            {
                timeoutCts.Dispose();
                linkedCts.Dispose();
            }

            // Update job with result
            job.Status = JobStatus.Completed;
            job.ProgressPercentage = 100;
            job.ResultJson = JsonSerializer.Serialize(result);
            job.CompletedAt = DateTime.UtcNow;
            job.LastHeartbeat = DateTime.UtcNow;

            await jobRepository.UpdateAsync(job);

            // Clean up progress tracker
            _jobProgressTrackers.TryRemove(job.Id, out _);

            // Increment backtest count for the user's agent summary
            try
            {
                await agentSummaryRepository.IncrementBacktestCountAsync(job.UserId);
                _logger.LogDebug("Incremented backtest count for user {UserId}", job.UserId);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to increment backtest count for user {UserId}", job.UserId);
                // Don't fail the job if this update fails
            }

            var elapsedTime = DateTime.UtcNow - jobStartTime;
            _logger.LogInformation(
                "Completed backtest job {JobId}. Score: {Score}, PnL: {PnL}, Duration: {DurationMinutes:F1} minutes",
                job.Id, result.Score, result.FinalPnl, elapsedTime.TotalMinutes);

            // Update bundle request progress if this job is part of a bundle
            if (job.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider, job);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for job {JobId}", job.Id);
                }
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            _logger.LogInformation("Job {JobId} was cancelled", job.Id);
            throw; // Re-throw to be handled by the caller
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error processing backtest job {JobId}", job.Id);
            SentrySdk.CaptureException(ex);

            // Clean up progress tracker on failure
            _jobProgressTrackers.TryRemove(job.Id, out _);

            await HandleJobFailure(job, ex, jobRepository, scope.ServiceProvider);
        }
    }
    private async Task HandleJobCancellation(Job job)
    {
        try
        {
            using var scope = _scopeFactory.CreateScope();
            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

            // Reload job to get latest state
            var currentJob = await jobRepository.GetByIdAsync(job.Id);
            if (currentJob == null)
            {
                _logger.LogWarning("Job {JobId} not found when handling cancellation", job.Id);
                return;
            }

            // If job is still running, mark it for retry
            if (currentJob.Status == JobStatus.Running)
            {
                if (currentJob.RetryCount < currentJob.MaxRetries)
                {
                    currentJob.Status = JobStatus.Pending;
                    currentJob.RetryCount++;
                    var backoffMinutes = Math.Min(Math.Pow(2, currentJob.RetryCount), _options.MaxRetryDelayMinutes);
                    currentJob.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
                    currentJob.ErrorMessage = $"Worker shutdown - retry {currentJob.RetryCount}/{currentJob.MaxRetries}";
                    currentJob.FailureCategory = FailureCategory.Transient;
                    currentJob.AssignedWorkerId = null;
                    currentJob.LastHeartbeat = null;

                    _logger.LogWarning(
                        "Job {JobId} cancelled during shutdown, will be retried (attempt {RetryCount}/{MaxRetries})",
                        currentJob.Id, currentJob.RetryCount, currentJob.MaxRetries);
                }
                else
                {
                    currentJob.Status = JobStatus.Failed;
                    currentJob.ErrorMessage = "Worker shutdown - exceeded max retries";
                    currentJob.FailureCategory = FailureCategory.SystemError;
                    currentJob.IsRetryable = false;
                    currentJob.CompletedAt = DateTime.UtcNow;
                    currentJob.AssignedWorkerId = null;

                    _logger.LogError("Job {JobId} cancelled during shutdown and exceeded max retries", currentJob.Id);

                    await NotifyPermanentFailure(currentJob, new OperationCanceledException("Worker shutdown"), scope.ServiceProvider);
                }

                await jobRepository.UpdateAsync(currentJob);
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error handling job cancellation for job {JobId}", job.Id);
        }
    }
    private async Task UpdateBundleRequestProgress(Guid bundleRequestId, IServiceProvider serviceProvider, Job? updatedJob = null)
    {
        try
        {
            var backtestRepository = serviceProvider.GetRequiredService<IBacktestRepository>();
            var jobRepository = serviceProvider.GetRequiredService<IJobRepository>();
            var userService = serviceProvider.GetRequiredService<IUserService>();
            var webhookService = serviceProvider.GetRequiredService<IWebhookService>();

            // Get all jobs for this bundle
            var jobs = await jobRepository.GetByBundleRequestIdAsync(bundleRequestId);
            var totalJobs = jobs.Count();

            // If we have an updated job, use its current status instead of querying again
            // This avoids race conditions where the job was just updated but not yet reflected in the query
            var completedJobs = jobs.Count(j => j.Status == JobStatus.Completed);
            var failedJobs = jobs.Count(j => j.Status == JobStatus.Failed);
            var runningJobs = jobs.Count(j => j.Status == JobStatus.Running);

            // Adjust counts based on the updated job if provided
            if (updatedJob != null)
            {
                var existingJob = jobs.FirstOrDefault(j => j.Id == updatedJob.Id);
                if (existingJob != null)
                {
                    // Remove the old status count
                    switch (existingJob.Status)
                    {
                        case JobStatus.Completed:
                            completedJobs--;
                            break;
                        case JobStatus.Failed:
                            failedJobs--;
                            break;
                        case JobStatus.Running:
                            runningJobs--;
                            break;
                    }

                    // Add the new status count
                    switch (updatedJob.Status)
                    {
                        case JobStatus.Completed:
                            completedJobs++;
                            break;
                        case JobStatus.Failed:
                            failedJobs++;
                            break;
                        case JobStatus.Running:
                            runningJobs++;
                            break;
                    }
                }
            }

            if (totalJobs == 0)
            {
                return; // No jobs yet
            }

            // Get user from first job
            var firstJob = jobs.First();
            var user = await userService.GetUserByIdAsync(firstJob.UserId);
            if (user == null)
            {
                _logger.LogWarning("User {UserId} not found for bundle request {BundleRequestId}", firstJob.UserId, bundleRequestId);
                return;
            }

            // Get bundle request
            var bundleRequest = backtestRepository.GetBundleBacktestRequestByIdForUser(user, bundleRequestId);
            if (bundleRequest == null)
            {
                _logger.LogWarning("Bundle request {BundleRequestId} not found for user {UserId}", bundleRequestId, user.Id);
                return;
            }

            var previousStatus = bundleRequest.Status;

            // Update bundle request progress (always update counters regardless of status)
            bundleRequest.CompletedBacktests = completedJobs;
            bundleRequest.FailedBacktests = failedJobs;
            bundleRequest.UpdatedAt = DateTime.UtcNow;

            // CRITICAL: If bundle is already in a final state (Completed/Failed with CompletedAt set),
            // don't overwrite it unless we're detecting a legitimate inconsistency that needs fixing
            if (bundleRequest.CompletedAt.HasValue &&
                (bundleRequest.Status == BundleBacktestRequestStatus.Completed ||
                 bundleRequest.Status == BundleBacktestRequestStatus.Failed))
            {
                // Bundle already finalized, only update if job counts indicate it should be re-opened
                // (This shouldn't happen in normal flow, but guards against race conditions)
                if (completedJobs + failedJobs == totalJobs)
                {
                    _logger.LogDebug(
                        "Bundle {BundleRequestId} already completed/failed. Skipping status update.",
                        bundleRequestId);
                    // Progress counters already updated above, just return
                    return;
                }
                else
                {
                    _logger.LogWarning(
                        "Bundle {BundleRequestId} was marked as completed/failed but has incomplete jobs ({Completed}+{Failed}/{Total}). Reopening.",
                        bundleRequestId, completedJobs, failedJobs, totalJobs);
                    // Allow the update to proceed to fix inconsistent state
                }
            }

            // Update status based on job states
            if (completedJobs + failedJobs == totalJobs)
            {
                // All jobs completed or failed
                if (failedJobs == 0)
                {
                    bundleRequest.Status = BundleBacktestRequestStatus.Completed;
                }
                else if (completedJobs == 0)
                {
                    bundleRequest.Status = BundleBacktestRequestStatus.Failed;
                    bundleRequest.ErrorMessage = "All backtests failed";
                }
                else
                {
                    bundleRequest.Status = BundleBacktestRequestStatus.Completed;
                    bundleRequest.ErrorMessage = $"{failedJobs} backtests failed";
                }
                bundleRequest.CompletedAt = DateTime.UtcNow;
                bundleRequest.CurrentBacktest = null;
            }
            else if (runningJobs > 0 || completedJobs > 0 || failedJobs > 0)
            {
                // Some jobs are running, or some have completed/failed (meaning work has started)
                // Once a bundle has started processing, it should stay "Running" until all jobs are done
                bundleRequest.Status = BundleBacktestRequestStatus.Running;
            }
            // If all jobs are still pending (completedJobs = 0, failedJobs = 0, runningJobs = 0),
            // keep the current status (likely Pending)

            // Update results list from completed jobs
            var completedJobResults = jobs
                .Where(j => j.Status == JobStatus.Completed && !string.IsNullOrEmpty(j.ResultJson))
                .Select(j =>
                {
                    try
                    {
                        var result = JsonSerializer.Deserialize<LightBacktest>(j.ResultJson);
                        return result?.Id;
                    }
                    catch
                    {
                        return null;
                    }
                })
                .Where(id => !string.IsNullOrEmpty(id))
                .ToList();

            bundleRequest.Results = completedJobResults!;

            await backtestRepository.UpdateBundleBacktestRequestAsync(bundleRequest);

            // Send webhook notification if bundle request just completed
            if (previousStatus != BundleBacktestRequestStatus.Completed &&
                bundleRequest.Status == BundleBacktestRequestStatus.Completed &&
                !string.IsNullOrEmpty(user.TelegramChannel))
            {
                var completedAt = bundleRequest.CompletedAt ?? DateTime.UtcNow;
                var duration = completedAt - bundleRequest.CreatedAt;
                var durationText = duration.TotalHours >= 1
                    ? $"{duration.Hours}h {duration.Minutes}m {duration.Seconds}s"
                    : duration.TotalMinutes >= 1
                        ? $"{duration.Minutes}m {duration.Seconds}s"
                        : $"{duration.Seconds}s";

                var message = $"✅ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) completed successfully. " +
                              $"Completed: {completedJobs}/{totalJobs} backtests" +
                              (failedJobs > 0 ? $", Failed: {failedJobs}" : "") +
                              $". Results: {completedJobResults.Count} backtest(s) generated. " +
                              $"Duration: {durationText}";

                await webhookService.SendMessage(message, user.TelegramChannel);
            }
            else if (previousStatus != BundleBacktestRequestStatus.Failed &&
                     bundleRequest.Status == BundleBacktestRequestStatus.Failed &&
                     !string.IsNullOrEmpty(user.TelegramChannel))
            {
                var completedAt = bundleRequest.CompletedAt ?? DateTime.UtcNow;
                var duration = completedAt - bundleRequest.CreatedAt;
                var durationText = duration.TotalHours >= 1
                    ? $"{duration.Hours}h {duration.Minutes}m {duration.Seconds}s"
                    : duration.TotalMinutes >= 1
                        ? $"{duration.Minutes}m {duration.Seconds}s"
                        : $"{duration.Seconds}s";

                var message = $"❌ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) failed. " +
                              $"All {totalJobs} backtests failed. Error: {bundleRequest.ErrorMessage}. " +
                              $"Duration: {durationText}";

                await webhookService.SendMessage(message, user.TelegramChannel);
            }

            _logger.LogInformation(
                "Updated bundle request {BundleRequestId} progress: {Completed}/{Total} completed, {Failed} failed, {Running} running",
                bundleRequestId, completedJobs, totalJobs, failedJobs, runningJobs);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Error updating bundle request {BundleRequestId} progress", bundleRequestId);
        }
    }
    private async Task StaleJobRecoveryLoop(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(TimeSpan.FromMinutes(1), cancellationToken); // Check every minute

                using var scope = _scopeFactory.CreateScope();
                var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

                // Get running jobs for this worker
                var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);

                // CRITICAL FIX: Check for jobs stuck at 100% progress
                // These jobs completed execution but their status wasn't updated to Completed
                // This causes the worker to think it's at max capacity
                var stuckCompletedJobs = runningJobs
                    .Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100)
                    .ToList();

                if (stuckCompletedJobs.Any())
                {
                    _logger.LogWarning(
                        "🔧 Found {Count} jobs stuck at 100% progress for worker {WorkerId}. Auto-completing them.",
                        stuckCompletedJobs.Count, _options.WorkerId);

                    foreach (var stuckJob in stuckCompletedJobs)
                    {
                        await AutoCompleteStuckJobAsync(stuckJob, jobRepository, scope.ServiceProvider);
                    }
                }

                // Also check for jobs that have been running for too long but haven't reached 100%
                var longRunningJobs = runningJobs
                    .Where(j => j.JobType == JobType.Backtest &&
                                j.ProgressPercentage < 100 &&
                                j.StartedAt.HasValue &&
                                (DateTime.UtcNow - j.StartedAt.Value) > TimeSpan.FromMinutes(_options.JobTimeoutMinutes + 10)) // Extra 10 min grace
                    .ToList();

                if (longRunningJobs.Any())
                {
                    _logger.LogWarning(
                        "🔧 Found {Count} jobs running longer than timeout for worker {WorkerId}. Marking as failed.",
                        longRunningJobs.Count, _options.WorkerId);

                    foreach (var longJob in longRunningJobs)
                    {
                        await HandleLongRunningJobAsync(longJob, jobRepository, scope.ServiceProvider);
                    }
                }

                // Get stale jobs for this worker
                var now = DateTime.UtcNow;
                var staleJobs = runningJobs
                    .Where(j => j.JobType == JobType.Backtest &&
                                j.ProgressPercentage < 100 && // Don't mark stuck-at-100% jobs as stale
                                (
                                    // Stale heartbeat (no heartbeat in timeout period)
                                    j.LastHeartbeat == null ||
                                    j.LastHeartbeat < now.AddMinutes(-_options.StaleJobTimeoutMinutes) ||
                                    // Job running too long (even with recent heartbeat)
                                    (j.StartedAt.HasValue &&
                                     j.StartedAt.Value < now.AddMinutes(-_options.JobTimeoutMinutes))
                                ))
                    .ToList();

                foreach (var job in staleJobs)
                {
                    var elapsed = job.StartedAt.HasValue
                        ? (DateTime.UtcNow - job.StartedAt.Value).TotalMinutes
                        : (double?)null;
                    var lastHeartbeatAge = job.LastHeartbeat.HasValue
                        ? (DateTime.UtcNow - job.LastHeartbeat.Value).TotalMinutes
                        : (double?)null;

                    _logger.LogWarning(
                        "Detected stale job {JobId}: Started {StartedAt}, LastHeartbeat: {LastHeartbeat} ({HeartbeatAge} min ago), Elapsed: {Elapsed} min",
                        job.Id, job.StartedAt, job.LastHeartbeat, lastHeartbeatAge, elapsed);

                    // Remove from running tasks if still tracked
                    _runningJobTasks.TryRemove(job.Id, out _);

                    // If it's stale but retryable, reset to pending with retry count
                    if (job.RetryCount < job.MaxRetries)
                    {
                        job.Status = JobStatus.Pending;
                        job.RetryCount++;
                        var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
                        job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
                        job.ErrorMessage = $"Job timeout/stale - retry {job.RetryCount}/{job.MaxRetries} (ran for {elapsed:F1} min)";
                        job.FailureCategory = FailureCategory.SystemError;
                        _logger.LogWarning(
                            "Stale job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}",
                            job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter);
                    }
                    else
                    {
                        // Exceeded retries - mark as failed
                        job.Status = JobStatus.Failed;
                        job.ErrorMessage = $"Job timeout/stale - exceeded max retries (ran for {elapsed:F1} min)";
                        job.FailureCategory = FailureCategory.SystemError;
                        job.IsRetryable = false;
                        job.CompletedAt = DateTime.UtcNow;

                        _logger.LogError(
                            "Stale job {JobId} exceeded max retries after running for {Elapsed} minutes",
                            job.Id, elapsed);

                        // Notify permanent failure
                        await NotifyPermanentFailure(job, new TimeoutException($"Job timeout after {elapsed:F1} minutes"), scope.ServiceProvider);

                        // Update bundle request if this is part of a bundle
                        if (job.BundleRequestId.HasValue)
                        {
                            await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider, job);
                        }
                    }

                    job.AssignedWorkerId = null;
                    job.LastHeartbeat = null;
                    await jobRepository.UpdateAsync(job);
                }

                if (staleJobs.Count > 0)
                {
                    _logger.LogInformation("Processed {Count} stale backtest jobs", staleJobs.Count);
                }
            }
            catch (OperationCanceledException)
            {
                // Expected during shutdown, don't log as error
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in stale job recovery loop");
            }
        }
    }
    private async Task HeartbeatLoop(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(TimeSpan.FromSeconds(_options.HeartbeatIntervalSeconds), cancellationToken);

                using var scope = _scopeFactory.CreateScope();
                var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

                // Update heartbeat for all jobs assigned to this worker
                var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);

                foreach (var job in runningJobs)
                {
                    job.LastHeartbeat = DateTime.UtcNow;
                    await jobRepository.UpdateAsync(job);
                }
            }
            catch (OperationCanceledException)
            {
                // Expected during shutdown, don't log as error
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in heartbeat loop");
            }
        }
    }

    private async Task ProgressPersistenceLoop(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(TimeSpan.FromSeconds(2), cancellationToken); // Check every 2 seconds

                using var scope = _scopeFactory.CreateScope();
                var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

                // Process all progress trackers that need persistence
                var trackersToPersist = _jobProgressTrackers
                    .Where(kvp => kvp.Value.ShouldPersist())
                    .ToList();

                if (trackersToPersist.Count > 0)
                {
                    _logger.LogDebug("Persisting progress for {Count} jobs", trackersToPersist.Count);

                    foreach (var (jobId, tracker) in trackersToPersist)
                    {
                        try
                        {
                            var (percentage, lastUpdate) = tracker.GetProgressForPersistence();

                            // Get and update the job
                            var job = await jobRepository.GetByIdAsync(jobId);
                            if (job != null && job.Status == JobStatus.Running)
                            {
                                job.ProgressPercentage = percentage;
                                job.LastHeartbeat = lastUpdate;
                                await jobRepository.UpdateAsync(job);

                                _logger.LogDebug("Persisted progress {Percentage}% for job {JobId}", percentage, jobId);
                            }
                        }
                        catch (Exception ex)
                        {
                            _logger.LogWarning(ex, "Error persisting progress for job {JobId}", jobId);
                        }
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Expected during shutdown, don't log as error
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in progress persistence loop");
            }
        }
    }
    private async Task HandleJobFailure(
        Job job,
        Exception ex,
        IJobRepository jobRepository,
        IServiceProvider serviceProvider)
    {
        try
        {
            // Categorize the failure
            var failureCategory = CategorizeFailure(ex);
            var isRetryable = IsRetryableFailure(ex, failureCategory);

            // Check if we should retry
            if (isRetryable && job.RetryCount < job.MaxRetries)
            {
                // Calculate exponential backoff: 2^retryCount minutes, capped at MaxRetryDelayMinutes
                var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
                job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
                job.RetryCount++;
                job.Status = JobStatus.Pending; // Reset to pending for retry
                job.AssignedWorkerId = null; // Allow any worker to pick it up
                job.ErrorMessage = $"Retry {job.RetryCount}/{job.MaxRetries}: {ex.Message}";
                job.FailureCategory = failureCategory;
                job.IsRetryable = true;

                _logger.LogWarning(
                    "Job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}. Error: {Error}",
                    job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter, ex.Message);
            }
            else
            {
                // Permanent failure - mark as failed
                job.Status = JobStatus.Failed;
                job.ErrorMessage = ex.Message;
                job.FailureCategory = failureCategory;
                job.IsRetryable = false;
                job.CompletedAt = DateTime.UtcNow;

                _logger.LogError(
                    "Job {JobId} failed permanently after {RetryCount} retries. Error: {Error}",
                    job.Id, job.RetryCount, ex.Message);

                // Send notification for permanent failure
                await NotifyPermanentFailure(job, ex, serviceProvider);

                // Update bundle request if this is part of a bundle
                if (job.BundleRequestId.HasValue)
                {
                    await UpdateBundleRequestProgress(job.BundleRequestId.Value, serviceProvider, job);
                }
            }

            job.LastHeartbeat = DateTime.UtcNow;
            await jobRepository.UpdateAsync(job);
        }
        catch (Exception updateEx)
        {
            _logger.LogError(updateEx, "Failed to update job {JobId} status after failure", job.Id);
        }
    }
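    // Note on the retry backoff used above (and in HandleJobCancellation and StaleJobRecoveryLoop):
    // Math.Min(Math.Pow(2, retryCount), MaxRetryDelayMinutes) yields an exponential delay in minutes.
    // With the default MaxRetryDelayMinutes of 60, retryCount values 0, 1, 2, 3, ... map to delays of
    // 1, 2, 4, 8, ... minutes, capped at 60. HandleJobFailure computes the delay from the pre-increment
    // RetryCount, while the cancellation and stale-recovery paths increment first, so their first retry
    // waits 2 minutes rather than 1.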
    private FailureCategory CategorizeFailure(Exception ex)
    {
        return ex switch
        {
            TimeoutException => FailureCategory.Transient,
            TaskCanceledException => FailureCategory.Transient,
            HttpRequestException => FailureCategory.Transient,
            InvalidOperationException when ex.Message.Contains("candles") || ex.Message.Contains("No candles") => FailureCategory.DataError,
            InvalidOperationException when ex.Message.Contains("User") || ex.Message.Contains("not found") => FailureCategory.UserError,
            OutOfMemoryException => FailureCategory.SystemError,
            _ => FailureCategory.Unknown
        };
    }

    private bool IsRetryableFailure(Exception ex, FailureCategory category)
    {
        // Don't retry user errors or data errors (missing candles, invalid config)
        if (category == FailureCategory.UserError || category == FailureCategory.DataError)
            return false;

        // Retry transient and system errors
        return category == FailureCategory.Transient || category == FailureCategory.SystemError;
    }
    private async Task AutoCompleteStuckJobAsync(Job stuckJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
    {
        try
        {
            _logger.LogWarning(
                "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
                stuckJob.Id, stuckJob.StartedAt);

            stuckJob.Status = JobStatus.Completed;
            stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
            stuckJob.LastHeartbeat = DateTime.UtcNow;

            // Add note to error message if not already set
            if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
            {
                stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
            }

            await jobRepository.UpdateAsync(stuckJob);

            // Clean up progress tracker if still present
            _jobProgressTrackers.TryRemove(stuckJob.Id, out _);
            _runningJobTasks.TryRemove(stuckJob.Id, out _);

            // Update bundle request if this is part of a bundle
            if (stuckJob.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, serviceProvider, stuckJob);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
                }
            }

            _logger.LogInformation(
                "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
                stuckJob.Id);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error auto-completing stuck job {JobId}", stuckJob.Id);
        }
    }
    private async Task HandleLongRunningJobAsync(Job longJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
    {
        try
        {
            var elapsed = longJob.StartedAt.HasValue
                ? (DateTime.UtcNow - longJob.StartedAt.Value).TotalMinutes
                : 0;

            _logger.LogWarning(
                "🔧 Job {JobId} has been running for {Elapsed:F1} minutes (timeout: {TimeoutMinutes} minutes). Failing job.",
                longJob.Id, elapsed, _options.JobTimeoutMinutes);

            // Mark as failed
            longJob.Status = JobStatus.Failed;
            longJob.ErrorMessage = $"Job exceeded maximum runtime of {_options.JobTimeoutMinutes} minutes";
            longJob.FailureCategory = FailureCategory.SystemError;
            longJob.IsRetryable = false;
            longJob.CompletedAt = DateTime.UtcNow;
            longJob.AssignedWorkerId = null;
            longJob.LastHeartbeat = DateTime.UtcNow;

            await jobRepository.UpdateAsync(longJob);

            // Clean up
            _jobProgressTrackers.TryRemove(longJob.Id, out _);
            _runningJobTasks.TryRemove(longJob.Id, out _);

            // Update bundle request if this is part of a bundle
            if (longJob.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(longJob.BundleRequestId.Value, serviceProvider, longJob);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for long-running job {JobId}", longJob.Id);
                }
            }

            // Notify about permanent failure
            await NotifyPermanentFailure(longJob, new TimeoutException($"Job exceeded {elapsed:F1} minutes"), serviceProvider);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error handling long-running job {JobId}", longJob.Id);
        }
    }
    private async Task NotifyPermanentFailure(
        Job job,
        Exception ex,
        IServiceProvider serviceProvider)
    {
        try
        {
            var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
            const string alertsChannel = "2676086723";

            var jobTypeName = job.JobType == JobType.Genetic ? "Genetic" : "Backtest";
            var message = $"🚨 **{jobTypeName} Job Failed Permanently**\n" +
                          $"Job ID: `{job.Id}`\n" +
                          $"User ID: {job.UserId}\n" +
                          $"Retry Attempts: {job.RetryCount}/{job.MaxRetries}\n" +
                          $"Failure Category: {job.FailureCategory}\n" +
                          $"Error: {ex.Message}\n" +
                          $"Time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss} UTC";

            await webhookService.SendMessage(message, alertsChannel);
        }
        catch (Exception notifyEx)
        {
            _logger.LogError(notifyEx, "Failed to send permanent failure notification for job {JobId}", job.Id);
        }
    }

    public override void Dispose()
    {
        _shutdownCts?.Cancel();
        _shutdownCts?.Dispose();
        _instanceSemaphore?.Dispose();
        base.Dispose();
    }
}
/// <summary>
/// Tracks job progress with batched database updates for performance optimization
/// </summary>
public class JobProgressTracker
{
    private readonly object _lock = new();
    private int _lastPersistedPercentage;
    private DateTime _lastPersistedTime;
    private readonly ILogger _logger;

    public Guid JobId { get; }
    public int CurrentPercentage { get; private set; }
    public DateTime LastUpdateTime { get; private set; }

    public JobProgressTracker(Guid jobId, ILogger logger)
    {
        JobId = jobId;
        _logger = logger;
        _lastPersistedTime = DateTime.UtcNow;
    }

    /// <summary>
    /// Updates progress in memory only - thread safe
    /// </summary>
    public void UpdateProgress(int percentage)
    {
        lock (_lock)
        {
            CurrentPercentage = percentage;
            LastUpdateTime = DateTime.UtcNow;
        }
    }

    /// <summary>
    /// Checks if progress should be persisted to database based on time/percentage thresholds
    /// </summary>
    public bool ShouldPersist(int progressUpdateIntervalMs = 5000, int percentageThreshold = 5)
    {
        lock (_lock)
        {
            var timeSinceLastPersist = (DateTime.UtcNow - _lastPersistedTime).TotalMilliseconds;
            var percentageSinceLastPersist = CurrentPercentage - _lastPersistedPercentage;

            return timeSinceLastPersist >= progressUpdateIntervalMs ||
                   percentageSinceLastPersist >= percentageThreshold ||
                   CurrentPercentage >= 100; // Always persist completion
        }
    }

    /// <summary>
    /// Gets current progress and marks as persisted
    /// </summary>
    public (int percentage, DateTime lastUpdate) GetProgressForPersistence()
    {
        lock (_lock)
        {
            var percentage = CurrentPercentage;
            var lastUpdate = LastUpdateTime;

            _lastPersistedPercentage = percentage;
            _lastPersistedTime = DateTime.UtcNow;

            return (percentage, lastUpdate);
        }
    }
}
/// <summary>
/// Configuration options for BacktestComputeWorker
/// </summary>
public class BacktestComputeWorkerOptions
{
    public const string SectionName = "BacktestComputeWorker";

    /// <summary>
    /// Unique identifier for this worker instance
    /// </summary>
    public string WorkerId { get; set; } = Environment.MachineName;

    /// <summary>
    /// Maximum number of concurrent backtests per user (global limit across all workers)
    /// </summary>
    public int MaxConcurrentPerUser { get; set; } = 6;

    /// <summary>
    /// Maximum number of concurrent backtests per worker instance (local limit for this worker)
    /// </summary>
    public int MaxConcurrentPerInstance { get; set; } = 6;

    /// <summary>
    /// Interval in seconds between job polling attempts
    /// </summary>
    public int JobPollIntervalSeconds { get; set; } = 5;

    /// <summary>
    /// Interval in seconds between heartbeat updates
    /// </summary>
    public int HeartbeatIntervalSeconds { get; set; } = 30;

    /// <summary>
    /// Timeout in minutes for considering a job stale
    /// </summary>
    public int StaleJobTimeoutMinutes { get; set; } = 5;

    /// <summary>
    /// Default maximum retry attempts for failed jobs
    /// </summary>
    public int DefaultMaxRetries { get; set; } = 3;

    /// <summary>
    /// Maximum retry delay in minutes (cap for exponential backoff)
    /// </summary>
    public int MaxRetryDelayMinutes { get; set; } = 60;

    /// <summary>
    /// Maximum time in minutes a job can run before being considered timed out
    /// </summary>
    public int JobTimeoutMinutes { get; set; } = 60;

    /// <summary>
    /// Timeout in minutes to wait for running jobs during graceful shutdown
    /// </summary>
    public int GracefulShutdownTimeoutMinutes { get; set; } = 5;
}
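// Example wiring (illustrative sketch, not part of the original file): the worker is a
// BackgroundService whose options bind to the "BacktestComputeWorker" configuration section.
// A typical composition root might register it roughly like this (the `builder` variable and
// the exact appsettings layout are assumptions, not taken from this codebase):
//
//   builder.Services.Configure<BacktestComputeWorkerOptions>(
//       builder.Configuration.GetSection(BacktestComputeWorkerOptions.SectionName));
//   builder.Services.AddHostedService<BacktestComputeWorker>();
//
// and a matching appsettings.json section could look like:
//
//   "BacktestComputeWorker": {
//     "MaxConcurrentPerUser": 6,
//     "MaxConcurrentPerInstance": 6,
//     "JobPollIntervalSeconds": 5,
//     "HeartbeatIntervalSeconds": 30,
//     "StaleJobTimeoutMinutes": 5,
//     "JobTimeoutMinutes": 60,
//     "GracefulShutdownTimeoutMinutes": 5
//   }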