using System.Collections.Concurrent;
using System.Text.Json;
using Managing.Application.Abstractions.Repositories;
using Managing.Application.Abstractions.Services;
using Managing.Application.Backtests;
using Managing.Domain.Backtests;
using Managing.Domain.Bots;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using static Managing.Common.Enums;

namespace Managing.Application.Workers;

/// <summary>
/// Background worker that processes backtest jobs from the queue.
/// Polls for pending jobs, claims them using advisory locks, and processes them.
/// </summary>
public class BacktestComputeWorker : BackgroundService
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly ILogger<BacktestComputeWorker> _logger;
    private readonly BacktestComputeWorkerOptions _options;
    private readonly SemaphoreSlim _instanceSemaphore;
    private readonly ConcurrentDictionary<Guid, Task> _runningJobTasks = new();
    private readonly ConcurrentDictionary<Guid, JobProgressTracker> _jobProgressTrackers = new();
    private readonly CancellationTokenSource _shutdownCts = new();

    public BacktestComputeWorker(
        IServiceScopeFactory scopeFactory,
        ILogger<BacktestComputeWorker> logger,
        IOptions<BacktestComputeWorkerOptions> options)
    {
        _scopeFactory = scopeFactory;
        _logger = logger;
        _options = options.Value;
        _instanceSemaphore = new SemaphoreSlim(_options.MaxConcurrentPerInstance, _options.MaxConcurrentPerInstance);
    }
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrentPerUser: {MaxConcurrentPerUser}, MaxConcurrentPerInstance: {MaxConcurrentPerInstance}, PollInterval: {PollInterval}s, JobTimeout: {JobTimeoutMinutes}min",
            _options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance,
            _options.JobPollIntervalSeconds, _options.JobTimeoutMinutes);

        // Reset any jobs assigned to this WorkerId from previous worker instances at startup
        // This is critical when restarting with the same WorkerId (e.g., Environment.MachineName)
        try
        {
            using var scope = _scopeFactory.CreateScope();
            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

            // First, reset all jobs assigned to this WorkerId (from previous instance)
            var workerResetCount = await jobRepository.ResetJobsByWorkerIdAsync(_options.WorkerId);
            if (workerResetCount > 0)
            {
                _logger.LogInformation("Reset {Count} jobs assigned to worker {WorkerId} from previous instance",
                    workerResetCount, _options.WorkerId);
            }

            // Then, reset any other stale jobs (from other workers or orphaned jobs)
            var staleResetCount = await jobRepository.ResetStaleJobsAsync(_options.StaleJobTimeoutMinutes);
            if (staleResetCount > 0)
            {
                _logger.LogInformation("Reset {Count} stale jobs to Pending status at startup", staleResetCount);
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error resetting jobs at startup");
            // Don't fail startup if this fails, but log it
        }

        // Link cancellation tokens
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken, _shutdownCts.Token);
        var cancellationToken = linkedCts.Token;

        // Background task for stale job recovery
        var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(cancellationToken), cancellationToken);

        // Background task for heartbeat updates
        var heartbeatTask = Task.Run(() => HeartbeatLoop(cancellationToken), cancellationToken);

        // Background task for progress persistence
        var progressPersistenceTask = Task.Run(() => ProgressPersistenceLoop(cancellationToken), cancellationToken);

        // Main job processing loop
        try
        {
            while (!cancellationToken.IsCancellationRequested)
            {
                try
                {
                    await ProcessJobsAsync(cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // Expected during shutdown
                    break;
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error in BacktestComputeWorker main loop");
                    SentrySdk.CaptureException(ex);
                }

                try
                {
                    await Task.Delay(TimeSpan.FromSeconds(_options.JobPollIntervalSeconds), cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    break;
                }
            }
        }
        finally
        {
            _logger.LogInformation("BacktestComputeWorker stopping, waiting for {Count} running jobs to complete or timeout",
                _runningJobTasks.Count);

            // Signal shutdown
            _shutdownCts.Cancel();

            // Wait for running jobs with timeout
            var waitTasks = _runningJobTasks.Values.ToArray();
            if (waitTasks.Length > 0)
            {
                var timeoutTask = Task.Delay(TimeSpan.FromMinutes(_options.GracefulShutdownTimeoutMinutes), CancellationToken.None);
                var completedTask = await Task.WhenAny(Task.WhenAll(waitTasks), timeoutTask);

                if (completedTask == timeoutTask)
                {
                    _logger.LogWarning("Graceful shutdown timeout reached, {Count} jobs may still be running",
                        _runningJobTasks.Count);
                }
                else
                {
                    _logger.LogInformation("All running jobs completed during graceful shutdown");
                }
            }

            _logger.LogInformation("BacktestComputeWorker stopped");
        }
    }
    private async Task ProcessJobsAsync(CancellationToken cancellationToken)
    {
        // Check if this instance has capacity
        if (!await _instanceSemaphore.WaitAsync(0, cancellationToken))
        {
            // Instance at capacity, skip this iteration
            return;
        }

        try
        {
            using var scope = _scopeFactory.CreateScope();
            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

            // Claim a random backtest job atomically, excluding users at capacity
            // The SQL query checks running job counts within the transaction, ensuring thread-safety
            var job = await jobRepository.ClaimRandomJobAsync(
                _options.WorkerId,
                JobType.Backtest,
                _options.MaxConcurrentPerUser);

            if (job == null)
            {
                // No jobs available for users not at capacity, release semaphore
                _instanceSemaphore.Release();
                return;
            }

            _logger.LogInformation(
                "Claimed random backtest job {JobId} (UserId: {UserId}) for worker {WorkerId}",
                job.Id, job.UserId, _options.WorkerId);

            // Process the job asynchronously (don't await, let it run in background)
            // Create a new scope for the job processing to ensure proper lifetime management
            var jobTask = Task.Run(async () =>
            {
                try
                {
                    await ProcessJobAsync(job, cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // Handle cancellation gracefully
                    _logger.LogInformation("Job {JobId} was cancelled during processing", job.Id);
                    await HandleJobCancellation(job);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error processing job {JobId}", job.Id);
                    // Error handling is done in ProcessJobAsync
                }
                finally
                {
                    _runningJobTasks.TryRemove(job.Id, out _);
                    _instanceSemaphore.Release();
                }
            }, cancellationToken);

            // Track the running job task
            _runningJobTasks.TryAdd(job.Id, jobTask);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error claiming or processing job");
            _instanceSemaphore.Release();
            throw;
        }
    }
    private async Task ProcessJobAsync(
        Job job,
        CancellationToken cancellationToken)
    {
        using var scope = _scopeFactory.CreateScope();
        var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
        var executor = scope.ServiceProvider.GetRequiredService<BacktestExecutor>();
        var userService = scope.ServiceProvider.GetRequiredService<IUserService>();
        var exchangeService = scope.ServiceProvider.GetRequiredService<IExchangeService>();
        var agentSummaryRepository = scope.ServiceProvider.GetRequiredService<IAgentSummaryRepository>();

        var jobStartTime = DateTime.UtcNow;

        try
        {
            _logger.LogInformation(
                "Processing backtest job {JobId} (BundleRequestId: {BundleRequestId}, UserId: {UserId})",
                job.Id, job.BundleRequestId, job.UserId);

            // Deserialize config
            var config = JsonSerializer.Deserialize<TradingBotConfig>(job.ConfigJson);
            if (config == null)
            {
                throw new InvalidOperationException("Failed to deserialize TradingBotConfig from job");
            }

            // Load user
            var user = await userService.GetUserByIdAsync(job.UserId);
            if (user == null)
            {
                throw new InvalidOperationException($"User {job.UserId} not found");
            }

            // Load candles
            var candles = await exchangeService.GetCandlesInflux(
                TradingExchanges.Evm,
                config.Ticker,
                job.StartDate,
                config.Timeframe,
                job.EndDate);

            if (candles == null || candles.Count == 0)
            {
                throw new InvalidOperationException(
                    $"No candles found for {config.Ticker} on {config.Timeframe} from {job.StartDate} to {job.EndDate}");
            }

            // Create progress tracker for this job
            var progressTracker = new JobProgressTracker(job.Id, _logger);
            _jobProgressTrackers.TryAdd(job.Id, progressTracker);

            // Progress callback that only updates in-memory progress (non-blocking)
            // Timeout is now enforced via CancellationToken, not by throwing in callback
            Func<int, Task> progressCallback = (percentage) =>
            {
                // Update progress in memory only - persistence happens in background
                progressTracker.UpdateProgress(percentage);

                return Task.CompletedTask; // Non-blocking
            };

            // Execute the backtest with timeout
            var timeoutCts = new CancellationTokenSource(TimeSpan.FromMinutes(_options.JobTimeoutMinutes));
            var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

            // Convert HashSet to List - candles are already ordered from repository
            var candlesList = candles.ToList();

            LightBacktest result;
            try
            {
                result = await executor.ExecuteAsync(
                    config,
                    candlesList,
                    user,
                    save: true,
                    withCandles: false,
                    requestId: job.RequestId,
                    bundleRequestId: job.BundleRequestId,
                    metadata: null,
                    progressCallback: progressCallback,
                    cancellationToken: linkedCts.Token);
            }
            catch (OperationCanceledException) when (timeoutCts.Token.IsCancellationRequested && !cancellationToken.IsCancellationRequested)
            {
                var elapsed = DateTime.UtcNow - jobStartTime;
                throw new TimeoutException($"Job {job.Id} exceeded timeout of {_options.JobTimeoutMinutes} minutes (ran for {elapsed.TotalMinutes:F1} minutes)");
            }
            finally
            {
                timeoutCts.Dispose();
                linkedCts.Dispose();
            }

            // Update job with result
            job.Status = JobStatus.Completed;
            job.ProgressPercentage = 100;
            job.ResultJson = JsonSerializer.Serialize(result);
            job.CompletedAt = DateTime.UtcNow;
            job.LastHeartbeat = DateTime.UtcNow;

            await jobRepository.UpdateAsync(job);

            // Clean up progress tracker
            _jobProgressTrackers.TryRemove(job.Id, out _);

            // Increment backtest count for the user's agent summary
            try
            {
                await agentSummaryRepository.IncrementBacktestCountAsync(job.UserId);
                _logger.LogDebug("Incremented backtest count for user {UserId}", job.UserId);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to increment backtest count for user {UserId}", job.UserId);
                // Don't fail the job if this update fails
            }

            var elapsedTime = DateTime.UtcNow - jobStartTime;
            _logger.LogInformation(
                "Completed backtest job {JobId}. Score: {Score}, PnL: {PnL}, Duration: {DurationMinutes:F1} minutes",
                job.Id, result.Score, result.FinalPnl, elapsedTime.TotalMinutes);

            // Update bundle request progress if this job is part of a bundle
            if (job.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider, job);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for job {JobId}", job.Id);
                }
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            _logger.LogInformation("Job {JobId} was cancelled", job.Id);
            throw; // Re-throw to be handled by the caller
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error processing backtest job {JobId}", job.Id);
            SentrySdk.CaptureException(ex);

            // Clean up progress tracker on failure
            _jobProgressTrackers.TryRemove(job.Id, out _);

            await HandleJobFailure(job, ex, jobRepository, scope.ServiceProvider);
        }
    }
    private async Task HandleJobCancellation(Job job)
    {
        try
        {
            using var scope = _scopeFactory.CreateScope();
            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

            // Reload job to get latest state
            var currentJob = await jobRepository.GetByIdAsync(job.Id);
            if (currentJob == null)
            {
                _logger.LogWarning("Job {JobId} not found when handling cancellation", job.Id);
                return;
            }

            // If job is still running, mark it for retry
            if (currentJob.Status == JobStatus.Running)
            {
                if (currentJob.RetryCount < currentJob.MaxRetries)
                {
                    currentJob.Status = JobStatus.Pending;
                    currentJob.RetryCount++;
                    var backoffMinutes = Math.Min(Math.Pow(2, currentJob.RetryCount), _options.MaxRetryDelayMinutes);
                    currentJob.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
                    currentJob.ErrorMessage = $"Worker shutdown - retry {currentJob.RetryCount}/{currentJob.MaxRetries}";
                    currentJob.FailureCategory = FailureCategory.Transient;
                    currentJob.AssignedWorkerId = null;
                    currentJob.LastHeartbeat = null;

                    _logger.LogWarning(
                        "Job {JobId} cancelled during shutdown, will be retried (attempt {RetryCount}/{MaxRetries})",
                        currentJob.Id, currentJob.RetryCount, currentJob.MaxRetries);
                }
                else
                {
                    currentJob.Status = JobStatus.Failed;
                    currentJob.ErrorMessage = "Worker shutdown - exceeded max retries";
                    currentJob.FailureCategory = FailureCategory.SystemError;
                    currentJob.IsRetryable = false;
                    currentJob.CompletedAt = DateTime.UtcNow;
                    currentJob.AssignedWorkerId = null;

                    _logger.LogError("Job {JobId} cancelled during shutdown and exceeded max retries", currentJob.Id);

                    await NotifyPermanentFailure(currentJob, new OperationCanceledException("Worker shutdown"), scope.ServiceProvider);
                }

                await jobRepository.UpdateAsync(currentJob);
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error handling job cancellation for job {JobId}", job.Id);
        }
    }
    private async Task UpdateBundleRequestProgress(Guid bundleRequestId, IServiceProvider serviceProvider, Job? updatedJob = null)
    {
        try
        {
            var backtestRepository = serviceProvider.GetRequiredService<IBacktestRepository>();
            var jobRepository = serviceProvider.GetRequiredService<IJobRepository>();
            var userService = serviceProvider.GetRequiredService<IUserService>();
            var webhookService = serviceProvider.GetRequiredService<IWebhookService>();

            // Get all jobs for this bundle
            var jobs = await jobRepository.GetByBundleRequestIdAsync(bundleRequestId);
            var totalJobs = jobs.Count();

            // If we have an updated job, use its current status instead of querying again
            // This avoids race conditions where the job was just updated but not yet reflected in the query
            var completedJobs = jobs.Count(j => j.Status == JobStatus.Completed);
            var failedJobs = jobs.Count(j => j.Status == JobStatus.Failed);
            var runningJobs = jobs.Count(j => j.Status == JobStatus.Running);

            // Adjust counts based on the updated job if provided
            if (updatedJob != null)
            {
                var existingJob = jobs.FirstOrDefault(j => j.Id == updatedJob.Id);
                if (existingJob != null)
                {
                    // Remove the old status count
                    switch (existingJob.Status)
                    {
                        case JobStatus.Completed:
                            completedJobs--;
                            break;
                        case JobStatus.Failed:
                            failedJobs--;
                            break;
                        case JobStatus.Running:
                            runningJobs--;
                            break;
                    }

                    // Add the new status count
                    switch (updatedJob.Status)
                    {
                        case JobStatus.Completed:
                            completedJobs++;
                            break;
                        case JobStatus.Failed:
                            failedJobs++;
                            break;
                        case JobStatus.Running:
                            runningJobs++;
                            break;
                    }
                }
            }

            if (totalJobs == 0)
            {
                return; // No jobs yet
            }

            // Get user from first job
            var firstJob = jobs.First();
            var user = await userService.GetUserByIdAsync(firstJob.UserId);
            if (user == null)
            {
                _logger.LogWarning("User {UserId} not found for bundle request {BundleRequestId}", firstJob.UserId, bundleRequestId);
                return;
            }

            // Get bundle request
            var bundleRequest = backtestRepository.GetBundleBacktestRequestByIdForUser(user, bundleRequestId);
            if (bundleRequest == null)
            {
                _logger.LogWarning("Bundle request {BundleRequestId} not found for user {UserId}", bundleRequestId, user.Id);
                return;
            }

            var previousStatus = bundleRequest.Status;

            // Update bundle request progress (always update counters regardless of status)
            bundleRequest.CompletedBacktests = completedJobs;
            bundleRequest.FailedBacktests = failedJobs;
            bundleRequest.UpdatedAt = DateTime.UtcNow;

            // CRITICAL: If bundle is already in a final state (Completed/Failed with CompletedAt set),
            // don't overwrite it unless we're detecting a legitimate inconsistency that needs fixing
            if (bundleRequest.CompletedAt.HasValue &&
                (bundleRequest.Status == BundleBacktestRequestStatus.Completed ||
                 bundleRequest.Status == BundleBacktestRequestStatus.Failed))
            {
                // Bundle already finalized, only update if job counts indicate it should be re-opened
                // (This shouldn't happen in normal flow, but guards against race conditions)
                if (completedJobs + failedJobs == totalJobs)
                {
                    _logger.LogDebug(
                        "Bundle {BundleRequestId} already completed/failed. Skipping status update.",
                        bundleRequestId);
                    // Progress counters already updated above, just return
                    return;
                }
                else
                {
                    _logger.LogWarning(
                        "Bundle {BundleRequestId} was marked as completed/failed but has incomplete jobs ({Completed}+{Failed}/{Total}). Reopening.",
                        bundleRequestId, completedJobs, failedJobs, totalJobs);
                    // Allow the update to proceed to fix inconsistent state
                }
            }

            // Update status based on job states
            if (completedJobs + failedJobs == totalJobs)
            {
                // All jobs completed or failed
                if (failedJobs == 0)
                {
                    bundleRequest.Status = BundleBacktestRequestStatus.Completed;
                }
                else if (completedJobs == 0)
                {
                    bundleRequest.Status = BundleBacktestRequestStatus.Failed;
                    bundleRequest.ErrorMessage = "All backtests failed";
                }
                else
                {
                    bundleRequest.Status = BundleBacktestRequestStatus.Completed;
                    bundleRequest.ErrorMessage = $"{failedJobs} backtests failed";
                }
                bundleRequest.CompletedAt = DateTime.UtcNow;
                bundleRequest.CurrentBacktest = null;
            }
            else if (runningJobs > 0 || completedJobs > 0 || failedJobs > 0)
            {
                // Some jobs are running, or some have completed/failed (meaning work has started)
                // Once a bundle has started processing, it should stay "Running" until all jobs are done
                bundleRequest.Status = BundleBacktestRequestStatus.Running;
            }
            // If all jobs are still pending (completedJobs = 0, failedJobs = 0, runningJobs = 0),
            // keep the current status (likely Pending)

            // Update results list from completed jobs
            var completedJobResults = jobs
                .Where(j => j.Status == JobStatus.Completed && !string.IsNullOrEmpty(j.ResultJson))
                .Select(j =>
                {
                    try
                    {
                        var result = JsonSerializer.Deserialize<LightBacktest>(j.ResultJson);
                        return result?.Id;
                    }
                    catch
                    {
                        return null;
                    }
                })
                .Where(id => !string.IsNullOrEmpty(id))
                .ToList();

            bundleRequest.Results = completedJobResults!;

            await backtestRepository.UpdateBundleBacktestRequestAsync(bundleRequest);

            // Send webhook notification if bundle request just completed
            if (previousStatus != BundleBacktestRequestStatus.Completed &&
                bundleRequest.Status == BundleBacktestRequestStatus.Completed &&
                !string.IsNullOrEmpty(user.TelegramChannel))
            {
                var completedAt = bundleRequest.CompletedAt ?? DateTime.UtcNow;
                var duration = completedAt - bundleRequest.CreatedAt;
                var durationText = duration.TotalHours >= 1
                    ? $"{duration.Hours}h {duration.Minutes}m {duration.Seconds}s"
                    : duration.TotalMinutes >= 1
                        ? $"{duration.Minutes}m {duration.Seconds}s"
                        : $"{duration.Seconds}s";

                var message = $"✅ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) completed successfully. " +
                              $"Completed: {completedJobs}/{totalJobs} backtests" +
                              (failedJobs > 0 ? $", Failed: {failedJobs}" : "") +
                              $". Results: {completedJobResults.Count} backtest(s) generated. " +
                              $"Duration: {durationText}";

                await webhookService.SendMessage(message, user.TelegramChannel);
            }
            else if (previousStatus != BundleBacktestRequestStatus.Failed &&
                     bundleRequest.Status == BundleBacktestRequestStatus.Failed &&
                     !string.IsNullOrEmpty(user.TelegramChannel))
            {
                var completedAt = bundleRequest.CompletedAt ?? DateTime.UtcNow;
                var duration = completedAt - bundleRequest.CreatedAt;
                var durationText = duration.TotalHours >= 1
                    ? $"{duration.Hours}h {duration.Minutes}m {duration.Seconds}s"
                    : duration.TotalMinutes >= 1
                        ? $"{duration.Minutes}m {duration.Seconds}s"
                        : $"{duration.Seconds}s";

                var message = $"❌ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) failed. " +
                              $"All {totalJobs} backtests failed. Error: {bundleRequest.ErrorMessage}. " +
                              $"Duration: {durationText}";

                await webhookService.SendMessage(message, user.TelegramChannel);
            }

            _logger.LogInformation(
                "Updated bundle request {BundleRequestId} progress: {Completed}/{Total} completed, {Failed} failed, {Running} running",
                bundleRequestId, completedJobs, totalJobs, failedJobs, runningJobs);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Error updating bundle request {BundleRequestId} progress", bundleRequestId);
        }
    }
    private async Task StaleJobRecoveryLoop(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(TimeSpan.FromMinutes(1), cancellationToken); // Check every minute

                using var scope = _scopeFactory.CreateScope();
                var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

                // Get running jobs for this worker
                var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);

                // CRITICAL FIX: Check for jobs stuck at 100% progress
                // These jobs completed execution but their status wasn't updated to Completed
                // This causes the worker to think it's at max capacity
                var stuckCompletedJobs = runningJobs
                    .Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100)
                    .ToList();

                if (stuckCompletedJobs.Any())
                {
                    _logger.LogWarning(
                        "🔧 Found {Count} jobs stuck at 100% progress for worker {WorkerId}. Auto-completing them.",
                        stuckCompletedJobs.Count, _options.WorkerId);

                    foreach (var stuckJob in stuckCompletedJobs)
                    {
                        await AutoCompleteStuckJobAsync(stuckJob, jobRepository, scope.ServiceProvider);
                    }
                }

                // Also check for jobs that have been running for too long but haven't reached 100%
                var longRunningJobs = runningJobs
                    .Where(j => j.JobType == JobType.Backtest &&
                                j.ProgressPercentage < 100 &&
                                j.StartedAt.HasValue &&
                                (DateTime.UtcNow - j.StartedAt.Value) > TimeSpan.FromMinutes(_options.JobTimeoutMinutes + 10)) // Extra 10 min grace
                    .ToList();

                if (longRunningJobs.Any())
                {
                    _logger.LogWarning(
                        "🔧 Found {Count} jobs running longer than timeout for worker {WorkerId}. Marking as failed.",
                        longRunningJobs.Count, _options.WorkerId);

                    foreach (var longJob in longRunningJobs)
                    {
                        await HandleLongRunningJobAsync(longJob, jobRepository, scope.ServiceProvider);
                    }
                }

                // Get stale jobs for this worker
                var now = DateTime.UtcNow;
                var staleJobs = runningJobs
                    .Where(j => j.JobType == JobType.Backtest &&
                                j.ProgressPercentage < 100 && // Don't mark stuck-at-100% jobs as stale
                                (
                                    // Stale heartbeat (no heartbeat in timeout period)
                                    j.LastHeartbeat == null ||
                                    j.LastHeartbeat < now.AddMinutes(-_options.StaleJobTimeoutMinutes) ||
                                    // Job running too long (even with recent heartbeat)
                                    (j.StartedAt.HasValue &&
                                     j.StartedAt.Value < now.AddMinutes(-_options.JobTimeoutMinutes))
                                ))
                    .ToList();

                foreach (var job in staleJobs)
                {
                    var elapsed = job.StartedAt.HasValue
                        ? (DateTime.UtcNow - job.StartedAt.Value).TotalMinutes
                        : (double?)null;
                    var lastHeartbeatAge = job.LastHeartbeat.HasValue
                        ? (DateTime.UtcNow - job.LastHeartbeat.Value).TotalMinutes
                        : (double?)null;

                    _logger.LogWarning(
                        "Detected stale job {JobId}: Started {StartedAt}, LastHeartbeat: {LastHeartbeat} ({HeartbeatAge} min ago), Elapsed: {Elapsed} min",
                        job.Id, job.StartedAt, job.LastHeartbeat, lastHeartbeatAge, elapsed);

                    // Remove from running tasks if still tracked
                    _runningJobTasks.TryRemove(job.Id, out _);

                    // If it's stale but retryable, reset to pending with retry count
                    if (job.RetryCount < job.MaxRetries)
                    {
                        job.Status = JobStatus.Pending;
                        job.RetryCount++;
                        var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
                        job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
                        job.ErrorMessage = $"Job timeout/stale - retry {job.RetryCount}/{job.MaxRetries} (ran for {elapsed:F1} min)";
                        job.FailureCategory = FailureCategory.SystemError;
                        _logger.LogWarning(
                            "Stale job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}",
                            job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter);
                    }
                    else
                    {
                        // Exceeded retries - mark as failed
                        job.Status = JobStatus.Failed;
                        job.ErrorMessage = $"Job timeout/stale - exceeded max retries (ran for {elapsed:F1} min)";
                        job.FailureCategory = FailureCategory.SystemError;
                        job.IsRetryable = false;
                        job.CompletedAt = DateTime.UtcNow;

                        _logger.LogError(
                            "Stale job {JobId} exceeded max retries after running for {Elapsed} minutes",
                            job.Id, elapsed);

                        // Notify permanent failure
                        await NotifyPermanentFailure(job, new TimeoutException($"Job timeout after {elapsed:F1} minutes"), scope.ServiceProvider);

                        // Update bundle request if this is part of a bundle
                        if (job.BundleRequestId.HasValue)
                        {
                            await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider, job);
                        }
                    }

                    job.AssignedWorkerId = null;
                    job.LastHeartbeat = null;
                    await jobRepository.UpdateAsync(job);
                }

                if (staleJobs.Count > 0)
                {
                    _logger.LogInformation("Processed {Count} stale backtest jobs", staleJobs.Count);
                }
            }
            catch (OperationCanceledException)
            {
                // Expected during shutdown, don't log as error
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in stale job recovery loop");
            }
        }
    }
    private async Task HeartbeatLoop(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(TimeSpan.FromSeconds(_options.HeartbeatIntervalSeconds), cancellationToken);

                using var scope = _scopeFactory.CreateScope();
                var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

                // Update heartbeat for all jobs assigned to this worker
                var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);

                foreach (var job in runningJobs)
                {
                    job.LastHeartbeat = DateTime.UtcNow;
                    await jobRepository.UpdateAsync(job);
                }
            }
            catch (OperationCanceledException)
            {
                // Expected during shutdown, don't log as error
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in heartbeat loop");
            }
        }
    }

    private async Task ProgressPersistenceLoop(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(TimeSpan.FromSeconds(2), cancellationToken); // Check every 2 seconds

                using var scope = _scopeFactory.CreateScope();
                var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

                // Process all progress trackers that need persistence
                var trackersToPersist = _jobProgressTrackers
                    .Where(kvp => kvp.Value.ShouldPersist())
                    .ToList();

                if (trackersToPersist.Count > 0)
                {
                    _logger.LogDebug("Persisting progress for {Count} jobs", trackersToPersist.Count);

                    foreach (var (jobId, tracker) in trackersToPersist)
                    {
                        try
                        {
                            var (percentage, lastUpdate) = tracker.GetProgressForPersistence();

                            // Get and update the job
                            var job = await jobRepository.GetByIdAsync(jobId);
                            if (job != null && job.Status == JobStatus.Running)
                            {
                                job.ProgressPercentage = percentage;
                                job.LastHeartbeat = lastUpdate;
                                await jobRepository.UpdateAsync(job);

                                _logger.LogDebug("Persisted progress {Percentage}% for job {JobId}", percentage, jobId);
                            }
                        }
                        catch (Exception ex)
                        {
                            _logger.LogWarning(ex, "Error persisting progress for job {JobId}", jobId);
                        }
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Expected during shutdown, don't log as error
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in progress persistence loop");
            }
        }
    }
    private async Task HandleJobFailure(
        Job job,
        Exception ex,
        IJobRepository jobRepository,
        IServiceProvider serviceProvider)
    {
        try
        {
            // Categorize the failure
            var failureCategory = CategorizeFailure(ex);
            var isRetryable = IsRetryableFailure(ex, failureCategory);

            // Check if we should retry
            if (isRetryable && job.RetryCount < job.MaxRetries)
            {
                // Calculate exponential backoff: 2^retryCount minutes, capped at MaxRetryDelayMinutes
                var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
                job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
                job.RetryCount++;
                job.Status = JobStatus.Pending; // Reset to pending for retry
                job.AssignedWorkerId = null; // Allow any worker to pick it up
                job.ErrorMessage = $"Retry {job.RetryCount}/{job.MaxRetries}: {ex.Message}";
                job.FailureCategory = failureCategory;
                job.IsRetryable = true;

                _logger.LogWarning(
                    "Job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}. Error: {Error}",
                    job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter, ex.Message);
            }
            else
            {
                // Permanent failure - mark as failed
                job.Status = JobStatus.Failed;
                job.ErrorMessage = ex.Message;
                job.FailureCategory = failureCategory;
                job.IsRetryable = false;
                job.CompletedAt = DateTime.UtcNow;

                _logger.LogError(
                    "Job {JobId} failed permanently after {RetryCount} retries. Error: {Error}",
                    job.Id, job.RetryCount, ex.Message);

                // Send notification for permanent failure
                await NotifyPermanentFailure(job, ex, serviceProvider);

                // Update bundle request if this is part of a bundle
                if (job.BundleRequestId.HasValue)
                {
                    await UpdateBundleRequestProgress(job.BundleRequestId.Value, serviceProvider, job);
                }
            }

            job.LastHeartbeat = DateTime.UtcNow;
            await jobRepository.UpdateAsync(job);
        }
        catch (Exception updateEx)
        {
            _logger.LogError(updateEx, "Failed to update job {JobId} status after failure", job.Id);
        }
    }
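    // Note on the retry backoff used above (and in HandleJobCancellation and StaleJobRecoveryLoop):
    // Math.Min(Math.Pow(2, retryCount), MaxRetryDelayMinutes) yields an exponential delay in minutes.
    // With the default MaxRetryDelayMinutes of 60, retryCount values 0, 1, 2, 3, ... map to delays of
    // 1, 2, 4, 8, ... minutes, capped at 60. HandleJobFailure computes the delay from the pre-increment
    // RetryCount, while the cancellation and stale-recovery paths increment first, so their first retry
    // waits 2 minutes rather than 1.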
    private FailureCategory CategorizeFailure(Exception ex)
    {
        return ex switch
        {
            TimeoutException => FailureCategory.Transient,
            TaskCanceledException => FailureCategory.Transient,
            HttpRequestException => FailureCategory.Transient,
            InvalidOperationException when ex.Message.Contains("candles") || ex.Message.Contains("No candles") => FailureCategory.DataError,
            InvalidOperationException when ex.Message.Contains("User") || ex.Message.Contains("not found") => FailureCategory.UserError,
            OutOfMemoryException => FailureCategory.SystemError,
            _ => FailureCategory.Unknown
        };
    }

    private bool IsRetryableFailure(Exception ex, FailureCategory category)
    {
        // Don't retry user errors or data errors (missing candles, invalid config)
        if (category == FailureCategory.UserError || category == FailureCategory.DataError)
            return false;

        // Retry transient and system errors
        return category == FailureCategory.Transient || category == FailureCategory.SystemError;
    }
    private async Task AutoCompleteStuckJobAsync(Job stuckJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
    {
        try
        {
            _logger.LogWarning(
                "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.",
                stuckJob.Id, stuckJob.StartedAt);

            stuckJob.Status = JobStatus.Completed;
            stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow;
            stuckJob.LastHeartbeat = DateTime.UtcNow;

            // Add note to error message if not already set
            if (string.IsNullOrEmpty(stuckJob.ErrorMessage))
            {
                stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)";
            }

            await jobRepository.UpdateAsync(stuckJob);

            // Clean up progress tracker if still present
            _jobProgressTrackers.TryRemove(stuckJob.Id, out _);
            _runningJobTasks.TryRemove(stuckJob.Id, out _);

            // Update bundle request if this is part of a bundle
            if (stuckJob.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, serviceProvider, stuckJob);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id);
                }
            }

            _logger.LogInformation(
                "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.",
                stuckJob.Id);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error auto-completing stuck job {JobId}", stuckJob.Id);
        }
    }
    private async Task HandleLongRunningJobAsync(Job longJob, IJobRepository jobRepository, IServiceProvider serviceProvider)
    {
        try
        {
            var elapsed = longJob.StartedAt.HasValue
                ? (DateTime.UtcNow - longJob.StartedAt.Value).TotalMinutes
                : 0;

            _logger.LogWarning(
                "🔧 Job {JobId} has been running for {Elapsed:F1} minutes (timeout: {TimeoutMinutes} minutes). Failing job.",
                longJob.Id, elapsed, _options.JobTimeoutMinutes);

            // Mark as failed
            longJob.Status = JobStatus.Failed;
            longJob.ErrorMessage = $"Job exceeded maximum runtime of {_options.JobTimeoutMinutes} minutes";
            longJob.FailureCategory = FailureCategory.SystemError;
            longJob.IsRetryable = false;
            longJob.CompletedAt = DateTime.UtcNow;
            longJob.AssignedWorkerId = null;
            longJob.LastHeartbeat = DateTime.UtcNow;

            await jobRepository.UpdateAsync(longJob);

            // Clean up
            _jobProgressTrackers.TryRemove(longJob.Id, out _);
            _runningJobTasks.TryRemove(longJob.Id, out _);

            // Update bundle request if this is part of a bundle
            if (longJob.BundleRequestId.HasValue)
            {
                try
                {
                    await UpdateBundleRequestProgress(longJob.BundleRequestId.Value, serviceProvider, longJob);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error updating bundle request progress for long-running job {JobId}", longJob.Id);
                }
            }

            // Notify about permanent failure
            await NotifyPermanentFailure(longJob, new TimeoutException($"Job exceeded {elapsed:F1} minutes"), serviceProvider);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error handling long-running job {JobId}", longJob.Id);
        }
    }
    private async Task NotifyPermanentFailure(
        Job job,
        Exception ex,
        IServiceProvider serviceProvider)
    {
        try
        {
            var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
            const string alertsChannel = "2676086723";

            var jobTypeName = job.JobType == JobType.Genetic ? "Genetic" : "Backtest";
            var message = $"🚨 **{jobTypeName} Job Failed Permanently**\n" +
                          $"Job ID: `{job.Id}`\n" +
                          $"User ID: {job.UserId}\n" +
                          $"Retry Attempts: {job.RetryCount}/{job.MaxRetries}\n" +
                          $"Failure Category: {job.FailureCategory}\n" +
                          $"Error: {ex.Message}\n" +
                          $"Time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss} UTC";

            await webhookService.SendMessage(message, alertsChannel);
        }
        catch (Exception notifyEx)
        {
            _logger.LogError(notifyEx, "Failed to send permanent failure notification for job {JobId}", job.Id);
        }
    }

    public override void Dispose()
    {
        _shutdownCts?.Cancel();
        _shutdownCts?.Dispose();
        _instanceSemaphore?.Dispose();
        base.Dispose();
    }
}
/// <summary>
/// Tracks job progress with batched database updates for performance optimization
/// </summary>
public class JobProgressTracker
{
    private readonly object _lock = new();
    private int _lastPersistedPercentage;
    private DateTime _lastPersistedTime;
    private readonly ILogger _logger;

    public Guid JobId { get; }
    public int CurrentPercentage { get; private set; }
    public DateTime LastUpdateTime { get; private set; }

    public JobProgressTracker(Guid jobId, ILogger logger)
    {
        JobId = jobId;
        _logger = logger;
        _lastPersistedTime = DateTime.UtcNow;
    }

    /// <summary>
    /// Updates progress in memory only - thread safe
    /// </summary>
    public void UpdateProgress(int percentage)
    {
        lock (_lock)
        {
            CurrentPercentage = percentage;
            LastUpdateTime = DateTime.UtcNow;
        }
    }

    /// <summary>
    /// Checks if progress should be persisted to database based on time/percentage thresholds
    /// </summary>
    public bool ShouldPersist(int progressUpdateIntervalMs = 5000, int percentageThreshold = 5)
    {
        lock (_lock)
        {
            var timeSinceLastPersist = (DateTime.UtcNow - _lastPersistedTime).TotalMilliseconds;
            var percentageSinceLastPersist = CurrentPercentage - _lastPersistedPercentage;

            return timeSinceLastPersist >= progressUpdateIntervalMs ||
                   percentageSinceLastPersist >= percentageThreshold ||
                   CurrentPercentage >= 100; // Always persist completion
        }
    }

    /// <summary>
    /// Gets current progress and marks as persisted
    /// </summary>
    public (int percentage, DateTime lastUpdate) GetProgressForPersistence()
    {
        lock (_lock)
        {
            var percentage = CurrentPercentage;
            var lastUpdate = LastUpdateTime;

            _lastPersistedPercentage = percentage;
            _lastPersistedTime = DateTime.UtcNow;

            return (percentage, lastUpdate);
        }
    }
}
/// <summary>
/// Configuration options for BacktestComputeWorker
/// </summary>
public class BacktestComputeWorkerOptions
{
    public const string SectionName = "BacktestComputeWorker";

    /// <summary>
    /// Unique identifier for this worker instance
    /// </summary>
    public string WorkerId { get; set; } = Environment.MachineName;

    /// <summary>
    /// Maximum number of concurrent backtests per user (global limit across all workers)
    /// </summary>
    public int MaxConcurrentPerUser { get; set; } = 6;

    /// <summary>
    /// Maximum number of concurrent backtests per worker instance (local limit for this worker)
    /// </summary>
    public int MaxConcurrentPerInstance { get; set; } = 6;

    /// <summary>
    /// Interval in seconds between job polling attempts
    /// </summary>
    public int JobPollIntervalSeconds { get; set; } = 5;

    /// <summary>
    /// Interval in seconds between heartbeat updates
    /// </summary>
    public int HeartbeatIntervalSeconds { get; set; } = 30;

    /// <summary>
    /// Timeout in minutes for considering a job stale
    /// </summary>
    public int StaleJobTimeoutMinutes { get; set; } = 5;

    /// <summary>
    /// Default maximum retry attempts for failed jobs
    /// </summary>
    public int DefaultMaxRetries { get; set; } = 3;

    /// <summary>
    /// Maximum retry delay in minutes (cap for exponential backoff)
    /// </summary>
    public int MaxRetryDelayMinutes { get; set; } = 60;

    /// <summary>
    /// Maximum time in minutes a job can run before being considered timed out
    /// </summary>
    public int JobTimeoutMinutes { get; set; } = 60;

    /// <summary>
    /// Timeout in minutes to wait for running jobs during graceful shutdown
    /// </summary>
    public int GracefulShutdownTimeoutMinutes { get; set; } = 5;
}
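// Example wiring (illustrative sketch, not part of the original file): the worker is a
// BackgroundService whose options bind to the "BacktestComputeWorker" configuration section.
// A typical composition root might register it roughly like this (the `builder` variable and
// the exact appsettings layout are assumptions, not taken from this codebase):
//
//   builder.Services.Configure<BacktestComputeWorkerOptions>(
//       builder.Configuration.GetSection(BacktestComputeWorkerOptions.SectionName));
//   builder.Services.AddHostedService<BacktestComputeWorker>();
//
// and a matching appsettings.json section could look like:
//
//   "BacktestComputeWorker": {
//     "MaxConcurrentPerUser": 6,
//     "MaxConcurrentPerInstance": 6,
//     "JobPollIntervalSeconds": 5,
//     "HeartbeatIntervalSeconds": 30,
//     "StaleJobTimeoutMinutes": 5,
//     "JobTimeoutMinutes": 60,
//     "GracefulShutdownTimeoutMinutes": 5
//   }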