Improve workers for backtests

This commit is contained in:
2025-11-10 01:44:33 +07:00
parent 97f2b8229b
commit 7e52b7a734
18 changed files with 740 additions and 144 deletions

View File

@@ -250,5 +250,58 @@ public class JobService
throw;
}
}
/// <summary>
/// Resets a job whose status is Failed or Cancelled back to Pending and persists the change.
/// </summary>
/// <param name="jobId">Identifier of the job to retry.</param>
/// <returns>The job instance after its state has been reset and saved.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no job with the given ID is found, or when the job's status is
/// neither Failed nor Cancelled.
/// </exception>
public async Task<Job> RetryJobAsync(Guid jobId)
{
    var jobToRetry = await _jobRepository.GetByIdAsync(jobId);

    // Guard: the job must exist.
    if (jobToRetry == null)
    {
        throw new InvalidOperationException($"Job with ID {jobId} not found.");
    }

    // Guard: manual retry is limited to Failed or Cancelled jobs.
    // Running jobs are left to stale job recovery rather than manual retry.
    var currentStatus = jobToRetry.Status;
    var canBeRetried = currentStatus == JobStatus.Failed || currentStatus == JobStatus.Cancelled;
    if (!canBeRetried)
    {
        throw new InvalidOperationException($"Cannot retry job with status {currentStatus}. Only Failed or Cancelled jobs can be retried.");
    }

    // Put the job back into its pending state and drop the execution bookkeeping.
    jobToRetry.Status = JobStatus.Pending;
    jobToRetry.AssignedWorkerId = null;
    jobToRetry.LastHeartbeat = null;
    jobToRetry.StartedAt = null;
    jobToRetry.CompletedAt = null;
    jobToRetry.ProgressPercentage = 0;
    jobToRetry.RetryAfter = null;

    // ErrorMessage and RetryCount are intentionally left untouched here:
    // the message stays for reference and the count keeps tracking total retries.
    // The job is flagged as retryable again.
    jobToRetry.IsRetryable = true;

    await _jobRepository.UpdateAsync(jobToRetry);

    _logger.LogInformation("Job {JobId} reset to Pending status for retry", jobId);

    return jobToRetry;
}
/// <summary>
/// Removes the job with the given ID through the job repository and logs the deletion.
/// </summary>
/// <param name="jobId">Identifier of the job to delete.</param>
/// <exception cref="InvalidOperationException">
/// Documented as thrown when the job cannot be found. This method performs no
/// existence check itself, so any such exception would have to originate from
/// the repository's DeleteAsync — NOTE(review): confirm against the repository.
/// </exception>
public async Task DeleteJobAsync(Guid jobId)
{
    // Start the repository delete, then wait for it to finish before logging.
    var pendingDelete = _jobRepository.DeleteAsync(jobId);
    await pendingDelete;

    _logger.LogInformation("Deleted job {JobId}", jobId);
}
}

View File

@@ -1,6 +1,5 @@
using System.Text.Json;
using GeneticSharp;
using Managing.Application.Abstractions.Grains;
using Managing.Application.Abstractions.Repositories;
using Managing.Application.Abstractions.Services;
using Managing.Core;
@@ -28,6 +27,7 @@ public class GeneticService : IGeneticService
private readonly IMessengerService _messengerService;
private readonly IServiceScopeFactory _serviceScopeFactory;
private readonly IGrainFactory _grainFactory;
private readonly IJobRepository _jobRepository;
// Predefined parameter ranges for each indicator (matching backtestGenetic.tsx)
public static readonly Dictionary<string, (double min, double max)> ParameterRanges = new()
@@ -196,7 +196,8 @@ public class GeneticService : IGeneticService
ILogger<GeneticService> logger,
IMessengerService messengerService,
IServiceScopeFactory serviceScopeFactory,
IGrainFactory grainFactory)
IGrainFactory grainFactory,
IJobRepository jobRepository)
{
_geneticRepository = geneticRepository;
_backtester = backtester;
@@ -204,9 +205,10 @@ public class GeneticService : IGeneticService
_messengerService = messengerService;
_serviceScopeFactory = serviceScopeFactory;
_grainFactory = grainFactory;
_jobRepository = jobRepository;
}
public GeneticRequest CreateGeneticRequest(
public async Task<GeneticRequest> CreateGeneticRequestAsync(
User user,
Ticker ticker,
Timeframe timeframe,
@@ -245,15 +247,31 @@ public class GeneticService : IGeneticService
_geneticRepository.InsertGeneticRequestForUser(user, geneticRequest);
// Trigger Orleans grain to process this request asynchronously
// Create a single job for this genetic request that will run until completion
try
{
var grain = _grainFactory.GetGrain<IGeneticBacktestGrain>(id);
_ = grain.ProcessGeneticRequestAsync();
var job = new Job
{
UserId = user.Id,
Status = JobStatus.Pending,
JobType = JobType.Genetic,
Priority = 0,
ConfigJson = "{}", // Not needed for genetic jobs, GeneticRequestId is used
StartDate = startDate,
EndDate = endDate,
GeneticRequestId = id,
RetryCount = 0,
MaxRetries = 3,
IsRetryable = true
};
await _jobRepository.CreateAsync(job);
_logger.LogInformation("Created genetic job {JobId} for genetic request {RequestId}", job.Id, id);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to trigger GeneticBacktestGrain for request {RequestId}", id);
_logger.LogError(ex, "Failed to create job for genetic request {RequestId}", id);
throw;
}
return geneticRequest;
@@ -365,31 +383,83 @@ public class GeneticService : IGeneticService
var generationCount = 0;
ga.GenerationRan += async (sender, e) =>
{
generationCount = ga.GenerationsNumber;
// Update progress every generation
var bestFitness = ga.BestChromosome?.Fitness ?? 0;
request.CurrentGeneration = generationCount;
request.BestFitnessSoFar = bestFitness;
if (ga.BestChromosome is TradingBotChromosome bestChromosome)
try
{
var genes = bestChromosome.GetGenes();
var geneValues = genes.Select(g =>
generationCount = ga.GenerationsNumber;
// Update progress every generation
var bestFitness = ga.BestChromosome?.Fitness ?? 0;
var bestChromosomeJson = (string?)null;
var bestIndividual = (string?)null;
if (ga.BestChromosome is TradingBotChromosome bestChromosome)
{
if (g.Value is double doubleValue) return doubleValue;
if (g.Value is int intValue) return (double)intValue;
return Convert.ToDouble(g.Value.ToString());
}).ToArray();
request.BestChromosome = JsonSerializer.Serialize(geneValues);
var genes = bestChromosome.GetGenes();
var geneValues = genes.Select(g =>
{
if (g.Value is double doubleValue) return doubleValue;
if (g.Value is int intValue) return (double)intValue;
return Convert.ToDouble(g.Value.ToString());
}).ToArray();
bestChromosomeJson = JsonSerializer.Serialize(geneValues);
bestIndividual = bestChromosome.ToString();
}
// Update ProgressInfo with current generation information
var progressInfo = JsonSerializer.Serialize(new
{
generation = generationCount,
best_fitness = bestFitness,
population_size = request.PopulationSize,
generations = request.Generations,
updated_at = DateTime.UtcNow
});
// Update the domain object for local use
request.CurrentGeneration = generationCount;
request.BestFitnessSoFar = bestFitness;
request.BestChromosome = bestChromosomeJson;
request.BestIndividual = bestIndividual;
request.ProgressInfo = progressInfo;
// Update the database with current generation progress using a new scope
// This prevents DbContext concurrency issues when running in parallel
await ServiceScopeHelpers.WithScopedService<IGeneticService>(
_serviceScopeFactory,
async geneticService =>
{
// Reload the request from the database in the new scope
// Use the user from the original request to get the request by ID
var dbRequest = geneticService.GetGeneticRequestByIdForUser(request.User, request.RequestId);
if (dbRequest != null)
{
// Update the loaded request with current generation data
dbRequest.CurrentGeneration = generationCount;
dbRequest.BestFitnessSoFar = bestFitness;
dbRequest.BestChromosome = bestChromosomeJson;
dbRequest.BestIndividual = bestIndividual;
dbRequest.ProgressInfo = progressInfo;
// Save the update
await geneticService.UpdateGeneticRequestAsync(dbRequest);
}
});
_logger.LogDebug("Updated genetic request {RequestId} at generation {Generation} with fitness {Fitness}",
request.RequestId, generationCount, bestFitness);
// Check for cancellation
if (cancellationToken.IsCancellationRequested)
{
ga.Stop();
}
}
await UpdateGeneticRequestAsync(request);
// Check for cancellation
if (cancellationToken.IsCancellationRequested)
catch (Exception ex)
{
ga.Stop();
_logger.LogError(ex, "Error updating genetic request {RequestId} at generation {Generation}",
request.RequestId, generationCount);
// Don't throw - continue with next generation
}
};
@@ -421,11 +491,27 @@ public class GeneticService : IGeneticService
_logger.LogInformation("Genetic algorithm completed for request {RequestId}. Best fitness: {Fitness}",
request.RequestId, bestFitness);
// Update request with results
// Update request with final results
request.Status = GeneticRequestStatus.Completed;
request.CompletedAt = DateTime.UtcNow;
request.BestFitness = bestFitness;
request.BestIndividual = bestChromosome?.ToString() ?? "unknown";
request.CurrentGeneration = ga.GenerationsNumber;
request.BestFitnessSoFar = bestFitness;
// Update BestChromosome if not already set
if (bestChromosome != null && string.IsNullOrEmpty(request.BestChromosome))
{
var genes = bestChromosome.GetGenes();
var geneValues = genes.Select(g =>
{
if (g.Value is double doubleValue) return doubleValue;
if (g.Value is int intValue) return (double)intValue;
return Convert.ToDouble(g.Value.ToString());
}).ToArray();
request.BestChromosome = JsonSerializer.Serialize(geneValues);
}
request.ProgressInfo = JsonSerializer.Serialize(new
{
generation = ga.GenerationsNumber,
@@ -436,6 +522,9 @@ public class GeneticService : IGeneticService
});
await UpdateGeneticRequestAsync(request);
_logger.LogInformation("Final update completed for genetic request {RequestId}. Generation: {Generation}, Best Fitness: {Fitness}",
request.RequestId, ga.GenerationsNumber, bestFitness);
// Send notification about the completed genetic algorithm
try

View File

@@ -21,7 +21,7 @@ public class BacktestComputeWorker : BackgroundService
private readonly IServiceScopeFactory _scopeFactory;
private readonly ILogger<BacktestComputeWorker> _logger;
private readonly BacktestComputeWorkerOptions _options;
private readonly SemaphoreSlim _semaphore;
private readonly SemaphoreSlim _instanceSemaphore;
public BacktestComputeWorker(
IServiceScopeFactory scopeFactory,
@@ -31,14 +31,14 @@ public class BacktestComputeWorker : BackgroundService
_scopeFactory = scopeFactory;
_logger = logger;
_options = options.Value;
_semaphore = new SemaphoreSlim(_options.MaxConcurrentBacktests, _options.MaxConcurrentBacktests);
_instanceSemaphore = new SemaphoreSlim(_options.MaxConcurrentPerInstance, _options.MaxConcurrentPerInstance);
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation(
"BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrent: {MaxConcurrent}, PollInterval: {PollInterval}s",
_options.WorkerId, _options.MaxConcurrentBacktests, _options.JobPollIntervalSeconds);
"BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrentPerUser: {MaxConcurrentPerUser}, MaxConcurrentPerInstance: {MaxConcurrentPerInstance}, PollInterval: {PollInterval}s",
_options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance, _options.JobPollIntervalSeconds);
// Background task for stale job recovery
var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(stoppingToken), stoppingToken);
@@ -67,10 +67,10 @@ public class BacktestComputeWorker : BackgroundService
private async Task ProcessJobsAsync(CancellationToken cancellationToken)
{
// Check if we have capacity
if (!await _semaphore.WaitAsync(0, cancellationToken))
// Check if this instance has capacity
if (!await _instanceSemaphore.WaitAsync(0, cancellationToken))
{
// At capacity, skip this iteration
// Instance at capacity, skip this iteration
return;
}
@@ -79,17 +79,23 @@ public class BacktestComputeWorker : BackgroundService
using var scope = _scopeFactory.CreateScope();
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
// Try to claim a backtest job (exclude genetic jobs)
var job = await jobRepository.ClaimNextJobAsync(_options.WorkerId, JobType.Backtest);
// Claim a random backtest job atomically, excluding users at capacity
// The SQL query checks running job counts within the transaction, ensuring thread-safety
var job = await jobRepository.ClaimRandomJobAsync(
_options.WorkerId,
JobType.Backtest,
_options.MaxConcurrentPerUser);
if (job == null)
{
// No jobs available, release semaphore
_semaphore.Release();
// No jobs available for users not at capacity, release semaphore
_instanceSemaphore.Release();
return;
}
_logger.LogInformation("Claimed backtest job {JobId} for worker {WorkerId}", job.Id, _options.WorkerId);
_logger.LogInformation(
"Claimed random backtest job {JobId} (UserId: {UserId}) for worker {WorkerId}",
job.Id, job.UserId, _options.WorkerId);
// Process the job asynchronously (don't await, let it run in background)
// Create a new scope for the job processing to ensure proper lifetime management
@@ -99,16 +105,21 @@ public class BacktestComputeWorker : BackgroundService
{
await ProcessJobAsync(job, cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing job {JobId}", job.Id);
throw;
}
finally
{
_semaphore.Release();
_instanceSemaphore.Release();
}
}, cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error claiming or processing job");
_semaphore.Release();
_instanceSemaphore.Release();
throw;
}
}
@@ -560,7 +571,7 @@ public class BacktestComputeWorker : BackgroundService
public override void Dispose()
{
_semaphore?.Dispose();
_instanceSemaphore?.Dispose();
base.Dispose();
}
}
@@ -578,9 +589,14 @@ public class BacktestComputeWorkerOptions
public string WorkerId { get; set; } = Environment.MachineName;
/// <summary>
/// Maximum number of concurrent backtests to process
/// Maximum number of concurrent backtests per user (global limit across all workers)
/// </summary>
public int MaxConcurrentBacktests { get; set; } = 6;
public int MaxConcurrentPerUser { get; set; } = 6;
/// <summary>
/// Maximum number of concurrent backtests per worker instance (local limit for this worker)
/// </summary>
public int MaxConcurrentPerInstance { get; set; } = 6;
/// <summary>
/// Interval in seconds between job polling attempts