Improve workers for backtests
This commit is contained in:
@@ -250,5 +250,58 @@ public class JobService
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Puts a job that ended as Failed or Cancelled back into the Pending state so it can be run again.
/// </summary>
/// <param name="jobId">Identifier of the job to retry</param>
/// <returns>The job instance after its state has been reset and persisted</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no job with the given ID is found, or when the job's status is neither Failed nor Cancelled
/// </exception>
public async Task<Job> RetryJobAsync(Guid jobId)
{
    var job = await _jobRepository.GetByIdAsync(jobId);
    if (job is null)
    {
        throw new InvalidOperationException($"Job with ID {jobId} not found.");
    }

    // Manual retry is restricted to Failed/Cancelled jobs; Running jobs are left to
    // stale job recovery rather than being reset by hand.
    var canBeRetried = job.Status == JobStatus.Failed || job.Status == JobStatus.Cancelled;
    if (!canBeRetried)
    {
        throw new InvalidOperationException($"Cannot retry job with status {job.Status}. Only Failed or Cancelled jobs can be retried.");
    }

    // Queue the job again and mark it as retryable.
    job.Status = JobStatus.Pending;
    job.IsRetryable = true;

    // Drop the previous worker assignment and its heartbeat.
    job.AssignedWorkerId = null;
    job.LastHeartbeat = null;

    // Forget the timing and progress of the previous attempt.
    job.StartedAt = null;
    job.CompletedAt = null;
    job.RetryAfter = null;
    job.ProgressPercentage = 0;

    // ErrorMessage is deliberately left as-is for reference, and RetryCount is kept
    // so the total number of retries stays tracked.

    await _jobRepository.UpdateAsync(job);
    _logger.LogInformation("Job {JobId} reset to Pending status for retry", jobId);

    return job;
}
|
||||
|
||||
/// <summary>
/// Removes a job by delegating to the job repository, then logs the deletion.
/// </summary>
/// <param name="jobId">Identifier of the job to delete</param>
/// <returns>A task that completes once the repository delete has finished and the log entry is written</returns>
/// <exception cref="InvalidOperationException">
/// Thrown if job cannot be found.
/// NOTE(review): this method performs no existence check itself, so any such exception
/// would have to originate from the repository's DeleteAsync - confirm against the repository.
/// </exception>
public async Task DeleteJobAsync(Guid jobId)
{
    // The repository call is the only operation; no lookup or status validation happens here.
    await _jobRepository.DeleteAsync(jobId);

    _logger.LogInformation("Deleted job {JobId}", jobId);
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
using System.Text.Json;
|
||||
using GeneticSharp;
|
||||
using Managing.Application.Abstractions.Grains;
|
||||
using Managing.Application.Abstractions.Repositories;
|
||||
using Managing.Application.Abstractions.Services;
|
||||
using Managing.Core;
|
||||
@@ -28,6 +27,7 @@ public class GeneticService : IGeneticService
|
||||
private readonly IMessengerService _messengerService;
|
||||
private readonly IServiceScopeFactory _serviceScopeFactory;
|
||||
private readonly IGrainFactory _grainFactory;
|
||||
private readonly IJobRepository _jobRepository;
|
||||
|
||||
// Predefined parameter ranges for each indicator (matching backtestGenetic.tsx)
|
||||
public static readonly Dictionary<string, (double min, double max)> ParameterRanges = new()
|
||||
@@ -196,7 +196,8 @@ public class GeneticService : IGeneticService
|
||||
ILogger<GeneticService> logger,
|
||||
IMessengerService messengerService,
|
||||
IServiceScopeFactory serviceScopeFactory,
|
||||
IGrainFactory grainFactory)
|
||||
IGrainFactory grainFactory,
|
||||
IJobRepository jobRepository)
|
||||
{
|
||||
_geneticRepository = geneticRepository;
|
||||
_backtester = backtester;
|
||||
@@ -204,9 +205,10 @@ public class GeneticService : IGeneticService
|
||||
_messengerService = messengerService;
|
||||
_serviceScopeFactory = serviceScopeFactory;
|
||||
_grainFactory = grainFactory;
|
||||
_jobRepository = jobRepository;
|
||||
}
|
||||
|
||||
public GeneticRequest CreateGeneticRequest(
|
||||
public async Task<GeneticRequest> CreateGeneticRequestAsync(
|
||||
User user,
|
||||
Ticker ticker,
|
||||
Timeframe timeframe,
|
||||
@@ -245,15 +247,31 @@ public class GeneticService : IGeneticService
|
||||
|
||||
_geneticRepository.InsertGeneticRequestForUser(user, geneticRequest);
|
||||
|
||||
// Trigger Orleans grain to process this request asynchronously
|
||||
// Create a single job for this genetic request that will run until completion
|
||||
try
|
||||
{
|
||||
var grain = _grainFactory.GetGrain<IGeneticBacktestGrain>(id);
|
||||
_ = grain.ProcessGeneticRequestAsync();
|
||||
var job = new Job
|
||||
{
|
||||
UserId = user.Id,
|
||||
Status = JobStatus.Pending,
|
||||
JobType = JobType.Genetic,
|
||||
Priority = 0,
|
||||
ConfigJson = "{}", // Not needed for genetic jobs, GeneticRequestId is used
|
||||
StartDate = startDate,
|
||||
EndDate = endDate,
|
||||
GeneticRequestId = id,
|
||||
RetryCount = 0,
|
||||
MaxRetries = 3,
|
||||
IsRetryable = true
|
||||
};
|
||||
|
||||
await _jobRepository.CreateAsync(job);
|
||||
_logger.LogInformation("Created genetic job {JobId} for genetic request {RequestId}", job.Id, id);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to trigger GeneticBacktestGrain for request {RequestId}", id);
|
||||
_logger.LogError(ex, "Failed to create job for genetic request {RequestId}", id);
|
||||
throw;
|
||||
}
|
||||
|
||||
return geneticRequest;
|
||||
@@ -365,31 +383,83 @@ public class GeneticService : IGeneticService
|
||||
var generationCount = 0;
|
||||
ga.GenerationRan += async (sender, e) =>
|
||||
{
|
||||
generationCount = ga.GenerationsNumber;
|
||||
|
||||
// Update progress every generation
|
||||
var bestFitness = ga.BestChromosome?.Fitness ?? 0;
|
||||
request.CurrentGeneration = generationCount;
|
||||
request.BestFitnessSoFar = bestFitness;
|
||||
|
||||
if (ga.BestChromosome is TradingBotChromosome bestChromosome)
|
||||
try
|
||||
{
|
||||
var genes = bestChromosome.GetGenes();
|
||||
var geneValues = genes.Select(g =>
|
||||
generationCount = ga.GenerationsNumber;
|
||||
|
||||
// Update progress every generation
|
||||
var bestFitness = ga.BestChromosome?.Fitness ?? 0;
|
||||
var bestChromosomeJson = (string?)null;
|
||||
var bestIndividual = (string?)null;
|
||||
|
||||
if (ga.BestChromosome is TradingBotChromosome bestChromosome)
|
||||
{
|
||||
if (g.Value is double doubleValue) return doubleValue;
|
||||
if (g.Value is int intValue) return (double)intValue;
|
||||
return Convert.ToDouble(g.Value.ToString());
|
||||
}).ToArray();
|
||||
request.BestChromosome = JsonSerializer.Serialize(geneValues);
|
||||
var genes = bestChromosome.GetGenes();
|
||||
var geneValues = genes.Select(g =>
|
||||
{
|
||||
if (g.Value is double doubleValue) return doubleValue;
|
||||
if (g.Value is int intValue) return (double)intValue;
|
||||
return Convert.ToDouble(g.Value.ToString());
|
||||
}).ToArray();
|
||||
bestChromosomeJson = JsonSerializer.Serialize(geneValues);
|
||||
bestIndividual = bestChromosome.ToString();
|
||||
}
|
||||
|
||||
// Update ProgressInfo with current generation information
|
||||
var progressInfo = JsonSerializer.Serialize(new
|
||||
{
|
||||
generation = generationCount,
|
||||
best_fitness = bestFitness,
|
||||
population_size = request.PopulationSize,
|
||||
generations = request.Generations,
|
||||
updated_at = DateTime.UtcNow
|
||||
});
|
||||
|
||||
// Update the domain object for local use
|
||||
request.CurrentGeneration = generationCount;
|
||||
request.BestFitnessSoFar = bestFitness;
|
||||
request.BestChromosome = bestChromosomeJson;
|
||||
request.BestIndividual = bestIndividual;
|
||||
request.ProgressInfo = progressInfo;
|
||||
|
||||
// Update the database with current generation progress using a new scope
|
||||
// This prevents DbContext concurrency issues when running in parallel
|
||||
await ServiceScopeHelpers.WithScopedService<IGeneticService>(
|
||||
_serviceScopeFactory,
|
||||
async geneticService =>
|
||||
{
|
||||
// Reload the request from the database in the new scope
|
||||
// Use the user from the original request to get the request by ID
|
||||
var dbRequest = geneticService.GetGeneticRequestByIdForUser(request.User, request.RequestId);
|
||||
|
||||
if (dbRequest != null)
|
||||
{
|
||||
// Update the loaded request with current generation data
|
||||
dbRequest.CurrentGeneration = generationCount;
|
||||
dbRequest.BestFitnessSoFar = bestFitness;
|
||||
dbRequest.BestChromosome = bestChromosomeJson;
|
||||
dbRequest.BestIndividual = bestIndividual;
|
||||
dbRequest.ProgressInfo = progressInfo;
|
||||
|
||||
// Save the update
|
||||
await geneticService.UpdateGeneticRequestAsync(dbRequest);
|
||||
}
|
||||
});
|
||||
|
||||
_logger.LogDebug("Updated genetic request {RequestId} at generation {Generation} with fitness {Fitness}",
|
||||
request.RequestId, generationCount, bestFitness);
|
||||
|
||||
// Check for cancellation
|
||||
if (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
ga.Stop();
|
||||
}
|
||||
}
|
||||
|
||||
await UpdateGeneticRequestAsync(request);
|
||||
|
||||
// Check for cancellation
|
||||
if (cancellationToken.IsCancellationRequested)
|
||||
catch (Exception ex)
|
||||
{
|
||||
ga.Stop();
|
||||
_logger.LogError(ex, "Error updating genetic request {RequestId} at generation {Generation}",
|
||||
request.RequestId, generationCount);
|
||||
// Don't throw - continue with next generation
|
||||
}
|
||||
};
|
||||
|
||||
@@ -421,11 +491,27 @@ public class GeneticService : IGeneticService
|
||||
_logger.LogInformation("Genetic algorithm completed for request {RequestId}. Best fitness: {Fitness}",
|
||||
request.RequestId, bestFitness);
|
||||
|
||||
// Update request with results
|
||||
// Update request with final results
|
||||
request.Status = GeneticRequestStatus.Completed;
|
||||
request.CompletedAt = DateTime.UtcNow;
|
||||
request.BestFitness = bestFitness;
|
||||
request.BestIndividual = bestChromosome?.ToString() ?? "unknown";
|
||||
request.CurrentGeneration = ga.GenerationsNumber;
|
||||
request.BestFitnessSoFar = bestFitness;
|
||||
|
||||
// Update BestChromosome if not already set
|
||||
if (bestChromosome != null && string.IsNullOrEmpty(request.BestChromosome))
|
||||
{
|
||||
var genes = bestChromosome.GetGenes();
|
||||
var geneValues = genes.Select(g =>
|
||||
{
|
||||
if (g.Value is double doubleValue) return doubleValue;
|
||||
if (g.Value is int intValue) return (double)intValue;
|
||||
return Convert.ToDouble(g.Value.ToString());
|
||||
}).ToArray();
|
||||
request.BestChromosome = JsonSerializer.Serialize(geneValues);
|
||||
}
|
||||
|
||||
request.ProgressInfo = JsonSerializer.Serialize(new
|
||||
{
|
||||
generation = ga.GenerationsNumber,
|
||||
@@ -436,6 +522,9 @@ public class GeneticService : IGeneticService
|
||||
});
|
||||
|
||||
await UpdateGeneticRequestAsync(request);
|
||||
|
||||
_logger.LogInformation("Final update completed for genetic request {RequestId}. Generation: {Generation}, Best Fitness: {Fitness}",
|
||||
request.RequestId, ga.GenerationsNumber, bestFitness);
|
||||
|
||||
// Send notification about the completed genetic algorithm
|
||||
try
|
||||
|
||||
@@ -21,7 +21,7 @@ public class BacktestComputeWorker : BackgroundService
|
||||
private readonly IServiceScopeFactory _scopeFactory;
|
||||
private readonly ILogger<BacktestComputeWorker> _logger;
|
||||
private readonly BacktestComputeWorkerOptions _options;
|
||||
private readonly SemaphoreSlim _semaphore;
|
||||
private readonly SemaphoreSlim _instanceSemaphore;
|
||||
|
||||
public BacktestComputeWorker(
|
||||
IServiceScopeFactory scopeFactory,
|
||||
@@ -31,14 +31,14 @@ public class BacktestComputeWorker : BackgroundService
|
||||
_scopeFactory = scopeFactory;
|
||||
_logger = logger;
|
||||
_options = options.Value;
|
||||
_semaphore = new SemaphoreSlim(_options.MaxConcurrentBacktests, _options.MaxConcurrentBacktests);
|
||||
_instanceSemaphore = new SemaphoreSlim(_options.MaxConcurrentPerInstance, _options.MaxConcurrentPerInstance);
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrent: {MaxConcurrent}, PollInterval: {PollInterval}s",
|
||||
_options.WorkerId, _options.MaxConcurrentBacktests, _options.JobPollIntervalSeconds);
|
||||
"BacktestComputeWorker starting. WorkerId: {WorkerId}, MaxConcurrentPerUser: {MaxConcurrentPerUser}, MaxConcurrentPerInstance: {MaxConcurrentPerInstance}, PollInterval: {PollInterval}s",
|
||||
_options.WorkerId, _options.MaxConcurrentPerUser, _options.MaxConcurrentPerInstance, _options.JobPollIntervalSeconds);
|
||||
|
||||
// Background task for stale job recovery
|
||||
var staleJobRecoveryTask = Task.Run(() => StaleJobRecoveryLoop(stoppingToken), stoppingToken);
|
||||
@@ -67,10 +67,10 @@ public class BacktestComputeWorker : BackgroundService
|
||||
|
||||
private async Task ProcessJobsAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
// Check if we have capacity
|
||||
if (!await _semaphore.WaitAsync(0, cancellationToken))
|
||||
// Check if this instance has capacity
|
||||
if (!await _instanceSemaphore.WaitAsync(0, cancellationToken))
|
||||
{
|
||||
// At capacity, skip this iteration
|
||||
// Instance at capacity, skip this iteration
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -79,17 +79,23 @@ public class BacktestComputeWorker : BackgroundService
|
||||
using var scope = _scopeFactory.CreateScope();
|
||||
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
|
||||
|
||||
// Try to claim a backtest job (exclude genetic jobs)
|
||||
var job = await jobRepository.ClaimNextJobAsync(_options.WorkerId, JobType.Backtest);
|
||||
// Claim a random backtest job atomically, excluding users at capacity
|
||||
// The SQL query checks running job counts within the transaction, ensuring thread-safety
|
||||
var job = await jobRepository.ClaimRandomJobAsync(
|
||||
_options.WorkerId,
|
||||
JobType.Backtest,
|
||||
_options.MaxConcurrentPerUser);
|
||||
|
||||
if (job == null)
|
||||
{
|
||||
// No jobs available, release semaphore
|
||||
_semaphore.Release();
|
||||
// No jobs available for users not at capacity, release semaphore
|
||||
_instanceSemaphore.Release();
|
||||
return;
|
||||
}
|
||||
|
||||
_logger.LogInformation("Claimed backtest job {JobId} for worker {WorkerId}", job.Id, _options.WorkerId);
|
||||
_logger.LogInformation(
|
||||
"Claimed random backtest job {JobId} (UserId: {UserId}) for worker {WorkerId}",
|
||||
job.Id, job.UserId, _options.WorkerId);
|
||||
|
||||
// Process the job asynchronously (don't await, let it run in background)
|
||||
// Create a new scope for the job processing to ensure proper lifetime management
|
||||
@@ -99,16 +105,21 @@ public class BacktestComputeWorker : BackgroundService
|
||||
{
|
||||
await ProcessJobAsync(job, cancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error processing job {JobId}", job.Id);
|
||||
throw;
|
||||
}
|
||||
finally
|
||||
{
|
||||
_semaphore.Release();
|
||||
_instanceSemaphore.Release();
|
||||
}
|
||||
}, cancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error claiming or processing job");
|
||||
_semaphore.Release();
|
||||
_instanceSemaphore.Release();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
@@ -560,7 +571,7 @@ public class BacktestComputeWorker : BackgroundService
|
||||
|
||||
public override void Dispose()
|
||||
{
|
||||
_semaphore?.Dispose();
|
||||
_instanceSemaphore?.Dispose();
|
||||
base.Dispose();
|
||||
}
|
||||
}
|
||||
@@ -578,9 +589,14 @@ public class BacktestComputeWorkerOptions
|
||||
public string WorkerId { get; set; } = Environment.MachineName;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of concurrent backtests to process
|
||||
/// Maximum number of concurrent backtests per user (global limit across all workers)
|
||||
/// </summary>
|
||||
public int MaxConcurrentBacktests { get; set; } = 6;
|
||||
public int MaxConcurrentPerUser { get; set; } = 6;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of concurrent backtests per worker instance (local limit for this worker)
|
||||
/// </summary>
|
||||
public int MaxConcurrentPerInstance { get; set; } = 6;
|
||||
|
||||
/// <summary>
|
||||
/// Interval in seconds between job polling attempts
|
||||
|
||||
Reference in New Issue
Block a user