Add genetic backtest to worker

2025-11-09 03:32:08 +07:00
parent 7dba29c66f
commit 7e08e63dd1
30 changed files with 5056 additions and 232 deletions

@@ -77,10 +77,10 @@ public class BacktestComputeWorker : BackgroundService
try
{
using var scope = _scopeFactory.CreateScope();
var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
// Try to claim a job
var job = await jobRepository.ClaimNextJobAsync(_options.WorkerId);
// Try to claim a backtest job (exclude genetic jobs)
var job = await jobRepository.ClaimNextJobAsync(_options.WorkerId, JobType.Backtest);
if (job == null)
{
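
The claim call above now passes a JobType, so this worker only picks up backtest jobs and leaves genetic jobs to the new genetic worker. The repository side of the claim is changed in another file of this commit and is not shown here; inferred from the call above, and assuming an EF Core-backed IJobRepository (the _db field, the CreatedAt ordering column and the parameter types are guesses), a type-filtered claim could look roughly like this:

    public async Task<Job?> ClaimNextJobAsync(string workerId, JobType jobType)
    {
        // Oldest pending job of the requested type whose retry delay (if any) has elapsed.
        var job = await _db.Jobs
            .Where(j => j.Status == JobStatus.Pending &&
                        j.JobType == jobType &&
                        (j.RetryAfter == null || j.RetryAfter <= DateTime.UtcNow))
            .OrderBy(j => j.CreatedAt)
            .FirstOrDefaultAsync();
        if (job == null)
            return null;

        // Mark the job as claimed by this worker. A real implementation needs this step to be
        // atomic (optimistic concurrency or SELECT ... FOR UPDATE SKIP LOCKED) so that two
        // workers cannot claim the same job.
        job.Status = JobStatus.Running;
        job.AssignedWorkerId = workerId;
        job.LastHeartbeat = DateTime.UtcNow;
        await _db.SaveChangesAsync();
        return job;
    }
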
@@ -114,11 +114,11 @@ public class BacktestComputeWorker : BackgroundService
}
private async Task ProcessJobAsync(
BacktestJob job,
Job job,
CancellationToken cancellationToken)
{
using var scope = _scopeFactory.CreateScope();
var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
var executor = scope.ServiceProvider.GetRequiredService<BacktestExecutor>();
var userService = scope.ServiceProvider.GetRequiredService<IUserService>();
var exchangeService = scope.ServiceProvider.GetRequiredService<IExchangeService>();
@@ -184,7 +184,7 @@ public class BacktestComputeWorker : BackgroundService
progressCallback: progressCallback);
// Update job with result
job.Status = BacktestJobStatus.Completed;
job.Status = JobStatus.Completed;
job.ProgressPercentage = 100;
job.ResultJson = JsonSerializer.Serialize(result);
job.CompletedAt = DateTime.UtcNow;
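
Throughout the file, the BacktestJob entity and BacktestJobStatus enum give way to a shared Job model used by both backtest and genetic jobs. Its real definition lives in other files of this commit; inferred only from how this worker uses it (including the retry fields further down), the shape is roughly the following, where the member names and enum values come from the diff but the types and nullability are guesses:

    public enum JobType { Backtest, Genetic }
    public enum JobStatus { Pending, Running, Completed, Failed }
    public enum FailureCategory { Transient, DataError, UserError, SystemError, Unknown }

    public class Job
    {
        public Guid Id { get; set; }
        public Guid UserId { get; set; }
        public JobType JobType { get; set; }
        public JobStatus Status { get; set; }
        public int ProgressPercentage { get; set; }
        public string? ResultJson { get; set; }
        public string? ErrorMessage { get; set; }
        public FailureCategory FailureCategory { get; set; }
        public bool IsRetryable { get; set; }
        public int RetryCount { get; set; }
        public int MaxRetries { get; set; }
        public DateTime? RetryAfter { get; set; }
        public string? AssignedWorkerId { get; set; }
        public DateTime? LastHeartbeat { get; set; }
        public DateTime? CompletedAt { get; set; }
        public Guid? BundleRequestId { get; set; }
    }
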
@@ -207,24 +207,7 @@ public class BacktestComputeWorker : BackgroundService
_logger.LogError(ex, "Error processing backtest job {JobId}", job.Id);
SentrySdk.CaptureException(ex);
// Update job status to failed
try
{
job.Status = BacktestJobStatus.Failed;
job.ErrorMessage = ex.Message;
job.CompletedAt = DateTime.UtcNow;
await jobRepository.UpdateAsync(job);
// Update bundle request if this is part of a bundle
if (job.BundleRequestId.HasValue)
{
await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider);
}
}
catch (Exception updateEx)
{
_logger.LogError(updateEx, "Error updating job {JobId} status to failed", job.Id);
}
await HandleJobFailure(job, ex, jobRepository, scope.ServiceProvider);
}
}
@@ -233,14 +216,15 @@ public class BacktestComputeWorker : BackgroundService
try
{
var backtestRepository = serviceProvider.GetRequiredService<IBacktestRepository>();
var jobRepository = serviceProvider.GetRequiredService<IBacktestJobRepository>();
var jobRepository = serviceProvider.GetRequiredService<IJobRepository>();
var userService = serviceProvider.GetRequiredService<IUserService>();
var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
// Get all jobs for this bundle
var jobs = await jobRepository.GetByBundleRequestIdAsync(bundleRequestId);
var completedJobs = jobs.Count(j => j.Status == BacktestJobStatus.Completed);
var failedJobs = jobs.Count(j => j.Status == BacktestJobStatus.Failed);
var runningJobs = jobs.Count(j => j.Status == BacktestJobStatus.Running);
var completedJobs = jobs.Count(j => j.Status == JobStatus.Completed);
var failedJobs = jobs.Count(j => j.Status == JobStatus.Failed);
var runningJobs = jobs.Count(j => j.Status == JobStatus.Running);
var totalJobs = jobs.Count();
if (totalJobs == 0)
@@ -265,6 +249,8 @@ public class BacktestComputeWorker : BackgroundService
return;
}
var previousStatus = bundleRequest.Status;
// Update bundle request progress
bundleRequest.CompletedBacktests = completedJobs;
bundleRequest.FailedBacktests = failedJobs;
@@ -298,7 +284,7 @@ public class BacktestComputeWorker : BackgroundService
// Update results list from completed jobs
var completedJobResults = jobs
.Where(j => j.Status == BacktestJobStatus.Completed && !string.IsNullOrEmpty(j.ResultJson))
.Where(j => j.Status == JobStatus.Completed && !string.IsNullOrEmpty(j.ResultJson))
.Select(j =>
{
try
@@ -318,6 +304,28 @@ public class BacktestComputeWorker : BackgroundService
await backtestRepository.UpdateBundleBacktestRequestAsync(bundleRequest);
// Send webhook notification if bundle request just completed
if (previousStatus != BundleBacktestRequestStatus.Completed &&
bundleRequest.Status == BundleBacktestRequestStatus.Completed &&
!string.IsNullOrEmpty(user.TelegramChannel))
{
var message = $"✅ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) completed successfully. " +
$"Completed: {completedJobs}/{totalJobs} backtests" +
(failedJobs > 0 ? $", Failed: {failedJobs}" : "") +
$". Results: {completedJobResults.Count} backtest(s) generated.";
await webhookService.SendMessage(message, user.TelegramChannel);
}
else if (previousStatus != BundleBacktestRequestStatus.Failed &&
bundleRequest.Status == BundleBacktestRequestStatus.Failed &&
!string.IsNullOrEmpty(user.TelegramChannel))
{
var message = $"❌ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) failed. " +
$"All {totalJobs} backtests failed. Error: {bundleRequest.ErrorMessage}";
await webhookService.SendMessage(message, user.TelegramChannel);
}
_logger.LogInformation(
"Updated bundle request {BundleRequestId} progress: {Completed}/{Total} completed, {Failed} failed, {Running} running",
bundleRequestId, completedJobs, totalJobs, failedJobs, runningJobs);
@@ -337,13 +345,58 @@ public class BacktestComputeWorker : BackgroundService
await Task.Delay(TimeSpan.FromMinutes(1), cancellationToken); // Check every minute
using var scope = _scopeFactory.CreateScope();
var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
var resetCount = await jobRepository.ResetStaleJobsAsync(_options.StaleJobTimeoutMinutes);
// Get stale jobs for this worker
var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
var staleJobs = runningJobs
.Where(j => j.JobType == JobType.Backtest &&
(j.LastHeartbeat == null ||
j.LastHeartbeat < DateTime.UtcNow.AddMinutes(-_options.StaleJobTimeoutMinutes)))
.ToList();
if (resetCount > 0)
foreach (var job in staleJobs)
{
_logger.LogInformation("Reset {Count} stale backtest jobs back to Pending status", resetCount);
// If it's stale but retryable, reset to pending with retry count
if (job.RetryCount < job.MaxRetries)
{
job.Status = JobStatus.Pending;
job.RetryCount++;
var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
job.ErrorMessage = $"Worker timeout - retry {job.RetryCount}/{job.MaxRetries}";
job.FailureCategory = FailureCategory.SystemError;
_logger.LogWarning(
"Stale job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}",
job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter);
}
else
{
// Exceeded retries - mark as failed
job.Status = JobStatus.Failed;
job.ErrorMessage = "Worker timeout - exceeded max retries";
job.FailureCategory = FailureCategory.SystemError;
job.IsRetryable = false;
job.CompletedAt = DateTime.UtcNow;
// Notify permanent failure
await NotifyPermanentFailure(job, new TimeoutException("Worker timeout"), scope.ServiceProvider);
// Update bundle request if this is part of a bundle
if (job.BundleRequestId.HasValue)
{
await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider);
}
}
job.AssignedWorkerId = null;
job.LastHeartbeat = null;
await jobRepository.UpdateAsync(job);
}
if (staleJobs.Count > 0)
{
_logger.LogInformation("Processed {Count} stale backtest jobs", staleJobs.Count);
}
}
catch (Exception ex)
@@ -362,7 +415,7 @@ public class BacktestComputeWorker : BackgroundService
await Task.Delay(TimeSpan.FromSeconds(_options.HeartbeatIntervalSeconds), cancellationToken);
using var scope = _scopeFactory.CreateScope();
var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
// Update heartbeat for all jobs assigned to this worker
var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
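
The remainder of this heartbeat loop is unchanged and therefore outside the hunk; presumably it stamps each running job and persists it, along the lines of:

    foreach (var runningJob in runningJobs)
    {
        runningJob.LastHeartbeat = DateTime.UtcNow;   // consumed by the stale-job check above
        await jobRepository.UpdateAsync(runningJob);
    }
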
@@ -380,6 +433,118 @@ public class BacktestComputeWorker : BackgroundService
}
}
private async Task HandleJobFailure(
Job job,
Exception ex,
IJobRepository jobRepository,
IServiceProvider serviceProvider)
{
try
{
// Categorize the failure
var failureCategory = CategorizeFailure(ex);
var isRetryable = IsRetryableFailure(ex, failureCategory);
// Check if we should retry
if (isRetryable && job.RetryCount < job.MaxRetries)
{
// Calculate exponential backoff: 2^retryCount minutes, capped at MaxRetryDelayMinutes
var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
job.RetryCount++;
job.Status = JobStatus.Pending; // Reset to pending for retry
job.AssignedWorkerId = null; // Allow any worker to pick it up
job.ErrorMessage = $"Retry {job.RetryCount}/{job.MaxRetries}: {ex.Message}";
job.FailureCategory = failureCategory;
job.IsRetryable = true;
_logger.LogWarning(
"Job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}. Error: {Error}",
job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter, ex.Message);
}
else
{
// Permanent failure - mark as failed
job.Status = JobStatus.Failed;
job.ErrorMessage = ex.Message;
job.FailureCategory = failureCategory;
job.IsRetryable = false;
job.CompletedAt = DateTime.UtcNow;
_logger.LogError(
"Job {JobId} failed permanently after {RetryCount} retries. Error: {Error}",
job.Id, job.RetryCount, ex.Message);
// Send notification for permanent failure
await NotifyPermanentFailure(job, ex, serviceProvider);
// Update bundle request if this is part of a bundle
if (job.BundleRequestId.HasValue)
{
await UpdateBundleRequestProgress(job.BundleRequestId.Value, serviceProvider);
}
}
job.LastHeartbeat = DateTime.UtcNow;
await jobRepository.UpdateAsync(job);
}
catch (Exception updateEx)
{
_logger.LogError(updateEx, "Failed to update job {JobId} status after failure", job.Id);
}
}
private FailureCategory CategorizeFailure(Exception ex)
{
return ex switch
{
TimeoutException => FailureCategory.Transient,
TaskCanceledException => FailureCategory.Transient,
HttpRequestException => FailureCategory.Transient,
InvalidOperationException when ex.Message.Contains("candles") || ex.Message.Contains("No candles") => FailureCategory.DataError,
InvalidOperationException when ex.Message.Contains("User") || ex.Message.Contains("not found") => FailureCategory.UserError,
OutOfMemoryException => FailureCategory.SystemError,
_ => FailureCategory.Unknown
};
}
private bool IsRetryableFailure(Exception ex, FailureCategory category)
{
// Don't retry user errors or data errors (missing candles, invalid config)
if (category == FailureCategory.UserError || category == FailureCategory.DataError)
return false;
// Retry transient and system errors
return category == FailureCategory.Transient || category == FailureCategory.SystemError;
}
private async Task NotifyPermanentFailure(
Job job,
Exception ex,
IServiceProvider serviceProvider)
{
try
{
var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
const string alertsChannel = "2676086723";
var jobTypeName = job.JobType == JobType.Genetic ? "Genetic" : "Backtest";
var message = $"🚨 **{jobTypeName} Job Failed Permanently**\n" +
$"Job ID: `{job.Id}`\n" +
$"User ID: {job.UserId}\n" +
$"Retry Attempts: {job.RetryCount}/{job.MaxRetries}\n" +
$"Failure Category: {job.FailureCategory}\n" +
$"Error: {ex.Message}\n" +
$"Time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss} UTC";
await webhookService.SendMessage(message, alertsChannel);
}
catch (Exception notifyEx)
{
_logger.LogError(notifyEx, "Failed to send permanent failure notification for job {JobId}", job.Id);
}
}
public override void Dispose()
{
_semaphore?.Dispose();
@@ -418,5 +583,15 @@ public class BacktestComputeWorkerOptions
/// Timeout in minutes for considering a job stale
/// </summary>
public int StaleJobTimeoutMinutes { get; set; } = 5;
/// <summary>
/// Default maximum retry attempts for failed jobs
/// </summary>
public int DefaultMaxRetries { get; set; } = 3;
/// <summary>
/// Maximum retry delay in minutes (cap for exponential backoff)
/// </summary>
public int MaxRetryDelayMinutes { get; set; } = 60;
}
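
With these defaults, and assuming each Job's MaxRetries is seeded from DefaultMaxRetries = 3, the retry schedule works out as follows. HandleJobFailure evaluates Math.Pow(2, RetryCount) before incrementing the counter, while the stale-job sweep increments first:

    // HandleJobFailure: delay = min(2^RetryCount, MaxRetryDelayMinutes) minutes
    //   failure 1 -> retry after 1 min, failure 2 -> after 2 min, failure 3 -> after 4 min,
    //   failure 4 -> RetryCount (3) is no longer < MaxRetries, so the job fails permanently
    // Stale-job sweep: RetryCount is incremented before the delay is computed,
    //   so the same sequence becomes 2, 4, 8 minutes
    // The 60-minute cap only matters once the exponent reaches 6 (2^6 = 64 minutes),
    //   i.e. with a MaxRetries larger than the default

Both paths converge on the same outcome: a retryable job goes back to Pending with AssignedWorkerId cleared, and a job that exhausts its retries is marked Failed, reported through NotifyPermanentFailure, and rolled up into its bundle request if it belongs to one.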