Add genetic backtest to worker
@@ -77,10 +77,10 @@ public class BacktestComputeWorker : BackgroundService
         try
         {
             using var scope = _scopeFactory.CreateScope();
-            var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
+            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

-            // Try to claim a job
-            var job = await jobRepository.ClaimNextJobAsync(_options.WorkerId);
+            // Try to claim a backtest job (exclude genetic jobs)
+            var job = await jobRepository.ClaimNextJobAsync(_options.WorkerId, JobType.Backtest);

             if (job == null)
             {
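Note: the worker now claims work through the shared IJobRepository, filtered by job type so genetic jobs are left for their own worker. The shapes below are a minimal sketch inferred from the calls in this diff, not the actual definitions in the repo (parameter and key types are guesses):

    using System;
    using System.Collections.Generic;
    using System.Threading.Tasks;

    // Sketch only - inferred from ClaimNextJobAsync(_options.WorkerId, JobType.Backtest) and the other calls in this file.
    public enum JobType { Backtest, Genetic }

    public interface IJobRepository
    {
        // Atomically assigns the next pending job of the given type to this worker, or returns null when the queue is empty.
        Task<Job?> ClaimNextJobAsync(string workerId, JobType jobType);
        Task<IEnumerable<Job>> GetByBundleRequestIdAsync(Guid bundleRequestId);
        Task<IEnumerable<Job>> GetRunningJobsByWorkerIdAsync(string workerId);
        Task UpdateAsync(Job job);
    }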
@@ -114,11 +114,11 @@ public class BacktestComputeWorker : BackgroundService
     }

     private async Task ProcessJobAsync(
-        BacktestJob job,
+        Job job,
         CancellationToken cancellationToken)
     {
         using var scope = _scopeFactory.CreateScope();
-        var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
+        var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();
         var executor = scope.ServiceProvider.GetRequiredService<BacktestExecutor>();
         var userService = scope.ServiceProvider.GetRequiredService<IUserService>();
         var exchangeService = scope.ServiceProvider.GetRequiredService<IExchangeService>();
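ProcessJobAsync now works against the shared Job entity rather than BacktestJob. Its definition is not part of this diff; the sketch below lists only the members this worker reads or writes, with guessed types where the diff does not reveal them:

    using System;

    // Sketch only - fields inferred from the property accesses in this file; the real entity may differ.
    public enum JobStatus { Pending, Running, Completed, Failed }

    public class Job
    {
        public Guid Id { get; set; }
        public Guid UserId { get; set; }
        public JobType JobType { get; set; }
        public JobStatus Status { get; set; }
        public int ProgressPercentage { get; set; }
        public string? ResultJson { get; set; }
        public string? ErrorMessage { get; set; }
        public FailureCategory FailureCategory { get; set; }
        public bool IsRetryable { get; set; }
        public int RetryCount { get; set; }
        public int MaxRetries { get; set; }
        public DateTime? RetryAfter { get; set; }
        public DateTime? LastHeartbeat { get; set; }
        public DateTime? CompletedAt { get; set; }
        public Guid? BundleRequestId { get; set; }
        public string? AssignedWorkerId { get; set; }
    }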
@@ -184,7 +184,7 @@ public class BacktestComputeWorker : BackgroundService
                 progressCallback: progressCallback);

             // Update job with result
-            job.Status = BacktestJobStatus.Completed;
+            job.Status = JobStatus.Completed;
             job.ProgressPercentage = 100;
             job.ResultJson = JsonSerializer.Serialize(result);
             job.CompletedAt = DateTime.UtcNow;
@@ -207,24 +207,7 @@ public class BacktestComputeWorker : BackgroundService
             _logger.LogError(ex, "Error processing backtest job {JobId}", job.Id);
             SentrySdk.CaptureException(ex);

-            // Update job status to failed
-            try
-            {
-                job.Status = BacktestJobStatus.Failed;
-                job.ErrorMessage = ex.Message;
-                job.CompletedAt = DateTime.UtcNow;
-                await jobRepository.UpdateAsync(job);
-
-                // Update bundle request if this is part of a bundle
-                if (job.BundleRequestId.HasValue)
-                {
-                    await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider);
-                }
-            }
-            catch (Exception updateEx)
-            {
-                _logger.LogError(updateEx, "Error updating job {JobId} status to failed", job.Id);
-            }
+            await HandleJobFailure(job, ex, jobRepository, scope.ServiceProvider);
         }
     }

@@ -233,14 +216,15 @@ public class BacktestComputeWorker : BackgroundService
         try
         {
             var backtestRepository = serviceProvider.GetRequiredService<IBacktestRepository>();
-            var jobRepository = serviceProvider.GetRequiredService<IBacktestJobRepository>();
+            var jobRepository = serviceProvider.GetRequiredService<IJobRepository>();
             var userService = serviceProvider.GetRequiredService<IUserService>();
             var webhookService = serviceProvider.GetRequiredService<IWebhookService>();

             // Get all jobs for this bundle
             var jobs = await jobRepository.GetByBundleRequestIdAsync(bundleRequestId);
-            var completedJobs = jobs.Count(j => j.Status == BacktestJobStatus.Completed);
-            var failedJobs = jobs.Count(j => j.Status == BacktestJobStatus.Failed);
-            var runningJobs = jobs.Count(j => j.Status == BacktestJobStatus.Running);
+            var completedJobs = jobs.Count(j => j.Status == JobStatus.Completed);
+            var failedJobs = jobs.Count(j => j.Status == JobStatus.Failed);
+            var runningJobs = jobs.Count(j => j.Status == JobStatus.Running);
             var totalJobs = jobs.Count();

             if (totalJobs == 0)
@@ -265,6 +249,8 @@ public class BacktestComputeWorker : BackgroundService
                 return;
             }

+            var previousStatus = bundleRequest.Status;
+
             // Update bundle request progress
             bundleRequest.CompletedBacktests = completedJobs;
             bundleRequest.FailedBacktests = failedJobs;
@@ -298,7 +284,7 @@ public class BacktestComputeWorker : BackgroundService

             // Update results list from completed jobs
             var completedJobResults = jobs
-                .Where(j => j.Status == BacktestJobStatus.Completed && !string.IsNullOrEmpty(j.ResultJson))
+                .Where(j => j.Status == JobStatus.Completed && !string.IsNullOrEmpty(j.ResultJson))
                 .Select(j =>
                 {
                     try
@@ -318,6 +304,28 @@ public class BacktestComputeWorker : BackgroundService

             await backtestRepository.UpdateBundleBacktestRequestAsync(bundleRequest);

+            // Send webhook notification if bundle request just completed
+            if (previousStatus != BundleBacktestRequestStatus.Completed &&
+                bundleRequest.Status == BundleBacktestRequestStatus.Completed &&
+                !string.IsNullOrEmpty(user.TelegramChannel))
+            {
+                var message = $"✅ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) completed successfully. " +
+                              $"Completed: {completedJobs}/{totalJobs} backtests" +
+                              (failedJobs > 0 ? $", Failed: {failedJobs}" : "") +
+                              $". Results: {completedJobResults.Count} backtest(s) generated.";
+
+                await webhookService.SendMessage(message, user.TelegramChannel);
+            }
+            else if (previousStatus != BundleBacktestRequestStatus.Failed &&
+                     bundleRequest.Status == BundleBacktestRequestStatus.Failed &&
+                     !string.IsNullOrEmpty(user.TelegramChannel))
+            {
+                var message = $"❌ Bundle backtest '{bundleRequest.Name}' (ID: {bundleRequest.RequestId}) failed. " +
+                              $"All {totalJobs} backtests failed. Error: {bundleRequest.ErrorMessage}";
+
+                await webhookService.SendMessage(message, user.TelegramChannel);
+            }
+
             _logger.LogInformation(
                 "Updated bundle request {BundleRequestId} progress: {Completed}/{Total} completed, {Failed} failed, {Running} running",
                 bundleRequestId, completedJobs, totalJobs, failedJobs, runningJobs);
@@ -337,13 +345,58 @@ public class BacktestComputeWorker : BackgroundService
             await Task.Delay(TimeSpan.FromMinutes(1), cancellationToken); // Check every minute

             using var scope = _scopeFactory.CreateScope();
-            var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
+            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

-            var resetCount = await jobRepository.ResetStaleJobsAsync(_options.StaleJobTimeoutMinutes);
+            // Get stale jobs for this worker
+            var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
+            var staleJobs = runningJobs
+                .Where(j => j.JobType == JobType.Backtest &&
+                            (j.LastHeartbeat == null ||
+                             j.LastHeartbeat < DateTime.UtcNow.AddMinutes(-_options.StaleJobTimeoutMinutes)))
+                .ToList();

-            if (resetCount > 0)
+            foreach (var job in staleJobs)
             {
-                _logger.LogInformation("Reset {Count} stale backtest jobs back to Pending status", resetCount);
+                // If it's stale but retryable, reset to pending with retry count
+                if (job.RetryCount < job.MaxRetries)
+                {
+                    job.Status = JobStatus.Pending;
+                    job.RetryCount++;
+                    var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
+                    job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
+                    job.ErrorMessage = $"Worker timeout - retry {job.RetryCount}/{job.MaxRetries}";
+                    job.FailureCategory = FailureCategory.SystemError;
+                    _logger.LogWarning(
+                        "Stale job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}",
+                        job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter);
+                }
+                else
+                {
+                    // Exceeded retries - mark as failed
+                    job.Status = JobStatus.Failed;
+                    job.ErrorMessage = "Worker timeout - exceeded max retries";
+                    job.FailureCategory = FailureCategory.SystemError;
+                    job.IsRetryable = false;
+                    job.CompletedAt = DateTime.UtcNow;
+
+                    // Notify permanent failure
+                    await NotifyPermanentFailure(job, new TimeoutException("Worker timeout"), scope.ServiceProvider);
+
+                    // Update bundle request if this is part of a bundle
+                    if (job.BundleRequestId.HasValue)
+                    {
+                        await UpdateBundleRequestProgress(job.BundleRequestId.Value, scope.ServiceProvider);
+                    }
+                }
+
+                job.AssignedWorkerId = null;
+                job.LastHeartbeat = null;
+                await jobRepository.UpdateAsync(job);
             }
+
+            if (staleJobs.Count > 0)
+            {
+                _logger.LogInformation("Processed {Count} stale backtest jobs", staleJobs.Count);
+            }
         }
         catch (Exception ex)
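For reference, the stale-job and failure paths both use the same backoff expression: the delay grows as 2^retryCount minutes and is capped at MaxRetryDelayMinutes. A self-contained illustration of the arithmetic (values assume the default cap of 60 minutes; this is not code from the worker itself):

    using System;

    // Prints the wait applied before each retry: 2, 4, 8, 16, 32, 60, 60, 60 minutes.
    for (var retryCount = 1; retryCount <= 8; retryCount++)
    {
        var backoffMinutes = Math.Min(Math.Pow(2, retryCount), 60);
        Console.WriteLine($"retry {retryCount}: wait {backoffMinutes} min");
    }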
@@ -362,7 +415,7 @@ public class BacktestComputeWorker : BackgroundService
             await Task.Delay(TimeSpan.FromSeconds(_options.HeartbeatIntervalSeconds), cancellationToken);

             using var scope = _scopeFactory.CreateScope();
-            var jobRepository = scope.ServiceProvider.GetRequiredService<IBacktestJobRepository>();
+            var jobRepository = scope.ServiceProvider.GetRequiredService<IJobRepository>();

             // Update heartbeat for all jobs assigned to this worker
             var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId);
@@ -380,6 +433,118 @@ public class BacktestComputeWorker : BackgroundService
         }
     }

+    private async Task HandleJobFailure(
+        Job job,
+        Exception ex,
+        IJobRepository jobRepository,
+        IServiceProvider serviceProvider)
+    {
+        try
+        {
+            // Categorize the failure
+            var failureCategory = CategorizeFailure(ex);
+            var isRetryable = IsRetryableFailure(ex, failureCategory);
+
+            // Check if we should retry
+            if (isRetryable && job.RetryCount < job.MaxRetries)
+            {
+                // Calculate exponential backoff: 2^retryCount minutes, capped at MaxRetryDelayMinutes
+                var backoffMinutes = Math.Min(Math.Pow(2, job.RetryCount), _options.MaxRetryDelayMinutes);
+                job.RetryAfter = DateTime.UtcNow.AddMinutes(backoffMinutes);
+                job.RetryCount++;
+                job.Status = JobStatus.Pending; // Reset to pending for retry
+                job.AssignedWorkerId = null; // Allow any worker to pick it up
+                job.ErrorMessage = $"Retry {job.RetryCount}/{job.MaxRetries}: {ex.Message}";
+                job.FailureCategory = failureCategory;
+                job.IsRetryable = true;
+
+                _logger.LogWarning(
+                    "Job {JobId} will be retried (attempt {RetryCount}/{MaxRetries}) after {RetryAfter}. Error: {Error}",
+                    job.Id, job.RetryCount, job.MaxRetries, job.RetryAfter, ex.Message);
+            }
+            else
+            {
+                // Permanent failure - mark as failed
+                job.Status = JobStatus.Failed;
+                job.ErrorMessage = ex.Message;
+                job.FailureCategory = failureCategory;
+                job.IsRetryable = false;
+                job.CompletedAt = DateTime.UtcNow;
+
+                _logger.LogError(
+                    "Job {JobId} failed permanently after {RetryCount} retries. Error: {Error}",
+                    job.Id, job.RetryCount, ex.Message);
+
+                // Send notification for permanent failure
+                await NotifyPermanentFailure(job, ex, serviceProvider);
+
+                // Update bundle request if this is part of a bundle
+                if (job.BundleRequestId.HasValue)
+                {
+                    await UpdateBundleRequestProgress(job.BundleRequestId.Value, serviceProvider);
+                }
+            }
+
+            job.LastHeartbeat = DateTime.UtcNow;
+            await jobRepository.UpdateAsync(job);
+        }
+        catch (Exception updateEx)
+        {
+            _logger.LogError(updateEx, "Failed to update job {JobId} status after failure", job.Id);
+        }
+    }
+
+    private FailureCategory CategorizeFailure(Exception ex)
+    {
+        return ex switch
+        {
+            TimeoutException => FailureCategory.Transient,
+            TaskCanceledException => FailureCategory.Transient,
+            HttpRequestException => FailureCategory.Transient,
+            InvalidOperationException when ex.Message.Contains("candles") || ex.Message.Contains("No candles") => FailureCategory.DataError,
+            InvalidOperationException when ex.Message.Contains("User") || ex.Message.Contains("not found") => FailureCategory.UserError,
+            OutOfMemoryException => FailureCategory.SystemError,
+            _ => FailureCategory.Unknown
+        };
+    }
+
+    private bool IsRetryableFailure(Exception ex, FailureCategory category)
+    {
+        // Don't retry user errors or data errors (missing candles, invalid config)
+        if (category == FailureCategory.UserError || category == FailureCategory.DataError)
+            return false;
+
+        // Retry transient and system errors
+        return category == FailureCategory.Transient || category == FailureCategory.SystemError;
+    }
+
+    private async Task NotifyPermanentFailure(
+        Job job,
+        Exception ex,
+        IServiceProvider serviceProvider)
+    {
+        try
+        {
+            var webhookService = serviceProvider.GetRequiredService<IWebhookService>();
+            const string alertsChannel = "2676086723";
+
+            var jobTypeName = job.JobType == JobType.Genetic ? "Genetic" : "Backtest";
+            var message = $"🚨 **{jobTypeName} Job Failed Permanently**\n" +
+                          $"Job ID: `{job.Id}`\n" +
+                          $"User ID: {job.UserId}\n" +
+                          $"Retry Attempts: {job.RetryCount}/{job.MaxRetries}\n" +
+                          $"Failure Category: {job.FailureCategory}\n" +
+                          $"Error: {ex.Message}\n" +
+                          $"Time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss} UTC";
+
+            await webhookService.SendMessage(message, alertsChannel);
+        }
+        catch (Exception notifyEx)
+        {
+            _logger.LogError(notifyEx, "Failed to send permanent failure notification for job {JobId}", job.Id);
+        }
+    }
+
     public override void Dispose()
     {
         _semaphore?.Dispose();
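CategorizeFailure maps exceptions onto a FailureCategory, and IsRetryableFailure then decides whether the job goes back to the queue. The enum itself is not shown in this diff; a minimal sketch with only the values referenced above, annotated with the retry behaviour they get:

    // Sketch only - values inferred from CategorizeFailure/IsRetryableFailure above; the real definition lives with the Job entity.
    public enum FailureCategory
    {
        Unknown,     // unrecognized exceptions - not retried by IsRetryableFailure
        Transient,   // timeouts, cancellations, HTTP errors - retried
        SystemError, // out-of-memory, worker timeouts - retried
        DataError,   // e.g. missing candles - not retried
        UserError    // e.g. user not found - not retried
    }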
@@ -418,5 +583,15 @@ public class BacktestComputeWorkerOptions
     /// Timeout in minutes for considering a job stale
     /// </summary>
     public int StaleJobTimeoutMinutes { get; set; } = 5;
+
+    /// <summary>
+    /// Default maximum retry attempts for failed jobs
+    /// </summary>
+    public int DefaultMaxRetries { get; set; } = 3;
+
+    /// <summary>
+    /// Maximum retry delay in minutes (cap for exponential backoff)
+    /// </summary>
+    public int MaxRetryDelayMinutes { get; set; } = 60;
 }
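The two new options only take effect if the worker's options are bound from configuration like the existing ones. The actual registration is not part of this diff; a minimal sketch, assuming a "BacktestComputeWorker" configuration section and the usual hosted-service wiring:

    // Program.cs sketch (assumed wiring, not shown in this commit)
    // appsettings: "BacktestComputeWorker": { "DefaultMaxRetries": 3, "MaxRetryDelayMinutes": 60, "StaleJobTimeoutMinutes": 5, ... }
    builder.Services.Configure<BacktestComputeWorkerOptions>(
        builder.Configuration.GetSection("BacktestComputeWorker"));
    builder.Services.AddHostedService<BacktestComputeWorker>();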