From 8a271554181c44ff03471974a5c11c71355d0b62 Mon Sep 17 00:00:00 2001 From: cryptooda Date: Tue, 11 Nov 2025 05:30:40 +0700 Subject: [PATCH] Improve a bit workers. bug : Bundle reset after all backtest finish --- src/Managing.Api/appsettings.Oda.json | 3 +- src/Managing.Api/appsettings.json | 1 + .../Backtests/BacktestExecutor.cs | 31 ++++++- .../Workers/BacktestComputeWorker.cs | 90 ++++++++++++++++++- src/Managing.Bootstrap/ComputeBootstrap.cs | 3 + .../mollecules/PieChart/PieChart.tsx | 8 +- .../bundleBacktestRequestsSettings.tsx | 4 +- .../bundleBacktestRequestsTable.tsx | 30 ++++++- .../src/pages/adminPage/jobs/jobsSettings.tsx | 38 ++++++-- src/Managing.Workers/Program.cs | 7 ++ src/Managing.Workers/appsettings.json | 1 + 11 files changed, 198 insertions(+), 18 deletions(-) diff --git a/src/Managing.Api/appsettings.Oda.json b/src/Managing.Api/appsettings.Oda.json index c076fac7..828fe3cc 100644 --- a/src/Managing.Api/appsettings.Oda.json +++ b/src/Managing.Api/appsettings.Oda.json @@ -55,5 +55,6 @@ "WorkerLeaderboard": false, "WorkerFundingRatesWatcher": false, "WorkerGeneticAlgorithm": false, - "WorkerBundleBacktest": false + "WorkerBundleBacktest": false, + "WorkerBundleBacktestHealthCheck": false } \ No newline at end of file diff --git a/src/Managing.Api/appsettings.json b/src/Managing.Api/appsettings.json index abec46e0..319b1408 100644 --- a/src/Managing.Api/appsettings.json +++ b/src/Managing.Api/appsettings.json @@ -86,6 +86,7 @@ "WorkerFundingRatesWatcher": false, "WorkerGeneticAlgorithm": false, "WorkerBundleBacktest": false, + "WorkerBundleBacktestHealthCheck": false, "WorkerBalancesTracking": false, "WorkerNotifyBundleBacktest": false, "SqlMonitoring": { diff --git a/src/Managing.Application/Backtests/BacktestExecutor.cs b/src/Managing.Application/Backtests/BacktestExecutor.cs index 60c59586..0502fddf 100644 --- a/src/Managing.Application/Backtests/BacktestExecutor.cs +++ b/src/Managing.Application/Backtests/BacktestExecutor.cs @@ -641,6 +641,30 @@ public class BacktestExecutor var failedJobs = jobs.Count(j => j.Status == JobStatus.Failed); var runningJobs = jobs.Count(j => j.Status == JobStatus.Running); var totalJobs = jobs.Count(); + + // CRITICAL: If bundle is already in a final state (Completed/Failed with CompletedAt set), + // don't overwrite it unless we're detecting a legitimate change + if (bundleRequest.CompletedAt.HasValue && + (bundleRequest.Status == BundleBacktestRequestStatus.Completed || + bundleRequest.Status == BundleBacktestRequestStatus.Failed)) + { + // Bundle already finalized, only update if job counts indicate it should be re-opened + // (This shouldn't happen in normal flow, but guards against race conditions) + if (completedJobs + failedJobs == totalJobs) + { + _logger.LogDebug( + "Bundle {BundleRequestId} already completed/failed. Skipping status update.", + bundleRequestId); + return; // Don't modify a completed bundle + } + else + { + _logger.LogWarning( + "Bundle {BundleRequestId} was marked as completed/failed but has incomplete jobs ({Completed}+{Failed}/{Total}). Reopening.", + bundleRequestId, completedJobs, failedJobs, totalJobs); + // Allow the update to proceed to fix inconsistent state + } + } // Update bundle request progress bundleRequest.CompletedBacktests = completedJobs; @@ -668,11 +692,14 @@ public class BacktestExecutor bundleRequest.CompletedAt = DateTime.UtcNow; bundleRequest.CurrentBacktest = null; } - else if (runningJobs > 0) + else if (runningJobs > 0 || completedJobs > 0 || failedJobs > 0) { - // Some jobs still running + // Some jobs are running, or some have completed/failed (meaning work has started) + // Once a bundle has started processing, it should stay "Running" until all jobs are done bundleRequest.Status = BundleBacktestRequestStatus.Running; } + // If all jobs are still pending (completedJobs = 0, failedJobs = 0, runningJobs = 0), + // keep the current status (likely Pending) // Update results list with the new backtest ID var resultsList = bundleRequest.Results?.ToList() ?? new List(); diff --git a/src/Managing.Application/Workers/BacktestComputeWorker.cs b/src/Managing.Application/Workers/BacktestComputeWorker.cs index e1b8eba1..57e61b22 100644 --- a/src/Managing.Application/Workers/BacktestComputeWorker.cs +++ b/src/Managing.Application/Workers/BacktestComputeWorker.cs @@ -457,6 +457,30 @@ public class BacktestComputeWorker : BackgroundService } var previousStatus = bundleRequest.Status; + + // CRITICAL: If bundle is already in a final state (Completed/Failed with CompletedAt set), + // don't overwrite it unless we're detecting a legitimate change + if (bundleRequest.CompletedAt.HasValue && + (bundleRequest.Status == BundleBacktestRequestStatus.Completed || + bundleRequest.Status == BundleBacktestRequestStatus.Failed)) + { + // Bundle already finalized, only update if job counts indicate it should be re-opened + // (This shouldn't happen in normal flow, but guards against race conditions) + if (completedJobs + failedJobs == totalJobs) + { + _logger.LogDebug( + "Bundle {BundleRequestId} already completed/failed. Skipping status update.", + bundleRequestId); + return; // Don't modify a completed bundle + } + else + { + _logger.LogWarning( + "Bundle {BundleRequestId} was marked as completed/failed but has incomplete jobs ({Completed}+{Failed}/{Total}). Reopening.", + bundleRequestId, completedJobs, failedJobs, totalJobs); + // Allow the update to proceed to fix inconsistent state + } + } // Update bundle request progress bundleRequest.CompletedBacktests = completedJobs; @@ -483,11 +507,14 @@ public class BacktestComputeWorker : BackgroundService bundleRequest.CompletedAt = DateTime.UtcNow; bundleRequest.CurrentBacktest = null; } - else if (runningJobs > 0) + else if (runningJobs > 0 || completedJobs > 0 || failedJobs > 0) { - // Some jobs still running + // Some jobs are running, or some have completed/failed (meaning work has started) + // Once a bundle has started processing, it should stay "Running" until all jobs are done bundleRequest.Status = BundleBacktestRequestStatus.Running; } + // If all jobs are still pending (completedJobs = 0, failedJobs = 0, runningJobs = 0), + // keep the current status (likely Pending) // Update results list from completed jobs var completedJobResults = jobs @@ -554,11 +581,68 @@ public class BacktestComputeWorker : BackgroundService using var scope = _scopeFactory.CreateScope(); var jobRepository = scope.ServiceProvider.GetRequiredService(); - // Get stale jobs for this worker + // Get running jobs for this worker var runningJobs = await jobRepository.GetRunningJobsByWorkerIdAsync(_options.WorkerId); + + // CRITICAL FIX: Check for jobs stuck at 100% progress + // These jobs completed execution but their status wasn't updated to Completed + // This causes the worker to think it's at max capacity + var stuckCompletedJobs = runningJobs + .Where(j => j.JobType == JobType.Backtest && j.ProgressPercentage >= 100) + .ToList(); + + if (stuckCompletedJobs.Any()) + { + _logger.LogWarning( + "🔧 Found {Count} jobs stuck at 100% progress for worker {WorkerId}. Auto-completing them.", + stuckCompletedJobs.Count, _options.WorkerId); + + foreach (var stuckJob in stuckCompletedJobs) + { + _logger.LogWarning( + "🔧 Job {JobId} stuck at 100% progress in Running status since {StartedAt}. Marking as completed.", + stuckJob.Id, stuckJob.StartedAt); + + stuckJob.Status = JobStatus.Completed; + stuckJob.CompletedAt = stuckJob.CompletedAt ?? DateTime.UtcNow; + stuckJob.LastHeartbeat = DateTime.UtcNow; + + // Add note to error message if not already set + if (string.IsNullOrEmpty(stuckJob.ErrorMessage)) + { + stuckJob.ErrorMessage = "Job completed but status was not updated (auto-recovered)"; + } + + await jobRepository.UpdateAsync(stuckJob); + + // Clean up progress tracker if still present + _jobProgressTrackers.TryRemove(stuckJob.Id, out _); + _runningJobTasks.TryRemove(stuckJob.Id, out _); + + // Update bundle request if this is part of a bundle + if (stuckJob.BundleRequestId.HasValue) + { + try + { + await UpdateBundleRequestProgress(stuckJob.BundleRequestId.Value, scope.ServiceProvider); + } + catch (Exception ex) + { + _logger.LogError(ex, "Error updating bundle request progress for stuck job {JobId}", stuckJob.Id); + } + } + + _logger.LogInformation( + "✅ Successfully auto-completed stuck job {JobId}. Worker can now claim new jobs.", + stuckJob.Id); + } + } + + // Get stale jobs for this worker var now = DateTime.UtcNow; var staleJobs = runningJobs .Where(j => j.JobType == JobType.Backtest && + j.ProgressPercentage < 100 && // Don't mark stuck-at-100% jobs as stale ( // Stale heartbeat (no heartbeat in timeout period) j.LastHeartbeat == null || diff --git a/src/Managing.Bootstrap/ComputeBootstrap.cs b/src/Managing.Bootstrap/ComputeBootstrap.cs index 93eedbed..e446bae6 100644 --- a/src/Managing.Bootstrap/ComputeBootstrap.cs +++ b/src/Managing.Bootstrap/ComputeBootstrap.cs @@ -74,6 +74,9 @@ public static class ComputeBootstrap // Genetic service (needed for GeneticExecutor) services.AddScoped(); services.AddTransient(); + + // Job service (needed for BundleBacktestHealthCheckWorker to recreate missing jobs) + services.AddTransient(); services.AddTransient(); services.AddTransient(); diff --git a/src/Managing.WebApp/src/components/mollecules/PieChart/PieChart.tsx b/src/Managing.WebApp/src/components/mollecules/PieChart/PieChart.tsx index 51c6c3b0..ebfe77f3 100644 --- a/src/Managing.WebApp/src/components/mollecules/PieChart/PieChart.tsx +++ b/src/Managing.WebApp/src/components/mollecules/PieChart/PieChart.tsx @@ -5,9 +5,11 @@ type IPieChart = { data: number[] labels: string[] colors: string[] + width?: number + height?: number } -const PieChart: React.FC = ({ data, labels, colors }) => { +const PieChart: React.FC = ({ data, labels, colors, width = 150, height = 150 }) => { return ( <> = ({ data, labels, colors }) => { }, ]} layout={{ - height: 150, + height: height, margin: { b: 20, l: 0, @@ -33,7 +35,7 @@ const PieChart: React.FC = ({ data, labels, colors }) => { paper_bgcolor: 'rgba(0,0,0,0)', plot_bgcolor: 'rgba(0,0,0,0)', showlegend: false, - width: 150, + width: width, }} config={{ displayModeBar: false, diff --git a/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsSettings.tsx b/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsSettings.tsx index eeb1ea66..ba0410d8 100644 --- a/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsSettings.tsx +++ b/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsSettings.tsx @@ -17,7 +17,7 @@ const BundleBacktestRequestsSettings: React.FC = () => { const [sortBy, setSortBy] = useState(BundleBacktestRequestSortableColumn.CreatedAt) const [sortOrder, setSortOrder] = useState('desc') const [nameContains, setNameContains] = useState('') - const [statusFilter, setStatusFilter] = useState(BundleBacktestRequestStatus.Failed) + const [statusFilter, setStatusFilter] = useState(null) const [userIdFilter, setUserIdFilter] = useState('') const [userNameContains, setUserNameContains] = useState('') const [totalBacktestsMin, setTotalBacktestsMin] = useState('') @@ -258,7 +258,7 @@ const BundleBacktestRequestsSettings: React.FC = () => { )} -
+
{isLoadingSummary ? ( // Show skeleton with all statuses set to 0 <> diff --git a/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsTable.tsx b/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsTable.tsx index 470d31b0..2fff7d02 100644 --- a/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsTable.tsx +++ b/src/Managing.WebApp/src/pages/adminPage/bundleBacktestRequests/bundleBacktestRequestsTable.tsx @@ -3,7 +3,7 @@ import { type BundleBacktestRequestListItemResponse, BundleBacktestRequestSortableColumn } from '../../../generated/ManagingApi' -import {Table} from '../../../components/mollecules' +import {Table, Toast} from '../../../components/mollecules' interface IBundleBacktestRequestsTable { bundleRequests: BundleBacktestRequestListItemResponse[] @@ -68,6 +68,16 @@ const BundleBacktestRequestsTable: React.FC = ({ return `${progress.toFixed(1)}%` } + const copyToClipboard = async (text: string) => { + const toast = new Toast('Copying to clipboard...') + try { + await navigator.clipboard.writeText(text) + toast.update('success', 'Request ID copied to clipboard!') + } catch (err) { + toast.update('error', 'Failed to copy to clipboard') + } + } + const SortableHeader = ({ column, label }: { column: BundleBacktestRequestSortableColumn; label: string }) => { const isActive = sortBy === column return ( @@ -180,7 +190,23 @@ const BundleBacktestRequestsTable: React.FC = ({ id: 'requestId', Header: () => , accessor: (row: BundleBacktestRequestListItemResponse) => ( - {row.requestId?.substring(0, 8)}... +
+ {row.requestId?.substring(0, 8)}... + {row.requestId && ( + + )} +
) }, ...(onDelete ? [{ diff --git a/src/Managing.WebApp/src/pages/adminPage/jobs/jobsSettings.tsx b/src/Managing.WebApp/src/pages/adminPage/jobs/jobsSettings.tsx index f33689d5..b102fdae 100644 --- a/src/Managing.WebApp/src/pages/adminPage/jobs/jobsSettings.tsx +++ b/src/Managing.WebApp/src/pages/adminPage/jobs/jobsSettings.tsx @@ -3,7 +3,7 @@ import {useMutation, useQuery, useQueryClient} from '@tanstack/react-query' import useApiUrlStore from '../../../app/store/apiStore' import {JobClient} from '../../../generated/ManagingApi' -import {BottomMenuBar, Toast} from '../../../components/mollecules' +import {BottomMenuBar, PieChart, Toast} from '../../../components/mollecules' import JobsTable from './jobsTable' @@ -13,7 +13,7 @@ const JobsSettings: React.FC = () => { const [pageSize, setPageSize] = useState(50) const [sortBy, setSortBy] = useState('CreatedAt') const [sortOrder, setSortOrder] = useState('desc') - const [statusFilter, setStatusFilter] = useState('Failed') + const [statusFilter, setStatusFilter] = useState('') const [jobTypeFilter, setJobTypeFilter] = useState('') const [userIdFilter, setUserIdFilter] = useState('') const [workerIdFilter, setWorkerIdFilter] = useState('') @@ -146,7 +146,7 @@ const JobsSettings: React.FC = () => { } const clearFilters = () => { - setStatusFilter('Failed') // Reset to Failed instead of All + setStatusFilter('') // Reset to All setJobTypeFilter('') setUserIdFilter('') setWorkerIdFilter('') @@ -195,7 +195,35 @@ const JobsSettings: React.FC = () => { Status Overview -
+
+ {/* Pie Chart */} +
+ item.count || 0)} + labels={jobSummary.statusSummary.map(item => item.status || 'Unknown')} + colors={jobSummary.statusSummary.map(item => { + const statusLower = (item.status || '').toLowerCase() + switch (statusLower) { + case 'pending': + return '#fbbf24' // warning color + case 'running': + return '#3b82f6' // info color + case 'completed': + return '#10b981' // success color + case 'failed': + return '#ef4444' // error color + case 'cancelled': + return '#6b7280' // neutral color + default: + return '#9ca3af' // default gray + } + })} + width={300} + height={300} + /> +
+ {/* Status Tiles */} +
{jobSummary.statusSummary.map((statusItem) => { const statusLower = (statusItem.status || '').toLowerCase() let statusIcon, statusDesc, statusColor @@ -271,6 +299,7 @@ const JobsSettings: React.FC = () => {
) })} +
@@ -576,7 +605,6 @@ const JobsSettings: React.FC = () => { - )} ) } diff --git a/src/Managing.Workers/Program.cs b/src/Managing.Workers/Program.cs index 0f504d4a..e750bd38 100644 --- a/src/Managing.Workers/Program.cs +++ b/src/Managing.Workers/Program.cs @@ -207,6 +207,13 @@ var host = hostBuilder { services.AddHostedService(); } + + // Register the bundle backtest health check worker if enabled + var isBundleHealthCheckEnabled = configuration.GetValue("WorkerBundleBacktestHealthCheck", false); + if (isBundleHealthCheckEnabled) + { + services.AddHostedService(); + } }) .ConfigureLogging((hostingContext, logging) => { diff --git a/src/Managing.Workers/appsettings.json b/src/Managing.Workers/appsettings.json index 959a896c..ab23771f 100644 --- a/src/Managing.Workers/appsettings.json +++ b/src/Managing.Workers/appsettings.json @@ -20,6 +20,7 @@ "HeartbeatIntervalSeconds": 30, "StaleJobTimeoutMinutes": 10 }, + "WorkerBundleBacktestHealthCheck": true, "Sentry": { "Dsn": "https://ba7ab16fc3aa445480c115861b4ec8b9@glitch.kai.managing.live/4" },