From b9700904927dea15025d38780a93e40eef451442 Mon Sep 17 00:00:00 2001
From: cryptooda
Date: Fri, 3 Oct 2025 12:46:44 +0700
Subject: [PATCH] Add orleans healthchecks

---
 .../HealthChecks/OrleansHealthCheck.cs        | 376 ++++++++++++++++++
 src/Managing.Api/Program.cs                   |   3 +-
 2 files changed, 378 insertions(+), 1 deletion(-)
 create mode 100644 src/Managing.Api/HealthChecks/OrleansHealthCheck.cs

diff --git a/src/Managing.Api/HealthChecks/OrleansHealthCheck.cs b/src/Managing.Api/HealthChecks/OrleansHealthCheck.cs
new file mode 100644
index 00000000..c2ec9da0
--- /dev/null
+++ b/src/Managing.Api/HealthChecks/OrleansHealthCheck.cs
@@ -0,0 +1,376 @@
+using System.Diagnostics;
+using System.Globalization;
+using Microsoft.Extensions.Diagnostics.HealthChecks;
+
+namespace Managing.Api.HealthChecks
+{
+    /// <summary>
+    /// Reports the health of the Orleans cluster reachable through the injected grain factory.
+    /// Cluster membership is fetched once per run from the management grain; that single snapshot
+    /// feeds the connectivity, metrics and grain-activation sections of the report.
+    /// </summary>
+    public class OrleansHealthCheck : IHealthCheck
+    {
+        private const string LocalTimestampFormat = "yyyy-MM-dd HH:mm:ss";
+        private const string UtcTimestampFormat = "yyyy-MM-dd HH:mm:ss UTC";
+
+        private readonly IGrainFactory _grainFactory;
+        private readonly ILogger<OrleansHealthCheck> _logger;
+
+        /// <summary>Creates the health check.</summary>
+        /// <param name="grainFactory">Factory used to obtain the Orleans management grain.</param>
+        /// <param name="logger">Logger that receives failures raised while checking.</param>
+        public OrleansHealthCheck(IGrainFactory grainFactory, ILogger<OrleansHealthCheck> logger)
+        {
+            _grainFactory = grainFactory;
+            _logger = logger;
+        }
+
+        /// <summary>
+        /// Runs every sub-check and folds the collected issues into one result: no issues is
+        /// Healthy, one or two issues is Degraded, three or more is Unhealthy.
+        /// </summary>
+        /// <param name="context">Health check registration context (not used).</param>
+        /// <param name="cancellationToken">Cancels the membership query.</param>
+        /// <returns>The aggregated result; per-section details are attached as result data.</returns>
+        /// <exception cref="OperationCanceledException">The caller cancelled <paramref name="cancellationToken"/>.</exception>
+        public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
+        {
+            try
+            {
+                var healthData = new Dictionary<string, object>();
+                var issues = new List<string>();
+
+                // One membership query per run: the three membership-based sections then describe
+                // the same snapshot instead of three separate grain calls.
+                var membership = await FetchMembershipAsync(cancellationToken);
+
+                issues.AddRange(CheckClusterConnectivity(healthData, membership));
+                issues.AddRange(CheckSiloStatus(healthData));
+                issues.AddRange(CheckTimeSynchronization(healthData));
+                issues.AddRange(CheckOrleansMetrics(healthData, membership));
+                issues.AddRange(CheckGrainActivation(healthData, membership));
+
+                if (issues.Count == 0)
+                {
+                    return HealthCheckResult.Healthy("Orleans cluster is healthy", data: healthData);
+                }
+
+                var summary = string.Join(", ", issues);
+                return issues.Count <= 2
+                    ? HealthCheckResult.Degraded($"Orleans cluster has minor issues: {summary}", data: healthData)
+                    : HealthCheckResult.Unhealthy($"Orleans cluster has critical issues: {summary}", data: healthData);
+            }
+            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+            {
+                // Caller-requested cancellation is not a cluster failure; let the host observe it.
+                throw;
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Error checking Orleans health");
+
+                // The stack trace stays in the log and on the result's Exception; it is no longer
+                // copied into the data dictionary, which may be serialized to health endpoint clients.
+                return HealthCheckResult.Unhealthy(
+                    "Failed to check Orleans cluster health",
+                    ex,
+                    data: new Dictionary<string, object>
+                    {
+                        ["ErrorMessage"] = ex.Message,
+                        ["ErrorType"] = ex.GetType().Name
+                    });
+            }
+        }
+
+        /// <summary>Per-silo report entry plus the fields the checks aggregate on.</summary>
+        private sealed record SiloSnapshot(Dictionary<string, object> Details, SiloStatus Status, bool IsLocalhost);
+
+        /// <summary>Outcome of the single membership query made during one health check run.</summary>
+        private sealed record MembershipSnapshot(bool Succeeded, IReadOnlyList<SiloSnapshot> Silos, string ErrorMessage)
+        {
+            public int ActiveCount => Silos.Count(silo => silo.Status == SiloStatus.Active);
+
+            public int DeadCount => Silos.Count(silo => silo.Status == SiloStatus.Dead);
+
+            public bool IsLocalhostEnvironment => Silos.Any(silo => silo.IsLocalhost);
+
+            public List<Dictionary<string, object>> SiloDetails => Silos.Select(silo => silo.Details).ToList();
+        }
+
+        /// <summary>
+        /// Queries cluster membership through the management grain (integer key 0).
+        /// Failures are captured in the snapshot so each dependent section can report them.
+        /// </summary>
+        private async Task<MembershipSnapshot> FetchMembershipAsync(CancellationToken cancellationToken)
+        {
+            try
+            {
+                var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);
+                var hosts = await managementGrain.GetDetailedHosts().WaitAsync(cancellationToken);
+
+                var silos = new List<SiloSnapshot>();
+                foreach (var silo in hosts)
+                {
+                    var details = new Dictionary<string, object>
+                    {
+                        ["SiloName"] = silo.SiloName,
+                        ["Status"] = silo.Status.ToString(),
+                        ["SiloAddress"] = silo.SiloAddress.ToString(),
+                        ["HostName"] = silo.HostName,
+                        ["ProxyPort"] = silo.ProxyPort,
+                        ["StartTime"] = silo.StartTime != default
+                            ? silo.StartTime.ToString(UtcTimestampFormat, CultureInfo.InvariantCulture)
+                            : "Unknown"
+                    };
+
+                    // A missing host name must not abort the whole check, so treat it as empty.
+                    var hostName = silo.HostName ?? string.Empty;
+                    var isLocalhost = hostName.Contains("localhost", StringComparison.Ordinal)
+                        || hostName.Contains(".local", StringComparison.Ordinal);
+
+                    silos.Add(new SiloSnapshot(details, silo.Status, isLocalhost));
+                }
+
+                return new MembershipSnapshot(true, silos, string.Empty);
+            }
+            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+            {
+                throw;
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex, "Failed to query Orleans cluster membership");
+                return new MembershipSnapshot(false, Array.Empty<SiloSnapshot>(), ex.Message);
+            }
+        }
+
+        /// <summary>
+        /// Writes the "ClusterConnectivity" section and returns its issues: a failed membership
+        /// query, an empty membership table, or a table without any active silo.
+        /// </summary>
+        private static List<string> CheckClusterConnectivity(Dictionary<string, object> healthData, MembershipSnapshot membership)
+        {
+            var issues = new List<string>();
+            var clusterInfo = new Dictionary<string, object>();
+
+            if (!membership.Succeeded)
+            {
+                issues.Add($"Failed to get cluster membership: {membership.ErrorMessage}");
+                clusterInfo["MembershipError"] = membership.ErrorMessage;
+            }
+            else
+            {
+                var activeSilos = membership.ActiveCount;
+
+                // "ActiveSilos" has always carried the total entry count; the key is kept for consumers.
+                clusterInfo["ActiveSilos"] = membership.Silos.Count;
+                clusterInfo["MembershipTable"] = membership.SiloDetails;
+                clusterInfo["ActiveSilosCount"] = activeSilos;
+                clusterInfo["IsLocalhostEnvironment"] = membership.IsLocalhostEnvironment;
+
+                if (membership.Silos.Count == 0)
+                {
+                    issues.Add("No silos found in cluster");
+                }
+                else if (activeSilos == 0)
+                {
+                    issues.Add("No active silos found in cluster");
+                }
+            }
+
+            healthData["ClusterConnectivity"] = clusterInfo;
+            return issues;
+        }
+
+        /// <summary>
+        /// Writes the "SiloStatus" section from the SILO_ROLE, TASK_SLOT, DISABLE_ORLEANS_CLUSTERING
+        /// and RUN_ORLEANS_GRAINS environment variables and returns the issues they imply.
+        /// </summary>
+        private static List<string> CheckSiloStatus(Dictionary<string, object> healthData)
+        {
+            var issues = new List<string>();
+            var siloInfo = new Dictionary<string, object>
+            {
+                ["SiloRole"] = Environment.GetEnvironmentVariable("SILO_ROLE") ?? "Unknown",
+                ["TaskSlot"] = Environment.GetEnvironmentVariable("TASK_SLOT") ?? "1"
+            };
+
+            var clusteringDisabled = ReadBooleanFlag("DISABLE_ORLEANS_CLUSTERING", false, issues);
+            siloInfo["ClusteringDisabled"] = clusteringDisabled;
+            if (clusteringDisabled)
+            {
+                issues.Add("Orleans clustering is disabled - running in localhost mode");
+            }
+
+            var grainsEnabled = ReadBooleanFlag("RUN_ORLEANS_GRAINS", true, issues);
+            siloInfo["GrainsEnabled"] = grainsEnabled;
+            if (!grainsEnabled)
+            {
+                issues.Add("Orleans grains are disabled");
+            }
+
+            healthData["SiloStatus"] = siloInfo;
+            return issues;
+        }
+
+        /// <summary>
+        /// Reads a boolean environment variable. Unset or empty yields <paramref name="defaultValue"/>;
+        /// a value that is not a boolean is recorded as an issue instead of throwing.
+        /// </summary>
+        private static bool ReadBooleanFlag(string variableName, bool defaultValue, List<string> issues)
+        {
+            var rawValue = Environment.GetEnvironmentVariable(variableName);
+            if (string.IsNullOrEmpty(rawValue))
+            {
+                return defaultValue;
+            }
+
+            if (bool.TryParse(rawValue, out var parsed))
+            {
+                return parsed;
+            }
+
+            issues.Add($"Environment variable {variableName} is not a valid boolean; assuming {defaultValue}");
+            return defaultValue;
+        }
+
+        /// <summary>
+        /// Writes the "TimeSynchronization" section: current UTC and local time, the local time
+        /// zone, and the process start time and uptime. Informational unless reading them fails.
+        /// </summary>
+        private static List<string> CheckTimeSynchronization(Dictionary<string, object> healthData)
+        {
+            var issues = new List<string>();
+
+            try
+            {
+                var utcNow = DateTime.UtcNow;
+                var timeZone = TimeZoneInfo.Local;
+                var timeInfo = new Dictionary<string, object>
+                {
+                    ["UtcTime"] = utcNow.ToString(UtcTimestampFormat, CultureInfo.InvariantCulture),
+                    ["LocalTime"] = DateTime.Now.ToString(LocalTimestampFormat, CultureInfo.InvariantCulture),
+                    ["TimeZone"] = timeZone.DisplayName,
+                    ["TimeZoneId"] = timeZone.Id,
+                    ["UtcOffset"] = timeZone.GetUtcOffset(utcNow).ToString()
+                };
+
+                if (timeZone.Id == "UTC" || timeZone.Id == "GMT")
+                {
+                    timeInfo["TimezoneWarning"] = "Using UTC/GMT timezone - ensure this is intentional";
+                }
+
+                // Process.StartTime is local time: convert it before subtracting from UTC "now" and
+                // before labelling it UTC, otherwise both values are off by the local UTC offset.
+                using var currentProcess = Process.GetCurrentProcess();
+                var processStartUtc = currentProcess.StartTime.ToUniversalTime();
+                var uptime = utcNow - processStartUtc;
+                timeInfo["ProcessUptime"] = uptime.ToString(@"dd\.hh\:mm\:ss", CultureInfo.InvariantCulture);
+                timeInfo["ProcessStartTime"] = processStartUtc.ToString(UtcTimestampFormat, CultureInfo.InvariantCulture);
+
+                healthData["TimeSynchronization"] = timeInfo;
+            }
+            catch (Exception ex)
+            {
+                issues.Add($"Time synchronization check failed: {ex.Message}");
+                healthData["TimeSynchronization"] = new Dictionary<string, object>
+                {
+                    ["Error"] = ex.Message,
+                    ["ErrorType"] = ex.GetType().Name
+                };
+            }
+
+            return issues;
+        }
+
+        /// <summary>
+        /// Writes the "OrleansMetrics" section and compares silo counts with what the environment
+        /// implies: exactly one active silo when a localhost-style host name is present, otherwise
+        /// at least two active silos and no dead ones.
+        /// </summary>
+        private static List<string> CheckOrleansMetrics(Dictionary<string, object> healthData, MembershipSnapshot membership)
+        {
+            var issues = new List<string>();
+            var metricsInfo = new Dictionary<string, object>();
+            healthData["OrleansMetrics"] = metricsInfo;
+
+            if (!membership.Succeeded)
+            {
+                issues.Add($"Failed to get Orleans statistics: {membership.ErrorMessage}");
+                metricsInfo["StatisticsError"] = membership.ErrorMessage;
+                return issues;
+            }
+
+            var activeSilos = membership.ActiveCount;
+            var deadSilos = membership.DeadCount;
+            var localhostEnvironment = membership.IsLocalhostEnvironment;
+
+            metricsInfo["SiloCount"] = membership.Silos.Count;
+            metricsInfo["SiloMetrics"] = membership.SiloDetails;
+            metricsInfo["ActiveSilos"] = activeSilos;
+            metricsInfo["DeadSilos"] = deadSilos;
+            metricsInfo["IsLocalhostEnvironment"] = localhostEnvironment;
+            metricsInfo["ExpectedActiveSilos"] = localhostEnvironment ? 1 : 2;
+
+            if (activeSilos == 0)
+            {
+                issues.Add("No active silos found");
+            }
+            else if (localhostEnvironment)
+            {
+                if (activeSilos != 1)
+                {
+                    issues.Add($"Localhost environment should have exactly 1 active silo, found {activeSilos}");
+                }
+
+                // Entries left behind by earlier local runs are expected, so only annotate them.
+                if (deadSilos > 0)
+                {
+                    metricsInfo["DeadSilosNote"] = "Dead silos in localhost are normal (from previous runs)";
+                }
+            }
+            else
+            {
+                if (activeSilos < 2)
+                {
+                    issues.Add($"Production environment should have at least 2 active silos for redundancy, found {activeSilos}");
+                }
+
+                if (deadSilos > 0)
+                {
+                    issues.Add($"Found {deadSilos} dead silos in production environment");
+                }
+            }
+
+            return issues;
+        }
+
+        /// <summary>
+        /// Writes the "GrainActivation" section: the membership query doubles as the activation
+        /// test because it is a call on the management grain.
+        /// </summary>
+        private List<string> CheckGrainActivation(Dictionary<string, object> healthData, MembershipSnapshot membership)
+        {
+            var issues = new List<string>();
+            var grainInfo = new Dictionary<string, object>();
+
+            if (membership.Succeeded)
+            {
+                grainInfo["GrainActivationTest"] = "Success";
+                grainInfo["ActivationTestResult"] = $"Successfully activated ManagementGrain and retrieved {membership.Silos.Count} hosts";
+            }
+            else
+            {
+                issues.Add($"Grain activation test failed: {membership.ErrorMessage}");
+                grainInfo["GrainActivationTest"] = "Failed";
+                grainInfo["ActivationTestError"] = membership.ErrorMessage;
+            }
+
+            grainInfo["GrainFactoryAvailable"] = _grainFactory != null;
+
+            healthData["GrainActivation"] = grainInfo;
+            return issues;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Managing.Api/Program.cs b/src/Managing.Api/Program.cs
index f511b51d..93e2eb0d 100644
--- a/src/Managing.Api/Program.cs
+++ b/src/Managing.Api/Program.cs
@@ -124,7 +124,8 @@ builder.Services.AddHealthChecks()
     .AddUrlGroup(new Uri($"{influxUrl}/health"), name: "influxdb", tags: ["database"])
     .AddCheck("web3proxy", tags: ["api", "external"])
     .AddCheck("candle-data", tags: ["database", "candles"])
-    .AddCheck("gmx-connectivity", tags: ["api", "external"]);
+    .AddCheck("gmx-connectivity", tags: ["api", "external"])
+    .AddCheck<OrleansHealthCheck>("orleans-cluster", tags: ["orleans", "cluster"]);
 
 builder.Host.UseSerilog((hostBuilder, loggerConfiguration) =>
 {