using System.Diagnostics;
using Microsoft.Extensions.Diagnostics.HealthChecks;

namespace Managing.Api.HealthChecks
{
    /// <summary>
    /// Health check that inspects the Orleans cluster through the management grain and the
    /// process environment, and folds the findings into a single <see cref="HealthCheckResult"/>.
    /// </summary>
    /// <remarks>
    /// Five sub-checks run in sequence (cluster connectivity, silo configuration, clock/time-zone
    /// information, silo metrics, grain activation). Each one writes a section into the shared
    /// health-data dictionary and returns the issues it found. No issues means Healthy, one or two
    /// issues means Degraded, more than two means Unhealthy.
    /// </remarks>
    public class OrleansHealthCheck : IHealthCheck
    {
        /// <summary>Format used for every timestamp that is reported as UTC.</summary>
        private const string UtcTimestampFormat = "yyyy-MM-dd HH:mm:ss UTC";

        /// <summary>Number of active silos expected when the cluster runs on localhost.</summary>
        private const int ExpectedLocalhostActiveSilos = 1;

        /// <summary>Minimum number of active silos expected outside localhost (redundancy).</summary>
        private const int MinimumClusteredActiveSilos = 2;

        /// <summary>Largest number of issues that is still reported as Degraded instead of Unhealthy.</summary>
        private const int MaxIssuesForDegraded = 2;

        private readonly IGrainFactory _grainFactory;
        private readonly ILogger<OrleansHealthCheck> _logger;

        /// <summary>Creates the health check.</summary>
        /// <param name="grainFactory">Factory used to reach the Orleans management grain.</param>
        /// <param name="logger">Logger that receives unexpected failures of the check itself.</param>
        public OrleansHealthCheck(IGrainFactory grainFactory, ILogger<OrleansHealthCheck> logger)
        {
            _grainFactory = grainFactory;
            _logger = logger;
        }

        /// <summary>Runs all sub-checks and aggregates them into one result.</summary>
        /// <param name="context">Health check context supplied by the framework (not read here).</param>
        /// <param name="cancellationToken">
        /// Cancellation token supplied by the framework. It is passed to the sub-checks but is
        /// currently not forwarded to the grain calls.
        /// </param>
        /// <returns>
        /// Healthy when no sub-check reported an issue, Degraded for up to two issues, Unhealthy
        /// otherwise or when the check itself throws.
        /// </returns>
        public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
        {
            try
            {
                var healthData = new Dictionary<string, object>();
                var issues = new List<string>();
                var isHealthy = true;

                // Folds one sub-check outcome into the aggregated state.
                void Record((bool IsHealthy, List<string> Issues) outcome)
                {
                    if (!outcome.IsHealthy)
                    {
                        isHealthy = false;
                        issues.AddRange(outcome.Issues);
                    }
                }

                Record(await CheckClusterConnectivity(healthData, cancellationToken));
                Record(CheckSiloStatus(healthData));
                Record(CheckTimeSynchronization(healthData));
                Record(await CheckOrleansMetrics(healthData, cancellationToken));
                Record(await CheckGrainActivation(healthData, cancellationToken));

                if (isHealthy)
                {
                    return HealthCheckResult.Healthy("Orleans cluster is healthy", data: healthData);
                }

                var summary = string.Join(", ", issues);

                return issues.Count <= MaxIssuesForDegraded
                    ? HealthCheckResult.Degraded($"Orleans cluster has minor issues: {summary}", data: healthData)
                    : HealthCheckResult.Unhealthy($"Orleans cluster has critical issues: {summary}", data: healthData);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error checking Orleans health");

                // NOTE(review): the stack trace becomes part of the health report payload;
                // confirm the health endpoint is not exposed to untrusted callers.
                return HealthCheckResult.Unhealthy(
                    "Failed to check Orleans cluster health",
                    ex,
                    data: new Dictionary<string, object>
                    {
                        ["ErrorMessage"] = ex.Message,
                        ["ErrorType"] = ex.GetType().Name,
                        // StackTrace is null for exceptions that were never thrown; keep the value non-null.
                        ["StackTrace"] = ex.StackTrace ?? string.Empty
                    });
            }
        }

        /// <summary>
        /// Queries the management grain for the membership table and records it under
        /// "ClusterConnectivity".
        /// </summary>
        /// <param name="healthData">Shared report dictionary; receives the "ClusterConnectivity" section.</param>
        /// <param name="cancellationToken">Currently not forwarded to the grain call.</param>
        /// <returns>The issues found; healthy when the list is empty.</returns>
        private async Task<(bool IsHealthy, List<string> Issues)> CheckClusterConnectivity(Dictionary<string, object> healthData, CancellationToken cancellationToken)
        {
            var issues = new List<string>();
            var clusterInfo = new Dictionary<string, object>();

            try
            {
                try
                {
                    var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);
                    var membershipTable = await managementGrain.GetDetailedHosts();
                    var totalSilos = membershipTable.Length;

                    var membershipList = new List<Dictionary<string, object>>(totalSilos);
                    var activeSilos = 0;
                    var localhostEnvironment = false;

                    foreach (var silo in membershipTable)
                    {
                        membershipList.Add(DescribeSilo(
                            silo.SiloName,
                            silo.Status.ToString(),
                            silo.SiloAddress.ToString(),
                            silo.HostName,
                            silo.ProxyPort,
                            silo.StartTime));

                        if (silo.Status == SiloStatus.Active)
                        {
                            activeSilos++;
                        }

                        if (IsLocalhostHostName(silo.HostName))
                        {
                            localhostEnvironment = true;
                        }
                    }

                    // "ActiveSilos" reports silos whose status is Active (same meaning as in the
                    // "OrleansMetrics" section); the size of the whole table is "SiloCount".
                    clusterInfo["SiloCount"] = totalSilos;
                    clusterInfo["ActiveSilos"] = activeSilos;
                    clusterInfo["MembershipTable"] = membershipList;
                    clusterInfo["ActiveSilosCount"] = activeSilos;
                    clusterInfo["IsLocalhostEnvironment"] = localhostEnvironment;

                    if (totalSilos == 0)
                    {
                        issues.Add("No silos found in cluster");
                    }
                    else if (activeSilos == 0)
                    {
                        issues.Add("No active silos found in cluster");
                    }
                }
                catch (Exception ex)
                {
                    // A failed membership query is reported as an issue; the section is still written.
                    issues.Add($"Failed to get cluster membership: {ex.Message}");
                    clusterInfo["MembershipError"] = ex.Message;
                }

                healthData["ClusterConnectivity"] = clusterInfo;
                return (issues.Count == 0, issues);
            }
            catch (Exception ex)
            {
                issues.Add($"Cluster connectivity check failed: {ex.Message}");
                healthData["ClusterConnectivity"] = CreateErrorSection(ex);
                return (false, issues);
            }
        }

        /// <summary>
        /// Reads the silo-related environment variables (SILO_ROLE, TASK_SLOT,
        /// DISABLE_ORLEANS_CLUSTERING, RUN_ORLEANS_GRAINS) and records them under "SiloStatus".
        /// </summary>
        /// <param name="healthData">Shared report dictionary; receives the "SiloStatus" section.</param>
        /// <returns>
        /// The issues found: clustering disabled, grains disabled, or a flag whose value is not a
        /// valid boolean. Healthy when the list is empty.
        /// </returns>
        private static (bool IsHealthy, List<string> Issues) CheckSiloStatus(Dictionary<string, object> healthData)
        {
            var issues = new List<string>();
            var siloInfo = new Dictionary<string, object>();

            try
            {
                siloInfo["SiloRole"] = Environment.GetEnvironmentVariable("SILO_ROLE") ?? "Unknown";
                siloInfo["TaskSlot"] = Environment.GetEnvironmentVariable("TASK_SLOT") ?? "1";

                // Clustering counts as enabled unless the flag is explicitly true.
                var clusteringDisabled = ReadBooleanEnvironmentVariable("DISABLE_ORLEANS_CLUSTERING", false, issues);
                siloInfo["ClusteringDisabled"] = clusteringDisabled;
                if (clusteringDisabled)
                {
                    issues.Add("Orleans clustering is disabled - running in localhost mode");
                }

                // Grains count as enabled unless the flag is explicitly false.
                var grainsEnabled = ReadBooleanEnvironmentVariable("RUN_ORLEANS_GRAINS", true, issues);
                siloInfo["GrainsEnabled"] = grainsEnabled;
                if (!grainsEnabled)
                {
                    issues.Add("Orleans grains are disabled");
                }

                healthData["SiloStatus"] = siloInfo;
                return (issues.Count == 0, issues);
            }
            catch (Exception ex)
            {
                issues.Add($"Silo status check failed: {ex.Message}");
                healthData["SiloStatus"] = CreateErrorSection(ex);
                return (false, issues);
            }
        }

        /// <summary>
        /// Records current time, time-zone and process-uptime information under
        /// "TimeSynchronization". The data is informational: the check only fails when collecting
        /// it throws.
        /// </summary>
        /// <param name="healthData">Shared report dictionary; receives the "TimeSynchronization" section.</param>
        /// <returns>Healthy with no issues unless an exception occurred.</returns>
        private static (bool IsHealthy, List<string> Issues) CheckTimeSynchronization(Dictionary<string, object> healthData)
        {
            var issues = new List<string>();
            var timeInfo = new Dictionary<string, object>();

            try
            {
                var utcNow = DateTime.UtcNow;
                // Derived from utcNow so both reported timestamps describe the same instant.
                var localNow = utcNow.ToLocalTime();
                var timeZone = TimeZoneInfo.Local;

                timeInfo["UtcTime"] = utcNow.ToString(UtcTimestampFormat);
                timeInfo["LocalTime"] = localNow.ToString("yyyy-MM-dd HH:mm:ss");
                timeInfo["TimeZone"] = timeZone.DisplayName;
                timeInfo["TimeZoneId"] = timeZone.Id;
                timeInfo["UtcOffset"] = timeZone.GetUtcOffset(utcNow).ToString();

                if (timeZone.Id == "UTC" || timeZone.Id == "GMT")
                {
                    timeInfo["TimezoneWarning"] = "Using UTC/GMT timezone - ensure this is intentional";
                }

                // Process.StartTime is expressed in local time, so it is converted to UTC before
                // it is compared with utcNow or printed with a "UTC" suffix. The Process object
                // is disposed to release its handle.
                using var currentProcess = Process.GetCurrentProcess();
                var processStartUtc = currentProcess.StartTime.ToUniversalTime();
                var uptime = utcNow - processStartUtc;

                timeInfo["ProcessUptime"] = uptime.ToString(@"dd\.hh\:mm\:ss");
                timeInfo["ProcessStartTime"] = processStartUtc.ToString(UtcTimestampFormat);

                healthData["TimeSynchronization"] = timeInfo;
                return (true, issues);
            }
            catch (Exception ex)
            {
                issues.Add($"Time synchronization check failed: {ex.Message}");
                healthData["TimeSynchronization"] = CreateErrorSection(ex);
                return (false, issues);
            }
        }

        /// <summary>
        /// Counts active and dead silos reported by the management grain, compares them with the
        /// expectation for the detected environment, and records the result under "OrleansMetrics".
        /// </summary>
        /// <remarks>
        /// The environment is treated as localhost when any silo host name contains "localhost"
        /// or ".local". Localhost expects exactly one active silo and tolerates dead silos;
        /// otherwise at least two active silos and no dead silos are expected.
        /// </remarks>
        /// <param name="healthData">Shared report dictionary; receives the "OrleansMetrics" section.</param>
        /// <param name="cancellationToken">Currently not forwarded to the grain call.</param>
        /// <returns>The issues found; healthy when the list is empty.</returns>
        private async Task<(bool IsHealthy, List<string> Issues)> CheckOrleansMetrics(Dictionary<string, object> healthData, CancellationToken cancellationToken)
        {
            var issues = new List<string>();
            var metricsInfo = new Dictionary<string, object>();

            try
            {
                var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);

                try
                {
                    var siloStatistics = await managementGrain.GetDetailedHosts();
                    metricsInfo["SiloCount"] = siloStatistics.Length;

                    var siloMetricsList = new List<Dictionary<string, object>>(siloStatistics.Length);
                    var activeSilos = 0;
                    var deadSilos = 0;
                    var localhostEnvironment = false;

                    foreach (var silo in siloStatistics)
                    {
                        siloMetricsList.Add(DescribeSilo(
                            silo.SiloName,
                            silo.Status.ToString(),
                            silo.SiloAddress.ToString(),
                            silo.HostName,
                            silo.ProxyPort,
                            silo.StartTime));

                        if (silo.Status == SiloStatus.Active)
                        {
                            activeSilos++;
                        }
                        else if (silo.Status == SiloStatus.Dead)
                        {
                            deadSilos++;
                        }

                        if (IsLocalhostHostName(silo.HostName))
                        {
                            localhostEnvironment = true;
                        }
                    }

                    metricsInfo["SiloMetrics"] = siloMetricsList;
                    metricsInfo["ActiveSilos"] = activeSilos;
                    metricsInfo["DeadSilos"] = deadSilos;
                    metricsInfo["IsLocalhostEnvironment"] = localhostEnvironment;
                    metricsInfo["ExpectedActiveSilos"] = localhostEnvironment
                        ? ExpectedLocalhostActiveSilos
                        : MinimumClusteredActiveSilos;

                    if (activeSilos == 0)
                    {
                        issues.Add("No active silos found");
                    }
                    else if (localhostEnvironment)
                    {
                        if (activeSilos != ExpectedLocalhostActiveSilos)
                        {
                            issues.Add($"Localhost environment should have exactly 1 active silo, found {activeSilos}");
                        }

                        // Dead entries left behind by earlier local runs are expected, so they
                        // are noted but not counted as an issue.
                        if (deadSilos > 0)
                        {
                            metricsInfo["DeadSilosNote"] = "Dead silos in localhost are normal (from previous runs)";
                        }
                    }
                    else
                    {
                        if (activeSilos < MinimumClusteredActiveSilos)
                        {
                            issues.Add($"Production environment should have at least 2 active silos for redundancy, found {activeSilos}");
                        }

                        if (deadSilos > 0)
                        {
                            issues.Add($"Found {deadSilos} dead silos in production environment");
                        }
                    }
                }
                catch (Exception ex)
                {
                    issues.Add($"Failed to get Orleans statistics: {ex.Message}");
                    metricsInfo["StatisticsError"] = ex.Message;
                }

                healthData["OrleansMetrics"] = metricsInfo;
                return (issues.Count == 0, issues);
            }
            catch (Exception ex)
            {
                issues.Add($"Orleans metrics check failed: {ex.Message}");
                healthData["OrleansMetrics"] = CreateErrorSection(ex);
                return (false, issues);
            }
        }

        /// <summary>
        /// Verifies that a grain call succeeds by invoking the management grain, and records the
        /// outcome under "GrainActivation".
        /// </summary>
        /// <param name="healthData">Shared report dictionary; receives the "GrainActivation" section.</param>
        /// <param name="cancellationToken">Currently not forwarded to the grain call.</param>
        /// <returns>The issues found; healthy when the list is empty.</returns>
        private async Task<(bool IsHealthy, List<string> Issues)> CheckGrainActivation(Dictionary<string, object> healthData, CancellationToken cancellationToken)
        {
            var issues = new List<string>();
            var grainInfo = new Dictionary<string, object>();

            try
            {
                var testGrain = _grainFactory.GetGrain<IManagementGrain>(0);

                try
                {
                    var hosts = await testGrain.GetDetailedHosts();
                    grainInfo["GrainActivationTest"] = "Success";
                    grainInfo["ActivationTestResult"] = $"Successfully activated ManagementGrain and retrieved {hosts.Length} hosts";
                }
                catch (Exception ex)
                {
                    issues.Add($"Grain activation test failed: {ex.Message}");
                    grainInfo["GrainActivationTest"] = "Failed";
                    grainInfo["ActivationTestError"] = ex.Message;
                }

                grainInfo["GrainFactoryAvailable"] = _grainFactory is not null;

                healthData["GrainActivation"] = grainInfo;
                return (issues.Count == 0, issues);
            }
            catch (Exception ex)
            {
                issues.Add($"Grain activation check failed: {ex.Message}");
                healthData["GrainActivation"] = CreateErrorSection(ex);
                return (false, issues);
            }
        }

        /// <summary>Builds the report entry for one silo of the membership table.</summary>
        /// <returns>
        /// A dictionary with SiloName, Status, SiloAddress, HostName, ProxyPort and StartTime;
        /// StartTime is "Unknown" when <paramref name="startTime"/> is the default value.
        /// </returns>
        private static Dictionary<string, object> DescribeSilo(
            string siloName,
            string status,
            string siloAddress,
            string hostName,
            int proxyPort,
            DateTime startTime)
        {
            return new Dictionary<string, object>
            {
                ["SiloName"] = siloName,
                ["Status"] = status,
                ["SiloAddress"] = siloAddress,
                ["HostName"] = hostName,
                ["ProxyPort"] = proxyPort,
                ["StartTime"] = startTime != default ? startTime.ToString(UtcTimestampFormat) : "Unknown"
            };
        }

        /// <summary>
        /// Returns true when the host name contains "localhost" or ".local". A null host name
        /// yields false, and the comparison ignores case because host names are case-insensitive.
        /// </summary>
        private static bool IsLocalhostHostName(string hostName)
        {
            return hostName is not null
                && (hostName.Contains("localhost", StringComparison.OrdinalIgnoreCase)
                    || hostName.Contains(".local", StringComparison.OrdinalIgnoreCase));
        }

        /// <summary>
        /// Reads a boolean environment variable. An unset or empty variable yields
        /// <paramref name="defaultValue"/>. A value that is not a valid boolean also yields the
        /// default and is added to <paramref name="issues"/>, so a misconfiguration shows up in
        /// the report without aborting the whole check.
        /// </summary>
        private static bool ReadBooleanEnvironmentVariable(string name, bool defaultValue, List<string> issues)
        {
            var rawValue = Environment.GetEnvironmentVariable(name);
            if (string.IsNullOrEmpty(rawValue))
            {
                return defaultValue;
            }

            if (bool.TryParse(rawValue, out var parsedValue))
            {
                return parsedValue;
            }

            issues.Add($"Environment variable {name} has an invalid boolean value '{rawValue}'");
            return defaultValue;
        }

        /// <summary>Builds the section written when a sub-check fails as a whole.</summary>
        private static Dictionary<string, object> CreateErrorSection(Exception ex)
        {
            return new Dictionary<string, object>
            {
                ["Error"] = ex.Message,
                ["ErrorType"] = ex.GetType().Name
            };
        }
    }
}