// Listing metadata: 438 lines, 18 KiB, C#
using System.Diagnostics;
|
|
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
|
|
|
namespace Managing.Api.HealthChecks
|
|
{
|
|
public class OrleansHealthCheck : IHealthCheck
|
|
{
|
|
// Factory the individual checks use to obtain the Orleans management grain.
private readonly IGrainFactory _grainFactory;

// Logger used to report an unexpected failure of the overall health check.
private readonly ILogger<OrleansHealthCheck> _logger;

/// <summary>
/// Creates the health check and stores the supplied collaborators for later use.
/// </summary>
/// <param name="grainFactory">Factory kept for grain lookups performed by the checks.</param>
/// <param name="logger">Logger kept for error reporting.</param>
public OrleansHealthCheck(IGrainFactory grainFactory, ILogger<OrleansHealthCheck> logger)
    => (_grainFactory, _logger) = (grainFactory, logger);
|
|
|
|
/// <summary>
/// Runs the cluster connectivity, silo status, time, metrics and grain activation
/// checks in that order and folds their outcomes into a single health result.
/// </summary>
/// <param name="context">Health check context supplied by the framework (not read by this implementation).</param>
/// <param name="cancellationToken">Token observed before each sub-check and passed on to the asynchronous ones.</param>
/// <returns>
/// Healthy when no sub-check reports a problem, Degraded when at most two issues were
/// collected, Unhealthy when more were collected or when an unexpected exception occurs.
/// Every result carries the diagnostic data gathered by the sub-checks.
/// </returns>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="cancellationToken"/> is cancelled, so that a cancelled
/// probe is not misreported as an unhealthy cluster.
/// </exception>
public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
{
    // Up to this many collected issues the cluster is reported as Degraded rather than Unhealthy.
    const int MaxIssuesForDegraded = 2;

    try
    {
        var healthData = new Dictionary<string, object>();
        var issues = new List<string>();
        var isHealthy = true;

        // Folds one sub-check outcome into the aggregated state.
        void Record((bool IsHealthy, List<string> Issues) outcome)
        {
            if (outcome.IsHealthy)
            {
                return;
            }

            isHealthy = false;
            issues.AddRange(outcome.Issues);
        }

        // Check cluster connectivity
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckClusterConnectivity(healthData, cancellationToken));

        // Check silo status and roles
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckSiloStatus(healthData, cancellationToken));

        // Check time synchronization
        cancellationToken.ThrowIfCancellationRequested();
        Record(CheckTimeSynchronization(healthData));

        // Check Orleans metrics
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckOrleansMetrics(healthData, cancellationToken));

        // Check grain activation
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckGrainActivation(healthData, cancellationToken));

        // Determine overall health status
        if (isHealthy)
        {
            return HealthCheckResult.Healthy("Orleans cluster is healthy", data: healthData);
        }

        var issueSummary = string.Join(", ", issues);

        return issues.Count <= MaxIssuesForDegraded
            ? HealthCheckResult.Degraded($"Orleans cluster has minor issues: {issueSummary}", data: healthData)
            : HealthCheckResult.Unhealthy($"Orleans cluster has critical issues: {issueSummary}", data: healthData);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // The caller gave up on the probe; that says nothing about cluster health.
        throw;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error checking Orleans health");

        // NOTE(review): the stack trace is part of the returned data; confirm the health
        // endpoint is not publicly reachable, or drop this entry.
        return HealthCheckResult.Unhealthy(
            "Failed to check Orleans cluster health",
            ex,
            data: new Dictionary<string, object>
            {
                ["ErrorMessage"] = ex.Message,
                ["ErrorType"] = ex.GetType().Name,
                // StackTrace is null for an exception that was never thrown.
                ["StackTrace"] = ex.StackTrace ?? string.Empty
            });
    }
}
|
|
|
|
/// <summary>
/// Asks the management grain for the detailed host list and records the membership
/// snapshot under <c>healthData["ClusterConnectivity"]</c>.
/// </summary>
/// <param name="healthData">Shared diagnostics dictionary; its "ClusterConnectivity" entry is set by this method.</param>
/// <param name="cancellationToken">Token that bounds the wait for the management grain call.</param>
/// <returns>
/// <c>IsHealthy</c> is true when no issue was recorded. An issue is recorded when the host
/// list is empty, when no host is Active, or when the membership query fails.
/// </returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="cancellationToken"/> is cancelled during the query.</exception>
private async Task<(bool IsHealthy, List<string> Issues)> CheckClusterConnectivity(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var issues = new List<string>();
    var clusterInfo = new Dictionary<string, object>();

    try
    {
        // Check cluster membership
        try
        {
            var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);

            // GetDetailedHosts accepts no token, so WaitAsync is what lets an
            // unresponsive cluster be abandoned instead of hanging the probe.
            var membershipTable = await managementGrain.GetDetailedHosts().WaitAsync(cancellationToken);
            var totalSilos = membershipTable.Count();

            // NOTE(review): despite its name this entry holds the number of ALL returned
            // hosts (any status); the Active-only figure is "ActiveSilosCount". Value kept
            // as-is for existing consumers.
            clusterInfo["ActiveSilos"] = totalSilos;

            var membershipList = new List<object>();
            var activeSilos = 0;
            var localhostEnvironment = false;

            // One pass builds the per-silo report and the aggregate counters.
            foreach (var silo in membershipTable)
            {
                membershipList.Add(new Dictionary<string, object>
                {
                    ["SiloName"] = silo.SiloName,
                    ["Status"] = silo.Status.ToString(),
                    ["SiloAddress"] = silo.SiloAddress.ToString(),
                    ["HostName"] = silo.HostName,
                    ["ProxyPort"] = silo.ProxyPort,
                    ["StartTime"] = silo.StartTime != default
                        ? silo.StartTime.ToString("yyyy-MM-dd HH:mm:ss 'UTC'", System.Globalization.CultureInfo.InvariantCulture)
                        : "Unknown"
                });

                if (silo.Status == SiloStatus.Active)
                {
                    activeSilos++;
                }

                // A missing host name must not abort the whole check.
                if (silo.HostName is { } hostName
                    && (hostName.Contains("localhost", StringComparison.Ordinal)
                        || hostName.Contains(".local", StringComparison.Ordinal)))
                {
                    localhostEnvironment = true;
                }
            }

            clusterInfo["MembershipTable"] = membershipList;
            clusterInfo["ActiveSilosCount"] = activeSilos;
            clusterInfo["IsLocalhostEnvironment"] = localhostEnvironment;

            if (totalSilos == 0)
            {
                issues.Add("No silos found in cluster");
            }
            else if (activeSilos == 0)
            {
                issues.Add("No active silos found in cluster");
            }
        }
        catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
        {
            issues.Add($"Failed to get cluster membership: {ex.Message}");
            clusterInfo["MembershipError"] = ex.Message;
        }

        healthData["ClusterConnectivity"] = clusterInfo;
        return (issues.Count == 0, issues);
    }
    catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
    {
        issues.Add($"Cluster connectivity check failed: {ex.Message}");
        healthData["ClusterConnectivity"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
|
|
|
|
/// <summary>
/// Reports the silo configuration read from environment variables under
/// <c>healthData["SiloStatus"]</c>.
/// </summary>
/// <remarks>
/// Reads SILO_ROLE (default "Unknown"), TASK_SLOT (default "1"),
/// DISABLE_ORLEANS_CLUSTERING (unset or empty means false) and RUN_ORLEANS_GRAINS
/// (unset or empty means true). A flag that is set but is not a valid boolean is
/// reported as an issue and treated as its default, so the rest of the report is still produced.
/// </remarks>
/// <param name="healthData">Shared diagnostics dictionary; its "SiloStatus" entry is set by this method.</param>
/// <param name="cancellationToken">Unused: the method performs no asynchronous work. Kept so the signature matches the other checks.</param>
/// <returns>
/// A completed task whose <c>IsHealthy</c> is true when no issue was recorded. Issues are
/// recorded when clustering is disabled, when grains are disabled, or when a flag is malformed.
/// </returns>
private static Task<(bool IsHealthy, List<string> Issues)> CheckSiloStatus(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var issues = new List<string>();
    var siloInfo = new Dictionary<string, object>();

    // Reads a boolean environment flag without throwing on malformed input.
    bool ReadFlag(string variable, bool defaultValue)
    {
        var raw = Environment.GetEnvironmentVariable(variable);
        if (string.IsNullOrEmpty(raw))
        {
            return defaultValue;
        }

        if (bool.TryParse(raw, out var parsed))
        {
            return parsed;
        }

        issues.Add($"Environment variable {variable} has an invalid boolean value '{raw}'");
        return defaultValue;
    }

    try
    {
        // Check silo roles and configuration
        siloInfo["SiloRole"] = Environment.GetEnvironmentVariable("SILO_ROLE") ?? "Unknown";
        siloInfo["TaskSlot"] = Environment.GetEnvironmentVariable("TASK_SLOT") ?? "1";

        // Check if clustering is disabled
        var clusteringDisabled = ReadFlag("DISABLE_ORLEANS_CLUSTERING", defaultValue: false);
        siloInfo["ClusteringDisabled"] = clusteringDisabled;

        if (clusteringDisabled)
        {
            issues.Add("Orleans clustering is disabled - running in localhost mode");
        }

        // Check Orleans grains configuration
        var grainsEnabled = ReadFlag("RUN_ORLEANS_GRAINS", defaultValue: true);
        siloInfo["GrainsEnabled"] = grainsEnabled;

        if (!grainsEnabled)
        {
            issues.Add("Orleans grains are disabled");
        }

        healthData["SiloStatus"] = siloInfo;
        return Task.FromResult<(bool IsHealthy, List<string> Issues)>((issues.Count == 0, issues));
    }
    catch (Exception ex)
    {
        issues.Add($"Silo status check failed: {ex.Message}");
        healthData["SiloStatus"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return Task.FromResult<(bool IsHealthy, List<string> Issues)>((false, issues));
    }
}
|
|
|
|
/// <summary>
/// Records clock, time zone and process uptime information under
/// <c>healthData["TimeSynchronization"]</c>.
/// </summary>
/// <remarks>
/// The method only reports values; it does not compare the clock against any external
/// source. The sole failure path is an exception while gathering the data.
/// </remarks>
/// <param name="healthData">Shared diagnostics dictionary; its "TimeSynchronization" entry is set by this method.</param>
/// <returns>
/// <c>(true, empty list)</c> when the data was gathered, otherwise <c>false</c> with a
/// single issue describing the exception.
/// </returns>
private static (bool IsHealthy, List<string> Issues) CheckTimeSynchronization(Dictionary<string, object> healthData)
{
    var issues = new List<string>();
    var timeInfo = new Dictionary<string, object>();

    // Machine-readable timestamps must not vary with the host culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    try
    {
        var utcNow = DateTime.UtcNow;
        var localNow = DateTime.Now;
        var timeZone = TimeZoneInfo.Local;

        timeInfo["UtcTime"] = utcNow.ToString("yyyy-MM-dd HH:mm:ss 'UTC'", invariant);
        timeInfo["LocalTime"] = localNow.ToString("yyyy-MM-dd HH:mm:ss", invariant);
        timeInfo["TimeZone"] = timeZone.DisplayName;
        timeInfo["TimeZoneId"] = timeZone.Id;
        timeInfo["UtcOffset"] = timeZone.GetUtcOffset(utcNow).ToString();

        // Check if timezone is properly configured
        if (timeZone.Id == "UTC" || timeZone.Id == "GMT")
        {
            timeInfo["TimezoneWarning"] = "Using UTC/GMT timezone - ensure this is intentional";
        }

        // Process.StartTime is expressed in local time; convert it before subtracting it
        // from a UTC instant, otherwise the uptime is off by the UTC offset.
        using var currentProcess = Process.GetCurrentProcess();
        var processStartUtc = currentProcess.StartTime.ToUniversalTime();
        var uptime = utcNow - processStartUtc;

        timeInfo["ProcessUptime"] = uptime.ToString(@"dd\.hh\:mm\:ss", invariant);
        timeInfo["ProcessStartTime"] = processStartUtc.ToString("yyyy-MM-dd HH:mm:ss 'UTC'", invariant);

        healthData["TimeSynchronization"] = timeInfo;
        return (true, issues);
    }
    catch (Exception ex)
    {
        issues.Add($"Time synchronization check failed: {ex.Message}");
        healthData["TimeSynchronization"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
|
|
|
|
/// <summary>
/// Collects per-silo information from the management grain, counts Active and Dead silos
/// and records the result under <c>healthData["OrleansMetrics"]</c>.
/// </summary>
/// <remarks>
/// The environment is treated as "localhost" when any returned host name contains
/// "localhost" or ".local". In that mode exactly one Active silo is expected and Dead
/// silos are only annotated. Otherwise at least two Active silos are expected and every
/// Dead silo is reported as an issue.
/// </remarks>
/// <param name="healthData">Shared diagnostics dictionary; its "OrleansMetrics" entry is set by this method.</param>
/// <param name="cancellationToken">Token that bounds the wait for the management grain call.</param>
/// <returns><c>IsHealthy</c> is true when no issue was recorded.</returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="cancellationToken"/> is cancelled during the query.</exception>
private async Task<(bool IsHealthy, List<string> Issues)> CheckOrleansMetrics(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    // Single source of truth for the thresholds used in both the report and the checks.
    const int ExpectedLocalhostSilos = 1;
    const int MinimumClusterSilos = 2;

    var issues = new List<string>();
    var metricsInfo = new Dictionary<string, object>();

    try
    {
        // Get Orleans statistics
        var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);

        try
        {
            // GetDetailedHosts accepts no token, so WaitAsync is what lets an
            // unresponsive cluster be abandoned instead of hanging the probe.
            var siloStatistics = await managementGrain.GetDetailedHosts().WaitAsync(cancellationToken);
            metricsInfo["SiloCount"] = siloStatistics.Count();

            // Get basic silo information
            var siloMetricsList = new List<object>();
            var activeSilos = 0;
            var deadSilos = 0;
            var localhostEnvironment = false;

            foreach (var silo in siloStatistics)
            {
                siloMetricsList.Add(new Dictionary<string, object>
                {
                    ["SiloName"] = silo.SiloName,
                    ["Status"] = silo.Status.ToString(),
                    ["SiloAddress"] = silo.SiloAddress.ToString(),
                    ["HostName"] = silo.HostName,
                    ["ProxyPort"] = silo.ProxyPort,
                    ["StartTime"] = silo.StartTime != default
                        ? silo.StartTime.ToString("yyyy-MM-dd HH:mm:ss 'UTC'", System.Globalization.CultureInfo.InvariantCulture)
                        : "Unknown"
                });

                // Count silo statuses
                if (silo.Status == SiloStatus.Active)
                {
                    activeSilos++;
                }
                else if (silo.Status == SiloStatus.Dead)
                {
                    deadSilos++;
                }

                // Detect localhost environment: one matching host name is enough.
                // A missing host name must not abort the whole check.
                if (silo.HostName is { } hostName
                    && (hostName.Contains("localhost", StringComparison.Ordinal)
                        || hostName.Contains(".local", StringComparison.Ordinal)))
                {
                    localhostEnvironment = true;
                }
            }

            metricsInfo["SiloMetrics"] = siloMetricsList;
            metricsInfo["ActiveSilos"] = activeSilos;
            metricsInfo["DeadSilos"] = deadSilos;
            metricsInfo["IsLocalhostEnvironment"] = localhostEnvironment;

            // Determine expected silo count based on environment
            metricsInfo["ExpectedActiveSilos"] = localhostEnvironment ? ExpectedLocalhostSilos : MinimumClusterSilos;

            // Check for health issues based on environment
            if (activeSilos == 0)
            {
                issues.Add("No active silos found");
            }
            else if (localhostEnvironment)
            {
                if (activeSilos != ExpectedLocalhostSilos)
                {
                    issues.Add($"Localhost environment should have exactly {ExpectedLocalhostSilos} active silo, found {activeSilos}");
                }

                // Dead silos in localhost are normal (from previous runs)
                if (deadSilos > 0)
                {
                    metricsInfo["DeadSilosNote"] = "Dead silos in localhost are normal (from previous runs)";
                }
            }
            else
            {
                // In sandbox/production, redundancy requires more than one active silo
                if (activeSilos < MinimumClusterSilos)
                {
                    issues.Add($"Production environment should have at least {MinimumClusterSilos} active silos for redundancy, found {activeSilos}");
                }

                // Dead silos in production are concerning
                if (deadSilos > 0)
                {
                    issues.Add($"Found {deadSilos} dead silos in production environment");
                }
            }
        }
        catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
        {
            issues.Add($"Failed to get Orleans statistics: {ex.Message}");
            metricsInfo["StatisticsError"] = ex.Message;
        }

        healthData["OrleansMetrics"] = metricsInfo;
        return (issues.Count == 0, issues);
    }
    catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
    {
        issues.Add($"Orleans metrics check failed: {ex.Message}");
        healthData["OrleansMetrics"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
|
|
|
|
/// <summary>
/// Probes grain activation by calling the management grain once and records the outcome
/// under <c>healthData["GrainActivation"]</c>.
/// </summary>
/// <param name="healthData">Shared diagnostics dictionary; its "GrainActivation" entry is set by this method.</param>
/// <param name="cancellationToken">Token that bounds the wait for the probe call.</param>
/// <returns>
/// <c>IsHealthy</c> is true when the probe call succeeded; otherwise false with an issue
/// carrying the exception message.
/// </returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="cancellationToken"/> is cancelled during the probe.</exception>
private async Task<(bool IsHealthy, List<string> Issues)> CheckGrainActivation(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var issues = new List<string>();
    var grainInfo = new Dictionary<string, object>();

    try
    {
        // Test basic grain activation
        var testGrain = _grainFactory.GetGrain<IManagementGrain>(0);

        try
        {
            // GetDetailedHosts accepts no token, so WaitAsync is what lets an
            // unresponsive cluster be abandoned instead of hanging the probe.
            var hosts = await testGrain.GetDetailedHosts().WaitAsync(cancellationToken);

            grainInfo["GrainActivationTest"] = "Success";
            grainInfo["ActivationTestResult"] = $"Successfully activated ManagementGrain and retrieved {hosts.Count()} hosts";
        }
        catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
        {
            issues.Add($"Grain activation test failed: {ex.Message}");
            grainInfo["GrainActivationTest"] = "Failed";
            grainInfo["ActivationTestError"] = ex.Message;
        }

        // Check grain factory status. The factory was already dereferenced above, so
        // this is expected to be true whenever execution reaches this line.
        grainInfo["GrainFactoryAvailable"] = _grainFactory != null;

        healthData["GrainActivation"] = grainInfo;
        return (issues.Count == 0, issues);
    }
    catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
    {
        issues.Add($"Grain activation check failed: {ex.Message}");
        healthData["GrainActivation"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
|
|
}
|
|
} |