Add Orleans health checks

This commit is contained in:
2025-10-03 12:46:44 +07:00
parent 44fd3c6919
commit b970090492
2 changed files with 440 additions and 1 deletions

View File

@@ -0,0 +1,438 @@
using System.Diagnostics;
using Microsoft.Extensions.Diagnostics.HealthChecks;
namespace Managing.Api.HealthChecks
{
public class OrleansHealthCheck : IHealthCheck
{
// Grain factory used by the sub-checks to obtain the management grain.
private readonly IGrainFactory _grainFactory;

// Logger used when a sub-check fails in a way the check itself did not handle.
private readonly ILogger<OrleansHealthCheck> _logger;

/// <summary>
/// Creates the health check from its two injected collaborators.
/// </summary>
/// <param name="grainFactory">Factory used to resolve grains when probing the cluster.</param>
/// <param name="logger">Logger that receives unexpected health-check failures.</param>
public OrleansHealthCheck(IGrainFactory grainFactory, ILogger<OrleansHealthCheck> logger)
    => (_grainFactory, _logger) = (grainFactory, logger);
/// <summary>
/// Runs the cluster-connectivity, silo-status, time, metrics and grain-activation sub-checks
/// and folds their outcomes into a single <see cref="HealthCheckResult"/>.
/// </summary>
/// <param name="context">Health check context supplied by the caller; it is not read here.</param>
/// <param name="cancellationToken">
/// Token observed before each sub-check. When it is cancelled the resulting
/// <see cref="OperationCanceledException"/> is propagated instead of being reported as a failure.
/// </param>
/// <returns>
/// Healthy when no sub-check reports an issue, Degraded when at most two issues were collected,
/// and Unhealthy otherwise or when an unexpected exception escapes a sub-check.
/// </returns>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="cancellationToken"/> has been cancelled.
/// </exception>
public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
{
    // Severity is decided purely by how many issues were collected; issues carry no weight of their own.
    const int MaxIssuesForDegraded = 2;

    try
    {
        var healthData = new Dictionary<string, object>();
        var issues = new List<string>();
        var isHealthy = true;

        // Folds one sub-check outcome into the aggregate state.
        void Record((bool IsHealthy, List<string> Issues) outcome)
        {
            if (outcome.IsHealthy)
            {
                return;
            }

            isHealthy = false;
            issues.AddRange(outcome.Issues);
        }

        // Check cluster connectivity
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckClusterConnectivity(healthData, cancellationToken));

        // Check silo status and roles
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckSiloStatus(healthData, cancellationToken));

        // Check time synchronization
        cancellationToken.ThrowIfCancellationRequested();
        Record(CheckTimeSynchronization(healthData));

        // Check Orleans metrics
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckOrleansMetrics(healthData, cancellationToken));

        // Check grain activation
        cancellationToken.ThrowIfCancellationRequested();
        Record(await CheckGrainActivation(healthData, cancellationToken));

        // Determine overall health status
        if (isHealthy)
        {
            return HealthCheckResult.Healthy("Orleans cluster is healthy", data: healthData);
        }

        var joinedIssues = string.Join(", ", issues);
        if (issues.Count <= MaxIssuesForDegraded)
        {
            return HealthCheckResult.Degraded($"Orleans cluster has minor issues: {joinedIssues}", data: healthData);
        }

        return HealthCheckResult.Unhealthy($"Orleans cluster has critical issues: {joinedIssues}", data: healthData);
    }
    // Let a requested cancellation bubble up; everything else is reported as an unhealthy result.
    catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
    {
        _logger.LogError(ex, "Error checking Orleans health");
        return HealthCheckResult.Unhealthy(
            "Failed to check Orleans cluster health",
            ex,
            data: new Dictionary<string, object>
            {
                ["ErrorMessage"] = ex.Message,
                ["ErrorType"] = ex.GetType().Name,
                // StackTrace is null for exceptions that were never thrown; never store null in the payload.
                ["StackTrace"] = ex.StackTrace ?? string.Empty
            });
    }
}
/// <summary>
/// Queries the management grain for the detailed host list and records per-silo details
/// plus aggregate counts under the <c>"ClusterConnectivity"</c> key of <paramref name="healthData"/>.
/// </summary>
/// <param name="healthData">Shared payload dictionary that receives this check's section.</param>
/// <param name="cancellationToken">Accepted for signature symmetry; the grain call used here takes no token.</param>
/// <returns>
/// <c>IsHealthy</c> is true when no issue was recorded. Issues are raised when the host list is empty,
/// when none of the listed silos is Active, or when the membership query throws.
/// </returns>
private async Task<(bool IsHealthy, List<string> Issues)> CheckClusterConnectivity(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var issues = new List<string>();
    var clusterInfo = new Dictionary<string, object>();

    try
    {
        // Check cluster membership
        var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);
        var membershipTable = await managementGrain.GetDetailedHosts();

        var membershipList = new List<object>(membershipTable.Length);
        var activeSilos = 0;
        var localhostEnvironment = false;

        // Single pass: build the per-silo payload and the aggregate counters together.
        foreach (var silo in membershipTable)
        {
            membershipList.Add(new Dictionary<string, object>
            {
                ["SiloName"] = silo.SiloName,
                ["Status"] = silo.Status.ToString(),
                ["SiloAddress"] = silo.SiloAddress.ToString(),
                ["HostName"] = silo.HostName,
                ["ProxyPort"] = silo.ProxyPort,
                // Invariant culture keeps the timestamp stable regardless of the host's regional settings.
                ["StartTime"] = silo.StartTime != default
                    ? silo.StartTime.ToString("yyyy-MM-dd HH:mm:ss UTC", System.Globalization.CultureInfo.InvariantCulture)
                    : "Unknown"
            });

            if (silo.Status == SiloStatus.Active)
            {
                activeSilos++;
            }

            // A missing host name must not abort the whole check.
            var hostName = silo.HostName ?? string.Empty;
            if (hostName.Contains("localhost", StringComparison.Ordinal) || hostName.Contains(".local", StringComparison.Ordinal))
            {
                localhostEnvironment = true;
            }
        }

        // "ActiveSilos" previously carried the total row count, which also includes non-active entries;
        // the total now has its own key and "ActiveSilos" reports what its name says.
        clusterInfo["TotalSilos"] = membershipTable.Length;
        clusterInfo["ActiveSilos"] = activeSilos;
        clusterInfo["MembershipTable"] = membershipList;
        clusterInfo["ActiveSilosCount"] = activeSilos;
        clusterInfo["IsLocalhostEnvironment"] = localhostEnvironment;

        if (membershipTable.Length == 0)
        {
            issues.Add("No silos found in cluster");
        }
        else if (activeSilos == 0)
        {
            issues.Add("No active silos found in cluster");
        }
    }
    catch (Exception ex)
    {
        issues.Add($"Failed to get cluster membership: {ex.Message}");
        clusterInfo["MembershipError"] = ex.Message;
    }

    healthData["ClusterConnectivity"] = clusterInfo;
    return (issues.Count == 0, issues);
}
/// <summary>
/// Reports the silo configuration read from the <c>SILO_ROLE</c>, <c>TASK_SLOT</c>,
/// <c>DISABLE_ORLEANS_CLUSTERING</c> and <c>RUN_ORLEANS_GRAINS</c> environment variables under the
/// <c>"SiloStatus"</c> key of <paramref name="healthData"/>.
/// </summary>
/// <param name="healthData">Shared payload dictionary that receives this check's section.</param>
/// <param name="cancellationToken">Accepted for signature symmetry; no cancellable work is done here.</param>
/// <returns>
/// A completed task. <c>IsHealthy</c> is true when no issue was recorded. Issues are raised when
/// clustering is disabled, when grains are disabled, or when a boolean variable holds an unparsable value.
/// </returns>
private Task<(bool IsHealthy, List<string> Issues)> CheckSiloStatus(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var issues = new List<string>();
    var siloInfo = new Dictionary<string, object>();

    try
    {
        // Check silo roles and configuration
        siloInfo["SiloRole"] = Environment.GetEnvironmentVariable("SILO_ROLE") ?? "Unknown";
        siloInfo["TaskSlot"] = Environment.GetEnvironmentVariable("TASK_SLOT") ?? "1";

        // Check if clustering is disabled (unset or empty means clustering stays enabled)
        var clusteringDisabled = ReadBooleanVariable("DISABLE_ORLEANS_CLUSTERING", defaultValue: false, issues);
        siloInfo["ClusteringDisabled"] = clusteringDisabled;
        if (clusteringDisabled)
        {
            issues.Add("Orleans clustering is disabled - running in localhost mode");
        }

        // Check Orleans grains configuration (unset or empty means grains are enabled)
        var grainsEnabled = ReadBooleanVariable("RUN_ORLEANS_GRAINS", defaultValue: true, issues);
        siloInfo["GrainsEnabled"] = grainsEnabled;
        if (!grainsEnabled)
        {
            issues.Add("Orleans grains are disabled");
        }

        healthData["SiloStatus"] = siloInfo;
        return Task.FromResult<(bool IsHealthy, List<string> Issues)>((issues.Count == 0, issues));
    }
    catch (Exception ex)
    {
        issues.Add($"Silo status check failed: {ex.Message}");
        healthData["SiloStatus"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return Task.FromResult<(bool IsHealthy, List<string> Issues)>((false, issues));
    }
}

// Reads a boolean environment variable. An unset or empty value yields the default; an unparsable
// value also yields the default but is recorded as an issue that names the offending variable.
private static bool ReadBooleanVariable(string variableName, bool defaultValue, List<string> issues)
{
    var rawValue = Environment.GetEnvironmentVariable(variableName);
    if (string.IsNullOrEmpty(rawValue))
    {
        return defaultValue;
    }

    if (bool.TryParse(rawValue, out var parsedValue))
    {
        return parsedValue;
    }

    issues.Add($"Environment variable {variableName} has a non-boolean value '{rawValue}'");
    return defaultValue;
}
/// <summary>
/// Records the current UTC and local clock readings, the local time zone, and the process start
/// time and uptime under the <c>"TimeSynchronization"</c> key of <paramref name="healthData"/>.
/// </summary>
/// <remarks>
/// This check is informational: it compares nothing against an external clock and only reports
/// an issue when gathering the data throws.
/// </remarks>
/// <param name="healthData">Shared payload dictionary that receives this check's section.</param>
/// <returns><c>IsHealthy</c> is true unless an exception occurred while collecting the data.</returns>
private (bool IsHealthy, List<string> Issues) CheckTimeSynchronization(Dictionary<string, object> healthData)
{
    var issues = new List<string>();
    var timeInfo = new Dictionary<string, object>();
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    try
    {
        var utcNow = DateTime.UtcNow;
        var localNow = DateTime.Now;
        var timeZone = TimeZoneInfo.Local;

        timeInfo["UtcTime"] = utcNow.ToString("yyyy-MM-dd HH:mm:ss UTC", invariant);
        timeInfo["LocalTime"] = localNow.ToString("yyyy-MM-dd HH:mm:ss", invariant);
        timeInfo["TimeZone"] = timeZone.DisplayName;
        timeInfo["TimeZoneId"] = timeZone.Id;
        timeInfo["UtcOffset"] = timeZone.GetUtcOffset(utcNow).ToString();

        // Check if timezone is properly configured
        if (timeZone.Id == "UTC" || timeZone.Id == "GMT")
        {
            timeInfo["TimezoneWarning"] = "Using UTC/GMT timezone - ensure this is intentional";
        }

        // Process.StartTime is expressed in local time. Convert it before subtracting from the UTC
        // clock, otherwise the uptime is off by the machine's UTC offset and the "UTC" label is wrong.
        using var currentProcess = Process.GetCurrentProcess();
        var processStartTimeUtc = currentProcess.StartTime.ToUniversalTime();
        var uptime = utcNow - processStartTimeUtc;

        timeInfo["ProcessUptime"] = uptime.ToString(@"dd\.hh\:mm\:ss");
        timeInfo["ProcessStartTime"] = processStartTimeUtc.ToString("yyyy-MM-dd HH:mm:ss UTC", invariant);

        healthData["TimeSynchronization"] = timeInfo;
        return (true, issues);
    }
    catch (Exception ex)
    {
        issues.Add($"Time synchronization check failed: {ex.Message}");
        healthData["TimeSynchronization"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
/// <summary>
/// Queries the management grain for the detailed host list, counts Active and Dead silos, and
/// compares the counts with environment-dependent expectations. Results are stored under the
/// <c>"OrleansMetrics"</c> key of <paramref name="healthData"/>.
/// </summary>
/// <remarks>
/// The environment is treated as "localhost" as soon as any listed silo has a host name containing
/// <c>localhost</c> or <c>.local</c>. Localhost expects exactly one active silo and tolerates dead
/// entries; any other environment expects at least two active silos and reports dead entries.
/// </remarks>
/// <param name="healthData">Shared payload dictionary that receives this check's section.</param>
/// <param name="cancellationToken">Accepted for signature symmetry; the grain call used here takes no token.</param>
/// <returns><c>IsHealthy</c> is true when no issue was recorded.</returns>
private async Task<(bool IsHealthy, List<string> Issues)> CheckOrleansMetrics(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    // Expected active silo counts per environment.
    const int LocalhostActiveSilos = 1;
    const int ClusteredMinimumActiveSilos = 2;

    var issues = new List<string>();
    var metricsInfo = new Dictionary<string, object>();

    try
    {
        // Get Orleans statistics
        var managementGrain = _grainFactory.GetGrain<IManagementGrain>(0);

        try
        {
            var siloStatistics = await managementGrain.GetDetailedHosts();
            metricsInfo["SiloCount"] = siloStatistics.Length;

            // Get basic silo information
            var siloMetricsList = new List<object>(siloStatistics.Length);
            var activeSilos = 0;
            var deadSilos = 0;
            var localhostEnvironment = false;

            foreach (var silo in siloStatistics)
            {
                siloMetricsList.Add(new Dictionary<string, object>
                {
                    ["SiloName"] = silo.SiloName,
                    ["Status"] = silo.Status.ToString(),
                    ["SiloAddress"] = silo.SiloAddress.ToString(),
                    ["HostName"] = silo.HostName,
                    ["ProxyPort"] = silo.ProxyPort,
                    // Invariant culture keeps the timestamp stable regardless of the host's regional settings.
                    ["StartTime"] = silo.StartTime != default
                        ? silo.StartTime.ToString("yyyy-MM-dd HH:mm:ss UTC", System.Globalization.CultureInfo.InvariantCulture)
                        : "Unknown"
                });

                // Count silo statuses
                if (silo.Status == SiloStatus.Active)
                {
                    activeSilos++;
                }
                else if (silo.Status == SiloStatus.Dead)
                {
                    deadSilos++;
                }

                // Detect localhost environment: one matching host name on any entry is enough.
                // A missing host name must not abort the whole check.
                var hostName = silo.HostName ?? string.Empty;
                if (hostName.Contains("localhost", StringComparison.Ordinal) || hostName.Contains(".local", StringComparison.Ordinal))
                {
                    localhostEnvironment = true;
                }
            }

            metricsInfo["SiloMetrics"] = siloMetricsList;
            metricsInfo["ActiveSilos"] = activeSilos;
            metricsInfo["DeadSilos"] = deadSilos;
            metricsInfo["IsLocalhostEnvironment"] = localhostEnvironment;

            // Determine expected silo count based on environment
            var expectedActiveSilos = localhostEnvironment ? LocalhostActiveSilos : ClusteredMinimumActiveSilos;
            metricsInfo["ExpectedActiveSilos"] = expectedActiveSilos;

            // Check for health issues based on environment
            if (activeSilos == 0)
            {
                issues.Add("No active silos found");
            }
            else if (localhostEnvironment)
            {
                // In localhost, we expect exactly 1 active silo
                if (activeSilos != expectedActiveSilos)
                {
                    issues.Add($"Localhost environment should have exactly 1 active silo, found {activeSilos}");
                }

                // Dead silos in localhost are normal (from previous runs)
                if (deadSilos > 0)
                {
                    metricsInfo["DeadSilosNote"] = "Dead silos in localhost are normal (from previous runs)";
                }
            }
            else
            {
                // In sandbox/production, we expect at least 2 active silos for redundancy
                if (activeSilos < expectedActiveSilos)
                {
                    issues.Add($"Production environment should have at least 2 active silos for redundancy, found {activeSilos}");
                }

                // NOTE(review): dead entries may also linger after routine restarts or deployments;
                // confirm that flagging every dead entry here does not cause false alarms.
                if (deadSilos > 0)
                {
                    issues.Add($"Found {deadSilos} dead silos in production environment");
                }
            }
        }
        catch (Exception ex)
        {
            issues.Add($"Failed to get Orleans statistics: {ex.Message}");
            metricsInfo["StatisticsError"] = ex.Message;
        }

        healthData["OrleansMetrics"] = metricsInfo;
        return (issues.Count == 0, issues);
    }
    catch (Exception ex)
    {
        issues.Add($"Orleans metrics check failed: {ex.Message}");
        healthData["OrleansMetrics"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
/// <summary>
/// Probes grain activation by resolving the management grain and asking it for the detailed host
/// list, then writes the outcome under the <c>"GrainActivation"</c> key of <paramref name="healthData"/>.
/// </summary>
/// <param name="healthData">Shared payload dictionary that receives this check's section.</param>
/// <param name="cancellationToken">Not used by this check.</param>
/// <returns><c>IsHealthy</c> is true when the probe call completed without throwing.</returns>
private async Task<(bool IsHealthy, List<string> Issues)> CheckGrainActivation(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var problems = new List<string>();
    var activationReport = new Dictionary<string, object>();

    try
    {
        // The management grain doubles as the activation probe.
        var probeGrain = _grainFactory.GetGrain<IManagementGrain>(0);

        try
        {
            // Any successful round trip proves the grain could be activated and called.
            var detailedHosts = await probeGrain.GetDetailedHosts();
            activationReport["GrainActivationTest"] = "Success";
            activationReport["ActivationTestResult"] =
                "Successfully activated ManagementGrain and retrieved " + detailedHosts.Count() + " hosts";
        }
        catch (Exception probeFailure)
        {
            var reason = probeFailure.Message;
            problems.Add("Grain activation test failed: " + reason);
            activationReport["GrainActivationTest"] = "Failed";
            activationReport["ActivationTestError"] = reason;
        }

        // Record whether a grain factory instance is present.
        var factoryPresent = _grainFactory is not null;
        activationReport["GrainFactoryAvailable"] = factoryPresent;

        healthData["GrainActivation"] = activationReport;

        var noProblems = problems.Count == 0;
        return (noProblems, problems);
    }
    catch (Exception unexpected)
    {
        problems.Add("Grain activation check failed: " + unexpected.Message);

        var errorDetails = new Dictionary<string, object>();
        errorDetails["Error"] = unexpected.Message;
        errorDetails["ErrorType"] = unexpected.GetType().Name;
        healthData["GrainActivation"] = errorDetails;

        return (false, problems);
    }
}
}
}

View File

@@ -124,7 +124,8 @@ builder.Services.AddHealthChecks()
.AddUrlGroup(new Uri($"{influxUrl}/health"), name: "influxdb", tags: ["database"])
.AddCheck<Web3ProxyHealthCheck>("web3proxy", tags: ["api", "external"])
.AddCheck<CandleDataHealthCheck>("candle-data", tags: ["database", "candles"])
.AddCheck<GmxConnectivityHealthCheck>("gmx-connectivity", tags: ["api", "external"]);
.AddCheck<GmxConnectivityHealthCheck>("gmx-connectivity", tags: ["api", "external"])
.AddCheck<OrleansHealthCheck>("orleans-cluster", tags: ["orleans", "cluster"]);
builder.Host.UseSerilog((hostBuilder, loggerConfiguration) =>
{