Add orleans healthchecks
This commit is contained in:
438
src/Managing.Api/HealthChecks/OrleansHealthCheck.cs
Normal file
438
src/Managing.Api/HealthChecks/OrleansHealthCheck.cs
Normal file
@@ -0,0 +1,438 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
|
||||
namespace Managing.Api.HealthChecks
|
||||
{
|
||||
public class OrleansHealthCheck : IHealthCheck
|
||||
{
|
||||
// Grain factory through which the probes in this class obtain IManagementGrain references.
private readonly IGrainFactory _grainFactory;

// Logger used by CheckHealthAsync to record failures that escape the individual probes.
private readonly ILogger<OrleansHealthCheck> _logger;

/// <summary>
/// Creates the health check and stores both collaborators unchanged.
/// </summary>
/// <param name="grainFactory">Factory used to resolve grain references.</param>
/// <param name="logger">Logger for this health check.</param>
public OrleansHealthCheck(IGrainFactory grainFactory, ILogger<OrleansHealthCheck> logger)
    => (_grainFactory, _logger) = (grainFactory, logger);
|
||||
|
||||
/// <summary>
/// Runs the Orleans probes in a fixed order (cluster connectivity, silo status, time
/// synchronization, metrics, grain activation) and folds their outcomes into one result.
/// </summary>
/// <param name="context">Health check context supplied by the caller; not read by this method.</param>
/// <param name="cancellationToken">Token passed on to the asynchronous probes.</param>
/// <returns>
/// Healthy when every probe reports healthy. Otherwise Degraded when the unhealthy probes
/// reported at most two issues in total, and Unhealthy when they reported more than two or
/// when an exception escaped a probe.
/// </returns>
public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
{
    try
    {
        var healthData = new Dictionary<string, object>();
        var issues = new List<string>();
        var isHealthy = true;

        // Folds one probe outcome into the aggregate state; healthy outcomes contribute nothing.
        void Record((bool IsHealthy, List<string> Issues) outcome)
        {
            if (outcome.IsHealthy)
            {
                return;
            }

            isHealthy = false;
            issues.AddRange(outcome.Issues);
        }

        // Probes run one after another because they all write into the same healthData dictionary.
        Record(await CheckClusterConnectivity(healthData, cancellationToken));
        Record(await CheckSiloStatus(healthData, cancellationToken));
        Record(CheckTimeSynchronization(healthData));
        Record(await CheckOrleansMetrics(healthData, cancellationToken));
        Record(await CheckGrainActivation(healthData, cancellationToken));

        if (isHealthy)
        {
            return HealthCheckResult.Healthy("Orleans cluster is healthy", data: healthData);
        }

        var summary = string.Join(", ", issues);

        // Up to two issues is reported as Degraded; anything beyond that is Unhealthy.
        return issues.Count <= 2
            ? HealthCheckResult.Degraded($"Orleans cluster has minor issues: {summary}", data: healthData)
            : HealthCheckResult.Unhealthy($"Orleans cluster has critical issues: {summary}", data: healthData);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error checking Orleans health");

        return HealthCheckResult.Unhealthy(
            "Failed to check Orleans cluster health",
            ex,
            data: new Dictionary<string, object>
            {
                ["ErrorMessage"] = ex.Message,
                ["ErrorType"] = ex.GetType().Name,
                // Exception.StackTrace is nullable; fall back to an empty string so the payload
                // never carries a null value.
                // NOTE(review): this exposes the stack trace to whoever can read the health
                // endpoint - confirm that endpoint is not publicly reachable.
                ["StackTrace"] = ex.StackTrace ?? string.Empty
            });
    }
}
|
||||
|
||||
/// <summary>
/// Asks the Orleans management grain for the detailed host list and stores a per-silo
/// summary under healthData["ClusterConnectivity"].
/// </summary>
/// <param name="healthData">Shared result dictionary; this method writes its "ClusterConnectivity" entry.</param>
/// <param name="cancellationToken">Part of the probe signature; not used by this method.</param>
/// <returns>
/// The recorded issues, with IsHealthy true exactly when none were recorded. An issue is
/// recorded when the host list is empty, when no host is Active, or when the query throws.
/// </returns>
private async Task<(bool IsHealthy, List<string> Issues)> CheckClusterConnectivity(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var problems = new List<string>();
    var summary = new Dictionary<string, object>();

    try
    {
        try
        {
            var hosts = await _grainFactory.GetGrain<IManagementGrain>(0).GetDetailedHosts();
            var totalHosts = hosts.Count();

            // Despite the key name, this is the number of ALL returned hosts, not only active ones.
            summary["ActiveSilos"] = totalHosts;

            // First pass: one descriptive entry per silo.
            var entries = new List<object>();
            foreach (var host in hosts)
            {
                entries.Add(new Dictionary<string, object>
                {
                    ["SiloName"] = host.SiloName,
                    ["Status"] = host.Status.ToString(),
                    ["SiloAddress"] = host.SiloAddress.ToString(),
                    ["HostName"] = host.HostName,
                    ["ProxyPort"] = host.ProxyPort,
                    ["StartTime"] = host.StartTime != default
                        ? host.StartTime.ToString("yyyy-MM-dd HH:mm:ss UTC")
                        : "Unknown"
                });
            }

            summary["MembershipTable"] = entries;

            // Second pass: count Active silos and flag host names that look local.
            var activeCount = 0;
            var looksLocal = false;
            foreach (var host in hosts)
            {
                if (host.Status == SiloStatus.Active)
                {
                    activeCount += 1;
                }

                looksLocal |= host.HostName.Contains("localhost") || host.HostName.Contains(".local");
            }

            summary["ActiveSilosCount"] = activeCount;
            summary["IsLocalhostEnvironment"] = looksLocal;

            if (totalHosts == 0)
            {
                problems.Add("No silos found in cluster");
            }
            else if (activeCount == 0)
            {
                problems.Add("No active silos found in cluster");
            }
        }
        catch (Exception ex)
        {
            // A failed membership query is reported as an issue; the partial summary is still published.
            problems.Add($"Failed to get cluster membership: {ex.Message}");
            summary["MembershipError"] = ex.Message;
        }

        healthData["ClusterConnectivity"] = summary;
        return (problems.Count == 0, problems);
    }
    catch (Exception ex)
    {
        problems.Add($"Cluster connectivity check failed: {ex.Message}");
        healthData["ClusterConnectivity"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, problems);
    }
}
|
||||
|
||||
/// <summary>
/// Reports the silo configuration taken from environment variables (SILO_ROLE, TASK_SLOT,
/// DISABLE_ORLEANS_CLUSTERING, RUN_ORLEANS_GRAINS) under healthData["SiloStatus"].
/// </summary>
/// <param name="healthData">Shared result dictionary; this method writes its "SiloStatus" entry.</param>
/// <param name="cancellationToken">Part of the probe signature; not used by this method.</param>
/// <returns>
/// A completed task holding the recorded issues, with IsHealthy true exactly when none were
/// recorded. Issues are recorded when clustering is disabled, when grains are disabled, or
/// when one of the two boolean variables holds a value that is not a valid boolean.
/// </returns>
private static Task<(bool IsHealthy, List<string> Issues)> CheckSiloStatus(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var issues = new List<string>();
    var siloInfo = new Dictionary<string, object>();

    // Reads an optional boolean switch. Unset or empty yields the fallback. A malformed value
    // is recorded as an issue (so the probe stays unhealthy) and also yields the fallback,
    // instead of throwing and discarding the data collected so far.
    bool ReadFlag(string variable, bool fallback)
    {
        var raw = Environment.GetEnvironmentVariable(variable);
        if (string.IsNullOrEmpty(raw))
        {
            return fallback;
        }

        if (bool.TryParse(raw, out var parsed))
        {
            return parsed;
        }

        issues.Add($"Environment variable {variable} has an invalid boolean value '{raw}'");
        return fallback;
    }

    try
    {
        siloInfo["SiloRole"] = Environment.GetEnvironmentVariable("SILO_ROLE") ?? "Unknown";
        siloInfo["TaskSlot"] = Environment.GetEnvironmentVariable("TASK_SLOT") ?? "1";

        // Clustering counts as enabled unless the variable is explicitly set to true.
        var clusteringDisabled = ReadFlag("DISABLE_ORLEANS_CLUSTERING", fallback: false);
        siloInfo["ClusteringDisabled"] = clusteringDisabled;
        if (clusteringDisabled)
        {
            issues.Add("Orleans clustering is disabled - running in localhost mode");
        }

        // Grains count as enabled unless the variable is explicitly set to false.
        var grainsEnabled = ReadFlag("RUN_ORLEANS_GRAINS", fallback: true);
        siloInfo["GrainsEnabled"] = grainsEnabled;
        if (!grainsEnabled)
        {
            issues.Add("Orleans grains are disabled");
        }

        healthData["SiloStatus"] = siloInfo;
        return Task.FromResult<(bool IsHealthy, List<string> Issues)>((issues.Count == 0, issues));
    }
    catch (Exception ex)
    {
        issues.Add($"Silo status check failed: {ex.Message}");
        healthData["SiloStatus"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return Task.FromResult<(bool IsHealthy, List<string> Issues)>((false, issues));
    }
}
|
||||
|
||||
/// <summary>
/// Records clock, time zone and process-uptime information under
/// healthData["TimeSynchronization"]. Despite its name, this probe compares no clocks: it
/// reports healthy unless gathering the information throws.
/// </summary>
/// <param name="healthData">Shared result dictionary; this method writes its "TimeSynchronization" entry.</param>
/// <returns>
/// (true, empty list) on success; (false, one issue) when reading the time or process
/// information throws.
/// </returns>
private static (bool IsHealthy, List<string> Issues) CheckTimeSynchronization(Dictionary<string, object> healthData)
{
    var issues = new List<string>();
    var timeInfo = new Dictionary<string, object>();

    try
    {
        var utcNow = DateTime.UtcNow;
        var localNow = DateTime.Now;
        var timeZone = TimeZoneInfo.Local;

        timeInfo["UtcTime"] = utcNow.ToString("yyyy-MM-dd HH:mm:ss UTC");
        timeInfo["LocalTime"] = localNow.ToString("yyyy-MM-dd HH:mm:ss");
        timeInfo["TimeZone"] = timeZone.DisplayName;
        timeInfo["TimeZoneId"] = timeZone.Id;
        timeInfo["UtcOffset"] = timeZone.GetUtcOffset(utcNow).ToString();

        // Informational only: a UTC/GMT zone id adds a note but never an issue.
        if (timeZone.Id == "UTC" || timeZone.Id == "GMT")
        {
            timeInfo["TimezoneWarning"] = "Using UTC/GMT timezone - ensure this is intentional";
        }

        // Process.StartTime is expressed in local time. Convert it to UTC before subtracting
        // from utcNow, otherwise the uptime is off by the machine's UTC offset and the "UTC"
        // label on the formatted start time is wrong. Process is disposable, hence the using.
        using var currentProcess = Process.GetCurrentProcess();
        var processStartUtc = currentProcess.StartTime.ToUniversalTime();
        var uptime = utcNow - processStartUtc;

        timeInfo["ProcessUptime"] = uptime.ToString(@"dd\.hh\:mm\:ss");
        timeInfo["ProcessStartTime"] = processStartUtc.ToString("yyyy-MM-dd HH:mm:ss UTC");

        healthData["TimeSynchronization"] = timeInfo;
        return (true, issues);
    }
    catch (Exception ex)
    {
        issues.Add($"Time synchronization check failed: {ex.Message}");
        healthData["TimeSynchronization"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, issues);
    }
}
|
||||
|
||||
/// <summary>
/// Collects per-silo details and Active/Dead counts from the management grain, stores them
/// under healthData["OrleansMetrics"], and checks the counts against the expected number of
/// active silos (1 when any host name looks local, otherwise 2).
/// </summary>
/// <param name="healthData">Shared result dictionary; this method writes its "OrleansMetrics" entry.</param>
/// <param name="cancellationToken">Part of the probe signature; not used by this method.</param>
/// <returns>
/// The recorded issues, with IsHealthy true exactly when none were recorded.
/// </returns>
private async Task<(bool IsHealthy, List<string> Issues)> CheckOrleansMetrics(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var problems = new List<string>();
    var metrics = new Dictionary<string, object>();

    try
    {
        var management = _grainFactory.GetGrain<IManagementGrain>(0);

        try
        {
            var hosts = await management.GetDetailedHosts();
            metrics["SiloCount"] = hosts.Count();

            var perSilo = new List<object>();
            var active = 0;
            var dead = 0;
            var looksLocal = false;

            foreach (var host in hosts)
            {
                perSilo.Add(new Dictionary<string, object>
                {
                    ["SiloName"] = host.SiloName,
                    ["Status"] = host.Status.ToString(),
                    ["SiloAddress"] = host.SiloAddress.ToString(),
                    ["HostName"] = host.HostName,
                    ["ProxyPort"] = host.ProxyPort,
                    ["StartTime"] = host.StartTime != default
                        ? host.StartTime.ToString("yyyy-MM-dd HH:mm:ss UTC")
                        : "Unknown"
                });

                // Only Active and Dead are tallied; every other status is ignored.
                switch (host.Status)
                {
                    case SiloStatus.Active:
                        active += 1;
                        break;
                    case SiloStatus.Dead:
                        dead += 1;
                        break;
                }

                // A single host name containing "localhost" or ".local" marks the whole
                // cluster as a localhost environment.
                looksLocal |= host.HostName.Contains("localhost") || host.HostName.Contains(".local");
            }

            metrics["SiloMetrics"] = perSilo;
            metrics["ActiveSilos"] = active;
            metrics["DeadSilos"] = dead;
            metrics["IsLocalhostEnvironment"] = looksLocal;

            // Localhost expects a single active silo; any other environment expects two.
            var expectedActive = looksLocal ? 1 : 2;
            metrics["ExpectedActiveSilos"] = expectedActive;

            if (active == 0)
            {
                problems.Add("No active silos found");
            }
            else if (looksLocal)
            {
                if (active != expectedActive)
                {
                    problems.Add($"Localhost environment should have exactly 1 active silo, found {active}");
                }

                // Dead silos are only annotated here, never raised as an issue.
                if (dead > 0)
                {
                    metrics["DeadSilosNote"] = "Dead silos in localhost are normal (from previous runs)";
                }
            }
            else
            {
                if (active < expectedActive)
                {
                    problems.Add($"Production environment should have at least 2 active silos for redundancy, found {active}");
                }

                if (dead > 0)
                {
                    problems.Add($"Found {dead} dead silos in production environment");
                }
            }
        }
        catch (Exception ex)
        {
            // A failed statistics query is reported as an issue; whatever was gathered is still published.
            problems.Add($"Failed to get Orleans statistics: {ex.Message}");
            metrics["StatisticsError"] = ex.Message;
        }

        healthData["OrleansMetrics"] = metrics;
        return (problems.Count == 0, problems);
    }
    catch (Exception ex)
    {
        problems.Add($"Orleans metrics check failed: {ex.Message}");
        healthData["OrleansMetrics"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, problems);
    }
}
|
||||
|
||||
/// <summary>
/// Verifies that a grain call succeeds by invoking GetDetailedHosts on the management
/// grain, and stores the outcome under healthData["GrainActivation"].
/// </summary>
/// <param name="healthData">Shared result dictionary; this method writes its "GrainActivation" entry.</param>
/// <param name="cancellationToken">Part of the probe signature; not used by this method.</param>
/// <returns>
/// The recorded issues, with IsHealthy true exactly when none were recorded. An issue is
/// recorded when resolving or calling the grain throws.
/// </returns>
private async Task<(bool IsHealthy, List<string> Issues)> CheckGrainActivation(Dictionary<string, object> healthData, CancellationToken cancellationToken)
{
    var problems = new List<string>();
    var activation = new Dictionary<string, object>();

    try
    {
        var probeGrain = _grainFactory.GetGrain<IManagementGrain>(0);

        try
        {
            // Any successful round trip is enough; the host count only enriches the message.
            var hostCount = (await probeGrain.GetDetailedHosts()).Count();
            activation["GrainActivationTest"] = "Success";
            activation["ActivationTestResult"] = $"Successfully activated ManagementGrain and retrieved {hostCount} hosts";
        }
        catch (Exception ex)
        {
            problems.Add($"Grain activation test failed: {ex.Message}");
            activation["GrainActivationTest"] = "Failed";
            activation["ActivationTestError"] = ex.Message;
        }

        // The factory was already dereferenced above, so this is true whenever it is reached.
        activation["GrainFactoryAvailable"] = _grainFactory != null;

        healthData["GrainActivation"] = activation;
        return (problems.Count == 0, problems);
    }
    catch (Exception ex)
    {
        problems.Add($"Grain activation check failed: {ex.Message}");
        healthData["GrainActivation"] = new Dictionary<string, object>
        {
            ["Error"] = ex.Message,
            ["ErrorType"] = ex.GetType().Name
        };
        return (false, problems);
    }
}
|
||||
}
|
||||
}
|
||||
@@ -124,7 +124,8 @@ builder.Services.AddHealthChecks()
|
||||
.AddUrlGroup(new Uri($"{influxUrl}/health"), name: "influxdb", tags: ["database"])
|
||||
.AddCheck<Web3ProxyHealthCheck>("web3proxy", tags: ["api", "external"])
|
||||
.AddCheck<CandleDataHealthCheck>("candle-data", tags: ["database", "candles"])
|
||||
.AddCheck<GmxConnectivityHealthCheck>("gmx-connectivity", tags: ["api", "external"]);
|
||||
.AddCheck<GmxConnectivityHealthCheck>("gmx-connectivity", tags: ["api", "external"])
|
||||
.AddCheck<OrleansHealthCheck>("orleans-cluster", tags: ["orleans", "cluster"]);
|
||||
|
||||
builder.Host.UseSerilog((hostBuilder, loggerConfiguration) =>
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user