From 949044c73dc6e916fa54029387d56fa6c2228423 Mon Sep 17 00:00:00 2001
From: cryptooda
Date: Tue, 6 Jan 2026 19:43:11 +0700
Subject: [PATCH] Enhance LlmController and GeminiProvider for improved rate limit handling

- Increased delay between iterations in LlmController from 500ms to 2000ms to better respect rate limits.
- Added retry logic in LlmController for handling rate limit errors (HTTP 429) with a 10-second wait before retrying.
- Introduced additional delay after tool calls in LlmController to further mitigate rate limit issues.
- Updated GeminiProvider to increase maximum retry attempts from 3 to 5 and base retry delay from 2s to 3s for better handling of rate limits.
- Enhanced logging for rate limit scenarios to provide clearer feedback during API interactions.
---
 src/Managing.Api/Controllers/LlmController.cs | 74 +++++++++++++++++--
 .../LLM/Providers/GeminiProvider.cs | 31 +++++---
 2 files changed, 90 insertions(+), 15 deletions(-)

diff --git a/src/Managing.Api/Controllers/LlmController.cs b/src/Managing.Api/Controllers/LlmController.cs
index 4011acfc..d312ab8b 100644
--- a/src/Managing.Api/Controllers/LlmController.cs
+++ b/src/Managing.Api/Controllers/LlmController.cs
@@ -1,3 +1,4 @@
+using System.Net.Http;
 using System.Text.Json;
 using System.Text.RegularExpressions;
 using Managing.Application.Abstractions.Services;
@@ -215,7 +216,8 @@ public class LlmController : BaseController
         int maxIterations = DetermineMaxIterations(chatRequest);
         int iteration = 0;
         LlmChatResponse? finalResponse = null;
-        const int DelayBetweenIterationsMs = 500;
+        const int DelayBetweenIterationsMs = 2000; // Increased from 500ms to 2s to respect rate limits
+        const int DelayAfterToolCallsMs = 1000; // Additional delay after tool calls before next LLM call
 
         await SendProgressUpdate(connectionId, hubContext, logger, new LlmProgressUpdate
         {
@@ -254,7 +256,7 @@ public class LlmController : BaseController
             // Trim context if conversation is getting too long
             TrimConversationContext(chatRequest);
 
-            // Send chat request to LLM
+            // Send chat request to LLM with retry logic for rate limits
             await SendProgressUpdate(connectionId, hubContext, logger, new LlmProgressUpdate
             {
                 Type = "thinking",
@@ -263,7 +265,37 @@ public class LlmController : BaseController
                 MaxIterations = maxIterations
             });
 
-            var response = await llmService.ChatAsync(user, chatRequest);
+            LlmChatResponse response;
+            try
+            {
+                response = await llmService.ChatAsync(user, chatRequest);
+            }
+            catch (HttpRequestException httpEx) when (httpEx.Message.Contains("429") || httpEx.Message.Contains("TooManyRequests") || httpEx.Message.Contains("RESOURCE_EXHAUSTED"))
+            {
+                // Rate limit hit - wait longer before retrying
+                logger.LogWarning("Rate limit hit (429) in iteration {Iteration}. Waiting 10 seconds before retry...", iteration);
+                await SendProgressUpdate(connectionId, hubContext, logger, new LlmProgressUpdate
+                {
+                    Type = "thinking",
+                    Message = "Rate limit reached. Waiting before retrying...",
+                    Iteration = iteration,
+                    MaxIterations = maxIterations
+                });
+
+                // Wait 10 seconds for rate limit to reset
+                await Task.Delay(10000);
+
+                // Retry once
+                try
+                {
+                    response = await llmService.ChatAsync(user, chatRequest);
+                }
+                catch (Exception retryEx)
+                {
+                    logger.LogError(retryEx, "Retry after rate limit also failed in iteration {Iteration}", iteration);
+                    throw new HttpRequestException($"Rate limit error persists after retry: {retryEx.Message}", retryEx);
+                }
+            }
 
             // If LLM doesn't want to call tools, we have our final answer
             if (!response.RequiresToolExecution || response.ToolCalls == null || !response.ToolCalls.Any())
@@ -375,6 +407,9 @@ public class LlmController : BaseController
             // Add tool results to conversation history
             chatRequest.Messages.AddRange(toolResults);
 
+            // Add delay after tool calls before next LLM call to avoid rate limits
+            await Task.Delay(DelayAfterToolCallsMs);
+
             // Continue loop to get LLM's response to the tool results
         }
 
@@ -466,7 +501,8 @@ public class LlmController : BaseController
         int maxIterations = DetermineMaxIterations(request);
         int iteration = 0;
         LlmChatResponse? finalResponse = null;
-        const int DelayBetweenIterationsMs = 500; // 500ms delay between iterations to avoid rate limits
+        const int DelayBetweenIterationsMs = 2000; // Increased from 500ms to 2s to respect rate limits
+        const int DelayAfterToolCallsMs = 1000; // Additional delay after tool calls before next LLM call
 
         while (iteration < maxIterations)
         {
@@ -486,8 +522,31 @@ public class LlmController : BaseController
             // Trim context if conversation is getting too long
             TrimConversationContext(request);
 
-            // Send chat request to LLM
-            var response = await _llmService.ChatAsync(user, request);
+            // Send chat request to LLM with retry logic for rate limits
+            LlmChatResponse response;
+            try
+            {
+                response = await _llmService.ChatAsync(user, request);
+            }
+            catch (HttpRequestException httpEx) when (httpEx.Message.Contains("429") || httpEx.Message.Contains("TooManyRequests") || httpEx.Message.Contains("RESOURCE_EXHAUSTED"))
+            {
+                // Rate limit hit - wait longer before retrying
+                _logger.LogWarning("Rate limit hit (429) in iteration {Iteration}. Waiting 10 seconds before retry...", iteration);
+
+                // Wait 10 seconds for rate limit to reset
+                await Task.Delay(10000);
+
+                // Retry once
+                try
+                {
+                    response = await _llmService.ChatAsync(user, request);
+                }
+                catch (Exception retryEx)
+                {
+                    _logger.LogError(retryEx, "Retry after rate limit also failed in iteration {Iteration}", iteration);
+                    throw new HttpRequestException($"Rate limit error persists after retry: {retryEx.Message}", retryEx);
+                }
+            }
 
             // If LLM doesn't want to call tools, we have our final answer
             if (!response.RequiresToolExecution || response.ToolCalls == null || !response.ToolCalls.Any())
@@ -544,6 +603,9 @@ public class LlmController : BaseController
             // Add tool results to conversation history
             request.Messages.AddRange(toolResults);
 
+            // Add delay after tool calls before next LLM call to avoid rate limits
+            await Task.Delay(DelayAfterToolCallsMs);
+
             // Continue loop to get LLM's response to the tool results
         }
 
diff --git a/src/Managing.Application/LLM/Providers/GeminiProvider.cs b/src/Managing.Application/LLM/Providers/GeminiProvider.cs
index a3e911a8..1edf1f01 100644
--- a/src/Managing.Application/LLM/Providers/GeminiProvider.cs
+++ b/src/Managing.Application/LLM/Providers/GeminiProvider.cs
@@ -21,8 +21,8 @@ public class GeminiProvider : ILlmProvider
     private readonly IAsyncPolicy _retryPolicy;
     private const string BaseUrl = "https://generativelanguage.googleapis.com/v1beta";
     private const string FallbackModel = "gemini-2.0-flash-exp";
-    private const int MaxRetryAttempts = 3;
-    private const int BaseRetryDelayMs = 2000; // 2 seconds base delay
+    private const int MaxRetryAttempts = 5; // Increased from 3 to 5 for better rate limit handling
+    private const int BaseRetryDelayMs = 3000; // Increased from 2s to 3s base delay for rate limits
 
     public string Name => "gemini";
 
@@ -49,23 +49,35 @@ public class GeminiProvider : ILlmProvider
                     var retryAfter = ParseRetryAfterHeader(result.Result);
                     if (retryAfter.HasValue)
                     {
+                        // Use Retry-After header value, but ensure minimum delay
+                        var delayRetry = TimeSpan.FromSeconds(Math.Max(retryAfter.Value.TotalSeconds, 10));
                         _logger.LogInformation(
-                            "Rate limited (429). Respecting Retry-After header: {RetryAfterSeconds}s",
-                            retryAfter.Value.TotalSeconds);
-                        return retryAfter.Value;
+                            "Rate limited (429). Respecting Retry-After header: {RetryAfterSeconds}s (using {ActualDelay}s)",
+                            retryAfter.Value.TotalSeconds, delayRetry.TotalSeconds);
+                        return delayRetry;
                     }
+
+                    // If no Retry-After header, use longer exponential backoff for 429 errors
+                    var rateLimitDelay =
+                        BaseRetryDelayMs * Math.Pow(2, retryAttempt) * 2; // Double the delay for rate limits
+                    var rateLimitJitter = new Random().Next(0, BaseRetryDelayMs);
+                    var rateLimitTotalDelay = TimeSpan.FromMilliseconds(rateLimitDelay + rateLimitJitter);
+                    _logger.LogInformation(
+                        "Rate limited (429) without Retry-After header. Using extended backoff: {DelayMs}ms",
+                        rateLimitTotalDelay.TotalMilliseconds);
+                    return rateLimitTotalDelay;
                 }
 
                 // Exponential backoff with jitter: baseDelay * 2^(retryAttempt-1) + random jitter
                 var exponentialDelay = BaseRetryDelayMs * Math.Pow(2, retryAttempt - 1);
                 var jitter = new Random().Next(0, BaseRetryDelayMs / 4);
-                var delay = TimeSpan.FromMilliseconds(exponentialDelay + jitter);
+                var backoffDelay = TimeSpan.FromMilliseconds(exponentialDelay + jitter);
 
                 _logger.LogInformation(
                     "Retrying after exponential backoff: {DelayMs}ms (attempt {Attempt}/{MaxAttempts})",
-                    delay.TotalMilliseconds, retryAttempt, MaxRetryAttempts + 1);
+                    backoffDelay.TotalMilliseconds, retryAttempt, MaxRetryAttempts + 1);
 
-                return delay;
+                return backoffDelay;
             },
             onRetryAsync: async (outcome, timespan, retryCount, context) =>
             {
@@ -100,7 +112,8 @@ public class GeminiProvider : ILlmProvider
             if (!response.IsSuccessStatusCode)
             {
                 var errorContent = await response.Content.ReadAsStringAsync();
-                _logger.LogError("Gemini API error after retries: {StatusCode} - {Error}", response.StatusCode, errorContent);
+                _logger.LogError("Gemini API error after retries: {StatusCode} - {Error}", response.StatusCode,
+                    errorContent);
                 throw new HttpRequestException($"Gemini API error: {response.StatusCode} - {errorContent}");
             }
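
For reference, a minimal standalone sketch (not part of the patch) of the delay schedule these changes aim for. It reuses the two constants from GeminiProvider (MaxRetryAttempts = 5, BaseRetryDelayMs = 3000) and mirrors the three branches above: honor Retry-After with a 10-second floor, use a doubled exponential backoff plus jitter for a 429 without the header, and fall back to plain exponential backoff with jitter otherwise. The ComputeDelay helper and BackoffSketch wrapper are illustrative only and do not exist in the repository.

using System;

internal static class BackoffSketch
{
    private const int MaxRetryAttempts = 5;    // same value as GeminiProvider after this patch
    private const int BaseRetryDelayMs = 3000; // same value as GeminiProvider after this patch

    // Delay for one retry attempt (1-based), mirroring the three branches in the patch.
    private static TimeSpan ComputeDelay(int retryAttempt, bool isRateLimited, TimeSpan? retryAfter, Random rng)
    {
        if (isRateLimited)
        {
            // 429 with a Retry-After header: respect it, but never wait less than 10 seconds.
            if (retryAfter.HasValue)
                return TimeSpan.FromSeconds(Math.Max(retryAfter.Value.TotalSeconds, 10));

            // 429 without a header: doubled exponential backoff plus full-base jitter.
            var rateLimitDelay = BaseRetryDelayMs * Math.Pow(2, retryAttempt) * 2;
            return TimeSpan.FromMilliseconds(rateLimitDelay + rng.Next(0, BaseRetryDelayMs));
        }

        // Any other transient failure: exponential backoff plus a quarter-base jitter.
        var exponentialDelay = BaseRetryDelayMs * Math.Pow(2, retryAttempt - 1);
        return TimeSpan.FromMilliseconds(exponentialDelay + rng.Next(0, BaseRetryDelayMs / 4));
    }

    private static void Main()
    {
        var rng = new Random(42); // fixed seed so the printed schedule is repeatable
        for (var attempt = 1; attempt <= MaxRetryAttempts; attempt++)
        {
            Console.WriteLine(
                $"attempt {attempt}: default {ComputeDelay(attempt, false, null, rng).TotalSeconds:F1}s, " +
                $"429 without header {ComputeDelay(attempt, true, null, rng).TotalSeconds:F1}s");
        }
    }
}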