Enhance LlmController and GeminiProvider for improved rate limit handling
- Increased delay between iterations in LlmController from 500ms to 2000ms to better respect rate limits.
- Added retry logic in LlmController for handling rate limit errors (HTTP 429) with a 10-second wait before retrying.
- Introduced additional delay after tool calls in LlmController to further mitigate rate limit issues.
- Updated GeminiProvider to increase maximum retry attempts from 3 to 5 and base retry delay from 2s to 3s for better handling of rate limits.
- Enhanced logging for rate limit scenarios to provide clearer feedback during API interactions.
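For context on the new retry budget, the following is a minimal, illustrative C# sketch (not the GeminiProvider code itself) of the doubled exponential backoff this commit applies to 429 responses that carry no Retry-After header. The constant names mirror those in the diff below; the loop simply prints roughly how long each of the 5 attempts now waits.

using System;

// Illustrative only: reproduces the 429 backoff arithmetic described in this commit
// (BaseRetryDelayMs * 2^attempt * 2 + jitter) so the enlarged retry budget is easy to see.
class RateLimitBackoffSketch
{
    const int MaxRetryAttempts = 5;    // was 3
    const int BaseRetryDelayMs = 3000; // was 2000

    static TimeSpan DelayFor429(int retryAttempt, Random rng)
    {
        // Doubled exponential backoff used when no Retry-After header is present.
        var delayMs = BaseRetryDelayMs * Math.Pow(2, retryAttempt) * 2;
        var jitterMs = rng.Next(0, BaseRetryDelayMs);
        return TimeSpan.FromMilliseconds(delayMs + jitterMs);
    }

    static void Main()
    {
        var rng = new Random();
        for (var attempt = 1; attempt <= MaxRetryAttempts; attempt++)
        {
            // Attempt 1 waits ~12s, attempt 2 ~24s, ..., attempt 5 ~192s (plus up to 3s jitter).
            Console.WriteLine($"429 retry {attempt}: wait {DelayFor429(attempt, rng).TotalSeconds:F1}s");
        }
    }
}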
@@ -1,3 +1,4 @@
+using System.Net.Http;
 using System.Text.Json;
 using System.Text.RegularExpressions;
 using Managing.Application.Abstractions.Services;
@@ -215,7 +216,8 @@ public class LlmController : BaseController
         int maxIterations = DetermineMaxIterations(chatRequest);
         int iteration = 0;
         LlmChatResponse? finalResponse = null;
-        const int DelayBetweenIterationsMs = 500;
+        const int DelayBetweenIterationsMs = 2000; // Increased from 500ms to 2s to respect rate limits
+        const int DelayAfterToolCallsMs = 1000; // Additional delay after tool calls before next LLM call

         await SendProgressUpdate(connectionId, hubContext, logger, new LlmProgressUpdate
         {
@@ -254,7 +256,7 @@ public class LlmController : BaseController
             // Trim context if conversation is getting too long
             TrimConversationContext(chatRequest);

-            // Send chat request to LLM
+            // Send chat request to LLM with retry logic for rate limits
             await SendProgressUpdate(connectionId, hubContext, logger, new LlmProgressUpdate
             {
                 Type = "thinking",
@@ -263,7 +265,37 @@ public class LlmController : BaseController
                 MaxIterations = maxIterations
             });

-            var response = await llmService.ChatAsync(user, chatRequest);
+            LlmChatResponse response;
+            try
+            {
+                response = await llmService.ChatAsync(user, chatRequest);
+            }
+            catch (HttpRequestException httpEx) when (httpEx.Message.Contains("429") || httpEx.Message.Contains("TooManyRequests") || httpEx.Message.Contains("RESOURCE_EXHAUSTED"))
+            {
+                // Rate limit hit - wait longer before retrying
+                logger.LogWarning("Rate limit hit (429) in iteration {Iteration}. Waiting 10 seconds before retry...", iteration);
+                await SendProgressUpdate(connectionId, hubContext, logger, new LlmProgressUpdate
+                {
+                    Type = "thinking",
+                    Message = "Rate limit reached. Waiting before retrying...",
+                    Iteration = iteration,
+                    MaxIterations = maxIterations
+                });
+
+                // Wait 10 seconds for rate limit to reset
+                await Task.Delay(10000);
+
+                // Retry once
+                try
+                {
+                    response = await llmService.ChatAsync(user, chatRequest);
+                }
+                catch (Exception retryEx)
+                {
+                    logger.LogError(retryEx, "Retry after rate limit also failed in iteration {Iteration}", iteration);
+                    throw new HttpRequestException($"Rate limit error persists after retry: {retryEx.Message}", retryEx);
+                }
+            }

             // If LLM doesn't want to call tools, we have our final answer
             if (!response.RequiresToolExecution || response.ToolCalls == null || !response.ToolCalls.Any())
@@ -375,6 +407,9 @@ public class LlmController : BaseController
             // Add tool results to conversation history
             chatRequest.Messages.AddRange(toolResults);

+            // Add delay after tool calls before next LLM call to avoid rate limits
+            await Task.Delay(DelayAfterToolCallsMs);
+
             // Continue loop to get LLM's response to the tool results
         }

@@ -466,7 +501,8 @@ public class LlmController : BaseController
         int maxIterations = DetermineMaxIterations(request);
         int iteration = 0;
         LlmChatResponse? finalResponse = null;
-        const int DelayBetweenIterationsMs = 500; // 500ms delay between iterations to avoid rate limits
+        const int DelayBetweenIterationsMs = 2000; // Increased from 500ms to 2s to respect rate limits
+        const int DelayAfterToolCallsMs = 1000; // Additional delay after tool calls before next LLM call

         while (iteration < maxIterations)
         {
@@ -486,8 +522,31 @@ public class LlmController : BaseController
             // Trim context if conversation is getting too long
             TrimConversationContext(request);

-            // Send chat request to LLM
-            var response = await _llmService.ChatAsync(user, request);
+            // Send chat request to LLM with retry logic for rate limits
+            LlmChatResponse response;
+            try
+            {
+                response = await _llmService.ChatAsync(user, request);
+            }
+            catch (HttpRequestException httpEx) when (httpEx.Message.Contains("429") || httpEx.Message.Contains("TooManyRequests") || httpEx.Message.Contains("RESOURCE_EXHAUSTED"))
+            {
+                // Rate limit hit - wait longer before retrying
+                _logger.LogWarning("Rate limit hit (429) in iteration {Iteration}. Waiting 10 seconds before retry...", iteration);
+
+                // Wait 10 seconds for rate limit to reset
+                await Task.Delay(10000);
+
+                // Retry once
+                try
+                {
+                    response = await _llmService.ChatAsync(user, request);
+                }
+                catch (Exception retryEx)
+                {
+                    _logger.LogError(retryEx, "Retry after rate limit also failed in iteration {Iteration}", iteration);
+                    throw new HttpRequestException($"Rate limit error persists after retry: {retryEx.Message}", retryEx);
+                }
+            }

             // If LLM doesn't want to call tools, we have our final answer
             if (!response.RequiresToolExecution || response.ToolCalls == null || !response.ToolCalls.Any())
@@ -544,6 +603,9 @@ public class LlmController : BaseController
             // Add tool results to conversation history
             request.Messages.AddRange(toolResults);

+            // Add delay after tool calls before next LLM call to avoid rate limits
+            await Task.Delay(DelayAfterToolCallsMs);
+
             // Continue loop to get LLM's response to the tool results
         }

@@ -21,8 +21,8 @@ public class GeminiProvider : ILlmProvider
     private readonly IAsyncPolicy<HttpResponseMessage> _retryPolicy;
     private const string BaseUrl = "https://generativelanguage.googleapis.com/v1beta";
    private const string FallbackModel = "gemini-2.0-flash-exp";
-    private const int MaxRetryAttempts = 3;
-    private const int BaseRetryDelayMs = 2000; // 2 seconds base delay
+    private const int MaxRetryAttempts = 5; // Increased from 3 to 5 for better rate limit handling
+    private const int BaseRetryDelayMs = 3000; // Increased from 2s to 3s base delay for rate limits

     public string Name => "gemini";

@@ -49,23 +49,35 @@ public class GeminiProvider : ILlmProvider
                     var retryAfter = ParseRetryAfterHeader(result.Result);
                     if (retryAfter.HasValue)
                     {
+                        // Use Retry-After header value, but ensure minimum delay
+                        var delayRetry = TimeSpan.FromSeconds(Math.Max(retryAfter.Value.TotalSeconds, 10));
                         _logger.LogInformation(
-                            "Rate limited (429). Respecting Retry-After header: {RetryAfterSeconds}s",
-                            retryAfter.Value.TotalSeconds);
-                        return retryAfter.Value;
+                            "Rate limited (429). Respecting Retry-After header: {RetryAfterSeconds}s (using {ActualDelay}s)",
+                            retryAfter.Value.TotalSeconds, delayRetry.TotalSeconds);
+                        return delayRetry;
                     }
+
+                    // If no Retry-After header, use longer exponential backoff for 429 errors
+                    var rateLimitDelay =
+                        BaseRetryDelayMs * Math.Pow(2, retryAttempt) * 2; // Double the delay for rate limits
+                    var rateLimitJitter = new Random().Next(0, BaseRetryDelayMs);
+                    var rateLimitTotalDelay = TimeSpan.FromMilliseconds(rateLimitDelay + rateLimitJitter);
+                    _logger.LogInformation(
+                        "Rate limited (429) without Retry-After header. Using extended backoff: {DelayMs}ms",
+                        rateLimitTotalDelay.TotalMilliseconds);
+                    return rateLimitTotalDelay;
                 }

                 // Exponential backoff with jitter: baseDelay * 2^(retryAttempt-1) + random jitter
                 var exponentialDelay = BaseRetryDelayMs * Math.Pow(2, retryAttempt - 1);
                 var jitter = new Random().Next(0, BaseRetryDelayMs / 4);
-                var delay = TimeSpan.FromMilliseconds(exponentialDelay + jitter);
+                var backoffDelay = TimeSpan.FromMilliseconds(exponentialDelay + jitter);

                 _logger.LogInformation(
                     "Retrying after exponential backoff: {DelayMs}ms (attempt {Attempt}/{MaxAttempts})",
-                    delay.TotalMilliseconds, retryAttempt, MaxRetryAttempts + 1);
+                    backoffDelay.TotalMilliseconds, retryAttempt, MaxRetryAttempts + 1);

-                return delay;
+                return backoffDelay;
             },
             onRetryAsync: async (outcome, timespan, retryCount, context) =>
             {
@@ -100,7 +112,8 @@ public class GeminiProvider : ILlmProvider
        if (!response.IsSuccessStatusCode)
        {
            var errorContent = await response.Content.ReadAsStringAsync();
-            _logger.LogError("Gemini API error after retries: {StatusCode} - {Error}", response.StatusCode, errorContent);
+            _logger.LogError("Gemini API error after retries: {StatusCode} - {Error}", response.StatusCode,
+                errorContent);
            throw new HttpRequestException($"Gemini API error: {response.StatusCode} - {errorContent}");
        }
