feat(api): implement dynamic max tokens handling for various providers
- Added a _max_tokens_param method to AIAgent that returns the appropriate max-tokens parameter for the provider (direct OpenAI vs. others).
- Updated the API calls in AIAgent to use the new max-tokens handling.
- Introduced an auxiliary_max_tokens_param function in auxiliary_client for consistent max-tokens handling across auxiliary clients.
- Refactored multiple tools to use auxiliary_max_tokens_param for improved compatibility with different models and providers.
This commit is contained in:
@@ -154,3 +154,20 @@ def get_auxiliary_extra_body() -> dict:
|
|||||||
by Nous Portal. Returns empty dict otherwise.
|
by Nous Portal. Returns empty dict otherwise.
|
||||||
"""
|
"""
|
||||||
return dict(NOUS_EXTRA_BODY) if auxiliary_is_nous else {}
|
return dict(NOUS_EXTRA_BODY) if auxiliary_is_nous else {}
|
||||||
|
|
||||||
|
|
||||||
|
def auxiliary_max_tokens_param(value: int) -> dict:
    """Return the max-tokens kwarg matching the auxiliary client's provider.

    OpenRouter and local models use 'max_tokens'. Direct OpenAI with newer
    models (gpt-4o, o-series, gpt-5+) requires 'max_completion_tokens'.

    The choice is made from the environment and Nous auth state only; the
    model name is not inspected. 'max_completion_tokens' is returned only
    when all of the following hold:

    * OPENROUTER_API_KEY is unset or empty,
    * OPENAI_BASE_URL contains "api.openai.com" (case-insensitive),
    * _read_nous_auth() returns None.

    Args:
        value: Token limit to send with the request.

    Returns:
        A single-entry dict, either {"max_tokens": value} or
        {"max_completion_tokens": value}, meant to be splatted into a
        chat.completions.create(...) call.
    """
    # An OpenRouter key means the auxiliary client is not direct OpenAI.
    if os.getenv("OPENROUTER_API_KEY"):
        return {"max_tokens": value}

    # Without an explicit api.openai.com endpoint, stay on 'max_tokens'.
    custom_base = os.getenv("OPENAI_BASE_URL", "")
    if "api.openai.com" not in custom_base.lower():
        return {"max_tokens": value}

    # Checked last so the cheap environment lookups above can short-circuit
    # before the auth helper runs (presumably it reads stored credentials --
    # confirm against _read_nous_auth).
    if _read_nous_auth() is not None:
        return {"max_tokens": value}

    return {"max_completion_tokens": value}
|
||||||
|
|||||||
@@ -113,13 +113,26 @@ TURNS TO SUMMARIZE:
|
|||||||
Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
|
Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = self.client.chat.completions.create(
|
kwargs = {
|
||||||
model=self.summary_model,
|
"model": self.summary_model,
|
||||||
messages=[{"role": "user", "content": prompt}],
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
temperature=0.3,
|
"temperature": 0.3,
|
||||||
max_tokens=self.summary_target_tokens * 2,
|
"timeout": 30.0,
|
||||||
timeout=30.0,
|
}
|
||||||
)
|
# Most providers (OpenRouter, local models) use max_tokens.
|
||||||
|
# Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
|
||||||
|
# requires max_completion_tokens instead.
|
||||||
|
try:
|
||||||
|
kwargs["max_tokens"] = self.summary_target_tokens * 2
|
||||||
|
response = self.client.chat.completions.create(**kwargs)
|
||||||
|
except Exception as first_err:
|
||||||
|
if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
|
||||||
|
kwargs.pop("max_tokens", None)
|
||||||
|
kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
|
||||||
|
response = self.client.chat.completions.create(**kwargs)
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
summary = response.choices[0].message.content.strip()
|
summary = response.choices[0].message.content.strip()
|
||||||
if not summary.startswith("[CONTEXT SUMMARY]:"):
|
if not summary.startswith("[CONTEXT SUMMARY]:"):
|
||||||
summary = "[CONTEXT SUMMARY]: " + summary
|
summary = "[CONTEXT SUMMARY]: " + summary
|
||||||
|
|||||||
21
run_agent.py
21
run_agent.py
@@ -450,6 +450,21 @@ class AIAgent:
|
|||||||
else:
|
else:
|
||||||
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
|
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
|
||||||
|
|
||||||
|
def _max_tokens_param(self, value: int) -> dict:
    """Return the correct max tokens kwarg for the current provider.

    OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
    'max_completion_tokens'. OpenRouter, local models, and older
    OpenAI models use 'max_tokens'.

    The decision is based solely on self.base_url: the endpoint counts as
    direct OpenAI when the URL contains "api.openai.com" and does not
    contain "openrouter" (both case-insensitive). The model name is not
    inspected.

    Args:
        value: Token limit to send with the request.

    Returns:
        {"max_completion_tokens": value} for a direct OpenAI endpoint,
        otherwise {"max_tokens": value}.
    """
    # Normalize once; a missing/None base_url is treated as "not OpenAI"
    # instead of raising AttributeError on .lower().
    base_url = (self.base_url or "").lower()
    is_direct_openai = "api.openai.com" in base_url and "openrouter" not in base_url
    key = "max_completion_tokens" if is_direct_openai else "max_tokens"
    return {key: value}
|
||||||
|
|
||||||
def _has_content_after_think_block(self, content: str) -> bool:
|
def _has_content_after_think_block(self, content: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if content has actual text after any <think></think> blocks.
|
Check if content has actual text after any <think></think> blocks.
|
||||||
@@ -1190,7 +1205,7 @@ class AIAgent:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if self.max_tokens is not None:
|
if self.max_tokens is not None:
|
||||||
api_kwargs["max_tokens"] = self.max_tokens
|
api_kwargs.update(self._max_tokens_param(self.max_tokens))
|
||||||
|
|
||||||
extra_body = {}
|
extra_body = {}
|
||||||
|
|
||||||
@@ -1324,7 +1339,7 @@ class AIAgent:
|
|||||||
"messages": api_messages,
|
"messages": api_messages,
|
||||||
"tools": [memory_tool_def],
|
"tools": [memory_tool_def],
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"max_tokens": 1024,
|
**self._max_tokens_param(1024),
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
|
response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
|
||||||
@@ -1644,7 +1659,7 @@ class AIAgent:
|
|||||||
"messages": api_messages,
|
"messages": api_messages,
|
||||||
}
|
}
|
||||||
if self.max_tokens is not None:
|
if self.max_tokens is not None:
|
||||||
summary_kwargs["max_tokens"] = self.max_tokens
|
summary_kwargs.update(self._max_tokens_param(self.max_tokens))
|
||||||
if summary_extra_body:
|
if summary_extra_body:
|
||||||
summary_kwargs["extra_body"] = summary_extra_body
|
summary_kwargs["extra_body"] = summary_extra_body
|
||||||
|
|
||||||
|
|||||||
@@ -812,10 +812,11 @@ def _extract_relevant_content(
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
from agent.auxiliary_client import auxiliary_max_tokens_param
|
||||||
response = _aux_vision_client.chat.completions.create(
|
response = _aux_vision_client.chat.completions.create(
|
||||||
model=EXTRACTION_MODEL,
|
model=EXTRACTION_MODEL,
|
||||||
messages=[{"role": "user", "content": extraction_prompt}],
|
messages=[{"role": "user", "content": extraction_prompt}],
|
||||||
max_tokens=4000,
|
**auxiliary_max_tokens_param(4000),
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
)
|
)
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
@@ -1283,6 +1284,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Use the sync auxiliary vision client directly
|
# Use the sync auxiliary vision client directly
|
||||||
|
from agent.auxiliary_client import auxiliary_max_tokens_param
|
||||||
response = _aux_vision_client.chat.completions.create(
|
response = _aux_vision_client.chat.completions.create(
|
||||||
model=EXTRACTION_MODEL,
|
model=EXTRACTION_MODEL,
|
||||||
messages=[
|
messages=[
|
||||||
@@ -1294,7 +1296,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
max_tokens=2000,
|
**auxiliary_max_tokens_param(2000),
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -170,7 +170,7 @@ async def _summarize_session(
|
|||||||
max_retries = 3
|
max_retries = 3
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
from agent.auxiliary_client import get_auxiliary_extra_body
|
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
|
||||||
_extra = get_auxiliary_extra_body()
|
_extra = get_auxiliary_extra_body()
|
||||||
response = await _async_aux_client.chat.completions.create(
|
response = await _async_aux_client.chat.completions.create(
|
||||||
model=_SUMMARIZER_MODEL,
|
model=_SUMMARIZER_MODEL,
|
||||||
@@ -180,7 +180,7 @@ async def _summarize_session(
|
|||||||
],
|
],
|
||||||
**({} if not _extra else {"extra_body": _extra}),
|
**({} if not _extra else {"extra_body": _extra}),
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=MAX_SUMMARY_TOKENS,
|
**auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS),
|
||||||
)
|
)
|
||||||
return response.choices[0].message.content.strip()
|
return response.choices[0].message.content.strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -314,13 +314,13 @@ async def vision_analyze_tool(
|
|||||||
logger.info("Processing image with %s...", model)
|
logger.info("Processing image with %s...", model)
|
||||||
|
|
||||||
# Call the vision API
|
# Call the vision API
|
||||||
from agent.auxiliary_client import get_auxiliary_extra_body
|
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
|
||||||
_extra = get_auxiliary_extra_body()
|
_extra = get_auxiliary_extra_body()
|
||||||
response = await _aux_async_client.chat.completions.create(
|
response = await _aux_async_client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=2000,
|
**auxiliary_max_tokens_param(2000),
|
||||||
**({} if not _extra else {"extra_body": _extra}),
|
**({} if not _extra else {"extra_body": _extra}),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -242,7 +242,7 @@ Create a markdown summary that captures all key information in a well-organized,
|
|||||||
if _aux_async_client is None:
|
if _aux_async_client is None:
|
||||||
logger.warning("No auxiliary model available for web content processing")
|
logger.warning("No auxiliary model available for web content processing")
|
||||||
return None
|
return None
|
||||||
from agent.auxiliary_client import get_auxiliary_extra_body
|
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
|
||||||
_extra = get_auxiliary_extra_body()
|
_extra = get_auxiliary_extra_body()
|
||||||
response = await _aux_async_client.chat.completions.create(
|
response = await _aux_async_client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
@@ -251,7 +251,7 @@ Create a markdown summary that captures all key information in a well-organized,
|
|||||||
{"role": "user", "content": user_prompt}
|
{"role": "user", "content": user_prompt}
|
||||||
],
|
],
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=max_tokens,
|
**auxiliary_max_tokens_param(max_tokens),
|
||||||
**({} if not _extra else {"extra_body": _extra}),
|
**({} if not _extra else {"extra_body": _extra}),
|
||||||
)
|
)
|
||||||
return response.choices[0].message.content.strip()
|
return response.choices[0].message.content.strip()
|
||||||
@@ -365,7 +365,7 @@ Create a single, unified markdown summary."""
|
|||||||
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
|
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
|
||||||
return fallback
|
return fallback
|
||||||
|
|
||||||
from agent.auxiliary_client import get_auxiliary_extra_body
|
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
|
||||||
_extra = get_auxiliary_extra_body()
|
_extra = get_auxiliary_extra_body()
|
||||||
response = await _aux_async_client.chat.completions.create(
|
response = await _aux_async_client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
@@ -374,7 +374,7 @@ Create a single, unified markdown summary."""
|
|||||||
{"role": "user", "content": synthesis_prompt}
|
{"role": "user", "content": synthesis_prompt}
|
||||||
],
|
],
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=4000,
|
**auxiliary_max_tokens_param(4000),
|
||||||
**({} if not _extra else {"extra_body": _extra}),
|
**({} if not _extra else {"extra_body": _extra}),
|
||||||
)
|
)
|
||||||
final_summary = response.choices[0].message.content.strip()
|
final_summary = response.choices[0].message.content.strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user