From 58fce0a37bab011ca372f1e1b667ec7b39d403e9 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Thu, 26 Feb 2026 20:23:56 -0800 Subject: [PATCH] feat(api): implement dynamic max tokens handling for various providers - Added _max_tokens_param method in AIAgent to return appropriate max tokens parameter based on the provider (OpenAI vs. others). - Updated API calls in AIAgent to utilize the new max tokens handling. - Introduced auxiliary_max_tokens_param function in auxiliary_client for consistent max tokens management across auxiliary clients. - Refactored multiple tools to use auxiliary_max_tokens_param for improved compatibility with different models and providers. --- agent/auxiliary_client.py | 17 +++++++++++++++++ agent/context_compressor.py | 27 ++++++++++++++++++++------- run_agent.py | 21 ++++++++++++++++++--- tools/browser_tool.py | 6 ++++-- tools/session_search_tool.py | 4 ++-- tools/vision_tools.py | 4 ++-- tools/web_tools.py | 8 ++++---- 7 files changed, 67 insertions(+), 20 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 0ad4de220..ef179c410 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -154,3 +154,20 @@ def get_auxiliary_extra_body() -> dict: by Nous Portal. Returns empty dict otherwise. """ return dict(NOUS_EXTRA_BODY) if auxiliary_is_nous else {} + + +def auxiliary_max_tokens_param(value: int) -> dict: + """Return the correct max tokens kwarg for the auxiliary client's provider. + + OpenRouter and local models use 'max_tokens'. Direct OpenAI with newer + models (gpt-4o, o-series, gpt-5+) requires 'max_completion_tokens'. 
+ """ + custom_base = os.getenv("OPENAI_BASE_URL", "") + or_key = os.getenv("OPENROUTER_API_KEY") + # Only use max_completion_tokens when the auxiliary client resolved to + # direct OpenAI (no OpenRouter key, no Nous auth, custom endpoint is api.openai.com) + if (not or_key + and _read_nous_auth() is None + and "api.openai.com" in custom_base.lower()): + return {"max_completion_tokens": value} + return {"max_tokens": value} diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 8f072a37a..329fd9680 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -113,13 +113,26 @@ TURNS TO SUMMARIZE: Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" try: - response = self.client.chat.completions.create( - model=self.summary_model, - messages=[{"role": "user", "content": prompt}], - temperature=0.3, - max_tokens=self.summary_target_tokens * 2, - timeout=30.0, - ) + kwargs = { + "model": self.summary_model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.3, + "timeout": 30.0, + } + # Most providers (OpenRouter, local models) use max_tokens. + # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) + # requires max_completion_tokens instead. 
+ try: + kwargs["max_tokens"] = self.summary_target_tokens * 2 + response = self.client.chat.completions.create(**kwargs) + except Exception as first_err: + if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): + kwargs.pop("max_tokens", None) + kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 + response = self.client.chat.completions.create(**kwargs) + else: + raise + summary = response.choices[0].message.content.strip() if not summary.startswith("[CONTEXT SUMMARY]:"): summary = "[CONTEXT SUMMARY]: " + summary diff --git a/run_agent.py b/run_agent.py index 3b7d6e3bd..467281d01 100644 --- a/run_agent.py +++ b/run_agent.py @@ -450,6 +450,21 @@ class AIAgent: else: print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)") + def _max_tokens_param(self, value: int) -> dict: + """Return the correct max tokens kwarg for the current provider. + + OpenAI's newer models (gpt-4o, o-series, gpt-5+) require + 'max_completion_tokens'. OpenRouter, local models, and older + OpenAI models use 'max_tokens'. + """ + _is_direct_openai = ( + "api.openai.com" in self.base_url.lower() + and "openrouter" not in self.base_url.lower() + ) + if _is_direct_openai: + return {"max_completion_tokens": value} + return {"max_tokens": value} + def _has_content_after_think_block(self, content: str) -> bool: """ Check if content has actual text after any <think></think> blocks. 
@@ -1190,7 +1205,7 @@ class AIAgent: } if self.max_tokens is not None: - api_kwargs["max_tokens"] = self.max_tokens + api_kwargs.update(self._max_tokens_param(self.max_tokens)) extra_body = {} @@ -1324,7 +1339,7 @@ class AIAgent: "messages": api_messages, "tools": [memory_tool_def], "temperature": 0.3, - "max_tokens": 1024, + **self._max_tokens_param(1024), } response = self.client.chat.completions.create(**api_kwargs, timeout=30.0) @@ -1644,7 +1659,7 @@ class AIAgent: "messages": api_messages, } if self.max_tokens is not None: - summary_kwargs["max_tokens"] = self.max_tokens + summary_kwargs.update(self._max_tokens_param(self.max_tokens)) if summary_extra_body: summary_kwargs["extra_body"] = summary_extra_body diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 43a56b1d0..208d6e863 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -812,10 +812,11 @@ def _extract_relevant_content( ) try: + from agent.auxiliary_client import auxiliary_max_tokens_param response = _aux_vision_client.chat.completions.create( model=EXTRACTION_MODEL, messages=[{"role": "user", "content": extraction_prompt}], - max_tokens=4000, + **auxiliary_max_tokens_param(4000), temperature=0.1, ) return response.choices[0].message.content @@ -1283,6 +1284,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str: ) # Use the sync auxiliary vision client directly + from agent.auxiliary_client import auxiliary_max_tokens_param response = _aux_vision_client.chat.completions.create( model=EXTRACTION_MODEL, messages=[ @@ -1294,7 +1296,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str: ], } ], - max_tokens=2000, + **auxiliary_max_tokens_param(2000), temperature=0.1, ) diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py index 299286d98..bcfbfdf2a 100644 --- a/tools/session_search_tool.py +++ b/tools/session_search_tool.py @@ -170,7 +170,7 @@ async def _summarize_session( max_retries = 3 for attempt in 
range(max_retries): try: - from agent.auxiliary_client import get_auxiliary_extra_body + from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param _extra = get_auxiliary_extra_body() response = await _async_aux_client.chat.completions.create( model=_SUMMARIZER_MODEL, @@ -180,7 +180,7 @@ async def _summarize_session( ], **({} if not _extra else {"extra_body": _extra}), temperature=0.1, - max_tokens=MAX_SUMMARY_TOKENS, + **auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS), ) return response.choices[0].message.content.strip() except Exception as e: diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 456f85583..39413d5b0 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -314,13 +314,13 @@ async def vision_analyze_tool( logger.info("Processing image with %s...", model) # Call the vision API - from agent.auxiliary_client import get_auxiliary_extra_body + from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param _extra = get_auxiliary_extra_body() response = await _aux_async_client.chat.completions.create( model=model, messages=messages, temperature=0.1, - max_tokens=2000, + **auxiliary_max_tokens_param(2000), **({} if not _extra else {"extra_body": _extra}), ) diff --git a/tools/web_tools.py b/tools/web_tools.py index a7f64166e..4e0772166 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -242,7 +242,7 @@ Create a markdown summary that captures all key information in a well-organized, if _aux_async_client is None: logger.warning("No auxiliary model available for web content processing") return None - from agent.auxiliary_client import get_auxiliary_extra_body + from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param _extra = get_auxiliary_extra_body() response = await _aux_async_client.chat.completions.create( model=model, @@ -251,7 +251,7 @@ Create a markdown summary that captures all key information in a well-organized, {"role": "user", 
"content": user_prompt} ], temperature=0.1, - max_tokens=max_tokens, + **auxiliary_max_tokens_param(max_tokens), **({} if not _extra else {"extra_body": _extra}), ) return response.choices[0].message.content.strip() @@ -365,7 +365,7 @@ Create a single, unified markdown summary.""" fallback = fallback[:max_output_size] + "\n\n[... truncated ...]" return fallback - from agent.auxiliary_client import get_auxiliary_extra_body + from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param _extra = get_auxiliary_extra_body() response = await _aux_async_client.chat.completions.create( model=model, @@ -374,7 +374,7 @@ Create a single, unified markdown summary.""" {"role": "user", "content": synthesis_prompt} ], temperature=0.1, - max_tokens=4000, + **auxiliary_max_tokens_param(4000), **({} if not _extra else {"extra_body": _extra}), ) final_summary = response.choices[0].message.content.strip()