From 58fce0a37bab011ca372f1e1b667ec7b39d403e9 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Thu, 26 Feb 2026 20:23:56 -0800
Subject: [PATCH] feat(api): implement dynamic max tokens handling for various
 providers

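- Added AIAgent._max_tokens_param, which returns {"max_completion_tokens": value} when base_url points at api.openai.com and {"max_tokens": value} for every other provider (OpenRouter, local models).
- Updated the AIAgent chat-completion call sites that set a token limit to build that kwarg through _max_tokens_param.
- Introduced auxiliary_max_tokens_param in agent/auxiliary_client.py; it selects max_completion_tokens only when no OpenRouter key and no Nous auth are present and OPENAI_BASE_URL points at api.openai.com, and max_tokens otherwise.
- Updated browser_tool, session_search_tool, vision_tools, and web_tools to pass their token limits through auxiliary_max_tokens_param for compatibility with different models and providers.
- Updated agent/context_compressor.py to send max_tokens first and retry once with max_completion_tokens when the error message mentions max_tokens or unsupported_parameter.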
---
 agent/auxiliary_client.py    | 17 +++++++++++++++++
 agent/context_compressor.py  | 27 ++++++++++++++++++++-------
 run_agent.py                 | 21 ++++++++++++++++++---
 tools/browser_tool.py        |  6 ++++--
 tools/session_search_tool.py |  4 ++--
 tools/vision_tools.py        |  4 ++--
 tools/web_tools.py           |  8 ++++----
 7 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 0ad4de220..ef179c410 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -154,3 +154,20 @@ def get_auxiliary_extra_body() -> dict:
     by Nous Portal. Returns empty dict otherwise.
     """
     return dict(NOUS_EXTRA_BODY) if auxiliary_is_nous else {}
+
+
+def auxiliary_max_tokens_param(value: int) -> dict:
+    """Return the correct max tokens kwarg for the auxiliary client's provider.
+    
+    OpenRouter and local models use 'max_tokens'. Direct OpenAI with newer
+    models (gpt-4o, o-series, gpt-5+) requires 'max_completion_tokens'.
+    """
+    custom_base = os.getenv("OPENAI_BASE_URL", "")
+    or_key = os.getenv("OPENROUTER_API_KEY")
+    # Only use max_completion_tokens when the auxiliary client resolved to
+    # direct OpenAI (no OpenRouter key, no Nous auth, custom endpoint is api.openai.com)
+    if (not or_key
+            and _read_nous_auth() is None
+            and "api.openai.com" in custom_base.lower()):
+        return {"max_completion_tokens": value}
+    return {"max_tokens": value}
diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 8f072a37a..329fd9680 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -113,13 +113,26 @@ TURNS TO SUMMARIZE:
 Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
 
         try:
-            response = self.client.chat.completions.create(
-                model=self.summary_model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.3,
-                max_tokens=self.summary_target_tokens * 2,
-                timeout=30.0,
-            )
+            kwargs = {
+                "model": self.summary_model,
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0.3,
+                "timeout": 30.0,
+            }
+            # Most providers (OpenRouter, local models) use max_tokens.
+            # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
+            # requires max_completion_tokens instead.
+            try:
+                kwargs["max_tokens"] = self.summary_target_tokens * 2
+                response = self.client.chat.completions.create(**kwargs)
+            except Exception as first_err:
+                if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
+                    kwargs.pop("max_tokens", None)
+                    kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
+                    response = self.client.chat.completions.create(**kwargs)
+                else:
+                    raise
+
             summary = response.choices[0].message.content.strip()
             if not summary.startswith("[CONTEXT SUMMARY]:"):
                 summary = "[CONTEXT SUMMARY]: " + summary
diff --git a/run_agent.py b/run_agent.py
index 3b7d6e3bd..467281d01 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -450,6 +450,21 @@ class AIAgent:
             else:
                 print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
     
+    def _max_tokens_param(self, value: int) -> dict:
+        """Return the correct max tokens kwarg for the current provider.
+        
+        OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
+        'max_completion_tokens'. OpenRouter, local models, and older
+        OpenAI models use 'max_tokens'.
+        """
+        _is_direct_openai = (
+            "api.openai.com" in self.base_url.lower()
+            and "openrouter" not in self.base_url.lower()
+        )
+        if _is_direct_openai:
+            return {"max_completion_tokens": value}
+        return {"max_tokens": value}
+
     def _has_content_after_think_block(self, content: str) -> bool:
         """
         Check if content has actual text after any <think></think> blocks.
@@ -1190,7 +1205,7 @@ class AIAgent:
         }
 
         if self.max_tokens is not None:
-            api_kwargs["max_tokens"] = self.max_tokens
+            api_kwargs.update(self._max_tokens_param(self.max_tokens))
 
         extra_body = {}
 
@@ -1324,7 +1339,7 @@ class AIAgent:
                 "messages": api_messages,
                 "tools": [memory_tool_def],
                 "temperature": 0.3,
-                "max_tokens": 1024,
+                **self._max_tokens_param(1024),
             }
 
             response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
@@ -1644,7 +1659,7 @@ class AIAgent:
                 "messages": api_messages,
             }
             if self.max_tokens is not None:
-                summary_kwargs["max_tokens"] = self.max_tokens
+                summary_kwargs.update(self._max_tokens_param(self.max_tokens))
             if summary_extra_body:
                 summary_kwargs["extra_body"] = summary_extra_body
 
diff --git a/tools/browser_tool.py b/tools/browser_tool.py
index 43a56b1d0..208d6e863 100644
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -812,10 +812,11 @@ def _extract_relevant_content(
         )
 
     try:
+        from agent.auxiliary_client import auxiliary_max_tokens_param
         response = _aux_vision_client.chat.completions.create(
             model=EXTRACTION_MODEL,
             messages=[{"role": "user", "content": extraction_prompt}],
-            max_tokens=4000,
+            **auxiliary_max_tokens_param(4000),
             temperature=0.1,
         )
         return response.choices[0].message.content
@@ -1283,6 +1284,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
         )
 
         # Use the sync auxiliary vision client directly
+        from agent.auxiliary_client import auxiliary_max_tokens_param
         response = _aux_vision_client.chat.completions.create(
             model=EXTRACTION_MODEL,
             messages=[
@@ -1294,7 +1296,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
                     ],
                 }
             ],
-            max_tokens=2000,
+            **auxiliary_max_tokens_param(2000),
             temperature=0.1,
         )
         
diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py
index 299286d98..bcfbfdf2a 100644
--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@@ -170,7 +170,7 @@ async def _summarize_session(
     max_retries = 3
     for attempt in range(max_retries):
         try:
-            from agent.auxiliary_client import get_auxiliary_extra_body
+            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
             _extra = get_auxiliary_extra_body()
             response = await _async_aux_client.chat.completions.create(
                 model=_SUMMARIZER_MODEL,
@@ -180,7 +180,7 @@ async def _summarize_session(
                 ],
                 **({} if not _extra else {"extra_body": _extra}),
                 temperature=0.1,
-                max_tokens=MAX_SUMMARY_TOKENS,
+                **auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS),
             )
             return response.choices[0].message.content.strip()
         except Exception as e:
diff --git a/tools/vision_tools.py b/tools/vision_tools.py
index 456f85583..39413d5b0 100644
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -314,13 +314,13 @@ async def vision_analyze_tool(
         logger.info("Processing image with %s...", model)
         
         # Call the vision API
-        from agent.auxiliary_client import get_auxiliary_extra_body
+        from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
         _extra = get_auxiliary_extra_body()
         response = await _aux_async_client.chat.completions.create(
             model=model,
             messages=messages,
             temperature=0.1,
-            max_tokens=2000,
+            **auxiliary_max_tokens_param(2000),
             **({} if not _extra else {"extra_body": _extra}),
         )
         
diff --git a/tools/web_tools.py b/tools/web_tools.py
index a7f64166e..4e0772166 100644
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -242,7 +242,7 @@ Create a markdown summary that captures all key information in a well-organized,
             if _aux_async_client is None:
                 logger.warning("No auxiliary model available for web content processing")
                 return None
-            from agent.auxiliary_client import get_auxiliary_extra_body
+            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
             _extra = get_auxiliary_extra_body()
             response = await _aux_async_client.chat.completions.create(
                 model=model,
@@ -251,7 +251,7 @@ Create a markdown summary that captures all key information in a well-organized,
                     {"role": "user", "content": user_prompt}
                 ],
                 temperature=0.1,
-                max_tokens=max_tokens,
+                **auxiliary_max_tokens_param(max_tokens),
                 **({} if not _extra else {"extra_body": _extra}),
             )
             return response.choices[0].message.content.strip()
@@ -365,7 +365,7 @@ Create a single, unified markdown summary."""
                 fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
             return fallback
 
-        from agent.auxiliary_client import get_auxiliary_extra_body
+        from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
         _extra = get_auxiliary_extra_body()
         response = await _aux_async_client.chat.completions.create(
             model=model,
@@ -374,7 +374,7 @@ Create a single, unified markdown summary."""
                 {"role": "user", "content": synthesis_prompt}
             ],
             temperature=0.1,
-            max_tokens=4000,
+            **auxiliary_max_tokens_param(4000),
             **({} if not _extra else {"extra_body": _extra}),
         )
         final_summary = response.choices[0].message.content.strip()