From 84302aedac68b9471f7225c3d08e617f633eb6f1 Mon Sep 17 00:00:00 2001
From: Kimi Agent
Date: Fri, 20 Mar 2026 16:27:24 -0400
Subject: [PATCH] fix: pass max_tokens to Ollama provider in cascade router
 (#622)

Co-authored-by: Kimi Agent
Co-committed-by: Kimi Agent
---
 src/infrastructure/router/cascade.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/infrastructure/router/cascade.py b/src/infrastructure/router/cascade.py
index 40b1304..fae8afe 100644
--- a/src/infrastructure/router/cascade.py
+++ b/src/infrastructure/router/cascade.py
@@ -564,6 +564,7 @@ class CascadeRouter:
                 messages=messages,
                 model=model or provider.get_default_model(),
                 temperature=temperature,
+                max_tokens=max_tokens,
                 content_type=content_type,
             )
         elif provider.type == "openai":
@@ -604,6 +605,7 @@ class CascadeRouter:
         messages: list[dict],
         model: str,
         temperature: float,
+        max_tokens: int | None = None,
         content_type: ContentType = ContentType.TEXT,
     ) -> dict:
         """Call Ollama API with multi-modal support."""
@@ -614,13 +616,15 @@
         # Transform messages for Ollama format (including images)
         transformed_messages = self._transform_messages_for_ollama(messages)
 
+        options = {"temperature": temperature}
+        if max_tokens:
+            options["num_predict"] = max_tokens
+
         payload = {
             "model": model,
             "messages": transformed_messages,
             "stream": False,
-            "options": {
-                "temperature": temperature,
-            },
+            "options": options,
         }
 
         timeout = aiohttp.ClientTimeout(total=self.config.timeout_seconds)
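
Reviewer note (not part of the patch): the fix forwards the router's `max_tokens` argument as Ollama's `num_predict` option, which caps the number of tokens the model will generate. Below is a minimal standalone sketch of the resulting request shape, assuming a local Ollama server at its default address http://localhost:11434 and the /api/chat endpoint; the function name, model name, and prompt are illustrative and not taken from the patch.

    import asyncio

    import aiohttp


    async def call_ollama(
        messages: list[dict],
        model: str,
        temperature: float,
        max_tokens: int | None = None,
    ) -> dict:
        """Send a non-streaming chat request to a local Ollama server."""
        # Mirrors the patched logic: only set num_predict (Ollama's
        # generation-length cap) when a limit was actually supplied.
        options = {"temperature": temperature}
        if max_tokens:
            options["num_predict"] = max_tokens

        payload = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": options,
        }

        async with aiohttp.ClientSession() as session:
            # Default local Ollama endpoint (an assumption, not in the patch).
            async with session.post(
                "http://localhost:11434/api/chat", json=payload
            ) as resp:
                resp.raise_for_status()
                return await resp.json()


    if __name__ == "__main__":
        reply = asyncio.run(
            call_ollama(
                [{"role": "user", "content": "Say hello in five words."}],
                model="llama3",  # illustrative model name
                temperature=0.2,
                max_tokens=32,   # forwarded as options["num_predict"]
            )
        )
        print(reply["message"]["content"])

One behavioral detail worth noting: the patched code uses a truthiness check (`if max_tokens:`), so `max_tokens=0` is treated the same as `None` and no `num_predict` is sent, in which case Ollama falls back to its server-side default.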