Files
recap-relay/startos/versions/v0.2.16.ts
T

13 lines
1.0 KiB
TypeScript

import { VersionInfo } from '@start9labs/start-sdk'
/**
 * English release notes shown for version 0.2.16:0.
 *
 * Kept as one literal so the published text stays exactly as authored.
 */
const releaseNotesEnUS =
  'Analyze pipeline now talks to operator-hardware LLMs in a much faster mode. Two changes to the chat-completion request the relay sends to vLLM: (1) response_format is set to JSON-object mode, which constrains the model to emit valid JSON instead of wrapping the output in prose preamble like "Here are the topics I identified:" — saves real decode tokens and avoids parse failures. (2) chat_template_kwargs.enable_thinking is set to false, which disables Qwen3.6\'s reasoning mode for this task. Thinking mode is great for math but pure latency-noise for structured extraction. Together these typically cut analyze wall-time on operator hardware by 30-50%. Both fields are vLLM-specific; non-Qwen / non-vLLM backends ignore them, so this is safe across other operator-hardware setups.'

/**
 * Version descriptor for 0.2.16:0, built with the Start SDK's `VersionInfo.of`.
 *
 * Both migration hooks are intentionally empty: this entry declares no
 * migration work in either direction.
 */
export const v_0_2_16 = VersionInfo.of({
  version: '0.2.16:0',
  releaseNotes: {
    en_US: releaseNotesEnUS,
  },
  migrations: {
    // No-op: nothing to transform in this hook.
    up: async ({ effects }) => {},
    // No-op: nothing to undo in this hook.
    down: async ({ effects }) => {},
  },
})