13 lines
1.0 KiB
TypeScript
13 lines
1.0 KiB
TypeScript
import { VersionInfo } from '@start9labs/start-sdk'
|
|
|
|
// English release notes for 0.2.16, kept in a named constant so the
// version descriptor below stays short and easy to scan.
const releaseNotesEnUS =
  'Analyze pipeline now talks to operator-hardware LLMs in a much faster mode. Two changes to the chat-completion request the relay sends to vLLM: (1) response_format is set to JSON-object mode, which constrains the model to emit valid JSON instead of wrapping the output in prose preamble like "Here are the topics I identified:" — saves real decode tokens and avoids parse failures. (2) chat_template_kwargs.enable_thinking is set to false, which disables Qwen3.6\'s reasoning mode for this task. Thinking mode is great for math but pure latency-noise for structured extraction. Together these typically cut analyze wall-time on operator hardware by 30-50%. Both fields are vLLM-specific; non-Qwen / non-vLLM backends ignore them, so this is safe across other operator-hardware setups.'

/**
 * Version descriptor for release 0.2.16 (version string `0.2.16:0`).
 *
 * Carries the `en_US` release notes and a pair of migration hooks.
 * Both hooks have empty bodies: this release defines no work to do
 * when migrating up to, or down from, this version.
 */
export const v_0_2_16 = VersionInfo.of({
  version: '0.2.16:0',
  releaseNotes: {
    en_US: releaseNotesEnUS,
  },
  migrations: {
    // Intentionally empty: nothing to migrate for this release.
    up: async ({ effects }) => {},
    // Intentionally empty: nothing to undo for this release.
    down: async ({ effects }) => {},
  },
})
|