recap-relay/startos/versions/v0.2.16.ts

import { VersionInfo } from '@start9labs/start-sdk'

/**
 * Version descriptor for release `0.2.16:0`.
 *
 * This entry only carries release notes: the notes describe two changes to
 * the chat-completion request sent to vLLM (JSON-object `response_format`
 * and `chat_template_kwargs.enable_thinking = false`). Neither migration
 * hook does any work.
 */
export const v_0_2_16 = VersionInfo.of({
  version: '0.2.16:0',
  releaseNotes: {
    // User-facing text; keep it verbatim.
    en_US: 'Analyze pipeline now talks to operator-hardware LLMs in a much faster mode. Two changes to the chat-completion request the relay sends to vLLM: (1) response_format is set to JSON-object mode, which constrains the model to emit valid JSON instead of wrapping the output in prose preamble like "Here are the topics I identified:" — saves real decode tokens and avoids parse failures. (2) chat_template_kwargs.enable_thinking is set to false, which disables Qwen3.6\'s reasoning mode for this task. Thinking mode is great for math but pure latency-noise for structured extraction. Together these typically cut analyze wall-time on operator hardware by 30-50%. Both fields are vLLM-specific; non-Qwen / non-vLLM backends ignore them, so this is safe across other operator-hardware setups.',
  },
  migrations: {
    // Intentionally empty: nothing is transformed when moving to this version.
    async up({ effects }) {},
    // Intentionally empty: nothing is reverted when moving away from it.
    async down({ effects }) {},
  },
})