// proof-of-work/proof-of-work/lib/ai/lenientJson.ts

/**
 * Lenient JSON parser for incremental rendering of in-flight LLM
 * output.
 *
 * The model emits JSON one token at a time. Strict JSON.parse fails
 * until the very last `}` arrives. lenientJsonParse instead:
 *
 *   1. Strips a ```json (or plain ```) fence, tolerating a missing
 *      closing fence, and locates the first `{`.
 *   2. Tries a strict JSON.parse of everything from that `{` on.
 *   3. Otherwise walks the buffer tracking quote state + an
 *      open-bracket stack so we know what to close in what order.
 *      If the root object closes during the walk, everything after
 *      it (trailing prose, a stray fence) is ignored and the object
 *      alone is parsed.
 *   4. If the buffer ends inside a string: a string in key position
 *      is dropped entirely (a half-written key has no value yet),
 *      any other string is closed with `"`.
 *   5. Trims a complete key whose `:` has not arrived, a partial
 *      trailing keyword (true/false/null prefix), a trailing comma,
 *      and a dangling `"key":` whose value is missing.
 *   6. Closes open structures in reverse-of-opening order (so
 *      `[{` closes as `}]`, not `]}`).
 *   7. JSON.parse the result; return null if it still fails.
 *
 * The returned object is a best-effort snapshot of the program so
 * far. The Generate UI uses it to render a live preview as the
 * model writes; once the stream ends, the FULL response is parsed
 * with the strict parser via parseAIProgram for the final render.
 *
 * This is intentionally simple — partial numbers (e.g. `-2.`) and
 * partial escape sequences just return null until the next chunk
 * makes them well-formed. The fence match is also naive: a literal
 * ``` inside a fenced payload is taken as the closing fence.
 *
 * @param raw - Text received from the model so far; may contain
 *   prose and/or a code fence around the JSON object.
 * @returns The parsed (possibly repaired) root object, or null when
 *   `raw` is empty, contains no `{`, or cannot be repaired into
 *   valid JSON.
 */
export function lenientJsonParse(raw: string): unknown | null {
  if (!raw) return null;

  // Strip ```json fences (or plain ``` fences). Tolerates an
  // unclosed trailing fence (still streaming).
  let s = raw;
  const fenced = s.match(/```(?:json)?\s*([\s\S]*?)(?:\s*```|$)/);
  if (fenced) s = fenced[1] ?? s;

  // Locate first `{`.
  const startIdx = s.indexOf('{');
  if (startIdx < 0) return null;
  s = s.slice(startIdx);

  // Quick path: maybe it's already valid (rare during streaming,
  // common after the stream completes). `s` starts with `{`, so a
  // successful parse can never itself be null.
  const strict = parseJsonOrNull(s);
  if (strict !== null) return strict;

  // Walk the buffer tracking the open-bracket stack. We don't try
  // to recover from mismatched closers (would be model malformity);
  // we just don't pop more than we have.
  const stack: Array<'{' | '['> = [];
  let inStr = false;
  let escape = false;
  // Bookkeeping for the most recent string literal, so the tail
  // repair can tell a half-written key from a half-written value.
  let strStart = -1; // index of its opening quote
  let strEnd = -1; // index of its closing quote (valid once closed)
  let strIsKey = false; // true when it sits where an object key goes
  // Last non-whitespace character seen outside a string; a closed
  // string is represented by its closing quote.
  let prevSig = '';

  for (let i = 0; i < s.length; i++) {
    const c = s[i];

    if (inStr) {
      // Backslash escapes only have meaning inside a string.
      if (escape) escape = false;
      else if (c === '\\') escape = true;
      else if (c === '"') {
        inStr = false;
        strEnd = i;
        prevSig = '"';
      }
      continue;
    }

    if (c === '"') {
      inStr = true;
      strStart = i;
      // Inside an object, a string that directly follows `{` or `,`
      // is a key; one that follows `:` is a value.
      strIsKey =
        stack[stack.length - 1] === '{' &&
        (prevSig === '{' || prevSig === ',');
      continue;
    }

    if (c === '{') stack.push('{');
    else if (c === '[') stack.push('[');
    else if (c === '}' || c === ']') {
      const opener = c === '}' ? '{' : '[';
      if (stack[stack.length - 1] === opener) {
        stack.pop();
        // The root object just closed: whatever follows is not part
        // of the JSON, so parse the object on its own.
        if (stack.length === 0) return parseJsonOrNull(s.slice(0, i + 1));
      }
    }

    if (c !== ' ' && c !== '\n' && c !== '\r' && c !== '\t') prevSig = c;
  }

  let candidate: string;
  if (!inStr) {
    candidate = s.trimEnd();
    // A complete key whose `:` has not arrived yet cannot be kept.
    if (strIsKey && strEnd === candidate.length - 1) {
      candidate = candidate.slice(0, strStart).trimEnd();
    }
  } else if (strIsKey) {
    // Half-written key: drop it rather than closing it, since
    // `{"ke"}` is not valid JSON.
    candidate = s.slice(0, strStart).trimEnd();
  } else {
    // Half-written string value: close it.
    candidate = s + '"';
  }

  // Drop a partial trailing keyword (`true`/`false`/`null` prefix)
  // sitting after a `:`, `,`, or `[`.
  candidate = candidate.replace(
    /([:,[])\s*(?:t|tr|tru|f|fa|fal|fals|n|nu|nul)$/,
    '$1',
  );

  // Drop a trailing comma (no value follows yet).
  candidate = candidate.replace(/,\s*$/, '');

  // Drop a dangling key + colon (value not started yet).
  candidate = candidate.replace(/"[^"\\]*(?:\\.[^"\\]*)*"\s*:\s*$/, '');

  // Drop another trailing comma that may now be exposed.
  candidate = candidate.replace(/,\s*$/, '');

  // Close stack in reverse-of-opening order. `[{` becomes `}]` not
  // `]}` — that's the bug a depth-counter approach would have.
  for (let i = stack.length - 1; i >= 0; i--) {
    candidate += stack[i] === '{' ? '}' : ']';
  }

  return parseJsonOrNull(candidate);
}

/** Strict JSON.parse that reports a syntax failure as null instead of throwing. */
function parseJsonOrNull(text: string): unknown | null {
  try {
    return JSON.parse(text);
  } catch {
    return null;
  }
}