Module split: extract audio I/O helpers to server/audio.js

• getAudioDuration(path) — ffprobe wrapper, returns seconds | null
• splitAudioFile(in, dir, secs) — ffmpeg -acodec copy chunking
• downloadPodcastAudio(url, dst) — streams HTTP audio to disk

Also moved fetchUrl into util.js (alongside the other stateless helpers) — it's a generic HTTP-GET-with-redirects used by RSS parsing and channel discovery, not strictly audio.

server/index.js: 2758 → 2694 lines.

Smoke tested: server boots; /api/license-status, /api/health, / respond. No behavior change.
2026-05-08 16:53:06 -05:00
parent 1c78e46ebd
commit 4c3cb6a077
3 changed files with 122 additions and 76 deletions
@@ -0,0 +1,86 @@
+// Audio I/O helpers — ffprobe for metadata, ffmpeg for splitting, plus
+// HTTP downloading for podcast episodes. Pure module: no state, no
+// Express, only takes paths/URLs and returns data.
+
+import { execFile } from "child_process";
+import { promisify } from "util";
+import path from "path";
+import http from "http";
+import https from "https";
+import { createWriteStream } from "fs";
+
+const execFileAsync = promisify(execFile);
+
+// ── Audio duration via ffprobe ──────────────────────────────────────────────
+// Resolves to the format-level duration of `filePath` in seconds, or to null
+// (never rejecting) when ffprobe fails, exceeds the 15 s timeout, or prints
+// something that does not parse as a number — i.e. "duration unknown".
+export async function getAudioDuration(filePath) {
+  const probeArgs = [
+    "-v", "error",
+    "-show_entries", "format=duration",
+    "-of", "default=noprint_wrappers=1:nokey=1",
+    filePath,
+  ];
+  try {
+    const probe = await execFileAsync("ffprobe", probeArgs, { timeout: 15000 });
+    const seconds = Number.parseFloat(probe.stdout.trim());
+    return Number.isNaN(seconds) ? null : seconds;
+  } catch {
+    return null;
+  }
+}
+
+// ── Split a long audio file into chunks ─────────────────────────────────────
+// Cuts `inputPath` into consecutive `chunk_<n>.mp3` files of at most
+// `chunkSeconds` each inside `outputDir`, stream-copying (`-acodec copy`)
+// rather than re-encoding. Resolves to null when the duration is unknown or
+// already fits in one chunk, otherwise to `[{ path, startOffset, index }]`.
+export async function splitAudioFile(inputPath, outputDir, chunkSeconds = 2700) {
+  const totalSeconds = await getAudioDuration(inputPath);
+  if (!totalSeconds || totalSeconds <= chunkSeconds) {
+    return null;
+  }
+
+  const pieces = [];
+  for (let index = 0, offset = 0; offset < totalSeconds; index += 1, offset += chunkSeconds) {
+    const piecePath = path.join(outputDir, `chunk_${index}.mp3`);
+    const pieceSeconds = Math.min(chunkSeconds, totalSeconds - offset);
+    const ffmpegArgs = [
+      "-y",
+      "-i", inputPath,
+      "-ss", String(offset),
+      "-t", String(pieceSeconds),
+      "-acodec", "copy",
+      piecePath,
+    ];
+    await execFileAsync("ffmpeg", ffmpegArgs, { timeout: 120000 });
+    pieces.push({ path: piecePath, startOffset: offset, index });
+  }
+  return pieces;
+}
+
+// ── Download a podcast episode by URL ───────────────────────────────────────
+// Streams the HTTP(S) response body straight to `destPath` and resolves
+// (with no value) once the file has been written and closed. Follows up to
+// MAX_REDIRECTS redirects, resolving relative `Location` headers against the
+// URL that issued them. Rejects on a non-200 final status, too many
+// redirects, an unusable URL, a network or file-system error, or a
+// connection that drops before the body is complete. Used by /api/process
+// when the input URL is a podcast episode rather than a YouTube video.
+export function downloadPodcastAudio(audioUrl, destPath) {
+  const MAX_REDIRECTS = 10;
+  return new Promise((resolve, reject) => {
+    // Pipes a 200 response to disk and settles the promise.
+    const saveBody = (res) => {
+      const fileStream = createWriteStream(destPath);
+      const fail = (err) => {
+        // Stop both ends so neither the socket nor the file handle leaks.
+        res.destroy();
+        fileStream.destroy();
+        reject(err);
+      };
+      res.on("error", fail);
+      res.on("close", () => {
+        // pipe() does not forward source failures; without this check a
+        // dropped connection would leave the promise pending forever.
+        if (!res.complete) {
+          fail(new Error("Connection closed before podcast audio finished downloading"));
+        }
+      });
+      fileStream.on("error", fail);
+      fileStream.on("finish", () => {
+        fileStream.close((err) => (err ? reject(err) : resolve()));
+      });
+      res.pipe(fileStream);
+    };
+
+    // Issues one GET; `base` is the URL that redirected here (if any).
+    const doFetch = (location, base, redirectsLeft) => {
+      let request;
+      try {
+        // Parsing inside the try turns a malformed URL, or a synchronous
+        // throw from get() (e.g. unsupported protocol), into a rejection
+        // instead of an uncaught exception in a response callback.
+        const target = new URL(location, base);
+        const getter = target.protocol === "https:" ? https : http;
+        request = getter.get(target, (res) => {
+          const { statusCode, headers } = res;
+          if (statusCode >= 300 && statusCode < 400 && headers.location) {
+            res.resume(); // discard the redirect body so the socket is released
+            if (redirectsLeft <= 0) {
+              reject(new Error("Too many redirects downloading podcast audio"));
+            } else {
+              doFetch(headers.location, target, redirectsLeft - 1);
+            }
+            return;
+          }
+          if (statusCode !== 200) {
+            res.resume();
+            reject(new Error(`HTTP ${statusCode} downloading podcast audio`));
+            return;
+          }
+          saveBody(res);
+        });
+      } catch (err) {
+        reject(err);
+        return;
+      }
+      request.on("error", reject);
+    };
+
+    doFetch(audioUrl, undefined, MAX_REDIRECTS);
+  });
+}
@@ -17,8 +17,14 @@ import {
  parseTimestampedTranscript,
  safeText,
  retryGemini,
+  fetchUrl,
 } from "./util.js";
 import { calcCost, buildAnalysisPrompt } from "./gemini-helpers.js";
+import {
+  getAudioDuration,
+  splitAudioFile,
+  downloadPodcastAudio,
+} from "./audio.js";

 const execFileAsync = promisify(execFile);
 const app = express();
@@ -949,19 +955,7 @@ async function fetchUploadDates(videoIds) {
 // ── RSS-based date fetching (bypasses bot detection) ─────────────────────

 // Fetch a URL and return the response body as a string
-function fetchUrl(url) {
-  return new Promise((resolve, reject) => {
-    https.get(url, (res) => {
-      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
-        return fetchUrl(res.headers.location).then(resolve, reject);
-      }
-      let data = "";
-      res.on("data", (chunk) => (data += chunk));
-      res.on("end", () => resolve(data));
-      res.on("error", reject);
-    }).on("error", reject);
-  });
-}
+// fetchUrl moved to ./util.js

 // Get channel_id from a YouTube channel/playlist URL using yt-dlp
 async function getChannelId(url) {
@@ -1086,26 +1080,7 @@ async function parsePodcastRSS(feedUrl, limit = 200) {
 }

 // Download a podcast episode audio file via HTTP(S) to a local path
-function downloadPodcastAudio(audioUrl, destPath) {
-  return new Promise((resolve, reject) => {
-    const doFetch = (url) => {
-      const getter = url.startsWith("https") ? https : http;
-      getter.get(url, (res) => {
-        if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
-          return doFetch(res.headers.location);
-        }
-        if (res.statusCode !== 200) {
-          return reject(new Error(`HTTP ${res.statusCode} downloading podcast audio`));
-        }
-        const fileStream = createWriteStream(destPath);
-        res.pipe(fileStream);
-        fileStream.on("finish", () => fileStream.close(resolve));
-        fileStream.on("error", reject);
-      }).on("error", reject);
-    };
-    doFetch(audioUrl);
-  });
-}
+// downloadPodcastAudio moved to ./audio.js

 // Get channel name from URL
 async function fetchChannelName(url) {
@@ -2638,46 +2613,7 @@ Return ONLY the timestamped transcript, nothing else.`;

 // ── Helpers ────────────────────────────────────────────────────────────────

-// ── Audio duration helper (ffprobe) ─────────────────────────────────────
-async function getAudioDuration(filePath) {
-  try {
-    const { stdout } = await execFileAsync("ffprobe", [
-      "-v", "error",
-      "-show_entries", "format=duration",
-      "-of", "default=noprint_wrappers=1:nokey=1",
-      filePath,
-    ], { timeout: 15000 });
-    const dur = parseFloat(stdout.trim());
-    return isNaN(dur) ? null : dur;
-  } catch {
-    return null;
-  }
-}
-
-// ── Split audio into chunks with ffmpeg ─────────────────────────────────
-async function splitAudioFile(inputPath, outputDir, chunkSeconds = 2700) {
-  const duration = await getAudioDuration(inputPath);
-  if (!duration || duration <= chunkSeconds) return null; // no split needed
-
-  const chunks = [];
-  let startSec = 0;
-  let i = 0;
-  while (startSec < duration) {
-    const chunkPath = path.join(outputDir, `chunk_${i}.mp3`);
-    const segLen = Math.min(chunkSeconds, duration - startSec);
-    await execFileAsync("ffmpeg", [
-      "-y", "-i", inputPath,
-      "-ss", String(startSec),
-      "-t", String(segLen),
-      "-acodec", "copy",
-      chunkPath,
-    ], { timeout: 120000 });
-    chunks.push({ path: chunkPath, startOffset: startSec, index: i });
-    startSec += chunkSeconds;
-    i++;
-  }
-  return chunks;
-}
+// getAudioDuration + splitAudioFile moved to ./audio.js

 // sendEvent / extractVideoId / formatTime / parseTimestampedTranscript moved to ./util.js

@@ -1,6 +1,9 @@
-// Pure helpers — no module-scoped state, no Express, no I/O effects.
-// Anything in here is safe to import from any other module without
-// worrying about ordering or initialization side effects.
+// Stateless helpers — no module-scoped state, no Express, no
+// initialization side effects. Anything in here is safe to import from
+// any other module without worrying about ordering. A few helpers do
+// I/O (fetchUrl) but only when called.
+
+import https from "https";

 // ── SSE helper ──────────────────────────────────────────────────────────────
 // Writes a single Server-Sent Events frame: `event: X\ndata: Y\n\n`.
@@ -89,6 +92,27 @@ export function safeText(result) {
  return "";
 }

+// ── HTTP GET with redirect following ────────────────────────────────────────
+// Fetches `url` over HTTPS and resolves to the response body decoded as a
+// UTF-8 string, whatever the final status code. Node's https module does
+// not follow redirects itself, so this follows up to MAX_REDIRECTS 3xx
+// responses that carry a `Location` header, resolving relative locations
+// against the URL that issued them. Rejects on network errors, on malformed
+// or non-https URLs, and when the redirect cap is exceeded. Used for
+// fetching RSS feeds, channel pages, etc.
+//
+// For binary downloads (e.g. podcast audio), use audio.downloadPodcastAudio
+// — it streams to disk instead of buffering in memory.
+export function fetchUrl(url) {
+  const MAX_REDIRECTS = 10;
+  return new Promise((resolve, reject) => {
+    // Issues one GET; `base` is the URL that redirected here (if any).
+    const attempt = (location, base, redirectsLeft) => {
+      let request;
+      try {
+        // new URL() and https.get() both throw synchronously on bad input;
+        // catching here keeps a bad redirect target from becoming an
+        // uncaught exception inside the response callback.
+        const target = new URL(location, base);
+        request = https.get(target, (res) => {
+          const { statusCode, headers } = res;
+          if (statusCode >= 300 && statusCode < 400 && headers.location) {
+            res.resume(); // discard the redirect body so the socket is released
+            if (redirectsLeft <= 0) {
+              reject(new Error(`Too many redirects fetching ${url}`));
+            } else {
+              attempt(headers.location, target, redirectsLeft - 1);
+            }
+            return;
+          }
+          // Decode as a stream so multi-byte characters split across chunk
+          // boundaries are not corrupted.
+          res.setEncoding("utf8");
+          let data = "";
+          res.on("data", (chunk) => {
+            data += chunk;
+          });
+          res.on("end", () => resolve(data));
+          res.on("error", reject);
+        });
+      } catch (err) {
+        reject(err);
+        return;
+      }
+      request.on("error", reject);
+    };
+
+    attempt(url, undefined, MAX_REDIRECTS);
+  });
+}
+
 // ── Retry helper for transient Gemini API errors ────────────────────────────
 // Retries on 503/429 and on common transient network errors. Linear backoff
 // (delayMs * attempt). The optional `log` callback receives a one-line