# spark-control/image/whisperx_container/Dockerfile

# WhisperX ASR + diarization container for Spark 2 (Blackwell GB10, sm_120).
#
# Replaces the custom Parakeet wrapper + Sortformer overlay with a single
# mainline pipeline: faster-whisper for transcription + pyannote.audio 3.1
# for diarization + wav2vec2 forced alignment for word-level timestamps.
#
# Build (on Spark 2, where Blackwell + nvcr.io credentials are available):
#   docker build -t whisperx-asr:latest .
#
# Run:
#   docker run -d --restart unless-stopped --name whisperx-asr \
#     --gpus all --memory=40g \
#     -p 8002:8002 \
#     -v whisperx-models:/root/.cache/huggingface \
#     -e HF_TOKEN="$(cat ~/.cache/huggingface/token)" \
#     -e WHISPER_MODEL=medium \
#     whisperx-asr:latest
#
# The memory cap is intentional: even if WhisperX hits a pathological input,
# it gets OOM-killed cleanly instead of swap-thrashing the whole Spark.

# Base layer: NVIDIA NGC PyTorch image, pinned to the 25.11 release tag.
FROM nvcr.io/nvidia/pytorch:25.11-py3

# WhisperX relies on ffmpeg for audio decoding. The apt package index is
# removed in the same layer so it never ends up in the image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        ffmpeg && \
    rm -rf /var/lib/apt/lists/*

# Python dependencies: whisperx plus whatever the FastAPI wrapper needs, as
# listed in requirements.txt. The manifest is copied on its own so this layer
# is only rebuilt when requirements.txt changes.
#
# --break-system-packages is required because the NGC PyTorch image marks its
# bundled Python as a "system" (externally managed) interpreter.
COPY requirements.txt /tmp/requirements.txt
RUN pip install \
        --no-cache-dir \
        --break-system-packages \
        --requirement /tmp/requirements.txt

# Pre-warm the default Whisper + alignment models at build time so first-call
# latency on a fresh container is small. (~3 GB cached into the image; if you
# want a smaller image, comment this out and accept the first-call download.)
#
# WHISPER_MODEL is a build arg (override with --build-arg WHISPER_MODEL=...)
# that is also exported as a runtime ENV default; `docker run -e WHISPER_MODEL`
# can still override it, in which case a different model than the pre-warmed
# one may have to be downloaded on first use.
#
# The model name is read from the environment inside Python rather than being
# spliced into the Python source by the shell, so a value containing quotes or
# shell metacharacters cannot break (or alter) the command.
#
# NOTE(review): the documented run command mounts a volume over
# /root/.cache/huggingface; a pre-existing, non-empty volume would hide
# anything this step cached under that path — confirm this is acceptable.
ARG WHISPER_MODEL=medium
ENV WHISPER_MODEL=${WHISPER_MODEL}
RUN python3 -c "import os, whisperx; whisperx.load_model(os.environ['WHISPER_MODEL'], 'cpu', compute_type='int8')" \
 && python3 -c "import whisperx; whisperx.load_align_model(language_code='en', device='cpu')"

# The service lives under /opt/whisperx. The relative COPY destination below
# resolves against this WORKDIR, i.e. the code lands in /opt/whisperx/app.
WORKDIR /opt/whisperx
COPY app ./app

# Port that spark-control's proxy on Spark 2 connects to (documentation only;
# publishing still happens via `docker run -p`).
EXPOSE 8002/tcp

# Liveness probe: GET /health on the service port every 30s. urlopen raises
# (non-zero exit) on connection failure or an HTTP error status, which marks
# the probe as failed.
#
# The 180s start period gives the container time to come up before failures
# count against it. The urlopen timeout (8s) is deliberately shorter than
# Docker's --timeout (10s) so a wedged server makes the probe fail on its own
# with a Python error in the health log, rather than depending on Docker to
# time the probe out.
#
# NOTE(review): assumes app.main:app serves /health — the app code is not
# visible from this file; confirm.
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
  CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8002/health', timeout=8)" || exit 1

# Start the ASGI app (app.main:app, resolved relative to WORKDIR) under uvicorn
# with a single worker, listening on all interfaces on port 8002. Exec form is
# used so uvicorn runs as PID 1 and receives stop signals directly.
CMD ["python3", "-m", "uvicorn", "app.main:app", \
     "--host", "0.0.0.0", "--port", "8002", "--workers", "1"]