# WhisperX ASR + diarization container for Spark 2 (Blackwell GB10, sm_120).
#
# Replaces the custom Parakeet wrapper + Sortformer overlay with a single
# mainline pipeline: faster-whisper for transcription + pyannote.audio 3.1
# for diarization + wav2vec2 forced alignment for word-level timestamps.
#
# Build (on Spark 2, where Blackwell + nvcr.io credentials are available):
#   docker build -t whisperx-asr:latest .
#
# Run:
#   docker run -d --restart unless-stopped --name whisperx-asr \
#     --gpus all --memory=40g \
#     -p 8002:8002 \
#     -v whisperx-models:/root/.cache/huggingface \
#     -e HF_TOKEN="$(cat ~/.cache/huggingface/token)" \
#     -e WHISPER_MODEL=medium \
#     whisperx-asr:latest
#
# The memory cap is intentional: even if WhisperX hits a pathological input,
# it gets OOM-killed cleanly instead of swap-thrashing the whole Spark.
#
# HF_TOKEN is supplied at run time only; no credential is baked into the image.

FROM nvcr.io/nvidia/pytorch:25.11-py3

# WhisperX runs ffmpeg under the hood for audio decoding.
# build-essential + cmake + git + ninja-build are needed to build torchaudio
# from source (see below). NOTE: they are NOT removed afterwards and remain in
# the final image. Deleting them in a later layer would not reclaim any space;
# if image size matters, move the torchaudio build into a separate build stage.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        ffmpeg \
        git \
        ninja-build \
 && rm -rf /var/lib/apt/lists/*

# Pin torch + torchvision to whatever NGC actually shipped so pip can't swap
# them out when it satisfies whisperx/pyannote deps. (NGC's torch is a custom
# build with a non-standard local version like "2.10.0a0+b558c986e8.nv25.11"
# — stock pip wheels would clobber it and break the ABI.)
# The "\n" sequences are left untouched by the shell inside double quotes and
# are interpreted by Python, producing one requirement per line.
RUN python3 -c "import torch, torchvision; \
        import sys; \
        sys.stdout.write(f'torch=={torch.__version__}\ntorchvision=={torchvision.__version__}\n')" \
        > /tmp/torch-constraints.txt \
 && echo '── pinned torch versions ──' && cat /tmp/torch-constraints.txt

# NGC PyTorch images don't include torchaudio (NVIDIA optimizes for
# vision/text workloads). Stock torchaudio wheels are ABI-incompatible with
# NGC's custom torch 2.10a, so the only working option is building from
# source against the NGC torch already in the image. Pinning to v2.5.1 — the
# last torchaudio tag that builds cleanly against torch 2.5–2.10 and is a
# proven compatibility target.
#
# NOTE(review): these are build-time knobs but, being ENV, they also persist
# into the runtime environment. Kept as ENV so the runtime environment of the
# image does not change.
ENV USE_CUDA=1 BUILD_SOX=0 TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"

# --no-build-isolation: compile in the current environment so the build sees
#   the NGC torch, instead of an isolated build env that has no torch at all.
# -c torch-constraints: pip must not replace torch/torchvision while resolving
#   torchaudio's own dependencies.
RUN pip install --break-system-packages --no-cache-dir --no-build-isolation \
        -c /tmp/torch-constraints.txt \
        git+https://github.com/pytorch/audio.git@v2.5.1 \
 && python3 -c "import torchaudio; print('torchaudio built:', torchaudio.__version__)"

# Append torchaudio to constraints so pip can't replace it later.
RUN python3 -c "import torchaudio; print(f'torchaudio=={torchaudio.__version__}')" \
        >> /tmp/torch-constraints.txt \
 && echo '── final pinned versions ──' && cat /tmp/torch-constraints.txt

# Install whisperx + the FastAPI wrapper deps under the torch+torchaudio
# constraint. pip will satisfy whisperx/pyannote without swapping any of the
# pytorch-family packages.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --break-system-packages --no-cache-dir \
        -c /tmp/torch-constraints.txt -r /tmp/requirements.txt

# Pre-warm the default Whisper + alignment models at build time so first-call
# latency on a fresh container is small. (~3 GB cached into the image; if you
# want a smaller image, comment this out and accept the first-call download.)
# ${WHISPER_MODEL} is expanded by the shell from the ENV set just above; it can
# be overridden at build time with --build-arg WHISPER_MODEL=<name>.
ARG WHISPER_MODEL=medium
ENV WHISPER_MODEL=${WHISPER_MODEL}
RUN python3 -c "import whisperx; whisperx.load_model('${WHISPER_MODEL}', 'cpu', compute_type='int8')" \
 && python3 -c "import whisperx; whisperx.load_align_model(language_code='en', device='cpu')"

WORKDIR /opt/whisperx
COPY app /opt/whisperx/app

# Expose for spark-control's proxy on Spark 2
EXPOSE 8002

# Probe the app's /health endpoint with the Python stdlib. urlopen raises on a
# connection failure or HTTP error status, which makes the probe exit non-zero.
# The 180s start period gives the service time to come up before failures count.
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s \
    CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8002/health')" || exit 1

# Exec form so uvicorn runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["python3", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8002", "--workers", "1"]