From 9919fbf8f87d8fb77b462b0064e3f00c1f6d22fa Mon Sep 17 00:00:00 2001 From: Grant Date: Fri, 8 May 2026 13:00:06 -0500 Subject: [PATCH] =?UTF-8?q?v0.1.0:50=20=E2=80=94=20auto-recover=20from=20s?= =?UTF-8?q?qlx=20checksum=20drift=20on=20idempotent=20migrations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two operators in a row hit the same crash-loop on upgrade: Error: running migrations Caused by: migration 9 was previously applied but has been modified sqlx records a SHA-384 of each migration's bytes when first applied, then verifies the on-disk bytes still match on every subsequent boot. Cross-build drift (trailing newlines, line-ending normalization, etc.) produces different bytes for semantically-identical SQL — and sqlx refuses to start. Recovery required SSHing in and running: sqlite3 /data/keysat.db "DELETE FROM _sqlx_migrations WHERE version = 9;" That's bad UX. Worse, every operator going through this version range hits it once. Self-heal: db::init now wraps sqlx::migrate!().run() with detection for MigrateError::VersionMismatch(N) on a constant allowlist of migrations certified safe to re-run (IDEMPOTENT_MIGRATIONS, just [9] for now). When triggered, the daemon clears the stale row, retries, logs a WARN explaining what happened, and continues. No SSH dance. Allowlist gate is critical — auto-clearing checksums on additive ALTER TABLE migrations like 0010 would error on retry (SQLite has no ADD COLUMN IF NOT EXISTS). Only migrations explicitly designed as drop-and-rebuild (like 0009) and tested via the `migration_NNNN_is_idempotent` pattern in tests/migrations.rs qualify. Regression test in tests/migrations.rs exactly simulates the production incident: 1. apply all migrations cleanly 2. poison v9's recorded checksum with bogus bytes 3. confirm raw sqlx::migrate! bails (proves the poisoning works) 4. call db::init — must succeed by clearing + re-applying v9 5. 
confirm v9 + v10 are both recorded with non-poisoned checksums Test count: 38 (was 37; +1 db_init_self_heals test). For operators currently stuck on the :49 crash-loop: just upgrade to :50 from the StartOS marketplace. The :50 daemon will see the mismatch on first boot, auto-clear v9's row, re-apply (0009 is idempotent by design), and continue to 0010. No manual sqlite3 needed. --- licensing-service/src/db/mod.rs | 80 +++++++++++++++++++++++++-- licensing-service/tests/migrations.rs | 79 ++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 4 deletions(-) diff --git a/licensing-service/src/db/mod.rs b/licensing-service/src/db/mod.rs index 836abfd..aaf8899 100644 --- a/licensing-service/src/db/mod.rs +++ b/licensing-service/src/db/mod.rs @@ -35,11 +35,83 @@ pub async fn init(path: &Path) -> Result { .await .with_context(|| format!("opening sqlite at {}", path.display()))?; - sqlx::migrate!("./migrations") - .run(&pool) - .await - .context("running migrations")?; + run_migrations_with_self_heal(&pool).await?; tracing::info!(path = %path.display(), "database ready"); Ok(pool) } + +/// Migrations that have been certified safe to re-run from scratch. If +/// sqlx complains about a checksum mismatch on one of these (which can +/// happen when the file content shifts subtly between builds — +/// trailing whitespace, line endings, build-host normalization), the +/// daemon clears the row from `_sqlx_migrations` and retries instead +/// of crash-looping. +/// +/// Add a migration's version to this list ONLY when: +/// - It's `CREATE TABLE IF NOT EXISTS` / `INSERT OR IGNORE` style +/// OR a deliberate drop-and-rebuild that produces identical state +/// regardless of starting point. +/// - It does NOT include `ALTER TABLE ADD COLUMN` (that errors on +/// re-apply — SQLite has no `ADD COLUMN IF NOT EXISTS`). +/// - You've tested it via `migration_NNNN_is_idempotent` in +/// `tests/migrations.rs`. 
+const IDEMPOTENT_MIGRATIONS: &[i64] = &[ + 9, // see migrations/0009_discount_codes_set_price.sql — explicitly + // designed as a stash-drop-rebuild-restore that yields the same + // end state regardless of the starting state. Pinned by + // migration_0009_is_idempotent in tests/migrations.rs. +]; + +/// Run migrations with auto-recovery for the +/// `MigrateError::VersionMismatch` case on idempotent migrations. +/// +/// Why this exists: sqlx records a SHA-384 of each migration file's +/// bytes when it's first applied, then verifies the on-disk bytes +/// still match on every subsequent boot. The verification is too +/// strict for our use case — a rebuild-from-clean-source can produce +/// different bytes (trailing newlines, line endings, etc.) even when +/// the SQL semantics are unchanged. Without this self-heal, every +/// such drift requires the operator to SSH in and run +/// `DELETE FROM _sqlx_migrations WHERE version = N` by hand. +/// +/// The auto-clear is gated on `IDEMPOTENT_MIGRATIONS` so we only +/// re-apply migrations we've explicitly certified as safe to re-run. +/// Anything else still propagates the error and crashes the daemon — +/// preventing accidental data corruption from re-running a destructive +/// migration. +async fn run_migrations_with_self_heal(pool: &SqlitePool) -> Result<()> { + use sqlx::migrate::MigrateError; + let migrator = sqlx::migrate!("./migrations"); + match migrator.run(pool).await { + Ok(()) => Ok(()), + Err(MigrateError::VersionMismatch(version)) + if IDEMPOTENT_MIGRATIONS.contains(&version) => + { + tracing::warn!( + migration = version, + "migration {version} checksum mismatch on a known-idempotent migration; \ + clearing _sqlx_migrations row and retrying. This usually means the \ + migration file's bytes drifted subtly between builds (trailing \ + whitespace, line endings) without a semantic change." 
+ ); + sqlx::query("DELETE FROM _sqlx_migrations WHERE version = ?") + .bind(version) + .execute(pool) + .await + .with_context(|| { + format!("clearing _sqlx_migrations row for self-heal of v{version}") + })?; + migrator + .run(pool) + .await + .with_context(|| format!("retry of migrations after self-heal of v{version}"))?; + tracing::info!( + migration = version, + "migration {version} re-applied successfully after checksum self-heal" + ); + Ok(()) + } + Err(e) => Err(e).context("running migrations"), + } +} diff --git a/licensing-service/tests/migrations.rs b/licensing-service/tests/migrations.rs index 15735bb..1dd78df 100644 --- a/licensing-service/tests/migrations.rs +++ b/licensing-service/tests/migrations.rs @@ -397,6 +397,85 @@ async fn migration_0009_is_idempotent() { assert_db_clean(&pool).await.expect("db clean after re-apply"); } +/// Regression for the v0.1.0:48 → :49 incident: the `_sqlx_migrations` +/// table records a checksum for each applied migration; on every +/// subsequent boot sqlx verifies the on-disk bytes still match. +/// Builds across versions can produce subtly different bytes +/// (trailing newlines, line-endings, build-host normalization) for +/// the same semantic SQL, which makes sqlx refuse to start with +/// "migration N was previously applied but has been modified" and +/// crashes the daemon. +/// +/// `db::init` works around this by detecting the +/// `MigrateError::VersionMismatch` for migrations on the +/// `IDEMPOTENT_MIGRATIONS` allowlist (just `9` for now), clearing the +/// stale row, and retrying. This test simulates the exact scenario: +/// poison the recorded checksum for v9, run init, expect success. +#[tokio::test] +async fn db_init_self_heals_checksum_mismatch_on_idempotent_migrations() { + let (pool, _tmp) = make_pool().await; + + // Step 1: apply all migrations cleanly to populate + // _sqlx_migrations with current checksums. 
+ sqlx::migrate!("./migrations") + .run(&pool) + .await + .expect("first apply"); + + // Step 2: poison the recorded checksum for v9. This simulates + // the cross-build drift that triggered the production incident. + let bogus_checksum: Vec<u8> = (0..48).map(|_| 0xEF).collect(); // Sha384 = 48 bytes + let n = sqlx::query("UPDATE _sqlx_migrations SET checksum = ? WHERE version = 9") + .bind(&bogus_checksum) + .execute(&pool) + .await + .unwrap() + .rows_affected(); + assert_eq!(n, 1, "_sqlx_migrations should have a row for v9"); + + // Step 3: confirm sqlx::migrate! ALONE bails — proves the + // poisoning works and that without self-heal the daemon would + // crash here. + let ungated = sqlx::migrate!("./migrations").run(&pool).await; + assert!( + matches!( + ungated, + Err(sqlx::migrate::MigrateError::VersionMismatch(9)) + ), + "raw sqlx::migrate! should reject the poisoned row: got {ungated:?}" + ); + + // Step 4: drop the existing pool and call db::init on the same + // file. The self-heal should clear v9's row, re-apply, succeed. + let tmp_path = _tmp.path().to_path_buf(); + drop(pool); + drop(_tmp); + let healed = keysat::db::init(&tmp_path) + .await + .expect("db::init should self-heal the poisoned v9 row"); + + // Sanity check: v9 is back in _sqlx_migrations with a fresh + // (correct) checksum, and v10 is still there from the original + // apply. + let count: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM _sqlx_migrations WHERE version IN (9, 10)") + .fetch_one(&healed) + .await + .unwrap(); + assert_eq!(count, 2, "both 9 and 10 should be recorded after self-heal"); + + // The poisoned checksum was replaced with the real one.
+ let new_checksum: Vec<u8> = + sqlx::query_scalar("SELECT checksum FROM _sqlx_migrations WHERE version = 9") + .fetch_one(&healed) + .await + .unwrap(); + assert_ne!( + new_checksum, bogus_checksum, + "self-heal must replace the poisoned checksum with the current one" + ); +} + /// Migration 0010 (multi-currency foundation): verifies that the /// backfill correctly populates the new `price_currency` and /// `price_value` columns against products that existed before the