Fix SQLite WAL corruption: checkpoint before export, copy WAL/SHM files, preserve FTS schema via iterdump

This commit is contained in:
2026-04-20 18:28:06 +08:00
parent d7cbf7833f
commit 4ca7c2da6c
2 changed files with 86 additions and 39 deletions

Binary file not shown.

125
sync.sh
View File

@@ -13,25 +13,46 @@ echo "[$(date '+%H:%M:%S')] Sync from $HOSTNAME..."
# with remote when pulling. Remove it before fetch+merge.
rm -f "$SYNC_DIR/state_merged.db"
# ── Step 1: Export local state.db (via temp dir to avoid lock & WAL issues)
# The heredoc delimiter is quoted so the shell performs no expansion inside
# the embedded Python source.
python3 << 'PYEOF'
import sqlite3, os, shutil, tempfile

# Sidecar files SQLite keeps next to a database that is in WAL mode.
SIDECAR_SUFFIXES = ('-wal', '-shm')

local_db = os.path.join(os.path.expanduser('~/.hermes'), 'state.db')
hostname = os.environ.get('HOSTNAME') or __import__('socket').gethostname()
export_db = os.path.join(os.path.expanduser('~/.hermes-sync'), f"state_{hostname}.db")

tmpdir = tempfile.mkdtemp(prefix='hs_exp_')
try:
    # Step 1a: Checkpoint so the main db file holds as much of the WAL as
    # possible. A checkpoint has to write WAL frames into the main database
    # file, which a mode=ro connection cannot do, so open with mode=rw
    # (read-write, but never create a missing database).
    conn = sqlite3.connect(f'file:{local_db}?mode=rw', uri=True)
    try:
        jm = conn.execute('PRAGMA journal_mode').fetchone()[0]
        if jm == 'wal':
            # The pragma returns one row: (busy, wal_frames, checkpointed_frames).
            busy = conn.execute('PRAGMA wal_checkpoint(TRUNCATE)').fetchone()[0]
            if busy:
                # Another connection blocked the checkpoint; the -wal file
                # copied below still carries the frames that were not flushed.
                print(f'WAL checkpoint incomplete, database busy ({jm})')
            else:
                print(f'WAL checkpointed ({jm})')
    finally:
        conn.close()

    # Step 1b: Copy all three files (db + -wal + -shm) if they exist
    tmp_db = os.path.join(tmpdir, 'db')
    for suffix in ('',) + SIDECAR_SUFFIXES:
        src = local_db + suffix
        if os.path.exists(src):
            shutil.copy2(src, tmp_db + suffix)

    # Step 1c: Verify (open read-only to confirm the copy is readable)
    test = sqlite3.connect(f'file:{tmp_db}?mode=ro', uri=True)
    try:
        r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
        m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
    finally:
        test.close()

    # Step 1d: Publish the verified copy. A sidecar left over from an earlier
    # export must not remain beside the new db file: a stale -wal would be
    # replayed onto a database it does not belong to.
    shutil.copy2(tmp_db, export_db)
    for suffix in SIDECAR_SUFFIXES:
        if os.path.exists(tmp_db + suffix):
            shutil.copy2(tmp_db + suffix, export_db + suffix)
        elif os.path.exists(export_db + suffix):
            os.remove(export_db + suffix)
    print(f'Exported: {r}s/{m}m (journal={jm})')
finally:
    shutil.rmtree(tmpdir, ignore_errors=True)
PYEOF
@@ -85,53 +106,54 @@ tmpdir = tempfile.mkdtemp(prefix='hs_merge_')
tmp_merged = os.path.join(tmpdir, 'merged.db')
try:
# Create merged DB with proper schema
# Create merged DB (DELETE journal avoids WAL complications during merge)
conn = sqlite3.connect(tmp_merged)
conn.execute('PRAGMA journal_mode=DELETE')
conn.execute('PRAGMA locking_mode=NORMAL')
conn.execute('PRAGMA synchronous=FULL')
conn.execute('''
CREATE TABLE sessions (
id TEXT PRIMARY KEY, source TEXT NOT NULL, user_id TEXT,
model TEXT, model_config TEXT, system_prompt TEXT,
parent_session_id TEXT, started_at REAL, ended_at REAL,
end_reason TEXT, message_count INTEGER DEFAULT 0,
tool_call_count INTEGER DEFAULT 0, input_tokens INTEGER DEFAULT 0,
output_tokens INTEGER DEFAULT 0, cache_read_tokens INTEGER DEFAULT 0,
cache_write_tokens INTEGER DEFAULT 0, reasoning_tokens INTEGER DEFAULT 0,
billing_provider TEXT, billing_base_url TEXT, billing_mode TEXT,
estimated_cost_usd REAL, actual_cost_usd REAL,
cost_status TEXT, cost_source TEXT, pricing_version TEXT, title TEXT
)''')
# Use one reference DB to get exact schema (handles FTS, etc.)
ref_db = db_files[0]
ref_copy = os.path.join(tmpdir, 'ref.db')
shutil.copy2(ref_db, ref_copy)
if os.path.exists(ref_db + '-wal'):
shutil.copy2(ref_db + '-wal', ref_copy + '-wal')
if os.path.exists(ref_db + '-shm'):
shutil.copy2(ref_db + '-shm', ref_copy + '-shm')
conn.execute('''
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
session_id TEXT NOT NULL, role TEXT NOT NULL, content TEXT,
tool_call_id TEXT, tool_calls TEXT, tool_name TEXT,
timestamp REAL NOT NULL, token_count INTEGER,
finish_reason TEXT, reasoning TEXT, reasoning_details TEXT,
codex_reasoning_items TEXT
)''')
conn.execute('CREATE INDEX idx_msg_session ON messages(session_id)')
conn.execute('CREATE INDEX idx_msg_ts ON messages(timestamp)')
# Replay the reference database's SQL dump into the merged database so the
# merged file gets the same schema objects as the reference copy.
# NOTE(review): iterdump() yields data INSERT statements as well as DDL, so
# rows of the reference DB are inserted here too — confirm that is intended.
ref = sqlite3.connect(f'file:{ref_copy}?mode=ro', uri=True)
for line in ref.iterdump():
# Skip FTS virtual table data rows (they are derived, not real data)
# NOTE(review): iterdump() appears to emit double-quoted table names
# (INSERT INTO "name" ...); if so these unquoted prefixes never match —
# verify against the actual dump output.
if line.startswith('INSERT INTO messages_fts') or \
line.startswith('DELETE FROM messages_fts'):
continue
try:
conn.execute(line)
except Exception as e:
# NOTE(review): every failure is discarded here, not only schema errors;
# a failed data INSERT would go unnoticed as well.
pass # Ignore schema errors from partial DDL
ref.close()
# Remove the temporary reference copy and its -wal sidecar.
# NOTE(review): a '-shm' sidecar is copied earlier but not removed here;
# presumably it is cleaned up together with tmpdir — confirm.
os.remove(ref_copy)
if os.path.exists(ref_copy + '-wal'):
os.remove(ref_copy + '-wal')
# Copy sessions (INSERT OR REPLACE to dedupe by PK)
for db_file in db_files:
name = os.path.basename(db_file)
# Copy to temp file first (avoids WAL lock issues with open connections)
# Copy to temp with WAL files
tmp_copy = os.path.join(tmpdir, name)
shutil.copy2(db_file, tmp_copy)
# Also copy WAL if exists
if os.path.exists(db_file + '-wal'):
shutil.copy2(db_file + '-wal', tmp_copy + '-wal')
if os.path.exists(db_file + '-shm'):
shutil.copy2(db_file + '-shm', tmp_copy + '-shm')
src = sqlite3.connect(tmp_copy)
src = sqlite3.connect(f'file:{tmp_copy}?mode=ro', uri=True)
s_cnt = src.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
m_cnt = src.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
print(f' {name}: {s_cnt}s/{m_cnt}m')
jm = src.execute('PRAGMA journal_mode').fetchone()[0]
print(f' {name}: {s_cnt}s/{m_cnt}m journal={jm}')
sess_rows = src.execute('SELECT * FROM sessions').fetchall()
sess_cols = len(src.execute('PRAGMA table_info(sessions)').fetchall())
@@ -145,8 +167,9 @@ try:
src.close()
os.remove(tmp_copy)
if os.path.exists(tmp_copy + '-wal'):
os.remove(tmp_copy + '-wal')
for suf in ['-wal', '-shm']:
if os.path.exists(tmp_copy + suf):
os.remove(tmp_copy + suf)
conn.commit()
conn.close()
@@ -154,6 +177,10 @@ try:
if os.path.exists(merged_path):
os.remove(merged_path)
shutil.copy2(tmp_merged, merged_path)
# Ensure no WAL on merged (merge output should be clean DELETE)
for suf in ['-wal', '-shm']:
if os.path.exists(merged_path + suf):
os.remove(merged_path + suf)
print(f'Merged: {os.path.getsize(merged_path)/1024:.0f} KB')
finally:
@@ -188,16 +215,36 @@ if not os.path.exists(merged_path):
else:
tmpdir = tempfile.mkdtemp(prefix='hs_rest_')
try:
# Step 7a: Ensure merged db is fully checkpointed (DELETE journal mode)
# Reads the journal mode of the merged database and, when it reports 'wal',
# requests a TRUNCATE checkpoint before the file is copied.
# NOTE(review): the connection is opened with mode=ro, but a checkpoint has
# to write WAL frames into the main database file; on a read-only handle
# this call is expected to fail or flush nothing — confirm, and open with
# mode=rw if the checkpoint is really required.
merge_conn = sqlite3.connect(f'file:{merged_path}?mode=ro', uri=True)
merge_jm = merge_conn.execute('PRAGMA journal_mode').fetchone()[0]
if merge_jm == 'wal':
# NOTE(review): the (busy, log, checkpointed) result row is not inspected,
# so a blocked checkpoint is still reported as done by the print below.
merge_conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
print(f'Restored merged DB had WAL, checkpointed ({merge_jm})')
merge_conn.close()
tmp_db = os.path.join(tmpdir, 'db')
shutil.copy2(merged_path, tmp_db)
# Also copy -wal/-shm if present
for suf in ['-wal', '-shm']:
if os.path.exists(merged_path + suf):
shutil.copy2(merged_path + suf, tmp_db + suf)
test = sqlite3.connect(tmp_db)
# Step 7b: Verify before overwriting
# Opens the temporary copy read-only and counts rows in both tables; a
# missing table or an unreadable file raises here, before local_db is touched.
test = sqlite3.connect(f'file:{tmp_db}?mode=ro', uri=True)
r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
# NOTE(review): integrity_check reports problems as result rows ('ok' when
# clean), not as exceptions; the rows are never fetched or compared here,
# so a reported corruption would not stop the restore — confirm and check
# the result if this is meant to be a gate.
test.execute('PRAGMA integrity_check') # verify
test.close()
shutil.copy2(local_db, local_db + '.bak')
shutil.copy2(tmp_db, local_db)
# Also copy WAL files to local
for suf in ['-wal', '-shm']:
if os.path.exists(tmp_db + suf):
shutil.copy2(tmp_db + suf, local_db + suf)
elif os.path.exists(local_db + suf):
os.remove(local_db + suf)
os.remove(local_db + '.bak')
print(f'Restored: {r}s/{m}m')
finally: