Fix SQLite WAL corruption: checkpoint before export, copy WAL/SHM files, preserve FTS schema via iterdump

This commit is contained in:
2026-04-20 18:28:06 +08:00
parent d7cbf7833f
commit 4ca7c2da6c
2 changed files with 86 additions and 39 deletions

Binary file not shown.

125
sync.sh
View File

@@ -13,25 +13,46 @@ echo "[$(date '+%H:%M:%S')] Sync from $HOSTNAME..."
# with remote when pulling. Remove it before fetch+merge. # with remote when pulling. Remove it before fetch+merge.
rm -f "$SYNC_DIR/state_merged.db" rm -f "$SYNC_DIR/state_merged.db"
# ── Step 1: Export local state.db (via temp dir to avoid lock) ─────────── # ── Step 1: Export local state.db (via temp dir to avoid lock & WAL issues)
python3 << PYEOF python3 << 'PYEOF'
import sqlite3, os, shutil, tempfile import sqlite3, os, shutil, tempfile
local_db = os.path.join(os.path.expanduser('~/.hermes'), 'state.db') local_db = os.path.join(os.path.expanduser('~/.hermes'), 'state.db')
export_db = os.path.join(os.path.expanduser('~/.hermes-sync'), f"state_{os.environ.get('HOSTNAME') or __import__('socket').gethostname()}.db") export_db = os.path.join(os.path.expanduser('~/.hermes-sync'), f"state_{os.environ.get('HOSTNAME') or __import__('socket').gethostname()}.db")
hostname = os.environ.get('HOSTNAME') or __import__('socket').gethostname()
tmpdir = tempfile.mkdtemp(prefix='hs_exp_') tmpdir = tempfile.mkdtemp(prefix='hs_exp_')
try: try:
tmp_db = os.path.join(tmpdir, 'db') # Step 1a: Open with WAL mode and checkpoint (forces WAL -> main db)
shutil.copy2(local_db, tmp_db) conn = sqlite3.connect(f'file:{local_db}?mode=ro', uri=True)
jm = conn.execute('PRAGMA journal_mode').fetchone()[0]
test = sqlite3.connect(tmp_db) if jm == 'wal':
# WAL mode: checkpoint first to flush WAL into main db
conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
print(f'WAL checkpointed ({jm})')
conn.close()
# Step 1b: Copy all three files (db + -wal + -shm) if they exist
tmp_db = os.path.join(tmpdir, 'db')
for suffix in ['', '-wal', '-shm']:
src = local_db + suffix
if os.path.exists(src):
shutil.copy2(src, tmp_db + suffix)
# Step 1c: Verify (open read-only to confirm it's not corrupted)
test = sqlite3.connect(f'file:{tmp_db}?mode=ro', uri=True)
r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0] r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0] m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
test.close() test.close()
shutil.copy2(tmp_db, export_db) shutil.copy2(tmp_db, export_db)
print(f'Exported: {r}s/{m}m') if os.path.exists(tmp_db + '-wal'):
shutil.copy2(tmp_db + '-wal', export_db + '-wal')
if os.path.exists(tmp_db + '-shm'):
shutil.copy2(tmp_db + '-shm', export_db + '-shm')
print(f'Exported: {r}s/{m}m (journal={jm})')
finally: finally:
shutil.rmtree(tmpdir, ignore_errors=True) shutil.rmtree(tmpdir, ignore_errors=True)
PYEOF PYEOF
@@ -85,53 +106,54 @@ tmpdir = tempfile.mkdtemp(prefix='hs_merge_')
tmp_merged = os.path.join(tmpdir, 'merged.db') tmp_merged = os.path.join(tmpdir, 'merged.db')
try: try:
# Create merged DB with proper schema # Create merged DB (DELETE journal avoids WAL complications during merge)
conn = sqlite3.connect(tmp_merged) conn = sqlite3.connect(tmp_merged)
conn.execute('PRAGMA journal_mode=DELETE') conn.execute('PRAGMA journal_mode=DELETE')
conn.execute('PRAGMA locking_mode=NORMAL') conn.execute('PRAGMA locking_mode=NORMAL')
conn.execute('PRAGMA synchronous=FULL') conn.execute('PRAGMA synchronous=FULL')
conn.execute(''' # Use one reference DB to get exact schema (handles FTS, etc.)
CREATE TABLE sessions ( ref_db = db_files[0]
id TEXT PRIMARY KEY, source TEXT NOT NULL, user_id TEXT, ref_copy = os.path.join(tmpdir, 'ref.db')
model TEXT, model_config TEXT, system_prompt TEXT, shutil.copy2(ref_db, ref_copy)
parent_session_id TEXT, started_at REAL, ended_at REAL, if os.path.exists(ref_db + '-wal'):
end_reason TEXT, message_count INTEGER DEFAULT 0, shutil.copy2(ref_db + '-wal', ref_copy + '-wal')
tool_call_count INTEGER DEFAULT 0, input_tokens INTEGER DEFAULT 0, if os.path.exists(ref_db + '-shm'):
output_tokens INTEGER DEFAULT 0, cache_read_tokens INTEGER DEFAULT 0, shutil.copy2(ref_db + '-shm', ref_copy + '-shm')
cache_write_tokens INTEGER DEFAULT 0, reasoning_tokens INTEGER DEFAULT 0,
billing_provider TEXT, billing_base_url TEXT, billing_mode TEXT,
estimated_cost_usd REAL, actual_cost_usd REAL,
cost_status TEXT, cost_source TEXT, pricing_version TEXT, title TEXT
)''')
conn.execute(''' ref = sqlite3.connect(f'file:{ref_copy}?mode=ro', uri=True)
CREATE TABLE messages ( for line in ref.iterdump():
id INTEGER PRIMARY KEY, # Skip FTS virtual table data rows (they are derived, not real data)
session_id TEXT NOT NULL, role TEXT NOT NULL, content TEXT, if line.startswith('INSERT INTO messages_fts') or \
tool_call_id TEXT, tool_calls TEXT, tool_name TEXT, line.startswith('DELETE FROM messages_fts'):
timestamp REAL NOT NULL, token_count INTEGER, continue
finish_reason TEXT, reasoning TEXT, reasoning_details TEXT, try:
codex_reasoning_items TEXT conn.execute(line)
)''') except Exception as e:
pass # Ignore schema errors from partial DDL
conn.execute('CREATE INDEX idx_msg_session ON messages(session_id)') ref.close()
conn.execute('CREATE INDEX idx_msg_ts ON messages(timestamp)') os.remove(ref_copy)
if os.path.exists(ref_copy + '-wal'):
os.remove(ref_copy + '-wal')
# Copy sessions (INSERT OR REPLACE to dedupe by PK)
for db_file in db_files: for db_file in db_files:
name = os.path.basename(db_file) name = os.path.basename(db_file)
# Copy to temp file first (avoids WAL lock issues with open connections)
# Copy to temp with WAL files
tmp_copy = os.path.join(tmpdir, name) tmp_copy = os.path.join(tmpdir, name)
shutil.copy2(db_file, tmp_copy) shutil.copy2(db_file, tmp_copy)
# Also copy WAL if exists
if os.path.exists(db_file + '-wal'): if os.path.exists(db_file + '-wal'):
shutil.copy2(db_file + '-wal', tmp_copy + '-wal') shutil.copy2(db_file + '-wal', tmp_copy + '-wal')
if os.path.exists(db_file + '-shm'):
shutil.copy2(db_file + '-shm', tmp_copy + '-shm')
src = sqlite3.connect(tmp_copy) src = sqlite3.connect(f'file:{tmp_copy}?mode=ro', uri=True)
s_cnt = src.execute('SELECT COUNT(*) FROM sessions').fetchone()[0] s_cnt = src.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
m_cnt = src.execute('SELECT COUNT(*) FROM messages').fetchone()[0] m_cnt = src.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
print(f' {name}: {s_cnt}s/{m_cnt}m') jm = src.execute('PRAGMA journal_mode').fetchone()[0]
print(f' {name}: {s_cnt}s/{m_cnt}m journal={jm}')
sess_rows = src.execute('SELECT * FROM sessions').fetchall() sess_rows = src.execute('SELECT * FROM sessions').fetchall()
sess_cols = len(src.execute('PRAGMA table_info(sessions)').fetchall()) sess_cols = len(src.execute('PRAGMA table_info(sessions)').fetchall())
@@ -145,8 +167,9 @@ try:
src.close() src.close()
os.remove(tmp_copy) os.remove(tmp_copy)
if os.path.exists(tmp_copy + '-wal'): for suf in ['-wal', '-shm']:
os.remove(tmp_copy + '-wal') if os.path.exists(tmp_copy + suf):
os.remove(tmp_copy + suf)
conn.commit() conn.commit()
conn.close() conn.close()
@@ -154,6 +177,10 @@ try:
if os.path.exists(merged_path): if os.path.exists(merged_path):
os.remove(merged_path) os.remove(merged_path)
shutil.copy2(tmp_merged, merged_path) shutil.copy2(tmp_merged, merged_path)
# Ensure no -wal/-shm sidecar files remain next to the merged DB (it is written in DELETE journal mode, so none are expected)
for suf in ['-wal', '-shm']:
if os.path.exists(merged_path + suf):
os.remove(merged_path + suf)
print(f'Merged: {os.path.getsize(merged_path)/1024:.0f} KB') print(f'Merged: {os.path.getsize(merged_path)/1024:.0f} KB')
finally: finally:
@@ -188,16 +215,36 @@ if not os.path.exists(merged_path):
else: else:
tmpdir = tempfile.mkdtemp(prefix='hs_rest_') tmpdir = tempfile.mkdtemp(prefix='hs_rest_')
try: try:
# Step 7a: If the merged DB reports WAL journal mode, checkpoint it before copying (DELETE journal mode is expected)
merge_conn = sqlite3.connect(f'file:{merged_path}?mode=ro', uri=True)
merge_jm = merge_conn.execute('PRAGMA journal_mode').fetchone()[0]
if merge_jm == 'wal':
merge_conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
print(f'Restored merged DB had WAL, checkpointed ({merge_jm})')
merge_conn.close()
tmp_db = os.path.join(tmpdir, 'db') tmp_db = os.path.join(tmpdir, 'db')
shutil.copy2(merged_path, tmp_db) shutil.copy2(merged_path, tmp_db)
# Also copy -wal/-shm if present
for suf in ['-wal', '-shm']:
if os.path.exists(merged_path + suf):
shutil.copy2(merged_path + suf, tmp_db + suf)
test = sqlite3.connect(tmp_db) # Step 7b: Verify before overwriting
test = sqlite3.connect(f'file:{tmp_db}?mode=ro', uri=True)
r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0] r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0] m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
test.execute('PRAGMA integrity_check') # verify
test.close() test.close()
shutil.copy2(local_db, local_db + '.bak') shutil.copy2(local_db, local_db + '.bak')
shutil.copy2(tmp_db, local_db) shutil.copy2(tmp_db, local_db)
# Also copy -wal/-shm files to local; remove stale local ones when the copy has none
for suf in ['-wal', '-shm']:
if os.path.exists(tmp_db + suf):
shutil.copy2(tmp_db + suf, local_db + suf)
elif os.path.exists(local_db + suf):
os.remove(local_db + suf)
os.remove(local_db + '.bak') os.remove(local_db + '.bak')
print(f'Restored: {r}s/{m}m') print(f'Restored: {r}s/{m}m')
finally: finally: