Fix SQLite WAL corruption: checkpoint before export, copy WAL/SHM files, preserve FTS schema via iterdump
This commit is contained in:
Binary file not shown.
125
sync.sh
125
sync.sh
@@ -13,25 +13,46 @@ echo "[$(date '+%H:%M:%S')] Sync from $HOSTNAME..."
|
||||
# with remote when pulling. Remove it before fetch+merge.
|
||||
rm -f "$SYNC_DIR/state_merged.db"
|
||||
|
||||
# ── Step 1: Export local state.db (via temp dir to avoid lock) ────────────
|
||||
python3 << PYEOF
|
||||
# ── Step 1: Export local state.db (via temp dir to avoid lock & WAL issues) ─
|
||||
python3 << 'PYEOF'
|
||||
import sqlite3, os, shutil, tempfile
|
||||
|
||||
local_db = os.path.join(os.path.expanduser('~/.hermes'), 'state.db')
|
||||
export_db = os.path.join(os.path.expanduser('~/.hermes-sync'), f"state_{os.environ.get('HOSTNAME') or __import__('socket').gethostname()}.db")
|
||||
hostname = os.environ.get('HOSTNAME') or __import__('socket').gethostname()
|
||||
tmpdir = tempfile.mkdtemp(prefix='hs_exp_')
|
||||
|
||||
try:
|
||||
tmp_db = os.path.join(tmpdir, 'db')
|
||||
shutil.copy2(local_db, tmp_db)
|
||||
# Step 1a: Open the live DB, detect its journal mode, and checkpoint if it is WAL (flushes WAL -> main db)
|
||||
conn = sqlite3.connect(f'file:{local_db}?mode=ro', uri=True)
|
||||
jm = conn.execute('PRAGMA journal_mode').fetchone()[0]
|
||||
|
||||
test = sqlite3.connect(tmp_db)
|
||||
if jm == 'wal':
|
||||
# WAL mode: checkpoint first to flush WAL into main db
|
||||
conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
|
||||
print(f'WAL checkpointed ({jm})')
|
||||
conn.close()
|
||||
|
||||
# Step 1b: Copy all three files (db + -wal + -shm) if they exist
|
||||
tmp_db = os.path.join(tmpdir, 'db')
|
||||
for suffix in ['', '-wal', '-shm']:
|
||||
src = local_db + suffix
|
||||
if os.path.exists(src):
|
||||
shutil.copy2(src, tmp_db + suffix)
|
||||
|
||||
# Step 1c: Verify (open the copy read-only and confirm the sessions/messages tables can be read)
|
||||
test = sqlite3.connect(f'file:{tmp_db}?mode=ro', uri=True)
|
||||
r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
|
||||
m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
|
||||
test.close()
|
||||
|
||||
shutil.copy2(tmp_db, export_db)
|
||||
print(f'Exported: {r}s/{m}m')
|
||||
if os.path.exists(tmp_db + '-wal'):
|
||||
shutil.copy2(tmp_db + '-wal', export_db + '-wal')
|
||||
if os.path.exists(tmp_db + '-shm'):
|
||||
shutil.copy2(tmp_db + '-shm', export_db + '-shm')
|
||||
|
||||
print(f'Exported: {r}s/{m}m (journal={jm})')
|
||||
finally:
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
PYEOF
|
||||
@@ -85,53 +106,54 @@ tmpdir = tempfile.mkdtemp(prefix='hs_merge_')
|
||||
tmp_merged = os.path.join(tmpdir, 'merged.db')
|
||||
|
||||
try:
|
||||
# Create merged DB with proper schema
|
||||
# Create merged DB (DELETE journal avoids WAL complications during merge)
|
||||
conn = sqlite3.connect(tmp_merged)
|
||||
conn.execute('PRAGMA journal_mode=DELETE')
|
||||
conn.execute('PRAGMA locking_mode=NORMAL')
|
||||
conn.execute('PRAGMA synchronous=FULL')
|
||||
|
||||
conn.execute('''
|
||||
CREATE TABLE sessions (
|
||||
id TEXT PRIMARY KEY, source TEXT NOT NULL, user_id TEXT,
|
||||
model TEXT, model_config TEXT, system_prompt TEXT,
|
||||
parent_session_id TEXT, started_at REAL, ended_at REAL,
|
||||
end_reason TEXT, message_count INTEGER DEFAULT 0,
|
||||
tool_call_count INTEGER DEFAULT 0, input_tokens INTEGER DEFAULT 0,
|
||||
output_tokens INTEGER DEFAULT 0, cache_read_tokens INTEGER DEFAULT 0,
|
||||
cache_write_tokens INTEGER DEFAULT 0, reasoning_tokens INTEGER DEFAULT 0,
|
||||
billing_provider TEXT, billing_base_url TEXT, billing_mode TEXT,
|
||||
estimated_cost_usd REAL, actual_cost_usd REAL,
|
||||
cost_status TEXT, cost_source TEXT, pricing_version TEXT, title TEXT
|
||||
)''')
|
||||
# Use one reference DB to get exact schema (handles FTS, etc.)
|
||||
ref_db = db_files[0]
|
||||
ref_copy = os.path.join(tmpdir, 'ref.db')
|
||||
shutil.copy2(ref_db, ref_copy)
|
||||
if os.path.exists(ref_db + '-wal'):
|
||||
shutil.copy2(ref_db + '-wal', ref_copy + '-wal')
|
||||
if os.path.exists(ref_db + '-shm'):
|
||||
shutil.copy2(ref_db + '-shm', ref_copy + '-shm')
|
||||
|
||||
conn.execute('''
|
||||
CREATE TABLE messages (
|
||||
id INTEGER PRIMARY KEY,
|
||||
session_id TEXT NOT NULL, role TEXT NOT NULL, content TEXT,
|
||||
tool_call_id TEXT, tool_calls TEXT, tool_name TEXT,
|
||||
timestamp REAL NOT NULL, token_count INTEGER,
|
||||
finish_reason TEXT, reasoning TEXT, reasoning_details TEXT,
|
||||
codex_reasoning_items TEXT
|
||||
)''')
|
||||
|
||||
conn.execute('CREATE INDEX idx_msg_session ON messages(session_id)')
|
||||
conn.execute('CREATE INDEX idx_msg_ts ON messages(timestamp)')
|
||||
ref = sqlite3.connect(f'file:{ref_copy}?mode=ro', uri=True)
|
||||
for line in ref.iterdump():
|
||||
# Skip FTS virtual table data rows (they are derived, not real data)
|
||||
if line.startswith('INSERT INTO messages_fts') or \
|
||||
line.startswith('DELETE FROM messages_fts'):
|
||||
continue
|
||||
try:
|
||||
conn.execute(line)
|
||||
except Exception as e:
|
||||
pass # Ignore schema errors from partial DDL
|
||||
ref.close()
|
||||
os.remove(ref_copy)
|
||||
if os.path.exists(ref_copy + '-wal'):
|
||||
os.remove(ref_copy + '-wal')
|
||||
|
||||
# Copy sessions (INSERT OR REPLACE to dedupe by PK)
|
||||
for db_file in db_files:
|
||||
name = os.path.basename(db_file)
|
||||
# Copy to temp file first (avoids WAL lock issues with open connections)
|
||||
|
||||
# Copy to temp with WAL files
|
||||
tmp_copy = os.path.join(tmpdir, name)
|
||||
shutil.copy2(db_file, tmp_copy)
|
||||
# Also copy WAL if exists
|
||||
if os.path.exists(db_file + '-wal'):
|
||||
shutil.copy2(db_file + '-wal', tmp_copy + '-wal')
|
||||
if os.path.exists(db_file + '-shm'):
|
||||
shutil.copy2(db_file + '-shm', tmp_copy + '-shm')
|
||||
|
||||
src = sqlite3.connect(tmp_copy)
|
||||
src = sqlite3.connect(f'file:{tmp_copy}?mode=ro', uri=True)
|
||||
|
||||
s_cnt = src.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
|
||||
m_cnt = src.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
|
||||
print(f' {name}: {s_cnt}s/{m_cnt}m')
|
||||
jm = src.execute('PRAGMA journal_mode').fetchone()[0]
|
||||
print(f' {name}: {s_cnt}s/{m_cnt}m journal={jm}')
|
||||
|
||||
sess_rows = src.execute('SELECT * FROM sessions').fetchall()
|
||||
sess_cols = len(src.execute('PRAGMA table_info(sessions)').fetchall())
|
||||
@@ -145,8 +167,9 @@ try:
|
||||
|
||||
src.close()
|
||||
os.remove(tmp_copy)
|
||||
if os.path.exists(tmp_copy + '-wal'):
|
||||
os.remove(tmp_copy + '-wal')
|
||||
for suf in ['-wal', '-shm']:
|
||||
if os.path.exists(tmp_copy + suf):
|
||||
os.remove(tmp_copy + suf)
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@@ -154,6 +177,10 @@ try:
|
||||
if os.path.exists(merged_path):
|
||||
os.remove(merged_path)
|
||||
shutil.copy2(tmp_merged, merged_path)
|
||||
# Remove any leftover -wal/-shm files next to the merged DB (merge output uses DELETE journal mode)
|
||||
for suf in ['-wal', '-shm']:
|
||||
if os.path.exists(merged_path + suf):
|
||||
os.remove(merged_path + suf)
|
||||
print(f'Merged: {os.path.getsize(merged_path)/1024:.0f} KB')
|
||||
|
||||
finally:
|
||||
@@ -188,16 +215,36 @@ if not os.path.exists(merged_path):
|
||||
else:
|
||||
tmpdir = tempfile.mkdtemp(prefix='hs_rest_')
|
||||
try:
|
||||
# Step 7a: If the merged db reports WAL journal mode, checkpoint it (merge output is expected to use DELETE journal mode)
|
||||
merge_conn = sqlite3.connect(f'file:{merged_path}?mode=ro', uri=True)
|
||||
merge_jm = merge_conn.execute('PRAGMA journal_mode').fetchone()[0]
|
||||
if merge_jm == 'wal':
|
||||
merge_conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
|
||||
print(f'Restored merged DB had WAL, checkpointed ({merge_jm})')
|
||||
merge_conn.close()
|
||||
|
||||
tmp_db = os.path.join(tmpdir, 'db')
|
||||
shutil.copy2(merged_path, tmp_db)
|
||||
# Also copy -wal/-shm if present
|
||||
for suf in ['-wal', '-shm']:
|
||||
if os.path.exists(merged_path + suf):
|
||||
shutil.copy2(merged_path + suf, tmp_db + suf)
|
||||
|
||||
test = sqlite3.connect(tmp_db)
|
||||
# Step 7b: Verify before overwriting
|
||||
test = sqlite3.connect(f'file:{tmp_db}?mode=ro', uri=True)
|
||||
r = test.execute('SELECT COUNT(*) FROM sessions').fetchone()[0]
|
||||
m = test.execute('SELECT COUNT(*) FROM messages').fetchone()[0]
|
||||
test.execute('PRAGMA integrity_check') # verify
|
||||
test.close()
|
||||
|
||||
shutil.copy2(local_db, local_db + '.bak')
|
||||
shutil.copy2(tmp_db, local_db)
|
||||
# Also copy WAL files to local
|
||||
for suf in ['-wal', '-shm']:
|
||||
if os.path.exists(tmp_db + suf):
|
||||
shutil.copy2(tmp_db + suf, local_db + suf)
|
||||
elif os.path.exists(local_db + suf):
|
||||
os.remove(local_db + suf)
|
||||
os.remove(local_db + '.bak')
|
||||
print(f'Restored: {r}s/{m}m')
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user