#!/usr/bin/env python3
"""卡住任务检测器 - 每5分钟扫描
- 执行中超过30分钟的任务 → 标记为"执行超时"
- 戴尔SSH失败未恢复的 → 标记
- 输出统计供监控
"""
import os, sys, json, subprocess, time
from datetime import datetime

DB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "tasks.db")
import sqlite3

def get_db():
    conn = sqlite3.connect(DB, timeout=10)
    conn.row_factory = sqlite3.Row
    return conn

def log(msg):
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs")
    os.makedirs(log_dir, exist_ok=True)
    with open(os.path.join(log_dir, "stuck_check.log"), "a") as f:
        f.write(f"[{ts}] {msg}\n")
    print(f"[{ts}] {msg}")

def check_stuck():
    db = get_db()
    now = datetime.now()
    results = {"stuck": [], "fixed": [], "summary": {}}

    # 1) 执行中超过30分钟的任务
    stuck_rows = db.execute("""
        SELECT t.slug, t.name, t.executor, e.id as exec_id, e.started_at, e.requirement_id
        FROM executions e JOIN tasks t ON e.task_id=t.id
        WHERE e.status='执行中' AND e.started_at < datetime('now','-30 minutes')
        ORDER BY e.started_at ASC
    """).fetchall()

    for r in stuck_rows:
        msg = f"卡住: {r['slug']} ({r['name']}) executor={r['executor']} started={r['started_at']}"
        log(msg)
        results["stuck"].append(dict(r))

        # 标记为超时
        db.execute("UPDATE executions SET status='超时' WHERE id=?", (r['exec_id'],))
        if r['requirement_id'] and r['requirement_id'] > 0:
            db.execute("UPDATE requirements SET status='failed' WHERE id=?", (r['requirement_id'],))
        db.execute("UPDATE tasks SET status='执行超时', updated_at=? WHERE slug=?",
                   (now.strftime('%Y-%m-%d %H:%M:%S'), r['slug']))
        results["fixed"].append(r['slug'])

    # 2) 卡在"执行中"超过5分钟但没执行记录（trigger启动失败）
    empty_stuck = db.execute("""
        SELECT t.slug, t.name, t.executor, t.updated_at
        FROM tasks t
        WHERE t.status='执行中' AND t.updated_at < datetime('now','-10 minutes')
        AND NOT EXISTS (SELECT 1 FROM executions e WHERE e.task_id=t.id AND e.status='执行中')
    """).fetchall()

    for r in empty_stuck:
        msg = f"空卡: {r['slug']} ({r['name']}) executor={r['executor']}"
        log(msg)
        results["stuck"].append(dict(r))
        db.execute("UPDATE tasks SET status='待处理', updated_at=? WHERE slug=?",
                   (now.strftime('%Y-%m-%d %H:%M:%S'), r['slug']))
        results["fixed"].append(r['slug'])

    # 3) pending的需求但没有触发（可能trigger挂了）
    orphan_reqs = db.execute("""
        SELECT r.id as req_id, t.id as task_id, t.slug, t.name, r.created_at
        FROM requirements r JOIN tasks t ON r.task_id=t.id
        WHERE r.status='waiting' AND r.created_at < datetime('now','-15 minutes')
        AND t.status NOT IN ('执行中', '待处理')
    """).fetchall()

    for r in orphan_reqs:
        msg = f"孤儿需求: {r['slug']} #{r['req_id']} created={r['created_at']}"
        log(msg)
        results["stuck"].append(dict(r))
        db.execute("UPDATE tasks SET status='待处理', updated_at=? WHERE id=?",
                   (now.strftime('%Y-%m-%d %H:%M:%S'), r['task_id']))
        # 触发重新执行
        trigger_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "trigger_execute.py")
        subprocess.Popen([sys.executable, trigger_script, str(r['req_id']), r['slug']],
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        results["fixed"].append(f"{r['slug']}#{r['req_id']}")

    db.commit()
    db.close()

    # 统计
    results["summary"] = {
        "stuck_count": len(results["stuck"]),
        "fixed_count": len(results["fixed"]),
        "checked_at": now.strftime("%Y-%m-%d %H:%M:%S")
    }
    log(f"扫描完成: 卡住{results['summary']['stuck_count']}个, 修复{results['summary']['fixed_count']}个")
    return results

if __name__ == '__main__':
    check_stuck()
