// pikasTech-unidesk/scripts/src/platform-infra-sub2api-codex-sentinel.ts

import { Buffer } from "node:buffer";

/**
 * Sentinel settings in the shape returned by `readCodexPoolSentinelConfig`.
 *
 * Numeric bounds quoted below are the (min, max) arguments that
 * `readCodexPoolSentinelConfig` passes to its integer/number readers.
 * Unless noted otherwise, each nested section is forwarded unchanged into the
 * runner's `config.json` by `renderCodexPoolSentinelManifest`.
 */
export interface CodexPoolSentinelConfig {
  /** When `enabled` is false the rendered CronJob gets `suspend: true`. */
  monitor: {
    enabled: boolean;
  };
  /** `enabled` is rejected by the reader unless `monitor.enabled` is also true. */
  actions: {
    enabled: boolean;
  };
  /** Cron expression written to the CronJob `spec.schedule`; must match `/^[-0-9A-Za-z_/*,\s]+$/`. */
  schedule: string;
  /**
   * Base image reference; must match `/^[A-Za-z0-9._:/@-]+$/`.
   * The CronJob container runs the image derived by `codexPoolSentinelRuntimeImage`, not this value directly.
   */
  image: string;
  /** Name of the rendered ServiceAccount (also the RoleBinding subject and the pod's service account). */
  serviceAccountName: string;
  /** Name of the rendered ConfigMap that carries `config.json` and `sentinel.py`. */
  configMapName: string;
  /** Name of the rendered Secret that carries `profiles.json`. */
  credentialsSecretName: string;
  /** Name of the ConfigMap the runner uses for its persisted state (`state.configMapName` in `config.json`). */
  stateConfigMapName: string;
  /** Name of the rendered CronJob; also used as the `app.kubernetes.io/name` label on every rendered resource. */
  cronJobName: string;
  /** Name of the rendered Role. */
  roleName: string;
  /** Name of the rendered RoleBinding. */
  roleBindingName: string;
  /** Model name forwarded to the runner. */
  model: string;
  /** Only the literal `"responses"` is accepted by the type. */
  endpoint: "responses";
  /** Forwarded to the runner as-is. NOTE(review): how the runner interprets `prefix`/`exact` is not visible here. */
  marker: {
    prefix: string;
    exact: boolean;
  };
  probe: {
    /** Bounds 3–300. Also drives the CronJob `activeDeadlineSeconds` (timeout + 240, clamped to 300–3600). */
    timeoutSeconds: number;
    /** Bounds 1–128. */
    maxOutputTokens: number;
    /** Bounds 1–120. */
    transportRetryMinutes: number;
    userAgent: string;
  };
  gatewayFailureMonitor: {
    enabled: boolean;
    /** Bounds 60–7200. */
    lookbackSeconds: number;
    /** Bounds 100–50000. */
    tailLines: number;
    /** Bounds 1–1440. */
    initialTtlMinutes: number;
    /** Bounds 1–1440; the reader rejects values below `initialTtlMinutes`. */
    maxTtlMinutes: number;
    /** Integer, bounds 1–10. */
    backoffMultiplier: number;
    paths: string[];
  };
  sdk: {
    /**
     * Version of the `openai` Python package. It is part of the runtime image tag and is exported as
     * `OPENAI_PYTHON_VERSION` by `sentinelContainerShellCommand`, which pip-installs that exact version
     * when the installed one differs.
     */
    openaiPythonVersion: string;
  };
  cadence: {
    /** Bounds 1–1440. */
    successInitialIntervalMinutes: number;
    /** Bounds 1–1440; the reader rejects values below `successInitialIntervalMinutes`. */
    successMaxIntervalMinutes: number;
    /** Bounds 1–1440; the reader rejects values below `successInitialIntervalMinutes`. */
    trustedSuccessMaxIntervalMinutes: number;
    /** Bounds 1–1440; the reader rejects values below `successInitialIntervalMinutes`. */
    untrustedSuccessMaxIntervalMinutes: number;
    /** Integer, bounds 1–10. */
    successBackoffMultiplier: number;
    /** Integer, bounds 0–50. */
    jitterPercent: number;
  };
  freeze: {
    /** Bounds 1–1440. */
    initialTtlMinutes: number;
    /** Bounds 1–1440; the reader rejects values below `initialTtlMinutes`. */
    maxTtlMinutes: number;
    /** Integer, bounds 1–10. */
    backoffMultiplier: number;
    /** Integer, bounds 0–50. */
    jitterPercent: number;
  };
  /** Reported by `codexPoolSentinelSummary` under `accounting` with mode `"record-only"`. */
  pricing: {
    /** Bounds 0–100000. */
    usdPer1MInputTokens: number;
    /** Bounds 0–100000. */
    usdPer1MOutputTokens: number;
  };
  /** Bounds 1–2000; forwarded to the runner as `state.historyLimit`. */
  historyLimit: number;
  /**
   * Optional account names; forwarded to the runner as `protectedManualAccounts` (empty list when absent).
   * Not populated by `readCodexPoolSentinelConfig` in the visible code, so it must be set by the caller.
   */
  protectedManualAccounts?: string[];
}

/** Image coordinates returned by `codexPoolSentinelRuntimeImage`. */
export interface CodexPoolSentinelImageTarget {
  /** The configured base image (`config.image`), unchanged. */
  baseImage: string;
  /** Full runtime image reference: `repository` and `tag` joined by `:`. */
  runtimeImage: string;
  /** Registry path of the runtime image, without a tag. */
  repository: string;
  /** Tag derived from the openai Python version and the base image. */
  tag: string;
}

/**
 * One entry of the `profiles` array that `renderCodexPoolSentinelManifest` serializes,
 * with every field included, into `profiles.json` inside the credentials Secret.
 *
 * NOTE(review): the per-field meanings below are inferred from names only; the code that
 * consumes these fields is not visible in this part of the file — confirm before relying on them.
 */
export interface CodexPoolSentinelProfileSecret {
  accountName: string;
  profile: string;
  baseUrl: string;
  /** Presumably a credential; it is written verbatim into the Secret payload. */
  apiKey: string;
  upstreamUserAgent: string | null;
  trustUpstream: boolean;
  sentinelProtect: CodexPoolSentinelProtectPolicy;
}

/**
 * Per-profile protection settings carried on `CodexPoolSentinelProfileSecret.sentinelProtect`
 * and therefore serialized into `profiles.json`.
 *
 * NOTE(review): neither the reader that validates these values nor the code that consumes them
 * is visible in this part of the file; units are taken from the field names only.
 */
export interface CodexPoolSentinelProtectPolicy {
  enabled: boolean;
  consecutiveFailures: number;
  initialRetryDelaySeconds: number;
  maxRetryDelaySeconds: number;
  backoffMultiplier: number;
}

/** Deployment-specific inputs for `renderCodexPoolSentinelManifest`. */
export interface CodexPoolSentinelManifestOptions {
  /** Namespace written on every rendered resource and on the RoleBinding subject. */
  namespace: string;
  /** NOTE(review): not referenced by `renderCodexPoolSentinelManifest` in the visible code. */
  serviceName: string;
  /** Host (optionally with port) of the service; the runner's `service.baseUrl` becomes `http://<serviceDns>`. */
  serviceDns: string;
  /** Name of the Secret whose `ADMIN_PASSWORD` key is exposed to the container (marked optional). */
  appSecretName: string;
  /** Forwarded to the runner as `service.adminEmailDefault`; the runner uses it when `ADMIN_EMAIL` is unset or empty. */
  adminEmailDefault: string;
  /**
   * Optional proxy settings. When `httpProxy` is a non-empty string, the container receives
   * HTTP_PROXY/HTTPS_PROXY/ALL_PROXY and NO_PROXY in both upper- and lower-case spellings;
   * otherwise no proxy variables are rendered.
   */
  proxy?: {
    httpProxy: string;
    noProxy: string;
  } | null;
}

/**
 * Resolves the runtime image that the sentinel CronJob runs.
 *
 * The image lives in a fixed local-registry repository; only the tag varies, and it is
 * derived from the configured base image plus the pinned openai Python version.
 *
 * @param config Sentinel configuration supplying `image` and `sdk.openaiPythonVersion`.
 * @returns The base image, the repository, the derived tag and the combined `repository:tag` reference.
 */
export function codexPoolSentinelRuntimeImage(config: CodexPoolSentinelConfig): CodexPoolSentinelImageTarget {
  const repository = "127.0.0.1:5000/platform-infra/sub2api-account-sentinel";
  const { image: baseImage } = config;
  const tag = sentinelRuntimeImageTag(baseImage, config.sdk.openaiPythonVersion);
  const runtimeImage = `${repository}:${tag}`;
  return { baseImage, runtimeImage, repository, tag };
}

/**
 * Builds an image tag of the form `openai-<version>-<baseImage>`.
 *
 * Every run of characters outside `[A-Za-z0-9_.-]` collapses to a single `-`, leading dashes
 * are stripped, and the result is cut to at most 128 characters.
 *
 * @param baseImage Base image reference to embed in the tag.
 * @param openaiPythonVersion Pinned openai Python package version.
 * @returns The sanitized tag.
 */
function sentinelRuntimeImageTag(baseImage: string, openaiPythonVersion: string): string {
  const rawTag = ["openai", openaiPythonVersion, baseImage].join("-");
  const sanitized = rawTag.replace(/[^A-Za-z0-9_.-]+/gu, "-");
  const withoutLeadingDashes = sanitized.replace(/^-+/u, "");
  return withoutLeadingDashes.substring(0, 128);
}

/**
 * Validates the raw `sentinel` section of a parsed YAML document and returns it as a typed config.
 *
 * Nested sections are resolved first, then every field is read through the matching
 * `readRequired*` helper, and finally the cross-field constraints are enforced.
 * `protectedManualAccounts` is not read here.
 *
 * @param value Parsed YAML value expected to be the `sentinel` object.
 * @param sourcePath Path prefix used in error messages; `.sentinel` and the field path are appended to it.
 * @returns The validated configuration.
 * @throws Error when `value` is not a record, when a reader rejects a field, or when a cross-field
 *   constraint fails (actions without monitor, a max interval/TTL below its initial value, or an
 *   unsupported schedule/image format).
 */
export function readCodexPoolSentinelConfig(value: unknown, sourcePath: string): CodexPoolSentinelConfig {
  const root = `${sourcePath}.sentinel`;
  if (!isRecord(value)) {
    throw new Error(`${root} must be a YAML object`);
  }

  // Error-message prefixes, one per nested section.
  const monitorPath = `${root}.monitor`;
  const actionsPath = `${root}.actions`;
  const markerPath = `${root}.marker`;
  const probePath = `${root}.probe`;
  const gatewayPath = `${root}.gatewayFailureMonitor`;
  const sdkPath = `${root}.sdk`;
  const cadencePath = `${root}.cadence`;
  const freezePath = `${root}.freeze`;
  const pricingPath = `${root}.pricing`;

  // Resolve every section before reading any field, so a missing section is reported first.
  const monitorSection = readRequiredRecord(valueAt(value, "monitor"), monitorPath);
  const actionsSection = readRequiredRecord(valueAt(value, "actions"), actionsPath);
  const markerSection = readRequiredRecord(valueAt(value, "marker"), markerPath);
  const probeSection = readRequiredRecord(valueAt(value, "probe"), probePath);
  const gatewaySection = readRequiredRecord(valueAt(value, "gatewayFailureMonitor"), gatewayPath);
  const sdkSection = readRequiredRecord(valueAt(value, "sdk"), sdkPath);
  const cadenceSection = readRequiredRecord(valueAt(value, "cadence"), cadencePath);
  const freezeSection = readRequiredRecord(valueAt(value, "freeze"), freezePath);
  const pricingSection = readRequiredRecord(valueAt(value, "pricing"), pricingPath);

  const config: CodexPoolSentinelConfig = {
    monitor: {
      enabled: readRequiredBoolean(valueAt(monitorSection, "enabled"), `${monitorPath}.enabled`),
    },
    actions: {
      enabled: readRequiredBoolean(valueAt(actionsSection, "enabled"), `${actionsPath}.enabled`),
    },
    schedule: readRequiredString(valueAt(value, "schedule"), `${root}.schedule`),
    image: readRequiredImage(valueAt(value, "image"), `${root}.image`),
    serviceAccountName: readRequiredDnsName(valueAt(value, "serviceAccountName"), `${root}.serviceAccountName`),
    configMapName: readRequiredDnsName(valueAt(value, "configMapName"), `${root}.configMapName`),
    credentialsSecretName: readRequiredDnsName(valueAt(value, "credentialsSecretName"), `${root}.credentialsSecretName`),
    stateConfigMapName: readRequiredDnsName(valueAt(value, "stateConfigMapName"), `${root}.stateConfigMapName`),
    cronJobName: readRequiredDnsName(valueAt(value, "cronJobName"), `${root}.cronJobName`),
    roleName: readRequiredDnsName(valueAt(value, "roleName"), `${root}.roleName`),
    roleBindingName: readRequiredDnsName(valueAt(value, "roleBindingName"), `${root}.roleBindingName`),
    model: readRequiredModelName(valueAt(value, "model"), `${root}.model`),
    endpoint: readRequiredEndpoint(valueAt(value, "endpoint"), `${root}.endpoint`),
    marker: {
      prefix: readRequiredMarkerPrefix(valueAt(markerSection, "prefix"), `${markerPath}.prefix`),
      exact: readRequiredBoolean(valueAt(markerSection, "exact"), `${markerPath}.exact`),
    },
    probe: {
      timeoutSeconds: readRequiredInt(valueAt(probeSection, "timeoutSeconds"), `${probePath}.timeoutSeconds`, 3, 300),
      maxOutputTokens: readRequiredInt(valueAt(probeSection, "maxOutputTokens"), `${probePath}.maxOutputTokens`, 1, 128),
      transportRetryMinutes: readRequiredInt(valueAt(probeSection, "transportRetryMinutes"), `${probePath}.transportRetryMinutes`, 1, 120),
      userAgent: readRequiredUserAgent(valueAt(probeSection, "userAgent"), `${probePath}.userAgent`),
    },
    gatewayFailureMonitor: {
      enabled: readRequiredBoolean(valueAt(gatewaySection, "enabled"), `${gatewayPath}.enabled`),
      lookbackSeconds: readRequiredInt(valueAt(gatewaySection, "lookbackSeconds"), `${gatewayPath}.lookbackSeconds`, 60, 7200),
      tailLines: readRequiredInt(valueAt(gatewaySection, "tailLines"), `${gatewayPath}.tailLines`, 100, 50000),
      initialTtlMinutes: readRequiredInt(valueAt(gatewaySection, "initialTtlMinutes"), `${gatewayPath}.initialTtlMinutes`, 1, 1440),
      maxTtlMinutes: readRequiredInt(valueAt(gatewaySection, "maxTtlMinutes"), `${gatewayPath}.maxTtlMinutes`, 1, 1440),
      backoffMultiplier: readRequiredInt(valueAt(gatewaySection, "backoffMultiplier"), `${gatewayPath}.backoffMultiplier`, 1, 10),
      paths: readRequiredPathList(valueAt(gatewaySection, "paths"), `${gatewayPath}.paths`),
    },
    sdk: {
      openaiPythonVersion: readRequiredOpenAiPythonVersion(valueAt(sdkSection, "openaiPythonVersion"), `${sdkPath}.openaiPythonVersion`),
    },
    cadence: {
      successInitialIntervalMinutes: readRequiredInt(valueAt(cadenceSection, "successInitialIntervalMinutes"), `${cadencePath}.successInitialIntervalMinutes`, 1, 1440),
      successMaxIntervalMinutes: readRequiredInt(valueAt(cadenceSection, "successMaxIntervalMinutes"), `${cadencePath}.successMaxIntervalMinutes`, 1, 1440),
      trustedSuccessMaxIntervalMinutes: readRequiredInt(valueAt(cadenceSection, "trustedSuccessMaxIntervalMinutes"), `${cadencePath}.trustedSuccessMaxIntervalMinutes`, 1, 1440),
      untrustedSuccessMaxIntervalMinutes: readRequiredInt(valueAt(cadenceSection, "untrustedSuccessMaxIntervalMinutes"), `${cadencePath}.untrustedSuccessMaxIntervalMinutes`, 1, 1440),
      successBackoffMultiplier: readRequiredInt(valueAt(cadenceSection, "successBackoffMultiplier"), `${cadencePath}.successBackoffMultiplier`, 1, 10),
      jitterPercent: readRequiredInt(valueAt(cadenceSection, "jitterPercent"), `${cadencePath}.jitterPercent`, 0, 50),
    },
    freeze: {
      initialTtlMinutes: readRequiredInt(valueAt(freezeSection, "initialTtlMinutes"), `${freezePath}.initialTtlMinutes`, 1, 1440),
      maxTtlMinutes: readRequiredInt(valueAt(freezeSection, "maxTtlMinutes"), `${freezePath}.maxTtlMinutes`, 1, 1440),
      backoffMultiplier: readRequiredInt(valueAt(freezeSection, "backoffMultiplier"), `${freezePath}.backoffMultiplier`, 1, 10),
      jitterPercent: readRequiredInt(valueAt(freezeSection, "jitterPercent"), `${freezePath}.jitterPercent`, 0, 50),
    },
    pricing: {
      usdPer1MInputTokens: readRequiredNumber(valueAt(pricingSection, "usdPer1MInputTokens"), `${pricingPath}.usdPer1MInputTokens`, 0, 100000),
      usdPer1MOutputTokens: readRequiredNumber(valueAt(pricingSection, "usdPer1MOutputTokens"), `${pricingPath}.usdPer1MOutputTokens`, 0, 100000),
    },
    historyLimit: readRequiredInt(valueAt(value, "historyLimit"), `${root}.historyLimit`, 1, 2000),
  };

  // Cross-field constraints as [violated, message] pairs; the first violated entry wins.
  const { cadence, freeze, gatewayFailureMonitor } = config;
  const violations: Array<[boolean, string]> = [
    [
      config.actions.enabled && !config.monitor.enabled,
      `${root}.actions.enabled requires sentinel.monitor.enabled=true`,
    ],
    [
      cadence.successMaxIntervalMinutes < cadence.successInitialIntervalMinutes,
      `${cadencePath}.successMaxIntervalMinutes must be >= successInitialIntervalMinutes`,
    ],
    [
      cadence.trustedSuccessMaxIntervalMinutes < cadence.successInitialIntervalMinutes,
      `${cadencePath}.trustedSuccessMaxIntervalMinutes must be >= successInitialIntervalMinutes`,
    ],
    [
      cadence.untrustedSuccessMaxIntervalMinutes < cadence.successInitialIntervalMinutes,
      `${cadencePath}.untrustedSuccessMaxIntervalMinutes must be >= successInitialIntervalMinutes`,
    ],
    [
      freeze.maxTtlMinutes < freeze.initialTtlMinutes,
      `${freezePath}.maxTtlMinutes must be >= initialTtlMinutes`,
    ],
    [
      gatewayFailureMonitor.maxTtlMinutes < gatewayFailureMonitor.initialTtlMinutes,
      `${gatewayPath}.maxTtlMinutes must be >= initialTtlMinutes`,
    ],
    [
      !/^[-0-9A-Za-z_/*,\s]+$/u.test(config.schedule),
      `${root}.schedule has an unsupported cron format`,
    ],
    [
      !/^[A-Za-z0-9._:/@-]+$/u.test(config.image),
      `${root}.image has an unsupported image format`,
    ],
  ];
  for (const [violated, message] of violations) {
    if (violated) {
      throw new Error(message);
    }
  }
  return config;
}

/**
 * Produces a flat, printable summary of the sentinel configuration.
 *
 * Nested sections (`probe`, `gatewayFailureMonitor`, `sdk`, `cadence`, `freeze`, `pricing`) are
 * passed through by reference. Protected manual accounts are reported as a count only, and
 * `valuesPrinted` is always `false`.
 *
 * @param config Sentinel configuration to summarize.
 * @returns A plain record describing the configuration.
 */
export function codexPoolSentinelSummary(config: CodexPoolSentinelConfig): Record<string, unknown> {
  const {
    schedule,
    cronJobName,
    roleName,
    roleBindingName,
    configMapName,
    credentialsSecretName,
    stateConfigMapName,
    image,
    model,
    endpoint,
    probe,
    gatewayFailureMonitor,
    sdk,
    cadence,
    freeze,
  } = config;
  const protectedManualAccountCount = config.protectedManualAccounts?.length ?? 0;
  const accounting = { mode: "record-only", pricing: config.pricing };
  // Rebuilt rather than passed through so that only these two marker fields are exposed.
  const marker = { prefix: config.marker.prefix, exact: config.marker.exact };

  const summary: Record<string, unknown> = {
    monitorEnabled: config.monitor.enabled,
    actionsEnabled: config.actions.enabled,
    schedule,
    cronJobName,
    roleName,
    roleBindingName,
    configMapName,
    credentialsSecretName,
    stateConfigMapName,
    image,
    model,
    endpoint,
    probe,
    gatewayFailureMonitor,
    sdk,
    cadence,
    freeze,
    protectedManualAccountCount,
    accounting,
    marker,
    valuesPrinted: false,
  };
  return summary;
}

/**
 * Renders the multi-document YAML manifest for the sentinel.
 *
 * The documents are, in order: a Secret holding `profiles.json` (base64), a ConfigMap holding the
 * runner's `config.json` and `sentinel.py`, a ServiceAccount, a Role (configmaps
 * get/create/update/patch, pods get/list, pods/log get), a RoleBinding tying the ServiceAccount to
 * that Role, and the CronJob that runs the sentinel container.
 *
 * Resource names, the namespace, the schedule and the secret name are interpolated into the YAML
 * as-is; this function does no escaping of them. Only the proxy values and the container command
 * are JSON-encoded.
 *
 * @param config Sentinel configuration; most sections are copied into the runner's `config.json`.
 * @param profiles Profile entries serialized in full (all fields) as `{ profiles }` into the Secret.
 * @param options Namespace, service address, admin defaults and optional proxy settings.
 * @returns The manifest text, with documents separated by `---`.
 */
export function renderCodexPoolSentinelManifest(
  config: CodexPoolSentinelConfig,
  profiles: CodexPoolSentinelProfileSecret[],
  options: CodexPoolSentinelManifestOptions,
): string {
  // Secret payload: the whole profiles array, pretty-printed; base64-encoded further below.
  const profilesJson = JSON.stringify({ profiles }, null, 2);
  // Runner-facing configuration written to config.json. Kubernetes resource names other than the
  // state ConfigMap are not included.
  const runnerConfig = {
    monitor: config.monitor,
    actions: config.actions,
    service: {
      baseUrl: `http://${options.serviceDns}`,
      adminEmailDefault: options.adminEmailDefault,
    },
    model: config.model,
    endpoint: config.endpoint,
    marker: config.marker,
    probe: config.probe,
    gatewayFailureMonitor: config.gatewayFailureMonitor,
    sdk: config.sdk,
    cadence: config.cadence,
    freeze: config.freeze,
    pricing: config.pricing,
    protectedManualAccounts: config.protectedManualAccounts ?? [],
    state: {
      configMapName: config.stateConfigMapName,
      historyLimit: config.historyLimit,
    },
  };
  // The CronJob is always rendered; disabling the monitor only suspends it.
  const suspend = config.monitor.enabled ? "false" : "true";
  // Job deadline: probe timeout plus 240 seconds, clamped to the range 300–3600.
  const activeDeadlineSeconds = Math.max(300, Math.min(3600, config.probe.timeoutSeconds + 240));
  const command = sentinelContainerShellCommand(config);
  const runtimeImage = codexPoolSentinelRuntimeImage(config).runtimeImage;
  // Extra env entries, pre-indented to sit inside the container's `env:` list. Empty when no
  // proxy is configured; when present the block ends with a newline, so a blank line follows it
  // in the rendered YAML.
  const proxyEnv = options.proxy?.httpProxy
    ? `                - name: HTTP_PROXY
                  value: ${JSON.stringify(options.proxy.httpProxy)}
                - name: HTTPS_PROXY
                  value: ${JSON.stringify(options.proxy.httpProxy)}
                - name: ALL_PROXY
                  value: ${JSON.stringify(options.proxy.httpProxy)}
                - name: http_proxy
                  value: ${JSON.stringify(options.proxy.httpProxy)}
                - name: https_proxy
                  value: ${JSON.stringify(options.proxy.httpProxy)}
                - name: all_proxy
                  value: ${JSON.stringify(options.proxy.httpProxy)}
                - name: NO_PROXY
                  value: ${JSON.stringify(options.proxy.noProxy)}
                - name: no_proxy
                  value: ${JSON.stringify(options.proxy.noProxy)}
`
    : "";
  // Everything below is literal YAML; indentation inside the template is significant.
  return `apiVersion: v1
kind: Secret
metadata:
  name: ${config.credentialsSecretName}
  namespace: ${options.namespace}
  labels:
    app.kubernetes.io/name: ${config.cronJobName}
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
    unidesk.ai/secret-purpose: sub2api-account-sentinel-profiles
type: Opaque
data:
  profiles.json: ${Buffer.from(profilesJson, "utf8").toString("base64")}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: ${config.configMapName}
  namespace: ${options.namespace}
  labels:
    app.kubernetes.io/name: ${config.cronJobName}
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
data:
  config.json: |
${indentBlock(JSON.stringify(runnerConfig, null, 2), 4)}
  sentinel.py: |
${indentBlock(sentinelRunnerPython(), 4)}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ${config.serviceAccountName}
  namespace: ${options.namespace}
  labels:
    app.kubernetes.io/name: ${config.cronJobName}
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: ${config.roleName}
  namespace: ${options.namespace}
  labels:
    app.kubernetes.io/name: ${config.cronJobName}
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get", "create", "update", "patch"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list"]
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: ${config.roleBindingName}
  namespace: ${options.namespace}
  labels:
    app.kubernetes.io/name: ${config.cronJobName}
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
subjects:
  - kind: ServiceAccount
    name: ${config.serviceAccountName}
    namespace: ${options.namespace}
roleRef:
  kind: Role
  name: ${config.roleName}
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: ${config.cronJobName}
  namespace: ${options.namespace}
  labels:
    app.kubernetes.io/name: ${config.cronJobName}
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
spec:
  schedule: "${config.schedule}"
  concurrencyPolicy: Forbid
  suspend: ${suspend}
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  jobTemplate:
    spec:
      ttlSecondsAfterFinished: 3600
      activeDeadlineSeconds: ${activeDeadlineSeconds}
      template:
        metadata:
          labels:
            app.kubernetes.io/name: ${config.cronJobName}
            app.kubernetes.io/part-of: platform-infra
            app.kubernetes.io/managed-by: unidesk
        spec:
          serviceAccountName: ${config.serviceAccountName}
          restartPolicy: Never
          containers:
            - name: sentinel
              image: ${runtimeImage}
              imagePullPolicy: IfNotPresent
              command: ["sh", "-c"]
              args:
                - ${JSON.stringify(command)}
              env:
                - name: ADMIN_EMAIL
                  valueFrom:
                    configMapKeyRef:
                      name: sub2api-config
                      key: ADMIN_EMAIL
                      optional: true
                - name: ADMIN_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: ${options.appSecretName}
                      key: ADMIN_PASSWORD
                      optional: true
                - name: POD_NAMESPACE
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.namespace
${proxyEnv}
              volumeMounts:
                - name: sentinel-config
                  mountPath: /opt/sentinel
                  readOnly: true
                - name: sentinel-profiles
                  mountPath: /opt/sentinel-secrets
                  readOnly: true
          volumes:
            - name: sentinel-config
              configMap:
                name: ${config.configMapName}
                defaultMode: 0555
            - name: sentinel-profiles
              secret:
                secretName: ${config.credentialsSecretName}
`;
}

/**
 * Builds the `sh -c` script executed by the sentinel container.
 *
 * The script exports the pinned openai Python version, pip-installs that exact version when the
 * installed `openai` package is missing or different, checks again (exiting with a mismatch
 * message if the versions still differ), and finally `exec`s the runner at
 * `/opt/sentinel/sentinel.py`.
 *
 * @param config Sentinel configuration; only `sdk.openaiPythonVersion` is read.
 * @returns The script as a single newline-joined string.
 */
export function sentinelContainerShellCommand(config: CodexPoolSentinelConfig): string {
  // JSON encoding doubles as double-quoted shell quoting for the version string.
  const quotedVersion = JSON.stringify(config.sdk.openaiPythonVersion);

  // Lines shared by both embedded Python snippets.
  const pythonPreamble = [
    "import importlib.metadata",
    "import os",
    "expected = os.environ['OPENAI_PYTHON_VERSION']",
  ];
  // Exits non-zero when the package is absent or at a different version.
  const installedVersionProbe = [
    ...pythonPreamble,
    "try:",
    "    current = importlib.metadata.version('openai')",
    "except importlib.metadata.PackageNotFoundError:",
    "    current = None",
    "if current != expected:",
    "    raise SystemExit(1)",
  ];
  // Post-install check; fails the script with a descriptive message on mismatch.
  const installedVersionAssertion = [
    ...pythonPreamble,
    "current = importlib.metadata.version('openai')",
    "if current != expected:",
    "    raise SystemExit(f'openai-python-version-mismatch expected={expected} current={current}')",
  ];

  const script = [
    "set -eu",
    `export OPENAI_PYTHON_VERSION=${quotedVersion}`,
    "if ! python3 - <<'PY'",
    ...installedVersionProbe,
    "PY",
    "then",
    "  python3 -m pip install --no-cache-dir \"openai==$OPENAI_PYTHON_VERSION\"",
    "fi",
    "python3 - <<'PY'",
    ...installedVersionAssertion,
    "PY",
    "exec python3 /opt/sentinel/sentinel.py",
  ];
  return script.join("\n");
}

export function sentinelRunnerPython(): string {
  return String.raw`#!/usr/bin/env python3
import base64
import hashlib
import json
import math
import os
import random
import ssl
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone, timedelta
from urllib import error, parse, request
from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAI

CONFIG_PATH = "/opt/sentinel/config.json"
PROFILES_PATH = "/opt/sentinel-secrets/profiles.json"
STATE_KEY = "state.json"

def utc_now():
    return datetime.now(timezone.utc)

def iso(dt):
    return dt.astimezone(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")

def parse_iso(value):
    if not isinstance(value, str) or not value:
        return None
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00"))
    except Exception:
        return None

def load_json(path):
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)

def sha(value):
    return hashlib.sha256(str(value).encode("utf-8", errors="replace")).hexdigest()[:16]

def preview(value, limit=160):
    if not isinstance(value, str):
        value = str(value)
    value = " ".join(value.replace("\r", " ").replace("\n", " ").split())
    return value[:limit]

def day_key(now):
    return now.strftime("%Y-%m-%d")

def add_minutes(now, minutes, jitter_percent=0):
    minutes = float(minutes)
    if jitter_percent:
        factor = 1 + random.uniform(-jitter_percent, jitter_percent) / 100
        minutes = max(1, minutes * factor)
    return now + timedelta(minutes=minutes)

def estimate_tokens(text):
    if not isinstance(text, str) or not text:
        return 0
    return max(1, math.ceil(len(text) / 4))

class KubeClient:
    def __init__(self, namespace):
        self.namespace = namespace
        with open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8") as handle:
            self.token = handle.read().strip()
        self.base = "https://kubernetes.default.svc"
        self.context = ssl.create_default_context(cafile="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")

    def api(self, method, path, payload=None):
        body = None if payload is None else json.dumps(payload).encode("utf-8")
        req = request.Request(
            self.base + path,
            data=body,
            method=method,
            headers={
                "Authorization": "Bearer " + self.token,
                "Accept": "application/json",
                "Content-Type": "application/json",
            },
        )
        try:
            with request.urlopen(req, timeout=15, context=self.context) as resp:
                raw = resp.read()
                if not raw:
                    return resp.status, None
                text = raw.decode("utf-8", errors="replace")
                try:
                    return resp.status, json.loads(text)
                except Exception:
                    return resp.status, text
        except error.HTTPError as exc:
            raw = exc.read()
            try:
                parsed = json.loads(raw.decode("utf-8")) if raw else None
            except Exception:
                parsed = {"message": raw.decode("utf-8", errors="replace")}
            return exc.code, parsed

    def get_configmap(self, name):
        status, data = self.api("GET", f"/api/v1/namespaces/{self.namespace}/configmaps/{name}")
        if status == 404:
            return None
        if status >= 300:
            raise RuntimeError(f"get configmap {name} failed: {status} {data}")
        return data

    def create_configmap(self, name, state):
        payload = {
            "apiVersion": "v1",
            "kind": "ConfigMap",
            "metadata": {
                "name": name,
                "namespace": self.namespace,
                "labels": {
                    "app.kubernetes.io/part-of": "platform-infra",
                    "app.kubernetes.io/managed-by": "unidesk",
                    "unidesk.ai/state-purpose": "sub2api-account-sentinel",
                },
            },
            "data": {STATE_KEY: json.dumps(state, ensure_ascii=False, indent=2)},
        }
        status, data = self.api("POST", f"/api/v1/namespaces/{self.namespace}/configmaps", payload)
        if status >= 300 and status != 409:
            raise RuntimeError(f"create state configmap {name} failed: {status} {data}")
        return self.get_configmap(name)

    def update_configmap_state(self, obj, state):
        obj.setdefault("data", {})[STATE_KEY] = json.dumps(state, ensure_ascii=False, indent=2)
        name = obj["metadata"]["name"]
        status, data = self.api("PUT", f"/api/v1/namespaces/{self.namespace}/configmaps/{name}", obj)
        if status >= 300:
            raise RuntimeError(f"update state configmap {name} failed: {status} {data}")
        return data

    def list_pods(self, label_selector):
        query = parse.urlencode({"labelSelector": label_selector})
        status, data = self.api("GET", f"/api/v1/namespaces/{self.namespace}/pods?{query}")
        if status >= 300:
            raise RuntimeError(f"list pods failed: {status} {data}")
        return data.get("items") if isinstance(data, dict) and isinstance(data.get("items"), list) else []

    def pod_logs(self, pod_name, container, since_seconds, tail_lines):
        query = parse.urlencode({
            "container": container,
            "sinceSeconds": int(since_seconds),
            "tailLines": int(tail_lines),
        })
        status, data = self.api("GET", f"/api/v1/namespaces/{self.namespace}/pods/{url_quote(pod_name)}/log?{query}")
        if status >= 300:
            raise RuntimeError(f"get pod logs failed: {status} {data}")
        return data if isinstance(data, str) else ""

def default_state():
    return {
        "version": 1,
        "accounts": {},
        "ledger": {},
        "history": [],
    }

def load_state(kube, config):
    name = config["state"]["configMapName"]
    obj = kube.get_configmap(name)
    if obj is None:
        obj = kube.create_configmap(name, default_state())
    raw = (obj.get("data") or {}).get(STATE_KEY)
    try:
        state = json.loads(raw) if raw else default_state()
    except Exception:
        state = default_state()
        state["stateLoadWarning"] = "invalid-state-json-reset"
    state.setdefault("version", 1)
    state.setdefault("accounts", {})
    state.setdefault("ledger", {})
    state.setdefault("history", [])
    return obj, state

def http_json(method, url, headers=None, payload=None, timeout=30, max_bytes=65536):
    body = None if payload is None else json.dumps(payload, separators=(",", ":")).encode("utf-8")
    req = request.Request(url, data=body, method=method, headers=headers or {})
    started = time.time()
    try:
        with request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read(max_bytes + 1)
            too_large = len(raw) > max_bytes
            if too_large:
                raw = raw[:max_bytes]
            text = raw.decode("utf-8", errors="replace")
            parsed = None
            try:
                parsed = json.loads(text) if text.strip() else None
            except Exception:
                parsed = None
            app_success = not (isinstance(parsed, dict) and parsed.get("code") not in (None, 0))
            return {
                "ok": 200 <= resp.status < 300 and not too_large and app_success,
                "status": resp.status,
                "json": parsed,
                "text": text,
                "tooLarge": too_large,
                "durationMs": int((time.time() - started) * 1000),
            }
    except error.HTTPError as exc:
        raw = exc.read(max_bytes + 1)
        text = raw.decode("utf-8", errors="replace")
        parsed = None
        try:
            parsed = json.loads(text) if text.strip() else None
        except Exception:
            parsed = None
        return {
            "ok": False,
            "status": exc.code,
            "json": parsed,
            "text": text,
            "tooLarge": len(raw) > max_bytes,
            "durationMs": int((time.time() - started) * 1000),
            "error": str(exc),
        }
    except Exception as exc:
        return {
            "ok": False,
            "status": 0,
            "json": None,
            "text": "",
            "tooLarge": False,
            "durationMs": int((time.time() - started) * 1000),
            "error": str(exc),
        }

def find_token(value):
    if isinstance(value, dict):
        for key in ("access_token", "token"):
            if isinstance(value.get(key), str) and value[key]:
                return value[key]
        for item in value.values():
            found = find_token(item)
            if found:
                return found
    if isinstance(value, list):
        for item in value:
            found = find_token(item)
            if found:
                return found
    return None

def url_quote(value):
    return parse.quote(str(value), safe="")

class Sub2ApiAdmin:
    def __init__(self, config):
        self.base = config["service"]["baseUrl"].rstrip("/")
        self.email = os.environ.get("ADMIN_EMAIL") or config["service"]["adminEmailDefault"]
        self.password = os.environ.get("ADMIN_PASSWORD") or ""
        protected = config.get("protectedManualAccounts") if isinstance(config.get("protectedManualAccounts"), list) else []
        self.protected_manual_accounts = set(str(item) for item in protected if isinstance(item, str) and item)
        self.token = None
        self.accounts_by_name = None

    def login(self):
        if self.token:
            return self.token
        if not self.password:
            raise RuntimeError("ADMIN_PASSWORD is missing")
        resp = http_json("POST", self.base + "/api/v1/auth/login", {"Content-Type": "application/json"}, {"email": self.email, "password": self.password}, timeout=15)
        token = find_token(resp.get("json"))
        if not resp["ok"] or not token:
            raise RuntimeError("admin login failed")
        self.token = token
        return token

    def request(self, method, path, payload=None):
        token = self.login()
        headers = {"Authorization": "Bearer " + token, "Content-Type": "application/json"}
        resp = http_json(method, self.base + path, headers, payload, timeout=20, max_bytes=2 * 1024 * 1024)
        if not resp["ok"]:
            parsed = resp.get("json")
            code = parsed.get("code") if isinstance(parsed, dict) else None
            message = parsed.get("message") if isinstance(parsed, dict) else None
            detail = f"status={resp.get('status')} tooLarge={resp.get('tooLarge')}"
            if code is not None:
                detail += f" code={code}"
            if message:
                detail += f" message={preview(message, 200)}"
            raise RuntimeError(f"admin {method} {path} failed: {detail}")
        parsed = resp.get("json")
        if isinstance(parsed, dict) and parsed.get("code") not in (None, 0):
            raise RuntimeError(f"admin {method} {path} failed: code={parsed.get('code')} message={parsed.get('message')}")
        if isinstance(parsed, dict) and "data" in parsed:
            return parsed["data"]
        return parsed

    def accounts(self):
        if self.accounts_by_name is not None:
            return self.accounts_by_name
        data = self.request("GET", "/api/v1/admin/accounts?page=1&page_size=500&platform=openai&type=apikey&search=unidesk-codex-")
        items = []
        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            for key in ("items", "accounts"):
                if isinstance(data.get(key), list):
                    items = data[key]
                    break
        self.accounts_by_name = {item.get("name"): item for item in items if isinstance(item, dict) and isinstance(item.get("name"), str)}
        return self.accounts_by_name

    def account(self, account_name):
        if self.accounts_by_name is not None and account_name in self.accounts_by_name:
            return self.accounts_by_name[account_name]
        data = self.request("GET", "/api/v1/admin/accounts?page=1&page_size=20&platform=openai&type=apikey&search=" + url_quote(account_name))
        items = data if isinstance(data, list) else []
        if isinstance(data, dict):
            for key in ("items", "accounts"):
                if isinstance(data.get(key), list):
                    items = data[key]
                    break
        for item in items:
            if isinstance(item, dict) and item.get("name") == account_name:
                if self.accounts_by_name is not None:
                    self.accounts_by_name[account_name] = item
                return item
        return None

    def set_schedulable(self, account_name, schedulable):
        if account_name in self.protected_manual_accounts:
            return {
                "accountId": None,
                "previousSchedulable": None,
                "schedulable": None,
                "skipped": True,
                "reason": "protected-manual-account",
            }
        account = self.account(account_name)
        if not account or account.get("id") is None:
            raise RuntimeError(f"account {account_name} not found")
        previous = account.get("schedulable")
        self.request("POST", f"/api/v1/admin/accounts/{account['id']}/schedulable", {"schedulable": bool(schedulable)})
        account["schedulable"] = bool(schedulable)
        if self.accounts_by_name is not None:
            self.accounts_by_name[account_name] = account
        return {"accountId": account.get("id"), "previousSchedulable": previous, "schedulable": bool(schedulable)}

def upstream_base_url(base_url):
    """Return base_url without trailing slashes and guaranteed to end in "/v1"."""
    trimmed = str(base_url).rstrip("/")
    if trimmed.endswith("/v1"):
        return trimmed
    return trimmed + "/v1"

def output_text(parsed):
    """Extract the response text from a parsed response payload.

    A string "output_text" field wins. Otherwise every string "text" found in
    the "content" lists of the "output" items is joined with newlines.
    Returns "" when nothing usable is present.
    """
    if not isinstance(parsed, dict):
        return ""
    direct = parsed.get("output_text")
    if isinstance(direct, str):
        return direct
    output = parsed.get("output")
    if not isinstance(output, list):
        return ""
    pieces = [
        block["text"]
        for item in output
        if isinstance(item, dict) and isinstance(item.get("content"), list)
        for block in item["content"]
        if isinstance(block, dict) and isinstance(block.get("text"), str)
    ]
    return "\n".join(pieces)

def model_dump(value):
    """Return value.model_dump() when that method exists, value itself for a dict, else {}."""
    if hasattr(value, "model_dump"):
        return value.model_dump()
    return value if isinstance(value, dict) else {}

def body_text(value):
    """Render an arbitrary response body as text.

    Strings pass through, bytes are decoded as UTF-8 with replacement
    characters, anything else is JSON-encoded, falling back to str() when
    JSON encoding fails.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    try:
        rendered = json.dumps(value, ensure_ascii=False)
    except Exception:
        rendered = str(value)
    return rendered

def redact_diagnostic(value):
    """Return a copy of value that is safe to store as a diagnostic.

    Dict entries whose key contains a credential-like word are replaced by
    "[redacted]", lists are cut to their first 20 items, strings longer than
    2000 characters are truncated, numbers, booleans and None pass through,
    and every other object is converted with str().
    """
    if value is None or isinstance(value, (int, float, bool)):
        return value
    if isinstance(value, str):
        if len(value) > 2000:
            return value[:2000] + "...[truncated]"
        return value
    if isinstance(value, list):
        return list(map(redact_diagnostic, value[:20]))
    if isinstance(value, dict):
        sensitive = ("key", "token", "secret", "password", "credential", "authorization")
        cleaned = {}
        for key, item in value.items():
            label = str(key)
            lowered = label.lower()
            if any(word in lowered for word in sensitive):
                cleaned[label] = "[redacted]"
            else:
                cleaned[label] = redact_diagnostic(item)
        return cleaned
    return str(value)

def selected_headers(headers):
    """Pick a fixed set of diagnostic response headers as a plain dict of strings.

    Headers that are absent, empty, or whose lookup raises are left out.
    Returns {} when headers is None.
    """
    if headers is None:
        return {}
    wanted = (
        "content-type",
        "x-request-id",
        "x-ratelimit-limit-requests",
        "x-ratelimit-remaining-requests",
        "x-ratelimit-reset-requests",
        "cf-ray",
        "server",
    )
    picked = {}
    for name in wanted:
        try:
            found = headers.get(name)
        except Exception:
            # A failing lookup is treated like a missing header.
            continue
        if found:
            picked[name] = str(found)
    return picked

def openai_error_fields(body):
    """Extract message/type/param/code from an error body.

    The fields are read from the nested "error" object when it is a dict,
    otherwise from the top level of body. Returns {} for a non-dict body.
    """
    if not isinstance(body, dict):
        return {}
    nested = body.get("error")
    source = nested if isinstance(nested, dict) else body
    return {field: source.get(field) for field in ("message", "type", "param", "code")}

def error_details(kind, status, body=None, message=None, headers=None):
    """Assemble the diagnostic record stored for a failed probe request.

    The record holds the error kind and status code, the message as a string
    (None when falsy), the error fields extracted from body, a redacted copy
    of body when it is a dict or list, a hash and a 1000-character preview of
    the body text, and the selected response headers.
    """
    raw = body_text(body)
    details = {"kind": kind, "statusCode": status}
    details["message"] = str(message) if message else None
    details["openaiError"] = openai_error_fields(body)
    if isinstance(body, (dict, list)):
        details["body"] = redact_diagnostic(body)
    else:
        details["body"] = None
    details["bodyHash"] = sha(raw)
    details["bodyPreview"] = preview(raw, 1000)
    details["headers"] = selected_headers(headers)
    return details

def sub2api_style_input(prompt):
    """Wrap prompt as a single user message holding one input_text content block."""
    text_block = {"type": "input_text", "text": prompt}
    user_message = {"role": "user", "content": [text_block]}
    return [user_message]

def sub2api_style_instructions():
    """Return the fixed instructions string sent with every probe request."""
    instructions = "You are Codex, based on GPT-5. You are running as a coding agent in the Codex CLI on a user's computer."
    return instructions

def event_type(event):
    """Return the "type" of a stream event given as a dict or as an object (None when absent)."""
    return event.get("type") if isinstance(event, dict) else getattr(event, "type", None)

def event_delta(event):
    """Return the string "delta" of a stream event (dict or object), or "" when it is missing or not a string."""
    candidate = event.get("delta") if isinstance(event, dict) else getattr(event, "delta", "")
    if isinstance(candidate, str):
        return candidate
    return ""

def event_error_message(event):
    """Return the first non-empty error message carried by a stream event.

    The event is converted with model_dump(); the top-level "error" object is
    checked first, then the "error" object nested under "response".
    Returns None when neither holds a non-empty string message.
    """
    data = model_dump(event)
    if not isinstance(data, dict):
        return None
    response = data.get("response")
    holders = (data, response if isinstance(response, dict) else {})
    for holder in holders:
        error = holder.get("error")
        if not isinstance(error, dict):
            continue
        message = error.get("message")
        if isinstance(message, str) and message:
            return message
    return None

def openai_responses_create(profile, config, marker, prompt):
    """Send one streaming Responses request for a profile and summarise the outcome.

    Parameters:
        profile: dict with "apiKey" and "baseUrl"; "upstreamUserAgent" is optional.
        config: sentinel config; reads "model" and, under "probe",
            "timeoutSeconds", "maxOutputTokens" and "userAgent".
        marker: probe marker, used here only to derive the X-Request-ID header.
        prompt: text sent as the single user message.

    Returns a dict that always carries "ok", "status", "json", "text",
    "tooLarge", "durationMs", "sdk" and "requestShape". A consumed stream adds
    "outputText"; failures add "error" and "errorDetails". Errors raised while
    requesting or streaming are reported in the dict, not raised. "ok" is True
    only when a response.completed / response.done event was seen.
    """
    headers = {
        "User-Agent": profile.get("upstreamUserAgent") or config["probe"].get("userAgent") or "Go-http-client/1.1",
        "X-Request-ID": "unidesk-account-sentinel-" + hashlib.sha256(marker.encode()).hexdigest()[:16],
    }
    client = OpenAI(
        api_key=profile["apiKey"],
        base_url=upstream_base_url(profile["baseUrl"]),
        timeout=float(config["probe"]["timeoutSeconds"]),
        max_retries=0,
    )
    started = time.time()
    stream = None
    try:
        stream = client.responses.create(
            model=config["model"],
            input=sub2api_style_input(prompt),
            instructions=sub2api_style_instructions(),
            stream=True,
            extra_headers=headers,
        )
        deltas = []
        events = []
        seen_completed = False
        # Running size of the collected text, so the limit check does not have
        # to re-join every delta on each event.
        received_chars = 0
        max_chars = max(32, int(config["probe"]["maxOutputTokens"]) * 12)
        for event in stream:
            event_data = model_dump(event)
            etype = event_type(event)
            events.append({"type": etype, "preview": preview(body_text(event_data), 240)})
            if etype == "response.output_text.delta":
                delta = event_delta(event)
                if delta:
                    deltas.append(delta)
                    received_chars += len(delta)
                    if received_chars > max_chars:
                        # Output is far beyond what the probe asked for; stop reading.
                        break
            elif etype in ("response.completed", "response.done"):
                seen_completed = True
                break
            elif etype in ("response.failed", "error"):
                message = event_error_message(event) or "OpenAI response failed"
                raise RuntimeError(message)
        out = "".join(deltas)
        parsed = {"stream": True, "completed": seen_completed, "events": events[-20:], "output_text": out}
        if not seen_completed:
            parsed["streamError"] = "stream ended before response.completed"
        return {
            "ok": seen_completed,
            "status": 200 if seen_completed else 0,
            "json": parsed,
            "outputText": out,
            "text": body_text(parsed),
            "tooLarge": not seen_completed and received_chars > max_chars,
            "durationMs": int((time.time() - started) * 1000),
            "sdk": "openai-python",
            "requestShape": "sub2api-account-test-streaming-responses",
        }
    except APIStatusError as exc:
        status = getattr(exc, "status_code", 0) or 0
        body = getattr(exc, "body", None)
        response = getattr(exc, "response", None)
        return {
            "ok": False,
            "status": status,
            "json": body if isinstance(body, dict) else None,
            "text": body_text(body or response or ""),
            "tooLarge": False,
            "durationMs": int((time.time() - started) * 1000),
            "error": str(exc),
            "errorDetails": error_details("APIStatusError", status, body, str(exc), getattr(response, "headers", None)),
            "sdk": "openai-python",
            "requestShape": "sub2api-account-test-streaming-responses",
        }
    except Exception as exc:
        # Timeouts, connection errors and failures raised while streaming all
        # produce the same status-0 record, labelled with the exception class.
        return {
            "ok": False,
            "status": 0,
            "json": None,
            "text": "",
            "tooLarge": False,
            "durationMs": int((time.time() - started) * 1000),
            "error": str(exc),
            "errorDetails": error_details(exc.__class__.__name__, 0, None, str(exc), None),
            "sdk": "openai-python",
            "requestShape": "sub2api-account-test-streaming-responses",
        }
    finally:
        # Release the HTTP response and the client's connection pool, also when
        # the loop stopped before the stream was exhausted.
        for resource in (stream, client):
            closer = getattr(resource, "close", None)
            if callable(closer):
                try:
                    closer()
                except Exception:
                    # Best effort: a failing close must not replace the probe result.
                    pass

def usage_from(parsed, prompt, out, config):
    """Build the token usage record for one probe.

    Integer "input_tokens", "output_tokens" and "total_tokens" from
    parsed["usage"] are used when present. A missing count is replaced by
    estimate_tokens() of the prompt or output (or by the sum, for the total)
    and flags the record as estimated. The cost is derived from the per-1M
    token prices in config["pricing"].
    """
    reported = parsed.get("usage") if isinstance(parsed, dict) else None
    if not isinstance(reported, dict):
        reported = {}
    estimated = False
    input_tokens = reported.get("input_tokens")
    if not isinstance(input_tokens, int):
        input_tokens, estimated = estimate_tokens(prompt), True
    output_tokens = reported.get("output_tokens")
    if not isinstance(output_tokens, int):
        output_tokens, estimated = estimate_tokens(out), True
    total = reported.get("total_tokens")
    if not isinstance(total, int):
        total, estimated = input_tokens + output_tokens, True
    input_cost = input_tokens * float(config["pricing"]["usdPer1MInputTokens"])
    output_cost = output_tokens * float(config["pricing"]["usdPer1MOutputTokens"])
    cost = (input_cost + output_cost) / 1000000
    return {
        "inputTokens": input_tokens,
        "outputTokens": output_tokens,
        "totalTokens": total,
        "estimated": estimated,
        "estimatedCostUsd": cost,
    }

def probe_account(profile, config, purpose):
    """Probe one upstream account with a fresh random marker and grade the reply.

    A marker is derived from the account name, the current time and a random
    number; the upstream is asked to echo it. The probe is "ok" exactly when
    the marker matched (exact equality of the stripped output, or containment
    when config["marker"]["exact"] is false). Returns the result record with
    hashes, previews (blank on a match), usage and a "failureKind" label.
    """
    prefix = config["marker"]["prefix"]
    seed = profile["accountName"] + str(time.time()) + str(random.random())
    marker = prefix + "_" + hashlib.sha256(seed.encode()).hexdigest()[:10]
    prompt = "Return exactly this marker and no other text: " + marker
    resp = openai_responses_create(profile, config, marker, prompt)
    parsed = resp.get("json")
    streamed = resp.get("outputText")
    out = streamed if isinstance(streamed, str) else output_text(parsed)
    trimmed = out.strip()
    if config["marker"].get("exact", True):
        marker_matched = trimmed == marker
    else:
        marker_matched = marker in trimmed
    usage = usage_from(parsed if isinstance(parsed, dict) else {}, prompt, out or resp.get("text", ""), config)
    status = resp.get("status")
    http_success = isinstance(status, int) and 200 <= status < 300
    # Classify the failure; the first matching rule wins.
    if marker_matched:
        failure_kind = "none"
    elif resp.get("tooLarge"):
        failure_kind = "response-too-large"
    elif not resp["ok"]:
        failure_kind = "transport-or-http-failure"
    elif http_success:
        failure_kind = "success-body-mismatch"
    else:
        failure_kind = "unknown-marker-mismatch"
    # Previews are only kept for failures.
    if marker_matched:
        output_preview = ""
        body_preview = ""
    else:
        output_preview = preview(out or resp.get("text", ""), 160)
        body_preview = preview(resp.get("text", ""), 1000)
    record = {
        "accountName": profile["accountName"],
        "profile": profile.get("profile"),
        "trustUpstream": profile.get("trustUpstream") is True,
        "purpose": purpose,
        "ok": marker_matched,
        "markerMatched": marker_matched,
        "markerHash": sha(marker),
        "httpStatus": resp.get("status"),
        "transportOk": resp["ok"],
        "tooLarge": resp.get("tooLarge"),
        "durationMs": resp.get("durationMs"),
        "outputHash": sha(out),
        "outputPreview": output_preview,
        "responseBodyHash": sha(resp.get("text", "")),
        "responseBodyPreview": body_preview,
        "error": resp.get("error"),
        "errorDetails": resp.get("errorDetails"),
        "usage": usage,
        "mismatch": not marker_matched,
        "transportFailure": not resp["ok"],
        "failureKind": failure_kind,
        "sdk": resp.get("sdk"),
        "requestShape": resp.get("requestShape"),
    }
    return record

def protect_policy(profile):
    """Return the profile's "sentinelProtect" dict when its "enabled" is exactly True, else None."""
    candidate = profile.get("sentinelProtect")
    if isinstance(candidate, dict) and candidate.get("enabled") is True:
        return candidate
    return None

def protect_failure_threshold(policy):
    """Return the policy's "consecutiveFailures" as an int of at least 1 (1 when missing or unparsable)."""
    try:
        configured = int(policy.get("consecutiveFailures") or 1)
    except Exception:
        return 1
    return max(1, configured)

def protect_retry_delay_seconds(policy, retry_index):
    """Return the wait, in seconds, before retry number retry_index.

    Index 0 (the first attempt) and negative indexes wait 0 seconds. Later
    retries grow from "initialRetryDelaySeconds" by "backoffMultiplier" per
    step and are capped at "maxRetryDelaySeconds". Missing or unparsable
    settings fall back to 2 seconds, a multiplier of 2 and a cap equal to the
    initial delay.
    """
    def setting(key, fallback, floor):
        # Integer setting clamped to floor; the fallback covers parse errors.
        try:
            return max(floor, int(policy.get(key) or fallback))
        except Exception:
            return fallback

    initial = setting("initialRetryDelaySeconds", 2, 1)
    maximum = setting("maxRetryDelaySeconds", initial, initial)
    multiplier = setting("backoffMultiplier", 2, 1)
    if retry_index <= 0:
        return 0
    return min(maximum, initial * multiplier ** (retry_index - 1))

def probe_account_with_protection(profile, config, purpose):
    """Probe an account, retrying under its sentinelProtect policy before reporting failure.

    Without an enabled policy this is a single probe_account() call. With one,
    up to "threshold" attempts are made, sleeping the policy's back-off delay
    before each retry. The first attempt whose marker matches is returned with
    decision "pass"; when every attempt fails, the last result is returned with
    decision "fail". In both cases result["sentinelProtect"] lists the attempts
    and result["purpose"] is suffixed to show a recovered or exhausted run.
    """
    policy = protect_policy(profile)
    if policy is None:
        return probe_account(profile, config, purpose)
    threshold = protect_failure_threshold(policy)
    attempts = []
    final = None
    summary_keys = (
        "ok",
        "markerMatched",
        "httpStatus",
        "durationMs",
        "failureKind",
        "outputHash",
        "responseBodyHash",
        "errorDetails",
    )

    def verdict(decision, failure_count, protected):
        return {
            "enabled": True,
            "threshold": threshold,
            "attempts": attempts,
            "failureCount": failure_count,
            "protected": protected,
            "decision": decision,
        }

    for attempt_no in range(1, threshold + 1):
        failures_so_far = attempt_no - 1
        delay = protect_retry_delay_seconds(policy, failures_so_far)
        if delay > 0:
            time.sleep(delay)
        label = purpose if failures_so_far == 0 else purpose + "-protect-retry"
        final = probe_account(profile, config, label)
        entry = {"attempt": attempt_no, "delaySeconds": delay}
        entry.update((key, final.get(key)) for key in summary_keys)
        attempts.append(entry)
        if final.get("markerMatched") is True:
            final["sentinelProtect"] = verdict("pass", failures_so_far, failures_so_far > 0)
            if failures_so_far > 0:
                final["purpose"] = purpose + "-protect-recovered"
            return final
    if final is None:
        # Guard for a loop that never ran; still produce a probe result.
        final = probe_account(profile, config, purpose)
    final["sentinelProtect"] = verdict("fail", len(attempts), False)
    final["purpose"] = purpose + "-protect-exhausted"
    return final

def ledger_for(state, now):
    """Return (day key, that day's usage ledger), creating a zeroed ledger entry when absent."""
    day = day_key(now)
    per_day = state.setdefault("ledger", {})
    if day not in per_day:
        per_day[day] = {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0, "estimatedCostUsd": 0, "requestCount": 0}
    return day, per_day[day]

def account_day(account_state, day):
    """Return the per-account usage counters for day, creating a zeroed entry when absent."""
    daily = account_state.setdefault("daily", {})
    if day not in daily:
        daily[day] = {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0, "estimatedCostUsd": 0, "requestCount": 0}
    return daily[day]

def runtime_temp_unschedulable_until(account):
    """Return the account's temporary-unschedulable deadline (snake_case key first, then camelCase); None for a non-dict."""
    if isinstance(account, dict):
        snake = account.get("temp_unschedulable_until")
        return snake if snake else account.get("tempUnschedulableUntil")
    return None

def runtime_temp_unschedulable_reason(account):
    """Return the account's temporary-unschedulable reason (snake_case key first, then camelCase); None for a non-dict."""
    if isinstance(account, dict):
        snake = account.get("temp_unschedulable_reason")
        return snake if snake else account.get("tempUnschedulableReason")
    return None

def runtime_recovery_due(account_state):
    """Return True only when the stored "runtimeSchedulable" value is exactly False."""
    flag = account_state.get("runtimeSchedulable")
    return flag is False

def sync_runtime_schedulable_state(state, profiles, now, admin):
    """Copy the runtime schedulable status of every managed account into state.

    Calls admin.accounts() once and, for each profile with a non-empty string
    "accountName", records the runtime id, status, schedulable flag and
    temporary-unschedulable details under state["accounts"][name]. A summary
    is stored in state["runtimeSchedulable"].

    Returns a one-element list holding a "runtime-sync" event: with counts on
    success, or with "ok": False and the error text when admin.accounts()
    raised (the exception itself is not propagated).
    """
    accounts_state = state.setdefault("accounts", {})
    synced_at = iso(now)
    # "ok" stays False until every profile has been processed.
    summary = {
        "ok": False,
        "syncedAt": synced_at,
        "managedAccountCount": len(profiles),
        "schedulableAccounts": [],
        "unschedulableAccounts": [],
        "missingAccounts": [],
    }
    try:
        runtime_accounts = admin.accounts()
    except Exception as exc:
        # The listing failed: mark every managed account as unknown and
        # remember the error on each of them.
        error_text = str(exc)
        summary["error"] = error_text
        state["runtimeSchedulable"] = summary
        for profile in profiles:
            name = profile.get("accountName") if isinstance(profile, dict) else None
            if isinstance(name, str) and name:
                account_state = accounts_state.setdefault(name, {})
                account_state["runtimeSchedulable"] = None
                account_state["runtimeSyncedAt"] = synced_at
                account_state["runtimeSyncError"] = error_text
        return [{"type": "runtime-sync", "ok": False, "error": error_text}]
    for profile in profiles:
        name = profile.get("accountName") if isinstance(profile, dict) else None
        if not isinstance(name, str) or not name:
            continue
        account_state = accounts_state.setdefault(name, {})
        account = runtime_accounts.get(name)
        if not isinstance(account, dict):
            # Managed account that the runtime listing does not contain.
            # NOTE(review): the runtimeTempUnschedulableReason* keys written
            # further below are not cleared here - confirm that is intended.
            account_state["runtimeMissing"] = True
            account_state["runtimeAccountId"] = None
            account_state["runtimeStatus"] = None
            account_state["runtimeSchedulable"] = None
            account_state["runtimeTempUnschedulableUntil"] = None
            account_state["runtimeTempUnschedulableSet"] = None
            account_state["runtimeSyncedAt"] = synced_at
            account_state.pop("runtimeSyncError", None)
            summary["missingAccounts"].append(name)
            continue
        temp_until = runtime_temp_unschedulable_until(account)
        temp_reason = runtime_temp_unschedulable_reason(account)
        # Either a deadline or a non-empty reason counts as "temporarily unschedulable set".
        temp_set = temp_until is not None or bool(temp_reason)
        schedulable = account.get("schedulable")
        if not isinstance(schedulable, bool):
            # Anything but a real boolean is stored as unknown.
            schedulable = None
        account_state["runtimeMissing"] = False
        account_state["runtimeAccountId"] = account.get("id")
        account_state["runtimeStatus"] = account.get("status")
        account_state["runtimeSchedulable"] = schedulable
        account_state["runtimeTempUnschedulableUntil"] = temp_until
        account_state["runtimeTempUnschedulableSet"] = temp_set
        account_state["runtimeSyncedAt"] = synced_at
        account_state.pop("runtimeSyncError", None)
        if temp_reason:
            # Only a hash and a short preview of the reason are stored.
            account_state["runtimeTempUnschedulableReasonHash"] = sha(temp_reason)
            account_state["runtimeTempUnschedulableReasonPreview"] = preview(temp_reason, 240)
        else:
            account_state.pop("runtimeTempUnschedulableReasonHash", None)
            account_state.pop("runtimeTempUnschedulableReasonPreview", None)
        # Accounts with an unknown flag appear in neither summary list.
        if schedulable is True:
            summary["schedulableAccounts"].append(name)
        elif schedulable is False:
            summary["unschedulableAccounts"].append({
                "accountName": name,
                "status": account.get("status"),
                "tempUnschedulableSet": temp_set,
            })
    summary["ok"] = True
    state["runtimeSchedulable"] = summary
    return [{
        "type": "runtime-sync",
        "ok": True,
        "schedulableCount": len(summary["schedulableAccounts"]),
        "unschedulableCount": len(summary["unschedulableAccounts"]),
        "missingCount": len(summary["missingAccounts"]),
    }]

def add_usage(state, account_state, now, usage):
    """Add one request's usage to both the global daily ledger and the account's daily counters.

    Token counts are accumulated as ints, the estimated cost is accumulated
    and rounded to 8 decimals, and "requestCount" is incremented by one.
    """
    day, ledger = ledger_for(state, now)
    daily = account_day(account_state, day)
    for bucket in (ledger, daily):
        for field in ("inputTokens", "outputTokens", "totalTokens"):
            bucket[field] = int(bucket.get(field) or 0) + int(usage.get(field) or 0)
        running_cost = float(bucket.get("estimatedCostUsd") or 0) + float(usage.get("estimatedCostUsd") or 0)
        bucket["estimatedCostUsd"] = round(running_cost, 8)
        bucket["requestCount"] = int(bucket.get("requestCount") or 0) + 1

def due_time(account_state):
    """Return when the account should next be probed, or None for "right away".

    An active quarantine is governed by its "until" value. Otherwise an
    account whose runtime flag is False is due immediately, and every other
    account follows its "nextProbeAfter" value.
    """
    quarantine = account_state.get("quarantine")
    in_quarantine = isinstance(quarantine, dict) and quarantine.get("active") is True
    if in_quarantine:
        return parse_iso(quarantine.get("until"))
    return None if runtime_recovery_due(account_state) else parse_iso(account_state.get("nextProbeAfter"))

def choose_due_profiles(profiles, state, config, now):
    """Select every profile whose probe is due at now.

    Returns (due items, selection summary). Each due item carries the profile,
    a purpose ("recovery" for an active quarantine, "runtime-recovery" when
    the runtime flag is False, otherwise "health") and the ISO due time.
    Items are sorted by due time, those without one first. config is accepted
    but not read here.
    """
    _, ledger = ledger_for(state, now)
    accounts = state.setdefault("accounts", {})
    due = []
    for profile in profiles:
        account_state = accounts.setdefault(profile["accountName"], {})
        when = due_time(account_state)
        is_due = when is None or when <= now
        if not is_due:
            continue
        quarantine = account_state.get("quarantine")
        if isinstance(quarantine, dict) and quarantine.get("active") is True:
            purpose = "recovery"
        elif runtime_recovery_due(account_state):
            purpose = "runtime-recovery"
        else:
            purpose = "health"
        due.append({"profile": profile, "purpose": purpose, "dueAt": iso(when) if when else None})
    due.sort(key=lambda item: item["dueAt"] or "")
    summary = {"selected": len(due), "due": len(due), "limit": "all-due", "budgetMode": "record-only", "ledger": ledger}
    return due, summary

def forced_account_names():
    """Return the set of non-blank, comma-separated names in the SENTINEL_ACCOUNT_NAMES environment variable."""
    raw = os.environ.get("SENTINEL_ACCOUNT_NAMES") or ""
    return {piece.strip() for piece in raw.split(",") if piece.strip()}

def choose_forced_profiles(profiles, state, config, now, names):
    """Select the profiles explicitly requested by account name, due or not.

    Parameters:
        profiles: managed profiles, each with an "accountName".
        state: sentinel state; "accounts" entries are created as needed.
        config: accepted for signature parity; not read here.
        now: current time, used to look up the day's ledger.
        names: collection of requested account names.

    Returns (due items, selection summary). Each due item has "dueAt" set to
    "forced" and a purpose of "manual-recovery" (active quarantine),
    "manual-runtime-recovery" (runtime flag is False) or "manual-health".
    The summary lists the requested names and those that matched no profile.
    """
    accounts = state.setdefault("accounts", {})
    # Built once as a set so the "missing" computation below is a plain
    # membership test per requested name.
    found = set()
    due = []
    for profile in profiles:
        name = profile["accountName"]
        if name not in names:
            continue
        account_state = accounts.setdefault(name, {})
        quarantine = account_state.get("quarantine")
        active_quarantine = isinstance(quarantine, dict) and quarantine.get("active") is True
        purpose = "manual-recovery" if active_quarantine else "manual-runtime-recovery" if runtime_recovery_due(account_state) else "manual-health"
        due.append({"profile": profile, "purpose": purpose, "dueAt": "forced"})
        found.add(name)
    missing = sorted(name for name in names if name not in found)
    return due, {"selected": len(due), "due": len(due), "limit": "forced-accounts", "budgetMode": "record-only", "ledger": ledger_for(state, now)[1], "requestedAccounts": sorted(names), "missingAccounts": missing}

def success_max_interval(profile, config):
    """Return the maximum success interval in minutes for a profile.

    Trusted profiles use "trustedSuccessMaxIntervalMinutes", others use
    "untrustedSuccessMaxIntervalMinutes"; a missing or zero value falls back
    to "successMaxIntervalMinutes".
    """
    cadence = config["cadence"]
    trusted = profile.get("trustUpstream") is True
    specific_key = "trustedSuccessMaxIntervalMinutes" if trusted else "untrustedSuccessMaxIntervalMinutes"
    return int(cadence.get(specific_key) or cadence.get("successMaxIntervalMinutes"))

def next_success_interval(account_state, config, profile):
    """Return the next probe interval in minutes after a successful probe.

    The first success of a streak (or a state without a previous interval)
    uses the initial interval. Later successes multiply the previous interval
    by the back-off multiplier, kept between the initial interval and the
    profile's maximum.
    """
    streak = int(account_state.get("successStreak") or 0)
    previous = int(account_state.get("successIntervalMinutes") or 0)
    initial = int(config["cadence"]["successInitialIntervalMinutes"])
    maximum = success_max_interval(profile, config)
    multiplier = int(config["cadence"]["successBackoffMultiplier"])
    if streak <= 0 or previous <= 0:
        return initial
    grown = max(initial, previous * multiplier)
    return min(maximum, grown)

def next_freeze_interval(account_state, config, was_recovery):
    """Return the quarantine length in minutes for a failing account.

    A failed recovery probe with a known previous interval multiplies that
    interval by the freeze back-off multiplier, kept between the initial and
    the maximum TTL. Every other case starts at the initial TTL.
    """
    quarantine = account_state.get("quarantine")
    if not isinstance(quarantine, dict):
        quarantine = {}
    previous = int(quarantine.get("intervalMinutes") or 0)
    freeze = config["freeze"]
    initial = int(freeze["initialTtlMinutes"])
    maximum = int(freeze["maxTtlMinutes"])
    multiplier = int(freeze["backoffMultiplier"])
    if not (was_recovery and previous > 0):
        return initial
    return min(maximum, max(initial, previous * multiplier))

def apply_result(result, state, config, now, admin, profile):
    """Fold one probe result into the sentinel state and act on the account.

    Parameters:
        result: probe result record (reads "accountName", "ok",
            "markerMatched", "usage" and the diagnostic fields copied into
            "lastProbe").
        state: persisted sentinel state; state["accounts"][name] is updated.
        config: sentinel config (reads "actions", "cadence", "freeze", "probe").
        now: current time used for all timestamps and schedules.
        admin: client whose set_schedulable(name, flag) changes the runtime
            account; only called when config["actions"]["enabled"] is truthy.
        profile: the probed profile (used for the trust-dependent cadence).

    A successful probe restores an account that was quarantined or whose
    runtime flag was False, clears a pending quality gate and schedules the
    next probe with success back-off. A probe whose marker did not match
    quarantines the account (freezing it when actions are enabled).

    set_schedulable() may answer with "skipped": True for protected manual
    accounts. In that case nothing changed at runtime, so the action is
    reported with "taken": False, the quarantine is not marked as applied and
    the stored runtime flag is left as it was.

    Returns the action dict, which is also stored under lastProbe["action"].
    Errors raised by set_schedulable() are recorded in the action, not raised.
    """
    name = result["accountName"]
    account_state = state.setdefault("accounts", {}).setdefault(name, {})
    add_usage(state, account_state, now, result.get("usage") or {})
    actions_enabled = bool(config["actions"]["enabled"])
    quarantine = account_state.get("quarantine") if isinstance(account_state.get("quarantine"), dict) else None
    was_recovery = bool(quarantine and quarantine.get("active") is True)
    was_runtime_recovery = runtime_recovery_due(account_state)
    action = {"taken": False, "type": None}

    def runtime_changed(outcome):
        # True unless set_schedulable reported that it skipped the account.
        return not (isinstance(outcome, dict) and outcome.get("skipped") is True)

    if result.get("ok") is True:
        quality_gate = account_state.get("qualityGate") if isinstance(account_state.get("qualityGate"), dict) else None
        if was_recovery or was_runtime_recovery:
            if actions_enabled:
                try:
                    action_type = "restore" if was_recovery else "restore-runtime-unschedulable"
                    outcome = admin.set_schedulable(name, True)
                    restored = runtime_changed(outcome)
                    action = {"taken": restored, "type": action_type, "result": outcome}
                    if restored:
                        account_state["runtimeSchedulable"] = True
                        account_state["runtimeSyncedAt"] = iso(now)
                        account_state.pop("runtimeSyncError", None)
                except Exception as exc:
                    action = {"taken": False, "type": "restore-failed", "error": str(exc)}
            if was_recovery:
                account_state["quarantine"] = {"active": False, "clearedAt": iso(now), "lastApplied": quarantine.get("applied") is True}
            # A recovery restarts the success back-off from the initial interval.
            account_state["successStreak"] = 0
            account_state["successIntervalMinutes"] = 0
        elif isinstance(quarantine, dict) and quarantine.get("active") is not True:
            # NOTE(review): this rewrites an already inactive quarantine on
            # every success, refreshing "clearedAt" and recomputing
            # "lastApplied" from a record that may no longer carry "applied" -
            # confirm this is intended.
            account_state["quarantine"] = {"active": False, "clearedAt": iso(now), "lastApplied": quarantine.get("applied") is True}
        if quality_gate and quality_gate.get("pending") is True:
            account_state["qualityGate"] = {**quality_gate, "pending": False, "clearedAt": iso(now)}
        # Must run after the streak reset above so a recovery starts over.
        interval = next_success_interval(account_state, config, profile)
        account_state["successStreak"] = int(account_state.get("successStreak") or 0) + 1
        account_state["successIntervalMinutes"] = interval
        account_state["successMaxIntervalMinutes"] = success_max_interval(profile, config)
        account_state["nextProbeAfter"] = iso(add_minutes(now, interval, int(config["cadence"]["jitterPercent"])))
        account_state["lastOkAt"] = iso(now)
        account_state["lastStatus"] = "ok"
    else:
        should_freeze = result.get("markerMatched") is not True
        if should_freeze:
            interval = next_freeze_interval(account_state, config, was_recovery)
            until = add_minutes(now, interval, int(config["freeze"]["jitterPercent"]))
            applied = False
            if actions_enabled:
                try:
                    outcome = admin.set_schedulable(name, False)
                    applied = runtime_changed(outcome)
                    action = {"taken": applied, "type": "freeze", "result": outcome}
                    if applied:
                        account_state["runtimeSchedulable"] = False
                        account_state["runtimeSyncedAt"] = iso(now)
                except Exception as exc:
                    action = {"taken": False, "type": "freeze-failed", "error": str(exc)}
            else:
                action = {"taken": False, "type": "would-freeze"}
            account_state["quarantine"] = {
                "active": True,
                "applied": applied,
                "until": iso(until),
                "intervalMinutes": interval,
                "reason": "marker-not-matched",
                "failureKind": result.get("failureKind"),
                "markerHash": result.get("markerHash"),
                "outputHash": result.get("outputHash"),
                "responseBodyHash": result.get("responseBodyHash"),
                "errorDetails": result.get("errorDetails"),
                "lastBadAt": iso(now),
            }
            account_state["nextProbeAfter"] = iso(until)
            account_state["successStreak"] = 0
            account_state["successIntervalMinutes"] = 0
            account_state["successMaxIntervalMinutes"] = success_max_interval(profile, config)
            account_state["lastStatus"] = "quarantined"
        else:
            # Not ok although the marker matched: retry soon without freezing.
            retry = int(config["probe"]["transportRetryMinutes"])
            account_state["nextProbeAfter"] = iso(add_minutes(now, retry, int(config["cadence"]["jitterPercent"])))
            account_state["lastStatus"] = "marker-not-matched-no-freeze"
        account_state["lastFailureAt"] = iso(now)
    account_state["lastProbeAt"] = iso(now)
    account_state["trustUpstream"] = profile.get("trustUpstream") is True
    account_state["lastProbe"] = {
        "ok": result.get("ok"),
        "purpose": result.get("purpose"),
        "trustUpstream": result.get("trustUpstream"),
        "sentinelProtect": result.get("sentinelProtect"),
        "successMaxIntervalMinutes": success_max_interval(profile, config),
        "httpStatus": result.get("httpStatus"),
        "durationMs": result.get("durationMs"),
        "markerMatched": result.get("markerMatched"),
        "transportOk": result.get("transportOk"),
        "outputHash": result.get("outputHash"),
        "outputPreview": result.get("outputPreview"),
        "responseBodyHash": result.get("responseBodyHash"),
        "responseBodyPreview": result.get("responseBodyPreview"),
        "error": result.get("error"),
        "errorDetails": result.get("errorDetails"),
        "usage": result.get("usage"),
        "failureKind": result.get("failureKind"),
        "sdk": result.get("sdk"),
        "requestShape": result.get("requestShape"),
        "action": action,
    }
    return action

def log_line_payload(line):
    """Split a log line into (timestamp, message, JSON payload).

    Everything from the first "{" onward is parsed as JSON; the text before it
    is treated as tab-separated columns, with the timestamp in the first
    column and the message in the fourth. Returns (None, "", None) when the
    line has no "{", and a None payload when the JSON does not parse.
    """
    brace = line.find("{")
    if brace < 0:
        return None, "", None
    columns = line[:brace].rstrip("\t ").split("\t")
    ts = columns[0] if columns else ""
    message = columns[3] if len(columns) >= 4 else ""
    try:
        payload = json.loads(line[brace:])
    except Exception:
        payload = None
    return ts, message, payload

def gateway_monitor_paths(config):
    """Return the set of request paths watched by the gateway failure monitor.

    Uses the non-empty string entries of config["gatewayFailureMonitor"]["paths"].
    When that list is missing, empty, or contains no usable entry, the default
    Responses paths are returned, so a malformed list cannot silently switch
    the monitor off.
    """
    cfg = config.get("gatewayFailureMonitor") if isinstance(config.get("gatewayFailureMonitor"), dict) else {}
    paths = cfg.get("paths")
    if isinstance(paths, list):
        usable = {item for item in paths if isinstance(item, str) and item}
        if usable:
            return usable
    return {"/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"}

def gateway_failure_kind(message, payload, config):
    """Classify one parsed gateway log entry as a failure kind, or None.

    Only dict payloads whose "path" is one of the monitored paths and whose
    "account_id" is present are considered. Compact failures and compact
    upstream failovers are recognised from the log message plus a 5xx
    status; everything else must be an "openai.forward_failed" entry and is
    classified from its error text, first match wins.
    """
    if not isinstance(payload, dict):
        return None
    path = payload.get("path")
    if path not in gateway_monitor_paths(config) or payload.get("account_id") is None:
        return None

    def is_5xx(value):
        return isinstance(value, int) and value >= 500

    if "codex.remote_compact.failed" in message:
        return "gateway-compact-final-failure" if is_5xx(payload.get("status_code")) else None

    on_compact_path = path in ("/responses/compact", "/v1/responses/compact")
    if on_compact_path and "openai.upstream_failover_switching" in message:
        return "gateway-compact-upstream-failover" if is_5xx(payload.get("upstream_status")) else None

    if "openai.forward_failed" not in message:
        return None

    error_text = str(payload.get("error") or "").lower()

    def mentions(tokens):
        return any(token in error_text for token in tokens)

    # A response that was already (partly) written, or a broken stream, is a
    # stream-forward failure regardless of the other error tokens.
    response_already_written = (
        payload.get("fallback_error_response_written") is True
        or payload.get("upstream_error_response_already_written") is True
    )
    stream_tokens = (
        "stream usage incomplete",
        "missing terminal event",
        "stream read error",
        "stream data interval timeout",
    )
    if response_already_written or mentions(stream_tokens):
        return "gateway-stream-forward-failure"

    # Remaining kinds are checked in this order; the first match wins.
    ordered_kinds = (
        ("gateway-session-affinity-failure", (
            "encrypted content could not be decrypted",
            "could not be verified",
            "invalid_encrypted_content",
        )),
        ("gateway-final-compatibility-failure", (
            "bad_response_status_code",
            "model_not_found",
            "no available channel for model",
            "unsupported model",
            "not support",
            "not supported",
            "payload too large",
            "request too large",
            "context length",
            "context window",
            "maximum context",
        )),
        ("gateway-final-transient-failure", (
            "upstream error: 500",
            "upstream error: 502",
            "upstream error: 503",
            "upstream error: 504",
            "upstream error: 524",
            "gateway timeout",
            "bad gateway",
            "upstream request failed",
            "unknown error",
            "context deadline exceeded",
            "context canceled",
        )),
    )
    for kind, tokens in ordered_kinds:
        if mentions(tokens):
            return kind
    return None

def gateway_failure_is_observe_only(failure_kind):
    """Return True for failure kinds that are only recorded, never frozen on."""
    observe_only_kinds = {
        "gateway-session-affinity-failure",
        "gateway-compact-final-failure",
        "gateway-compact-upstream-failover",
    }
    return failure_kind in observe_only_kinds

def gateway_failure_item(ts, pod_name, payload, failure_kind):
    """Build the normalized failure record for one classified log entry.

    The request id falls back to a hash of the whole payload when the entry
    carries none, and the account id is coerced to int (None when that is
    not possible).
    """
    request_id = payload.get("request_id")
    if not request_id:
        request_id = sha(json.dumps(payload, sort_keys=True, ensure_ascii=False))
    raw_account_id = payload.get("account_id")
    try:
        account_id = int(raw_account_id)
    except Exception:
        account_id = None
    item = {
        "at": ts,
        "pod": pod_name,
        "requestId": request_id,
        "clientRequestId": payload.get("client_request_id"),
        "accountId": account_id,
        "failureKind": failure_kind,
        "path": payload.get("path"),
        "errorPreview": preview(payload.get("error"), 240),
    }
    # Flags are normalized to strict booleans.
    for item_key, payload_key in (
        ("fallbackErrorResponseWritten", "fallback_error_response_written"),
        ("upstreamErrorResponseAlreadyWritten", "upstream_error_response_already_written"),
    ):
        item[item_key] = payload.get(payload_key) is True
    # Remaining fields are copied through unchanged.
    for item_key, payload_key in (
        ("bodyBytes", "body_bytes"),
        ("latencyMs", "latency_ms"),
        ("statusCode", "status_code"),
        ("upstreamStatus", "upstream_status"),
    ):
        item[item_key] = payload.get(payload_key)
    return item

def trim_gateway_seen(monitor_state, now, lookback_seconds):
    """Drop expired entries from monitor_state["seenRequestIds"] and return it.

    The mapping is created (or replaced when it is not a dict) in place.
    Entries are kept for four times the lookback window, but never less than
    one hour; entries whose timestamp cannot be parsed are dropped as well.
    """
    seen = monitor_state.get("seenRequestIds")
    if not isinstance(seen, dict):
        seen = {}
        monitor_state["seenRequestIds"] = seen
    retention_seconds = max(int(lookback_seconds) * 4, 3600)
    cutoff = now - timedelta(seconds=retention_seconds)

    def is_stale(seen_at):
        parsed = parse_iso(seen_at)
        return parsed is None or parsed < cutoff

    stale_ids = [request_id for request_id, seen_at in seen.items() if is_stale(seen_at)]
    for request_id in stale_ids:
        del seen[request_id]
    return seen

def gateway_failure_account_map(admin):
    """Map numeric account ids to account names as reported by admin.accounts().

    Accounts whose "id" cannot be converted to int are left out.
    """
    def numeric_id(account):
        try:
            return int(account.get("id"))
        except Exception:
            return None

    id_name_pairs = ((numeric_id(account), name) for name, account in admin.accounts().items())
    return {account_id: name for account_id, name in id_name_pairs if account_id is not None}

def next_gateway_freeze_interval(account_state, config):
    """Compute the next gateway-failure freeze length in minutes.

    Starts at gatewayFailureMonitor.initialTtlMinutes (default 5) and, when
    the account already has a gateway backoff interval, multiplies it by
    backoffMultiplier (default 2), bounded below by the initial value and
    above by maxTtlMinutes (default 30). The interval of a quarantine whose
    reason is "gateway-forward-failure" also counts as a previous interval.
    """
    monitor_cfg = config.get("gatewayFailureMonitor")
    if not isinstance(monitor_cfg, dict):
        monitor_cfg = {}

    def setting(key, fallback):
        return int(monitor_cfg.get(key) or fallback)

    floor = setting("initialTtlMinutes", 5)
    ceiling = setting("maxTtlMinutes", 30)
    factor = setting("backoffMultiplier", 2)
    last_interval = int(account_state.get("gatewayFailureBackoffIntervalMinutes") or 0)
    quarantine = account_state.get("quarantine")
    if isinstance(quarantine, dict) and quarantine.get("reason") == "gateway-forward-failure":
        last_interval = max(last_interval, int(quarantine.get("intervalMinutes") or 0))
    if last_interval <= 0:
        return floor
    return min(ceiling, max(floor, last_interval * factor))

def apply_gateway_failure(account_name, failures, state, config, now, admin, profile):
    """Handle the freeze-worthy gateway failures observed for one account.

    When protect_policy(profile) returns a policy, a confirmation probe is
    run first; if that probe matched the marker, the failure is only
    recorded and the account is not frozen. Otherwise the account is put
    into quarantine for the next gateway backoff interval, and, when
    config["actions"]["enabled"] is truthy, made unschedulable through
    admin.set_schedulable(account_name, False).

    Args:
        account_name: key of the account inside state["accounts"].
        failures: non-empty list of failure items; failures[0] supplies
            "firstAt" and failures[-1] is treated as the latest failure.
        state: mutable sentinel state; state["accounts"][account_name] is
            created when missing and updated in place.
        config: sentinel configuration (reads "cadence", "freeze" and
            "actions" here, and is passed on to helpers).
        now: current time, used for every timestamp written.
        admin: object exposing set_schedulable(name, schedulable).
        profile: profile dict of the account.

    Returns:
        The action dict: "protect-confirm-pass", "would-freeze", "freeze"
        or "freeze-failed" in its "type" field.
    """
    latest = failures[-1]
    account_state = state.setdefault("accounts", {}).setdefault(account_name, {})
    policy = protect_policy(profile)
    protected_probe = None
    if policy is not None:
        # Protected accounts get a confirmation probe before any freeze.
        protected_probe = probe_account_with_protection(profile, config, "gateway-failure-confirm")
        protected_probe["sourceGatewayFailure"] = {
            "requestId": latest.get("requestId"),
            "clientRequestId": latest.get("clientRequestId"),
            "failureKind": latest.get("failureKind"),
            "path": latest.get("path"),
            "countInRun": len(failures),
        }
        # The confirmation probe is recorded like a regular probe result.
        account_state["lastProbeAt"] = iso(now)
        account_state["lastProbe"] = {
            "ok": protected_probe.get("ok"),
            "purpose": protected_probe.get("purpose"),
            "trustUpstream": protected_probe.get("trustUpstream"),
            "sentinelProtect": protected_probe.get("sentinelProtect"),
            "successMaxIntervalMinutes": success_max_interval(profile, config),
            "httpStatus": protected_probe.get("httpStatus"),
            "durationMs": protected_probe.get("durationMs"),
            "markerMatched": protected_probe.get("markerMatched"),
            "transportOk": protected_probe.get("transportOk"),
            "outputHash": protected_probe.get("outputHash"),
            "outputPreview": protected_probe.get("outputPreview"),
            "responseBodyHash": protected_probe.get("responseBodyHash"),
            "responseBodyPreview": protected_probe.get("responseBodyPreview"),
            "error": protected_probe.get("error"),
            "errorDetails": protected_probe.get("errorDetails"),
            "usage": protected_probe.get("usage"),
            "failureKind": protected_probe.get("failureKind"),
            "sdk": protected_probe.get("sdk"),
            "requestShape": protected_probe.get("requestShape"),
            "action": {"taken": False, "type": "protect-confirm-pass" if protected_probe.get("markerMatched") is True else "protect-confirm-fail"},
            "sourceGatewayFailure": protected_probe.get("sourceGatewayFailure"),
        }
        add_usage(state, account_state, now, protected_probe.get("usage") or {})
        account_state["sentinelProtect"] = protected_probe.get("sentinelProtect")
        account_state["trustUpstream"] = profile.get("trustUpstream") is True
        account_state["successMaxIntervalMinutes"] = success_max_interval(profile, config)
        if protected_probe.get("markerMatched") is True:
            # Probe passed: treat it as a successful probe, remember the
            # gateway failure, and return without freezing the account.
            interval = next_success_interval(account_state, config, profile)
            account_state["successStreak"] = int(account_state.get("successStreak") or 0) + 1
            account_state["successIntervalMinutes"] = interval
            account_state["nextProbeAfter"] = iso(add_minutes(now, interval, int(config["cadence"]["jitterPercent"])))
            account_state["lastOkAt"] = iso(now)
            account_state["lastStatus"] = "gateway-failure-protect-confirmed-ok"
            account_state["lastGatewayFailureAt"] = iso(now)
            account_state["lastGatewayFailure"] = {
                "accountName": account_name,
                "accountId": latest.get("accountId"),
                "requestId": latest.get("requestId"),
                "clientRequestId": latest.get("clientRequestId"),
                "failureKind": latest.get("failureKind"),
                "path": latest.get("path"),
                "errorPreview": latest.get("errorPreview"),
                "countInRun": len(failures),
                "firstAt": failures[0].get("at"),
                "lastAt": latest.get("at"),
                "action": {"taken": False, "type": "protect-confirm-pass"},
                "sentinelProtect": protected_probe.get("sentinelProtect"),
            }
            return {"taken": False, "type": "protect-confirm-pass", "sentinelProtect": protected_probe.get("sentinelProtect")}
    # Reached when there is no protect policy or the confirmation probe did
    # not match the marker: quarantine the account.
    interval = next_gateway_freeze_interval(account_state, config)
    until = add_minutes(now, interval, int(config["freeze"]["jitterPercent"]))
    actions_enabled = bool(config["actions"]["enabled"])
    applied = False
    # Default outcome, kept as-is when actions are disabled.
    action = {"taken": False, "type": "would-freeze"}
    if actions_enabled:
        try:
            action = {"taken": True, "type": "freeze", "result": admin.set_schedulable(account_name, False)}
            applied = True
        except Exception as exc:
            # A failed freeze is reported in the action; the quarantine below
            # is still recorded with applied=False.
            action = {"taken": False, "type": "freeze-failed", "error": str(exc)}
    account_state["quarantine"] = {
        "active": True,
        "applied": applied,
        "until": iso(until),
        "intervalMinutes": interval,
        "reason": "gateway-forward-failure",
        "failureKind": latest.get("failureKind") or "gateway-forward-failure",
        "errorDetails": {
            "kind": "Sub2APIGatewayForwardFailure",
            "requestId": latest.get("requestId"),
            "clientRequestId": latest.get("clientRequestId"),
            "failureKind": latest.get("failureKind"),
            "path": latest.get("path"),
            "errorPreview": latest.get("errorPreview"),
            "fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"),
            "upstreamErrorResponseAlreadyWritten": latest.get("upstreamErrorResponseAlreadyWritten"),
            "bodyBytes": latest.get("bodyBytes"),
            "latencyMs": latest.get("latencyMs"),
            "countInRun": len(failures),
        },
        "lastBadAt": iso(now),
        "sentinelProtect": protected_probe.get("sentinelProtect") if isinstance(protected_probe, dict) else None,
    }
    # No probe before the quarantine ends; the success cadence is reset.
    account_state["nextProbeAfter"] = iso(until)
    account_state["successStreak"] = 0
    account_state["successIntervalMinutes"] = 0
    account_state["successMaxIntervalMinutes"] = success_max_interval(profile, config)
    account_state["lastStatus"] = "quarantined"
    account_state["lastFailureAt"] = iso(now)
    account_state["lastGatewayFailureAt"] = iso(now)
    # Stored so that next_gateway_freeze_interval can back off from it.
    account_state["gatewayFailureBackoffIntervalMinutes"] = interval
    account_state["lastGatewayFailure"] = {
        "accountName": account_name,
        "accountId": latest.get("accountId"),
        "requestId": latest.get("requestId"),
        "clientRequestId": latest.get("clientRequestId"),
        "failureKind": latest.get("failureKind"),
        "path": latest.get("path"),
        "errorPreview": latest.get("errorPreview"),
        "fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"),
        "upstreamErrorResponseAlreadyWritten": latest.get("upstreamErrorResponseAlreadyWritten"),
        "bodyBytes": latest.get("bodyBytes"),
        "latencyMs": latest.get("latencyMs"),
        "countInRun": len(failures),
        "firstAt": failures[0].get("at"),
        "lastAt": latest.get("at"),
        "intervalMinutes": interval,
        "freezeUntil": iso(until),
        "action": action,
        "sentinelProtect": protected_probe.get("sentinelProtect") if isinstance(protected_probe, dict) else None,
    }
    return action

def record_gateway_observation(account_name, failures, state, now):
    """Record observe-only gateway failures for an account without acting.

    Stores the latest failure (plus the count and time span of this run)
    under state["accounts"][account_name] and returns the no-op action.
    """
    def observe_action():
        return {"taken": False, "type": "observe-session-affinity-failure"}

    newest = failures[-1]
    account_state = state.setdefault("accounts", {}).setdefault(account_name, {})
    account_state["lastGatewayAffinityFailureAt"] = iso(now)
    record = {"accountName": account_name}
    # These fields are copied from the newest failure under the same key.
    for field in ("accountId", "requestId", "clientRequestId", "failureKind", "path", "errorPreview", "bodyBytes", "latencyMs"):
        record[field] = newest.get(field)
    record["countInRun"] = len(failures)
    record["firstAt"] = failures[0].get("at")
    record["lastAt"] = newest.get("at")
    record["action"] = observe_action()
    account_state["lastGatewayAffinityFailure"] = record
    return observe_action()

def run_gateway_failure_monitor(state, config, now, kube, admin, profiles):
    """Scan recent sub2api pod logs for gateway failures and act on them.

    Does nothing (and returns a disabled summary) unless
    config["gatewayFailureMonitor"]["enabled"] is True. Otherwise it reads
    the logs of every pod selected by "app.kubernetes.io/name=sub2api" whose
    phase is missing or "Running", classifies each line, drops request ids
    already seen in earlier runs, groups the new failures per managed
    account, and then freezes (apply_gateway_failure) or merely records
    (record_gateway_observation) them.

    Args:
        state: mutable sentinel state; state["gatewayFailureMonitor"] and the
            per-account entries are updated in place.
        config: sentinel configuration.
        now: current time used for all timestamps.
        kube: object exposing list_pods(selector) and
            pod_logs(pod_name, container, lookback_seconds, tail_lines).
        admin: admin client handed to the account lookup and freeze helpers.
        profiles: list of profile dicts; only accounts listed here are acted on.

    Returns:
        A summary dict of the run (counts, last actions, skipped items and
        log read errors).
    """
    def pod_section(pod, key):
        # Pod objects may lack "metadata"/"status" or carry null there; an
        # empty dict keeps the lookups below from raising.
        section = pod.get(key) if isinstance(pod, dict) else None
        return section if isinstance(section, dict) else {}

    def action_summary(account_name, group, action):
        # One summary entry per (account, failure group), based on the
        # newest failure of the group.
        newest = group[-1]
        return {
            "accountName": account_name,
            "accountId": newest.get("accountId"),
            "failureCount": len(group),
            "requestId": newest.get("requestId"),
            "failureKind": newest.get("failureKind"),
            "path": newest.get("path"),
            "errorPreview": newest.get("errorPreview"),
            "taken": action.get("taken"),
            "type": action.get("type"),
            "error": action.get("error"),
        }

    cfg = config.get("gatewayFailureMonitor") if isinstance(config.get("gatewayFailureMonitor"), dict) else {}
    if cfg.get("enabled") is not True:
        return {"enabled": False, "scanned": 0, "newFailures": 0, "actions": []}
    lookback_seconds = int(cfg.get("lookbackSeconds") or 900)
    tail_lines = int(cfg.get("tailLines") or 4000)
    monitor_state = state.setdefault("gatewayFailureMonitor", {})
    if not isinstance(monitor_state, dict):
        monitor_state = {}
        state["gatewayFailureMonitor"] = monitor_state
    seen = trim_gateway_seen(monitor_state, now, lookback_seconds)
    pods = kube.list_pods("app.kubernetes.io/name=sub2api")
    candidates = []
    log_errors = []
    for pod in pods:
        pod_name = pod_section(pod, "metadata").get("name")
        if not isinstance(pod_name, str) or not pod_name:
            continue
        if pod_section(pod, "status").get("phase") not in (None, "Running"):
            continue
        try:
            logs = kube.pod_logs(pod_name, "sub2api", lookback_seconds, tail_lines)
        except Exception as exc:
            # A pod whose logs cannot be read is reported, not fatal.
            log_errors.append({"pod": pod_name, "error": str(exc)})
            continue
        for line in str(logs).splitlines():
            ts, message, payload = log_line_payload(line)
            failure_kind = gateway_failure_kind(message, payload, config)
            if failure_kind is not None:
                candidates.append(gateway_failure_item(ts, pod_name, payload, failure_kind))
    # The account list is only fetched when there is something to resolve.
    by_id = gateway_failure_account_map(admin) if candidates else {}
    profile_by_name = {item.get("accountName"): item for item in profiles if isinstance(item, dict) and isinstance(item.get("accountName"), str)}
    by_account = {}
    skipped = []
    new_failures = []
    for item in candidates:
        request_id = item.get("requestId")
        if not isinstance(request_id, str) or not request_id:
            request_id = sha(json.dumps(item, sort_keys=True, ensure_ascii=False))
            item["requestId"] = request_id
        if request_id in seen:
            continue
        # Marked as seen before acting, so each request is handled once.
        seen[request_id] = iso(now)
        account_name = by_id.get(item.get("accountId"))
        if not account_name:
            skipped.append({"requestId": request_id, "accountId": item.get("accountId"), "reason": "account-id-not-managed"})
            continue
        if account_name not in profile_by_name:
            skipped.append({"requestId": request_id, "accountId": item.get("accountId"), "accountName": account_name, "reason": "account-not-in-profiles"})
            continue
        item["accountName"] = account_name
        by_account.setdefault(account_name, []).append(item)
        new_failures.append(item)
    actions = []
    for account_name, failures in sorted(by_account.items()):
        failures.sort(key=lambda item: item.get("at") or "")
        profile = profile_by_name[account_name]
        freeze_failures = [item for item in failures if not gateway_failure_is_observe_only(item.get("failureKind"))]
        observed_failures = [item for item in failures if gateway_failure_is_observe_only(item.get("failureKind"))]
        if freeze_failures:
            action = apply_gateway_failure(account_name, freeze_failures, state, config, now, admin, profile)
            actions.append(action_summary(account_name, freeze_failures, action))
        if observed_failures:
            action = record_gateway_observation(account_name, observed_failures, state, now)
            actions.append(action_summary(account_name, observed_failures, action))
    monitor_state["lastRunAt"] = iso(now)
    monitor_state["lastScannedPods"] = [pod_section(pod, "metadata").get("name") for pod in pods if isinstance(pod, dict)]
    monitor_state["lastCandidateCount"] = len(candidates)
    monitor_state["lastNewFailureCount"] = len(new_failures)
    monitor_state["lastActionCount"] = len(actions)
    monitor_state["lastFailures"] = new_failures[-20:]
    monitor_state["lastSkipped"] = skipped[-20:]
    monitor_state["lastLogErrors"] = log_errors[-10:]
    return {
        "enabled": True,
        "lookbackSeconds": lookback_seconds,
        "tailLines": tail_lines,
        "scannedPods": len(pods),
        "candidates": len(candidates),
        "newFailures": len(new_failures),
        "actionsTaken": sum(1 for item in actions if item.get("taken") is True),
        "actions": actions[-20:],
        "skipped": skipped[-20:],
        "logErrors": log_errors[-10:],
    }

def reconcile_active_quarantines(state, config, now, admin):
    """Re-apply the freeze for every account with an unexpired quarantine.

    Returns an empty list when config["actions"]["enabled"] is falsy.
    Otherwise each account whose quarantine is active and not yet past its
    "until" time is set unschedulable again; a quarantine that was never
    applied is additionally marked as applied on success. One outcome dict
    per attempted account is returned.
    """
    if not config["actions"]["enabled"]:
        return []
    outcomes = []
    for name, account_state in state.setdefault("accounts", {}).items():
        quarantine = account_state.get("quarantine")
        if not (isinstance(quarantine, dict) and quarantine.get("active") is True):
            continue
        expires_at = parse_iso(quarantine.get("until"))
        if expires_at is not None and expires_at <= now:
            continue
        pending = quarantine.get("applied") is not True
        outcome = {"accountName": name, "type": "apply-pending-freeze" if pending else "reassert-freeze"}
        try:
            admin.set_schedulable(name, False)
            if pending:
                quarantine["applied"] = True
                quarantine["appliedAt"] = iso(now)
            outcome["ok"] = True
        except Exception as exc:
            outcome["ok"] = False
            outcome["error"] = str(exc)
        outcomes.append(outcome)
    return outcomes

def main():
    """Run one sentinel pass: reconcile, monitor the gateway, probe, persist.

    Loads configuration, profiles and the persisted state, re-applies
    pending quarantines, runs the gateway failure monitor (skipped when
    accounts were forced manually), probes the selected profiles in
    parallel, appends a run summary to the state history, writes the state
    back through kube.update_configmap_state and prints a JSON report.
    """
    now = utc_now()
    config = load_json(CONFIG_PATH)
    profiles = load_json(PROFILES_PATH).get("profiles") or []
    namespace = os.environ.get("POD_NAMESPACE") or "platform-infra"
    kube = KubeClient(namespace)
    state_obj, state = load_state(kube, config)
    admin = Sub2ApiAdmin(config)
    runtime_sync = sync_runtime_schedulable_state(state, profiles, now, admin)
    # Both helpers contribute entries to one combined reconcile log.
    reconcile = runtime_sync + reconcile_active_quarantines(state, config, now, admin)
    forced_names = forced_account_names()
    # A forced manual probe run skips the gateway log monitor entirely.
    gateway_monitor = {"enabled": False, "skipped": "forced-manual-probe"} if forced_names else run_gateway_failure_monitor(state, config, now, kube, admin, profiles)
    if forced_names:
        due, selection = choose_forced_profiles(profiles, state, config, now, forced_names)
    else:
        due, selection = choose_due_profiles(profiles, state, config, now)
    results = []
    actions = []
    # Forced probes run even when the monitor is disabled in the config.
    if (config["monitor"]["enabled"] or forced_names) and due:
        # One worker per due profile, so all probes run concurrently.
        with ThreadPoolExecutor(max_workers=max(1, len(due))) as executor:
            futures = {executor.submit(probe_account_with_protection, item["profile"], config, item["purpose"]): item["profile"] for item in due}
            # Results are applied in completion order, on this thread.
            for future in as_completed(futures):
                result = future.result()
                results.append(result)
                profile = futures[future]
                actions.append({"accountName": result["accountName"], **apply_result(result, state, config, now, admin, profile)})
    history = state.setdefault("history", [])
    run_summary = {
        "at": iso(now),
        "monitorEnabled": bool(config["monitor"]["enabled"]),
        "actionsEnabled": bool(config["actions"]["enabled"]),
        "profileCount": len(profiles),
        "selected": len(due),
        "okCount": sum(1 for item in results if item.get("ok") is True),
        # NOTE(review): mismatchCount and markerMismatchCount are computed by
        # the same expression; presumably one is kept for compatibility.
        "mismatchCount": sum(1 for item in results if item.get("markerMatched") is not True),
        "markerMismatchCount": sum(1 for item in results if item.get("markerMatched") is not True),
        "transportFailureCount": sum(1 for item in results if item.get("transportFailure") is True),
        "actionsTaken": sum(1 for item in actions if item.get("taken") is True),
        "gatewayFailureMonitor": {
            "enabled": gateway_monitor.get("enabled") is True,
            "newFailures": gateway_monitor.get("newFailures", 0),
            "actionsTaken": gateway_monitor.get("actionsTaken", 0),
            "skipped": gateway_monitor.get("skipped"),
            "logErrors": gateway_monitor.get("logErrors"),
        },
        "runtimeSchedulable": state.get("runtimeSchedulable"),
        "selection": selection,
        "reconcile": reconcile[-20:],
    }
    history.append(run_summary)
    # Keep only the newest historyLimit entries.
    # NOTE(review): a limit of 0 makes the slice empty, so nothing would be
    # trimmed; confirm that config validation rejects 0.
    del history[:-int(config["state"]["historyLimit"])]
    state["lastRun"] = run_summary
    kube.update_configmap_state(state_obj, state)
    print(json.dumps({
        "ok": True,
        "summary": run_summary,
        "results": [{
            "accountName": item.get("accountName"),
            "purpose": item.get("purpose"),
            "trustUpstream": item.get("trustUpstream"),
            "sentinelProtect": item.get("sentinelProtect"),
            "ok": item.get("ok"),
            "markerMatched": item.get("markerMatched"),
            "httpStatus": item.get("httpStatus"),
            "durationMs": item.get("durationMs"),
            "usage": item.get("usage"),
            "outputHash": item.get("outputHash"),
            "outputPreview": item.get("outputPreview"),
            "responseBodyHash": item.get("responseBodyHash"),
            "responseBodyPreview": item.get("responseBodyPreview"),
            "error": item.get("error"),
            "errorDetails": item.get("errorDetails"),
            "failureKind": item.get("failureKind"),
            "sdk": item.get("sdk"),
            "requestShape": item.get("requestShape"),
        } for item in results],
        "actions": actions,
        "gatewayFailureMonitor": gateway_monitor,
        "valuesPrinted": False,
    }, ensure_ascii=False))

if __name__ == "__main__":
    # Report any failure as one JSON line on stdout, then re-raise the
    # original exception.
    try:
        main()
    except Exception as exc:
        failure_report = {
            "ok": False,
            "error": str(exc),
            # Only the last 4000 characters of the traceback are reported.
            "traceback": traceback.format_exc()[-4000:],
            "valuesPrinted": False,
        }
        print(json.dumps(failure_report, ensure_ascii=False))
        raise
`;
}

/**
 * Prefixes every line of `value` with `spaces` space characters.
 *
 * Lines are separated on `"\n"` only, and empty lines are indented too.
 */
function indentBlock(value: string, spaces: number): string {
  const pad = " ".repeat(spaces);
  const indented: string[] = [];
  for (const line of value.split("\n")) {
    indented.push(pad + line);
  }
  return indented.join("\n");
}

/**
 * Reads an own property of `value`.
 *
 * @returns the property value, or `undefined` when `key` is not an own
 * property (inherited properties are ignored).
 */
function valueAt(value: Record<string, unknown>, key: string): unknown {
  if (!Object.prototype.hasOwnProperty.call(value, key)) {
    return undefined;
  }
  return value[key];
}

/**
 * Returns `value` when it is a plain record.
 *
 * @throws Error naming `key` when `value` is not a record according to
 * `isRecord`.
 */
function readRequiredRecord(value: unknown, key: string): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  throw new Error(`${key} must be a YAML object`);
}

/**
 * Reads a required boolean, also accepting the strings `"true"` and
 * `"false"` (surrounding whitespace is ignored).
 *
 * @throws Error naming `key` when the value is missing or not a boolean.
 */
function readRequiredBoolean(value: unknown, key: string): boolean {
  if (value == null) {
    throw new Error(`${key} is required`);
  }
  if (typeof value === "boolean") {
    return value;
  }
  if (typeof value === "string") {
    const text = value.trim();
    if (text === "true") return true;
    if (text === "false") return false;
  }
  throw new Error(`${key} must be a boolean`);
}

/**
 * Reads a required string made of lowercase letters, digits and inner
 * hyphens, starting and ending with a letter or digit.
 *
 * @throws Error naming `key` when the value is missing or malformed.
 */
function readRequiredDnsName(value: unknown, key: string): string {
  const name = readRequiredString(value, key);
  const dnsLabel = /^[a-z0-9]([-a-z0-9]*[a-z0-9])?$/u;
  if (dnsLabel.test(name)) {
    return name;
  }
  throw new Error(`${key} must be a Kubernetes DNS label`);
}

/**
 * Reads a required model name consisting only of letters, digits and the
 * characters `.`, `_`, `:` and `-`.
 *
 * @throws Error naming `key` when the value is missing or malformed.
 */
function readRequiredModelName(value: unknown, key: string): string {
  const model = readRequiredString(value, key);
  const allowed = /^[A-Za-z0-9._:-]+$/u;
  if (allowed.test(model)) {
    return model;
  }
  throw new Error(`${key} has an unsupported model name`);
}

/**
 * Reads the required endpoint name; `"responses"` is the only accepted value.
 *
 * @throws Error naming `key` when the value is missing or anything else.
 */
function readRequiredEndpoint(value: unknown, key: string): "responses" {
  const endpoint = readRequiredString(value, key);
  if (endpoint === "responses") {
    return endpoint;
  }
  throw new Error(`${key} must be responses`);
}

/**
 * Reads a required image reference consisting only of letters, digits and
 * the characters `.`, `_`, `:`, `/`, `@` and `-`.
 *
 * @throws Error naming `key` when the value is missing or malformed.
 */
function readRequiredImage(value: unknown, key: string): string {
  const image = readRequiredString(value, key);
  const allowed = /^[A-Za-z0-9._:/@-]+$/u;
  if (allowed.test(image)) {
    return image;
  }
  throw new Error(`${key} has an unsupported image format`);
}

/**
 * Reads a required version string of exactly three dot-separated numbers.
 *
 * @throws Error naming `key` when the value is missing or not of that form.
 */
function readRequiredOpenAiPythonVersion(value: unknown, key: string): string {
  const version = readRequiredString(value, key);
  const pinned = /^[0-9]+[.][0-9]+[.][0-9]+$/u;
  if (pinned.test(version)) {
    return version;
  }
  throw new Error(`${key} must be a pinned semver version like 2.41.1`);
}

/**
 * Reads a required, non-blank, single-line string and returns it trimmed.
 *
 * The newline check runs on the untrimmed input, so a value with a leading
 * or trailing line break is rejected as well.
 *
 * @throws Error naming `key` when the value is missing, not a string, blank,
 * or contains a carriage return or line feed.
 */
function readRequiredString(value: unknown, key: string): string {
  if (value == null) {
    throw new Error(`${key} is required`);
  }
  if (typeof value !== "string") {
    throw new Error(`${key} must be a non-empty string`);
  }
  const trimmed = value.trim();
  if (trimmed === "") {
    throw new Error(`${key} must be a non-empty string`);
  }
  if (/[\r\n]/u.test(value)) {
    throw new Error(`${key} must not contain newlines`);
  }
  return trimmed;
}

/**
 * Reads a required marker prefix of 2 to 32 letters, digits, `_` or `-`.
 *
 * @throws Error naming `key` when the value is missing or malformed.
 */
function readRequiredMarkerPrefix(value: unknown, key: string): string {
  const prefix = readRequiredString(value, key);
  const allowed = /^[A-Za-z0-9_-]{2,32}$/u;
  if (allowed.test(prefix)) {
    return prefix;
  }
  throw new Error(`${key} must be 2-32 chars of letters, digits, _ or -`);
}

/**
 * Reads a required User-Agent string: a single line of at most 200 bytes
 * when encoded as UTF-8.
 *
 * @throws Error naming `key` when the value is missing, contains a line
 * break, or is too long.
 */
function readRequiredUserAgent(value: unknown, key: string): string {
  const userAgent = readRequiredString(value, key);
  const hasLineBreak = /[\r\n]/u.test(userAgent);
  if (hasLineBreak) {
    throw new Error(`${key} must not contain newlines`);
  }
  // The limit is measured in encoded bytes, not in characters.
  const encodedLength = Buffer.byteLength(userAgent, "utf8");
  if (encodedLength > 200) {
    throw new Error(`${key} must be at most 200 bytes`);
  }
  return userAgent;
}

/**
 * Reads a required, non-empty list of HTTP paths.
 *
 * Each entry is trimmed and must start with `/`. Duplicates are removed,
 * keeping the first occurrence.
 *
 * @throws Error naming `key` (and the offending index) when the list or one
 * of its entries is invalid.
 */
function readRequiredPathList(value: unknown, key: string): string[] {
  if (value == null) {
    throw new Error(`${key} is required`);
  }
  if (!Array.isArray(value) || value.length === 0) {
    throw new Error(`${key} must be a non-empty string array`);
  }
  const httpPath = /^\/[A-Za-z0-9._~!$&'()*+,;=:@/-]*$/u;
  const normalize = (item: unknown, index: number): string => {
    if (typeof item !== "string" || item.trim().length === 0) {
      throw new Error(`${key}[${index}] must be a non-empty string`);
    }
    const path = item.trim();
    if (!httpPath.test(path)) {
      throw new Error(`${key}[${index}] has an unsupported HTTP path`);
    }
    return path;
  };
  return Array.from(new Set(value.map(normalize)));
}

/**
 * Reads a required integer within `[min, max]`; non-blank strings are
 * converted with `Number()`.
 *
 * @throws Error naming `key` when the value is missing, not an integer, or
 * out of range.
 */
function readRequiredInt(value: unknown, key: string, min: number, max: number): number {
  if (value == null) {
    throw new Error(`${key} is required`);
  }
  let parsed = Number.NaN;
  if (typeof value === "number") {
    parsed = value;
  } else if (typeof value === "string" && value.trim()) {
    parsed = Number(value);
  }
  const outOfRange = parsed < min || parsed > max;
  if (!Number.isInteger(parsed) || outOfRange) {
    throw new Error(`${key} must be an integer from ${min} to ${max}`);
  }
  return parsed;
}

/**
 * Reads a required finite number within `[min, max]`; non-blank strings are
 * converted with `Number()`.
 *
 * @throws Error naming `key` when the value is missing, not a finite number,
 * or out of range.
 */
function readRequiredNumber(value: unknown, key: string, min: number, max: number): number {
  if (value == null) {
    throw new Error(`${key} is required`);
  }
  let parsed = Number.NaN;
  if (typeof value === "number") {
    parsed = value;
  } else if (typeof value === "string" && value.trim()) {
    parsed = Number(value);
  }
  const outOfRange = parsed < min || parsed > max;
  if (!Number.isFinite(parsed) || outOfRange) {
    throw new Error(`${key} must be a number from ${min} to ${max}`);
  }
  return parsed;
}

/** Type guard for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}