fix: add opencode smoke observability probes
This commit is contained in:
@@ -45,6 +45,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
|
||||
examples: [
|
||||
"bun scripts/cli.ts web-probe run --node D601 --lane v03 --wait-messages-ms 1000",
|
||||
"bun scripts/cli.ts web-probe run --node D601 --lane v03 --fresh-session --message 'ping'",
|
||||
"bun scripts/cli.ts web-probe opencode-smoke --node D601 --lane v03 --message 'hi'",
|
||||
"bun scripts/cli.ts web-probe script --node D601 --lane v03 --script-file .state/probes/workbench.mjs",
|
||||
"bun scripts/cli.ts web-probe screenshot --node D601 --lane v03 --url https://monitor.pikapython.com --viewport 1440x900",
|
||||
"bun scripts/cli.ts web-probe screenshot --node D601 --lane v03 --url https://monitor.pikapython.com --viewport 390x844 --name monitor-mobile.png",
|
||||
@@ -72,6 +73,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
|
||||
],
|
||||
actions: {
|
||||
run: "Run the repo-owned scripts/web-live-dom-probe.mjs helper.",
|
||||
"opencode-smoke": "Run the repo-owned OpenCode iframe/direct-host composer smoke and require DOM assistant text plus EventSource update/finish/idle evidence.",
|
||||
script: "Run caller-provided Playwright JS after CLI-managed /auth/login; scripts must not handle secrets themselves.",
|
||||
screenshot: "Capture a no-auth or public page through the selected node/lane remote browser and download PNG artifacts to the caller /tmp by default.",
|
||||
observe: "Start, inspect, control, stop, collect, and analyze a long-running observer that writes JSONL artifacts.",
|
||||
@@ -80,6 +82,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
|
||||
notes: [
|
||||
"Default URL, browser proxy mode, observe/analyze thresholds, and project-management command allowlist come from config/hwlab-node-lanes.yaml webProbe.",
|
||||
"`web-probe script` is an ad-hoc exploration escape hatch; repeated/high-frequency workflows must become `web-probe observe command` types or repo-owned web-probe commands.",
|
||||
"`web-probe opencode-smoke` is the repo-owned OpenCode smoke; prefer it over repeating one-off OpenCode Playwright snippets.",
|
||||
"observe is passive by default; user actions must be explicit observe command entries in control.jsonl.",
|
||||
"After observe start, prefer observe status|command|stop|collect|analyze <id> instead of repeating --node/--lane/--state-dir.",
|
||||
"collect views render bounded summaries from existing artifacts and do not create a second source of truth.",
|
||||
|
||||
@@ -86,8 +86,12 @@ export interface NodeWebProbeScriptOptions {
|
||||
browserProxyMode: WebProbeBrowserProxyMode;
|
||||
commandTimeoutSeconds: number;
|
||||
scriptText: string;
|
||||
commandLabel?: string;
|
||||
suppressAdHocWarning?: boolean;
|
||||
generatedHints?: string[];
|
||||
generatedPreferredCommands?: Record<string, string>;
|
||||
scriptSource: {
|
||||
kind: "stdin" | "file";
|
||||
kind: "stdin" | "file" | "generated";
|
||||
path: string | null;
|
||||
byteCount: number;
|
||||
sha256: string;
|
||||
|
||||
@@ -469,6 +469,7 @@ export function runNodeWebProbeScript(
|
||||
material: BootstrapAdminPasswordMaterial,
|
||||
credential: Record<string, unknown>,
|
||||
): Record<string, unknown> {
|
||||
const commandLabel = options.commandLabel ?? `web-probe script --node ${options.node} --lane ${options.lane}`;
|
||||
const webProbeProxy = nodeWebProbeHostProxyEnv(spec, options.browserProxyMode);
|
||||
const script = nodeWebProbeScriptRemoteShell(options, secretSpec, material.username ?? secretSpec.bootstrapAdminUsername, material.password ?? "", webProbeProxy, spec.webProbe?.playwrightBrowsersPath);
|
||||
const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds);
|
||||
@@ -545,7 +546,7 @@ export function runNodeWebProbeScript(
|
||||
return renderWebProbeScriptResult({
|
||||
ok: passed,
|
||||
status: passed ? "pass" : "blocked",
|
||||
command: `web-probe script --node ${options.node} --lane ${options.lane}`,
|
||||
command: commandLabel,
|
||||
node: options.node,
|
||||
lane: options.lane,
|
||||
workspace: spec.workspace,
|
||||
@@ -579,6 +580,7 @@ export function runNodeWebProbeScript(
|
||||
}
|
||||
|
||||
function webProbeScriptGovernanceWarnings(options: NodeWebProbeScriptOptions): Record<string, unknown>[] {
|
||||
if (options.suppressAdHocWarning === true) return [];
|
||||
return [{
|
||||
code: "web_probe_script_ad_hoc_only",
|
||||
severity: "warning",
|
||||
@@ -590,6 +592,7 @@ function webProbeScriptGovernanceWarnings(options: NodeWebProbeScriptOptions): R
|
||||
}
|
||||
|
||||
function webProbeScriptGovernanceHints(options: NodeWebProbeScriptOptions): string[] {
|
||||
if (options.generatedHints !== undefined) return options.generatedHints;
|
||||
return [
|
||||
"Prefer `web-probe observe start` plus `web-probe observe command` for interactive flows; use `observe collect/analyze` for repeated evidence reads.",
|
||||
"If the same script is needed more than once, add or extend a reusable command type in the web-probe observe command surface.",
|
||||
@@ -598,6 +601,7 @@ function webProbeScriptGovernanceHints(options: NodeWebProbeScriptOptions): stri
|
||||
}
|
||||
|
||||
function webProbeScriptPreferredCommands(options: NodeWebProbeScriptOptions): Record<string, string> {
|
||||
if (options.generatedPreferredCommands !== undefined) return options.generatedPreferredCommands;
|
||||
return {
|
||||
startObserver: `bun scripts/cli.ts web-probe observe start --node ${options.node} --lane ${options.lane} --target-path /projects/mdtodo`,
|
||||
mdtodoSummary: "bun scripts/cli.ts web-probe observe collect <observerId> --view project-mdtodo-summary",
|
||||
|
||||
@@ -564,7 +564,9 @@ export function runNodeWebProbe(options: NodeWebProbeOptions): Record<string, un
|
||||
return {
|
||||
ok: false,
|
||||
status: "blocked",
|
||||
command: options.action === "observe"
|
||||
command: options.action === "script" && typeof options.commandLabel === "string"
|
||||
? options.commandLabel
|
||||
: options.action === "observe"
|
||||
? `web-probe observe ${options.observeAction} --node ${options.node} --lane ${options.lane}`
|
||||
: `web-probe ${options.action} --node ${options.node} --lane ${options.lane}`,
|
||||
node: options.node,
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -321,7 +321,7 @@ export function searchScript(observability: ObservabilityConfig, target: Observa
|
||||
const proxyPrefix = `/api/v1/namespaces/${target.namespace}/services/http:${observability.traceBackend.serviceName}:http/proxy`;
|
||||
const searchProxyPath = `${proxyPrefix}${searchPath}`;
|
||||
const grepLiteral = options.grep === null ? "None" : JSON.stringify(options.grep);
|
||||
const effectiveQuery = inferSearchTempoQuery(options);
|
||||
const effectiveQuery = options.query ?? inferSearchTempoQuery(options);
|
||||
const queryLiteral = effectiveQuery === null ? "None" : JSON.stringify(effectiveQuery);
|
||||
const pathLiteral = options.path === null ? "None" : JSON.stringify(options.path);
|
||||
const statusLiteral = options.status === null ? "None" : String(options.status);
|
||||
@@ -383,6 +383,16 @@ IMPORTANT_ATTRS = [
|
||||
"returnedMessages", "totalMessages", "roleSequencePrefix",
|
||||
"consecutiveUserPrefix", "adjacentSameRoleCount", "userCount",
|
||||
"agentCount",
|
||||
"opencode.proxy.phase", "opencode.proxy.streaming",
|
||||
"opencode.proxy.ticket_accepted", "opencode.proxy.sse.directory_rewrite_enabled",
|
||||
"opencode.proxy.sse.directory_rewrite_from",
|
||||
"opencode.proxy.sse.directory_rewrite_to",
|
||||
"opencode.provider.sse.content_chunks",
|
||||
"opencode.provider.sse.content_chars",
|
||||
"opencode.provider.sse.output_data_lines",
|
||||
"opencode.provider.sse.done_lines",
|
||||
"opencode.provider.sse.json_errors",
|
||||
"opencode.provider.sse.reasoning_only_choices_dropped",
|
||||
"http.target", "http.url", "url.path",
|
||||
"db.system", "db.operation.name", "db.sql.table", "db.query.arg_count",
|
||||
"db.index.expected", "db.pool.max_open", "db.pool.open_connections",
|
||||
@@ -542,8 +552,57 @@ def compact_span(span, service, resource_attrs, scope_name):
|
||||
def grep_matches_text(text):
    """Return True when a GREP filter is set and occurs in text, ignoring case."""
    # With no GREP configured nothing is treated as a grep match.
    return GREP is not None and GREP.lower() in text.lower()
|
||||
|
||||
def parse_grep_key_value():
    """Split a GREP of the form key=value into a (key, typed value) pair.

    Returns (None, None) when GREP is unset or is not a key=value expression.
    A leading "span." on the key is dropped. The value has surrounding
    whitespace and one pair of matching outer quotes removed, then is coerced:
    true/false (any case) become bool, plain integers become int, plain
    decimals become float, and everything else is returned as a string.
    """
    if GREP is None:
        return None, None
    match = re.match(r"^([A-Za-z0-9_.-]+)=(.+)$", GREP)
    if not match:
        return None, None
    key = match.group(1)
    if key.startswith("span."):
        key = key[5:]
    raw = match.group(2).strip()
    # Strip exactly one pair of matching single or double quotes.
    if len(raw) >= 2 and ((raw[0] == raw[-1] == '"') or (raw[0] == raw[-1] == "'")):
        raw = raw[1:-1]
    lowered = raw.lower()
    if lowered == "true":
        return key, True
    if lowered == "false":
        return key, False
    # Integers with no leading zeros; values such as 007 stay strings.
    if re.match(r"^-?(?:0|[1-9][0-9]*)$", raw):
        try:
            return key, int(raw)
        except Exception:
            # Best effort: fall through and treat the value as text.
            pass
    if re.match(r"^-?(?:0|[1-9][0-9]*)[.][0-9]+$", raw):
        try:
            return key, float(raw)
        except Exception:
            # Best effort: fall through and treat the value as text.
            pass
    return key, raw
|
||||
|
||||
def values_equal_for_grep(actual, expected):
    """Compare an observed value with a parsed grep value, tolerating type drift.

    Values that are already equal match. If either side is a bool, the two are
    compared by their lowercase string form, so True matches "true". Two
    numbers are compared as floats. Anything else is compared by string form,
    so 5 matches "5".
    """
    if actual == expected:
        return True
    if isinstance(actual, bool) or isinstance(expected, bool):
        return str(actual).lower() == str(expected).lower()
    if isinstance(actual, (int, float)) and isinstance(expected, (int, float)):
        return float(actual) == float(expected)
    return str(actual) == str(expected)
|
||||
|
||||
def grep_matches_item(item):
    """Return True when the compact span item satisfies the GREP filter.

    A key=value GREP is first tried as a structured match against the item
    name, a top-level field addressed as resource.<field>, or an entry of the
    item's attributes dict. If that does not match, or GREP is not a key=value
    expression, the item is serialized to JSON and searched as plain text.
    Returns False when no GREP is set.
    """
    if GREP is None:
        return False
    key, expected = parse_grep_key_value()
    if key is not None:
        if key == "name" and values_equal_for_grep(item.get("name"), expected):
            return True
        # resource.<field> is looked up on the item with the prefix removed.
        if key.startswith("resource.") and values_equal_for_grep(item.get(key.replace("resource.", "", 1)), expected):
            return True
        attrs = item.get("attributes", {}) if isinstance(item.get("attributes"), dict) else {}
        if key in attrs and values_equal_for_grep(attrs.get(key), expected):
            return True
    # Fallback: case-insensitive substring search over the serialized item.
    return grep_matches_text(json.dumps(item, ensure_ascii=False, sort_keys=True))
|
||||
|
||||
def span_matches_filters(item):
|
||||
attrs = item.get("attributes", {}) if isinstance(item.get("attributes"), dict) else {}
|
||||
@@ -641,7 +700,10 @@ def trace_summary(trace_id, meta, body, rc, stderr):
|
||||
for span in raw_spans:
|
||||
if not isinstance(span, dict):
|
||||
continue
|
||||
raw_attrs = attrs_to_dict(span.get("attributes"))
|
||||
item = compact_span(span, service, resource_attrs, scope_name)
|
||||
match_item = dict(item)
|
||||
match_item["attributes"] = raw_attrs
|
||||
attrs = item.get("attributes", {})
|
||||
if isinstance(attrs, dict) and isinstance(attrs.get("traceId"), str):
|
||||
business_trace_ids.add(attrs.get("traceId"))
|
||||
@@ -650,7 +712,7 @@ def trace_summary(trace_id, meta, body, rc, stderr):
|
||||
spans.append(item)
|
||||
if is_error_span(span, attrs if isinstance(attrs, dict) else {}):
|
||||
error_spans.append(item)
|
||||
if span_matches_filters(item) and (GREP is None or grep_matches_item(item)):
|
||||
if span_matches_filters(match_item) and (GREP is None or grep_matches_item(match_item)):
|
||||
matched_spans.append(item)
|
||||
return {
|
||||
"traceId": trace_id,
|
||||
@@ -728,6 +790,8 @@ payload = {
|
||||
"tempoQuery": QUERY,
|
||||
"pathFilter": PATH_FILTER,
|
||||
"statusFilter": STATUS_FILTER,
|
||||
"grepCoverage": None if GREP is None else "raw trace body, span name, status message, route and full span attributes inside scanned candidate traces",
|
||||
"grepQueryInference": "tempo-query-present" if QUERY is not None and GREP is not None else None,
|
||||
"businessTraceSearch": BUSINESS_TRACE_GREP,
|
||||
"limit": LIMIT,
|
||||
"candidateLimit": CANDIDATE_LIMIT,
|
||||
|
||||
@@ -39,6 +39,9 @@ export function observabilityHelp(): Record<string, unknown> {
|
||||
"bun scripts/cli.ts platform-infra observability validate --target D518 [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability trace --target D518 --trace-id <traceId> [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D518 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target JD01 --grep opencode.proxy.stream.start --lookback-minutes 30 --limit 20",
|
||||
"bun scripts/cli.ts platform-infra observability search --target JD01 --grep opencode.proxy.sse.directory_rewrite_enabled --lookback-minutes 30 --limit 20",
|
||||
"bun scripts/cli.ts platform-infra observability search --target JD01 --grep opencode-provider-proxy --lookback-minutes 30 --limit 20",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D518 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --business-trace-id <trc_...> [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--session-id <ses_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
|
||||
|
||||
@@ -104,6 +104,8 @@ export async function search(config: UniDeskConfig, options: SearchOptions): Pro
|
||||
endAt: new Date(endSeconds * 1000).toISOString(),
|
||||
businessTraceId,
|
||||
mode: businessTraceId === null ? "candidate-grep" : "business-trace-exact",
|
||||
grepQueryInference: options.query === null && businessTraceId === null && effectiveTempoQuery !== null ? "inferred-from-grep-or-filters" : null,
|
||||
grepCoverage: options.grep === null ? null : "candidate traces are fetched by tempoQuery, then each scanned trace is matched against raw trace body, span name, status message, route and full span attributes",
|
||||
candidateLimit: options.candidateLimit,
|
||||
limit: options.limit,
|
||||
};
|
||||
@@ -143,6 +145,9 @@ function compactSearchFullResult(value: unknown): Record<string, unknown> {
|
||||
tempoQuery: source.tempoQuery ?? null,
|
||||
pathFilter: source.pathFilter ?? null,
|
||||
statusFilter: source.statusFilter ?? null,
|
||||
matchingActive: source.matchingActive ?? null,
|
||||
grepCoverage: source.grepCoverage ?? null,
|
||||
grepQueryInference: source.grepQueryInference ?? null,
|
||||
limit: source.limit ?? null,
|
||||
candidateLimit: source.candidateLimit ?? null,
|
||||
searchParseOk: source.searchParseOk ?? null,
|
||||
@@ -196,7 +201,57 @@ export function inferSearchTempoQuery(options: SearchOptions): string | null {
|
||||
const filters: string[] = [];
|
||||
if (options.path !== null) filters.push(`.http.route = ${JSON.stringify(options.path)}`);
|
||||
if (options.status !== null) filters.push(`.http.response.status_code = ${options.status}`);
|
||||
return filters.length > 0 ? `{ ${filters.join(" && ")} }` : null;
|
||||
if (filters.length > 0) return `{ ${filters.join(" && ")} }`;
|
||||
const grep = options.grep?.trim() ?? "";
|
||||
if (!grep) return null;
|
||||
const keyValue = grep.match(/^([A-Za-z0-9_.-]+)=(.+)$/u);
|
||||
if (keyValue !== null) {
|
||||
const key = keyValue[1] ?? "";
|
||||
const value = (keyValue[2] ?? "").trim();
|
||||
const traceQlKey = key === "name" || key.startsWith("resource.") ? key : `.${key.replace(/^span[.]/u, "")}`;
|
||||
if (/^(?:name|resource[.][A-Za-z0-9_.-]+|[.][A-Za-z0-9_.-]+)$/u.test(traceQlKey) && value.length > 0 && value.length <= 200) return `{ ${traceQlKey} = ${traceQlLiteral(value)} }`;
|
||||
}
|
||||
if (grep.startsWith("/") && !grep.includes("\n") && grep.length <= 300) return `{ .http.route = ${JSON.stringify(grep)} }`;
|
||||
if (/^[A-Za-z0-9][A-Za-z0-9_.:-]{0,180}$/u.test(grep)) {
|
||||
if (isKnownTraceAttributeKey(grep)) return `{ .${grep} != nil }`;
|
||||
if (grep.includes(".") || grep.includes(":")) return `{ name = ${JSON.stringify(grep)} }`;
|
||||
if (/^(?:hwlab|agentrun|opencode|platform|code-queue|unidesk)[A-Za-z0-9-]*$/u.test(grep) || /-(?:api|web|proxy|runner|manager|service|mgr|collector|tempo)$/u.test(grep)) return `{ resource.service.name = ${JSON.stringify(grep)} }`;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Span attribute keys that a bare `--grep <key>` may name directly.
 *
 * `isKnownTraceAttributeKey` consults this set; a match lets the search query
 * inference emit an attribute-existence TraceQL filter (`{ .<key> != nil }`).
 */
const knownTraceAttributeKeys: ReadonlySet<string> = new Set([
  "opencode.proxy.phase",
  "opencode.proxy.streaming",
  "opencode.proxy.ticket_accepted",
  "opencode.proxy.sse.directory_rewrite_enabled",
  "opencode.proxy.sse.directory_rewrite_from",
  "opencode.proxy.sse.directory_rewrite_to",
  "opencode.provider.sse.content_chunks",
  "opencode.provider.sse.content_chars",
  "opencode.provider.sse.output_data_lines",
  "opencode.provider.sse.done_lines",
  "opencode.provider.sse.json_errors",
  "opencode.provider.sse.reasoning_only_choices_dropped",
]);
|
||||
|
||||
/**
 * Reports whether `value` names a span attribute the search inference knows.
 *
 * A value qualifies when it is listed in `knownTraceAttributeKeys`, or when it
 * extends one of the `opencode.proxy.sse.` / `opencode.provider.sse.` prefixes
 * by at least one character. A bare prefix with nothing after the trailing dot
 * is rejected because it does not name an attribute.
 *
 * @param value Candidate attribute key taken from the grep text.
 * @returns `true` when the value should be treated as an attribute key.
 */
function isKnownTraceAttributeKey(value: string): boolean {
  if (knownTraceAttributeKeys.has(value)) return true;
  const dynamicPrefixes = ["opencode.proxy.sse.", "opencode.provider.sse."];
  return dynamicPrefixes.some((prefix) => value.length > prefix.length && value.startsWith(prefix));
}
|
||||
|
||||
/**
 * Renders a grep value as a TraceQL literal.
 *
 * The value is trimmed and one pair of matching outer quotes is removed. Then:
 * `true`/`false` in any case become lowercase booleans, plain integers and
 * decimals without leading zeros are emitted unquoted, `nil` stays `nil`, and
 * anything else is emitted as a JSON-escaped double-quoted string.
 *
 * @param value Raw right-hand side of a `key=value` grep expression.
 * @returns Literal text ready to embed in a TraceQL comparison.
 */
function traceQlLiteral(value: string): string {
  const unquoted = stripOuterQuotes(value.trim());
  if (/^(?:true|false)$/iu.test(unquoted)) return unquoted.toLowerCase();
  // Numbers with leading zeros (e.g. 007) fall through and are quoted as strings.
  if (/^-?(?:0|[1-9][0-9]*)(?:[.][0-9]+)?$/u.test(unquoted)) return unquoted;
  if (unquoted === "nil") return "nil";
  return JSON.stringify(unquoted);
}
|
||||
|
||||
/**
 * Removes one pair of matching outer quotes from `value`.
 *
 * Only a leading and trailing pair of the same quote character (both `"` or
 * both `'`) is removed, and only when the value is at least two characters
 * long. Any other input is returned unchanged.
 *
 * @param value Text that may be wrapped in quotes.
 * @returns The text without its outer quote pair, or the input as given.
 */
function stripOuterQuotes(value: string): string {
  if (value.length < 2) return value;
  const doubleQuoted = value.startsWith("\"") && value.endsWith("\"");
  const singleQuoted = value.startsWith("'") && value.endsWith("'");
  return doubleQuoted || singleQuoted ? value.slice(1, -1) : value;
}
|
||||
|
||||
export function businessTraceIdFromSearchText(value: string | null): string | null {
|
||||
@@ -584,12 +639,20 @@ export function renderSearchTable(input: {
|
||||
` candidateLimit=${textValue(input.query.candidateLimit)} limit=${textValue(input.query.limit)}`,
|
||||
` candidates=${textValue(input.result.candidateTraceCount)} scanned=${textValue(input.result.scannedTraceCount)} matched=${textValue(input.result.matchedTraceCount)} stopped=${textValue(input.result.scanStopped)}`,
|
||||
];
|
||||
if (input.query.grep !== null) {
|
||||
lines.push(` grepCoverage=${textValue(input.query.grepCoverage)}`);
|
||||
if (input.query.grepQueryInference !== null) lines.push(` grepQueryInference=${textValue(input.query.grepQueryInference)}`);
|
||||
}
|
||||
const firstTraceId = traces.length > 0 ? textValue(traces[0].traceId) : "";
|
||||
lines.push("", "Next:");
|
||||
if (firstTraceId.length > 0 && firstTraceId !== "-") {
|
||||
lines.push(` bun scripts/cli.ts platform-infra observability trace --target ${input.target.id} --trace-id ${firstTraceId}`);
|
||||
}
|
||||
lines.push(` ${buildSearchCommand(input.target, input.options, true)}`);
|
||||
if (input.query.grep !== null && Number(input.result.matchedTraceCount ?? 0) === 0) {
|
||||
lines.push(` explicit TraceQL: bun scripts/cli.ts platform-infra observability search --target ${input.target.id} --query '{ resource.service.name = "<service>" }' --lookback-minutes ${textValue(input.query.lookbackMinutes)} --candidate-limit ${textValue(input.query.candidateLimit)} --limit ${textValue(input.query.limit)}`);
|
||||
lines.push(` widen candidates: bun scripts/cli.ts platform-infra observability search --target ${input.target.id} --grep ${JSON.stringify(String(input.query.grep))} --lookback-minutes ${textValue(input.query.lookbackMinutes)} --candidate-limit 300 --limit ${textValue(input.query.limit)}`);
|
||||
}
|
||||
lines.push("", "Disclosure:");
|
||||
lines.push(" default view is a bounded table; use --full for structured diagnosis JSON or trace --trace-id for one trace.");
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user