7d3b06bd22
Merge PR #92 after rebasing onto current master and validating focused Code Queue CLI contracts. Adds bounded terminal review data to codex read without prompt/tool log disclosure.
243 lines
12 KiB
TypeScript
243 lines
12 KiB
TypeScript
import { codexReadTaskForTest } from "./src/code-queue";
|
|
|
|
/** Loosely typed JSON object; values must be narrowed before use. */
type JsonRecord = Record<string, unknown>;

/** One request recorded by a fake fetch fixture: the requested path plus the init options it was given. */
type FetchCall = { path: string; init?: { method?: string; body?: unknown } };

// Sentinel strings planted in the summary fixture. The contract asserts that none of
// them appears anywhere in the serialized read result, so each one must stay unique.
const promptSecret = "PROMPT_BODY_SHOULD_NOT_LEAK_FROM_CODEX_READ";
const toolSecret = "TOOL_LOG_SHOULD_NOT_LEAK_FROM_CODEX_READ";
const feedbackSecret = "FEEDBACK_PROMPT_SHOULD_NOT_LEAK_FROM_CODEX_READ";
const referenceSecret = "REFERENCE_INJECTION_BASE_PROMPT_SHOULD_NOT_LEAK_FROM_CODEX_READ";
|
|
|
|
/**
 * Throws when `condition` is falsy; otherwise returns normally.
 *
 * Declared as an assertion function so callers get control-flow narrowing from the
 * checked expression.
 *
 * @param condition Value that must be truthy.
 * @param message Human-readable description of the violated expectation.
 * @param detail Extra context appended to the message as JSON (defaults to `{}`).
 * @throws Error with the text `<message>: <serialized detail>` when the condition is falsy.
 */
function assertCondition(condition: unknown, message: string, detail: unknown = {}): asserts condition {
  if (condition) return;

  let renderedDetail: string;
  try {
    renderedDetail = JSON.stringify(detail);
  } catch (error) {
    // JSON.stringify throws on cyclic structures and BigInt values; the assertion
    // message must still reach the caller instead of being replaced by that TypeError.
    const reason = error instanceof Error ? error.message : String(error);
    renderedDetail = `[unserializable detail: ${reason}]`;
  }
  throw new Error(`${message}: ${renderedDetail}`);
}
|
|
|
|
/**
 * Returns `value` typed as a JSON object after checking that it is a non-null,
 * non-array object.
 *
 * @param value Candidate value.
 * @param label Name used in the failure message (`<label> must be an object`).
 * @throws Error (via assertCondition) when the value is not a plain object.
 */
function asRecord(value: unknown, label: string): JsonRecord {
  const isPlainObject = typeof value === "object" && value !== null && !Array.isArray(value);
  assertCondition(isPlainObject, `${label} must be an object`, value);
  // The runtime check above is what justifies this cast.
  return value as JsonRecord;
}
|
|
|
|
/**
 * Returns `value` typed as an array after checking it with `Array.isArray`.
 *
 * NOTE(review): no caller is visible in this file; confirm it is still needed.
 *
 * @param value Candidate value.
 * @param label Name used in the failure message (`<label> must be an array`).
 * @throws Error (via assertCondition) when the value is not an array.
 */
function asArray(value: unknown, label: string): unknown[] {
  assertCondition(Array.isArray(value), `${label} must be an array`, value);
  // The runtime check above is what justifies this cast.
  return value as unknown[];
}
|
|
|
|
/**
 * Extracts the URL-decoded task id from a path containing `/api/tasks/<id>`.
 *
 * The id segment ends at the next `/` or `?`. Paths without that prefix yield
 * the placeholder `"unknown"`.
 *
 * @param path Request path, optionally followed by a sub-resource or query string.
 * @returns The decoded task id, or `"unknown"` when the path does not match.
 */
function taskIdFromPath(path: string): string {
  const match = /\/api\/tasks\/([^/?]+)/u.exec(path);
  const encodedId = match?.[1] ?? "unknown";
  return decodeURIComponent(encodedId);
}
|
|
|
|
/**
 * Builds the task-summary payload served by the fake `/summary` endpoint.
 *
 * The payload deliberately embeds the secret sentinels (prompt body, tool output
 * preview, feedback prompt preview, reference-injection base prompt) so the contract
 * can assert that none of them shows up in the read result.
 *
 * @param taskId Id echoed back in `id` and used to derive `codexThreadId`.
 * @param status Terminal status to simulate. `"failed"` switches the fixture to a
 *   second, retry-mode attempt with an error, stderr tail, failing judge verdict and
 *   runner error classification.
 * @returns A fresh summary object on every call.
 */
function summaryFor(taskId: string, status: "succeeded" | "failed"): JsonRecord {
  const failed = status === "failed";
  const finalResponse = failed
    ? "Failure summary: tests did not pass, but the runner reported the exact failing command."
    : "Final response: implemented the terminal read fix and validated the focused contract.";

  // Values that appear in more than one place of the payload.
  const attemptNumber = failed ? 2 : 1;
  const attemptMode = failed ? "retry" : "initial";
  const failureMessage = failed ? "focused contract failed" : null;
  const promptWithSecret = `${promptSecret}\nPlease fix the task.`;
  const startedAt = "2026-05-22T00:01:00.000Z";
  const finishedAt = "2026-05-22T00:03:00.000Z";
  const judge = failed
    ? { decision: "fail", confidence: 0.88, reason: "contract failed" }
    : { decision: "complete", confidence: 0.97, reason: "verified" };

  return {
    id: taskId,
    queueId: "hwlab",
    status,
    providerId: "D601",
    executionMode: "container",
    model: "gpt-5.5",
    agentPort: "codex",
    cwd: "/workspace/unidesk",
    reasoningEffort: "medium",
    maxAttempts: 2,
    currentAttempt: attemptNumber,
    currentMode: attemptMode,
    judgeFailCount: failed ? 2 : 0,
    judgeFailRetryLimit: 3,
    codexThreadId: `thread_${taskId}`,
    activeTurnId: null,
    createdAt: "2026-05-22T00:00:00.000Z",
    startedAt,
    updatedAt: "2026-05-22T00:03:00.000Z",
    finishedAt,
    timing: { totalMs: 120000 },
    initialPrompt: promptWithSecret,
    prompt: promptWithSecret,
    referenceTaskIds: ["codex_reference_task"],
    referenceInjection: {
      version: 2,
      injectedAt: "2026-05-22T00:00:30.000Z",
      basePrompt: referenceSecret,
      directReferenceTaskIds: ["codex_reference_task"],
      maxRounds: 3,
      truncated: false,
      itemCount: 1,
      items: [
        {
          round: 1,
          roundIndex: 0,
          taskId: "codex_reference_task",
          viaTaskId: null,
          status: "succeeded",
          providerId: "D601",
          executionMode: "container",
          model: "gpt-5.5",
          cwd: "/workspace/unidesk",
          createdAt: "2026-05-21T00:00:00.000Z",
          updatedAt: "2026-05-21T00:03:00.000Z",
          promptChars: 9999,
          finalResponseChars: 1234,
          finalResponseAt: "2026-05-21T00:03:00.000Z",
          finalResponseSource: "finalResponse",
          referenceTaskIds: [],
          cliHint: "bun scripts/cli.ts codex task codex_reference_task",
        },
      ],
    },
    lastAssistantMessage: {
      at: "2026-05-22T00:03:00.000Z",
      seq: 41,
      source: "finalResponse",
      text: finalResponse,
    },
    toolSummary: {
      count: 3,
      returned: 1,
      limit: 1,
      truncated: true,
      items: [{ seq: 39, kind: "ran", outputPreview: toolSecret }],
    },
    attempts: [
      {
        index: attemptNumber,
        mode: attemptMode,
        terminalStatus: failed ? "failed" : "completed",
        appServerExitCode: failed ? 1 : 0,
        appServerSignal: null,
        error: failureMessage,
        stderrTail: failed ? "bun scripts/code-queue-cli-read-terminal-contract-test.ts failed" : "",
        startedAt,
        finishedAt,
        outputStartSeq: 1,
        outputEndSeq: 42,
        finalResponse,
        finalResponsePreview: finalResponse,
        finalResponseChars: finalResponse.length,
        feedbackPromptPreview: feedbackSecret,
        judge,
        runnerErrorClassification: failed ? { class: "test-failure", retryable: false } : null,
      },
    ],
    // Same verdict as the attempt's judge, plus the judge source; a separate object.
    lastJudge: { ...judge, source: "minimax" },
    lastError: failureMessage,
    cancelRequested: false,
    transcriptCount: 12,
    transcriptMaxSeq: 42,
    outputCount: 42,
    retainedOutputCount: 20,
    outputMaxSeq: 42,
    eventCount: 5,
  };
}
|
|
|
|
/**
 * Creates a fake fetch function for a task that exists and is terminal.
 *
 * Every invocation is appended to `calls` before it is answered. The simulated
 * status is `"failed"` when the task id parsed from the path contains `"failed"`,
 * otherwise `"succeeded"`.
 *
 * - Paths containing `/summary` answer with the summary fixture for that task.
 * - Paths containing `/read` answer with a read acknowledgement (`readAt` set,
 *   `terminalUnread: false`) and queue counts.
 * - Any other path throws, so unexpected requests fail loudly.
 *
 * @param calls Array that receives one entry per request, in call order.
 * @returns The fake fetch function.
 */
function readTerminalFixture(calls: FetchCall[]): (path: string, init?: { method?: string; body?: unknown }) => unknown {
  return (path, init) => {
    calls.push({ path, init });

    const taskId = taskIdFromPath(path);
    const status = taskId.includes("failed") ? "failed" : "succeeded";

    if (path.includes("/summary")) {
      return { ok: true, status: 200, body: { ok: true, summary: summaryFor(taskId, status) } };
    }

    if (path.includes("/read")) {
      return {
        ok: true,
        status: 200,
        body: {
          ok: true,
          task: {
            id: taskId,
            queueId: "hwlab",
            status,
            readAt: "2026-05-22T00:04:00.000Z",
            terminalUnread: false,
          },
          queue: { counts: { [status]: 1 }, unreadTerminal: 0 },
        },
      };
    }

    throw new Error(`unexpected path ${path}`);
  };
}
|
|
|
|
/**
 * Creates a fake fetch function that answers every request with a 404
 * "task not found" payload.
 *
 * Every invocation is appended to `calls` before it is answered, whatever the path.
 *
 * NOTE(review): the envelope reports `ok: true` alongside `status: 404`; only
 * `body.ok` is false. Presumably this mirrors how the real transport wraps HTTP
 * errors — confirm against the fetch wrapper used by the code under test.
 *
 * @param calls Array that receives one entry per request, in call order.
 * @returns The fake fetch function.
 */
function missingTaskFixture(calls: FetchCall[]): (path: string, init?: { method?: string; body?: unknown }) => unknown {
  return (path, init) => {
    calls.push({ path, init });
    return { ok: true, status: 404, body: { ok: false, error: "task not found" } };
  };
}
|
|
|
|
/**
 * Asserts that a read result has the expected terminal-read shape.
 *
 * Checks, in order: task identity and execution metadata, timestamps, read
 * acknowledgement (on the task and at top level), final response text and preview
 * metadata, the last attempt's terminal status, the explicit disclosure flags, the
 * drill-down commands, and that none of the secret sentinels occurs anywhere in the
 * serialized result. For `"failed"` it additionally checks `lastError`, the stderr
 * tail and the runner error classification.
 *
 * @param result Value returned by the read operation under test.
 * @param taskId Task id the result must carry.
 * @param status Terminal status the result must carry.
 * @throws Error on the first violated expectation.
 */
function assertTerminalReadShape(result: unknown, taskId: string, status: "succeeded" | "failed"): void {
  const data = asRecord(result, "result");
  const task = asRecord(data.task, "task");
  const finalResponse = asRecord(task.finalResponse, "finalResponse");
  const attempts = asRecord(task.attempts, "attempts");
  const lastAttempt = asRecord(attempts.lastAttempt, "lastAttempt");
  const read = asRecord(data.read, "read");
  const disclosure = asRecord(task.disclosure, "disclosure");
  // Serialized once so the leak checks cover every nesting level of the result.
  const body = JSON.stringify(result);

  // Identity and stable execution metadata.
  assertCondition(task.id === taskId, "read result must preserve task id", task);
  assertCondition(task.queueId === "hwlab", "read result must preserve queue id", task);
  assertCondition(task.status === status, "read result must preserve terminal status", task);
  assertCondition(
    task.model === "gpt-5.5" && task.providerId === "D601" && task.cwd === "/workspace/unidesk",
    "read result must preserve stable execution metadata",
    task,
  );

  // Timestamps.
  assertCondition(task.createdAt === "2026-05-22T00:00:00.000Z", "read result must include createdAt", task);
  assertCondition(task.startedAt === "2026-05-22T00:01:00.000Z", "read result must include startedAt", task);
  assertCondition(task.updatedAt === "2026-05-22T00:03:00.000Z", "read result must include updatedAt", task);
  assertCondition(task.finishedAt === "2026-05-22T00:03:00.000Z", "read result must include finishedAt", task);

  // Read acknowledgement.
  assertCondition(
    task.readAt === "2026-05-22T00:04:00.000Z" && task.terminalUnread === false,
    "read result must preserve read acknowledgement",
    task,
  );
  assertCondition(read.marked === true && read.terminalUnread === false, "top-level read acknowledgement must be stable", read);

  // Final response text and preview metadata.
  const finalText = String(finalResponse.text ?? "");
  const expectedTextMarker = status === "failed" ? "Failure summary" : "Final response";
  assertCondition(finalText.includes(expectedTextMarker), "read result must include final response text", finalResponse);
  assertCondition(
    finalResponse.chars === finalText.length && finalResponse.truncated === false,
    "read result must include bounded final response preview metadata",
    finalResponse,
  );

  // Attempt summary and disclosure policy.
  const expectedTerminalStatus = status === "failed" ? "failed" : "completed";
  assertCondition(lastAttempt.terminalStatus === expectedTerminalStatus, "read result must include terminal attempt summary", lastAttempt);
  assertCondition(
    disclosure.promptIncluded === false && disclosure.toolLogsIncluded === false && disclosure.finalResponseIncluded === true,
    "read disclosure policy must be explicit",
    disclosure,
  );

  // Drill-down commands.
  const commands = asRecord(task.commands, "task.commands");
  assertCondition(
    String(commands.detail ?? "") === `bun scripts/cli.ts codex task ${taskId} --detail`,
    "read result must include detail drill-down command",
    commands,
  );
  assertCondition(String(commands.trace ?? "").includes(`codex task ${taskId} --trace`), "read result must include trace drill-down command", commands);
  assertCondition(String(commands.output ?? "").includes(`codex output ${taskId}`), "read result must include output drill-down command", commands);

  // None of the planted sentinels may appear anywhere in the result.
  assertCondition(!body.includes(promptSecret), "read result must not leak prompt body", body);
  assertCondition(!body.includes(toolSecret), "read result must not leak tool logs", body);
  assertCondition(!body.includes(feedbackSecret), "read result must not leak feedback prompt body", body);
  assertCondition(!body.includes(referenceSecret), "read result must not leak reference injection base prompt", body);

  if (status === "failed") {
    assertCondition(task.lastError === "focused contract failed", "failed read must include lastError", task);
    const stderrTail = asRecord(lastAttempt.stderrTail, "stderrTail");
    assertCondition(String(stderrTail.text ?? "").includes("contract-test"), "failed read must include stderr tail", lastAttempt);
    const classification = asRecord(lastAttempt.runnerErrorClassification, "runnerErrorClassification");
    assertCondition(classification.class === "test-failure", "failed read must include runner error classification", lastAttempt);
  }
}
|
|
|
|
/**
 * Runs the three contract scenarios against `codexReadTaskForTest` and returns a
 * report listing the checks that passed.
 *
 * 1. Succeeded terminal task: result shape, exactly two requests (summary first,
 *    then read), and the read request uses POST.
 * 2. Failed terminal task: result shape including the failure details.
 * 3. Missing task: the call throws an error mentioning "task not found" after a
 *    single summary request, i.e. without issuing the read mutation.
 *
 * @returns `{ ok: true, checks: [...] }` when every assertion holds.
 * @throws Error on the first violated expectation.
 */
function run(): JsonRecord {
  // Scenario 1: succeeded terminal task.
  const succeededCalls: FetchCall[] = [];
  const succeeded = codexReadTaskForTest("codex_succeeded_terminal", readTerminalFixture(succeededCalls));
  assertTerminalReadShape(succeeded, "codex_succeeded_terminal", "succeeded");
  assertCondition(succeededCalls.length === 2, "succeeded read must fetch summary then mark read", succeededCalls);
  assertCondition(
    succeededCalls[0]?.path.includes("/summary?toolLimit=3") && succeededCalls[1]?.path.includes("/read"),
    "succeeded read call order must preserve body before mutation",
    succeededCalls,
  );
  assertCondition(succeededCalls[1]?.init?.method === "POST", "read mutation must use POST", succeededCalls);

  // Scenario 2: failed terminal task.
  // NOTE(review): failedCalls is recorded but never asserted; consider mirroring the
  // call-order checks of the succeeded scenario once the expected sequence is confirmed.
  const failedCalls: FetchCall[] = [];
  const failed = codexReadTaskForTest("codex_failed_terminal", readTerminalFixture(failedCalls));
  assertTerminalReadShape(failed, "codex_failed_terminal", "failed");

  // Scenario 3: missing task; the lookup failure must surface as a thrown error.
  const missingCalls: FetchCall[] = [];
  let missingError: Error | null = null;
  try {
    codexReadTaskForTest("codex_missing_terminal", missingTaskFixture(missingCalls));
  } catch (error) {
    missingError = error instanceof Error ? error : new Error(String(error));
  }
  assertCondition(
    missingError !== null && missingError.message.includes("task not found"),
    "missing task must fail with task not found",
    missingError?.message,
  );
  assertCondition(
    missingCalls.length === 1 && missingCalls[0]?.path.includes("/summary"),
    "missing task must not issue read mutation after failed lookup",
    missingCalls,
  );

  return {
    ok: true,
    checks: [
      "succeeded terminal read returns status, queue, timestamps, final response preview, and drill-down commands from summary before marking read",
      "failed terminal read returns final response, lastError, stderr tail, and runner classification",
      "missing task fails before issuing a read mutation",
      "prompt, tool logs, and feedback prompts stay behind progressive disclosure commands",
    ],
  };
}
|
|
|
|
// Entry point: run the contract and print its report as pretty-printed JSON on stdout.
// Any failed assertion propagates as an uncaught error before anything is written.
const contractReport = run();
process.stdout.write(`${JSON.stringify(contractReport, null, 2)}\n`);
|