Skip to content

Commit e6abab1

Browse files
committed
Add publishable ContextBench lane metrics
1 parent da23692 commit e6abab1

1 file changed

Lines changed: 116 additions & 5 deletions

File tree

scripts/contextbench-score-five-lane-selections.mjs

Lines changed: 116 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,12 @@ function run(cmd, args, opts = {}) {
5454
};
5555
}
5656

57+
/**
 * Normalize a file path for use as a lookup key.
 *
 * Backslashes are converted to forward slashes and a single leading `./`
 * is removed. Falsy input (null, undefined, '', 0) yields an empty string.
 *
 * @param {*} path - Raw path value; coerced to a string when truthy.
 * @returns {string} The normalized path, or '' when the input is falsy.
 */
function cleanPath(path) {
  const raw = path ? String(path) : '';
  const forwardSlashed = raw.split('\\').join('/');
  // Only one leading "./" is dropped; "././a" becomes "./a".
  return forwardSlashed.startsWith('./') ? forwardSlashed.slice(2) : forwardSlashed;
}
60+
5761
function addSpan(map, file, start, end) {
58-
const clean = String(file || '').replaceAll('\\', '/').replace(/^\.\//, '');
62+
const clean = cleanPath(file);
5963
if (!clean) return;
6064
const s = Math.max(1, Number(start) || 1);
6165
const e = Math.max(s, Number(end) || s);
@@ -64,6 +68,85 @@ function addSpan(map, file, start, end) {
6468
map.set(clean, list);
6569
}
6670

71+
/**
 * Estimate a token count from a UTF-8 byte count using ceil(bytes / 4).
 *
 * @param {number} bytes - Byte count; must be a finite number (no coercion).
 * @returns {number|null} The rounded-up estimate, or null when `bytes` is
 *   not a finite number.
 */
function estimateTokensFromBytes(bytes) {
  // Number.isFinite does not coerce, so strings and null fall through to null.
  return Number.isFinite(bytes) ? Math.ceil(bytes / 4) : null;
}
75+
76+
/**
 * Wrap a raw measurement in a self-describing metric record.
 *
 * @param {*} value - Raw measurement. Numbers, bigints and non-blank numeric
 *   strings are accepted; anything else is treated as "not available".
 * @param {string} unit - Unit label stored on the record (e.g. 'ms', 'bytes').
 * @param {string} source - Description of where the measurement came from.
 * @param {string} [unavailableReason] - Explanation stored when no finite
 *   number could be derived from `value`.
 * @returns {{value: number, unit: string, source: string} |
 *   {value: null, unit: string, source: string, unavailableReason: string}}
 */
function measuredNumber(value, unit, source, unavailableReason = 'not captured in source artifact') {
  // Number(null), Number(''), Number(false) and Number([]) all evaluate to 0,
  // which is finite. Coercing blindly would publish a missing measurement as
  // a real 0, so only genuinely numeric inputs are allowed through.
  const kind = typeof value;
  const isBlankString = kind === 'string' && value.trim() === '';
  const isNumericInput = (kind === 'number' || kind === 'bigint' || kind === 'string') && !isBlankString;
  const numeric = isNumericInput ? Number(value) : Number.NaN;
  if (Number.isFinite(numeric)) return { value: numeric, unit, source };
  return { value: null, unit, source, unavailableReason };
}
81+
82+
/**
 * Count the UTF-8 encoded size of a value's string form.
 *
 * @param {*} text - Value to measure; falsy values are treated as ''.
 * @returns {number} Number of bytes in the UTF-8 encoding.
 */
function byteCount(text) {
  const content = text ? String(text) : '';
  return Buffer.byteLength(content, 'utf8');
}
85+
86+
/**
 * Assemble the per-lane timing record.
 *
 * Every entry is produced by `measuredNumber`, so each carries a value (or
 * null), a unit, a source label and, when unavailable, a reason.
 *
 * @param {object} readiness - Lane readiness object; `readiness.setupIndex`
 *   is read for the setup/index/query durations.
 * @param {object|null} evaluator - Evaluator run result whose `durationMs`
 *   is read, or a falsy value when the evaluator did not run.
 * @param {number} rowWallDurationMs - Wall-clock time spent on this lane.
 * @param {string|null} [evaluatorSkippedReason] - Reason recorded for the
 *   evaluator duration when `evaluator` is falsy.
 * @returns {object} Record with setupDurationMs, indexDurationMs,
 *   queryDurationMs, selectorDurationMs, evaluatorDurationMs and
 *   rowWallDurationMs entries.
 */
function buildTimeMetrics(readiness, evaluator, rowWallDurationMs, evaluatorSkippedReason = null) {
  const setupIndex = readiness.setupIndex || {};

  // A null handed to a Number()-based coercion becomes 0 and would be
  // published as a measured 0 ms. Mapping nullish to undefined makes the
  // coercion yield NaN, so the metric is reported as unavailable instead.
  const orMissing = (raw) => raw ?? undefined;
  const readinessMetric = (field) => measuredNumber(
    orMissing(setupIndex[field]),
    'ms',
    'lane readiness setupIndex',
    `readiness artifact did not report ${field}`,
  );

  const evaluatorSource = 'official ContextBench evaluator command';
  const evaluatorDurationMs = evaluator
    ? measuredNumber(orMissing(evaluator.durationMs), 'ms', evaluatorSource)
    : measuredNumber(undefined, 'ms', evaluatorSource, evaluatorSkippedReason || 'evaluator did not run');

  return {
    setupDurationMs: readinessMetric('setupDurationMs'),
    indexDurationMs: readinessMetric('indexDurationMs'),
    queryDurationMs: readinessMetric('queryDurationMs'),
    // The selector stage never reports timing here, so this is always unavailable.
    selectorDurationMs: measuredNumber(undefined, 'ms', 'selector stage', 'selector ran before scoring and did not emit wall-clock telemetry'),
    evaluatorDurationMs,
    rowWallDurationMs: measuredNumber(orMissing(rowWallDurationMs), 'ms', 'scorer per-lane wall clock'),
  };
}
99+
100+
/**
 * Assemble the per-lane token and byte accounting record.
 *
 * @param {object} selection - Lane selection. Reads `candidateMetrics`
 *   (falling back to `readiness.candidateMetrics`), `candidateCount`,
 *   `readiness.candidateCount`, `selectorUsage` and `selectorModel`.
 * @param {object|null} prediction - Prediction object whose JSON size is
 *   measured; a falsy value is measured as `{}`.
 * @returns {object} Record with `estimator`, `candidatePack`, `prediction`
 *   and `selectorUsage` sections.
 */
function buildTokenMetrics(selection, prediction) {
  // Number(null) and Number('') are 0, so coercing an absent field directly
  // would report it as a measured zero. Absent values become undefined,
  // which a Number()-based coercion turns into NaN, i.e. "unavailable".
  const finiteOrMissing = (raw) => {
    if (raw == null || raw === '') return undefined;
    const numeric = Number(raw);
    return Number.isFinite(numeric) ? numeric : undefined;
  };

  const candidateMetrics = selection.candidateMetrics || selection.readiness?.candidateMetrics || {};
  const candidateSource = candidateMetrics.source || 'candidate pack artifact';
  const predictionBytes = byteCount(JSON.stringify(prediction || {}));
  const selectorUsage = selection.selectorUsage || {};
  const usageMetric = (field) => measuredNumber(
    finiteOrMissing(selectorUsage[field]),
    'tokens',
    'selector provider usage',
    'selector usage telemetry was not captured for this proof artifact',
  );

  return {
    estimator: 'ceil(utf8_bytes/4); cost estimate only, not provider billing telemetry',
    candidatePack: {
      candidateCount: Number(selection.readiness?.candidateCount ?? selection.candidateCount ?? candidateMetrics.candidateCount ?? 0),
      fileCount: finiteOrMissing(candidateMetrics.fileCount) ?? null,
      spanCount: finiteOrMissing(candidateMetrics.spanCount) ?? null,
      bytes: measuredNumber(
        finiteOrMissing(candidateMetrics.bytes),
        'bytes',
        candidateSource,
        candidateMetrics.unavailableReason || 'candidate pack bytes were not emitted for this lane',
      ),
      estimatedTokens: measuredNumber(
        finiteOrMissing(candidateMetrics.estimatedTokens),
        'tokens',
        candidateSource,
        candidateMetrics.unavailableReason || 'candidate pack token estimate was not emitted for this lane',
      ),
    },
    prediction: {
      bytes: measuredNumber(predictionBytes, 'bytes', 'official evaluator prediction JSON'),
      estimatedTokens: measuredNumber(estimateTokensFromBytes(predictionBytes), 'tokens', 'official evaluator prediction JSON'),
    },
    selectorUsage: {
      // `selections` is the module-level selections document defined elsewhere in this file.
      model: selection.selectorModel || selections.model || 'gpt-5.4-mini-high',
      inputTokens: usageMetric('inputTokens'),
      outputTokens: usageMetric('outputTokens'),
      cachedInputTokens: usageMetric('cachedInputTokens'),
      reasoningTokens: usageMetric('reasoningTokens'),
      totalTokens: usageMetric('totalTokens'),
    },
  };
}
133+
134+
/**
 * Build the reliability section of a lane row.
 *
 * Copies the readiness-related fields from `rowBase`, records the final
 * status and scoreability, and attaches provenance fields. Each provenance
 * field is taken from `selection.readiness` when truthy, otherwise from
 * `selection` itself, otherwise null.
 *
 * @param {object} selection - Lane selection holding provenance fields.
 * @param {object} rowBase - Row fields computed before scoring.
 * @param {string} status - Final row status.
 * @param {boolean} scoreable - Whether the official evaluator scored the row.
 * @returns {object} The reliability record.
 */
function reliabilityFor(selection, rowBase, status, scoreable) {
  const { setupStatus, indexStatus, toolCallable, nonEmptyPrediction, candidateCount } = rowBase;
  const provenance = (field) => selection.readiness?.[field] || selection[field] || null;

  return {
    status,
    officialEvaluatorScoreable: scoreable,
    setupStatus,
    indexStatus,
    toolCallable,
    nonEmptyPrediction,
    candidateCount,
    sourceRun: provenance('sourceRun'),
    sourceJob: provenance('sourceJob'),
    sourceArtifact: provenance('sourceArtifact'),
    sourceDigest: provenance('sourceDigest'),
  };
}
149+
67150
function resultTableRow(row) {
68151
const final = row.score?.final || {};
69152
return {
@@ -79,9 +162,17 @@ function resultTableRow(row) {
79162
linePrecision: final.line?.precision ?? null,
80163
editlocRecall: row.score?.editloc?.recall ?? null,
81164
editlocPrecision: row.score?.editloc?.precision ?? null,
165+
setupDurationMs: row.timeMetrics?.setupDurationMs?.value ?? null,
166+
indexDurationMs: row.timeMetrics?.indexDurationMs?.value ?? null,
167+
queryDurationMs: row.timeMetrics?.queryDurationMs?.value ?? null,
168+
evaluatorDurationMs: row.timeMetrics?.evaluatorDurationMs?.value ?? null,
169+
rowWallDurationMs: row.timeMetrics?.rowWallDurationMs?.value ?? null,
170+
candidateEstimatedTokens: row.tokenMetrics?.candidatePack?.estimatedTokens?.value ?? null,
171+
predictionEstimatedTokens: row.tokenMetrics?.prediction?.estimatedTokens?.value ?? null,
82172
};
83173
}
84174

175+
const runStarted = Date.now();
85176
const runDir = join(root, 'lane-score');
86177
mkdirSync(runDir, { recursive: true });
87178
writeFileSync(join(runDir, 'selections.json'), JSON.stringify(selections, null, 2));
@@ -106,14 +197,15 @@ if (gold.status !== 0) throw new Error(`gold materialization failed: ${gold.stde
106197

107198
const rows = [];
108199
for (const selection of laneSelections) {
200+
const rowStarted = Date.now();
109201
const lane = selection.lane_id || selection.lane;
110202
const laneDir = join(runDir, lane);
111203
mkdirSync(laneDir, { recursive: true });
112204
const spans = Array.isArray(selection.spans) ? selection.spans : [];
113205
const files = Array.isArray(selection.files) ? selection.files : [];
114206
const spanMap = new Map();
115207
for (const span of spans) addSpan(spanMap, span.file, span.start, span.end);
116-
const predFiles = [...new Set([...files, ...spans.map((span) => String(span.file || '').replaceAll('\\', '/').replace(/^\.\//, ''))])].filter(Boolean);
208+
const predFiles = [...new Set([...files, ...spans.map((span) => cleanPath(span.file))])].filter(Boolean);
117209
const predSpans = Object.fromEntries(spanMap.entries());
118210
const nonEmptyPrediction = predFiles.length > 0 || spans.length > 0;
119211
const readiness = selection.readiness || {};
@@ -135,7 +227,17 @@ for (const selection of laneSelections) {
135227

136228
writeFileSync(join(laneDir, 'selection.json'), JSON.stringify(selection, null, 2));
137229
if (!nonEmptyPrediction) {
138-
rows.push({ ...rowBase, status: 'empty_prediction', officialEvaluatorScoreable: false, score: null });
230+
const timeMetrics = buildTimeMetrics(readiness, null, Date.now() - rowStarted, 'prediction was empty');
231+
const row = {
232+
...rowBase,
233+
status: 'empty_prediction',
234+
officialEvaluatorScoreable: false,
235+
score: null,
236+
timeMetrics,
237+
tokenMetrics: buildTokenMetrics(selection, null),
238+
};
239+
row.reliability = reliabilityFor(selection, rowBase, row.status, row.officialEvaluatorScoreable);
240+
rows.push(row);
139241
continue;
140242
}
141243

@@ -177,12 +279,16 @@ for (const selection of laneSelections) {
177279
if (lines.length > 0) score = JSON.parse(lines.at(-1));
178280
}
179281
const scoreable = evaluator.status === 0 && Boolean(score);
180-
rows.push({
282+
const row = {
181283
...rowBase,
182284
status: scoreable ? 'completed' : 'judge_failed',
183285
officialEvaluatorScoreable: scoreable,
184286
score,
185-
});
287+
timeMetrics: buildTimeMetrics(readiness, evaluator, Date.now() - rowStarted),
288+
tokenMetrics: buildTokenMetrics(selection, prediction),
289+
};
290+
row.reliability = reliabilityFor(selection, rowBase, row.status, row.officialEvaluatorScoreable);
291+
rows.push(row);
186292
}
187293

188294
const scoreableRows = rows.filter((row) => row.officialEvaluatorScoreable);
@@ -193,9 +299,14 @@ const summary = {
193299
requiredCompetitors: requiredLanes.length,
194300
requiredLanes,
195301
setupIndexCostReportedSeparately: true,
302+
officialEvaluatorQualityRowsOnly: true,
196303
model: selections.model || 'gpt-5.4-mini-high',
197304
predictionSource: selections.predictionSource || 'gpt-5.4-mini-high subagent selections over real lane candidate packs',
198305
caveats: selections.caveats || [],
306+
runMetrics: {
307+
goldMaterializationDurationMs: gold.durationMs,
308+
totalWallDurationMs: Date.now() - runStarted,
309+
},
199310
resultsTable: scoreableRows.map(resultTableRow),
200311
rows,
201312
};

0 commit comments

Comments
 (0)