@@ -54,8 +54,12 @@ function run(cmd, args, opts = {}) {
5454 } ;
5555}
5656
/**
 * Normalizes a file path so that equivalent spellings compare equal.
 *
 * Backslashes are converted to forward slashes and leading "./" segments are
 * removed, e.g. ".\\src\\a.js" and "src/a.js" both become "src/a.js".
 *
 * @param {*} path - Path-like value; any falsy value is treated as empty.
 * @returns {string} The normalized path, or '' when no usable path was given.
 */
function cleanPath(path) {
  return String(path || '')
    .replaceAll('\\', '/')
    // Drop every leading "./" (not just the first) so "././a" and "a"
    // normalize to the same string.
    .replace(/^(?:\.\/)+/, '');
}
60+
5761function addSpan ( map , file , start , end ) {
58- const clean = String ( file || '' ) . replaceAll ( '\\' , '/' ) . replace ( / ^ \. \/ / , '' ) ;
62+ const clean = cleanPath ( file ) ;
5963 if ( ! clean ) return ;
6064 const s = Math . max ( 1 , Number ( start ) || 1 ) ;
6165 const e = Math . max ( s , Number ( end ) || s ) ;
@@ -64,6 +68,85 @@ function addSpan(map, file, start, end) {
6468 map . set ( clean , list ) ;
6569}
6670
/**
 * Estimates a token count from a UTF-8 byte count using ceil(bytes / 4).
 *
 * @param {number} bytes - Size in bytes.
 * @returns {number|null} The estimated token count, or null when `bytes` is
 *   not a finite, non-negative number.
 */
function estimateTokensFromBytes(bytes) {
  // A negative size is not a real measurement; without this guard it would
  // produce -0 or a negative token estimate.
  if (!Number.isFinite(bytes) || bytes < 0) return null;
  return Math.ceil(bytes / 4);
}
75+
/**
 * Wraps a raw measurement in a record that states its unit and source, or
 * explains why the measurement is unavailable.
 *
 * @param {*} value - Raw measurement. Only numbers and non-blank numeric
 *   strings count as measurements.
 * @param {string} unit - Unit label stored on the record (e.g. 'ms').
 * @param {string} source - Description of where the value came from.
 * @param {string} [unavailableReason] - Explanation stored on the record when
 *   no finite value is available.
 * @returns {{value: number, unit: string, source: string} |
 *   {value: null, unit: string, source: string, unavailableReason: string}}
 */
function measuredNumber(value, unit, source, unavailableReason = 'not captured in source artifact') {
  // Number(null), Number('') and Number(false) are all 0, so coercing blindly
  // would report "measured 0" for inputs that really mean "no measurement".
  const isNumericInput =
    typeof value === 'number' || (typeof value === 'string' && value.trim() !== '');
  const numeric = isNumericInput ? Number(value) : NaN;
  if (Number.isFinite(numeric)) return { value: numeric, unit, source };
  return { value: null, unit, source, unavailableReason };
}
81+
/**
 * Returns the UTF-8 encoded size of a value's string form.
 *
 * @param {*} text - Value to measure; null and undefined count as empty.
 * @returns {number} Size in bytes.
 */
function byteCount(text) {
  // `??` rather than `||`: only null/undefined mean "no text". Falsy values
  // such as 0 or false still have a non-empty string form.
  return Buffer.byteLength(String(text ?? ''), 'utf8');
}
85+
/**
 * Builds the per-row timing record from readiness data, the evaluator result
 * and the row's wall-clock duration.
 *
 * Each field is a `measuredNumber` record, so an unavailable duration carries
 * a reason rather than a bare null.
 *
 * @param {object} readiness - Readiness data; `readiness.setupIndex` is read
 *   for setup/index/query durations.
 * @param {object|null} evaluator - Evaluator result whose `durationMs` is
 *   read, or a falsy value when the evaluator did not run.
 * @param {number} rowWallDurationMs - Wall-clock time spent on this row.
 * @param {string|null} [evaluatorSkippedReason] - Reason recorded when
 *   `evaluator` is falsy; defaults to 'evaluator did not run'.
 * @returns {object} Record with setupDurationMs, indexDurationMs,
 *   queryDurationMs, selectorDurationMs, evaluatorDurationMs and
 *   rowWallDurationMs.
 */
function buildTimeMetrics(readiness, evaluator, rowWallDurationMs, evaluatorSkippedReason = null) {
  const setupIndex = readiness?.setupIndex || {};
  const readinessSource = 'lane readiness setupIndex';
  const evaluatorSource = 'official ContextBench evaluator command';

  // Number(null) is 0, so a null duration would be reported as a measured
  // 0 ms. Nullish values are passed on as undefined, which coerces to NaN and
  // therefore yields the "unavailable" record.
  const readinessDuration = (key) =>
    measuredNumber(setupIndex[key] ?? undefined, 'ms', readinessSource, `readiness artifact did not report ${key}`);

  return {
    setupDurationMs: readinessDuration('setupDurationMs'),
    indexDurationMs: readinessDuration('indexDurationMs'),
    queryDurationMs: readinessDuration('queryDurationMs'),
    // No selector timing is ever passed in, so this field is always unavailable.
    selectorDurationMs: measuredNumber(undefined, 'ms', 'selector stage', 'selector ran before scoring and did not emit wall-clock telemetry'),
    evaluatorDurationMs: evaluator
      ? measuredNumber(evaluator.durationMs ?? undefined, 'ms', evaluatorSource)
      : measuredNumber(undefined, 'ms', evaluatorSource, evaluatorSkippedReason || 'evaluator did not run'),
    rowWallDurationMs: measuredNumber(rowWallDurationMs ?? undefined, 'ms', 'scorer per-lane wall clock'),
  };
}
99+
/**
 * Builds the per-row token/size record for the candidate pack, the prediction
 * JSON and the selector's provider usage.
 *
 * Candidate metrics are read from `selection.candidateMetrics`, falling back
 * to `selection.readiness.candidateMetrics`. Prediction size is the UTF-8
 * byte length of `JSON.stringify(prediction || {})`.
 *
 * @param {object} selection - Lane selection record.
 * @param {*} prediction - Prediction payload; falsy values are measured as `{}`.
 * @returns {object} Record with `estimator`, `candidatePack`, `prediction`
 *   and `selectorUsage` sections.
 */
function buildTokenMetrics(selection, prediction) {
  const candidateMetrics = selection.candidateMetrics || selection.readiness?.candidateMetrics || {};
  const selectorUsage = selection.selectorUsage || {};
  const candidateSource = candidateMetrics.source || 'candidate pack artifact';
  const predictionSource = 'official evaluator prediction JSON';
  const usageSource = 'selector provider usage';
  const usageUnavailable = 'selector usage telemetry was not captured for this proof artifact';
  const predictionBytes = byteCount(JSON.stringify(prediction || {}));

  // Number(null) and Number('') are 0, which would turn "not reported" into a
  // measured zero; treat those inputs as missing before coercing.
  const finiteOrNull = (value) => {
    if (value == null || value === '') return null;
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : null;
  };
  // Missing values are passed as undefined (not null) so measuredNumber
  // coerces them to NaN and emits the "unavailable" record with its reason.
  const candidateMeasure = (key, unit, fallbackReason) =>
    measuredNumber(
      finiteOrNull(candidateMetrics[key]) ?? undefined,
      unit,
      candidateSource,
      candidateMetrics.unavailableReason || fallbackReason,
    );
  const usageMeasure = (key) =>
    measuredNumber(selectorUsage[key] ?? undefined, 'tokens', usageSource, usageUnavailable);

  return {
    estimator: 'ceil(utf8_bytes/4); cost estimate only, not provider billing telemetry',
    candidatePack: {
      candidateCount: Number(selection.readiness?.candidateCount ?? selection.candidateCount ?? candidateMetrics.candidateCount ?? 0),
      fileCount: finiteOrNull(candidateMetrics.fileCount),
      spanCount: finiteOrNull(candidateMetrics.spanCount),
      bytes: candidateMeasure('bytes', 'bytes', 'candidate pack bytes were not emitted for this lane'),
      estimatedTokens: candidateMeasure('estimatedTokens', 'tokens', 'candidate pack token estimate was not emitted for this lane'),
    },
    prediction: {
      bytes: measuredNumber(predictionBytes, 'bytes', predictionSource),
      estimatedTokens: measuredNumber(estimateTokensFromBytes(predictionBytes) ?? undefined, 'tokens', predictionSource),
    },
    selectorUsage: {
      // `selections` is not a parameter: it is the module-level selections
      // object, used here as the run-wide fallback for the model name.
      model: selection.selectorModel || selections.model || 'gpt-5.4-mini-high',
      inputTokens: usageMeasure('inputTokens'),
      outputTokens: usageMeasure('outputTokens'),
      cachedInputTokens: usageMeasure('cachedInputTokens'),
      reasoningTokens: usageMeasure('reasoningTokens'),
      totalTokens: usageMeasure('totalTokens'),
    },
  };
}
133+
/**
 * Assembles the reliability record attached to a scored row.
 *
 * Status fields are copied from `rowBase`. Each provenance field (sourceRun,
 * sourceJob, sourceArtifact, sourceDigest) is taken from
 * `selection.readiness` when truthy there, otherwise from `selection`,
 * otherwise null.
 *
 * @param {object} selection - Lane selection record.
 * @param {object} rowBase - Row fields; setupStatus, indexStatus,
 *   toolCallable, nonEmptyPrediction and candidateCount are copied from it.
 * @param {string} status - Row status stored on the record.
 * @param {boolean} scoreable - Stored as `officialEvaluatorScoreable`.
 * @returns {object} The reliability record.
 */
function reliabilityFor(selection, rowBase, status, scoreable) {
  const provenance = (key) => selection.readiness?.[key] || selection[key] || null;
  const { setupStatus, indexStatus, toolCallable, nonEmptyPrediction, candidateCount } = rowBase;
  return {
    status,
    officialEvaluatorScoreable: scoreable,
    setupStatus,
    indexStatus,
    toolCallable,
    nonEmptyPrediction,
    candidateCount,
    sourceRun: provenance('sourceRun'),
    sourceJob: provenance('sourceJob'),
    sourceArtifact: provenance('sourceArtifact'),
    sourceDigest: provenance('sourceDigest'),
  };
}
149+
67150function resultTableRow ( row ) {
68151 const final = row . score ?. final || { } ;
69152 return {
@@ -79,9 +162,17 @@ function resultTableRow(row) {
79162 linePrecision : final . line ?. precision ?? null ,
80163 editlocRecall : row . score ?. editloc ?. recall ?? null ,
81164 editlocPrecision : row . score ?. editloc ?. precision ?? null ,
165+ setupDurationMs : row . timeMetrics ?. setupDurationMs ?. value ?? null ,
166+ indexDurationMs : row . timeMetrics ?. indexDurationMs ?. value ?? null ,
167+ queryDurationMs : row . timeMetrics ?. queryDurationMs ?. value ?? null ,
168+ evaluatorDurationMs : row . timeMetrics ?. evaluatorDurationMs ?. value ?? null ,
169+ rowWallDurationMs : row . timeMetrics ?. rowWallDurationMs ?. value ?? null ,
170+ candidateEstimatedTokens : row . tokenMetrics ?. candidatePack ?. estimatedTokens ?. value ?? null ,
171+ predictionEstimatedTokens : row . tokenMetrics ?. prediction ?. estimatedTokens ?. value ?? null ,
82172 } ;
83173}
84174
175+ const runStarted = Date . now ( ) ;
85176const runDir = join ( root , 'lane-score' ) ;
86177mkdirSync ( runDir , { recursive : true } ) ;
87178writeFileSync ( join ( runDir , 'selections.json' ) , JSON . stringify ( selections , null , 2 ) ) ;
@@ -106,14 +197,15 @@ if (gold.status !== 0) throw new Error(`gold materialization failed: ${gold.stde
106197
107198const rows = [ ] ;
108199for ( const selection of laneSelections ) {
200+ const rowStarted = Date . now ( ) ;
109201 const lane = selection . lane_id || selection . lane ;
110202 const laneDir = join ( runDir , lane ) ;
111203 mkdirSync ( laneDir , { recursive : true } ) ;
112204 const spans = Array . isArray ( selection . spans ) ? selection . spans : [ ] ;
113205 const files = Array . isArray ( selection . files ) ? selection . files : [ ] ;
114206 const spanMap = new Map ( ) ;
115207 for ( const span of spans ) addSpan ( spanMap , span . file , span . start , span . end ) ;
116- const predFiles = [ ...new Set ( [ ...files , ...spans . map ( ( span ) => String ( span . file || '' ) . replaceAll ( '\\' , '/' ) . replace ( / ^ \. \/ / , '' ) ) ] ) ] . filter ( Boolean ) ;
208+ const predFiles = [ ...new Set ( [ ...files , ...spans . map ( ( span ) => cleanPath ( span . file ) ) ] ) ] . filter ( Boolean ) ;
117209 const predSpans = Object . fromEntries ( spanMap . entries ( ) ) ;
118210 const nonEmptyPrediction = predFiles . length > 0 || spans . length > 0 ;
119211 const readiness = selection . readiness || { } ;
@@ -135,7 +227,17 @@ for (const selection of laneSelections) {
135227
136228 writeFileSync ( join ( laneDir , 'selection.json' ) , JSON . stringify ( selection , null , 2 ) ) ;
137229 if ( ! nonEmptyPrediction ) {
138- rows . push ( { ...rowBase , status : 'empty_prediction' , officialEvaluatorScoreable : false , score : null } ) ;
230+ const timeMetrics = buildTimeMetrics ( readiness , null , Date . now ( ) - rowStarted , 'prediction was empty' ) ;
231+ const row = {
232+ ...rowBase ,
233+ status : 'empty_prediction' ,
234+ officialEvaluatorScoreable : false ,
235+ score : null ,
236+ timeMetrics,
237+ tokenMetrics : buildTokenMetrics ( selection , null ) ,
238+ } ;
239+ row . reliability = reliabilityFor ( selection , rowBase , row . status , row . officialEvaluatorScoreable ) ;
240+ rows . push ( row ) ;
139241 continue ;
140242 }
141243
@@ -177,12 +279,16 @@ for (const selection of laneSelections) {
177279 if ( lines . length > 0 ) score = JSON . parse ( lines . at ( - 1 ) ) ;
178280 }
179281 const scoreable = evaluator . status === 0 && Boolean ( score ) ;
180- rows . push ( {
282+ const row = {
181283 ...rowBase ,
182284 status : scoreable ? 'completed' : 'judge_failed' ,
183285 officialEvaluatorScoreable : scoreable ,
184286 score,
185- } ) ;
287+ timeMetrics : buildTimeMetrics ( readiness , evaluator , Date . now ( ) - rowStarted ) ,
288+ tokenMetrics : buildTokenMetrics ( selection , prediction ) ,
289+ } ;
290+ row . reliability = reliabilityFor ( selection , rowBase , row . status , row . officialEvaluatorScoreable ) ;
291+ rows . push ( row ) ;
186292}
187293
188294const scoreableRows = rows . filter ( ( row ) => row . officialEvaluatorScoreable ) ;
@@ -193,9 +299,14 @@ const summary = {
193299 requiredCompetitors : requiredLanes . length ,
194300 requiredLanes,
195301 setupIndexCostReportedSeparately : true ,
302+ officialEvaluatorQualityRowsOnly : true ,
196303 model : selections . model || 'gpt-5.4-mini-high' ,
197304 predictionSource : selections . predictionSource || 'gpt-5.4-mini-high subagent selections over real lane candidate packs' ,
198305 caveats : selections . caveats || [ ] ,
306+ runMetrics : {
307+ goldMaterializationDurationMs : gold . durationMs ,
308+ totalWallDurationMs : Date . now ( ) - runStarted ,
309+ } ,
199310 resultsTable : scoreableRows . map ( resultTableRow ) ,
200311 rows,
201312} ;
0 commit comments