@@ -66,12 +66,31 @@ function parseGithubPrLabels(raw: string): string[] {
6666 . filter ( Boolean ) ;
6767}
6868
/**
 * Converts the value taken from a `models:judge:<value>` label into the id that is
 * passed on as the evaluation connector id.
 *
 * Three cases, checked in this order:
 *  1. `eis/<modelId>`  -> `eis-` followed by normalizeBuildkiteKey(<modelId>)
 *  2. any other value containing `/` -> `litellm-` followed by normalizeBuildkiteKey(<whole value>)
 *  3. anything else -> returned unchanged
 *
 * NOTE(review): normalizeBuildkiteKey is defined outside this view; presumably it rewrites
 * the value into a Buildkite-safe key — confirm before relying on the exact output format.
 * NOTE(review): the whitespace inside the template literals below looks like an artifact of
 * how this text was extracted — confirm against the real file that no trailing space is emitted.
 *
 * @param raw The judge value from the label (model id, model group, or connector id).
 * @returns The connector id derived from `raw` according to the three cases above.
 */
69+ function normalizeEvaluationConnectorId ( raw : string ) : string {
70+ // Support `models:judge:eis/<modelId>` where the judge value is a model id, not a connector id.
// This check must come before the generic '/' check below, because an `eis/...` value also contains '/'.
71+ if ( raw . startsWith ( 'eis/' ) ) {
// Only the part after the `eis/` prefix is normalized.
72+ return `eis-${ normalizeBuildkiteKey ( raw . slice ( 'eis/' . length ) ) } ` ;
73+ }
74+
75+ // Support `models:judge:<modelGroup>` (e.g. `llm-gateway/gpt-5.2`) where the judge value is a model group.
76+ if ( raw . includes ( '/' ) ) {
// Here the whole value (including the part before the '/') is normalized.
77+ return `litellm-${ normalizeBuildkiteKey ( raw ) } ` ;
78+ }
79+
80+ // Already a connector id (e.g. `litellm-*` / `eis-*`) or some other explicit id: pass it through as-is.
81+ return raw ;
82+ }
83+
6984function buildEvalsYaml ( {
7085 selectedSuites,
7186 modelGroups,
87+ evaluationConnectorId,
88+ includeEisModels,
7289} : {
7390 selectedSuites : EvalsSuiteMetadataEntry [ ] ;
7491 modelGroups : string [ ] | undefined ;
92+ evaluationConnectorId : string | undefined ;
93+ includeEisModels : boolean ;
7594} ) : string {
7695 const suiteSteps = selectedSuites
7796 . map ( ( suite ) => {
@@ -81,17 +100,29 @@ function buildEvalsYaml({
81100 modelGroups && modelGroups . length > 0
82101 ? ` EVAL_MODEL_GROUPS: '${ modelGroups . join ( ',' ) } '`
83102 : null ;
103+ const evaluationConnectorIdEnv = evaluationConnectorId
104+ ? ` EVALUATION_CONNECTOR_ID: '${ evaluationConnectorId } '`
105+ : null ;
106+ const includeEisModelsEnv = includeEisModels
107+ ? ` EVAL_INCLUDE_EIS_MODELS: '1'`
108+ : null ;
84109 return [
85110 ` - label: '${ label } '` ,
86111 ` key: ${ key } ` ,
87112 ` command: bash .buildkite/scripts/steps/evals/run_suite.sh` ,
88113 ` env:` ,
89114 ` KBN_EVALS: '1'` ,
115+ ` FTR_EIS_CCM: '1'` ,
90116 ` EVAL_SUITE_ID: '${ suite . id } '` ,
91117 ` EVAL_FANOUT: '1'` ,
118+ ...( evaluationConnectorIdEnv ? [ evaluationConnectorIdEnv ] : [ ] ) ,
119+ ...( includeEisModelsEnv ? [ includeEisModelsEnv ] : [ ] ) ,
92120 ...( modelGroupsEnv ? [ modelGroupsEnv ] : [ ] ) ,
93121 ` timeout_in_minutes: 60` ,
94122 ` agents:` ,
123+ ` image: family/kibana-ubuntu-2404` ,
124+ ` imageProject: elastic-images-prod` ,
125+ ` provider: gcp` ,
95126 ` machineType: n2-standard-8` ,
96127 ` preemptible: true` ,
97128 ` retry:` ,
@@ -127,23 +158,35 @@ function buildEvalsYaml({
127158 * for the matching eval suites.
128159 */
129160export function getEvalPipeline ( githubPrLabels : string ) : string | null {
161+ const parsedLabels = parseGithubPrLabels ( githubPrLabels ) ;
162+
130163 // Run eval suite(s) when their GH label(s) are present (see `evals.suites.json`).
131164 const evalSuites = readEvalsSuiteMetadata ( ) ;
132- const runAllEvals = githubPrLabels . includes ( 'evals:all' ) ;
165+ const runAllEvals = parsedLabels . includes ( 'evals:all' ) ;
133166 const selectedEvalSuites = runAllEvals
134167 ? evalSuites
135168 : evalSuites . filter ( ( suite ) => {
136169 const labels = suite . ciLabels ?. length ? suite . ciLabels : [ `evals:${ suite . id } ` ] ;
137- return labels . some ( ( label ) => githubPrLabels . includes ( label ) ) ;
170+ return labels . some ( ( label ) => parsedLabels . includes ( label ) ) ;
138171 } ) ;
139172 // Optional model filtering for eval fanout (models:* labels).
140173 // - No `models:*` labels => run all models returned by LiteLLM (current behavior).
141174 // - One or more `models:<model-group>` labels => only run connectors whose `defaultModel`
142175 // matches one of those model groups.
143176 // - `models:all` can be used to explicitly opt into all models (ignored if combined with specifics).
144- const parsedLabels = parseGithubPrLabels ( githubPrLabels ) ;
177+ const rawEvaluationConnectorId = parsedLabels
178+ . find ( ( label ) => label . startsWith ( 'models:judge:' ) )
179+ ?. slice ( 'models:judge:' . length )
180+ ?. trim ( ) ;
181+ const evaluationConnectorId = rawEvaluationConnectorId
182+ ? normalizeEvaluationConnectorId ( rawEvaluationConnectorId )
183+ : undefined ;
184+ const includeEisModels =
185+ parsedLabels . some ( ( label ) => label === 'models:all' || label . startsWith ( 'models:eis/' ) ) ||
186+ ! ! rawEvaluationConnectorId ?. startsWith ( 'eis/' ) ||
187+ ! ! evaluationConnectorId ?. startsWith ( 'eis-' ) ;
145188 const selectedModelGroups = parsedLabels
146- . filter ( ( label ) => label . startsWith ( 'models:' ) )
189+ . filter ( ( label ) => label . startsWith ( 'models:' ) && ! label . startsWith ( 'models:judge:' ) )
147190 . map ( ( label ) => label . slice ( 'models:' . length ) )
148191 . map ( ( value ) => value . trim ( ) )
149192 . filter ( Boolean )
@@ -156,5 +199,7 @@ export function getEvalPipeline(githubPrLabels: string): string | null {
156199 return buildEvalsYaml ( {
157200 selectedSuites : selectedEvalSuites ,
158201 modelGroups : selectedModelGroups . length > 0 ? selectedModelGroups : undefined ,
202+ evaluationConnectorId,
203+ includeEisModels,
159204 } ) ;
160205}
0 commit comments