Skip to content

Commit 893d1fa

Browse files
committed
Feedback from review
1 parent 108bb94 commit 893d1fa

5 files changed

Lines changed: 14 additions & 57 deletions

File tree

x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/product_documentation/product_documentation.spec.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -182,7 +182,6 @@ evaluate.describe('AgentBuilder product documentation tool', { tag: '@svlOblt' }
182182
metadata: {
183183
agentId: productDocAgentId,
184184
expectedOnlyToolId: platformCoreTools.productDocumentation,
185-
requireOnlyFromToolOutput: true,
186185
product: 'kibana',
187186
},
188187
},

x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/src/evaluate_dataset.ts

Lines changed: 2 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -23,18 +23,16 @@ import type { ExperimentTask } from '@kbn/evals/src/types';
2323
import type { TaskOutput } from '@arizeai/phoenix-client/dist/esm/types/experiments';
2424
import type { EsClient } from '@kbn/scout';
2525
import type { ToolingLog } from '@kbn/tooling-log';
26-
import type { AgentBuilderEvaluationChatClient } from './chat_client';
2726
import {
28-
containsAllTerms,
2927
extractAllStrings,
3028
extractMaxSemver,
3129
extractReleaseDateNearVersion,
3230
getBooleanMeta,
3331
getFinalAssistantMessage,
3432
getStringMeta,
3533
getToolCallSteps,
36-
includesOneOf,
37-
} from './evaluate_dataset_utils';
34+
} from '@kbn/evals';
35+
import type { AgentBuilderEvaluationChatClient } from './chat_client';
3836

3937
interface DatasetExample extends Example {
4038
input: {
@@ -185,52 +183,6 @@ export function createEvaluateDataset({
185183
};
186184
},
187185
},
188-
{
189-
name: 'OnlyFromToolOutputHeuristic',
190-
kind: 'CODE' as const,
191-
evaluate: async ({ output, metadata }) => {
192-
if (!getBooleanMeta(metadata, 'requireOnlyFromToolOutput')) return { score: 1 };
193-
194-
const expectedOnlyToolId = getStringMeta(metadata, 'expectedOnlyToolId');
195-
const toolCalls = getToolCallSteps(output as TaskOutput);
196-
const matching = expectedOnlyToolId
197-
? toolCalls.filter((t) => t.tool_id === expectedOnlyToolId)
198-
: toolCalls;
199-
200-
const strings: string[] = [];
201-
for (const call of matching) {
202-
extractAllStrings(call.results, strings);
203-
}
204-
const toolText = strings.join('\n');
205-
const answer = getFinalAssistantMessage(output as TaskOutput);
206-
207-
// The prompt explicitly asks about the relationship between Elasticsearch, Kibana, and Logstash.
208-
// If the retrieved docs don't mention all three, the agent should explicitly state insufficiency.
209-
const requiredTerms = ['elasticsearch', 'kibana', 'logstash'];
210-
const hasAllRequiredTerms = containsAllTerms(toolText, requiredTerms);
211-
if (hasAllRequiredTerms) return { score: 1 };
212-
213-
const explicitlyInsufficient = includesOneOf(answer, [
214-
'insufficient',
215-
'not enough information',
216-
"don't have enough",
217-
'do not have enough',
218-
"couldn't find",
219-
'could not find',
220-
"didn't find",
221-
'did not find',
222-
]);
223-
224-
return {
225-
score: explicitlyInsufficient ? 1 : 0,
226-
metadata: {
227-
requiredTerms,
228-
hasAllRequiredTerms,
229-
answerPreview: answer.slice(0, 500),
230-
},
231-
};
232-
},
233-
},
234186
{
235187
name: 'DocVersionReleaseDate',
236188
kind: 'CODE' as const,

x-pack/platform/packages/shared/ai-infra/kbn-evals-suite-llm-tasks/evals/retrieve_documentation/retrieve_documentation.spec.ts

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ import type { TaskOutput } from '@arizeai/phoenix-client/dist/esm/types/experime
1010
import type { ElasticsearchClient, KibanaRequest } from '@kbn/core/server';
1111
import type { Logger } from '@kbn/logging';
1212
import { defaultInferenceEndpoints } from '@kbn/inference-common';
13-
import { evaluate, selectEvaluators } from '@kbn/evals';
13+
import { containsAllTerms, evaluate, selectEvaluators } from '@kbn/evals';
1414
import { SearchService } from '@kbn/product-doc-base-plugin/server/services/search/search_service';
1515
import { retrieveDocumentation } from '@kbn/llm-tasks-plugin/server/tasks/retrieve_documentation';
1616
import type { ProductName } from '@kbn/product-doc-common';
@@ -57,11 +57,6 @@ type RetrieveDocumentationTaskOutput = TaskOutput & {
5757
}>;
5858
};
5959

60-
const containsAllTerms = (text: string, terms: string[]) => {
61-
const lower = text.toLowerCase();
62-
return terms.every((t) => lower.includes(t.toLowerCase()));
63-
};
64-
6560
const createNoopLogger = (): Logger =>
6661
({
6762
trace: () => {},

x-pack/platform/packages/shared/kbn-evals/index.ts

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,17 @@ export { createQuantitativeCorrectnessEvaluators } from './src/evaluators/correc
1313
export { createQuantitativeGroundednessEvaluator } from './src/evaluators/groundedness';
1414
export type { EvaluationDataset, EvaluationWorkerFixtures, EvaluationReport } from './src/types';
1515
export { withEvaluatorSpan } from './src/utils/tracing';
16+
export {
17+
containsAllTerms,
18+
extractAllStrings,
19+
extractMaxSemver,
20+
extractReleaseDateNearVersion,
21+
getBooleanMeta,
22+
getFinalAssistantMessage,
23+
getStringMeta,
24+
getToolCallSteps,
25+
includesOneOf,
26+
} from './src/utils/evaluation_helpers';
1627
export {
1728
type EvaluationReporter,
1829
createDefaultTerminalReporter,

x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/src/evaluate_dataset_utils.ts renamed to x-pack/platform/packages/shared/kbn-evals/src/utils/evaluation_helpers.ts

File renamed without changes.

0 commit comments

Comments
 (0)