
Commit 31f5258

itamargolan and claude authored
[OPIK-5759] [BE/FE] feat: multi-provider LLM-as-judge support for eval suite assertions (#6167)
* [OPIK-5759] [BE/FE] feat: multi-provider LLM-as-judge support for eval suite assertions

  Support OpenAI, Anthropic, and Gemini as LLM-as-judge providers for eval suite assertions. Previously only OpenAI was implicitly supported. The model is resolved from connected providers using a priority order (OpenAI > Anthropic > Gemini).

  Backend:
  - Add SupportedJudgeProvider enum with provider-to-model mapping
  - Move provider resolution from config into EvalSuiteAssertionSampler
  - Keep EvalSuiteEvaluatorMapper as a pure data transformer
  - Remove unused defaultModelName from EvalSuiteConfig
  - Fix: don't send snakeCased config keys to backend (camelCase expected)

  Frontend:
  - Add provider validation: disable run button when no supported provider is connected
  - Pass pre-computed boolean to RunOnDatasetDialog instead of raw provider keys

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor(eval-suite): improve evaluator mapper clarity and add unit tests

  - Merge deserialization and model assignment into deserializeScoringCode()
  - Remove separate setModel/withModel method
  - Add parameterized unit tests for resolveModel() provider priority

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor: remove FE provider blocking, add backend model fallback from trace metadata

  - Remove frontend logic that blocked eval suite runs without a supported LLM provider (revert llm.ts, PlaygroundHeader, RunOnDatasetDialog)
  - Resolve model once per batch on backend: try connected providers (OpenAI > Anthropic > Gemini > Vertex AI), fall back to completion task model from trace metadata
  - Add SupportedJudgeProvider enum referencing model name enums for compile-time safety
  - Store eval_suite_model in trace metadata for fallback resolution
  - Add Vertex AI as supported eval suite judge provider

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: address PR review comments for eval suite model resolution

  - Extract SupportedJudgeProvider to its own file
  - Use streams instead of for-loop in resolveModel
  - Preserve model parameters (temperature, seed, customParameters) when overriding name
  - Guard against null modelName with early return and warning log
  - Replace hardcoded model strings in tests with enum constants
  - Use imported Context instead of qualified reactor.util.context.Context

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: add proper import for reactor.util.context.Context

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: update EvalSuiteAssertionSamplerTest for null model guard

  - Mock connected OpenAI provider in setUp so existing tests pass
  - Add test for eval_suite_model metadata fallback when no provider connected
  - Add test for early return when neither provider nor metadata model available

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
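For orientation, the per-batch resolution described above boils down to the following flow (condensed from the EvalSuiteAssertionSampler diff further down; not a standalone, compilable unit):

    var connectedProviders = getConnectedProviders(tracesBatch.workspaceId());   // providers with stored API keys
    String modelName = SupportedJudgeProvider.resolveModel(connectedProviders)   // priority: OpenAI > Anthropic > Gemini > Vertex AI
            .or(() -> getMetadataString(completeTraces.getFirst(), "eval_suite_model"))  // fallback: completion task model from trace metadata
            .orElse(null);
    if (modelName == null) {
        return;  // no judge model available: log a warning and skip the batch
    }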
1 parent 95ed74c commit 31f5258

File tree

10 files changed: +335 -39 lines changed


apps/opik-backend/config.yml

Lines changed: 0 additions & 3 deletions
@@ -595,9 +595,6 @@ onlineScoring:
 
 # Configuration for Evaluation Suite assertions
 evalSuite:
-  # Default: gpt-5-nano
-  # Description: Default LLM model used for eval suite assertions when the evaluator config has no model specified
-  defaultModelName: ${EVAL_SUITE_DEFAULT_MODEL_NAME:-gpt-5-nano}
   # Default: 1
   # Description: Number of LLM runs per dataset item during eval suite execution
   defaultRunsPerItem: ${EVAL_SUITE_DEFAULT_RUNS_PER_ITEM:-1}

apps/opik-backend/src/main/java/com/comet/opik/api/resources/v1/events/EvalSuiteAssertionSampler.java

Lines changed: 42 additions & 6 deletions
@@ -2,7 +2,9 @@
 
 import com.comet.opik.api.DatasetVersion;
 import com.comet.opik.api.EvaluatorItem;
+import com.comet.opik.api.LlmProvider;
 import com.comet.opik.api.PromptType;
+import com.comet.opik.api.ProviderApiKey;
 import com.comet.opik.api.Trace;
 import com.comet.opik.api.evaluators.AutomationRuleEvaluatorType;
 import com.comet.opik.api.events.TraceToScoreLlmAsJudge;
@@ -11,6 +13,7 @@
 import com.comet.opik.domain.DatasetItemService;
 import com.comet.opik.domain.DatasetVersionService;
 import com.comet.opik.domain.IdGenerator;
+import com.comet.opik.domain.LlmProviderApiKeyService;
 import com.comet.opik.domain.evaluators.OnlineScorePublisher;
 import com.comet.opik.infrastructure.EvalSuiteConfig;
 import com.comet.opik.infrastructure.auth.RequestContext;
@@ -21,6 +24,7 @@
 import lombok.extern.slf4j.Slf4j;
 import reactor.core.publisher.Mono;
 import reactor.core.scheduler.Schedulers;
+import reactor.util.context.Context;
 import ru.vyarus.dropwizard.guice.module.installer.feature.eager.EagerSingleton;
 import ru.vyarus.dropwizard.guice.module.yaml.bind.Config;
 
@@ -30,7 +34,9 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
 import java.util.UUID;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 /**
@@ -56,6 +62,7 @@ public class EvalSuiteAssertionSampler {
     private final IdGenerator idGenerator;
     private final EvalSuiteConfig evalSuiteConfig;
     private final EvalSuiteEvaluatorMapper evaluatorMapper;
+    private final LlmProviderApiKeyService llmProviderApiKeyService;
 
     @Inject
     public EvalSuiteAssertionSampler(
@@ -64,13 +71,15 @@ public EvalSuiteAssertionSampler(
             @NonNull OnlineScorePublisher onlineScorePublisher,
             @NonNull IdGenerator idGenerator,
             @NonNull @Config("evalSuite") EvalSuiteConfig evalSuiteConfig,
-            @NonNull EvalSuiteEvaluatorMapper evaluatorMapper) {
+            @NonNull EvalSuiteEvaluatorMapper evaluatorMapper,
+            @NonNull LlmProviderApiKeyService llmProviderApiKeyService) {
         this.datasetItemService = datasetItemService;
         this.datasetVersionService = datasetVersionService;
         this.onlineScorePublisher = onlineScorePublisher;
         this.idGenerator = idGenerator;
         this.evalSuiteConfig = evalSuiteConfig;
         this.evaluatorMapper = evaluatorMapper;
+        this.llmProviderApiKeyService = llmProviderApiKeyService;
     }
 
     @Subscribe
@@ -83,13 +92,26 @@ public void onTracesCreated(TracesCreated tracesBatch) {
             return;
         }
 
-        var reactiveContext = reactor.util.context.Context.of(
+        var reactiveContext = Context.of(
                 RequestContext.WORKSPACE_ID, tracesBatch.workspaceId(),
                 RequestContext.USER_NAME, tracesBatch.userName(),
                 RequestContext.VISIBILITY, com.comet.opik.api.Visibility.PRIVATE);
 
         Duration fetchTimeout = Duration.ofSeconds(evalSuiteConfig.getFetchTimeoutSeconds());
 
+        // Resolve model once per batch: prefer connected provider, fall back to first trace's model
+        var connectedProviders = getConnectedProviders(tracesBatch.workspaceId());
+        String modelName = SupportedJudgeProvider.resolveModel(connectedProviders)
+                .or(() -> getMetadataString(completeTraces.getFirst(), "eval_suite_model"))
+                .orElse(null);
+
+        if (modelName == null) {
+            log.warn("No LLM model resolved for eval suite batch in workspace '{}' — "
+                    + "no supported provider connected and no eval_suite_model in trace metadata",
+                    tracesBatch.workspaceId());
+            return;
+        }
+
         // Cache dataset evaluators by (datasetId:versionHash) to avoid redundant fetches
         Map<String, List<PreparedEvaluator>> datasetEvaluatorsCache = new HashMap<>();
 
@@ -120,7 +142,7 @@ public void onTracesCreated(TracesCreated tracesBatch) {
                         .contextWrite(reactiveContext)
                         .timeout(fetchTimeout)
                         .block();
-                return evaluatorMapper.prepareEvaluators(result.evaluators());
+                return evaluatorMapper.prepareEvaluators(result.evaluators(), modelName);
             });
 
             var datasetItemId = getMetadataString(trace, "eval_suite_dataset_item_id");
@@ -134,7 +156,8 @@ public void onTracesCreated(TracesCreated tracesBatch) {
                     .flatMap(itemId -> {
                         List<PreparedEvaluator> allEvaluators = new ArrayList<>(
                                 preparedDatasetEvaluators);
-                        allEvaluators.addAll(fetchItemEvaluators(itemId, reactiveContext));
+                        allEvaluators.addAll(fetchItemEvaluators(itemId, reactiveContext,
+                                modelName));
 
                         if (allEvaluators.isEmpty()) {
                             log.debug("No evaluators found for trace '{}', dataset item '{}'",
@@ -191,7 +214,8 @@ private Mono<DatasetEvaluatorsResult> fetchDatasetEvaluators(UUID datasetId, Str
     }
 
     private List<PreparedEvaluator> fetchItemEvaluators(
-            UUID itemId, reactor.util.context.Context reactiveContext) {
+            UUID itemId, Context reactiveContext,
+            String modelName) {
         try {
             var item = datasetItemService.get(itemId)
                     .contextWrite(reactiveContext)
@@ -202,7 +226,7 @@ private List<PreparedEvaluator> fetchItemEvaluators(
                 return List.of();
             }
 
-            return evaluatorMapper.prepareEvaluators(item.evaluators());
+            return evaluatorMapper.prepareEvaluators(item.evaluators(), modelName);
         } catch (Exception e) {
            log.error("Failed to fetch evaluators for item '{}'", itemId, e);
            return List.of();
@@ -230,4 +254,16 @@ private Optional<UUID> parseUUID(String id, UUID traceId) {
         }
     }
 
+    private Set<LlmProvider> getConnectedProviders(String workspaceId) {
+        try {
+            return llmProviderApiKeyService.find(workspaceId)
+                    .content().stream()
+                    .map(ProviderApiKey::provider)
+                    .collect(Collectors.toSet());
+        } catch (Exception e) {
+            log.error("Failed to fetch connected providers for workspace '{}'", workspaceId, e);
+            return Set.of();
+        }
+    }
+
 }
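
Note: the hunks above call a pre-existing getMetadataString(...) helper that is not part of this diff. As an assumption for readers, it presumably pulls a text field out of the trace's JSON metadata and returns it as an Optional, roughly along these lines (hypothetical sketch, not the actual implementation; it assumes Trace.metadata() exposes a Jackson JsonNode):

    // Hypothetical sketch of the unshown helper used by the metadata fallback above.
    private Optional<String> getMetadataString(Trace trace, String key) {
        return Optional.ofNullable(trace.metadata())      // assumed: metadata is a Jackson JsonNode
                .map(metadata -> metadata.get(key))       // absent key -> empty Optional
                .filter(node -> !node.isNull())
                .map(JsonNode::asText)
                .filter(value -> !value.isBlank());
    }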

apps/opik-backend/src/main/java/com/comet/opik/api/resources/v1/events/EvalSuiteEvaluatorMapper.java

Lines changed: 14 additions & 22 deletions
@@ -15,13 +15,11 @@
 import jakarta.inject.Singleton;
 import lombok.NonNull;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.lang3.StringUtils;
 
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -54,7 +52,8 @@ public int getEffectiveRunsPerItem(ExecutionPolicy itemPolicy, ExecutionPolicy v
         return evalSuiteConfig.getDefaultRunsPerItem();
     }
 
-    public List<PreparedEvaluator> prepareEvaluators(List<EvaluatorItem> evaluators) {
+    public List<PreparedEvaluator> prepareEvaluators(List<EvaluatorItem> evaluators,
+            String modelName) {
         return evaluators.stream()
                 .filter(evaluator -> {
                     if (evaluator.type() != EvaluatorType.LLM_JUDGE) {
@@ -66,7 +65,7 @@ public List<PreparedEvaluator> prepareEvaluators(List<EvaluatorItem> evaluators)
                 })
                 .flatMap(evaluator -> {
                     try {
-                        LlmAsJudgeCode code = toScoringCode(evaluator.config());
+                        LlmAsJudgeCode code = toScoringCode(evaluator.config(), modelName);
 
                         Map<String, String> scoreNameMapping = code.schema() != null
                                 ? code.schema().stream()
@@ -84,16 +83,22 @@ public List<PreparedEvaluator> prepareEvaluators(List<EvaluatorItem> evaluators)
                 .toList();
     }
 
-    LlmAsJudgeCode toScoringCode(JsonNode config) {
-        LlmAsJudgeCode code = deserializeEvaluatorConfig(config);
-        code = resolveModelName(code);
+    LlmAsJudgeCode toScoringCode(JsonNode config, String modelName) {
+        LlmAsJudgeCode code = deserializeScoringCode(config, modelName);
         code = renameSchemaToAssertionKeys(code);
         code = applyEvalSuitePrompt(code);
         return code;
     }
 
-    private LlmAsJudgeCode deserializeEvaluatorConfig(JsonNode config) {
-        return JsonUtils.treeToValue(config, LlmAsJudgeCode.class);
+    private LlmAsJudgeCode deserializeScoringCode(JsonNode config, String modelName) {
+        var code = JsonUtils.treeToValue(config, LlmAsJudgeCode.class);
+        var existingModel = code.model();
+        var model = (existingModel != null ? existingModel.toBuilder() : LlmAsJudgeModelParameters.builder())
+                .name(modelName)
+                .build();
+        return code.toBuilder()
+                .model(model)
+                .build();
     }
 
     /**
@@ -160,17 +165,4 @@ private String formatAssertions(List<LlmAsJudgeOutputSchema> schema) {
                 .collect(Collectors.joining("\n"));
     }
 
-    private LlmAsJudgeCode resolveModelName(LlmAsJudgeCode code) {
-        var existingModel = Optional.ofNullable(code.model());
-        if (existingModel.map(LlmAsJudgeModelParameters::name).filter(StringUtils::isNotBlank).isEmpty()) {
-            var resolvedModel = LlmAsJudgeModelParameters.builder()
-                    .name(evalSuiteConfig.getDefaultModelName())
-                    .temperature(existingModel.map(LlmAsJudgeModelParameters::temperature).orElse(null))
-                    .seed(existingModel.map(LlmAsJudgeModelParameters::seed).orElse(null))
-                    .customParameters(existingModel.map(LlmAsJudgeModelParameters::customParameters).orElse(null))
-                    .build();
-            return new LlmAsJudgeCode(resolvedModel, code.messages(), code.variables(), code.schema());
-        }
-        return code;
-    }
 }

apps/opik-backend/src/main/java/com/comet/opik/api/resources/v1/events/SupportedJudgeProvider.java

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+package com.comet.opik.api.resources.v1.events;
+
+import com.comet.opik.api.LlmProvider;
+import com.comet.opik.infrastructure.llm.antropic.AnthropicModelName;
+import com.comet.opik.infrastructure.llm.gemini.GeminiModelName;
+import com.comet.opik.infrastructure.llm.openai.OpenaiModelName;
+import com.comet.opik.infrastructure.llm.vertexai.VertexAIModelName;
+
+import java.util.Arrays;
+import java.util.Optional;
+import java.util.Set;
+
+/**
+ * Supported providers for eval suite LLM-as-judge assertions, ordered by priority.
+ * First connected provider wins.
+ */
+enum SupportedJudgeProvider {
+    OPEN_AI(LlmProvider.OPEN_AI, OpenaiModelName.GPT_5_NANO.toString()),
+    ANTHROPIC(LlmProvider.ANTHROPIC, AnthropicModelName.CLAUDE_HAIKU_4_5.toString()),
+    GEMINI(LlmProvider.GEMINI, GeminiModelName.GEMINI_2_0_FLASH.toString()),
+    VERTEX_AI(LlmProvider.VERTEX_AI, VertexAIModelName.GEMINI_2_5_FLASH.qualifiedName());
+
+    private final LlmProvider provider;
+    private final String model;
+
+    SupportedJudgeProvider(LlmProvider provider, String model) {
+        this.provider = provider;
+        this.model = model;
+    }
+
+    /**
+     * Resolves the LLM model for eval suite assertions based on connected providers.
+     * Returns the model for the highest-priority connected provider, or empty if none match.
+     */
+    static Optional<String> resolveModel(Set<LlmProvider> connectedProviders) {
+        return Arrays.stream(values())
+                .filter(judge -> connectedProviders.contains(judge.provider))
+                .findFirst()
+                .map(judge -> judge.model);
+    }
+}

apps/opik-backend/src/main/java/com/comet/opik/domain/ExperimentTracePersistence.java

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ private Mono<Void> createTrace(PersistenceContext ctx, ObjectNode input, ObjectN
             metadata.put("eval_suite_dataset_version_hash", ctx.versionHash());
         }
         metadata.put("eval_suite_dataset_item_id", ctx.datasetItemId().toString());
+        metadata.put("eval_suite_model", ctx.prompt().model());
 
         var traceBuilder = Trace.builder()
                 .id(ctx.traceId())

apps/opik-backend/src/main/java/com/comet/opik/infrastructure/EvalSuiteConfig.java

Lines changed: 0 additions & 4 deletions
@@ -3,15 +3,11 @@
 import com.fasterxml.jackson.annotation.JsonProperty;
 import jakarta.validation.Valid;
 import jakarta.validation.constraints.Min;
-import jakarta.validation.constraints.NotBlank;
 import lombok.Data;
 
 @Data
 public class EvalSuiteConfig {
 
-    @Valid @NotBlank @JsonProperty
-    private String defaultModelName = "gpt-5-nano";
-
     @Valid @Min(1) @JsonProperty
     private int defaultRunsPerItem = 1;
 
0 commit comments
