Skip to content

Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs#20197

Merged
chenmoneygithub merged 11 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-3
Jan 27, 2026
Merged

Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs#20197
chenmoneygithub merged 11 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-3

Conversation

@chenmoneygithub
Copy link
Contributor

@chenmoneygithub chenmoneygithub commented Jan 21, 2026

Related Issues/PRs

#xxx

What changes are proposed in this pull request?

Base off #20115, will send out for review after #20115 is merged.

This PR adds three new backend APIs for the prompt optimization feature:

  • getPromptOptimizationJob: Retrieve details of a single optimization job
  • searchPromptOptimizationJobs: List all optimization jobs for an experiment
  • deletePromptOptimizationJob: Delete an optimization job and its associated run

To test out the PR, you can use the following script, which has some incremental changes based on #20115:

import json

import requests

# Configuration
MLFLOW_SERVER_URL = "http://127.0.0.1:5000"
API_VERSION = 3  # API version from proto (prompt optimization APIs are v3)


def get_optimization_job(job_id: str) -> dict | None:
    """Fetch a single prompt optimization job from the MLflow server.

    Args:
        job_id: Identifier of the optimization job to fetch.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}"

    print(f"Getting job status for: {job_id}")

    response = requests.get(url)

    # Treat anything other than 200 as a failure; print diagnostics and bail.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def search_optimization_jobs(experiment_id: str | None = None) -> dict | None:
    """Search prompt optimization jobs, optionally scoped to one experiment.

    Args:
        experiment_id: If given, restricts the search to this experiment;
            when None the server returns jobs across all experiments.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/search"

    # Only include the filter when the caller actually provided one.
    payload = {}
    if experiment_id:
        payload["experiment_id"] = experiment_id

    print("Searching optimization jobs...")

    response = requests.post(url, json=payload)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def delete_optimization_job(job_id: str) -> dict | None:
    """Delete a prompt optimization job (and its associated run) by ID.

    Args:
        job_id: Identifier of the optimization job to delete.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}"

    print(f"Deleting job: {job_id}")

    response = requests.delete(url)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def create_optimization_job(
    experiment_id: str,
    prompt_uri: str,
    optimizer_type: str = "gepa",
    scorers: list[str] | None = None,
    dataset_id: str | None = None,
    optimizer_config: dict | None = None,
    tags: list[dict] | None = None,
) -> dict | None:
    """Create a prompt optimization job via the MLflow REST API.

    Args:
        experiment_id: Experiment the job belongs to.
        prompt_uri: URI of the source prompt to optimize (e.g. "prompts:/name/1").
        optimizer_type: "gepa" or "metaprompt" (case-insensitive). Unknown
            values fall back to 0 (the proto's unspecified enum value).
        scorers: Scorer names to evaluate with; empty list when None.
        dataset_id: Optional dataset ID (optional for zero-shot metaprompting).
        optimizer_config: Optional optimizer-specific settings, serialized to
            JSON in the request payload.
        tags: Optional list of {"key": ..., "value": ...} tag dicts.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs"

    # Convert string optimizer_type to proto enum value.
    # NOTE: unrecognized names silently map to 0 (unspecified) rather than
    # raising, so the server is the one to reject invalid types.
    optimizer_type_to_enum = {
        "gepa": 1,  # OPTIMIZER_TYPE_GEPA
        "metaprompt": 2,  # OPTIMIZER_TYPE_METAPROMPT
    }
    optimizer_type_enum = optimizer_type_to_enum.get(optimizer_type.lower(), 0)

    # Build config - source_prompt_uri is now a top-level field, not in config
    config = {
        "optimizer_type": optimizer_type_enum,
        "scorers": scorers or [],  # Empty list if None
    }

    # Add dataset_id if provided (optional for zero-shot metaprompting)
    if dataset_id:
        config["dataset_id"] = dataset_id

    # Add optimizer_config_json if provided
    if optimizer_config:
        config["optimizer_config_json"] = json.dumps(optimizer_config)

    payload = {
        "experiment_id": experiment_id,
        "source_prompt_uri": prompt_uri,  # Top-level field, not in config
        "config": config,
        "tags": tags or [],
    }

    print("Creating optimization job...")
    print(f"URL: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    response = requests.post(url, json=payload)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def cancel_optimization_job(job_id: str) -> dict | None:
    """Request cancellation of a running prompt optimization job.

    Args:
        job_id: Identifier of the optimization job to cancel.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}/cancel"

    print(f"Cancelling job: {job_id}")

    response = requests.post(url)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def main():
    """Manual end-to-end exercise of the prompt optimization job APIs.

    Creates a job against a local MLflow server, then runs get, search,
    cancel, and delete against it. Requires a server at 127.0.0.1:5000
    with the referenced dataset and prompt already registered.
    """
    import mlflow

    dataset_id = "d-ecaeda79a412460bb6f0560af6bb7321"
    prompt_uri = "prompts:/aime_solver/1"

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("optimization_backend")

    # Scorers - use built-in scorer names (e.g., "Correctness", "Safety")
    # or registered scorer names from the experiment
    scorers = ["Correctness"]  # Built-in scorer for checking expected_response

    # Optimizer-specific config
    optimizer_config = {
        "reflection_model": "openai:/gpt-4o",
        # "max_metric_calls": 100,
    }

    experiment = mlflow.get_experiment_by_name("optimization_backend")
    experiment_id = experiment.experiment_id

    result = create_optimization_job(
        experiment_id=experiment_id,
        prompt_uri=prompt_uri,
        dataset_id=dataset_id,
        scorers=scorers,
        optimizer_type="metaprompt",
        optimizer_config=optimizer_config,
        tags=[{"key": "test", "value": "true"}],
    )

    if result and "job" in result:
        job_id = result["job"].get("job_id")
        print(f"\nCreated job with ID: {job_id}")

        # Leftover pdb.set_trace() breakpoint removed: it blocked the script
        # from running unattended.
        get_optimization_job(job_id)
        search_optimization_jobs(experiment_id)
        cancel_optimization_job(job_id)
        delete_optimization_job(job_id)
    else:
        print("Failed to create optimization job")

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()

Also includes:

  • New proto definitions for jobs and prompt optimization messages
  • Handler implementations for all three APIs
  • Unit tests for all new endpoints
  • Fix: Remove invalid target_prompt_uri field reference in handler

How is this PR tested?

  • Existing unit/integration tests
  • New unit/integration tests
  • Manual tests

Does this PR require documentation update?

  • No. You can skip the rest of this section.
  • Yes. I've updated:
    • Examples
    • API references
    • Instructions

Release Notes

Is this a user-facing change?

  • No. You can skip the rest of this section.
  • Yes. Give a description of this change to be included in the release notes for MLflow users.

What component(s), interfaces, languages, and integrations does this PR affect?

Components

  • area/tracking: Tracking Service, tracking client APIs, autologging
  • area/models: MLmodel format, model serialization/deserialization, flavors
  • area/model-registry: Model Registry service, APIs, and the fluent client calls for Model Registry
  • area/scoring: MLflow Model server, model deployment tools, Spark UDFs
  • area/evaluation: MLflow model evaluation features, evaluation metrics, and evaluation workflows
  • area/gateway: MLflow AI Gateway client APIs, server, and third-party integrations
  • area/prompts: MLflow prompt engineering features, prompt templates, and prompt management
  • area/tracing: MLflow Tracing features, tracing APIs, and LLM tracing functionality
  • area/projects: MLproject format, project running backends
  • area/uiux: Front-end, user experience, plotting, JavaScript, JavaScript dev server
  • area/build: Build and test infrastructure for MLflow
  • area/docs: MLflow documentation pages

How should the PR be classified in the release notes? Choose one:

  • rn/none - No description will be included. The PR will be mentioned only by the PR number in the "Small Bugfixes and Documentation Updates" section
  • rn/breaking-change - The PR will be mentioned in the "Breaking Changes" section
  • rn/feature - A new user-facing feature worth mentioning in the release notes
  • rn/bug-fix - A user-facing bug fix worth mentioning in the release notes
  • rn/documentation - A user-facing documentation change worth mentioning in the release notes

Should this PR be included in the next patch release?

Yes should be selected for bug fixes, documentation updates, and other small changes. No should be selected for new features and larger changes. If you're unsure about the release classification of this PR, leave this unchecked to let the maintainers decide.

What is a minor/patch release?
  • Minor release: a release that increments the second part of the version number (e.g., 1.2.0 -> 1.3.0).
    Bug fixes, doc updates and new features usually go into minor releases.
  • Patch release: a release that increments the third part of the version number (e.g., 1.2.0 -> 1.2.1).
    Bug fixes and doc updates usually go into patch releases.
  • Yes (this PR will be cherry-picked and included in the next patch release)
  • No (this PR will be included in the next minor release)

Copilot AI review requested due to automatic review settings January 21, 2026 21:10
@github-actions
Copy link
Contributor

🛠 DevTools 🛠

Install mlflow from this PR

# mlflow
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20197/merge
# mlflow-skinny
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20197/merge#subdirectory=libs/skinny

For Databricks, use the following command:

%sh curl -LsSf https://raw.githubusercontent.com/mlflow/mlflow/HEAD/dev/install-skinny.sh | sh -s pull/20197/merge

@github-actions github-actions bot added area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. and removed area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. labels Jan 21, 2026
Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR adds backend APIs for prompt optimization job management including Get, Search, Delete, and Cancel operations. It introduces new proto definitions for jobs and prompt optimization, handler implementations for all endpoints, and comprehensive unit tests.

Changes:

  • New proto definitions for JobStatus, JobState, and PromptOptimizationJob messages
  • Handler implementations for get, search, delete, and cancel prompt optimization job endpoints
  • Unit tests covering pending, succeeded, failed, and edge case scenarios
  • Support utilities including job error logging and prompt optimization run tagging

Reviewed changes

Copilot reviewed 16 out of 19 changed files in this pull request and generated 3 comments.

Show a summary per file
File Description
mlflow/protos/jobs.proto Defines generic JobStatus enum and JobState message for all job types
mlflow/protos/prompt_optimization.proto Defines PromptOptimizationJob message and OptimizerType enum
mlflow/protos/service.proto Adds 5 new RPC endpoints for prompt optimization job operations
mlflow/server/handlers.py Implements handlers for get, search, delete, cancel, and create operations
tests/server/test_handlers.py Adds 11 unit tests covering various job scenarios and edge cases
mlflow/utils/mlflow_tags.py Adds MLFLOW_RUN_IS_PROMPT_OPTIMIZATION tag
mlflow/server/jobs/utils.py Adds error logging and disables job re-enqueueing temporarily
mlflow/server/jobs/_job_subproc_entry.py Adds detailed error logging with traceback
mlflow/genai/optimize/util.py Tags optimization runs for UI filtering
dev/generate_protos.py Updates proto generation to include new proto files
Comments suppressed due to low confidence (1)

mlflow/server/jobs/utils.py:567

  • The function has unreachable code after the early return statement. All code from line 545 onwards will never execute. If the TODO is temporary, consider using a feature flag or configuration instead of commenting out the code with an early return. If the code should remain disabled, remove the unreachable code below the return statement.
def _enqueue_unfinished_jobs(server_launching_timestamp: int) -> None:
    """Re-enqueue jobs left PENDING/RUNNING by a previous server process.

    Args:
        server_launching_timestamp: Cutoff timestamp; only jobs created
            before the current server launch are considered.
    """
    # TODO: Job re-enqueueing is temporarily disabled. The current implementation
    # has issues with job state management that can cause duplicate execution.
    # This will be re-enabled once the job persistence layer is stabilized.
    return None
    # NOTE(review): everything below is unreachable while the early return
    # above stands; a feature flag would make the disablement explicit
    # without leaving dead code.
    from mlflow.server.handlers import _get_job_store

    job_store = _get_job_store()

    unfinished_jobs = job_store.list_jobs(
        statuses=[JobStatus.PENDING, JobStatus.RUNNING],
        # filter out jobs created after the server is launched.
        end_timestamp=server_launching_timestamp,
    )

    for job in unfinished_jobs:
        if job.status == JobStatus.RUNNING:
            job_store.reset_job(job.job_id)  # reset the job status to PENDING

        params = json.loads(job.params)
        timeout = job.timeout
        # Look up exclusive flag from function metadata
        fn_fullname = get_job_fn_fullname(job.job_name)
        fn_metadata = _load_function(fn_fullname)._job_fn_metadata
        _get_or_init_huey_instance(job.job_name).submit_task(
            job.job_id, job.job_name, params, timeout, fn_metadata.exclusive
        )


💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

@github-actions
Copy link
Contributor

github-actions bot commented Jan 21, 2026

Documentation preview for eca1a14 is available at:

More info
  • Ignore this comment if this PR does not change the documentation.
  • The preview is updated when a new commit is pushed to this PR.
  • This comment was created by this workflow run.
  • The documentation was built by this workflow run.

chenmoneygithub and others added 2 commits January 22, 2026 22:13
This PR adds three new backend APIs for the prompt optimization feature:
- getPromptOptimizationJob: Retrieve details of a single optimization job
- searchPromptOptimizationJobs: List all optimization jobs for an experiment
- deletePromptOptimizationJob: Delete an optimization job and its associated run

Also includes:
- New proto definitions for jobs and prompt optimization messages
- Handler implementations for all three APIs
- Unit tests for all new endpoints
- Fix: Remove invalid target_prompt_uri field reference in handler

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: chenmoneygithub <chen.qian@databricks.com>
@chenmoneygithub chenmoneygithub changed the title [WIP] Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs Jan 23, 2026
@github-actions github-actions bot added area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. labels Jan 23, 2026
@TomeHirata TomeHirata self-requested a review January 26, 2026 05:14
@TomeHirata TomeHirata self-assigned this Jan 26, 2026
job_store.delete_jobs(job_ids=[job_id])

# Delete the associated MLflow run if it exists.
# Ignore errors (e.g., run already deleted) to ensure job deletion succeeds.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we return success response even when the deletion actually failed?

Copy link
Contributor Author

@chenmoneygithub chenmoneygithub Jan 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it's a bit confusing - there is a chance that users delete the MLflow run associated with the optimization job from UI/client, then we want the DeletePromptOptimizationJob to just delete the job entity and skip the run. But with a second thought it is better to skip deleting the run if the run doesn't exist.

Changed!

elif metric_name.startswith("final_eval_score."):
scorer_name = metric_name[len("final_eval_score.") :]
optimization_job.final_eval_scores[scorer_name] = metric_value

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Do we have a separate PR for adding APIs for intermediate results?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually a tricky thing. My current plan is not including intermediate results in the response of GetOptimizationJob because it only exists in GEPA optimization workflow, and we want to add some flexibility to cache the earlier fetched intermediate prompts/evaluation results for performance. Here is the workflow in my mind:

  • When users open the prompt optimization detailed view page for a certain optimization job, the UI triggers a GetPromptOptimizationJob and fetch the associated run_id.
  • UI pulls intermediate prompts/evaluation results via the artifact API.
  • UI caches the earlier fetched prompts/evaluation results to avoid duplicated retrieval.

Let me know if this makes sense to you!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UI pulls intermediate prompts/evaluation results via the artifact API

Does this mean you use the artifact API directly instead of adding a separate RPC method for fetching intermediate prompts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I am thinking about letting frontend handle what to pull and what to cache for flexibility.

optimization_job.initial_eval_scores["aggregate"] = metric_value
elif metric_name == "final_eval_score":
optimization_job.final_eval_scores["aggregate"] = metric_value
elif metric_name.startswith("initial_eval_score."):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I commented in another PR, the response of Scorer is not always numerical, we may need to update the type on proto and allow string or other response types supported by Scorer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit complex, let's discuss it - originally I was thinking that we should force only using numeric scorers, since for DSPy optimizer/GEPA optimizer non-numeric scorer is not valid. There is one situation where non-numeric scorer can add value IIUC, which is few-shot metaprompting. But for few-shot metaprompting, the initial score and final score will hit trouble because we don't have a reliable way to aggregate over non-numeric scorers' outputs on the validation dataset. With that, I am thinking that for prompt optimization job, we only allow scorers that output numeric values for simplicity. Otherwise, users may hit this situation:

  1. They quickly tried metaprompting on optimization UI, with some non-numeric scorers selected.
  2. They want to try GEPA, but hit an error that the non-numeric scorer cannot be used.

In this situation we are exposing some unnecessary internal logics to the user.

This may be too verbose, please let me know if this explanation makes sense!

Copy link
Collaborator

@TomeHirata TomeHirata Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should at least support the conversion from YES/NO to 1/0 since all the built-in scorers return "YES" or "NO" (CategoricalRating) right now. This aligns with what we do in the optimize_prompts method (please see the create_metric_from_scorers method). We can reject other types (e.g., dict[str, str], list[str]) in the initial version.

Copy link
Collaborator

@TomeHirata TomeHirata left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM!

@chenmoneygithub chenmoneygithub added this pull request to the merge queue Jan 27, 2026
Merged via the queue into mlflow:master with commit 59b61a7 Jan 27, 2026
55 of 56 checks passed
@chenmoneygithub chenmoneygithub deleted the mlflow-po-backend-pr-3 branch January 27, 2026 08:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants