Skip to content

Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs#20197

Merged
chenmoneygithub merged 11 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-3
Jan 27, 2026
Merged

Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs#20197
chenmoneygithub merged 11 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-3

Conversation

@chenmoneygithub
Copy link
Contributor

@chenmoneygithub chenmoneygithub commented Jan 21, 2026

Related Issues/PRs

#xxx

What changes are proposed in this pull request?

Base off #20115, will send out for review after #20115 is merged.

This PR adds three new backend APIs for the prompt optimization feature:

  • getPromptOptimizationJob: Retrieve details of a single optimization job
  • searchPromptOptimizationJobs: List all optimization jobs for an experiment
  • deletePromptOptimizationJob: Delete an optimization job and its associated run

To test out the PR, you can use the following script, which has some incremental changes based on #20115:

import json

import requests

# Configuration
MLFLOW_SERVER_URL = "http://127.0.0.1:5000"
API_VERSION = 3  # API version from proto (prompt optimization APIs are v3)


def get_optimization_job(job_id: str) -> dict | None:
    """Fetch a single prompt optimization job from the MLflow server.

    Args:
        job_id: Identifier of the optimization job to fetch.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}"

    print(f"Getting job status for: {job_id}")

    response = requests.get(url)

    # Treat anything other than 200 as a failure; print diagnostics and bail.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def search_optimization_jobs(experiment_id: str | None = None) -> dict | None:
    """Search prompt optimization jobs, optionally scoped to one experiment.

    Args:
        experiment_id: If given, restricts the search to this experiment;
            when None the server returns jobs across all experiments.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/search"

    # Only include the filter when the caller actually provided one.
    payload = {}
    if experiment_id:
        payload["experiment_id"] = experiment_id

    print("Searching optimization jobs...")

    response = requests.post(url, json=payload)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def delete_optimization_job(job_id: str) -> dict | None:
    """Delete a prompt optimization job (and its associated run) by ID.

    Args:
        job_id: Identifier of the optimization job to delete.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}"

    print(f"Deleting job: {job_id}")

    response = requests.delete(url)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def create_optimization_job(
    experiment_id: str,
    prompt_uri: str,
    optimizer_type: str = "gepa",
    scorers: list[str] | None = None,
    dataset_id: str | None = None,
    optimizer_config: dict | None = None,
    tags: list[dict] | None = None,
) -> dict | None:
    """Create a prompt optimization job via the MLflow REST API.

    Args:
        experiment_id: Experiment the job belongs to.
        prompt_uri: URI of the source prompt to optimize (e.g. "prompts:/name/1").
        optimizer_type: "gepa" or "metaprompt" (case-insensitive). Unknown
            values fall back to 0 (the proto's unspecified enum value).
        scorers: Scorer names to evaluate with; empty list when None.
        dataset_id: Optional dataset ID (optional for zero-shot metaprompting).
        optimizer_config: Optional optimizer-specific settings, serialized to
            JSON in the request payload.
        tags: Optional list of {"key": ..., "value": ...} tag dicts.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs"

    # Convert string optimizer_type to proto enum value.
    # NOTE: unrecognized names silently map to 0 (unspecified) rather than
    # raising, so the server is the one to reject invalid types.
    optimizer_type_to_enum = {
        "gepa": 1,  # OPTIMIZER_TYPE_GEPA
        "metaprompt": 2,  # OPTIMIZER_TYPE_METAPROMPT
    }
    optimizer_type_enum = optimizer_type_to_enum.get(optimizer_type.lower(), 0)

    # Build config - source_prompt_uri is now a top-level field, not in config
    config = {
        "optimizer_type": optimizer_type_enum,
        "scorers": scorers or [],  # Empty list if None
    }

    # Add dataset_id if provided (optional for zero-shot metaprompting)
    if dataset_id:
        config["dataset_id"] = dataset_id

    # Add optimizer_config_json if provided
    if optimizer_config:
        config["optimizer_config_json"] = json.dumps(optimizer_config)

    payload = {
        "experiment_id": experiment_id,
        "source_prompt_uri": prompt_uri,  # Top-level field, not in config
        "config": config,
        "tags": tags or [],
    }

    print("Creating optimization job...")
    print(f"URL: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    response = requests.post(url, json=payload)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def cancel_optimization_job(job_id: str) -> dict | None:
    """Request cancellation of a running prompt optimization job.

    Args:
        job_id: Identifier of the optimization job to cancel.

    Returns:
        The parsed JSON response body on success, or None if the server
        responded with a non-200 status code.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}/cancel"

    print(f"Cancelling job: {job_id}")

    response = requests.post(url)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def main():
    """Manual end-to-end exercise of the prompt optimization job APIs.

    Creates a job against a local MLflow server, then runs get, search,
    cancel, and delete against it. Requires a server at 127.0.0.1:5000
    with the referenced dataset and prompt already registered.
    """
    import mlflow

    dataset_id = "d-ecaeda79a412460bb6f0560af6bb7321"
    prompt_uri = "prompts:/aime_solver/1"

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("optimization_backend")

    # Scorers - use built-in scorer names (e.g., "Correctness", "Safety")
    # or registered scorer names from the experiment
    scorers = ["Correctness"]  # Built-in scorer for checking expected_response

    # Optimizer-specific config
    optimizer_config = {
        "reflection_model": "openai:/gpt-4o",
        # "max_metric_calls": 100,
    }

    experiment = mlflow.get_experiment_by_name("optimization_backend")
    experiment_id = experiment.experiment_id

    result = create_optimization_job(
        experiment_id=experiment_id,
        prompt_uri=prompt_uri,
        dataset_id=dataset_id,
        scorers=scorers,
        optimizer_type="metaprompt",
        optimizer_config=optimizer_config,
        tags=[{"key": "test", "value": "true"}],
    )

    if result and "job" in result:
        job_id = result["job"].get("job_id")
        print(f"\nCreated job with ID: {job_id}")

        # Leftover pdb.set_trace() breakpoint removed: it blocked the script
        # from running unattended.
        get_optimization_job(job_id)
        search_optimization_jobs(experiment_id)
        cancel_optimization_job(job_id)
        delete_optimization_job(job_id)
    else:
        print("Failed to create optimization job")

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()

Also includes:

  • New proto definitions for jobs and prompt optimization messages
  • Handler implementations for all three APIs
  • Unit tests for all new endpoints
  • Fix: Remove invalid target_prompt_uri field reference in handler

How is this PR tested?

  • Existing unit/integration tests
  • New unit/integration tests
  • Manual tests

Does this PR require documentation update?

  • No. You can skip the rest of this section.
  • Yes. I've updated:
    • Examples
    • API references
    • Instructions

Release Notes

Is this a user-facing change?

  • No. You can skip the rest of this section.
  • Yes. Give a description of this change to be included in the release notes for MLflow users.

What component(s), interfaces, languages, and integrations does this PR affect?

Components

  • area/tracking: Tracking Service, tracking client APIs, autologging
  • area/models: MLmodel format, model serialization/deserialization, flavors
  • area/model-registry: Model Registry service, APIs, and the fluent client calls for Model Registry
  • area/scoring: MLflow Model server, model deployment tools, Spark UDFs
  • area/evaluation: MLflow model evaluation features, evaluation metrics, and evaluation workflows
  • area/gateway: MLflow AI Gateway client APIs, server, and third-party integrations
  • area/prompts: MLflow prompt engineering features, prompt templates, and prompt management
  • area/tracing: MLflow Tracing features, tracing APIs, and LLM tracing functionality
  • area/projects: MLproject format, project running backends
  • area/uiux: Front-end, user experience, plotting, JavaScript, JavaScript dev server
  • area/build: Build and test infrastructure for MLflow
  • area/docs: MLflow documentation pages

How should the PR be classified in the release notes? Choose one:

  • rn/none - No description will be included. The PR will be mentioned only by the PR number in the "Small Bugfixes and Documentation Updates" section
  • rn/breaking-change - The PR will be mentioned in the "Breaking Changes" section
  • rn/feature - A new user-facing feature worth mentioning in the release notes
  • rn/bug-fix - A user-facing bug fix worth mentioning in the release notes
  • rn/documentation - A user-facing documentation change worth mentioning in the release notes

Should this PR be included in the next patch release?

Yes should be selected for bug fixes, documentation updates, and other small changes. No should be selected for new features and larger changes. If you're unsure about the release classification of this PR, leave this unchecked to let the maintainers decide.

What is a minor/patch release?
  • Minor release: a release that increments the second part of the version number (e.g., 1.2.0 -> 1.3.0).
    Bug fixes, doc updates and new features usually go into minor releases.
  • Patch release: a release that increments the third part of the version number (e.g., 1.2.0 -> 1.2.1).
    Bug fixes and doc updates usually go into patch releases.
  • Yes (this PR will be cherry-picked and included in the next patch release)
  • No (this PR will be included in the next minor release)

Copilot AI review requested due to automatic review settings January 21, 2026 21:10
@github-actions
Copy link
Contributor

🛠 DevTools 🛠

Install mlflow from this PR

# mlflow
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20197/merge
# mlflow-skinny
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20197/merge#subdirectory=libs/skinny

For Databricks, use the following command:

%sh curl -LsSf https://raw.githubusercontent.com/mlflow/mlflow/HEAD/dev/install-skinny.sh | sh -s pull/20197/merge

@github-actions github-actions bot added area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. and removed area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. labels Jan 21, 2026
Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR adds backend APIs for prompt optimization job management including Get, Search, Delete, and Cancel operations. It introduces new proto definitions for jobs and prompt optimization, handler implementations for all endpoints, and comprehensive unit tests.

Changes:

  • New proto definitions for JobStatus, JobState, and PromptOptimizationJob messages
  • Handler implementations for get, search, delete, and cancel prompt optimization job endpoints
  • Unit tests covering pending, succeeded, failed, and edge case scenarios
  • Support utilities including job error logging and prompt optimization run tagging

Reviewed changes

Copilot reviewed 16 out of 19 changed files in this pull request and generated 3 comments.

Show a summary per file
File Description
mlflow/protos/jobs.proto Defines generic JobStatus enum and JobState message for all job types
mlflow/protos/prompt_optimization.proto Defines PromptOptimizationJob message and OptimizerType enum
mlflow/protos/service.proto Adds 5 new RPC endpoints for prompt optimization job operations
mlflow/server/handlers.py Implements handlers for get, search, delete, cancel, and create operations
tests/server/test_handlers.py Adds 11 unit tests covering various job scenarios and edge cases
mlflow/utils/mlflow_tags.py Adds MLFLOW_RUN_IS_PROMPT_OPTIMIZATION tag
mlflow/server/jobs/utils.py Adds error logging and disables job re-enqueueing temporarily
mlflow/server/jobs/_job_subproc_entry.py Adds detailed error logging with traceback
mlflow/genai/optimize/util.py Tags optimization runs for UI filtering
dev/generate_protos.py Updates proto generation to include new proto files
Comments suppressed due to low confidence (1)

mlflow/server/jobs/utils.py:567

  • The function has unreachable code after the early return statement. All code from line 545 onwards will never execute. If the TODO is temporary, consider using a feature flag or configuration instead of commenting out the code with an early return. If the code should remain disabled, remove the unreachable code below the return statement.
def _enqueue_unfinished_jobs(server_launching_timestamp: int) -> None:
    """Re-enqueue jobs left PENDING/RUNNING by a previous server process.

    Args:
        server_launching_timestamp: Cutoff timestamp; only jobs created
            before the current server launch are considered.
    """
    # TODO: Job re-enqueueing is temporarily disabled. The current implementation
    # has issues with job state management that can cause duplicate execution.
    # This will be re-enabled once the job persistence layer is stabilized.
    return None
    # NOTE(review): everything below is unreachable while the early return
    # above stands; a feature flag would make the disablement explicit
    # without leaving dead code.
    from mlflow.server.handlers import _get_job_store

    job_store = _get_job_store()

    unfinished_jobs = job_store.list_jobs(
        statuses=[JobStatus.PENDING, JobStatus.RUNNING],
        # filter out jobs created after the server is launched.
        end_timestamp=server_launching_timestamp,
    )

    for job in unfinished_jobs:
        if job.status == JobStatus.RUNNING:
            job_store.reset_job(job.job_id)  # reset the job status to PENDING

        params = json.loads(job.params)
        timeout = job.timeout
        # Look up exclusive flag from function metadata
        fn_fullname = get_job_fn_fullname(job.job_name)
        fn_metadata = _load_function(fn_fullname)._job_fn_metadata
        _get_or_init_huey_instance(job.job_name).submit_task(
            job.job_id, job.job_name, params, timeout, fn_metadata.exclusive
        )


💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

@github-actions
Copy link
Contributor

github-actions bot commented Jan 21, 2026

Documentation preview for eca1a14 is available at:

More info
  • Ignore this comment if this PR does not change the documentation.
  • The preview is updated when a new commit is pushed to this PR.
  • This comment was created by this workflow run.
  • The documentation was built by this workflow run.

chenmoneygithub and others added 2 commits January 22, 2026 22:13
This PR adds three new backend APIs for the prompt optimization feature:
- getPromptOptimizationJob: Retrieve details of a single optimization job
- searchPromptOptimizationJobs: List all optimization jobs for an experiment
- deletePromptOptimizationJob: Delete an optimization job and its associated run

Also includes:
- New proto definitions for jobs and prompt optimization messages
- Handler implementations for all three APIs
- Unit tests for all new endpoints
- Fix: Remove invalid target_prompt_uri field reference in handler

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: chenmoneygithub <chen.qian@databricks.com>
@chenmoneygithub chenmoneygithub changed the title [WIP] Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs Prompt Optimization backend PR 3: Add Get, Search, and Delete prompt optimization job APIs Jan 23, 2026
@github-actions github-actions bot added area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. labels Jan 23, 2026
@TomeHirata TomeHirata self-requested a review January 26, 2026 05:14
@TomeHirata TomeHirata self-assigned this Jan 26, 2026
job_store.delete_jobs(job_ids=[job_id])

# Delete the associated MLflow run if it exists.
# Ignore errors (e.g., run already deleted) to ensure job deletion succeeds.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we return success response even when the deletion actually failed?

Copy link
Contributor Author

@chenmoneygithub chenmoneygithub Jan 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it's a bit confusing - there is a chance that users delete the MLflow run associated with the optimization job from UI/client, then we want the DeletePromptOptimizationJob to just delete the job entity and skip the run. But with a second thought it is better to skip deleting the run if the run doesn't exist.

Changed!

elif metric_name.startswith("final_eval_score."):
scorer_name = metric_name[len("final_eval_score.") :]
optimization_job.final_eval_scores[scorer_name] = metric_value

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Do we have a separate PR for adding APIs for intermediate results?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually a tricky thing. My current plan is not including intermediate results in the response of GetOptimizationJob because it only exists in GEPA optimization workflow, and we want to add some flexibility to cache the earlier fetched intermediate prompts/evaluation results for performance. Here is the workflow in my mind:

  • When users open the prompt optimization detailed view page for a certain optimization job, the UI triggers a GetPromptOptimizationJob and fetch the associated run_id.
  • UI pulls intermediate prompts/evaluation results via the artifact API.
  • UI caches the earlier fetched prompts/evaluation results to avoid duplicated retrieval.

Let me know if this makes sense to you!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UI pulls intermediate prompts/evaluation results via the artifact API

Does this mean you use the artifact API directly instead of adding a separate RPC method for fetching intermediate prompts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I am thinking about letting frontend handle what to pull and what to cache for flexibility.

optimization_job.initial_eval_scores["aggregate"] = metric_value
elif metric_name == "final_eval_score":
optimization_job.final_eval_scores["aggregate"] = metric_value
elif metric_name.startswith("initial_eval_score."):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I commented in another PR, the response of Scorer is not always numerical, we may need to update the type on proto and allow string or other response types supported by Scorer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit complex, let's discuss it - originally I was thinking that we should force only using numeric scorers, since for DSPy optimizer/GEPA optimizer non-numeric scorer is not valid. There is one situation where non-numeric scorer can add value IIUC, which is few-shot metaprompting. But for few-shot metaprompting, the initial score and final score will hit trouble because we don't have a reliable way to aggregate over non-numeric scorers' outputs on the validation dataset. With that, I am thinking that for prompt optimization job, we only allow scorers that output numeric values for simplicity. Otherwise, users may hit this situation:

  1. They quickly tried metaprompting on optimization UI, with some non-numeric scorers selected.
  2. They want to try GEPA, but hit an error that the non-numeric scorer cannot be used.

In this situation we are exposing some unnecessary internal logics to the user.

This may be too verbose, please let me know if this explanation makes sense!

Copy link
Collaborator

@TomeHirata TomeHirata Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should at least support the conversion from YES/NO to 1/0 since all the built-in scorers return "YES" or "NO" (CategoricalRating) right now. This aligns with what we do in the optimize_prompts method (please see the create_metric_from_scorers method). We can reject other types (e.g., dict[str, str], list[str]) in the initial version.

Copy link
Collaborator

@TomeHirata TomeHirata left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM!

@chenmoneygithub chenmoneygithub added this pull request to the merge queue Jan 27, 2026
Merged via the queue into mlflow:master with commit 59b61a7 Jan 27, 2026
55 of 56 checks passed
@chenmoneygithub chenmoneygithub deleted the mlflow-po-backend-pr-3 branch January 27, 2026 08:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants