Skip to content

Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob#20115

Merged
chenmoneygithub merged 15 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-2
Jan 23, 2026
Merged

Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob#20115
chenmoneygithub merged 15 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-2

Conversation

@chenmoneygithub
Copy link
Contributor

@chenmoneygithub chenmoneygithub commented Jan 19, 2026

Related Issues/PRs

#xxx

What changes are proposed in this pull request?

Add CreatePromptOptimizationJob and CancelPromptOptimizationJob.

For testing purpose, please first clone this PR and spin up the mlflow server:

mlflow server --backend-store-uri sqlite:///mlflow.db
cd mlflow/server/js && yarn install && yarn start

Then create a dataset via the following script, which will output a dataset id (or you can copy it from the MLflow UI):

from datasets import load_dataset


def create_aime_evaluation_dataset(experiment_id: str, max_records: int = 20) -> str:
    """Create an MLflow evaluation dataset from the AIME 1983-2024 HuggingFace dataset.

    Args:
        experiment_id: ID of the MLflow experiment the dataset is created under.
        max_records: Maximum number of records to keep (default 20, matching the
            original hard-coded truncation).

    Returns:
        The ID of the created MLflow evaluation dataset.
    """
    from mlflow.genai.datasets import create_dataset

    mlflow_dataset_name = "aime_1983_2024_tailored"

    print("Loading AIME dataset from HuggingFace...")
    hf_dataset = load_dataset("gneubig/aime-1983-2024", split="train")

    print(f"Loaded {len(hf_dataset)} samples")

    # Transform to MLflow format:
    # - inputs: the question/problem to solve
    # - expectations: the expected answer (ground truth)
    records = [
        {
            "inputs": {"question": item["Question"]},
            "expectations": {"expected_response": item["Answer"]},
        }
        for item in hf_dataset
    ][:max_records]

    print(f"Transformed {len(records)} records to MLflow format")

    # Provenance tags recorded on the dataset.
    dataset_tags = {
        "source": "huggingface",
        "hf_dataset": "gneubig/aime-1983-2024",
        "purpose": "prompt_optimization",
    }

    print(f"Creating MLflow EvaluationDataset '{mlflow_dataset_name}'...")
    dataset = create_dataset(
        name=mlflow_dataset_name,
        experiment_id=experiment_id,
        tags=dataset_tags,
    )

    print(f"Created dataset with ID: {dataset.dataset_id}")

    # Add the transformed records to the freshly created dataset.
    print(f"Merging {len(records)} records into dataset...")
    dataset.merge_records(records)

    print("Dataset created successfully!")
    print(f"  - Name: {dataset.name}")
    print(f"  - ID: {dataset.dataset_id}")
    print(f"  - Records: {len(records)}")

    return dataset.dataset_id


def main():
    """Entry point: create the AIME evaluation dataset against a local MLflow server."""
    import mlflow

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    experiment = mlflow.set_experiment("optimization_backend")

    created_dataset_id = create_aime_evaluation_dataset(experiment.experiment_id)
    print(f"Dataset created with ID: {created_dataset_id}")


if __name__ == "__main__":
    main()

Then copy the dataset id into the script below:

import json
import time

import requests

# Configuration
MLFLOW_SERVER_URL = "http://127.0.0.1:5000"  # local MLflow tracking server started above
API_VERSION = 3  # API version from proto (prompt optimization APIs are v3)


def create_optimization_job(
    experiment_id: str,
    prompt_uri: str,
    optimizer_type: str = "gepa",
    scorers: list[str] | None = None,
    dataset_id: str | None = None,
    optimizer_config: dict | None = None,
    tags: list[dict] | None = None,
) -> dict | None:
    """Create a prompt optimization job via the MLflow REST API.

    Args:
        experiment_id: ID of the experiment the job belongs to.
        prompt_uri: URI of the prompt to optimize (e.g. "prompts:/my-prompt/1").
        optimizer_type: "gepa" or "metaprompt"; any other value maps to the
            UNSPECIFIED enum (0).
        scorers: Built-in scorer class names (e.g. "Correctness") or
            registered scorer names; sent as an empty list when None.
        dataset_id: Optional dataset ID; omit for zero-shot metaprompting.
        optimizer_config: Optional optimizer-specific options, serialized
            to JSON before sending.
        tags: Optional list of {"key": ..., "value": ...} tag dicts.

    Returns:
        The parsed JSON response on success, or None on a non-200 response.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs"

    # Convert string optimizer_type to proto enum value.
    optimizer_type_to_enum = {
        "gepa": 1,  # OPTIMIZER_TYPE_GEPA
        "metaprompt": 2,  # OPTIMIZER_TYPE_METAPROMPT
    }
    optimizer_type_enum = optimizer_type_to_enum.get(optimizer_type.lower(), 0)

    # Build config - dataset_id and scorers are separate fields.
    config = {
        "target_prompt_uri": prompt_uri,
        "optimizer_type": optimizer_type_enum,
        "scorers": scorers or [],  # Empty list if None
    }

    # Add dataset_id if provided (optional for zero-shot metaprompting).
    if dataset_id:
        config["dataset_id"] = dataset_id

    # Optimizer-specific options travel as a JSON string field.
    if optimizer_config:
        config["optimizer_config_json"] = json.dumps(optimizer_config)

    payload = {
        "experiment_id": experiment_id,
        "config": config,
        "tags": tags or [],
    }

    print("Creating optimization job...")
    print(f"URL: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    # Timeout prevents the script from hanging forever if the server is down.
    response = requests.post(url, json=payload, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def cancel_optimization_job(job_id: str) -> dict | None:
    """Cancel a prompt optimization job via the MLflow REST API.

    Args:
        job_id: ID of the job to cancel (as returned by job creation).

    Returns:
        The parsed JSON response on success, or None on a non-200 response.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}/cancel"

    print(f"Cancelling job: {job_id}")

    # Timeout prevents the script from hanging forever if the server is down.
    response = requests.post(url, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def main():
    """Manual end-to-end check: create a job, pause at a breakpoint, then cancel it."""
    import mlflow

    # Example IDs to plug in when exercising the few-shot (dataset-backed) path.
    sample_dataset_id = "d-ecaeda79a412460bb6f0560af6bb7321"
    target_prompt_uri = "prompts:/aime_solver/1"

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("optimization_backend")

    # Scorers - built-in scorer names (e.g., "Correctness", "Safety")
    # or registered scorer names from the experiment.
    builtin_scorers = ["Correctness"]  # checks output against expected_response

    # Optimizer-specific configuration, forwarded to the server as JSON.
    reflection_config = {
        "reflection_model": "openai:/gpt-4o",
        # "max_metric_calls": 100,
    }

    experiment = mlflow.get_experiment_by_name("optimization_backend")

    creation_result = create_optimization_job(
        experiment_id=experiment.experiment_id,
        prompt_uri=target_prompt_uri,
        dataset_id=None,
        scorers=[],
        optimizer_type="metaprompt",
        optimizer_config=reflection_config,
        tags=[{"key": "test", "value": "true"}],
    )

    if not (creation_result and "job" in creation_result):
        print("Failed to create optimization job")
    else:
        job_id = creation_result["job"].get("job_id")
        print(f"\nCreated job with ID: {job_id}")

        # Deliberate breakpoint: gives the job time to start before cancelling.
        import pdb

        pdb.set_trace()

        cancel_optimization_job(job_id)

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()

Feel free to change the optimizer type to play with GEPA and Metaprompting optimizer. I put a breakpoint in the second script so that you can wait for a while before trying the job cancellation.

How is this PR tested?

  • Existing unit/integration tests
  • New unit/integration tests
  • Manual tests

Does this PR require documentation update?

  • No. You can skip the rest of this section.
  • Yes. I've updated:
    • Examples
    • API references
    • Instructions

Release Notes

Is this a user-facing change?

  • No. You can skip the rest of this section.
  • Yes. Give a description of this change to be included in the release notes for MLflow users.

What component(s), interfaces, languages, and integrations does this PR affect?

Components

  • area/tracking: Tracking Service, tracking client APIs, autologging
  • area/models: MLmodel format, model serialization/deserialization, flavors
  • area/model-registry: Model Registry service, APIs, and the fluent client calls for Model Registry
  • area/scoring: MLflow Model server, model deployment tools, Spark UDFs
  • area/evaluation: MLflow model evaluation features, evaluation metrics, and evaluation workflows
  • area/gateway: MLflow AI Gateway client APIs, server, and third-party integrations
  • area/prompts: MLflow prompt engineering features, prompt templates, and prompt management
  • area/tracing: MLflow Tracing features, tracing APIs, and LLM tracing functionality
  • area/projects: MLproject format, project running backends
  • area/uiux: Front-end, user experience, plotting, JavaScript, JavaScript dev server
  • area/build: Build and test infrastructure for MLflow
  • area/docs: MLflow documentation pages

How should the PR be classified in the release notes? Choose one:

  • rn/none - No description will be included. The PR will be mentioned only by the PR number in the "Small Bugfixes and Documentation Updates" section
  • rn/breaking-change - The PR will be mentioned in the "Breaking Changes" section
  • rn/feature - A new user-facing feature worth mentioning in the release notes
  • rn/bug-fix - A user-facing bug fix worth mentioning in the release notes
  • rn/documentation - A user-facing documentation change worth mentioning in the release notes

Should this PR be included in the next patch release?

Yes should be selected for bug fixes, documentation updates, and other small changes. No should be selected for new features and larger changes. If you're unsure about the release classification of this PR, leave this unchecked to let the maintainers decide.

What is a minor/patch release?
  • Minor release: a release that increments the second part of the version number (e.g., 1.2.0 -> 1.3.0).
    Bug fixes, doc updates and new features usually go into minor releases.
  • Patch release: a release that increments the third part of the version number (e.g., 1.2.0 -> 1.2.1).
    Bug fixes and doc updates usually go into patch releases.
  • Yes (this PR will be cherry-picked and included in the next patch release)
  • No (this PR will be included in the next minor release)

Copilot AI review requested due to automatic review settings January 19, 2026 08:14
@github-actions
Copy link
Contributor

🛠 DevTools 🛠

Install mlflow from this PR

# mlflow
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20115/merge
# mlflow-skinny
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20115/merge#subdirectory=libs/skinny

For Databricks, use the following command:

%sh curl -LsSf https://raw.githubusercontent.com/mlflow/mlflow/HEAD/dev/install-skinny.sh | sh -s pull/20115/merge

@github-actions
Copy link
Contributor

@chenmoneygithub Thank you for the contribution! Could you fix the following issue(s)?

⚠ DCO check

The DCO check failed. Please sign off your commit(s) by following the instructions here. See https://github.com/mlflow/mlflow/blob/master/CONTRIBUTING.md#sign-your-work for more details.

@github-actions github-actions bot added area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. labels Jan 19, 2026
Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR adds backend support for prompt optimization jobs, introducing two new API endpoints: CreatePromptOptimizationJob and CancelPromptOptimizationJob. The implementation enables asynchronous prompt optimization with support for different optimizer types (GEPA, MetaPrompt) and includes both few-shot and zero-shot optimization modes.

Changes:

  • Added new protobuf definitions for prompt optimization job APIs including JobStatus, OptimizerType, and PromptOptimizationJob message types
  • Implemented server-side handlers for creating and canceling prompt optimization jobs with parameter validation and MLflow run tracking
  • Extended optimization logic to support dataset entities and zero-shot optimization when no training data is provided

Reviewed changes

Copilot reviewed 9 out of 12 changed files in this pull request and generated 4 comments.

Show a summary per file
File Description
mlflow/protos/prompt_optimization.proto New protobuf definitions for job status, optimizer types, and prompt optimization job entities
mlflow/protos/prompt_optimization_pb2.py Generated Python protobuf code for prompt optimization messages
mlflow/protos/prompt_optimization_pb2.pyi Generated Python type stubs for protobuf messages
mlflow/protos/service.proto Added RPC definitions for createPromptOptimizationJob and cancelPromptOptimizationJob endpoints
mlflow/protos/service_pb2.pyi Generated type stubs for new RPC messages
mlflow/server/handlers.py Implemented _create_prompt_optimization_job and _cancel_prompt_optimization_job handlers with validation, run creation, and job submission
mlflow/genai/optimize/optimize.py Added support for converting dataset entities to dataframes for optimization
mlflow/genai/optimize/job.py Updated to support optional dataset_id for zero-shot optimization
tests/server/test_handlers.py Added comprehensive tests for job creation, cancellation, and error cases

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

@github-actions
Copy link
Contributor

github-actions bot commented Jan 19, 2026

Documentation preview for 737555f is available at:

More info
  • Ignore this comment if this PR does not change the documentation.
  • The preview is updated when a new commit is pushed to this PR.
  • This comment was created by this workflow run.
  • The documentation was built by this workflow run.

)
job_result.dump(result_dump_path)
except Exception as e:
_logger.error(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now critical errors are also hidden from the user, which makes it really hard to debug, so I am adding this change. @WeichenXu123 Please let me know if this makes sense.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good!

dataset_id=dataset_id,
scorer_names=scorer_names,
)
return asdict(job_result)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is somehow required by mlflow job

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WeichenXu123 is the requirement that all job result needs to be dict (or json serializable)?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If so, shall update PromptOptimizationJobResult to TypedDict?

Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

Copilot reviewed 15 out of 18 changed files in this pull request and generated no new comments.


💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

@chenmoneygithub chenmoneygithub changed the title [WIP] Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob Jan 20, 2026
optional string experiment_id = 4;

// URI of the source prompt that optimization started from (e.g., "prompts:/my-prompt/1").
optional string source_prompt_uri = 5;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the relationship between source_prompt_uri here and PromptOptimizationJobConfig.target_prompt_uri?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a mistake, target_prompt_uri is meaningless, changed!


// List of scorer names. Can be built-in scorer class names
// (e.g., "Correctness", "Safety") or registered scorer names.
repeated string scorers = 4;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

q: how would we support Guidelines or ExpectationsGuidelines that accepts parameters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes good question, there are two ways we can support it:

  1. Require users to wrap the scorer in a custom scorer and register it in the MLflow experiment, in this approach the scorer will never need an arg.
  2. Change the scorer from string to a dict so that it can take in args.

With a first look that option 2 is more flexible, however this backend API is supposed to only be invoked by MLflow UI, and on MLflow UI it's a bit odd to let users configure the args for their scorer, because optimization -> scorer -> scorer args has many hops, which is not trivial to understand IMO. Please let me know your thoughts!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I think #1 makes sense to start with. When we release this feature, let's make sure this expected flow is clearly documented!

tags=[InputTag(key="mlflow.data.context", value="optimization")],
)
tracking_store.log_inputs(run_id=run_id, datasets=[dataset_input])
except Exception as e:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which method of the try block do we expect to fail?

Copy link
Contributor Author

@chenmoneygithub chenmoneygithub Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mostly the get_genai_dataset, if there is network issue or the id is invalid (less likely if requested from the UI). Happy to delete the try-except as well!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Maybe we should handle differently for different exceptions? If dataset_id is invalid, we should raise an exception immediately, but if it's a temporally network error, we can still accepts the request.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense! And given a second thought, I feel this try block is a bit redundant because it's not common for get_dataset to fail as we specify the dataset_id through UI, and it makes sense to stop the request handling if there is a dataset loading error.


# Create MLflow run upfront so run_id is immediately available
# The job will resume this run when it starts executing
from mlflow.tracking.context.default_context import _get_user
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we move this import to the module level?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, done! there were a few circular import cases, but this one is fine.

@@ -250963,6 +250963,3680 @@ public org.mlflow.api.proto.Service.GetSecretsConfig getDefaultInstanceForType()

}

public interface CreatePromptOptimizationJobOrBuilder extends
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these java proto classes useful ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's auto generated by the proto generation script, I actually have little knowledge on the current status of MLflow java server.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mlflow does not have java server, the java proto classes are for mlflow java client , but the job optimization APIs are only called from UI . so these should be useless.

we might need to clean up these useless java proto classes and update the proto generation script.

this issue does not block the PR merging.

user_id=_get_user(),
start_time=int(time.time() * 1000),
tags=[],
run_name=None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not set a meaningful run name ? like optimize_prompt_xxx ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea, changed!

Raises:
MlflowException: If the proto value is unspecified or unsupported.
"""
if proto_value == OPTIMIZER_TYPE_UNSPECIFIED:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use match statement

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually match statement makes it less clean, since it needs to be:

case x if x == OPTIMIZER_TYPE_GEPA:

Because OPTIMIZER_TYPE_GEPA is a variable not a literal. Here is the claude code judgement:

⏺ I see - when matching against literal values (strings like "gzip", "deflate"), it works
  directly. But for variables/constants, Python's match-case treats bare names as capture
  patterns, not as value comparisons.

  The issue is that OPTIMIZER_TYPE_GEPA is a variable, not a literal. In match-case, when you
   write case OPTIMIZER_TYPE_GEPA:, Python interprets it as "capture the value into a new
  variable named OPTIMIZER_TYPE_GEPA", not "compare against the existing variable
  OPTIMIZER_TYPE_GEPA".

  That's why the if-elif-else was reverted - it's the correct approach for matching against
  variables/constants in Python. The match-case syntax would require using a guard (case x if
   x == OPTIMIZER_TYPE_GEPA:) which is more verbose than the original if-elif.

  So the current if-elif-else code is actually the right choice for this case. The reviewer's
   suggestion to use match doesn't apply well here since we're matching against imported
  constants, not literal values.


// List of scorer names. Can be built-in scorer class names
// (e.g., "Correctness", "Safety") or registered scorer names.
repeated string scorers = 3;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another question: how do distinguish built-in Correctness scorer and a custom scorer with name=Correctness? Maybe we need a flag to differentiate these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. Right now the built-in scorer is chosen over the custom scorer if there is a name conflict. My honest preference is we forbid users register scorers of builtin names, and throw an exception when users try it.

Copy link
Collaborator

@TomeHirata TomeHirata left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM once these comments are addressed!

start_time=start_time,
tags=[],
run_name=f"optimize_prompt_{optimizer_type}_{start_time}",
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not attach the prompt name / version in the run name ? like optimize_prompt_{optimizer_type}_{prompt_name}_{prompt_version}_{start_time}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call, changed!

Copy link
Collaborator

@WeichenXu123 WeichenXu123 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM except one comment about run name.


@catch_mlflow_exception
@_disable_if_artifacts_only
def _create_prompt_optimization_job():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

follow-up task: let's add permission validation rule for new added endpoints in mlflow/server/auth/__init__.py

@chenmoneygithub chenmoneygithub added this pull request to the merge queue Jan 23, 2026
Merged via the queue into mlflow:master with commit e08c91f Jan 23, 2026
59 of 61 checks passed
@chenmoneygithub chenmoneygithub deleted the mlflow-po-backend-pr-2 branch January 23, 2026 03:42
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants