Skip to content

Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob#20115

Merged
chenmoneygithub merged 15 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-2
Jan 23, 2026
Merged

Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob#20115
chenmoneygithub merged 15 commits into mlflow:master from
chenmoneygithub:mlflow-po-backend-pr-2

Conversation

@chenmoneygithub
Copy link
Contributor

@chenmoneygithub chenmoneygithub commented Jan 19, 2026

Related Issues/PRs

#xxx

What changes are proposed in this pull request?

Add CreatePromptOptimizationJob and CancelPromptOptimizationJob.

For testing purpose, please first clone this PR and spin up the mlflow server:

mlflow server --backend-store-uri sqlite:///mlflow.db
cd mlflow/server/js && yarn install && yarn start

Then create a dataset via the following script, which will output a dataset id (or you can copy it from the MLflow UI):

from datasets import load_dataset


def create_aime_evaluation_dataset(experiment_id: str, max_records: int = 20) -> str:
    """Create an MLflow evaluation dataset from the AIME 1983-2024 HuggingFace dataset.

    Args:
        experiment_id: ID of the MLflow experiment the dataset is created under.
        max_records: Maximum number of records to keep (default 20, matching the
            original hard-coded truncation).

    Returns:
        The ID of the created MLflow evaluation dataset.
    """
    from mlflow.genai.datasets import create_dataset

    mlflow_dataset_name = "aime_1983_2024_tailored"

    print("Loading AIME dataset from HuggingFace...")
    hf_dataset = load_dataset("gneubig/aime-1983-2024", split="train")

    print(f"Loaded {len(hf_dataset)} samples")

    # Transform to MLflow format:
    # - inputs: the question/problem to solve
    # - expectations: the expected answer (ground truth)
    records = [
        {
            "inputs": {"question": item["Question"]},
            "expectations": {"expected_response": item["Answer"]},
        }
        for item in hf_dataset
    ][:max_records]

    print(f"Transformed {len(records)} records to MLflow format")

    # Provenance tags recorded on the dataset.
    dataset_tags = {
        "source": "huggingface",
        "hf_dataset": "gneubig/aime-1983-2024",
        "purpose": "prompt_optimization",
    }

    print(f"Creating MLflow EvaluationDataset '{mlflow_dataset_name}'...")
    dataset = create_dataset(
        name=mlflow_dataset_name,
        experiment_id=experiment_id,
        tags=dataset_tags,
    )

    print(f"Created dataset with ID: {dataset.dataset_id}")

    # Add the transformed records to the freshly created dataset.
    print(f"Merging {len(records)} records into dataset...")
    dataset.merge_records(records)

    print("Dataset created successfully!")
    print(f"  - Name: {dataset.name}")
    print(f"  - ID: {dataset.dataset_id}")
    print(f"  - Records: {len(records)}")

    return dataset.dataset_id


def main():
    """Entry point: create the AIME evaluation dataset against a local MLflow server."""
    import mlflow

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    experiment = mlflow.set_experiment("optimization_backend")

    created_dataset_id = create_aime_evaluation_dataset(experiment.experiment_id)
    print(f"Dataset created with ID: {created_dataset_id}")


if __name__ == "__main__":
    main()

Then copy the dataset id into the script below:

import json
import time

import requests

# Configuration
MLFLOW_SERVER_URL = "http://127.0.0.1:5000"  # local MLflow tracking server started above
API_VERSION = 3  # API version from proto (prompt optimization APIs are v3)


def create_optimization_job(
    experiment_id: str,
    prompt_uri: str,
    optimizer_type: str = "gepa",
    scorers: list[str] | None = None,
    dataset_id: str | None = None,
    optimizer_config: dict | None = None,
    tags: list[dict] | None = None,
) -> dict | None:
    """Create a prompt optimization job via the MLflow REST API.

    Args:
        experiment_id: ID of the experiment the job belongs to.
        prompt_uri: URI of the prompt to optimize (e.g. "prompts:/my-prompt/1").
        optimizer_type: "gepa" or "metaprompt"; any other value maps to the
            UNSPECIFIED enum (0).
        scorers: Built-in scorer class names (e.g. "Correctness") or
            registered scorer names; sent as an empty list when None.
        dataset_id: Optional dataset ID; omit for zero-shot metaprompting.
        optimizer_config: Optional optimizer-specific options, serialized
            to JSON before sending.
        tags: Optional list of {"key": ..., "value": ...} tag dicts.

    Returns:
        The parsed JSON response on success, or None on a non-200 response.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs"

    # Convert string optimizer_type to proto enum value.
    optimizer_type_to_enum = {
        "gepa": 1,  # OPTIMIZER_TYPE_GEPA
        "metaprompt": 2,  # OPTIMIZER_TYPE_METAPROMPT
    }
    optimizer_type_enum = optimizer_type_to_enum.get(optimizer_type.lower(), 0)

    # Build config - dataset_id and scorers are separate fields.
    config = {
        "target_prompt_uri": prompt_uri,
        "optimizer_type": optimizer_type_enum,
        "scorers": scorers or [],  # Empty list if None
    }

    # Add dataset_id if provided (optional for zero-shot metaprompting).
    if dataset_id:
        config["dataset_id"] = dataset_id

    # Optimizer-specific options travel as a JSON string field.
    if optimizer_config:
        config["optimizer_config_json"] = json.dumps(optimizer_config)

    payload = {
        "experiment_id": experiment_id,
        "config": config,
        "tags": tags or [],
    }

    print("Creating optimization job...")
    print(f"URL: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    # Timeout prevents the script from hanging forever if the server is down.
    response = requests.post(url, json=payload, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def cancel_optimization_job(job_id: str) -> dict | None:
    """Cancel a prompt optimization job via the MLflow REST API.

    Args:
        job_id: ID of the job to cancel (as returned by job creation).

    Returns:
        The parsed JSON response on success, or None on a non-200 response.
    """
    url = f"{MLFLOW_SERVER_URL}/ajax-api/{API_VERSION}.0/mlflow/prompt-optimization/jobs/{job_id}/cancel"

    print(f"Cancelling job: {job_id}")

    # Timeout prevents the script from hanging forever if the server is down.
    response = requests.post(url, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    print(f"Response: {json.dumps(result, indent=2)}")
    return result


def main():
    """Manual end-to-end check: create a job, pause at a breakpoint, then cancel it."""
    import mlflow

    # Example IDs to plug in when exercising the few-shot (dataset-backed) path.
    sample_dataset_id = "d-ecaeda79a412460bb6f0560af6bb7321"
    target_prompt_uri = "prompts:/aime_solver/1"

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("optimization_backend")

    # Scorers - built-in scorer names (e.g., "Correctness", "Safety")
    # or registered scorer names from the experiment.
    builtin_scorers = ["Correctness"]  # checks output against expected_response

    # Optimizer-specific configuration, forwarded to the server as JSON.
    reflection_config = {
        "reflection_model": "openai:/gpt-4o",
        # "max_metric_calls": 100,
    }

    experiment = mlflow.get_experiment_by_name("optimization_backend")

    creation_result = create_optimization_job(
        experiment_id=experiment.experiment_id,
        prompt_uri=target_prompt_uri,
        dataset_id=None,
        scorers=[],
        optimizer_type="metaprompt",
        optimizer_config=reflection_config,
        tags=[{"key": "test", "value": "true"}],
    )

    if not (creation_result and "job" in creation_result):
        print("Failed to create optimization job")
    else:
        job_id = creation_result["job"].get("job_id")
        print(f"\nCreated job with ID: {job_id}")

        # Deliberate breakpoint: gives the job time to start before cancelling.
        import pdb

        pdb.set_trace()

        cancel_optimization_job(job_id)

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()

Feel free to change the optimizer type to play with GEPA and Metaprompting optimizer. I put a breakpoint in the second script so that you can wait for a while before trying the job cancellation.

How is this PR tested?

  • Existing unit/integration tests
  • New unit/integration tests
  • Manual tests

Does this PR require documentation update?

  • No. You can skip the rest of this section.
  • Yes. I've updated:
    • Examples
    • API references
    • Instructions

Release Notes

Is this a user-facing change?

  • No. You can skip the rest of this section.
  • Yes. Give a description of this change to be included in the release notes for MLflow users.

What component(s), interfaces, languages, and integrations does this PR affect?

Components

  • area/tracking: Tracking Service, tracking client APIs, autologging
  • area/models: MLmodel format, model serialization/deserialization, flavors
  • area/model-registry: Model Registry service, APIs, and the fluent client calls for Model Registry
  • area/scoring: MLflow Model server, model deployment tools, Spark UDFs
  • area/evaluation: MLflow model evaluation features, evaluation metrics, and evaluation workflows
  • area/gateway: MLflow AI Gateway client APIs, server, and third-party integrations
  • area/prompts: MLflow prompt engineering features, prompt templates, and prompt management
  • area/tracing: MLflow Tracing features, tracing APIs, and LLM tracing functionality
  • area/projects: MLproject format, project running backends
  • area/uiux: Front-end, user experience, plotting, JavaScript, JavaScript dev server
  • area/build: Build and test infrastructure for MLflow
  • area/docs: MLflow documentation pages

How should the PR be classified in the release notes? Choose one:

  • rn/none - No description will be included. The PR will be mentioned only by the PR number in the "Small Bugfixes and Documentation Updates" section
  • rn/breaking-change - The PR will be mentioned in the "Breaking Changes" section
  • rn/feature - A new user-facing feature worth mentioning in the release notes
  • rn/bug-fix - A user-facing bug fix worth mentioning in the release notes
  • rn/documentation - A user-facing documentation change worth mentioning in the release notes

Should this PR be included in the next patch release?

Yes should be selected for bug fixes, documentation updates, and other small changes. No should be selected for new features and larger changes. If you're unsure about the release classification of this PR, leave this unchecked to let the maintainers decide.

What is a minor/patch release?
  • Minor release: a release that increments the second part of the version number (e.g., 1.2.0 -> 1.3.0).
    Bug fixes, doc updates and new features usually go into minor releases.
  • Patch release: a release that increments the third part of the version number (e.g., 1.2.0 -> 1.2.1).
    Bug fixes and doc updates usually go into patch releases.
  • Yes (this PR will be cherry-picked and included in the next patch release)
  • No (this PR will be included in the next minor release)

Copilot AI review requested due to automatic review settings January 19, 2026 08:14
@github-actions
Copy link
Contributor

🛠 DevTools 🛠

Install mlflow from this PR

# mlflow
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20115/merge
# mlflow-skinny
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/20115/merge#subdirectory=libs/skinny

For Databricks, use the following command:

%sh curl -LsSf https://raw.githubusercontent.com/mlflow/mlflow/HEAD/dev/install-skinny.sh | sh -s pull/20115/merge

@github-actions
Copy link
Contributor

@chenmoneygithub Thank you for the contribution! Could you fix the following issue(s)?

⚠ DCO check

The DCO check failed. Please sign off your commit(s) by following the instructions here. See https://github.com/mlflow/mlflow/blob/master/CONTRIBUTING.md#sign-your-work for more details.

@github-actions github-actions bot added area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs. labels Jan 19, 2026
Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR adds backend support for prompt optimization jobs, introducing two new API endpoints: CreatePromptOptimizationJob and CancelPromptOptimizationJob. The implementation enables asynchronous prompt optimization with support for different optimizer types (GEPA, MetaPrompt) and includes both few-shot and zero-shot optimization modes.

Changes:

  • Added new protobuf definitions for prompt optimization job APIs including JobStatus, OptimizerType, and PromptOptimizationJob message types
  • Implemented server-side handlers for creating and canceling prompt optimization jobs with parameter validation and MLflow run tracking
  • Extended optimization logic to support dataset entities and zero-shot optimization when no training data is provided

Reviewed changes

Copilot reviewed 9 out of 12 changed files in this pull request and generated 4 comments.

Show a summary per file
File Description
mlflow/protos/prompt_optimization.proto New protobuf definitions for job status, optimizer types, and prompt optimization job entities
mlflow/protos/prompt_optimization_pb2.py Generated Python protobuf code for prompt optimization messages
mlflow/protos/prompt_optimization_pb2.pyi Generated Python type stubs for protobuf messages
mlflow/protos/service.proto Added RPC definitions for createPromptOptimizationJob and cancelPromptOptimizationJob endpoints
mlflow/protos/service_pb2.pyi Generated type stubs for new RPC messages
mlflow/server/handlers.py Implemented _create_prompt_optimization_job and _cancel_prompt_optimization_job handlers with validation, run creation, and job submission
mlflow/genai/optimize/optimize.py Added support for converting dataset entities to dataframes for optimization
mlflow/genai/optimize/job.py Updated to support optional dataset_id for zero-shot optimization
tests/server/test_handlers.py Added comprehensive tests for job creation, cancellation, and error cases

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

@github-actions
Copy link
Contributor

github-actions bot commented Jan 19, 2026

Documentation preview for 737555f is available at:

More info
  • Ignore this comment if this PR does not change the documentation.
  • The preview is updated when a new commit is pushed to this PR.
  • This comment was created by this workflow run.
  • The documentation was built by this workflow run.

)
job_result.dump(result_dump_path)
except Exception as e:
_logger.error(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now critical errors are also hidden from the user, which makes it really hard to debug, so I am adding this change. @WeichenXu123 Please let me know if this makes sense.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good!

dataset_id=dataset_id,
scorer_names=scorer_names,
)
return asdict(job_result)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is somehow required by mlflow job

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WeichenXu123 is the requirement that all job result needs to be dict (or json serializable)?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If so, shall update PromptOptimizationJobResult to TypedDict?

Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

Copilot reviewed 15 out of 18 changed files in this pull request and generated no new comments.


💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

@chenmoneygithub chenmoneygithub changed the title [WIP] Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob Prompt Optimization backend PR 2: Add CreatePromptOptimizationJob and CancelPromptOptimizationJob Jan 20, 2026
optional string experiment_id = 4;

// URI of the source prompt that optimization started from (e.g., "prompts:/my-prompt/1").
optional string source_prompt_uri = 5;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the relationship between source_prompt_uri here and PromptOptimizationJobConfig.target_prompt_uri?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a mistake, target_prompt_uri is meaningless, changed!


// List of scorer names. Can be built-in scorer class names
// (e.g., "Correctness", "Safety") or registered scorer names.
repeated string scorers = 4;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

q: how would we support Guidelines or ExpectationsGuidelines that accepts parameters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes good question, there are two ways we can support it:

  1. Require users to wrap the scorer in a custom scorer and register it in the MLflow experiment, in this approach the scorer will never need an arg.
  2. Change the scorer from string to a dict so that it can take in args.

With a first look that option 2 is more flexible, however this backend API is supposed to only be invoked by MLflow UI, and on MLflow UI it's a bit odd to let users configure the args for their scorer, because optimization -> scorer -> scorer args has many hops, which is not trivial to understand IMO. Please let me know your thoughts!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I think #1 makes sense to start with. When we release this feature, let's make sure this expected flow is clearly documented!

tags=[InputTag(key="mlflow.data.context", value="optimization")],
)
tracking_store.log_inputs(run_id=run_id, datasets=[dataset_input])
except Exception as e:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which method of the try block do we expect to fail?

Copy link
Contributor Author

@chenmoneygithub chenmoneygithub Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mostly the get_genai_dataset, if there is network issue or the id is invalid (less likely if requested from the UI). Happy to delete the try-except as well!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Maybe we should handle differently for different exceptions? If dataset_id is invalid, we should raise an exception immediately, but if it's a temporally network error, we can still accepts the request.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense! And given a second thought, I feel this try block is a bit redundant because it's not common for get_dataset to fail as we specify the dataset_id through UI, and it makes sense to stop the request handling if there is a dataset loading error.


# Create MLflow run upfront so run_id is immediately available
# The job will resume this run when it starts executing
from mlflow.tracking.context.default_context import _get_user
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we move this import to the module level?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, done! there were a few circular import cases, but this one is fine.

@@ -250963,6 +250963,3680 @@ public org.mlflow.api.proto.Service.GetSecretsConfig getDefaultInstanceForType()

}

public interface CreatePromptOptimizationJobOrBuilder extends
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these java proto classes useful ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's auto generated by the proto generation script, I actually have little knowledge on the current status of MLflow java server.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mlflow does not have java server, the java proto classes are for mlflow java client , but the job optimization APIs are only called from UI . so these should be useless.

we might need to clean up these useless java proto classes and update the proto generation script.

this issue does not block the PR merging.

user_id=_get_user(),
start_time=int(time.time() * 1000),
tags=[],
run_name=None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not set a meaningful run name ? like optimize_prompt_xxx ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea, changed!

Raises:
MlflowException: If the proto value is unspecified or unsupported.
"""
if proto_value == OPTIMIZER_TYPE_UNSPECIFIED:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use match statement

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually match statement makes it less clean, since it needs to be:

case x if x == OPTIMIZER_TYPE_GEPA:

Because OPTIMIZER_TYPE_GEPA is a variable not a literal. Here is the claude code judgement:

⏺ I see - when matching against literal values (strings like "gzip", "deflate"), it works
  directly. But for variables/constants, Python's match-case treats bare names as capture
  patterns, not as value comparisons.

  The issue is that OPTIMIZER_TYPE_GEPA is a variable, not a literal. In match-case, when you
   write case OPTIMIZER_TYPE_GEPA:, Python interprets it as "capture the value into a new
  variable named OPTIMIZER_TYPE_GEPA", not "compare against the existing variable
  OPTIMIZER_TYPE_GEPA".

  That's why the if-elif-else was reverted - it's the correct approach for matching against
  variables/constants in Python. The match-case syntax would require using a guard (case x if
   x == OPTIMIZER_TYPE_GEPA:) which is more verbose than the original if-elif.

  So the current if-elif-else code is actually the right choice for this case. The reviewer's
   suggestion to use match doesn't apply well here since we're matching against imported
  constants, not literal values.


// List of scorer names. Can be built-in scorer class names
// (e.g., "Correctness", "Safety") or registered scorer names.
repeated string scorers = 3;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another question: how do distinguish built-in Correctness scorer and a custom scorer with name=Correctness? Maybe we need a flag to differentiate these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. Right now the built-in scorer is chosen over the custom scorer if there is a name conflict. My honest preference is we forbid users register scorers of builtin names, and throw an exception when users try it.

Copy link
Collaborator

@TomeHirata TomeHirata left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM once these comments are addressed!

start_time=start_time,
tags=[],
run_name=f"optimize_prompt_{optimizer_type}_{start_time}",
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not attach the prompt name / version in the run name ? like optimize_prompt_{optimizer_type}_{prompt_name}_{prompt_version}_{start_time}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call, changed!

Copy link
Collaborator

@WeichenXu123 WeichenXu123 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM except one comment about run name.


@catch_mlflow_exception
@_disable_if_artifacts_only
def _create_prompt_optimization_job():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

follow-up task: let's add permission validation rule for new added endpoints in mlflow/server/auth/__init__.py

@chenmoneygithub chenmoneygithub added this pull request to the merge queue Jan 23, 2026
Merged via the queue into mlflow:master with commit e08c91f Jan 23, 2026
59 of 61 checks passed
@chenmoneygithub chenmoneygithub deleted the mlflow-po-backend-pr-2 branch January 23, 2026 03:42
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area/prompts MLflow Prompt Registry and Optimization rn/feature Mention under Features in Changelogs.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants