Skip to content

Binary installation script does not work and fails to install #296

Description

@kuju63

The documentation recommends binary installation as the preferred install method, but the install script does not work because it fails to get the latest release tag.

Environment

  • os: macOS 26.5
  • default shell: zsh

Setup log

curl -fsSL https://raw.githubusercontent.com/microsoft/waza/main/install.sh | bash -x
+ set -euo pipefail
+ REPO=microsoft/waza
+ BINARY_NAME=waza
+ tmpdir=
+ main
+ local os arch version tag asset_name install_path
++ detect_os
++ local os
+++ uname -s
++ os=Darwin
++ case "$os" in
++ echo darwin
+ os=darwin
++ detect_arch
++ local arch
+++ uname -m
++ arch=arm64
++ case "$arch" in
++ echo arm64
+ arch=arm64
+ echo 'Detected platform: darwin/arm64'
Detected platform: darwin/arm64
+ '[' darwin = linux ']'
++ curl -fsSL https://api.github.com/repos/microsoft/waza/releases/latest
++ grep '"tag_name": "v'
++ head -1
++ cut '-d"' -f4
+ tag=

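The response from curl -fsSL https://api.github.com/repos/microsoft/waza/releases/latest is shown below. Its "tag_name" is "azd-ext-microsoft-azd-waza_0.33.0", which does not start with "v", so the script's grep '"tag_name": "v' filter matches nothing and tag is left empty.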

{
  "url": "https://api.github.com/repos/microsoft/waza/releases/327244955",
  "assets_url": "https://api.github.com/repos/microsoft/waza/releases/327244955/assets",
  "upload_url": "https://uploads.github.com/repos/microsoft/waza/releases/327244955/assets{?name,label}",
  "html_url": "https://github.com/microsoft/waza/releases/tag/azd-ext-microsoft-azd-waza_0.33.0",
  "id": 327244955,
  "author": {
    "login": "github-actions[bot]",
    "id": 41898282,
    "node_id": "MDM6Qm90NDE4OTgyODI=",
    "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
    "gravatar_id": "",
    "url": "https://api.github.com/users/github-actions%5Bbot%5D",
    "html_url": "https://github.com/apps/github-actions",
    "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
    "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
    "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
    "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
    "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
    "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
    "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
    "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
    "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
    "type": "Bot",
    "user_view_type": "public",
    "site_admin": false
  },
  "node_id": "RE_kwDORasyGc4TgVyb",
  "tag_name": "azd-ext-microsoft-azd-waza_0.33.0",
  "target_commitish": "main",
  "name": "Waza azd Extension v0.33.0",
  "draft": false,
  "immutable": false,
  "prerelease": false,
  "created_at": "2026-05-21T18:16:12Z",
  "updated_at": "2026-05-21T19:54:43Z",
  "published_at": "2026-05-21T19:54:43Z",
  "assets": [
    {
      "url": "https://api.github.com/repos/microsoft/waza/releases/assets/426430256",
      "id": 426430256,
      "node_id": "RA_kwDORasyGc4Zas8w",
      "name": "microsoft-azd-waza-darwin-amd64.zip",
      "label": "",
      "uploader": {
        "login": "github-actions[bot]",
        "id": 41898282,
        "node_id": "MDM6Qm90NDE4OTgyODI=",
        "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/github-actions%5Bbot%5D",
        "html_url": "https://github.com/apps/github-actions",
        "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
        "type": "Bot",
        "user_view_type": "public",
        "site_admin": false
      },
      "content_type": "application/zip",
      "state": "uploaded",
      "size": 97465018,
      "digest": "sha256:ca2a284216a142317ed02bc211adf09d0bf3b7e470e17c5e5f4fdb44fcfcc2b8",
      "download_count": 1,
      "created_at": "2026-05-21T19:54:35Z",
      "updated_at": "2026-05-21T19:54:40Z",
      "browser_download_url": "https://github.com/microsoft/waza/releases/download/azd-ext-microsoft-azd-waza_0.33.0/microsoft-azd-waza-darwin-amd64.zip"
    },
    {
      "url": "https://api.github.com/repos/microsoft/waza/releases/assets/426430254",
      "id": 426430254,
      "node_id": "RA_kwDORasyGc4Zas8u",
      "name": "microsoft-azd-waza-darwin-arm64.zip",
      "label": "",
      "uploader": {
        "login": "github-actions[bot]",
        "id": 41898282,
        "node_id": "MDM6Qm90NDE4OTgyODI=",
        "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/github-actions%5Bbot%5D",
        "html_url": "https://github.com/apps/github-actions",
        "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
        "type": "Bot",
        "user_view_type": "public",
        "site_admin": false
      },
      "content_type": "application/zip",
      "state": "uploaded",
      "size": 93538424,
      "digest": "sha256:f9b231279e1e2a91be4a23b80fd72871559b0ae31d8cb4459fd16c1b1225794a",
      "download_count": 2,
      "created_at": "2026-05-21T19:54:35Z",
      "updated_at": "2026-05-21T19:54:39Z",
      "browser_download_url": "https://github.com/microsoft/waza/releases/download/azd-ext-microsoft-azd-waza_0.33.0/microsoft-azd-waza-darwin-arm64.zip"
    },
    {
      "url": "https://api.github.com/repos/microsoft/waza/releases/assets/426430255",
      "id": 426430255,
      "node_id": "RA_kwDORasyGc4Zas8v",
      "name": "microsoft-azd-waza-linux-amd64.tar.gz",
      "label": "",
      "uploader": {
        "login": "github-actions[bot]",
        "id": 41898282,
        "node_id": "MDM6Qm90NDE4OTgyODI=",
        "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/github-actions%5Bbot%5D",
        "html_url": "https://github.com/apps/github-actions",
        "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
        "type": "Bot",
        "user_view_type": "public",
        "site_admin": false
      },
      "content_type": "application/x-gtar",
      "state": "uploaded",
      "size": 99041331,
      "digest": "sha256:280612d4deacb56f108c58bc065103cce35b0f7d1175987e07eea18eda28ffe5",
      "download_count": 2,
      "created_at": "2026-05-21T19:54:35Z",
      "updated_at": "2026-05-21T19:54:39Z",
      "browser_download_url": "https://github.com/microsoft/waza/releases/download/azd-ext-microsoft-azd-waza_0.33.0/microsoft-azd-waza-linux-amd64.tar.gz"
    },
    {
      "url": "https://api.github.com/repos/microsoft/waza/releases/assets/426430287",
      "id": 426430287,
      "node_id": "RA_kwDORasyGc4Zas9P",
      "name": "microsoft-azd-waza-linux-arm64.tar.gz",
      "label": "",
      "uploader": {
        "login": "github-actions[bot]",
        "id": 41898282,
        "node_id": "MDM6Qm90NDE4OTgyODI=",
        "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/github-actions%5Bbot%5D",
        "html_url": "https://github.com/apps/github-actions",
        "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
        "type": "Bot",
        "user_view_type": "public",
        "site_admin": false
      },
      "content_type": "application/x-gtar",
      "state": "uploaded",
      "size": 99105261,
      "digest": "sha256:579e945bac07f5292a49d67aff80b0f3d4edf3f9fd223e20e54f501a7fdfd654",
      "download_count": 1,
      "created_at": "2026-05-21T19:54:39Z",
      "updated_at": "2026-05-21T19:54:42Z",
      "browser_download_url": "https://github.com/microsoft/waza/releases/download/azd-ext-microsoft-azd-waza_0.33.0/microsoft-azd-waza-linux-arm64.tar.gz"
    },
    {
      "url": "https://api.github.com/repos/microsoft/waza/releases/assets/426430252",
      "id": 426430252,
      "node_id": "RA_kwDORasyGc4Zas8s",
      "name": "microsoft-azd-waza-windows-amd64.zip",
      "label": "",
      "uploader": {
        "login": "github-actions[bot]",
        "id": 41898282,
        "node_id": "MDM6Qm90NDE4OTgyODI=",
        "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/github-actions%5Bbot%5D",
        "html_url": "https://github.com/apps/github-actions",
        "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
        "type": "Bot",
        "user_view_type": "public",
        "site_admin": false
      },
      "content_type": "application/zip",
      "state": "uploaded",
      "size": 93622677,
      "digest": "sha256:8032b50af3a7ac3ccb08aaf676b0cd1f0772fac3288f843de7c77f82f3f53ec3",
      "download_count": 11,
      "created_at": "2026-05-21T19:54:35Z",
      "updated_at": "2026-05-21T19:54:38Z",
      "browser_download_url": "https://github.com/microsoft/waza/releases/download/azd-ext-microsoft-azd-waza_0.33.0/microsoft-azd-waza-windows-amd64.zip"
    },
    {
      "url": "https://api.github.com/repos/microsoft/waza/releases/assets/426430253",
      "id": 426430253,
      "node_id": "RA_kwDORasyGc4Zas8t",
      "name": "microsoft-azd-waza-windows-arm64.zip",
      "label": "",
      "uploader": {
        "login": "github-actions[bot]",
        "id": 41898282,
        "node_id": "MDM6Qm90NDE4OTgyODI=",
        "avatar_url": "https://avatars.githubusercontent.com/in/15368?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/github-actions%5Bbot%5D",
        "html_url": "https://github.com/apps/github-actions",
        "followers_url": "https://api.github.com/users/github-actions%5Bbot%5D/followers",
        "following_url": "https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}",
        "gists_url": "https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.github.com/users/github-actions%5Bbot%5D/subscriptions",
        "organizations_url": "https://api.github.com/users/github-actions%5Bbot%5D/orgs",
        "repos_url": "https://api.github.com/users/github-actions%5Bbot%5D/repos",
        "events_url": "https://api.github.com/users/github-actions%5Bbot%5D/events{/privacy}",
        "received_events_url": "https://api.github.com/users/github-actions%5Bbot%5D/received_events",
        "type": "Bot",
        "user_view_type": "public",
        "site_admin": false
      },
      "content_type": "application/zip",
      "state": "uploaded",
      "size": 89909632,
      "digest": "sha256:f81ce0020c77ed2a97027598e523e11b9264caf999edbbb3439b1b508dad8f63",
      "download_count": 1,
      "created_at": "2026-05-21T19:54:35Z",
      "updated_at": "2026-05-21T19:54:39Z",
      "browser_download_url": "https://github.com/microsoft/waza/releases/download/azd-ext-microsoft-azd-waza_0.33.0/microsoft-azd-waza-windows-arm64.zip"
    }
  ],
  "tarball_url": "https://api.github.com/repos/microsoft/waza/tarball/azd-ext-microsoft-azd-waza_0.33.0",
  "zipball_url": "https://api.github.com/repos/microsoft/waza/zipball/azd-ext-microsoft-azd-waza_0.33.0",
  "body": "# Changelog\n\nAll notable changes to waza will be documented in this file.\n\nThe format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),\nand this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n\n## [Unreleased]\n\n## [0.33.0] - 2026-05-21\n\nNote: This release includes the changes previously prepared under 0.32.0, which was not published.\n\n### Added\n\n- **Configurable eval file naming** — `.waza.yaml` can now configure `files.evalFile`, `files.taskGlob`, and `files.taskFileSuffix`, with the new naming carried through scaffolding, workspace discovery, discovery mode, schemas, and docs while preserving the existing `eval.yaml` and `tasks/*.yaml` defaults (#254, closes #232)\n- **Instruction files in eval runs** — Eval-level `config.instruction_files` and task-level `instruction_files` now copy files from the active context into task workspaces and append path-labeled contents to the Copilot system message (#248, closes #239)\n\n### Fixed\n\n- **Prompt graders use the execution engine** — Prompt graders now route judge turns through `CopilotEngine` instead of constructing a Copilot client directly, keeping grader execution aligned with engine configuration and preserving follow-up recovery behavior (#258, closes #54)\n- **Prompt grader follow-up recovery** — Prompt grading now preserves collected grades when a follow-up turn fails after successful grader collection (#251)\n- **Bundled Copilot CLI updated** — Embedded `copilot-cli` bundles are updated from 1.0.2 to 1.0.49 across supported platforms, with reproducible pinned bundle generation via `COPILOT_CLI_VERSION` (#260, closes #244)\n- **Spec-aligned skill scaffolding** — `waza new skill` no longer asks for a nonstandard skill type or emits `type:` frontmatter, and the wizard now rejects early exits that omit required name or description fields (#261, closes #243)\n- **`waza check` eval discovery** — Nested skills and separated evals are 
discovered consistently in multi-skill workspaces (#247, closes #238)\n- **Skill body routing markers** — Compliance scoring now detects trigger, anti-trigger, and routing markers in `SKILL.md` body sections as well as frontmatter descriptions (#236, closes #223)\n\n### Changed\n\n- **Copilot SDK v0.3.0 migration** — Updated `github.com/github/copilot-sdk/go` to v0.3.0, migrated session event handling to typed payloads, and refreshed transcript, logging, web API, usage collection, suggestion trace, and test coverage for the new API (#255, closes #253)\n- **Dashboard validation coverage** — Added coverage for dashboard lint and end-to-end validation (#249)\n- **Install documentation** — Replaced unsupported `go install` guidance and clarified Windows/WSL install behavior (#246, closes #242; #245, closes #241)\n- **Dependencies** — Bump devalue in /site, postcss in /web, and astro in /site (#237, #235, #234)\n\n## [0.31.0] - 2026-04-28\n\n### Added\n\n- **Custom agent (`.agent.md`) eval support** — Discover `.agent.md` files alongside `SKILL.md`, parse agent-specific frontmatter (`tools`, `model`, `handoffs`, `mcp-servers`, `agents`), auto-inject `tool_constraint` grader from agent `tools:` field, complete worked example under `examples/custom-agent/`, and new \"Evaluating Custom Agents\" docs guide (#226, closes #225)\n\n### Fixed\n\n- **Mock engine echoes file content** — `_output_contains` expectations against file contents now work in CI without a real model. Mock response includes task metadata, file paths, and a 1KB content preview per resource (#228, closes #227)\n- **`waza serve` no longer crashes when stdin isn't a terminal** — MCP stdio server only starts when `term.IsTerminal()` is true; piped input or background mode no longer kills the HTTP dashboard (#224)\n\n### Changed\n\n- **Vocabulary renames** — Internal types renamed: `BenchmarkSpec` → `EvalSpec`, `TestRunner` → `EvalRunner`. 
Not a breaking change for external consumers (types live in `internal/`) (#222)\n\n### Documentation\n\n- Cross-reference audit for recent renames + custom agent feature: added `.agent.md` coverage to quickstart, getting-started, GUIDE, TUTORIAL, examples README; updated mock engine descriptions in INTEGRATION-TESTING and eval-yaml guide (#230)\n\n### Dependencies\n\n- Bump postcss from 8.5.6 to 8.5.12 in /site (#229)\n\n## [0.30.1] - 2026-04-22\n\n### Documentation\n\n- **Updated README with missing CLI commands** — Added documentation for recently-added CLI commands that were missing from the README (#220)\n\n## [0.30.0] - 2026-04-22\n\n### Added\n\n- **`waza quality` command** — LLM-as-Judge skill quality scoring that evaluates skill output quality using a configurable judge model (#218)\n- **Scope-reduction advisory check** — `waza check` now includes an advisory that flags skills with overly broad scope, helping authors tighten skill definitions (#219)\n\n## [0.29.0] - 2026-04-22\n\n### Added\n\n- **`--keep-workspace` flag** — Preserve the temporary workspace after task execution for debugging agent output (#123, #217)\n- **`--no-skills` flag and `disabled_skills` config** — Disable specific skills during evaluation to isolate behavior (#126, #216)\n- **Non-blocking version update check** — CLI now checks for newer waza versions in the background without slowing startup (#104, #214)\n- **Per-task `skill_directories`** — Specify different skill directories for individual tasks in eval YAML (#156, #215)\n\n### Dependencies\n\n- Bump astro and @astrojs/starlight in /site (#212)\n\n## [0.28.0] - 2026-04-21\n\n### Added\n\n- **Follow-up prompts in eval YAML** — Tasks can now include pre-written follow-up prompts for multi-turn evaluation conversations (#189, #209)\n- **`waza models` command** — List all available models supported by the configured engine (#208)\n- **Early termination for trigger tests** — Trigger tests can now stop early once the target skill is 
invoked, reducing evaluation time (#207)\n\n### Fixed\n\n- **Stricter YAML validation** — Audited all YAML parsers; unknown fields in `TestCase` definitions are now properly rejected (#132, #206)\n- **Test fixture assertion syntax** — Fixed invalid Python expression in a test fixture assertion (#197)\n- **CI integration test stability** — CI integration tests now correctly handle expected eval failures when using the mock executor (#210)\n\n### Documentation\n\n- Added Quick Start guide to the documentation site (#205)\n\n## [0.27.0] - 2026-04-21\n\n### Added\n\n- **`output_contains_any` expectation** — New expectation field that passes when the agent response contains any one of the specified strings (#203)\n- **`max_response_time_ms` behavior rule** — Enforce maximum response time constraints on agent execution (#201)\n- **Task prompt from file** — Task `prompt` field can now reference an external file path instead of inline text (#157, #200)\n- **`tool_calls` grader** — New grader type that validates the specific tool calls an agent makes during execution (#187, #202)\n\n### Fixed\n\n- **Webserver test resilience** — Webserver tests now skip gracefully when frontend assets are not built (#204)\n\n## [0.26.0] - 2026-04-21\n\n### Changed\n\n- **Timestamped output directories** — `run --output-dir` now groups result files by timestamp for cleaner organization (#153)\n- **Improved debug logging** — Debug output is now more structured and useful for troubleshooting (#152)\n\n### Fixed\n\n- **`--discover` finds eval.yaml in nested layout** — Skill discovery now correctly locates `eval.yaml` files in `evals/{name}/` directories at the project root (#44)\n- **Diff grader reads post-execution workspace** — The diff grader now reads files from the workspace after agent execution completes, not before (#165, #196)\n- **Grader config validation** — Required grader configuration fields are now validated before evaluation starts (#195)\n- **macOS install and trigger test 
count** — Fixed macOS binary installation and an off-by-one error in trigger test counting (#164, #184, #193)\n\n### Documentation\n\n- Added cache command reference, prompt mode documentation, and complete YAML schema reference (#198)\n- Updated demo guide and added CI/CD integration guide (#112, #89, #194)\n\n### Dependencies\n\n- Bump defu from 6.1.4 to 6.1.6 in /site (#181)\n- Bump vite from 6.4.1 to 6.4.2 in /site and /web (#182, #192)\n- Bump go.opentelemetry.io/otel/sdk from 1.42.0 to 1.43.0 (#185)\n- Bump astro from 5.17.3 to 5.18.1 in /site (#163)\n- Bump picomatch from 4.0.3 to 4.0.4 in /site and /web (#159, #160)\n- Bump smol-toml from 1.6.0 to 1.6.1 in /site (#158)\n\n## [0.25.0] - 2026-04-21\n\n### Added\n\n- **Eval coverage grid generator** — New coverage output that visualizes which skills have eval coverage across grader types (#92)\n\n### Fixed\n\n- **SKILL.md injection and trigger fixture loading** — `waza run` now correctly injects SKILL.md content into the evaluation context, loads trigger test fixtures, and passes MCP server configuration to the engine (#191)\n\n### Dependencies\n\n- Bump h3 from 1.15.5 to 1.15.8 in /site (#144)\n\n## [0.24.0] - 2026-03-25\n\n### Changed\n\n- **Strict YAML validation** — All YAML parsers now use `KnownFields(true)` to reject unknown fields, catching typos and misconfigurations early (#132, #133)\n- **`max_workers` renamed to `workers`** — Config YAML key renamed for consistency across all config types (**breaking change**)\n- **Unified token counting** — `waza check` and `waza tokens count` now share the same counting logic for consistent results (#146)\n\n### Fixed\n\n- **Typo in prompt grader** — Fixed \"prmopt\" → \"prompt\" in error message\n\n### Dependencies\n\n- Bump h3 from 1.15.8 to 1.15.9 in /site (#155)\n- Bump github.com/buger/jsonparser from 1.1.1 to 1.1.2 (#149)\n\n## [0.21.0] - 2026-03-12\n\n### Added\n\n- **`waza new task from-prompt` command** — Record Copilot sessions into task YAML files for 
eval creation (#110)\n- **Trigger heuristic grader** — New grader type that scores based on trigger/anti-trigger matching heuristics (#90)\n- **Eval scaffolding command** — `waza eval new` generates eval.yaml scaffolding for skills (#94)\n- **Multi-trial flakiness detection** — Detect flaky evals across multiple trial runs (#103)\n- **Snapshot auto-update workflow** — Diff grader can now auto-update snapshot files on mismatch (#95)\n- **Per-file token budget configuration** — Configure token budgets per-file in `.waza.yaml` (#96)\n- **Skill-aware thresholds** — `waza tokens compare` supports skill-specific threshold configuration (#93)\n- **Sensei scoring parity** — WHEN triggers, spec-security, invalid level, and advisory checks 16-18 (#79)\n- **CI/CD integration guide** — GitHub Actions and Azure DevOps integration documentation (#100)\n- **FileWriter service** — Refactored `waza init` inventory with FileWriter abstraction (#63)\n\n### Fixed\n\n- **`waza suggest` deadlock** — `Execute()` now applies the request timeout before calling `Start()`, preventing goroutine deadlock (#43)\n- **`ResourceFile.Content` type** — Changed from `string` to `[]byte` for proper binary file handling (#117)\n- **`tokens compare` in subdirectory** — No longer shows all files as \"added\" when run from a subdirectory (#105)\n- **`--output-dir` ignored** — Fixed `--output-dir` having no effect for single-skill runs (#109)\n- **Web dashboard build order** — Build dashboard assets before Go compilation (#107)\n- **Test file leak** — Fixed test that leaked files into the repo (#120)\n- **Config schema defaults** — Aligned `config.schema.json` defaults with Go source of truth (#65)\n- **Skill discovery path** — Discover skills under `.github/skills/` directory (#69)\n\n### Changed\n\n- Renamed `config` node `max_workers` to `workers` for consistency across all config types\n  - This is a breaking change\n- Custom YAML deserializers for config types (#106)\n- Validate only known fields in 
YAML decoders. (#132)\n- Token limits priority inverted to `.waza.yaml` first (#64)\n- `@wbreza` added to CODEOWNERS (#111)\n- Go 1.26+ noted in agent instruction files (#108)\n\n## [0.9.0] - 2026-02-23\n\n### Added\n\n- **A/B baseline testing** — `--baseline` flag runs each task with and without skill, computes weighted improvement scores across quality, tokens, turns, time, and task completion (#307)\n- **Pairwise LLM judging** — `pairwise` mode on `prompt` grader with position-swap bias mitigation. Three modes: pairwise, independent, both. Magnitude scoring from much-better to much-worse (#310)\n- **Tool constraint grader** — New `tool_constraint` grader type with `expect_tools`, `reject_tools`, `max_turns`, `max_tokens` constraints. Validates agent tool usage behavior (#391)\n- **Auto skill discovery** — `--discover` flag walks directory trees for SKILL.md + eval.yaml pairs. `--strict` mode fails if any skill lacks eval coverage (#392)\n- **Releases page** — New docs site page at `reference/releases` with platform download links, install commands, and azd extension info (#383)\n\n### Fixed\n\n- **Lint warnings** — Resolved errcheck (webserver) and ineffassign (utils) lint warnings\n\n### Changed\n\n- **Competitive research** — Added OpenAI Evals analysis (`docs/research/waza-vs-openai-evals.md`), skill-validator analysis (`docs/research/waza-vs-skill-validator.md`), and eval registry design doc (`docs/research/waza-eval-registry-design.md`)\n- **Mermaid diagrams** — Converted remaining ASCII diagrams to Mermaid across all markdown files. 
Added Mermaid directive to AGENTS.md\n\n## [0.8.0] - 2026-02-21\n\n### Added\n\n- **MCP Server** — `waza serve` now includes an always-on MCP server with 10 tools (eval.list, eval.get, eval.validate, eval.run, task.list, run.status, run.cancel, results.summary, results.runs, skill.check) via stdio transport (#286)\n- **`waza suggest` command** — LLM-powered eval suggestions: reads SKILL.md, proposes test cases, graders, and fixtures. Flags: `--model`, `--dry-run`, `--apply`, `--output-dir`, `--format` (#287)\n- **Interactive workflow skill** — `skills/waza-interactive/SKILL.md` with 5 workflow scenarios for conversational eval orchestration (#288)\n- **Grader weighting** — `weight` field on grader configs, `ComputeWeightedRunScore` method, dashboard weighted scores column (#299)\n- **Statistical confidence intervals** — Bootstrap CI with 10K resamples, 95% confidence, normalized gain. Dashboard CI bands and significance badges (#308)\n- **Judge model support** — `--judge-model` flag and `judge_model` config for separate LLM-as-judge model (#309)\n- **Spec compliance checks** — 8 agentskills.io compliance checks in `waza check` and `waza dev` (#314)\n- **SkillsBench advisory** — 5 advisory checks (module-count, complexity, negative-delta, procedural, over-specificity) (#315)\n- **MCP integration scoring** — 4 MCP integration checks in `waza dev` (#316)\n- **Batch skill processing** — `waza dev` processes multiple skills in one run (#317)\n- **Token compare --strict** — Budget enforcement mode for `waza tokens compare` (#318)\n- **Scaffold trigger tests** — Auto-generate trigger test YAML from SKILL.md frontmatter (#319)\n- **Skill profile** — `waza tokens profile` for static analysis of skill token distribution (#311)\n- **JUnit XML reporter** — `--format junit` output for CI integration (#312)\n- **Template Variables** — New `internal/template` package with `Render()` for Go text/template syntax in hooks and commands. 
System variables: `JobID`, `TaskName`, `Iteration`, `Attempt`, `Timestamp`. User variables via `vars` map (#186)\n- **GroupBy Results** — New `group_by` config field to organize results by dimension (e.g., model). CLI shows grouped output, JSON includes `GroupStats` with name/passed/total/avg_score (#188)\n- **Custom Input Variables** — New `inputs` section in eval.yaml for defining key-value pairs available as `{{.Vars.key}}` throughout evaluation. Accessible in hooks, task templates, and grader configs (#189)\n- **CSV Dataset Support** — New `tasks_from` field to generate tasks from CSV files. Each row becomes a task with columns accessible as `{{.Vars.column}}`. Optional `range: [start, end]` for row filtering. First row treated as headers (#187)\n- **Retry/Attempts** — Add `max_attempts` config field for retrying failed task executions within each trial (#191)\n- **Lifecycle Hooks** — Add `hooks` section with `before_run`/`after_run`/`before_task`/`after_task` lifecycle points (#191)\n- **`prompt` grader (LLM-as-judge)** — LLM-based evaluation with rubrics, tool-based grading, and session management modes (#177, closes #104)\n  - Two modes: `clean` (fresh context) and `continue_session` (resumes test session)\n  - Tool-based grading: `set_waza_grade_pass` and `set_waza_grade_fail` tools for LLM graders\n  - Separate judge model configuration: run evaluation with a different model than the executor\n  - Pre-built rubric templates adapted from Azure ML evaluators\n- **`trigger_tests.yaml` auto-discovery** — measure prompt trigger accuracy for skills (#166, closes #36)\n  - New `internal/trigger/` package for trigger testing\n  - Automatically discovered alongside `eval.yaml`\n  - Confidence weighting: `high` (weight 1.0) and `medium` (weight 0.5) for borderline cases\n  - `trigger_accuracy` metric with configurable cutoff threshold\n  - Metrics: accuracy, precision, recall, F1, error count\n- **`diff` grader** — new grader type for workspace file comparison with 
snapshot matching and contains-line fragment checks (#158)\n- **Azure ML evaluation rubrics** — 8 pre-built rubric YAMLs in `examples/rubrics/` adapted from Azure ML evaluators (#160, #161):\n  - Tool call rubrics: `tool_call_accuracy`, `tool_selection`, `tool_input_accuracy`, `tool_output_utilization`\n  - Task evaluation rubrics: `task_completion`, `task_adherence`, `intent_resolution`, `response_completeness`\n- **MockEngine WorkspaceDir support** — test infrastructure for graders that need workspace access (#159)\n\n### Changed\n\n- **Dashboard** — Aspire-style trajectory waterfall, weighted scores column, CI bands with significance indicators, judge model badge (#303, #330, #331, #332)\n- **Docs site** — Dashboard explore page with 14+ screenshots, light/dark mode, navbar polish (#357, #358, #360)\n\n### Fixed\n\n- **install.sh macOS checksum** — added `shasum -a 256` fallback for macOS (which lacks `sha256sum`) (#163)\n- Dashboard compare-runs screenshot now shows 2 runs selected with full comparison\n- GitHub icon alignment and search bar width on docs site\n\n## [0.4.0-alpha.1] - 2026-02-17\n\n### Added\n\n- **Go cross-platform release pipeline** — `go-release.yml` workflow builds binaries for linux/darwin/windows on amd64 and arm64 (#155)\n- **`install.sh` installer** — one-line binary install with checksum verification: `curl -fsSL https://raw.githubusercontent.com/microsoft/waza/main/install.sh | bash`\n- **`skill_invocation` grader** — validates orchestration workflows by checking which skills were invoked (#146)\n- **`required_skills` preflight validation** — verifies skill dependencies before evaluation (#147)\n- **Multi-model `--model` flag** — run evaluations across multiple models in a single command (#39)\n- **`waza check` command** — skill submission readiness checks (#151)\n- **Evaluation result caching** — incremental testing with cache invalidation (#150)\n- **GitHub PR comment reporter** — post eval results as PR comments (#140)\n- **Skills 
CI integration** — GitHub Actions workflow for microsoft/skills (#141)\n\n### Fixed\n\n- **Engine shutdown leak** — `runSingleModel()` now calls `engine.Shutdown(context.Background())` via defer after engine creation (#153, #154)\n\n### Changed\n\n- **Python release deprecated** — the Python release workflow is no longer maintained; Go binaries are the official distribution\n- **First Go binary release** — v0.4.0-alpha.1 is the first release distributed as pre-built binaries\n\n## [0.3.0] - 2026-02-13\n\n### Added\n\n- Grader showcase examples demonstrating all grader types (#134)\n- Reusable GitHub Actions workflow for waza evaluations (#132)\n- Documentation for prompt and action_sequence grader types (#133)\n- Documentation for `waza dev` command and compliance scoring (#131)\n- Auto-loading of skills for testing (#129)\n- Debug logging support (`--debug` flag) (#130)\n\n### Fixed\n\n- Always output test run errors to help debug failures (#128)\n- Include cwd as a skill folder when running waza (workspace fix)\n\n### Changed\n\n- Exit codes for CI/CD integration: 0=success, 1=test failure, 2=config error (#135)\n- Reordered azd-publish skill workflow steps (#127)\n- Auto-merge bot registry PRs in release workflow\n\n## [0.2.1] - 2026-02-12\n\n### Added\n\n- `waza dev` command for interactive skill development and testing (#117)\n- Prerelease input to azd publish workflow\n- CHANGELOG.md as release notes source for azd extension releases\n- `waza generate --skill <name>` - Filter to specific skill when using `--repo` or `--scan`\n\n### Fixed\n\n- Fixed azd extensions documentation link\n- Corrected `azd ext source add` command syntax\n- Branch release PR from origin/main to avoid workflow permission error (#121)\n\n### Changed\n\n- Removed path filters from Go CI to unblock non-code PRs\n- Removed auto-merge from azd publish PR workflow\n- Added azd extension installation instructions to README\n\n## [0.2.0] - 2026-02-02\n\n### Added\n\n- **Skill Discovery** 
(#3)\n  - `waza generate --repo <org/repo>` - Scan GitHub repos for SKILL.md files\n  - `waza generate --scan` - Scan local directory for skills\n  - `waza generate --all` - Generate evals for all discovered skills (CI-friendly)\n  - Interactive skill selection with checkboxes when not using `--all`\n\n- **GitHub Issue Creation** (#3)\n  - Post-run prompt to create GitHub issues with eval results\n  - Options: create for failed tasks only, all tasks, or none\n  - Issues include results table, failed task details, and suggestions\n  - `--no-issues` flag to skip prompts (CI-friendly)\n\n- **New Modules**\n  - `waza/scanner.py` - Skill discovery from GitHub repos and local directories\n  - `waza/issues.py` - GitHub issue creation and formatting\n\n### Changed\n\n- Improved documentation with new feature guides\n- Added skill discovery section to DEMO-SCRIPT.md\n- Updated TUTORIAL.md with discovery and issue creation steps\n\n## [0.1.0] - 2026-02-02\n\n### Changed\n\n- **Renamed project from `skill-eval` to `waza`** (技 - Japanese for \"technique/skill\")\n  - New CLI command: `waza` (previously `skill-eval`)\n  - New package name: `waza` (previously `skill_eval`)\n  - Repository renamed to `waza`\n- Bumped version to 0.1.0 to mark the rename milestone\n\n### Migration\n\nIf you were using `skill-eval`, update your scripts:\n\n```bash\n# Old\nskill-eval run ./eval.yaml\npip install skill-eval\n\n# New\nwaza run ./eval.yaml\npip install waza\n```\n\n## [0.0.2] - 2026-02-01\n\n### Added\n\n- `--suggestions-file` option to save improvement suggestions to markdown file\n- Improved progress display with step-by-step status (tool counts, activity indicators)\n- Copilot SDK usage guide in AGENTS.md\n\n### Fixed\n\n- Fixed Copilot SDK import (`from copilot import CopilotClient` not `copilot_sdk`)\n- Fixed Windows glob pattern in release workflow\n- Fixed linting issues across codebase (import sorting, exception chaining, etc.)\n- Clarified fixture isolation between tasks (each 
task gets fresh temp workspace)\n\n## [0.0.1] - 2026-02-01\n\n### Added\n\n- **CLI Commands**\n  - `waza run` - Run evaluation suites against skills\n  - `waza generate` - Auto-generate evals from SKILL.md files\n  - `waza init` - Initialize new eval suites interactively\n  - `waza report` - Generate reports from results\n\n- **Eval Generation**\n  - Pattern-based generation from SKILL.md files\n  - LLM-assisted generation with `--assist` flag for better tasks/fixtures\n  - Support for multiple models (Claude, GPT-4, etc.)\n\n- **Executors**\n  - Mock executor for testing without LLM calls\n  - Copilot SDK executor for real integration testing\n\n- **Graders**\n  - Code graders with Python assertions\n  - Regex graders for pattern matching\n  - LLM graders for semantic evaluation\n\n- **Features**\n  - Real-time progress display with conversation streaming (`-v`)\n  - Transcript logging (`--log`)\n  - Project context support (`--context-dir`)\n  - LLM-powered improvement suggestions (`--suggestions`)\n\n- **Documentation**\n  - Comprehensive README with examples\n  - Tutorial guide\n  - Grader reference\n  - Demo script for walkthroughs\n\n### Fixed\n\n- Grader eval context now includes `str`, `int`, `bool`, etc.\n- Transcript normalization for proper tool call detection\n- YAML escaping for regex patterns with backslashes\n- Progress bar now shows 100% on completion\n\n[Unreleased]: https://github.com/microsoft/waza/compare/v0.33.0...HEAD\n[0.33.0]: https://github.com/microsoft/waza/compare/v0.31.0...v0.33.0\n[0.31.0]: https://github.com/microsoft/waza/compare/v0.30.1...v0.31.0\n[0.30.1]: https://github.com/microsoft/waza/compare/v0.30.0...v0.30.1\n[0.30.0]: https://github.com/microsoft/waza/compare/v0.29.0...v0.30.0\n[0.29.0]: https://github.com/microsoft/waza/compare/v0.28.0...v0.29.0\n[0.28.0]: https://github.com/microsoft/waza/compare/v0.27.0...v0.28.0\n[0.27.0]: https://github.com/microsoft/waza/compare/v0.26.0...v0.27.0\n[0.26.0]: 
https://github.com/microsoft/waza/compare/v0.25.0...v0.26.0\n[0.25.0]: https://github.com/microsoft/waza/compare/v0.23.0...v0.25.0\n[0.24.0]: https://github.com/microsoft/waza/compare/azd-ext-microsoft-azd-waza_0.23.0...azd-ext-microsoft-azd-waza_0.24.0\n[0.21.0]: https://github.com/microsoft/waza/compare/azd-ext-microsoft-azd-waza_0.20.0...azd-ext-microsoft-azd-waza_0.21.0\n[0.9.0]: https://github.com/microsoft/waza/compare/v0.8.0...azd-ext-microsoft-azd-waza_0.20.0\n[0.8.0]: https://github.com/microsoft/waza/compare/v0.4.0-alpha.1...v0.8.0\n[0.4.0-alpha.1]: https://github.com/microsoft/waza/compare/azd-ext-microsoft-azd-waza_0.3.0...v0.4.0-alpha.1\n[0.3.0]: https://github.com/microsoft/waza/compare/azd-ext-microsoft-azd-waza_0.2.1...azd-ext-microsoft-azd-waza_0.3.0\n[0.2.1]: https://github.com/microsoft/waza/compare/azd-ext-microsoft-azd-waza_0.2.0...azd-ext-microsoft-azd-waza_0.2.1\n[0.2.0]: https://github.com/microsoft/waza/compare/v0.1.0...v0.2.0\n[0.1.0]: https://github.com/microsoft/waza/compare/v0.0.2...v0.1.0\n[0.0.2]: https://github.com/microsoft/waza/compare/v0.0.1...v0.0.2\n[0.0.1]: https://github.com/microsoft/waza/releases/tag/v0.0.1\n"
}

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Fields

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions