Skip to content

Commit 0ad9bf4

Browse files
committed
fix: address review findings from 5 agents (18 items)
Pre-reviewed by 5 agents, 18 findings addressed: Security (Critical): - XML-escape all interpolated values in LLM prompt (prevent tag injection) - Strip action_type/tool_name through InformationStripper before LLM - Pass stripped description (not raw verdict.reason) to uncertainty checker - Extend control-char regex to cover Unicode bidi overrides Correctness (Major): - Remove MemoryError/RecursionError re-raise inside TaskGroup (prevents ExceptionGroup propagation) - Clamp confidence score to max 1.0 (floating-point edge case) - Filter empty/None provider responses from similarity computation - Add uncertainty_check_error sentinel to metadata on failure - Fix auto_reject_blocked=False path (was always auto-rejecting) - Change _parse_response param from object to CompletionResponse - Change _run_safety_classifier return type to bool (clearer contract) Frontend: - Replace IIFEs with precomputed variables (ESLint React Compiler rule) - Add NaN guard for parseFloat on confidence scores - Remove misleading 'Show original' toggle (description IS stripped) Tests: - Add factory wiring tests for SafetyClassifier/UncertaintyChecker - Add auto_reject_blocked=False test - Fix timeout test: asyncio.Event().wait() instead of sleep(100)
1 parent 65814e6 commit 0ad9bf4

8 files changed

Lines changed: 314 additions & 94 deletions

File tree

src/synthorg/security/safety_classifier.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"""
2020

2121
import asyncio
22+
import html
2223
import re
2324
import time
2425
from enum import StrEnum
@@ -37,7 +38,12 @@
3738
)
3839
from synthorg.providers.enums import MessageRole
3940
from synthorg.providers.family import get_family, providers_excluding_family
40-
from synthorg.providers.models import ChatMessage, CompletionConfig, ToolDefinition
41+
from synthorg.providers.models import (
42+
ChatMessage,
43+
CompletionConfig,
44+
CompletionResponse,
45+
ToolDefinition,
46+
)
4147
from synthorg.security.config import SafetyClassifierConfig # noqa: TC001
4248
from synthorg.security.rules.credential_detector import CREDENTIAL_PATTERNS
4349
from synthorg.security.rules.data_leak_detector import PII_PATTERNS
@@ -80,8 +86,17 @@
8086
# Maximum length for LLM-returned reason string.
8187
_MAX_REASON_LENGTH: Final[int] = 300
8288

83-
# Regex to strip control characters from LLM-returned reason.
84-
_CONTROL_CHAR_RE: Final[re.Pattern[str]] = re.compile(r"[\x00-\x1f\x7f]")
89+
# Regex to strip control and formatting characters from LLM-returned
90+
# reason. Covers ASCII control (C0/DEL), Unicode bidi overrides
91+
# (U+200E-200F, U+202A-202E, U+2066-2069), and zero-width chars.
92+
_CONTROL_CHAR_RE: Final[re.Pattern[str]] = re.compile(
93+
r"[\x00-\x1f\x7f"
94+
r"\u200b-\u200f" # zero-width and bidi marks
95+
r"\u202a-\u202e" # bidi embedding/override
96+
r"\u2066-\u2069" # bidi isolate
97+
r"\ufeff" # BOM / zero-width no-break space
98+
r"]",
99+
)
85100

86101

87102
# ── Enums and models ─────────────────────────────────────────────
@@ -374,7 +389,7 @@ def _select_provider(
374389
if not available:
375390
return None, None
376391

377-
# Try cross-family from a random starting point.
392+
# Try cross-family selection.
378393
for name in available:
379394
family = get_family(name, self._configs)
380395
cross = providers_excluding_family(family, self._configs)
@@ -406,13 +421,22 @@ def _build_messages(
406421
tool_name: str,
407422
risk_level: ApprovalRiskLevel,
408423
) -> list[ChatMessage]:
409-
"""Build prompt messages from the stripped context."""
424+
"""Build prompt messages from the stripped context.
425+
426+
All interpolated values are XML-escaped to prevent tag
427+
injection from agent-controlled fields, and stripped of
428+
PII/secrets via the same ``InformationStripper``.
429+
"""
430+
safe_tool = html.escape(self._stripper.strip(tool_name))
431+
safe_type = html.escape(self._stripper.strip(action_type))
432+
safe_risk = html.escape(risk_level.value)
433+
safe_desc = html.escape(stripped_description)
410434
user_content = (
411435
"<action>\n"
412-
f" <tool>{tool_name}</tool>\n"
413-
f" <type>{action_type}</type>\n"
414-
f" <risk_level>{risk_level.value}</risk_level>\n"
415-
f" <description>{stripped_description}</description>\n"
436+
f" <tool>{safe_tool}</tool>\n"
437+
f" <type>{safe_type}</type>\n"
438+
f" <risk_level>{safe_risk}</risk_level>\n"
439+
f" <description>{safe_desc}</description>\n"
416440
"</action>"
417441
)
418442

@@ -427,14 +451,14 @@ def _build_messages(
427451

428452
def _parse_response(
429453
self,
430-
response: object,
454+
response: CompletionResponse,
431455
stripped_description: str,
432456
start: float,
433457
) -> SafetyClassifierResult:
434458
"""Parse LLM response into a SafetyClassifierResult."""
435459
duration_ms = (time.monotonic() - start) * 1000
436460

437-
for tc in response.tool_calls: # type: ignore[attr-defined]
461+
for tc in response.tool_calls:
438462
if tc.name == "safety_classification_verdict":
439463
return self._parse_tool_call(
440464
tc.arguments,

src/synthorg/security/service.py

Lines changed: 45 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@
6060
)
6161
from synthorg.security.output_scanner import OutputScanner # noqa: TC001
6262
from synthorg.security.rules.engine import RuleEngine # noqa: TC001
63+
from synthorg.security.safety_classifier import (
64+
SafetyClassification,
65+
)
6366
from synthorg.security.timeout.protocol import RiskTierClassifier # noqa: TC001
6467

6568
if TYPE_CHECKING:
@@ -564,32 +567,36 @@ async def _handle_escalation(
564567

565568
# Stage 1+2: safety classification (if configured).
566569
if self._safety_classifier is not None:
567-
classify_result = await self._run_safety_classifier(
570+
auto_rejected = await self._run_safety_classifier(
568571
context,
569572
verdict,
570573
metadata,
571574
)
572-
if classify_result is not None:
573-
# BLOCKED -> auto-reject.
574-
if classify_result == "blocked":
575-
return verdict.model_copy(
576-
update={
577-
"verdict": SecurityVerdictType.DENY,
578-
"reason": (
579-
f"{verdict.reason} (auto-rejected: "
580-
f"safety classifier blocked)"
581-
),
582-
},
583-
)
584-
# Use stripped description for the reviewer view.
585-
stripped = metadata.get("stripped_description")
586-
if stripped:
587-
description = stripped
575+
if auto_rejected:
576+
return verdict.model_copy(
577+
update={
578+
"verdict": SecurityVerdictType.DENY,
579+
"reason": (
580+
f"{verdict.reason} (auto-rejected: "
581+
f"safety classifier blocked)"
582+
),
583+
},
584+
)
585+
# Use stripped description for the reviewer view.
586+
stripped = metadata.get("stripped_description")
587+
if stripped:
588+
description = stripped
588589

589590
# Cross-provider uncertainty check (if configured).
591+
# Use stripped description when available to avoid
592+
# broadcasting raw PII/secrets to all providers.
590593
if self._uncertainty_checker is not None:
594+
check_text = metadata.get(
595+
"stripped_description",
596+
verdict.reason,
597+
)
591598
await self._run_uncertainty_check(
592-
verdict,
599+
check_text,
593600
metadata,
594601
)
595602

@@ -637,17 +644,16 @@ async def _run_safety_classifier(
637644
context: SecurityContext,
638645
verdict: SecurityVerdict,
639646
metadata: dict[str, str],
640-
) -> str | None:
647+
) -> bool:
641648
"""Run the safety classifier and populate metadata.
642649
643-
Returns the classification string, or ``None`` on error.
644-
On BLOCKED, the caller should auto-reject.
650+
Returns ``True`` if the action was auto-rejected (BLOCKED
651+
with ``auto_reject_blocked`` enabled), ``False`` otherwise.
652+
Metadata is populated with classification results on success.
653+
On error, metadata is left unchanged and ``False`` is returned
654+
(fail-safe: proceed to human review).
645655
"""
646656
try:
647-
from synthorg.security.safety_classifier import ( # noqa: PLC0415
648-
SafetyClassification,
649-
)
650-
651657
result = await self._safety_classifier.classify( # type: ignore[union-attr]
652658
verdict.reason,
653659
context.action_type,
@@ -658,15 +664,16 @@ async def _run_safety_classifier(
658664
metadata["stripped_description"] = result.stripped_description
659665
metadata["safety_reason"] = result.reason
660666

661-
if result.classification == SafetyClassification.BLOCKED:
662-
auto_reject = self._config.safety_classifier.auto_reject_blocked
663-
if auto_reject:
664-
logger.warning(
665-
SECURITY_SAFETY_CLASSIFY_BLOCKED,
666-
tool_name=context.tool_name,
667-
reason=result.reason,
668-
)
669-
return "blocked"
667+
if (
668+
result.classification == SafetyClassification.BLOCKED
669+
and self._config.safety_classifier.auto_reject_blocked
670+
):
671+
logger.warning(
672+
SECURITY_SAFETY_CLASSIFY_BLOCKED,
673+
tool_name=context.tool_name,
674+
reason=result.reason,
675+
)
676+
return True
670677
except (MemoryError, RecursionError):
671678
raise
672679
except Exception:
@@ -675,19 +682,17 @@ async def _run_safety_classifier(
675682
tool_name=context.tool_name,
676683
note="Safety classifier failed -- proceeding without classification",
677684
)
678-
return None
679-
else:
680-
return result.classification.value
685+
return False
681686

682687
async def _run_uncertainty_check(
683688
self,
684-
verdict: SecurityVerdict,
689+
prompt: str,
685690
metadata: dict[str, str],
686691
) -> None:
687692
"""Run the uncertainty checker and populate metadata."""
688693
try:
689694
result = await self._uncertainty_checker.check( # type: ignore[union-attr]
690-
verdict.reason,
695+
prompt,
691696
)
692697
metadata["confidence_score"] = str(result.confidence_score)
693698
if result.keyword_overlap is not None:
@@ -703,3 +708,4 @@ async def _run_uncertainty_check(
703708
SECURITY_UNCERTAINTY_CHECK_ERROR,
704709
note="Uncertainty check failed -- proceeding without score",
705710
)
711+
metadata["uncertainty_check_error"] = "true"

src/synthorg/security/uncertainty.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ async def check(self, prompt: str) -> UncertaintyResult:
306306
# Compute similarity metrics.
307307
keyword_overlap = _compute_keyword_overlap(responses)
308308
embedding_sim = _compute_tfidf_cosine_similarity(responses)
309-
confidence = 0.6 * embedding_sim + 0.4 * keyword_overlap
309+
confidence = min(1.0, 0.6 * embedding_sim + 0.4 * keyword_overlap)
310310

311311
if confidence < self._config.low_confidence_threshold:
312312
logger.warning(
@@ -354,6 +354,13 @@ async def _collect_responses(
354354
results: list[str] = []
355355

356356
async def _call_provider(candidate: ResolvedModel) -> str | None:
357+
"""Call a single provider.
358+
359+
Inside a TaskGroup, all exceptions must be caught to
360+
avoid ExceptionGroup propagation (even MemoryError /
361+
RecursionError -- re-raising them would wrap in an
362+
ExceptionGroup that escapes outer except clauses).
363+
"""
357364
driver: BaseCompletionProvider = self._registry.get(
358365
candidate.provider_name,
359366
)
@@ -366,8 +373,6 @@ async def _call_provider(candidate: ResolvedModel) -> str | None:
366373
),
367374
timeout=self._config.timeout_seconds,
368375
)
369-
except (MemoryError, RecursionError):
370-
raise
371376
except Exception:
372377
logger.exception(
373378
SECURITY_UNCERTAINTY_CHECK_ERROR,
@@ -376,7 +381,18 @@ async def _call_provider(candidate: ResolvedModel) -> str | None:
376381
)
377382
return None
378383
else:
379-
return response.content or ""
384+
# Filter empty/None content to avoid diluting
385+
# similarity metrics (e.g. content-filtered responses).
386+
text = response.content
387+
if not text:
388+
logger.debug(
389+
SECURITY_UNCERTAINTY_CHECK_ERROR,
390+
provider=candidate.provider_name,
391+
model=candidate.model_id,
392+
note="Provider returned empty content",
393+
)
394+
return None
395+
return text
380396

381397
async with asyncio.TaskGroup() as tg:
382398
tasks = [tg.create_task(_call_provider(c)) for c in candidates]

0 commit comments

Comments
 (0)