langfuse

Langfuse Python SDK
Installation
The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.
pip install langfuse
Docs
Please see our docs for detailed information on this SDK.
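For a quick check after installation, here is a minimal usage sketch. It assumes the `LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, and optionally `LANGFUSE_BASE_URL` environment variables are set, as described in the client reference below:

```python
from langfuse import Langfuse

# Credentials and base URL are read from environment variables if not passed explicitly
langfuse = Langfuse()

# Trace a unit of work; the span is ended automatically when the block exits
with langfuse.start_as_current_span(name="handle-request", input={"query": "hello"}) as span:
    result = "..."  # your application logic
    span.update(output=result)
```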
1""".. include:: ../README.md""" 2 3from langfuse.batch_evaluation import ( 4 BatchEvaluationResult, 5 BatchEvaluationResumeToken, 6 CompositeEvaluatorFunction, 7 EvaluatorInputs, 8 EvaluatorStats, 9 MapperFunction, 10) 11from langfuse.experiment import Evaluation 12 13from ._client import client as _client_module 14from ._client.attributes import LangfuseOtelSpanAttributes 15from ._client.constants import ObservationTypeLiteral 16from ._client.get_client import get_client 17from ._client.observe import observe 18from ._client.propagation import propagate_attributes 19from ._client.span import ( 20 LangfuseAgent, 21 LangfuseChain, 22 LangfuseEmbedding, 23 LangfuseEvaluator, 24 LangfuseEvent, 25 LangfuseGeneration, 26 LangfuseGuardrail, 27 LangfuseRetriever, 28 LangfuseSpan, 29 LangfuseTool, 30) 31 32Langfuse = _client_module.Langfuse 33 34__all__ = [ 35 "Langfuse", 36 "get_client", 37 "observe", 38 "propagate_attributes", 39 "ObservationTypeLiteral", 40 "LangfuseSpan", 41 "LangfuseGeneration", 42 "LangfuseEvent", 43 "LangfuseOtelSpanAttributes", 44 "LangfuseAgent", 45 "LangfuseTool", 46 "LangfuseChain", 47 "LangfuseEmbedding", 48 "LangfuseEvaluator", 49 "LangfuseRetriever", 50 "LangfuseGuardrail", 51 "Evaluation", 52 "EvaluatorInputs", 53 "MapperFunction", 54 "CompositeEvaluatorFunction", 55 "EvaluatorStats", 56 "BatchEvaluationResumeToken", 57 "BatchEvaluationResult", 58 "experiment", 59 "api", 60]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
        blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (`metadata.scope.name`).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to keep Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_span(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
    ):
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._project_id: Optional[str] = None
        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        # Initialize api and tracer if requirements are met
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
        )
        self._mask = self._resources.mask

        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api

    def start_span(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan:
        """Create a new span for tracing a unit of work.

        This method creates a new span but does not set it as the current span in the
        context. To create and use a span within a context, use start_as_current_span().

        The created span will be the child of the current span in the context.
341 342 Args: 343 trace_context: Optional context for connecting to an existing trace 344 name: Name of the span (e.g., function or operation name) 345 input: Input data for the operation (can be any JSON-serializable object) 346 output: Output data from the operation (can be any JSON-serializable object) 347 metadata: Additional metadata to associate with the span 348 version: Version identifier for the code or component 349 level: Importance level of the span (info, warning, error) 350 status_message: Optional status message for the span 351 352 Returns: 353 A LangfuseSpan object that must be ended with .end() when the operation completes 354 355 Example: 356 ```python 357 span = langfuse.start_span(name="process-data") 358 try: 359 # Do work 360 span.update(output="result") 361 finally: 362 span.end() 363 ``` 364 """ 365 return self.start_observation( 366 trace_context=trace_context, 367 name=name, 368 as_type="span", 369 input=input, 370 output=output, 371 metadata=metadata, 372 version=version, 373 level=level, 374 status_message=status_message, 375 ) 376 377 def start_as_current_span( 378 self, 379 *, 380 trace_context: Optional[TraceContext] = None, 381 name: str, 382 input: Optional[Any] = None, 383 output: Optional[Any] = None, 384 metadata: Optional[Any] = None, 385 version: Optional[str] = None, 386 level: Optional[SpanLevel] = None, 387 status_message: Optional[str] = None, 388 end_on_exit: Optional[bool] = None, 389 ) -> _AgnosticContextManager[LangfuseSpan]: 390 """Create a new span and set it as the current span in a context manager. 391 392 This method creates a new span and sets it as the current span within a context 393 manager. Use this method with a 'with' statement to automatically handle span 394 lifecycle within a code block. 395 396 The created span will be the child of the current span in the context. 397 398 Args: 399 trace_context: Optional context for connecting to an existing trace 400 name: Name of the span (e.g., function or operation name) 401 input: Input data for the operation (can be any JSON-serializable object) 402 output: Output data from the operation (can be any JSON-serializable object) 403 metadata: Additional metadata to associate with the span 404 version: Version identifier for the code or component 405 level: Importance level of the span (info, warning, error) 406 status_message: Optional status message for the span 407 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 
408 409 Returns: 410 A context manager that yields a LangfuseSpan 411 412 Example: 413 ```python 414 with langfuse.start_as_current_span(name="process-query") as span: 415 # Do work 416 result = process_data() 417 span.update(output=result) 418 419 # Create a child span automatically 420 with span.start_as_current_span(name="sub-operation") as child_span: 421 # Do sub-operation work 422 child_span.update(output="sub-result") 423 ``` 424 """ 425 return self.start_as_current_observation( 426 trace_context=trace_context, 427 name=name, 428 as_type="span", 429 input=input, 430 output=output, 431 metadata=metadata, 432 version=version, 433 level=level, 434 status_message=status_message, 435 end_on_exit=end_on_exit, 436 ) 437 438 @overload 439 def start_observation( 440 self, 441 *, 442 trace_context: Optional[TraceContext] = None, 443 name: str, 444 as_type: Literal["generation"], 445 input: Optional[Any] = None, 446 output: Optional[Any] = None, 447 metadata: Optional[Any] = None, 448 version: Optional[str] = None, 449 level: Optional[SpanLevel] = None, 450 status_message: Optional[str] = None, 451 completion_start_time: Optional[datetime] = None, 452 model: Optional[str] = None, 453 model_parameters: Optional[Dict[str, MapValue]] = None, 454 usage_details: Optional[Dict[str, int]] = None, 455 cost_details: Optional[Dict[str, float]] = None, 456 prompt: Optional[PromptClient] = None, 457 ) -> LangfuseGeneration: ... 458 459 @overload 460 def start_observation( 461 self, 462 *, 463 trace_context: Optional[TraceContext] = None, 464 name: str, 465 as_type: Literal["span"] = "span", 466 input: Optional[Any] = None, 467 output: Optional[Any] = None, 468 metadata: Optional[Any] = None, 469 version: Optional[str] = None, 470 level: Optional[SpanLevel] = None, 471 status_message: Optional[str] = None, 472 ) -> LangfuseSpan: ... 473 474 @overload 475 def start_observation( 476 self, 477 *, 478 trace_context: Optional[TraceContext] = None, 479 name: str, 480 as_type: Literal["agent"], 481 input: Optional[Any] = None, 482 output: Optional[Any] = None, 483 metadata: Optional[Any] = None, 484 version: Optional[str] = None, 485 level: Optional[SpanLevel] = None, 486 status_message: Optional[str] = None, 487 ) -> LangfuseAgent: ... 488 489 @overload 490 def start_observation( 491 self, 492 *, 493 trace_context: Optional[TraceContext] = None, 494 name: str, 495 as_type: Literal["tool"], 496 input: Optional[Any] = None, 497 output: Optional[Any] = None, 498 metadata: Optional[Any] = None, 499 version: Optional[str] = None, 500 level: Optional[SpanLevel] = None, 501 status_message: Optional[str] = None, 502 ) -> LangfuseTool: ... 503 504 @overload 505 def start_observation( 506 self, 507 *, 508 trace_context: Optional[TraceContext] = None, 509 name: str, 510 as_type: Literal["chain"], 511 input: Optional[Any] = None, 512 output: Optional[Any] = None, 513 metadata: Optional[Any] = None, 514 version: Optional[str] = None, 515 level: Optional[SpanLevel] = None, 516 status_message: Optional[str] = None, 517 ) -> LangfuseChain: ... 518 519 @overload 520 def start_observation( 521 self, 522 *, 523 trace_context: Optional[TraceContext] = None, 524 name: str, 525 as_type: Literal["retriever"], 526 input: Optional[Any] = None, 527 output: Optional[Any] = None, 528 metadata: Optional[Any] = None, 529 version: Optional[str] = None, 530 level: Optional[SpanLevel] = None, 531 status_message: Optional[str] = None, 532 ) -> LangfuseRetriever: ... 
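    # Illustrative sketch (not part of the SDK source): the start_observation method
    # implemented below these overloads returns a typed observation object that must be
    # ended manually with .end(), e.g.:
    #
    #     retriever = langfuse.start_observation(name="vector-search", as_type="retriever")
    #     try:
    #         retriever.update(output=documents)  # `documents` is a hypothetical result
    #     finally:
    #         retriever.end()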
533 534 @overload 535 def start_observation( 536 self, 537 *, 538 trace_context: Optional[TraceContext] = None, 539 name: str, 540 as_type: Literal["evaluator"], 541 input: Optional[Any] = None, 542 output: Optional[Any] = None, 543 metadata: Optional[Any] = None, 544 version: Optional[str] = None, 545 level: Optional[SpanLevel] = None, 546 status_message: Optional[str] = None, 547 ) -> LangfuseEvaluator: ... 548 549 @overload 550 def start_observation( 551 self, 552 *, 553 trace_context: Optional[TraceContext] = None, 554 name: str, 555 as_type: Literal["embedding"], 556 input: Optional[Any] = None, 557 output: Optional[Any] = None, 558 metadata: Optional[Any] = None, 559 version: Optional[str] = None, 560 level: Optional[SpanLevel] = None, 561 status_message: Optional[str] = None, 562 completion_start_time: Optional[datetime] = None, 563 model: Optional[str] = None, 564 model_parameters: Optional[Dict[str, MapValue]] = None, 565 usage_details: Optional[Dict[str, int]] = None, 566 cost_details: Optional[Dict[str, float]] = None, 567 prompt: Optional[PromptClient] = None, 568 ) -> LangfuseEmbedding: ... 569 570 @overload 571 def start_observation( 572 self, 573 *, 574 trace_context: Optional[TraceContext] = None, 575 name: str, 576 as_type: Literal["guardrail"], 577 input: Optional[Any] = None, 578 output: Optional[Any] = None, 579 metadata: Optional[Any] = None, 580 version: Optional[str] = None, 581 level: Optional[SpanLevel] = None, 582 status_message: Optional[str] = None, 583 ) -> LangfuseGuardrail: ... 584 585 def start_observation( 586 self, 587 *, 588 trace_context: Optional[TraceContext] = None, 589 name: str, 590 as_type: ObservationTypeLiteralNoEvent = "span", 591 input: Optional[Any] = None, 592 output: Optional[Any] = None, 593 metadata: Optional[Any] = None, 594 version: Optional[str] = None, 595 level: Optional[SpanLevel] = None, 596 status_message: Optional[str] = None, 597 completion_start_time: Optional[datetime] = None, 598 model: Optional[str] = None, 599 model_parameters: Optional[Dict[str, MapValue]] = None, 600 usage_details: Optional[Dict[str, int]] = None, 601 cost_details: Optional[Dict[str, float]] = None, 602 prompt: Optional[PromptClient] = None, 603 ) -> Union[ 604 LangfuseSpan, 605 LangfuseGeneration, 606 LangfuseAgent, 607 LangfuseTool, 608 LangfuseChain, 609 LangfuseRetriever, 610 LangfuseEvaluator, 611 LangfuseEmbedding, 612 LangfuseGuardrail, 613 ]: 614 """Create a new observation of the specified type. 615 616 This method creates a new observation but does not set it as the current span in the 617 context. To create and use an observation within a context, use start_as_current_observation(). 
618 619 Args: 620 trace_context: Optional context for connecting to an existing trace 621 name: Name of the observation 622 as_type: Type of observation to create (defaults to "span") 623 input: Input data for the operation 624 output: Output data from the operation 625 metadata: Additional metadata to associate with the observation 626 version: Version identifier for the code or component 627 level: Importance level of the observation 628 status_message: Optional status message for the observation 629 completion_start_time: When the model started generating (for generation types) 630 model: Name/identifier of the AI model used (for generation types) 631 model_parameters: Parameters used for the model (for generation types) 632 usage_details: Token usage information (for generation types) 633 cost_details: Cost information (for generation types) 634 prompt: Associated prompt template (for generation types) 635 636 Returns: 637 An observation object of the appropriate type that must be ended with .end() 638 """ 639 if trace_context: 640 trace_id = trace_context.get("trace_id", None) 641 parent_span_id = trace_context.get("parent_span_id", None) 642 643 if trace_id: 644 remote_parent_span = self._create_remote_parent_span( 645 trace_id=trace_id, parent_span_id=parent_span_id 646 ) 647 648 with otel_trace_api.use_span( 649 cast(otel_trace_api.Span, remote_parent_span) 650 ): 651 otel_span = self._otel_tracer.start_span(name=name) 652 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 653 654 return self._create_observation_from_otel_span( 655 otel_span=otel_span, 656 as_type=as_type, 657 input=input, 658 output=output, 659 metadata=metadata, 660 version=version, 661 level=level, 662 status_message=status_message, 663 completion_start_time=completion_start_time, 664 model=model, 665 model_parameters=model_parameters, 666 usage_details=usage_details, 667 cost_details=cost_details, 668 prompt=prompt, 669 ) 670 671 otel_span = self._otel_tracer.start_span(name=name) 672 673 return self._create_observation_from_otel_span( 674 otel_span=otel_span, 675 as_type=as_type, 676 input=input, 677 output=output, 678 metadata=metadata, 679 version=version, 680 level=level, 681 status_message=status_message, 682 completion_start_time=completion_start_time, 683 model=model, 684 model_parameters=model_parameters, 685 usage_details=usage_details, 686 cost_details=cost_details, 687 prompt=prompt, 688 ) 689 690 def _create_observation_from_otel_span( 691 self, 692 *, 693 otel_span: otel_trace_api.Span, 694 as_type: ObservationTypeLiteralNoEvent, 695 input: Optional[Any] = None, 696 output: Optional[Any] = None, 697 metadata: Optional[Any] = None, 698 version: Optional[str] = None, 699 level: Optional[SpanLevel] = None, 700 status_message: Optional[str] = None, 701 completion_start_time: Optional[datetime] = None, 702 model: Optional[str] = None, 703 model_parameters: Optional[Dict[str, MapValue]] = None, 704 usage_details: Optional[Dict[str, int]] = None, 705 cost_details: Optional[Dict[str, float]] = None, 706 prompt: Optional[PromptClient] = None, 707 ) -> Union[ 708 LangfuseSpan, 709 LangfuseGeneration, 710 LangfuseAgent, 711 LangfuseTool, 712 LangfuseChain, 713 LangfuseRetriever, 714 LangfuseEvaluator, 715 LangfuseEmbedding, 716 LangfuseGuardrail, 717 ]: 718 """Create the appropriate observation type from an OTEL span.""" 719 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 720 observation_class = self._get_span_class(as_type) 721 # Type ignore to prevent overloads of 
internal _get_span_class function, 722 # issue is that LangfuseEvent could be returned and that classes have diff. args 723 return observation_class( # type: ignore[return-value,call-arg] 724 otel_span=otel_span, 725 langfuse_client=self, 726 environment=self._environment, 727 input=input, 728 output=output, 729 metadata=metadata, 730 version=version, 731 level=level, 732 status_message=status_message, 733 completion_start_time=completion_start_time, 734 model=model, 735 model_parameters=model_parameters, 736 usage_details=usage_details, 737 cost_details=cost_details, 738 prompt=prompt, 739 ) 740 else: 741 # For other types (e.g. span, guardrail), create appropriate class without generation properties 742 observation_class = self._get_span_class(as_type) 743 # Type ignore to prevent overloads of internal _get_span_class function, 744 # issue is that LangfuseEvent could be returned and that classes have diff. args 745 return observation_class( # type: ignore[return-value,call-arg] 746 otel_span=otel_span, 747 langfuse_client=self, 748 environment=self._environment, 749 input=input, 750 output=output, 751 metadata=metadata, 752 version=version, 753 level=level, 754 status_message=status_message, 755 ) 756 # span._observation_type = as_type 757 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 758 # return span 759 760 def start_generation( 761 self, 762 *, 763 trace_context: Optional[TraceContext] = None, 764 name: str, 765 input: Optional[Any] = None, 766 output: Optional[Any] = None, 767 metadata: Optional[Any] = None, 768 version: Optional[str] = None, 769 level: Optional[SpanLevel] = None, 770 status_message: Optional[str] = None, 771 completion_start_time: Optional[datetime] = None, 772 model: Optional[str] = None, 773 model_parameters: Optional[Dict[str, MapValue]] = None, 774 usage_details: Optional[Dict[str, int]] = None, 775 cost_details: Optional[Dict[str, float]] = None, 776 prompt: Optional[PromptClient] = None, 777 ) -> LangfuseGeneration: 778 """Create a new generation span for model generations. 779 780 DEPRECATED: This method is deprecated and will be removed in a future version. 781 Use start_observation(as_type='generation') instead. 782 783 This method creates a specialized span for tracking model generations. 784 It includes additional fields specific to model generations such as model name, 785 token usage, and cost details. 786 787 The created generation span will be the child of the current span in the context. 
788 789 Args: 790 trace_context: Optional context for connecting to an existing trace 791 name: Name of the generation operation 792 input: Input data for the model (e.g., prompts) 793 output: Output from the model (e.g., completions) 794 metadata: Additional metadata to associate with the generation 795 version: Version identifier for the model or component 796 level: Importance level of the generation (info, warning, error) 797 status_message: Optional status message for the generation 798 completion_start_time: When the model started generating the response 799 model: Name/identifier of the AI model used (e.g., "gpt-4") 800 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 801 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 802 cost_details: Cost information for the model call 803 prompt: Associated prompt template from Langfuse prompt management 804 805 Returns: 806 A LangfuseGeneration object that must be ended with .end() when complete 807 808 Example: 809 ```python 810 generation = langfuse.start_generation( 811 name="answer-generation", 812 model="gpt-4", 813 input={"prompt": "Explain quantum computing"}, 814 model_parameters={"temperature": 0.7} 815 ) 816 try: 817 # Call model API 818 response = llm.generate(...) 819 820 generation.update( 821 output=response.text, 822 usage_details={ 823 "prompt_tokens": response.usage.prompt_tokens, 824 "completion_tokens": response.usage.completion_tokens 825 } 826 ) 827 finally: 828 generation.end() 829 ``` 830 """ 831 warnings.warn( 832 "start_generation is deprecated and will be removed in a future version. " 833 "Use start_observation(as_type='generation') instead.", 834 DeprecationWarning, 835 stacklevel=2, 836 ) 837 return self.start_observation( 838 trace_context=trace_context, 839 name=name, 840 as_type="generation", 841 input=input, 842 output=output, 843 metadata=metadata, 844 version=version, 845 level=level, 846 status_message=status_message, 847 completion_start_time=completion_start_time, 848 model=model, 849 model_parameters=model_parameters, 850 usage_details=usage_details, 851 cost_details=cost_details, 852 prompt=prompt, 853 ) 854 855 def start_as_current_generation( 856 self, 857 *, 858 trace_context: Optional[TraceContext] = None, 859 name: str, 860 input: Optional[Any] = None, 861 output: Optional[Any] = None, 862 metadata: Optional[Any] = None, 863 version: Optional[str] = None, 864 level: Optional[SpanLevel] = None, 865 status_message: Optional[str] = None, 866 completion_start_time: Optional[datetime] = None, 867 model: Optional[str] = None, 868 model_parameters: Optional[Dict[str, MapValue]] = None, 869 usage_details: Optional[Dict[str, int]] = None, 870 cost_details: Optional[Dict[str, float]] = None, 871 prompt: Optional[PromptClient] = None, 872 end_on_exit: Optional[bool] = None, 873 ) -> _AgnosticContextManager[LangfuseGeneration]: 874 """Create a new generation span and set it as the current span in a context manager. 875 876 DEPRECATED: This method is deprecated and will be removed in a future version. 877 Use start_as_current_observation(as_type='generation') instead. 878 879 This method creates a specialized span for model generations and sets it as the 880 current span within a context manager. Use this method with a 'with' statement to 881 automatically handle the generation span lifecycle within a code block. 882 883 The created generation span will be the child of the current span in the context. 
884 885 Args: 886 trace_context: Optional context for connecting to an existing trace 887 name: Name of the generation operation 888 input: Input data for the model (e.g., prompts) 889 output: Output from the model (e.g., completions) 890 metadata: Additional metadata to associate with the generation 891 version: Version identifier for the model or component 892 level: Importance level of the generation (info, warning, error) 893 status_message: Optional status message for the generation 894 completion_start_time: When the model started generating the response 895 model: Name/identifier of the AI model used (e.g., "gpt-4") 896 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 897 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 898 cost_details: Cost information for the model call 899 prompt: Associated prompt template from Langfuse prompt management 900 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 901 902 Returns: 903 A context manager that yields a LangfuseGeneration 904 905 Example: 906 ```python 907 with langfuse.start_as_current_generation( 908 name="answer-generation", 909 model="gpt-4", 910 input={"prompt": "Explain quantum computing"} 911 ) as generation: 912 # Call model API 913 response = llm.generate(...) 914 915 # Update with results 916 generation.update( 917 output=response.text, 918 usage_details={ 919 "prompt_tokens": response.usage.prompt_tokens, 920 "completion_tokens": response.usage.completion_tokens 921 } 922 ) 923 ``` 924 """ 925 warnings.warn( 926 "start_as_current_generation is deprecated and will be removed in a future version. " 927 "Use start_as_current_observation(as_type='generation') instead.", 928 DeprecationWarning, 929 stacklevel=2, 930 ) 931 return self.start_as_current_observation( 932 trace_context=trace_context, 933 name=name, 934 as_type="generation", 935 input=input, 936 output=output, 937 metadata=metadata, 938 version=version, 939 level=level, 940 status_message=status_message, 941 completion_start_time=completion_start_time, 942 model=model, 943 model_parameters=model_parameters, 944 usage_details=usage_details, 945 cost_details=cost_details, 946 prompt=prompt, 947 end_on_exit=end_on_exit, 948 ) 949 950 @overload 951 def start_as_current_observation( 952 self, 953 *, 954 trace_context: Optional[TraceContext] = None, 955 name: str, 956 as_type: Literal["generation"], 957 input: Optional[Any] = None, 958 output: Optional[Any] = None, 959 metadata: Optional[Any] = None, 960 version: Optional[str] = None, 961 level: Optional[SpanLevel] = None, 962 status_message: Optional[str] = None, 963 completion_start_time: Optional[datetime] = None, 964 model: Optional[str] = None, 965 model_parameters: Optional[Dict[str, MapValue]] = None, 966 usage_details: Optional[Dict[str, int]] = None, 967 cost_details: Optional[Dict[str, float]] = None, 968 prompt: Optional[PromptClient] = None, 969 end_on_exit: Optional[bool] = None, 970 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 
971 972 @overload 973 def start_as_current_observation( 974 self, 975 *, 976 trace_context: Optional[TraceContext] = None, 977 name: str, 978 as_type: Literal["span"] = "span", 979 input: Optional[Any] = None, 980 output: Optional[Any] = None, 981 metadata: Optional[Any] = None, 982 version: Optional[str] = None, 983 level: Optional[SpanLevel] = None, 984 status_message: Optional[str] = None, 985 end_on_exit: Optional[bool] = None, 986 ) -> _AgnosticContextManager[LangfuseSpan]: ... 987 988 @overload 989 def start_as_current_observation( 990 self, 991 *, 992 trace_context: Optional[TraceContext] = None, 993 name: str, 994 as_type: Literal["agent"], 995 input: Optional[Any] = None, 996 output: Optional[Any] = None, 997 metadata: Optional[Any] = None, 998 version: Optional[str] = None, 999 level: Optional[SpanLevel] = None, 1000 status_message: Optional[str] = None, 1001 end_on_exit: Optional[bool] = None, 1002 ) -> _AgnosticContextManager[LangfuseAgent]: ... 1003 1004 @overload 1005 def start_as_current_observation( 1006 self, 1007 *, 1008 trace_context: Optional[TraceContext] = None, 1009 name: str, 1010 as_type: Literal["tool"], 1011 input: Optional[Any] = None, 1012 output: Optional[Any] = None, 1013 metadata: Optional[Any] = None, 1014 version: Optional[str] = None, 1015 level: Optional[SpanLevel] = None, 1016 status_message: Optional[str] = None, 1017 end_on_exit: Optional[bool] = None, 1018 ) -> _AgnosticContextManager[LangfuseTool]: ... 1019 1020 @overload 1021 def start_as_current_observation( 1022 self, 1023 *, 1024 trace_context: Optional[TraceContext] = None, 1025 name: str, 1026 as_type: Literal["chain"], 1027 input: Optional[Any] = None, 1028 output: Optional[Any] = None, 1029 metadata: Optional[Any] = None, 1030 version: Optional[str] = None, 1031 level: Optional[SpanLevel] = None, 1032 status_message: Optional[str] = None, 1033 end_on_exit: Optional[bool] = None, 1034 ) -> _AgnosticContextManager[LangfuseChain]: ... 1035 1036 @overload 1037 def start_as_current_observation( 1038 self, 1039 *, 1040 trace_context: Optional[TraceContext] = None, 1041 name: str, 1042 as_type: Literal["retriever"], 1043 input: Optional[Any] = None, 1044 output: Optional[Any] = None, 1045 metadata: Optional[Any] = None, 1046 version: Optional[str] = None, 1047 level: Optional[SpanLevel] = None, 1048 status_message: Optional[str] = None, 1049 end_on_exit: Optional[bool] = None, 1050 ) -> _AgnosticContextManager[LangfuseRetriever]: ... 1051 1052 @overload 1053 def start_as_current_observation( 1054 self, 1055 *, 1056 trace_context: Optional[TraceContext] = None, 1057 name: str, 1058 as_type: Literal["evaluator"], 1059 input: Optional[Any] = None, 1060 output: Optional[Any] = None, 1061 metadata: Optional[Any] = None, 1062 version: Optional[str] = None, 1063 level: Optional[SpanLevel] = None, 1064 status_message: Optional[str] = None, 1065 end_on_exit: Optional[bool] = None, 1066 ) -> _AgnosticContextManager[LangfuseEvaluator]: ... 
1067 1068 @overload 1069 def start_as_current_observation( 1070 self, 1071 *, 1072 trace_context: Optional[TraceContext] = None, 1073 name: str, 1074 as_type: Literal["embedding"], 1075 input: Optional[Any] = None, 1076 output: Optional[Any] = None, 1077 metadata: Optional[Any] = None, 1078 version: Optional[str] = None, 1079 level: Optional[SpanLevel] = None, 1080 status_message: Optional[str] = None, 1081 completion_start_time: Optional[datetime] = None, 1082 model: Optional[str] = None, 1083 model_parameters: Optional[Dict[str, MapValue]] = None, 1084 usage_details: Optional[Dict[str, int]] = None, 1085 cost_details: Optional[Dict[str, float]] = None, 1086 prompt: Optional[PromptClient] = None, 1087 end_on_exit: Optional[bool] = None, 1088 ) -> _AgnosticContextManager[LangfuseEmbedding]: ... 1089 1090 @overload 1091 def start_as_current_observation( 1092 self, 1093 *, 1094 trace_context: Optional[TraceContext] = None, 1095 name: str, 1096 as_type: Literal["guardrail"], 1097 input: Optional[Any] = None, 1098 output: Optional[Any] = None, 1099 metadata: Optional[Any] = None, 1100 version: Optional[str] = None, 1101 level: Optional[SpanLevel] = None, 1102 status_message: Optional[str] = None, 1103 end_on_exit: Optional[bool] = None, 1104 ) -> _AgnosticContextManager[LangfuseGuardrail]: ... 1105 1106 def start_as_current_observation( 1107 self, 1108 *, 1109 trace_context: Optional[TraceContext] = None, 1110 name: str, 1111 as_type: ObservationTypeLiteralNoEvent = "span", 1112 input: Optional[Any] = None, 1113 output: Optional[Any] = None, 1114 metadata: Optional[Any] = None, 1115 version: Optional[str] = None, 1116 level: Optional[SpanLevel] = None, 1117 status_message: Optional[str] = None, 1118 completion_start_time: Optional[datetime] = None, 1119 model: Optional[str] = None, 1120 model_parameters: Optional[Dict[str, MapValue]] = None, 1121 usage_details: Optional[Dict[str, int]] = None, 1122 cost_details: Optional[Dict[str, float]] = None, 1123 prompt: Optional[PromptClient] = None, 1124 end_on_exit: Optional[bool] = None, 1125 ) -> Union[ 1126 _AgnosticContextManager[LangfuseGeneration], 1127 _AgnosticContextManager[LangfuseSpan], 1128 _AgnosticContextManager[LangfuseAgent], 1129 _AgnosticContextManager[LangfuseTool], 1130 _AgnosticContextManager[LangfuseChain], 1131 _AgnosticContextManager[LangfuseRetriever], 1132 _AgnosticContextManager[LangfuseEvaluator], 1133 _AgnosticContextManager[LangfuseEmbedding], 1134 _AgnosticContextManager[LangfuseGuardrail], 1135 ]: 1136 """Create a new observation and set it as the current span in a context manager. 1137 1138 This method creates a new observation of the specified type and sets it as the 1139 current span within a context manager. Use this method with a 'with' statement to 1140 automatically handle the observation lifecycle within a code block. 1141 1142 The created observation will be the child of the current span in the context. 
1143 1144 Args: 1145 trace_context: Optional context for connecting to an existing trace 1146 name: Name of the observation (e.g., function or operation name) 1147 as_type: Type of observation to create (defaults to "span") 1148 input: Input data for the operation (can be any JSON-serializable object) 1149 output: Output data from the operation (can be any JSON-serializable object) 1150 metadata: Additional metadata to associate with the observation 1151 version: Version identifier for the code or component 1152 level: Importance level of the observation (info, warning, error) 1153 status_message: Optional status message for the observation 1154 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 1155 1156 The following parameters are available when as_type is: "generation" or "embedding". 1157 completion_start_time: When the model started generating the response 1158 model: Name/identifier of the AI model used (e.g., "gpt-4") 1159 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1160 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1161 cost_details: Cost information for the model call 1162 prompt: Associated prompt template from Langfuse prompt management 1163 1164 Returns: 1165 A context manager that yields the appropriate observation type based on as_type 1166 1167 Example: 1168 ```python 1169 # Create a span 1170 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 1171 # Do work 1172 result = process_data() 1173 span.update(output=result) 1174 1175 # Create a child span automatically 1176 with span.start_as_current_span(name="sub-operation") as child_span: 1177 # Do sub-operation work 1178 child_span.update(output="sub-result") 1179 1180 # Create a tool observation 1181 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 1182 # Do tool work 1183 results = search_web(query) 1184 tool.update(output=results) 1185 1186 # Create a generation observation 1187 with langfuse.start_as_current_observation( 1188 name="answer-generation", 1189 as_type="generation", 1190 model="gpt-4" 1191 ) as generation: 1192 # Generate answer 1193 response = llm.generate(...) 
1194 generation.update(output=response) 1195 ``` 1196 """ 1197 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 1198 if trace_context: 1199 trace_id = trace_context.get("trace_id", None) 1200 parent_span_id = trace_context.get("parent_span_id", None) 1201 1202 if trace_id: 1203 remote_parent_span = self._create_remote_parent_span( 1204 trace_id=trace_id, parent_span_id=parent_span_id 1205 ) 1206 1207 return cast( 1208 Union[ 1209 _AgnosticContextManager[LangfuseGeneration], 1210 _AgnosticContextManager[LangfuseEmbedding], 1211 ], 1212 self._create_span_with_parent_context( 1213 as_type=as_type, 1214 name=name, 1215 remote_parent_span=remote_parent_span, 1216 parent=None, 1217 end_on_exit=end_on_exit, 1218 input=input, 1219 output=output, 1220 metadata=metadata, 1221 version=version, 1222 level=level, 1223 status_message=status_message, 1224 completion_start_time=completion_start_time, 1225 model=model, 1226 model_parameters=model_parameters, 1227 usage_details=usage_details, 1228 cost_details=cost_details, 1229 prompt=prompt, 1230 ), 1231 ) 1232 1233 return cast( 1234 Union[ 1235 _AgnosticContextManager[LangfuseGeneration], 1236 _AgnosticContextManager[LangfuseEmbedding], 1237 ], 1238 self._start_as_current_otel_span_with_processed_media( 1239 as_type=as_type, 1240 name=name, 1241 end_on_exit=end_on_exit, 1242 input=input, 1243 output=output, 1244 metadata=metadata, 1245 version=version, 1246 level=level, 1247 status_message=status_message, 1248 completion_start_time=completion_start_time, 1249 model=model, 1250 model_parameters=model_parameters, 1251 usage_details=usage_details, 1252 cost_details=cost_details, 1253 prompt=prompt, 1254 ), 1255 ) 1256 1257 if as_type in get_observation_types_list(ObservationTypeSpanLike): 1258 if trace_context: 1259 trace_id = trace_context.get("trace_id", None) 1260 parent_span_id = trace_context.get("parent_span_id", None) 1261 1262 if trace_id: 1263 remote_parent_span = self._create_remote_parent_span( 1264 trace_id=trace_id, parent_span_id=parent_span_id 1265 ) 1266 1267 return cast( 1268 Union[ 1269 _AgnosticContextManager[LangfuseSpan], 1270 _AgnosticContextManager[LangfuseAgent], 1271 _AgnosticContextManager[LangfuseTool], 1272 _AgnosticContextManager[LangfuseChain], 1273 _AgnosticContextManager[LangfuseRetriever], 1274 _AgnosticContextManager[LangfuseEvaluator], 1275 _AgnosticContextManager[LangfuseGuardrail], 1276 ], 1277 self._create_span_with_parent_context( 1278 as_type=as_type, 1279 name=name, 1280 remote_parent_span=remote_parent_span, 1281 parent=None, 1282 end_on_exit=end_on_exit, 1283 input=input, 1284 output=output, 1285 metadata=metadata, 1286 version=version, 1287 level=level, 1288 status_message=status_message, 1289 ), 1290 ) 1291 1292 return cast( 1293 Union[ 1294 _AgnosticContextManager[LangfuseSpan], 1295 _AgnosticContextManager[LangfuseAgent], 1296 _AgnosticContextManager[LangfuseTool], 1297 _AgnosticContextManager[LangfuseChain], 1298 _AgnosticContextManager[LangfuseRetriever], 1299 _AgnosticContextManager[LangfuseEvaluator], 1300 _AgnosticContextManager[LangfuseGuardrail], 1301 ], 1302 self._start_as_current_otel_span_with_processed_media( 1303 as_type=as_type, 1304 name=name, 1305 end_on_exit=end_on_exit, 1306 input=input, 1307 output=output, 1308 metadata=metadata, 1309 version=version, 1310 level=level, 1311 status_message=status_message, 1312 ), 1313 ) 1314 1315 # This should never be reached since all valid types are handled above 1316 langfuse_logger.warning( 1317 f"Unknown observation type: 
{as_type}, falling back to span" 1318 ) 1319 return self._start_as_current_otel_span_with_processed_media( 1320 as_type="span", 1321 name=name, 1322 end_on_exit=end_on_exit, 1323 input=input, 1324 output=output, 1325 metadata=metadata, 1326 version=version, 1327 level=level, 1328 status_message=status_message, 1329 ) 1330 1331 def _get_span_class( 1332 self, 1333 as_type: ObservationTypeLiteral, 1334 ) -> Union[ 1335 Type[LangfuseAgent], 1336 Type[LangfuseTool], 1337 Type[LangfuseChain], 1338 Type[LangfuseRetriever], 1339 Type[LangfuseEvaluator], 1340 Type[LangfuseEmbedding], 1341 Type[LangfuseGuardrail], 1342 Type[LangfuseGeneration], 1343 Type[LangfuseEvent], 1344 Type[LangfuseSpan], 1345 ]: 1346 """Get the appropriate span class based on as_type.""" 1347 normalized_type = as_type.lower() 1348 1349 if normalized_type == "agent": 1350 return LangfuseAgent 1351 elif normalized_type == "tool": 1352 return LangfuseTool 1353 elif normalized_type == "chain": 1354 return LangfuseChain 1355 elif normalized_type == "retriever": 1356 return LangfuseRetriever 1357 elif normalized_type == "evaluator": 1358 return LangfuseEvaluator 1359 elif normalized_type == "embedding": 1360 return LangfuseEmbedding 1361 elif normalized_type == "guardrail": 1362 return LangfuseGuardrail 1363 elif normalized_type == "generation": 1364 return LangfuseGeneration 1365 elif normalized_type == "event": 1366 return LangfuseEvent 1367 elif normalized_type == "span": 1368 return LangfuseSpan 1369 else: 1370 return LangfuseSpan 1371 1372 @_agnosticcontextmanager 1373 def _create_span_with_parent_context( 1374 self, 1375 *, 1376 name: str, 1377 parent: Optional[otel_trace_api.Span] = None, 1378 remote_parent_span: Optional[otel_trace_api.Span] = None, 1379 as_type: ObservationTypeLiteralNoEvent, 1380 end_on_exit: Optional[bool] = None, 1381 input: Optional[Any] = None, 1382 output: Optional[Any] = None, 1383 metadata: Optional[Any] = None, 1384 version: Optional[str] = None, 1385 level: Optional[SpanLevel] = None, 1386 status_message: Optional[str] = None, 1387 completion_start_time: Optional[datetime] = None, 1388 model: Optional[str] = None, 1389 model_parameters: Optional[Dict[str, MapValue]] = None, 1390 usage_details: Optional[Dict[str, int]] = None, 1391 cost_details: Optional[Dict[str, float]] = None, 1392 prompt: Optional[PromptClient] = None, 1393 ) -> Any: 1394 parent_span = parent or cast(otel_trace_api.Span, remote_parent_span) 1395 1396 with otel_trace_api.use_span(parent_span): 1397 with self._start_as_current_otel_span_with_processed_media( 1398 name=name, 1399 as_type=as_type, 1400 end_on_exit=end_on_exit, 1401 input=input, 1402 output=output, 1403 metadata=metadata, 1404 version=version, 1405 level=level, 1406 status_message=status_message, 1407 completion_start_time=completion_start_time, 1408 model=model, 1409 model_parameters=model_parameters, 1410 usage_details=usage_details, 1411 cost_details=cost_details, 1412 prompt=prompt, 1413 ) as langfuse_span: 1414 if remote_parent_span is not None: 1415 langfuse_span._otel_span.set_attribute( 1416 LangfuseOtelSpanAttributes.AS_ROOT, True 1417 ) 1418 1419 yield langfuse_span 1420 1421 @_agnosticcontextmanager 1422 def _start_as_current_otel_span_with_processed_media( 1423 self, 1424 *, 1425 name: str, 1426 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 1427 end_on_exit: Optional[bool] = None, 1428 input: Optional[Any] = None, 1429 output: Optional[Any] = None, 1430 metadata: Optional[Any] = None, 1431 version: Optional[str] = None, 1432 level: 
Optional[SpanLevel] = None, 1433 status_message: Optional[str] = None, 1434 completion_start_time: Optional[datetime] = None, 1435 model: Optional[str] = None, 1436 model_parameters: Optional[Dict[str, MapValue]] = None, 1437 usage_details: Optional[Dict[str, int]] = None, 1438 cost_details: Optional[Dict[str, float]] = None, 1439 prompt: Optional[PromptClient] = None, 1440 ) -> Any: 1441 with self._otel_tracer.start_as_current_span( 1442 name=name, 1443 end_on_exit=end_on_exit if end_on_exit is not None else True, 1444 ) as otel_span: 1445 span_class = self._get_span_class( 1446 as_type or "generation" 1447 ) # default was "generation" 1448 common_args = { 1449 "otel_span": otel_span, 1450 "langfuse_client": self, 1451 "environment": self._environment, 1452 "input": input, 1453 "output": output, 1454 "metadata": metadata, 1455 "version": version, 1456 "level": level, 1457 "status_message": status_message, 1458 } 1459 1460 if span_class in [ 1461 LangfuseGeneration, 1462 LangfuseEmbedding, 1463 ]: 1464 common_args.update( 1465 { 1466 "completion_start_time": completion_start_time, 1467 "model": model, 1468 "model_parameters": model_parameters, 1469 "usage_details": usage_details, 1470 "cost_details": cost_details, 1471 "prompt": prompt, 1472 } 1473 ) 1474 # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed 1475 1476 yield span_class(**common_args) # type: ignore[arg-type] 1477 1478 def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]: 1479 current_span = otel_trace_api.get_current_span() 1480 1481 if current_span is otel_trace_api.INVALID_SPAN: 1482 langfuse_logger.warning( 1483 "Context error: No active span in current context. Operations that depend on an active span will be skipped. " 1484 "Ensure spans are created with start_as_current_span() or that you're operating within an active span context." 1485 ) 1486 return None 1487 1488 return current_span 1489 1490 def update_current_generation( 1491 self, 1492 *, 1493 name: Optional[str] = None, 1494 input: Optional[Any] = None, 1495 output: Optional[Any] = None, 1496 metadata: Optional[Any] = None, 1497 version: Optional[str] = None, 1498 level: Optional[SpanLevel] = None, 1499 status_message: Optional[str] = None, 1500 completion_start_time: Optional[datetime] = None, 1501 model: Optional[str] = None, 1502 model_parameters: Optional[Dict[str, MapValue]] = None, 1503 usage_details: Optional[Dict[str, int]] = None, 1504 cost_details: Optional[Dict[str, float]] = None, 1505 prompt: Optional[PromptClient] = None, 1506 ) -> None: 1507 """Update the current active generation span with new information. 1508 1509 This method updates the current generation span in the active context with 1510 additional information. It's useful for adding output, usage stats, or other 1511 details that become available during or after model generation. 
1512 1513 Args: 1514 name: The generation name 1515 input: Updated input data for the model 1516 output: Output from the model (e.g., completions) 1517 metadata: Additional metadata to associate with the generation 1518 version: Version identifier for the model or component 1519 level: Importance level of the generation (info, warning, error) 1520 status_message: Optional status message for the generation 1521 completion_start_time: When the model started generating the response 1522 model: Name/identifier of the AI model used (e.g., "gpt-4") 1523 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1524 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1525 cost_details: Cost information for the model call 1526 prompt: Associated prompt template from Langfuse prompt management 1527 1528 Example: 1529 ```python 1530 with langfuse.start_as_current_generation(name="answer-query") as generation: 1531 # Initial setup and API call 1532 response = llm.generate(...) 1533 1534 # Update with results that weren't available at creation time 1535 langfuse.update_current_generation( 1536 output=response.text, 1537 usage_details={ 1538 "prompt_tokens": response.usage.prompt_tokens, 1539 "completion_tokens": response.usage.completion_tokens 1540 } 1541 ) 1542 ``` 1543 """ 1544 if not self._tracing_enabled: 1545 langfuse_logger.debug( 1546 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1547 ) 1548 return 1549 1550 current_otel_span = self._get_current_otel_span() 1551 1552 if current_otel_span is not None: 1553 generation = LangfuseGeneration( 1554 otel_span=current_otel_span, langfuse_client=self 1555 ) 1556 1557 if name: 1558 current_otel_span.update_name(name) 1559 1560 generation.update( 1561 input=input, 1562 output=output, 1563 metadata=metadata, 1564 version=version, 1565 level=level, 1566 status_message=status_message, 1567 completion_start_time=completion_start_time, 1568 model=model, 1569 model_parameters=model_parameters, 1570 usage_details=usage_details, 1571 cost_details=cost_details, 1572 prompt=prompt, 1573 ) 1574 1575 def update_current_span( 1576 self, 1577 *, 1578 name: Optional[str] = None, 1579 input: Optional[Any] = None, 1580 output: Optional[Any] = None, 1581 metadata: Optional[Any] = None, 1582 version: Optional[str] = None, 1583 level: Optional[SpanLevel] = None, 1584 status_message: Optional[str] = None, 1585 ) -> None: 1586 """Update the current active span with new information. 1587 1588 This method updates the current span in the active context with 1589 additional information. It's useful for adding outputs or metadata 1590 that become available during execution. 
1591 1592 Args: 1593 name: The span name 1594 input: Updated input data for the operation 1595 output: Output data from the operation 1596 metadata: Additional metadata to associate with the span 1597 version: Version identifier for the code or component 1598 level: Importance level of the span (info, warning, error) 1599 status_message: Optional status message for the span 1600 1601 Example: 1602 ```python 1603 with langfuse.start_as_current_span(name="process-data") as span: 1604 # Initial processing 1605 result = process_first_part() 1606 1607 # Update with intermediate results 1608 langfuse.update_current_span(metadata={"intermediate_result": result}) 1609 1610 # Continue processing 1611 final_result = process_second_part(result) 1612 1613 # Final update 1614 langfuse.update_current_span(output=final_result) 1615 ``` 1616 """ 1617 if not self._tracing_enabled: 1618 langfuse_logger.debug( 1619 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1620 ) 1621 return 1622 1623 current_otel_span = self._get_current_otel_span() 1624 1625 if current_otel_span is not None: 1626 span = LangfuseSpan( 1627 otel_span=current_otel_span, 1628 langfuse_client=self, 1629 environment=self._environment, 1630 ) 1631 1632 if name: 1633 current_otel_span.update_name(name) 1634 1635 span.update( 1636 input=input, 1637 output=output, 1638 metadata=metadata, 1639 version=version, 1640 level=level, 1641 status_message=status_message, 1642 ) 1643 1644 def update_current_trace( 1645 self, 1646 *, 1647 name: Optional[str] = None, 1648 user_id: Optional[str] = None, 1649 session_id: Optional[str] = None, 1650 version: Optional[str] = None, 1651 input: Optional[Any] = None, 1652 output: Optional[Any] = None, 1653 metadata: Optional[Any] = None, 1654 tags: Optional[List[str]] = None, 1655 public: Optional[bool] = None, 1656 ) -> None: 1657 """Update the current trace with additional information. 1658 1659 Args: 1660 name: Updated name for the Langfuse trace 1661 user_id: ID of the user who initiated the Langfuse trace 1662 session_id: Session identifier for grouping related Langfuse traces 1663 version: Version identifier for the application or service 1664 input: Input data for the overall Langfuse trace 1665 output: Output data from the overall Langfuse trace 1666 metadata: Additional metadata to associate with the Langfuse trace 1667 tags: List of tags to categorize the Langfuse trace 1668 public: Whether the Langfuse trace should be publicly accessible 1669 1670 See Also: 1671 :func:`langfuse.propagate_attributes`: Recommended replacement 1672 """ 1673 if not self._tracing_enabled: 1674 langfuse_logger.debug( 1675 "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode." 
1676 ) 1677 return 1678 1679 current_otel_span = self._get_current_otel_span() 1680 1681 if current_otel_span is not None and current_otel_span.is_recording(): 1682 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1683 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1684 ) 1685 # We need to preserve the class to keep the correct observation type 1686 span_class = self._get_span_class(existing_observation_type) 1687 span = span_class( 1688 otel_span=current_otel_span, 1689 langfuse_client=self, 1690 environment=self._environment, 1691 ) 1692 1693 span.update_trace( 1694 name=name, 1695 user_id=user_id, 1696 session_id=session_id, 1697 version=version, 1698 input=input, 1699 output=output, 1700 metadata=metadata, 1701 tags=tags, 1702 public=public, 1703 ) 1704 1705 def create_event( 1706 self, 1707 *, 1708 trace_context: Optional[TraceContext] = None, 1709 name: str, 1710 input: Optional[Any] = None, 1711 output: Optional[Any] = None, 1712 metadata: Optional[Any] = None, 1713 version: Optional[str] = None, 1714 level: Optional[SpanLevel] = None, 1715 status_message: Optional[str] = None, 1716 ) -> LangfuseEvent: 1717 """Create a new Langfuse observation of type 'EVENT'. 1718 1719 The created Langfuse Event observation will be the child of the current span in the context. 1720 1721 Args: 1722 trace_context: Optional context for connecting to an existing trace 1723 name: Name of the span (e.g., function or operation name) 1724 input: Input data for the operation (can be any JSON-serializable object) 1725 output: Output data from the operation (can be any JSON-serializable object) 1726 metadata: Additional metadata to associate with the span 1727 version: Version identifier for the code or component 1728 level: Importance level of the span (info, warning, error) 1729 status_message: Optional status message for the span 1730 1731 Returns: 1732 The Langfuse Event object 1733 1734 Example: 1735 ```python 1736 event = langfuse.create_event(name="process-event") 1737 ``` 1738 """ 1739 timestamp = time_ns() 1740 1741 if trace_context: 1742 trace_id = trace_context.get("trace_id", None) 1743 parent_span_id = trace_context.get("parent_span_id", None) 1744 1745 if trace_id: 1746 remote_parent_span = self._create_remote_parent_span( 1747 trace_id=trace_id, parent_span_id=parent_span_id 1748 ) 1749 1750 with otel_trace_api.use_span( 1751 cast(otel_trace_api.Span, remote_parent_span) 1752 ): 1753 otel_span = self._otel_tracer.start_span( 1754 name=name, start_time=timestamp 1755 ) 1756 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1757 1758 return cast( 1759 LangfuseEvent, 1760 LangfuseEvent( 1761 otel_span=otel_span, 1762 langfuse_client=self, 1763 environment=self._environment, 1764 input=input, 1765 output=output, 1766 metadata=metadata, 1767 version=version, 1768 level=level, 1769 status_message=status_message, 1770 ).end(end_time=timestamp), 1771 ) 1772 1773 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1774 1775 return cast( 1776 LangfuseEvent, 1777 LangfuseEvent( 1778 otel_span=otel_span, 1779 langfuse_client=self, 1780 environment=self._environment, 1781 input=input, 1782 output=output, 1783 metadata=metadata, 1784 version=version, 1785 level=level, 1786 status_message=status_message, 1787 ).end(end_time=timestamp), 1788 ) 1789 1790 def _create_remote_parent_span( 1791 self, *, trace_id: str, parent_span_id: Optional[str] 1792 ) -> Any: 1793 if not self._is_valid_trace_id(trace_id): 1794 
langfuse_logger.warning( 1795 f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID." 1796 ) 1797 1798 if parent_span_id and not self._is_valid_span_id(parent_span_id): 1799 langfuse_logger.warning( 1800 f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID." 1801 ) 1802 1803 int_trace_id = int(trace_id, 16) 1804 int_parent_span_id = ( 1805 int(parent_span_id, 16) 1806 if parent_span_id 1807 else RandomIdGenerator().generate_span_id() 1808 ) 1809 1810 span_context = otel_trace_api.SpanContext( 1811 trace_id=int_trace_id, 1812 span_id=int_parent_span_id, 1813 trace_flags=otel_trace_api.TraceFlags(0x01), # mark span as sampled 1814 is_remote=False, 1815 ) 1816 1817 return otel_trace_api.NonRecordingSpan(span_context) 1818 1819 def _is_valid_trace_id(self, trace_id: str) -> bool: 1820 pattern = r"^[0-9a-f]{32}$" 1821 1822 return bool(re.match(pattern, trace_id)) 1823 1824 def _is_valid_span_id(self, span_id: str) -> bool: 1825 pattern = r"^[0-9a-f]{16}$" 1826 1827 return bool(re.match(pattern, span_id)) 1828 1829 def _create_observation_id(self, *, seed: Optional[str] = None) -> str: 1830 """Create a unique observation ID for use with Langfuse. 1831 1832 This method generates a unique observation ID (span ID in OpenTelemetry terms) 1833 for use with various Langfuse APIs. It can either generate a random ID or 1834 create a deterministic ID based on a seed string. 1835 1836 Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes. 1837 This method ensures the generated ID meets this requirement. If you need to 1838 correlate an external ID with a Langfuse observation ID, use the external ID as 1839 the seed to get a valid, deterministic observation ID. 1840 1841 Args: 1842 seed: Optional string to use as a seed for deterministic ID generation. 1843 If provided, the same seed will always produce the same ID. 1844 If not provided, a random ID will be generated. 1845 1846 Returns: 1847 A 16-character lowercase hexadecimal string representing the observation ID. 1848 1849 Example: 1850 ```python 1851 # Generate a random observation ID 1852 obs_id = langfuse.create_observation_id() 1853 1854 # Generate a deterministic ID based on a seed 1855 user_obs_id = langfuse.create_observation_id(seed="user-123-feedback") 1856 1857 # Correlate an external item ID with a Langfuse observation ID 1858 item_id = "item-789012" 1859 correlated_obs_id = langfuse.create_observation_id(seed=item_id) 1860 1861 # Use the ID with Langfuse APIs 1862 langfuse.create_score( 1863 name="relevance", 1864 value=0.95, 1865 trace_id=trace_id, 1866 observation_id=obs_id 1867 ) 1868 ``` 1869 """ 1870 if not seed: 1871 span_id_int = RandomIdGenerator().generate_span_id() 1872 1873 return self._format_otel_span_id(span_id_int) 1874 1875 return sha256(seed.encode("utf-8")).digest()[:8].hex() 1876 1877 @staticmethod 1878 def create_trace_id(*, seed: Optional[str] = None) -> str: 1879 """Create a unique trace ID for use with Langfuse. 1880 1881 This method generates a unique trace ID for use with various Langfuse APIs. 1882 It can either generate a random ID or create a deterministic ID based on 1883 a seed string. 1884 1885 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1886 This method ensures the generated ID meets this requirement. 
If you need to 1887 correlate an external ID with a Langfuse trace ID, use the external ID as the 1888 seed to get a valid, deterministic Langfuse trace ID. 1889 1890 Args: 1891 seed: Optional string to use as a seed for deterministic ID generation. 1892 If provided, the same seed will always produce the same ID. 1893 If not provided, a random ID will be generated. 1894 1895 Returns: 1896 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1897 1898 Example: 1899 ```python 1900 # Generate a random trace ID 1901 trace_id = langfuse.create_trace_id() 1902 1903 # Generate a deterministic ID based on a seed 1904 session_trace_id = langfuse.create_trace_id(seed="session-456") 1905 1906 # Correlate an external ID with a Langfuse trace ID 1907 external_id = "external-system-123456" 1908 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1909 1910 # Use the ID with trace context 1911 with langfuse.start_as_current_span( 1912 name="process-request", 1913 trace_context={"trace_id": trace_id} 1914 ) as span: 1915 # Operation will be part of the specific trace 1916 pass 1917 ``` 1918 """ 1919 if not seed: 1920 trace_id_int = RandomIdGenerator().generate_trace_id() 1921 1922 return Langfuse._format_otel_trace_id(trace_id_int) 1923 1924 return sha256(seed.encode("utf-8")).digest()[:16].hex() 1925 1926 def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str: 1927 span_context = otel_span.get_span_context() 1928 1929 return self._format_otel_trace_id(span_context.trace_id) 1930 1931 def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str: 1932 span_context = otel_span.get_span_context() 1933 1934 return self._format_otel_span_id(span_context.span_id) 1935 1936 @staticmethod 1937 def _format_otel_span_id(span_id_int: int) -> str: 1938 """Format an integer span ID to a 16-character lowercase hex string. 1939 1940 Internal method to convert an OpenTelemetry integer span ID to the standard 1941 W3C Trace Context format (16-character lowercase hex string). 1942 1943 Args: 1944 span_id_int: 64-bit integer representing a span ID 1945 1946 Returns: 1947 A 16-character lowercase hexadecimal string 1948 """ 1949 return format(span_id_int, "016x") 1950 1951 @staticmethod 1952 def _format_otel_trace_id(trace_id_int: int) -> str: 1953 """Format an integer trace ID to a 32-character lowercase hex string. 1954 1955 Internal method to convert an OpenTelemetry integer trace ID to the standard 1956 W3C Trace Context format (32-character lowercase hex string). 1957 1958 Args: 1959 trace_id_int: 128-bit integer representing a trace ID 1960 1961 Returns: 1962 A 32-character lowercase hexadecimal string 1963 """ 1964 return format(trace_id_int, "032x") 1965 1966 @overload 1967 def create_score( 1968 self, 1969 *, 1970 name: str, 1971 value: float, 1972 session_id: Optional[str] = None, 1973 dataset_run_id: Optional[str] = None, 1974 trace_id: Optional[str] = None, 1975 observation_id: Optional[str] = None, 1976 score_id: Optional[str] = None, 1977 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1978 comment: Optional[str] = None, 1979 config_id: Optional[str] = None, 1980 metadata: Optional[Any] = None, 1981 timestamp: Optional[datetime] = None, 1982 ) -> None: ... 
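The ID helpers above (`create_trace_id`, `_create_observation_id`, and the `_format_otel_*` formatters) make seeded IDs fully deterministic, so an external identifier can be mapped to a valid Langfuse trace ID before any span exists. A minimal sketch, assuming only the derivation stated in the docstrings (sha256 of the seed, truncated and hex-encoded):

```python
from hashlib import sha256

from langfuse import Langfuse

external_id = "external-system-123456"

# create_trace_id is a staticmethod; with a seed it is documented to return
# sha256(seed)[:16].hex(), i.e. 32 lowercase hex characters (W3C trace ID format).
trace_id = Langfuse.create_trace_id(seed=external_id)

assert trace_id == sha256(external_id.encode("utf-8")).digest()[:16].hex()
assert len(trace_id) == 32
```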
1983 1984 @overload 1985 def create_score( 1986 self, 1987 *, 1988 name: str, 1989 value: str, 1990 session_id: Optional[str] = None, 1991 dataset_run_id: Optional[str] = None, 1992 trace_id: Optional[str] = None, 1993 score_id: Optional[str] = None, 1994 observation_id: Optional[str] = None, 1995 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 1996 comment: Optional[str] = None, 1997 config_id: Optional[str] = None, 1998 metadata: Optional[Any] = None, 1999 timestamp: Optional[datetime] = None, 2000 ) -> None: ... 2001 2002 def create_score( 2003 self, 2004 *, 2005 name: str, 2006 value: Union[float, str], 2007 session_id: Optional[str] = None, 2008 dataset_run_id: Optional[str] = None, 2009 trace_id: Optional[str] = None, 2010 observation_id: Optional[str] = None, 2011 score_id: Optional[str] = None, 2012 data_type: Optional[ScoreDataType] = None, 2013 comment: Optional[str] = None, 2014 config_id: Optional[str] = None, 2015 metadata: Optional[Any] = None, 2016 timestamp: Optional[datetime] = None, 2017 ) -> None: 2018 """Create a score for a specific trace or observation. 2019 2020 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 2021 used to track quality metrics, user feedback, or automated evaluations. 2022 2023 Args: 2024 name: Name of the score (e.g., "relevance", "accuracy") 2025 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2026 session_id: ID of the Langfuse session to associate the score with 2027 dataset_run_id: ID of the Langfuse dataset run to associate the score with 2028 trace_id: ID of the Langfuse trace to associate the score with 2029 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 2030 score_id: Optional custom ID for the score (auto-generated if not provided) 2031 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2032 comment: Optional comment or explanation for the score 2033 config_id: Optional ID of a score config defined in Langfuse 2034 metadata: Optional metadata to be attached to the score 2035 timestamp: Optional timestamp for the score (defaults to current UTC time) 2036 2037 Example: 2038 ```python 2039 # Create a numeric score for accuracy 2040 langfuse.create_score( 2041 name="accuracy", 2042 value=0.92, 2043 trace_id="abcdef1234567890abcdef1234567890", 2044 data_type="NUMERIC", 2045 comment="High accuracy with minor irrelevant details" 2046 ) 2047 2048 # Create a categorical score for sentiment 2049 langfuse.create_score( 2050 name="sentiment", 2051 value="positive", 2052 trace_id="abcdef1234567890abcdef1234567890", 2053 observation_id="abcdef1234567890", 2054 data_type="CATEGORICAL" 2055 ) 2056 ``` 2057 """ 2058 if not self._tracing_enabled: 2059 return 2060 2061 score_id = score_id or self._create_observation_id() 2062 2063 try: 2064 new_body = ScoreBody( 2065 id=score_id, 2066 sessionId=session_id, 2067 datasetRunId=dataset_run_id, 2068 traceId=trace_id, 2069 observationId=observation_id, 2070 name=name, 2071 value=value, 2072 dataType=data_type, # type: ignore 2073 comment=comment, 2074 configId=config_id, 2075 environment=self._environment, 2076 metadata=metadata, 2077 ) 2078 2079 event = { 2080 "id": self.create_trace_id(), 2081 "type": "score-create", 2082 "timestamp": timestamp or _get_timestamp(), 2083 "body": new_body, 2084 } 2085 2086 if self._resources is not None: 2087 # Force the score to be in sample if it was for a legacy trace ID, i.e. 
non-32 hexchar 2088 force_sample = ( 2089 not self._is_valid_trace_id(trace_id) if trace_id else True 2090 ) 2091 2092 self._resources.add_score_task( 2093 event, 2094 force_sample=force_sample, 2095 ) 2096 2097 except Exception as e: 2098 langfuse_logger.exception( 2099 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}" 2100 ) 2101 2102 @overload 2103 def score_current_span( 2104 self, 2105 *, 2106 name: str, 2107 value: float, 2108 score_id: Optional[str] = None, 2109 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 2110 comment: Optional[str] = None, 2111 config_id: Optional[str] = None, 2112 ) -> None: ... 2113 2114 @overload 2115 def score_current_span( 2116 self, 2117 *, 2118 name: str, 2119 value: str, 2120 score_id: Optional[str] = None, 2121 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 2122 comment: Optional[str] = None, 2123 config_id: Optional[str] = None, 2124 ) -> None: ... 2125 2126 def score_current_span( 2127 self, 2128 *, 2129 name: str, 2130 value: Union[float, str], 2131 score_id: Optional[str] = None, 2132 data_type: Optional[ScoreDataType] = None, 2133 comment: Optional[str] = None, 2134 config_id: Optional[str] = None, 2135 ) -> None: 2136 """Create a score for the current active span. 2137 2138 This method scores the currently active span in the context. It's a convenient 2139 way to score the current operation without needing to know its trace and span IDs. 2140 2141 Args: 2142 name: Name of the score (e.g., "relevance", "accuracy") 2143 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2144 score_id: Optional custom ID for the score (auto-generated if not provided) 2145 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2146 comment: Optional comment or explanation for the score 2147 config_id: Optional ID of a score config defined in Langfuse 2148 2149 Example: 2150 ```python 2151 with langfuse.start_as_current_generation(name="answer-query") as generation: 2152 # Generate answer 2153 response = generate_answer(...) 2154 generation.update(output=response) 2155 2156 # Score the generation 2157 langfuse.score_current_span( 2158 name="relevance", 2159 value=0.85, 2160 data_type="NUMERIC", 2161 comment="Mostly relevant but contains some tangential information" 2162 ) 2163 ``` 2164 """ 2165 current_span = self._get_current_otel_span() 2166 2167 if current_span is not None: 2168 trace_id = self._get_otel_trace_id(current_span) 2169 observation_id = self._get_otel_span_id(current_span) 2170 2171 langfuse_logger.info( 2172 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 2173 ) 2174 2175 self.create_score( 2176 trace_id=trace_id, 2177 observation_id=observation_id, 2178 name=name, 2179 value=cast(str, value), 2180 score_id=score_id, 2181 data_type=cast(Literal["CATEGORICAL"], data_type), 2182 comment=comment, 2183 config_id=config_id, 2184 ) 2185 2186 @overload 2187 def score_current_trace( 2188 self, 2189 *, 2190 name: str, 2191 value: float, 2192 score_id: Optional[str] = None, 2193 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 2194 comment: Optional[str] = None, 2195 config_id: Optional[str] = None, 2196 ) -> None: ... 
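`create_score` above also accepts `session_id` and `dataset_run_id` targets, which the docstring examples do not show. A brief sketch of a session-level score; the call shape follows the signature above, while the specific IDs are placeholders:

```python
from langfuse import Langfuse

langfuse = Langfuse()  # credentials via LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY

# Attach a score to a whole session instead of a single trace (placeholder ID)
langfuse.create_score(
    name="session_quality",
    value=0.8,
    session_id="session-456",
    data_type="NUMERIC",
    comment="Aggregated quality across all traces in the session",
)

# Scores are batched; flush before a short-lived script exits
langfuse.flush()
```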
2197 2198 @overload 2199 def score_current_trace( 2200 self, 2201 *, 2202 name: str, 2203 value: str, 2204 score_id: Optional[str] = None, 2205 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 2206 comment: Optional[str] = None, 2207 config_id: Optional[str] = None, 2208 ) -> None: ... 2209 2210 def score_current_trace( 2211 self, 2212 *, 2213 name: str, 2214 value: Union[float, str], 2215 score_id: Optional[str] = None, 2216 data_type: Optional[ScoreDataType] = None, 2217 comment: Optional[str] = None, 2218 config_id: Optional[str] = None, 2219 ) -> None: 2220 """Create a score for the current trace. 2221 2222 This method scores the trace of the currently active span. Unlike score_current_span, 2223 this method associates the score with the entire trace rather than a specific span. 2224 It's useful for scoring overall performance or quality of the entire operation. 2225 2226 Args: 2227 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2228 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2229 score_id: Optional custom ID for the score (auto-generated if not provided) 2230 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2231 comment: Optional comment or explanation for the score 2232 config_id: Optional ID of a score config defined in Langfuse 2233 2234 Example: 2235 ```python 2236 with langfuse.start_as_current_span(name="process-user-request") as span: 2237 # Process request 2238 result = process_complete_request() 2239 span.update(output=result) 2240 2241 # Score the overall trace 2242 langfuse.score_current_trace( 2243 name="overall_quality", 2244 value=0.95, 2245 data_type="NUMERIC", 2246 comment="High quality end-to-end response" 2247 ) 2248 ``` 2249 """ 2250 current_span = self._get_current_otel_span() 2251 2252 if current_span is not None: 2253 trace_id = self._get_otel_trace_id(current_span) 2254 2255 langfuse_logger.info( 2256 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2257 ) 2258 2259 self.create_score( 2260 trace_id=trace_id, 2261 name=name, 2262 value=cast(str, value), 2263 score_id=score_id, 2264 data_type=cast(Literal["CATEGORICAL"], data_type), 2265 comment=comment, 2266 config_id=config_id, 2267 ) 2268 2269 def flush(self) -> None: 2270 """Force flush all pending spans and events to the Langfuse API. 2271 2272 This method manually flushes any pending spans, scores, and other events to the 2273 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2274 before proceeding, without waiting for the automatic flush interval. 2275 2276 Example: 2277 ```python 2278 # Record some spans and scores 2279 with langfuse.start_as_current_span(name="operation") as span: 2280 # Do work... 2281 pass 2282 2283 # Ensure all data is sent to Langfuse before proceeding 2284 langfuse.flush() 2285 2286 # Continue with other work 2287 ``` 2288 """ 2289 if self._resources is not None: 2290 self._resources.flush() 2291 2292 def shutdown(self) -> None: 2293 """Shut down the Langfuse client and flush all pending data. 2294 2295 This method cleanly shuts down the Langfuse client, ensuring all pending data 2296 is flushed to the API and all background threads are properly terminated. 2297 2298 It's important to call this method when your application is shutting down to 2299 prevent data loss and resource leaks. For most applications, using the client 2300 as a context manager or relying on the automatic shutdown via atexit is sufficient. 
2301 2302 Example: 2303 ```python 2304 # Initialize Langfuse 2305 langfuse = Langfuse(public_key="...", secret_key="...") 2306 2307 # Use Langfuse throughout your application 2308 # ... 2309 2310 # When application is shutting down 2311 langfuse.shutdown() 2312 ``` 2313 """ 2314 if self._resources is not None: 2315 self._resources.shutdown() 2316 2317 def get_current_trace_id(self) -> Optional[str]: 2318 """Get the trace ID of the current active span. 2319 2320 This method retrieves the trace ID from the currently active span in the context. 2321 It can be used to get the trace ID for referencing in logs, external systems, 2322 or for creating related operations. 2323 2324 Returns: 2325 The current trace ID as a 32-character lowercase hexadecimal string, 2326 or None if there is no active span. 2327 2328 Example: 2329 ```python 2330 with langfuse.start_as_current_span(name="process-request") as span: 2331 # Get the current trace ID for reference 2332 trace_id = langfuse.get_current_trace_id() 2333 2334 # Use it for external correlation 2335 log.info(f"Processing request with trace_id: {trace_id}") 2336 2337 # Or pass to another system 2338 external_system.process(data, trace_id=trace_id) 2339 ``` 2340 """ 2341 if not self._tracing_enabled: 2342 langfuse_logger.debug( 2343 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2344 ) 2345 return None 2346 2347 current_otel_span = self._get_current_otel_span() 2348 2349 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None 2350 2351 def get_current_observation_id(self) -> Optional[str]: 2352 """Get the observation ID (span ID) of the current active span. 2353 2354 This method retrieves the observation ID from the currently active span in the context. 2355 It can be used to get the observation ID for referencing in logs, external systems, 2356 or for creating scores or other related operations. 2357 2358 Returns: 2359 The current observation ID as a 16-character lowercase hexadecimal string, 2360 or None if there is no active span. 2361 2362 Example: 2363 ```python 2364 with langfuse.start_as_current_span(name="process-user-query") as span: 2365 # Get the current observation ID 2366 observation_id = langfuse.get_current_observation_id() 2367 2368 # Store it for later reference 2369 cache.set(f"query_{query_id}_observation", observation_id) 2370 2371 # Process the query... 2372 ``` 2373 """ 2374 if not self._tracing_enabled: 2375 langfuse_logger.debug( 2376 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2377 ) 2378 return None 2379 2380 current_otel_span = self._get_current_otel_span() 2381 2382 return self._get_otel_span_id(current_otel_span) if current_otel_span else None 2383 2384 def _get_project_id(self) -> Optional[str]: 2385 """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys.""" 2386 if not self._project_id: 2387 proj = self.api.projects.get() 2388 if not proj.data or not proj.data[0].id: 2389 return None 2390 2391 self._project_id = proj.data[0].id 2392 2393 return self._project_id 2394 2395 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2396 """Get the URL to view a trace in the Langfuse UI. 2397 2398 This method generates a URL that links directly to a trace in the Langfuse UI. 2399 It's useful for providing links in logs, notifications, or debugging tools. 
2400 2401 Args: 2402 trace_id: Optional trace ID to generate a URL for. If not provided, 2403 the trace ID of the current active span will be used. 2404 2405 Returns: 2406 A URL string pointing to the trace in the Langfuse UI, 2407 or None if the project ID couldn't be retrieved or no trace ID is available. 2408 2409 Example: 2410 ```python 2411 # Get URL for the current trace 2412 with langfuse.start_as_current_span(name="process-request") as span: 2413 trace_url = langfuse.get_trace_url() 2414 log.info(f"Processing trace: {trace_url}") 2415 2416 # Get URL for a specific trace 2417 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2418 send_notification(f"Review needed for trace: {specific_trace_url}") 2419 ``` 2420 """ 2421 project_id = self._get_project_id() 2422 final_trace_id = trace_id or self.get_current_trace_id() 2423 2424 return ( 2425 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2426 if project_id and final_trace_id 2427 else None 2428 ) 2429 2430 def get_dataset( 2431 self, name: str, *, fetch_items_page_size: Optional[int] = 50 2432 ) -> "DatasetClient": 2433 """Fetch a dataset by its name. 2434 2435 Args: 2436 name (str): The name of the dataset to fetch. 2437 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2438 2439 Returns: 2440 DatasetClient: The dataset with the given name. 2441 """ 2442 try: 2443 langfuse_logger.debug(f"Getting datasets {name}") 2444 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2445 2446 dataset_items = [] 2447 page = 1 2448 2449 while True: 2450 new_items = self.api.dataset_items.list( 2451 dataset_name=self._url_encode(name, is_url_param=True), 2452 page=page, 2453 limit=fetch_items_page_size, 2454 ) 2455 dataset_items.extend(new_items.data) 2456 2457 if new_items.meta.total_pages <= page: 2458 break 2459 2460 page += 1 2461 2462 items = [DatasetItemClient(i, langfuse=self) for i in dataset_items] 2463 2464 return DatasetClient(dataset, items=items) 2465 2466 except Error as e: 2467 handle_fern_exception(e) 2468 raise e 2469 2470 def get_dataset_run( 2471 self, *, dataset_name: str, run_name: str 2472 ) -> DatasetRunWithItems: 2473 """Fetch a dataset run by dataset name and run name. 2474 2475 Args: 2476 dataset_name (str): The name of the dataset. 2477 run_name (str): The name of the run. 2478 2479 Returns: 2480 DatasetRunWithItems: The dataset run with its items. 2481 """ 2482 try: 2483 return self.api.datasets.get_run( 2484 dataset_name=self._url_encode(dataset_name), 2485 run_name=self._url_encode(run_name), 2486 request_options=None, 2487 ) 2488 except Error as e: 2489 handle_fern_exception(e) 2490 raise e 2491 2492 def get_dataset_runs( 2493 self, 2494 *, 2495 dataset_name: str, 2496 page: Optional[int] = None, 2497 limit: Optional[int] = None, 2498 ) -> PaginatedDatasetRuns: 2499 """Fetch all runs for a dataset. 2500 2501 Args: 2502 dataset_name (str): The name of the dataset. 2503 page (Optional[int]): Page number, starts at 1. 2504 limit (Optional[int]): Limit of items per page. 2505 2506 Returns: 2507 PaginatedDatasetRuns: Paginated list of dataset runs. 
2508 """ 2509 try: 2510 return self.api.datasets.get_runs( 2511 dataset_name=self._url_encode(dataset_name), 2512 page=page, 2513 limit=limit, 2514 request_options=None, 2515 ) 2516 except Error as e: 2517 handle_fern_exception(e) 2518 raise e 2519 2520 def delete_dataset_run( 2521 self, *, dataset_name: str, run_name: str 2522 ) -> DeleteDatasetRunResponse: 2523 """Delete a dataset run and all its run items. This action is irreversible. 2524 2525 Args: 2526 dataset_name (str): The name of the dataset. 2527 run_name (str): The name of the run. 2528 2529 Returns: 2530 DeleteDatasetRunResponse: Confirmation of deletion. 2531 """ 2532 try: 2533 return self.api.datasets.delete_run( 2534 dataset_name=self._url_encode(dataset_name), 2535 run_name=self._url_encode(run_name), 2536 request_options=None, 2537 ) 2538 except Error as e: 2539 handle_fern_exception(e) 2540 raise e 2541 2542 def run_experiment( 2543 self, 2544 *, 2545 name: str, 2546 run_name: Optional[str] = None, 2547 description: Optional[str] = None, 2548 data: ExperimentData, 2549 task: TaskFunction, 2550 evaluators: List[EvaluatorFunction] = [], 2551 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2552 run_evaluators: List[RunEvaluatorFunction] = [], 2553 max_concurrency: int = 50, 2554 metadata: Optional[Dict[str, str]] = None, 2555 ) -> ExperimentResult: 2556 """Run an experiment on a dataset with automatic tracing and evaluation. 2557 2558 This method executes a task function on each item in the provided dataset, 2559 automatically traces all executions with Langfuse for observability, runs 2560 item-level and run-level evaluators on the outputs, and returns comprehensive 2561 results with evaluation metrics. 2562 2563 The experiment system provides: 2564 - Automatic tracing of all task executions 2565 - Concurrent processing with configurable limits 2566 - Comprehensive error handling that isolates failures 2567 - Integration with Langfuse datasets for experiment tracking 2568 - Flexible evaluation framework supporting both sync and async evaluators 2569 2570 Args: 2571 name: Human-readable name for the experiment. Used for identification 2572 in the Langfuse UI. 2573 run_name: Optional exact name for the experiment run. If provided, this will be 2574 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2575 If not provided, this will default to the experiment name appended with an ISO timestamp. 2576 description: Optional description explaining the experiment's purpose, 2577 methodology, or expected outcomes. 2578 data: Array of data items to process. Can be either: 2579 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2580 - List of Langfuse DatasetItem objects from dataset.items 2581 task: Function that processes each data item and returns output. 2582 Must accept 'item' as keyword argument and can return sync or async results. 2583 The task function signature should be: task(*, item, **kwargs) -> Any 2584 evaluators: List of functions to evaluate each item's output individually. 2585 Each evaluator receives input, output, expected_output, and metadata. 2586 Can return single Evaluation dict or list of Evaluation dicts. 2587 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2588 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2589 plus the list of evaluations from item-level evaluators. 
Useful for weighted averages, 2590 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2591 run_evaluators: List of functions to evaluate the entire experiment run. 2592 Each run evaluator receives all item_results and can compute aggregate metrics. 2593 Useful for calculating averages, distributions, or cross-item comparisons. 2594 max_concurrency: Maximum number of concurrent task executions (default: 50). 2595 Controls the number of items processed simultaneously. Adjust based on 2596 API rate limits and system resources. 2597 metadata: Optional metadata dictionary to attach to all experiment traces. 2598 This metadata will be included in every trace created during the experiment. 2599 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2600 2601 Returns: 2602 ExperimentResult containing: 2603 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2604 - item_results: List of results for each processed item with outputs and evaluations 2605 - run_evaluations: List of aggregate evaluation results for the entire run 2606 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2607 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2608 2609 Raises: 2610 ValueError: If required parameters are missing or invalid 2611 Exception: If experiment setup fails (individual item failures are handled gracefully) 2612 2613 Examples: 2614 Basic experiment with local data: 2615 ```python 2616 def summarize_text(*, item, **kwargs): 2617 return f"Summary: {item['input'][:50]}..." 2618 2619 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2620 return { 2621 "name": "output_length", 2622 "value": len(output), 2623 "comment": f"Output contains {len(output)} characters" 2624 } 2625 2626 result = langfuse.run_experiment( 2627 name="Text Summarization Test", 2628 description="Evaluate summarization quality and length", 2629 data=[ 2630 {"input": "Long article text...", "expected_output": "Expected summary"}, 2631 {"input": "Another article...", "expected_output": "Another summary"} 2632 ], 2633 task=summarize_text, 2634 evaluators=[length_evaluator] 2635 ) 2636 2637 print(f"Processed {len(result.item_results)} items") 2638 for item_result in result.item_results: 2639 print(f"Input: {item_result.item['input']}") 2640 print(f"Output: {item_result.output}") 2641 print(f"Evaluations: {item_result.evaluations}") 2642 ``` 2643 2644 Advanced experiment with async task and multiple evaluators: 2645 ```python 2646 async def llm_task(*, item, **kwargs): 2647 # Simulate async LLM call 2648 response = await openai_client.chat.completions.create( 2649 model="gpt-4", 2650 messages=[{"role": "user", "content": item["input"]}] 2651 ) 2652 return response.choices[0].message.content 2653 2654 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2655 if expected_output and expected_output.lower() in output.lower(): 2656 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2657 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2658 2659 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2660 # Simulate toxicity check 2661 toxicity_score = check_toxicity(output) # Your toxicity checker 2662 return { 2663 "name": "toxicity", 2664 "value": toxicity_score, 2665 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2666 } 2667 
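            # Illustrative addition (not part of the SDK's original example): a
            # composite evaluator that combines the item-level evaluations above
            # into a single score. Per the Args section, it receives the same
            # keyword inputs as item-level evaluators plus their evaluations.
            def safety_gated_accuracy(*, input, output, expected_output=None, metadata=None, evaluations=None, **kwargs):
                by_name = {}
                for e in evaluations or []:
                    # Evaluations may arrive as dicts or Evaluation objects
                    by_name[e["name"] if isinstance(e, dict) else e.name] = (
                        e["value"] if isinstance(e, dict) else e.value
                    )
                # Count accuracy only when toxicity stays below the threshold used above
                value = by_name.get("accuracy", 0.0) if by_name.get("toxicity", 1.0) <= 0.7 else 0.0
                return {"name": "safe_accuracy", "value": value, "comment": "Accuracy gated by toxicity"}
            # Could be passed to run_experiment via `composite_evaluator=safety_gated_accuracy`.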
2668 def average_accuracy(*, item_results, **kwargs): 2669 accuracies = [ 2670 eval.value for result in item_results 2671 for eval in result.evaluations 2672 if eval.name == "accuracy" 2673 ] 2674 return { 2675 "name": "average_accuracy", 2676 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2677 "comment": f"Average accuracy across {len(accuracies)} items" 2678 } 2679 2680 result = langfuse.run_experiment( 2681 name="LLM Safety and Accuracy Test", 2682 description="Evaluate model accuracy and safety across diverse prompts", 2683 data=test_dataset, # Your dataset items 2684 task=llm_task, 2685 evaluators=[accuracy_evaluator, toxicity_evaluator], 2686 run_evaluators=[average_accuracy], 2687 max_concurrency=5, # Limit concurrent API calls 2688 metadata={"model": "gpt-4", "temperature": 0.7} 2689 ) 2690 ``` 2691 2692 Using with Langfuse datasets: 2693 ```python 2694 # Get dataset from Langfuse 2695 dataset = langfuse.get_dataset("my-eval-dataset") 2696 2697 result = dataset.run_experiment( 2698 name="Production Model Evaluation", 2699 description="Monthly evaluation of production model performance", 2700 task=my_production_task, 2701 evaluators=[accuracy_evaluator, latency_evaluator] 2702 ) 2703 2704 # Results automatically linked to dataset in Langfuse UI 2705 print(f"View results: {result['dataset_run_url']}") 2706 ``` 2707 2708 Note: 2709 - Task and evaluator functions can be either synchronous or asynchronous 2710 - Individual item failures are logged but don't stop the experiment 2711 - All executions are automatically traced and visible in Langfuse UI 2712 - When using Langfuse datasets, results are automatically linked for easy comparison 2713 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 
2714 - Async execution is handled automatically with smart event loop detection 2715 """ 2716 return cast( 2717 ExperimentResult, 2718 run_async_safely( 2719 self._run_experiment_async( 2720 name=name, 2721 run_name=self._create_experiment_run_name( 2722 name=name, run_name=run_name 2723 ), 2724 description=description, 2725 data=data, 2726 task=task, 2727 evaluators=evaluators or [], 2728 composite_evaluator=composite_evaluator, 2729 run_evaluators=run_evaluators or [], 2730 max_concurrency=max_concurrency, 2731 metadata=metadata, 2732 ), 2733 ), 2734 ) 2735 2736 async def _run_experiment_async( 2737 self, 2738 *, 2739 name: str, 2740 run_name: str, 2741 description: Optional[str], 2742 data: ExperimentData, 2743 task: TaskFunction, 2744 evaluators: List[EvaluatorFunction], 2745 composite_evaluator: Optional[CompositeEvaluatorFunction], 2746 run_evaluators: List[RunEvaluatorFunction], 2747 max_concurrency: int, 2748 metadata: Optional[Dict[str, Any]] = None, 2749 ) -> ExperimentResult: 2750 langfuse_logger.debug( 2751 f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" 2752 ) 2753 2754 # Set up concurrency control 2755 semaphore = asyncio.Semaphore(max_concurrency) 2756 2757 # Process all items 2758 async def process_item(item: ExperimentItem) -> ExperimentItemResult: 2759 async with semaphore: 2760 return await self._process_experiment_item( 2761 item, 2762 task, 2763 evaluators, 2764 composite_evaluator, 2765 name, 2766 run_name, 2767 description, 2768 metadata, 2769 ) 2770 2771 # Run all items concurrently 2772 tasks = [process_item(item) for item in data] 2773 item_results = await asyncio.gather(*tasks, return_exceptions=True) 2774 2775 # Filter out any exceptions and log errors 2776 valid_results: List[ExperimentItemResult] = [] 2777 for i, result in enumerate(item_results): 2778 if isinstance(result, Exception): 2779 langfuse_logger.error(f"Item {i} failed: {result}") 2780 elif isinstance(result, ExperimentItemResult): 2781 valid_results.append(result) # type: ignore 2782 2783 # Run experiment-level evaluators 2784 run_evaluations: List[Evaluation] = [] 2785 for run_evaluator in run_evaluators: 2786 try: 2787 evaluations = await _run_evaluator( 2788 run_evaluator, item_results=valid_results 2789 ) 2790 run_evaluations.extend(evaluations) 2791 except Exception as e: 2792 langfuse_logger.error(f"Run evaluator failed: {e}") 2793 2794 # Generate dataset run URL if applicable 2795 dataset_run_id = valid_results[0].dataset_run_id if valid_results else None 2796 dataset_run_url = None 2797 if dataset_run_id and data: 2798 try: 2799 # Check if the first item has dataset_id (for DatasetItem objects) 2800 first_item = data[0] 2801 dataset_id = None 2802 2803 if hasattr(first_item, "dataset_id"): 2804 dataset_id = getattr(first_item, "dataset_id", None) 2805 2806 if dataset_id: 2807 project_id = self._get_project_id() 2808 2809 if project_id: 2810 dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" 2811 2812 except Exception: 2813 pass # URL generation is optional 2814 2815 # Store run-level evaluations as scores 2816 for evaluation in run_evaluations: 2817 try: 2818 if dataset_run_id: 2819 self.create_score( 2820 dataset_run_id=dataset_run_id, 2821 name=evaluation.name or "<unknown>", 2822 value=evaluation.value, # type: ignore 2823 comment=evaluation.comment, 2824 metadata=evaluation.metadata, 2825 data_type=evaluation.data_type, # type: ignore 2826 config_id=evaluation.config_id, 2827 ) 2828 2829 except Exception as e: 
2830 langfuse_logger.error(f"Failed to store run evaluation: {e}") 2831 2832 # Flush scores and traces 2833 self.flush() 2834 2835 return ExperimentResult( 2836 name=name, 2837 run_name=run_name, 2838 description=description, 2839 item_results=valid_results, 2840 run_evaluations=run_evaluations, 2841 dataset_run_id=dataset_run_id, 2842 dataset_run_url=dataset_run_url, 2843 ) 2844 2845 async def _process_experiment_item( 2846 self, 2847 item: ExperimentItem, 2848 task: Callable, 2849 evaluators: List[Callable], 2850 composite_evaluator: Optional[CompositeEvaluatorFunction], 2851 experiment_name: str, 2852 experiment_run_name: str, 2853 experiment_description: Optional[str], 2854 experiment_metadata: Optional[Dict[str, Any]] = None, 2855 ) -> ExperimentItemResult: 2856 span_name = "experiment-item-run" 2857 2858 with self.start_as_current_span(name=span_name) as span: 2859 try: 2860 input_data = ( 2861 item.get("input") 2862 if isinstance(item, dict) 2863 else getattr(item, "input", None) 2864 ) 2865 2866 if input_data is None: 2867 raise ValueError("Experiment Item is missing input. Skipping item.") 2868 2869 expected_output = ( 2870 item.get("expected_output") 2871 if isinstance(item, dict) 2872 else getattr(item, "expected_output", None) 2873 ) 2874 2875 item_metadata = ( 2876 item.get("metadata") 2877 if isinstance(item, dict) 2878 else getattr(item, "metadata", None) 2879 ) 2880 2881 final_observation_metadata = { 2882 "experiment_name": experiment_name, 2883 "experiment_run_name": experiment_run_name, 2884 **(experiment_metadata or {}), 2885 } 2886 2887 trace_id = span.trace_id 2888 dataset_id = None 2889 dataset_item_id = None 2890 dataset_run_id = None 2891 2892 # Link to dataset run if this is a dataset item 2893 if hasattr(item, "id") and hasattr(item, "dataset_id"): 2894 try: 2895 # Use sync API to avoid event loop issues when run_async_safely 2896 # creates multiple event loops across different threads 2897 dataset_run_item = await asyncio.to_thread( 2898 self.api.dataset_run_items.create, 2899 request=CreateDatasetRunItemRequest( 2900 runName=experiment_run_name, 2901 runDescription=experiment_description, 2902 metadata=experiment_metadata, 2903 datasetItemId=item.id, # type: ignore 2904 traceId=trace_id, 2905 observationId=span.id, 2906 ), 2907 ) 2908 2909 dataset_run_id = dataset_run_item.dataset_run_id 2910 2911 except Exception as e: 2912 langfuse_logger.error(f"Failed to create dataset run item: {e}") 2913 2914 if ( 2915 not isinstance(item, dict) 2916 and hasattr(item, "dataset_id") 2917 and hasattr(item, "id") 2918 ): 2919 dataset_id = item.dataset_id 2920 dataset_item_id = item.id 2921 2922 final_observation_metadata.update( 2923 {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id} 2924 ) 2925 2926 if isinstance(item_metadata, dict): 2927 final_observation_metadata.update(item_metadata) 2928 2929 experiment_id = dataset_run_id or self._create_observation_id() 2930 experiment_item_id = ( 2931 dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16] 2932 ) 2933 span._otel_span.set_attributes( 2934 { 2935 k: v 2936 for k, v in { 2937 LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT, 2938 LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description, 2939 LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize( 2940 expected_output 2941 ), 2942 }.items() 2943 if v is not None 2944 } 2945 ) 2946 2947 propagated_experiment_attributes = PropagatedExperimentAttributes( 2948 experiment_id=experiment_id, 
2949 experiment_name=experiment_run_name, 2950 experiment_metadata=_serialize(experiment_metadata), 2951 experiment_dataset_id=dataset_id, 2952 experiment_item_id=experiment_item_id, 2953 experiment_item_metadata=_serialize(item_metadata), 2954 experiment_item_root_observation_id=span.id, 2955 ) 2956 2957 with _propagate_attributes(experiment=propagated_experiment_attributes): 2958 output = await _run_task(task, item) 2959 2960 span.update( 2961 input=input_data, 2962 output=output, 2963 metadata=final_observation_metadata, 2964 ) 2965 2966 except Exception as e: 2967 span.update( 2968 output=f"Error: {str(e)}", level="ERROR", status_message=str(e) 2969 ) 2970 raise e 2971 2972 # Run evaluators 2973 evaluations = [] 2974 2975 for evaluator in evaluators: 2976 try: 2977 eval_metadata: Optional[Dict[str, Any]] = None 2978 2979 if isinstance(item, dict): 2980 eval_metadata = item.get("metadata") 2981 elif hasattr(item, "metadata"): 2982 eval_metadata = item.metadata 2983 2984 with _propagate_attributes( 2985 experiment=propagated_experiment_attributes 2986 ): 2987 eval_results = await _run_evaluator( 2988 evaluator, 2989 input=input_data, 2990 output=output, 2991 expected_output=expected_output, 2992 metadata=eval_metadata, 2993 ) 2994 evaluations.extend(eval_results) 2995 2996 # Store evaluations as scores 2997 for evaluation in eval_results: 2998 self.create_score( 2999 trace_id=trace_id, 3000 observation_id=span.id, 3001 name=evaluation.name, 3002 value=evaluation.value, # type: ignore 3003 comment=evaluation.comment, 3004 metadata=evaluation.metadata, 3005 config_id=evaluation.config_id, 3006 data_type=evaluation.data_type, # type: ignore 3007 ) 3008 3009 except Exception as e: 3010 langfuse_logger.error(f"Evaluator failed: {e}") 3011 3012 # Run composite evaluator if provided and we have evaluations 3013 if composite_evaluator and evaluations: 3014 try: 3015 composite_eval_metadata: Optional[Dict[str, Any]] = None 3016 if isinstance(item, dict): 3017 composite_eval_metadata = item.get("metadata") 3018 elif hasattr(item, "metadata"): 3019 composite_eval_metadata = item.metadata 3020 3021 with _propagate_attributes( 3022 experiment=propagated_experiment_attributes 3023 ): 3024 result = composite_evaluator( 3025 input=input_data, 3026 output=output, 3027 expected_output=expected_output, 3028 metadata=composite_eval_metadata, 3029 evaluations=evaluations, 3030 ) 3031 3032 # Handle async composite evaluators 3033 if asyncio.iscoroutine(result): 3034 result = await result 3035 3036 # Normalize to list 3037 composite_evals: List[Evaluation] = [] 3038 if isinstance(result, (dict, Evaluation)): 3039 composite_evals = [result] # type: ignore 3040 elif isinstance(result, list): 3041 composite_evals = result # type: ignore 3042 3043 # Store composite evaluations as scores and add to evaluations list 3044 for composite_evaluation in composite_evals: 3045 self.create_score( 3046 trace_id=trace_id, 3047 observation_id=span.id, 3048 name=composite_evaluation.name, 3049 value=composite_evaluation.value, # type: ignore 3050 comment=composite_evaluation.comment, 3051 metadata=composite_evaluation.metadata, 3052 config_id=composite_evaluation.config_id, 3053 data_type=composite_evaluation.data_type, # type: ignore 3054 ) 3055 evaluations.append(composite_evaluation) 3056 3057 except Exception as e: 3058 langfuse_logger.error(f"Composite evaluator failed: {e}") 3059 3060 return ExperimentItemResult( 3061 item=item, 3062 output=output, 3063 evaluations=evaluations, 3064 trace_id=trace_id, 3065 
dataset_run_id=dataset_run_id, 3066 ) 3067 3068 def _create_experiment_run_name( 3069 self, *, name: Optional[str] = None, run_name: Optional[str] = None 3070 ) -> str: 3071 if run_name: 3072 return run_name 3073 3074 iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") 3075 3076 return f"{name} - {iso_timestamp}" 3077 3078 def run_batched_evaluation( 3079 self, 3080 *, 3081 scope: Literal["traces", "observations"], 3082 mapper: MapperFunction, 3083 filter: Optional[str] = None, 3084 fetch_batch_size: int = 50, 3085 max_items: Optional[int] = None, 3086 max_retries: int = 3, 3087 evaluators: List[EvaluatorFunction], 3088 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 3089 max_concurrency: int = 50, 3090 metadata: Optional[Dict[str, Any]] = None, 3091 resume_from: Optional[BatchEvaluationResumeToken] = None, 3092 verbose: bool = False, 3093 ) -> BatchEvaluationResult: 3094 """Fetch traces or observations and run evaluations on each item. 3095 3096 This method provides a powerful way to evaluate existing data in Langfuse at scale. 3097 It fetches items based on filters, transforms them using a mapper function, runs 3098 evaluators on each item, and creates scores that are linked back to the original 3099 entities. This is ideal for: 3100 3101 - Running evaluations on production traces after deployment 3102 - Backtesting new evaluation metrics on historical data 3103 - Batch scoring of observations for quality monitoring 3104 - Periodic evaluation runs on recent data 3105 3106 The method uses a streaming/pipeline approach to process items in batches, making 3107 it memory-efficient for large datasets. It includes comprehensive error handling, 3108 retry logic, and resume capability for long-running evaluations. 3109 3110 Args: 3111 scope: The type of items to evaluate. Must be one of: 3112 - "traces": Evaluate complete traces with all their observations 3113 - "observations": Evaluate individual observations (spans, generations, events) 3114 mapper: Function that transforms API response objects into evaluator inputs. 3115 Receives a trace/observation object and returns an EvaluatorInputs 3116 instance with input, output, expected_output, and metadata fields. 3117 Can be sync or async. 3118 evaluators: List of evaluation functions to run on each item. Each evaluator 3119 receives the mapped inputs and returns Evaluation object(s). Evaluator 3120 failures are logged but don't stop the batch evaluation. 3121 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 3122 - '{"tags": ["production"]}' 3123 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 3124 Default: None (fetches all items). 3125 fetch_batch_size: Number of items to fetch per API call and hold in memory. 3126 Larger values may be faster but use more memory. Default: 50. 3127 max_items: Maximum total number of items to process. If None, processes all 3128 items matching the filter. Useful for testing or limiting evaluation runs. 3129 Default: None (process all). 3130 max_concurrency: Maximum number of items to evaluate concurrently. Controls 3131 parallelism and resource usage. Default: 50. 3132 composite_evaluator: Optional function that creates a composite score from 3133 item-level evaluations. Receives the original item and its evaluations, 3134 returns a single Evaluation. Useful for weighted averages or combined metrics. 3135 Default: None. 3136 metadata: Optional metadata dict to add to all created scores. 
Useful for 3137 tracking evaluation runs, versions, or other context. Default: None. 3138 max_retries: Maximum number of retry attempts for failed batch fetches. 3139 Uses exponential backoff (1s, 2s, 4s). Default: 3. 3140 verbose: If True, logs progress information to console. Useful for monitoring 3141 long-running evaluations. Default: False. 3142 resume_from: Optional resume token from a previous incomplete run. Allows 3143 continuing evaluation after interruption or failure. Default: None. 3144 3145 3146 Returns: 3147 BatchEvaluationResult containing: 3148 - total_items_fetched: Number of items fetched from API 3149 - total_items_processed: Number of items successfully evaluated 3150 - total_items_failed: Number of items that failed evaluation 3151 - total_scores_created: Scores created by item-level evaluators 3152 - total_composite_scores_created: Scores created by composite evaluator 3153 - total_evaluations_failed: Individual evaluator failures 3154 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 3155 - resume_token: Token for resuming if incomplete (None if completed) 3156 - completed: True if all items processed 3157 - duration_seconds: Total execution time 3158 - failed_item_ids: IDs of items that failed 3159 - error_summary: Error types and counts 3160 - has_more_items: True if max_items reached but more exist 3161 3162 Raises: 3163 ValueError: If invalid scope is provided. 3164 3165 Examples: 3166 Basic trace evaluation: 3167 ```python 3168 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3169 3170 client = Langfuse() 3171 3172 # Define mapper to extract fields from traces 3173 def trace_mapper(trace): 3174 return EvaluatorInputs( 3175 input=trace.input, 3176 output=trace.output, 3177 expected_output=None, 3178 metadata={"trace_id": trace.id} 3179 ) 3180 3181 # Define evaluator 3182 def length_evaluator(*, input, output, expected_output, metadata): 3183 return Evaluation( 3184 name="output_length", 3185 value=len(output) if output else 0 3186 ) 3187 3188 # Run batch evaluation 3189 result = client.run_batched_evaluation( 3190 scope="traces", 3191 mapper=trace_mapper, 3192 evaluators=[length_evaluator], 3193 filter='{"tags": ["production"]}', 3194 max_items=1000, 3195 verbose=True 3196 ) 3197 3198 print(f"Processed {result.total_items_processed} traces") 3199 print(f"Created {result.total_scores_created} scores") 3200 ``` 3201 3202 Evaluation with composite scorer: 3203 ```python 3204 def accuracy_evaluator(*, input, output, expected_output, metadata): 3205 # ... evaluation logic 3206 return Evaluation(name="accuracy", value=0.85) 3207 3208 def relevance_evaluator(*, input, output, expected_output, metadata): 3209 # ... 
evaluation logic 3210 return Evaluation(name="relevance", value=0.92) 3211 3212 def composite_evaluator(*, item, evaluations): 3213 # Weighted average of evaluations 3214 weights = {"accuracy": 0.6, "relevance": 0.4} 3215 total = sum( 3216 e.value * weights.get(e.name, 0) 3217 for e in evaluations 3218 if isinstance(e.value, (int, float)) 3219 ) 3220 return Evaluation( 3221 name="composite_score", 3222 value=total, 3223 comment=f"Weighted average of {len(evaluations)} metrics" 3224 ) 3225 3226 result = client.run_batched_evaluation( 3227 scope="traces", 3228 mapper=trace_mapper, 3229 evaluators=[accuracy_evaluator, relevance_evaluator], 3230 composite_evaluator=composite_evaluator, 3231 filter='{"user_id": "important_user"}', 3232 verbose=True 3233 ) 3234 ``` 3235 3236 Handling incomplete runs with resume: 3237 ```python 3238 # Initial run that may fail or timeout 3239 result = client.run_batched_evaluation( 3240 scope="observations", 3241 mapper=obs_mapper, 3242 evaluators=[my_evaluator], 3243 max_items=10000, 3244 verbose=True 3245 ) 3246 3247 # Check if incomplete 3248 if not result.completed and result.resume_token: 3249 print(f"Processed {result.resume_token.items_processed} items before interruption") 3250 3251 # Resume from where it left off 3252 result = client.run_batched_evaluation( 3253 scope="observations", 3254 mapper=obs_mapper, 3255 evaluators=[my_evaluator], 3256 resume_from=result.resume_token, 3257 verbose=True 3258 ) 3259 3260 print(f"Total items processed: {result.total_items_processed}") 3261 ``` 3262 3263 Monitoring evaluator performance: 3264 ```python 3265 result = client.run_batched_evaluation(...) 3266 3267 for stats in result.evaluator_stats: 3268 success_rate = stats.successful_runs / stats.total_runs 3269 print(f"{stats.name}:") 3270 print(f" Success rate: {success_rate:.1%}") 3271 print(f" Scores created: {stats.total_scores_created}") 3272 3273 if stats.failed_runs > 0: 3274 print(f" â ī¸ Failed {stats.failed_runs} times") 3275 ``` 3276 3277 Note: 3278 - Evaluator failures are logged but don't stop the batch evaluation 3279 - Individual item failures are tracked but don't stop processing 3280 - Fetch failures are retried with exponential backoff 3281 - All scores are automatically flushed to Langfuse at the end 3282 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3283 """ 3284 runner = BatchEvaluationRunner(self) 3285 3286 return cast( 3287 BatchEvaluationResult, 3288 run_async_safely( 3289 runner.run_async( 3290 scope=scope, 3291 mapper=mapper, 3292 evaluators=evaluators, 3293 filter=filter, 3294 fetch_batch_size=fetch_batch_size, 3295 max_items=max_items, 3296 max_concurrency=max_concurrency, 3297 composite_evaluator=composite_evaluator, 3298 metadata=metadata, 3299 max_retries=max_retries, 3300 verbose=verbose, 3301 resume_from=resume_from, 3302 ) 3303 ), 3304 ) 3305 3306 def auth_check(self) -> bool: 3307 """Check if the provided credentials (public and secret key) are valid. 3308 3309 Raises: 3310 Exception: If no projects were found for the provided credentials. 3311 3312 Note: 3313 This method is blocking. It is discouraged to use it in production code. 3314 """ 3315 try: 3316 projects = self.api.projects.get() 3317 langfuse_logger.debug( 3318 f"Auth check successful, found {len(projects.data)} projects" 3319 ) 3320 if len(projects.data) == 0: 3321 raise Exception( 3322 "Auth check failed, no project found for the keys provided." 
3323 ) 3324 return True 3325 3326 except AttributeError as e: 3327 langfuse_logger.warning( 3328 f"Auth check failed: Client not properly initialized. Error: {e}" 3329 ) 3330 return False 3331 3332 except Error as e: 3333 handle_fern_exception(e) 3334 raise e 3335 3336 def create_dataset( 3337 self, 3338 *, 3339 name: str, 3340 description: Optional[str] = None, 3341 metadata: Optional[Any] = None, 3342 input_schema: Optional[Any] = None, 3343 expected_output_schema: Optional[Any] = None, 3344 ) -> Dataset: 3345 """Create a dataset with the given name on Langfuse. 3346 3347 Args: 3348 name: Name of the dataset to create. 3349 description: Description of the dataset. Defaults to None. 3350 metadata: Additional metadata. Defaults to None. 3351 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3352 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3353 3354 Returns: 3355 Dataset: The created dataset as returned by the Langfuse API. 3356 """ 3357 try: 3358 body = CreateDatasetRequest( 3359 name=name, 3360 description=description, 3361 metadata=metadata, 3362 inputSchema=input_schema, 3363 expectedOutputSchema=expected_output_schema, 3364 ) 3365 langfuse_logger.debug(f"Creating datasets {body}") 3366 3367 return self.api.datasets.create(request=body) 3368 3369 except Error as e: 3370 handle_fern_exception(e) 3371 raise e 3372 3373 def create_dataset_item( 3374 self, 3375 *, 3376 dataset_name: str, 3377 input: Optional[Any] = None, 3378 expected_output: Optional[Any] = None, 3379 metadata: Optional[Any] = None, 3380 source_trace_id: Optional[str] = None, 3381 source_observation_id: Optional[str] = None, 3382 status: Optional[DatasetStatus] = None, 3383 id: Optional[str] = None, 3384 ) -> DatasetItem: 3385 """Create a dataset item. 3386 3387 Upserts if an item with id already exists. 3388 3389 Args: 3390 dataset_name: Name of the dataset in which the dataset item should be created. 3391 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3392 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3393 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3394 source_trace_id: Id of the source trace. Defaults to None. 3395 source_observation_id: Id of the source observation. Defaults to None. 3396 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3397 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3398 3399 Returns: 3400 DatasetItem: The created dataset item as returned by the Langfuse API. 
3401 3402 Example: 3403 ```python 3404 from langfuse import Langfuse 3405 3406 langfuse = Langfuse() 3407 3408 # Uploading items to the Langfuse dataset named "capital_cities" 3409 langfuse.create_dataset_item( 3410 dataset_name="capital_cities", 3411 input={"input": {"country": "Italy"}}, 3412 expected_output={"expected_output": "Rome"}, 3413 metadata={"foo": "bar"} 3414 ) 3415 ``` 3416 """ 3417 try: 3418 body = CreateDatasetItemRequest( 3419 datasetName=dataset_name, 3420 input=input, 3421 expectedOutput=expected_output, 3422 metadata=metadata, 3423 sourceTraceId=source_trace_id, 3424 sourceObservationId=source_observation_id, 3425 status=status, 3426 id=id, 3427 ) 3428 langfuse_logger.debug(f"Creating dataset item {body}") 3429 return self.api.dataset_items.create(request=body) 3430 except Error as e: 3431 handle_fern_exception(e) 3432 raise e 3433 3434 def resolve_media_references( 3435 self, 3436 *, 3437 obj: Any, 3438 resolve_with: Literal["base64_data_uri"], 3439 max_depth: int = 10, 3440 content_fetch_timeout_seconds: int = 5, 3441 ) -> Any: 3442 """Replace media reference strings in an object with base64 data URIs. 3443 3444 This method recursively traverses an object (up to max_depth) looking for media reference strings 3445 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3446 the provided Langfuse client and replaces the reference string with a base64 data URI. 3447 3448 If fetching media content fails for a reference string, a warning is logged and the reference 3449 string is left unchanged. 3450 3451 Args: 3452 obj: The object to process. Can be a primitive value, array, or nested object. 3453 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3454 resolve_with: The representation of the media content to replace the media reference string with. 3455 Currently only "base64_data_uri" is supported. 3456 max_depth: int: The maximum depth to traverse the object. Default is 10. 3457 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3458 3459 Returns: 3460 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3461 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3462 3463 Example: 3464 obj = { 3465 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3466 "nested": { 3467 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3468 } 3469 } 3470 3471 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3472 3473 # Result: 3474 # { 3475 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3476 # "nested": { 3477 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 3478 # } 3479 # } 3480 """ 3481 return LangfuseMedia.resolve_media_references( 3482 langfuse_client=self, 3483 obj=obj, 3484 resolve_with=resolve_with, 3485 max_depth=max_depth, 3486 content_fetch_timeout_seconds=content_fetch_timeout_seconds, 3487 ) 3488 3489 @overload 3490 def get_prompt( 3491 self, 3492 name: str, 3493 *, 3494 version: Optional[int] = None, 3495 label: Optional[str] = None, 3496 type: Literal["chat"], 3497 cache_ttl_seconds: Optional[int] = None, 3498 fallback: Optional[List[ChatMessageDict]] = None, 3499 max_retries: Optional[int] = None, 3500 fetch_timeout_seconds: Optional[int] = None, 3501 ) -> ChatPromptClient: ... 
3502 3503 @overload 3504 def get_prompt( 3505 self, 3506 name: str, 3507 *, 3508 version: Optional[int] = None, 3509 label: Optional[str] = None, 3510 type: Literal["text"] = "text", 3511 cache_ttl_seconds: Optional[int] = None, 3512 fallback: Optional[str] = None, 3513 max_retries: Optional[int] = None, 3514 fetch_timeout_seconds: Optional[int] = None, 3515 ) -> TextPromptClient: ... 3516 3517 def get_prompt( 3518 self, 3519 name: str, 3520 *, 3521 version: Optional[int] = None, 3522 label: Optional[str] = None, 3523 type: Literal["chat", "text"] = "text", 3524 cache_ttl_seconds: Optional[int] = None, 3525 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None, 3526 max_retries: Optional[int] = None, 3527 fetch_timeout_seconds: Optional[int] = None, 3528 ) -> PromptClient: 3529 """Get a prompt. 3530 3531 This method attempts to fetch the requested prompt from the local cache. If the prompt is not found 3532 in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again 3533 and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will 3534 return the expired prompt as a fallback. 3535 3536 Args: 3537 name (str): The name of the prompt to retrieve. 3538 3539 Keyword Args: 3540 version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3541 label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3542 cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a 3543 keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. 3544 type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". 3545 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. 3546 max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. 3547 fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default. 3548 3549 Returns: 3550 The prompt object retrieved from the cache or directly fetched if not cached or expired of type 3551 - TextPromptClient, if type argument is 'text'. 3552 - ChatPromptClient, if type argument is 'chat'. 3553 3554 Raises: 3555 Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an 3556 expired prompt in the cache, in which case it logs a warning and returns the expired prompt. 3557 """ 3558 if self._resources is None: 3559 raise Error( 3560 "SDK is not correctly initialized. Check the init logs for more details." 
3561 ) 3562 if version is not None and label is not None: 3563 raise ValueError("Cannot specify both version and label at the same time.") 3564 3565 if not name: 3566 raise ValueError("Prompt name cannot be empty.") 3567 3568 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3569 bounded_max_retries = self._get_bounded_max_retries( 3570 max_retries, default_max_retries=2, max_retries_upper_bound=4 3571 ) 3572 3573 langfuse_logger.debug(f"Getting prompt '{cache_key}'") 3574 cached_prompt = self._resources.prompt_cache.get(cache_key) 3575 3576 if cached_prompt is None or cache_ttl_seconds == 0: 3577 langfuse_logger.debug( 3578 f"Prompt '{cache_key}' not found in cache or caching disabled." 3579 ) 3580 try: 3581 return self._fetch_prompt_and_update_cache( 3582 name, 3583 version=version, 3584 label=label, 3585 ttl_seconds=cache_ttl_seconds, 3586 max_retries=bounded_max_retries, 3587 fetch_timeout_seconds=fetch_timeout_seconds, 3588 ) 3589 except Exception as e: 3590 if fallback: 3591 langfuse_logger.warning( 3592 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3593 ) 3594 3595 fallback_client_args: Dict[str, Any] = { 3596 "name": name, 3597 "prompt": fallback, 3598 "type": type, 3599 "version": version or 0, 3600 "config": {}, 3601 "labels": [label] if label else [], 3602 "tags": [], 3603 } 3604 3605 if type == "text": 3606 return TextPromptClient( 3607 prompt=Prompt_Text(**fallback_client_args), 3608 is_fallback=True, 3609 ) 3610 3611 if type == "chat": 3612 return ChatPromptClient( 3613 prompt=Prompt_Chat(**fallback_client_args), 3614 is_fallback=True, 3615 ) 3616 3617 raise e 3618 3619 if cached_prompt.is_expired(): 3620 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3621 try: 3622 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3623 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3624 3625 def refresh_task() -> None: 3626 self._fetch_prompt_and_update_cache( 3627 name, 3628 version=version, 3629 label=label, 3630 ttl_seconds=cache_ttl_seconds, 3631 max_retries=bounded_max_retries, 3632 fetch_timeout_seconds=fetch_timeout_seconds, 3633 ) 3634 3635 self._resources.prompt_cache.add_refresh_prompt_task( 3636 cache_key, 3637 refresh_task, 3638 ) 3639 langfuse_logger.debug( 3640 f"Returning stale prompt '{cache_key}' from cache." 3641 ) 3642 # return stale prompt 3643 return cached_prompt.value 3644 3645 except Exception as e: 3646 langfuse_logger.warning( 3647 f"Error when refreshing cached prompt '{cache_key}', returning cached version. 
Error: {e}" 3648 ) 3649 # creation of refresh prompt task failed, return stale prompt 3650 return cached_prompt.value 3651 3652 return cached_prompt.value 3653 3654 def _fetch_prompt_and_update_cache( 3655 self, 3656 name: str, 3657 *, 3658 version: Optional[int] = None, 3659 label: Optional[str] = None, 3660 ttl_seconds: Optional[int] = None, 3661 max_retries: int, 3662 fetch_timeout_seconds: Optional[int], 3663 ) -> PromptClient: 3664 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3665 langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...") 3666 3667 try: 3668 3669 @backoff.on_exception( 3670 backoff.constant, Exception, max_tries=max_retries + 1, logger=None 3671 ) 3672 def fetch_prompts() -> Any: 3673 return self.api.prompts.get( 3674 self._url_encode(name), 3675 version=version, 3676 label=label, 3677 request_options={ 3678 "timeout_in_seconds": fetch_timeout_seconds, 3679 } 3680 if fetch_timeout_seconds is not None 3681 else None, 3682 ) 3683 3684 prompt_response = fetch_prompts() 3685 3686 prompt: PromptClient 3687 if prompt_response.type == "chat": 3688 prompt = ChatPromptClient(prompt_response) 3689 else: 3690 prompt = TextPromptClient(prompt_response) 3691 3692 if self._resources is not None: 3693 self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds) 3694 3695 return prompt 3696 3697 except NotFoundError as not_found_error: 3698 langfuse_logger.warning( 3699 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 3700 ) 3701 if self._resources is not None: 3702 self._resources.prompt_cache.delete(cache_key) 3703 raise not_found_error 3704 3705 except Exception as e: 3706 langfuse_logger.error( 3707 f"Error while fetching prompt '{cache_key}': {str(e)}" 3708 ) 3709 raise e 3710 3711 def _get_bounded_max_retries( 3712 self, 3713 max_retries: Optional[int], 3714 *, 3715 default_max_retries: int = 2, 3716 max_retries_upper_bound: int = 4, 3717 ) -> int: 3718 if max_retries is None: 3719 return default_max_retries 3720 3721 bounded_max_retries = min( 3722 max(max_retries, 0), 3723 max_retries_upper_bound, 3724 ) 3725 3726 return bounded_max_retries 3727 3728 @overload 3729 def create_prompt( 3730 self, 3731 *, 3732 name: str, 3733 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3734 labels: List[str] = [], 3735 tags: Optional[List[str]] = None, 3736 type: Optional[Literal["chat"]], 3737 config: Optional[Any] = None, 3738 commit_message: Optional[str] = None, 3739 ) -> ChatPromptClient: ... 3740 3741 @overload 3742 def create_prompt( 3743 self, 3744 *, 3745 name: str, 3746 prompt: str, 3747 labels: List[str] = [], 3748 tags: Optional[List[str]] = None, 3749 type: Optional[Literal["text"]] = "text", 3750 config: Optional[Any] = None, 3751 commit_message: Optional[str] = None, 3752 ) -> TextPromptClient: ... 3753 3754 def create_prompt( 3755 self, 3756 *, 3757 name: str, 3758 prompt: Union[ 3759 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3760 ], 3761 labels: List[str] = [], 3762 tags: Optional[List[str]] = None, 3763 type: Optional[Literal["chat", "text"]] = "text", 3764 config: Optional[Any] = None, 3765 commit_message: Optional[str] = None, 3766 ) -> PromptClient: 3767 """Create a new prompt in Langfuse. 3768 3769 Keyword Args: 3770 name : The name of the prompt to be created. 3771 prompt : The content of the prompt to be created. 3772 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. 
This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3773 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3774 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3775 config: Additional structured data to be saved with the prompt. Defaults to None. 3776 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3777 commit_message: Optional string describing the change. 3778 3779 Returns: 3780 TextPromptClient: The prompt if type argument is 'text'. 3781 ChatPromptClient: The prompt if type argument is 'chat'. 3782 """ 3783 try: 3784 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3785 3786 if type == "chat": 3787 if not isinstance(prompt, list): 3788 raise ValueError( 3789 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 3790 ) 3791 request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = ( 3792 CreatePromptRequest_Chat( 3793 name=name, 3794 prompt=cast(Any, prompt), 3795 labels=labels, 3796 tags=tags, 3797 config=config or {}, 3798 commitMessage=commit_message, 3799 type="chat", 3800 ) 3801 ) 3802 server_prompt = self.api.prompts.create(request=request) 3803 3804 if self._resources is not None: 3805 self._resources.prompt_cache.invalidate(name) 3806 3807 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3808 3809 if not isinstance(prompt, str): 3810 raise ValueError("For 'text' type, 'prompt' must be a string.") 3811 3812 request = CreatePromptRequest_Text( 3813 name=name, 3814 prompt=prompt, 3815 labels=labels, 3816 tags=tags, 3817 config=config or {}, 3818 commitMessage=commit_message, 3819 type="text", 3820 ) 3821 3822 server_prompt = self.api.prompts.create(request=request) 3823 3824 if self._resources is not None: 3825 self._resources.prompt_cache.invalidate(name) 3826 3827 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3828 3829 except Error as e: 3830 handle_fern_exception(e) 3831 raise e 3832 3833 def update_prompt( 3834 self, 3835 *, 3836 name: str, 3837 version: int, 3838 new_labels: List[str] = [], 3839 ) -> Any: 3840 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 3841 3842 Args: 3843 name (str): The name of the prompt to update. 3844 version (int): The version number of the prompt to update. 3845 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 3846 3847 Returns: 3848 Prompt: The updated prompt from the Langfuse API. 3849 3850 """ 3851 updated_prompt = self.api.prompt_version.update( 3852 name=self._url_encode(name), 3853 version=version, 3854 new_labels=new_labels, 3855 ) 3856 3857 if self._resources is not None: 3858 self._resources.prompt_cache.invalidate(name) 3859 3860 return updated_prompt 3861 3862 def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str: 3863 # httpx âĨ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare 3864 # â%â, â?â, â#â, â|â, âĻ in query/path parts). Re-quoting here would 3865 # double-encode, so we skip when the value is about to be sent straight 3866 # to httpx (`is_url_param=True`) and the installed version is âĨ 0.28. 
3867 if is_url_param and Version(httpx.__version__) >= Version("0.28.0"): 3868 return url 3869 3870 # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping 3871 # we need add safe="" to force escaping of slashes 3872 # This is necessary for prompts in prompt folders 3873 return urllib.parse.quote(url, safe="") 3874 3875 def clear_prompt_cache(self) -> None: 3876 """Clear the entire prompt cache, removing all cached prompts. 3877 3878 This method is useful when you want to force a complete refresh of all 3879 cached prompts, for example after major updates or when you need to 3880 ensure the latest versions are fetched from the server. 3881 """ 3882 if self._resources is not None: 3883 self._resources.prompt_cache.clear()
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
- blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (metadata.scope.name).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting this can be useful to keep Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
Example:
```python
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_span(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
```
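For reference, here is a minimal sketch of configuring the client purely through the environment variables listed under Arguments above; the key values are illustrative placeholders, not real credentials:

```python
import os

from langfuse import Langfuse

# Illustrative placeholders; use your real project keys.
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-..."
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-..."
os.environ["LANGFUSE_BASE_URL"] = "https://cloud.langfuse.com"

# No constructor arguments needed: the client reads the variables above at init time.
langfuse = Langfuse()
```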
213 def __init__( 214 self, 215 *, 216 public_key: Optional[str] = None, 217 secret_key: Optional[str] = None, 218 base_url: Optional[str] = None, 219 host: Optional[str] = None, 220 timeout: Optional[int] = None, 221 httpx_client: Optional[httpx.Client] = None, 222 debug: bool = False, 223 tracing_enabled: Optional[bool] = True, 224 flush_at: Optional[int] = None, 225 flush_interval: Optional[float] = None, 226 environment: Optional[str] = None, 227 release: Optional[str] = None, 228 media_upload_thread_count: Optional[int] = None, 229 sample_rate: Optional[float] = None, 230 mask: Optional[MaskFunction] = None, 231 blocked_instrumentation_scopes: Optional[List[str]] = None, 232 additional_headers: Optional[Dict[str, str]] = None, 233 tracer_provider: Optional[TracerProvider] = None, 234 ): 235 self._base_url = ( 236 base_url 237 or os.environ.get(LANGFUSE_BASE_URL) 238 or host 239 or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com") 240 ) 241 self._environment = environment or cast( 242 str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT) 243 ) 244 self._project_id: Optional[str] = None 245 sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0)) 246 if not 0.0 <= sample_rate <= 1.0: 247 raise ValueError( 248 f"Sample rate must be between 0.0 and 1.0, got {sample_rate}" 249 ) 250 251 timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5)) 252 253 self._tracing_enabled = ( 254 tracing_enabled 255 and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false" 256 ) 257 if not self._tracing_enabled: 258 langfuse_logger.info( 259 "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API." 260 ) 261 262 debug = ( 263 debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true") 264 ) 265 if debug: 266 logging.basicConfig( 267 format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 268 ) 269 langfuse_logger.setLevel(logging.DEBUG) 270 271 public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY) 272 if public_key is None: 273 langfuse_logger.warning( 274 "Authentication error: Langfuse client initialized without public_key. Client will be disabled. " 275 "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. " 276 ) 277 self._otel_tracer = otel_trace_api.NoOpTracer() 278 return 279 280 secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY) 281 if secret_key is None: 282 langfuse_logger.warning( 283 "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. " 284 "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. " 285 ) 286 self._otel_tracer = otel_trace_api.NoOpTracer() 287 return 288 289 if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true": 290 langfuse_logger.warning( 291 "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI." 
292 ) 293 294 # Initialize api and tracer if requirements are met 295 self._resources = LangfuseResourceManager( 296 public_key=public_key, 297 secret_key=secret_key, 298 base_url=self._base_url, 299 timeout=timeout, 300 environment=self._environment, 301 release=release, 302 flush_at=flush_at, 303 flush_interval=flush_interval, 304 httpx_client=httpx_client, 305 media_upload_thread_count=media_upload_thread_count, 306 sample_rate=sample_rate, 307 mask=mask, 308 tracing_enabled=self._tracing_enabled, 309 blocked_instrumentation_scopes=blocked_instrumentation_scopes, 310 additional_headers=additional_headers, 311 tracer_provider=tracer_provider, 312 ) 313 self._mask = self._resources.mask 314 315 self._otel_tracer = ( 316 self._resources.tracer 317 if self._tracing_enabled and self._resources.tracer is not None 318 else otel_trace_api.NoOpTracer() 319 ) 320 self.api = self._resources.api 321 self.async_api = self._resources.async_api
323 def start_span( 324 self, 325 *, 326 trace_context: Optional[TraceContext] = None, 327 name: str, 328 input: Optional[Any] = None, 329 output: Optional[Any] = None, 330 metadata: Optional[Any] = None, 331 version: Optional[str] = None, 332 level: Optional[SpanLevel] = None, 333 status_message: Optional[str] = None, 334 ) -> LangfuseSpan: 335 """Create a new span for tracing a unit of work. 336 337 This method creates a new span but does not set it as the current span in the 338 context. To create and use a span within a context, use start_as_current_span(). 339 340 The created span will be the child of the current span in the context. 341 342 Args: 343 trace_context: Optional context for connecting to an existing trace 344 name: Name of the span (e.g., function or operation name) 345 input: Input data for the operation (can be any JSON-serializable object) 346 output: Output data from the operation (can be any JSON-serializable object) 347 metadata: Additional metadata to associate with the span 348 version: Version identifier for the code or component 349 level: Importance level of the span (info, warning, error) 350 status_message: Optional status message for the span 351 352 Returns: 353 A LangfuseSpan object that must be ended with .end() when the operation completes 354 355 Example: 356 ```python 357 span = langfuse.start_span(name="process-data") 358 try: 359 # Do work 360 span.update(output="result") 361 finally: 362 span.end() 363 ``` 364 """ 365 return self.start_observation( 366 trace_context=trace_context, 367 name=name, 368 as_type="span", 369 input=input, 370 output=output, 371 metadata=metadata, 372 version=version, 373 level=level, 374 status_message=status_message, 375 )
Create a new span for tracing a unit of work.
This method creates a new span but does not set it as the current span in the context. To create and use a span within a context, use start_as_current_span().
The created span will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
A LangfuseSpan object that must be ended with .end() when the operation completes
Example:
```python
span = langfuse.start_span(name="process-data")
try:
    # Do work
    span.update(output="result")
finally:
    span.end()
```
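As a hedged sketch of the trace_context argument (assuming you already hold a trace_id, and optionally a parent_span_id, propagated from elsewhere, e.g. another service), the new span can be attached to that existing trace; the ID values below are placeholders:

```python
# Placeholder IDs: substitute the values propagated from the upstream trace.
span = langfuse.start_span(
    name="process-data",
    trace_context={
        "trace_id": "abcdef1234567890abcdef1234567890",  # example 32-hex-char trace ID
        "parent_span_id": "abcdef1234567890",            # example 16-hex-char span ID
    },
)
try:
    span.update(output="result")
finally:
    span.end()
```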
377 def start_as_current_span( 378 self, 379 *, 380 trace_context: Optional[TraceContext] = None, 381 name: str, 382 input: Optional[Any] = None, 383 output: Optional[Any] = None, 384 metadata: Optional[Any] = None, 385 version: Optional[str] = None, 386 level: Optional[SpanLevel] = None, 387 status_message: Optional[str] = None, 388 end_on_exit: Optional[bool] = None, 389 ) -> _AgnosticContextManager[LangfuseSpan]: 390 """Create a new span and set it as the current span in a context manager. 391 392 This method creates a new span and sets it as the current span within a context 393 manager. Use this method with a 'with' statement to automatically handle span 394 lifecycle within a code block. 395 396 The created span will be the child of the current span in the context. 397 398 Args: 399 trace_context: Optional context for connecting to an existing trace 400 name: Name of the span (e.g., function or operation name) 401 input: Input data for the operation (can be any JSON-serializable object) 402 output: Output data from the operation (can be any JSON-serializable object) 403 metadata: Additional metadata to associate with the span 404 version: Version identifier for the code or component 405 level: Importance level of the span (info, warning, error) 406 status_message: Optional status message for the span 407 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 408 409 Returns: 410 A context manager that yields a LangfuseSpan 411 412 Example: 413 ```python 414 with langfuse.start_as_current_span(name="process-query") as span: 415 # Do work 416 result = process_data() 417 span.update(output=result) 418 419 # Create a child span automatically 420 with span.start_as_current_span(name="sub-operation") as child_span: 421 # Do sub-operation work 422 child_span.update(output="sub-result") 423 ``` 424 """ 425 return self.start_as_current_observation( 426 trace_context=trace_context, 427 name=name, 428 as_type="span", 429 input=input, 430 output=output, 431 metadata=metadata, 432 version=version, 433 level=level, 434 status_message=status_message, 435 end_on_exit=end_on_exit, 436 )
Create a new span and set it as the current span in a context manager.
This method creates a new span and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle span lifecycle within a code block.
The created span will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:
A context manager that yields a LangfuseSpan
Example:
```python
with langfuse.start_as_current_span(name="process-query") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")
```
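If the span needs to outlive the with block, end_on_exit=False can be passed as noted above; a minimal sketch, in which you are responsible for calling .end() yourself (process_data is a stand-in for your own code):

```python
# Keep the span open after the context manager exits (sketch).
with langfuse.start_as_current_span(name="process-query", end_on_exit=False) as span:
    result = process_data()  # stand-in for your own work
    span.update(output=result)

# ... later, once the work is truly finished:
span.end()
```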
585 def start_observation( 586 self, 587 *, 588 trace_context: Optional[TraceContext] = None, 589 name: str, 590 as_type: ObservationTypeLiteralNoEvent = "span", 591 input: Optional[Any] = None, 592 output: Optional[Any] = None, 593 metadata: Optional[Any] = None, 594 version: Optional[str] = None, 595 level: Optional[SpanLevel] = None, 596 status_message: Optional[str] = None, 597 completion_start_time: Optional[datetime] = None, 598 model: Optional[str] = None, 599 model_parameters: Optional[Dict[str, MapValue]] = None, 600 usage_details: Optional[Dict[str, int]] = None, 601 cost_details: Optional[Dict[str, float]] = None, 602 prompt: Optional[PromptClient] = None, 603 ) -> Union[ 604 LangfuseSpan, 605 LangfuseGeneration, 606 LangfuseAgent, 607 LangfuseTool, 608 LangfuseChain, 609 LangfuseRetriever, 610 LangfuseEvaluator, 611 LangfuseEmbedding, 612 LangfuseGuardrail, 613 ]: 614 """Create a new observation of the specified type. 615 616 This method creates a new observation but does not set it as the current span in the 617 context. To create and use an observation within a context, use start_as_current_observation(). 618 619 Args: 620 trace_context: Optional context for connecting to an existing trace 621 name: Name of the observation 622 as_type: Type of observation to create (defaults to "span") 623 input: Input data for the operation 624 output: Output data from the operation 625 metadata: Additional metadata to associate with the observation 626 version: Version identifier for the code or component 627 level: Importance level of the observation 628 status_message: Optional status message for the observation 629 completion_start_time: When the model started generating (for generation types) 630 model: Name/identifier of the AI model used (for generation types) 631 model_parameters: Parameters used for the model (for generation types) 632 usage_details: Token usage information (for generation types) 633 cost_details: Cost information (for generation types) 634 prompt: Associated prompt template (for generation types) 635 636 Returns: 637 An observation object of the appropriate type that must be ended with .end() 638 """ 639 if trace_context: 640 trace_id = trace_context.get("trace_id", None) 641 parent_span_id = trace_context.get("parent_span_id", None) 642 643 if trace_id: 644 remote_parent_span = self._create_remote_parent_span( 645 trace_id=trace_id, parent_span_id=parent_span_id 646 ) 647 648 with otel_trace_api.use_span( 649 cast(otel_trace_api.Span, remote_parent_span) 650 ): 651 otel_span = self._otel_tracer.start_span(name=name) 652 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 653 654 return self._create_observation_from_otel_span( 655 otel_span=otel_span, 656 as_type=as_type, 657 input=input, 658 output=output, 659 metadata=metadata, 660 version=version, 661 level=level, 662 status_message=status_message, 663 completion_start_time=completion_start_time, 664 model=model, 665 model_parameters=model_parameters, 666 usage_details=usage_details, 667 cost_details=cost_details, 668 prompt=prompt, 669 ) 670 671 otel_span = self._otel_tracer.start_span(name=name) 672 673 return self._create_observation_from_otel_span( 674 otel_span=otel_span, 675 as_type=as_type, 676 input=input, 677 output=output, 678 metadata=metadata, 679 version=version, 680 level=level, 681 status_message=status_message, 682 completion_start_time=completion_start_time, 683 model=model, 684 model_parameters=model_parameters, 685 usage_details=usage_details, 686 cost_details=cost_details, 687 
prompt=prompt, 688 )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
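The rendered docstring above carries no usage example; here is a minimal sketch of start_observation using two of the as_type values listed in the return type (search_web is a stand-in for your own tool call, and the usage numbers are illustrative):

```python
# Create a tool observation; it must be ended manually.
tool = langfuse.start_observation(name="web-search", as_type="tool")
try:
    results = search_web(query)  # stand-in for your own tool call
    tool.update(output=results)
finally:
    tool.end()

# Create a generation observation with model details.
generation = langfuse.start_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
)
try:
    generation.update(
        output="...",
        usage_details={"prompt_tokens": 10, "completion_tokens": 50},
    )
finally:
    generation.end()
```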
760 def start_generation( 761 self, 762 *, 763 trace_context: Optional[TraceContext] = None, 764 name: str, 765 input: Optional[Any] = None, 766 output: Optional[Any] = None, 767 metadata: Optional[Any] = None, 768 version: Optional[str] = None, 769 level: Optional[SpanLevel] = None, 770 status_message: Optional[str] = None, 771 completion_start_time: Optional[datetime] = None, 772 model: Optional[str] = None, 773 model_parameters: Optional[Dict[str, MapValue]] = None, 774 usage_details: Optional[Dict[str, int]] = None, 775 cost_details: Optional[Dict[str, float]] = None, 776 prompt: Optional[PromptClient] = None, 777 ) -> LangfuseGeneration: 778 """Create a new generation span for model generations. 779 780 DEPRECATED: This method is deprecated and will be removed in a future version. 781 Use start_observation(as_type='generation') instead. 782 783 This method creates a specialized span for tracking model generations. 784 It includes additional fields specific to model generations such as model name, 785 token usage, and cost details. 786 787 The created generation span will be the child of the current span in the context. 788 789 Args: 790 trace_context: Optional context for connecting to an existing trace 791 name: Name of the generation operation 792 input: Input data for the model (e.g., prompts) 793 output: Output from the model (e.g., completions) 794 metadata: Additional metadata to associate with the generation 795 version: Version identifier for the model or component 796 level: Importance level of the generation (info, warning, error) 797 status_message: Optional status message for the generation 798 completion_start_time: When the model started generating the response 799 model: Name/identifier of the AI model used (e.g., "gpt-4") 800 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 801 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 802 cost_details: Cost information for the model call 803 prompt: Associated prompt template from Langfuse prompt management 804 805 Returns: 806 A LangfuseGeneration object that must be ended with .end() when complete 807 808 Example: 809 ```python 810 generation = langfuse.start_generation( 811 name="answer-generation", 812 model="gpt-4", 813 input={"prompt": "Explain quantum computing"}, 814 model_parameters={"temperature": 0.7} 815 ) 816 try: 817 # Call model API 818 response = llm.generate(...) 819 820 generation.update( 821 output=response.text, 822 usage_details={ 823 "prompt_tokens": response.usage.prompt_tokens, 824 "completion_tokens": response.usage.completion_tokens 825 } 826 ) 827 finally: 828 generation.end() 829 ``` 830 """ 831 warnings.warn( 832 "start_generation is deprecated and will be removed in a future version. " 833 "Use start_observation(as_type='generation') instead.", 834 DeprecationWarning, 835 stacklevel=2, 836 ) 837 return self.start_observation( 838 trace_context=trace_context, 839 name=name, 840 as_type="generation", 841 input=input, 842 output=output, 843 metadata=metadata, 844 version=version, 845 level=level, 846 status_message=status_message, 847 completion_start_time=completion_start_time, 848 model=model, 849 model_parameters=model_parameters, 850 usage_details=usage_details, 851 cost_details=cost_details, 852 prompt=prompt, 853 )
Create a new generation span for model generations.
DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.
This method creates a specialized span for tracking model generations. It includes additional fields specific to model generations such as model name, token usage, and cost details.
The created generation span will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the generation operation
- input: Input data for the model (e.g., prompts)
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A LangfuseGeneration object that must be ended with .end() when complete
Example:
```python
generation = langfuse.start_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7}
)
try:
    # Call model API
    response = llm.generate(...)

    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
finally:
    generation.end()
```
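Since start_generation is deprecated, the same call can be written with start_observation(as_type="generation"), as the deprecation notice above recommends; a sketch of the equivalent, non-deprecated form (llm.generate remains a placeholder for your own model call):

```python
generation = langfuse.start_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7},
)
try:
    response = llm.generate(...)  # placeholder for your model call
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
        },
    )
finally:
    generation.end()
```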
855 def start_as_current_generation( 856 self, 857 *, 858 trace_context: Optional[TraceContext] = None, 859 name: str, 860 input: Optional[Any] = None, 861 output: Optional[Any] = None, 862 metadata: Optional[Any] = None, 863 version: Optional[str] = None, 864 level: Optional[SpanLevel] = None, 865 status_message: Optional[str] = None, 866 completion_start_time: Optional[datetime] = None, 867 model: Optional[str] = None, 868 model_parameters: Optional[Dict[str, MapValue]] = None, 869 usage_details: Optional[Dict[str, int]] = None, 870 cost_details: Optional[Dict[str, float]] = None, 871 prompt: Optional[PromptClient] = None, 872 end_on_exit: Optional[bool] = None, 873 ) -> _AgnosticContextManager[LangfuseGeneration]: 874 """Create a new generation span and set it as the current span in a context manager. 875 876 DEPRECATED: This method is deprecated and will be removed in a future version. 877 Use start_as_current_observation(as_type='generation') instead. 878 879 This method creates a specialized span for model generations and sets it as the 880 current span within a context manager. Use this method with a 'with' statement to 881 automatically handle the generation span lifecycle within a code block. 882 883 The created generation span will be the child of the current span in the context. 884 885 Args: 886 trace_context: Optional context for connecting to an existing trace 887 name: Name of the generation operation 888 input: Input data for the model (e.g., prompts) 889 output: Output from the model (e.g., completions) 890 metadata: Additional metadata to associate with the generation 891 version: Version identifier for the model or component 892 level: Importance level of the generation (info, warning, error) 893 status_message: Optional status message for the generation 894 completion_start_time: When the model started generating the response 895 model: Name/identifier of the AI model used (e.g., "gpt-4") 896 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 897 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 898 cost_details: Cost information for the model call 899 prompt: Associated prompt template from Langfuse prompt management 900 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 901 902 Returns: 903 A context manager that yields a LangfuseGeneration 904 905 Example: 906 ```python 907 with langfuse.start_as_current_generation( 908 name="answer-generation", 909 model="gpt-4", 910 input={"prompt": "Explain quantum computing"} 911 ) as generation: 912 # Call model API 913 response = llm.generate(...) 914 915 # Update with results 916 generation.update( 917 output=response.text, 918 usage_details={ 919 "prompt_tokens": response.usage.prompt_tokens, 920 "completion_tokens": response.usage.completion_tokens 921 } 922 ) 923 ``` 924 """ 925 warnings.warn( 926 "start_as_current_generation is deprecated and will be removed in a future version. 
" 927 "Use start_as_current_observation(as_type='generation') instead.", 928 DeprecationWarning, 929 stacklevel=2, 930 ) 931 return self.start_as_current_observation( 932 trace_context=trace_context, 933 name=name, 934 as_type="generation", 935 input=input, 936 output=output, 937 metadata=metadata, 938 version=version, 939 level=level, 940 status_message=status_message, 941 completion_start_time=completion_start_time, 942 model=model, 943 model_parameters=model_parameters, 944 usage_details=usage_details, 945 cost_details=cost_details, 946 prompt=prompt, 947 end_on_exit=end_on_exit, 948 )
Create a new generation span and set it as the current span in a context manager.
DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.
This method creates a specialized span for model generations and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the generation span lifecycle within a code block.
The created generation span will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the generation operation
- input: Input data for the model (e.g., prompts)
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:
A context manager that yields a LangfuseGeneration
Example:
```python
with langfuse.start_as_current_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"}
) as generation:
    # Call model API
    response = llm.generate(...)

    # Update with results
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
```
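As with start_generation, the non-deprecated equivalent uses start_as_current_observation(as_type="generation"); a sketch mirroring the example above (llm.generate is again a placeholder):

```python
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
) as generation:
    response = llm.generate(...)  # placeholder for your model call
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
        },
    )
```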
1106 def start_as_current_observation( 1107 self, 1108 *, 1109 trace_context: Optional[TraceContext] = None, 1110 name: str, 1111 as_type: ObservationTypeLiteralNoEvent = "span", 1112 input: Optional[Any] = None, 1113 output: Optional[Any] = None, 1114 metadata: Optional[Any] = None, 1115 version: Optional[str] = None, 1116 level: Optional[SpanLevel] = None, 1117 status_message: Optional[str] = None, 1118 completion_start_time: Optional[datetime] = None, 1119 model: Optional[str] = None, 1120 model_parameters: Optional[Dict[str, MapValue]] = None, 1121 usage_details: Optional[Dict[str, int]] = None, 1122 cost_details: Optional[Dict[str, float]] = None, 1123 prompt: Optional[PromptClient] = None, 1124 end_on_exit: Optional[bool] = None, 1125 ) -> Union[ 1126 _AgnosticContextManager[LangfuseGeneration], 1127 _AgnosticContextManager[LangfuseSpan], 1128 _AgnosticContextManager[LangfuseAgent], 1129 _AgnosticContextManager[LangfuseTool], 1130 _AgnosticContextManager[LangfuseChain], 1131 _AgnosticContextManager[LangfuseRetriever], 1132 _AgnosticContextManager[LangfuseEvaluator], 1133 _AgnosticContextManager[LangfuseEmbedding], 1134 _AgnosticContextManager[LangfuseGuardrail], 1135 ]: 1136 """Create a new observation and set it as the current span in a context manager. 1137 1138 This method creates a new observation of the specified type and sets it as the 1139 current span within a context manager. Use this method with a 'with' statement to 1140 automatically handle the observation lifecycle within a code block. 1141 1142 The created observation will be the child of the current span in the context. 1143 1144 Args: 1145 trace_context: Optional context for connecting to an existing trace 1146 name: Name of the observation (e.g., function or operation name) 1147 as_type: Type of observation to create (defaults to "span") 1148 input: Input data for the operation (can be any JSON-serializable object) 1149 output: Output data from the operation (can be any JSON-serializable object) 1150 metadata: Additional metadata to associate with the observation 1151 version: Version identifier for the code or component 1152 level: Importance level of the observation (info, warning, error) 1153 status_message: Optional status message for the observation 1154 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 1155 1156 The following parameters are available when as_type is: "generation" or "embedding". 
1157 completion_start_time: When the model started generating the response 1158 model: Name/identifier of the AI model used (e.g., "gpt-4") 1159 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1160 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1161 cost_details: Cost information for the model call 1162 prompt: Associated prompt template from Langfuse prompt management 1163 1164 Returns: 1165 A context manager that yields the appropriate observation type based on as_type 1166 1167 Example: 1168 ```python 1169 # Create a span 1170 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 1171 # Do work 1172 result = process_data() 1173 span.update(output=result) 1174 1175 # Create a child span automatically 1176 with span.start_as_current_span(name="sub-operation") as child_span: 1177 # Do sub-operation work 1178 child_span.update(output="sub-result") 1179 1180 # Create a tool observation 1181 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 1182 # Do tool work 1183 results = search_web(query) 1184 tool.update(output=results) 1185 1186 # Create a generation observation 1187 with langfuse.start_as_current_observation( 1188 name="answer-generation", 1189 as_type="generation", 1190 model="gpt-4" 1191 ) as generation: 1192 # Generate answer 1193 response = llm.generate(...) 1194 generation.update(output=response) 1195 ``` 1196 """ 1197 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 1198 if trace_context: 1199 trace_id = trace_context.get("trace_id", None) 1200 parent_span_id = trace_context.get("parent_span_id", None) 1201 1202 if trace_id: 1203 remote_parent_span = self._create_remote_parent_span( 1204 trace_id=trace_id, parent_span_id=parent_span_id 1205 ) 1206 1207 return cast( 1208 Union[ 1209 _AgnosticContextManager[LangfuseGeneration], 1210 _AgnosticContextManager[LangfuseEmbedding], 1211 ], 1212 self._create_span_with_parent_context( 1213 as_type=as_type, 1214 name=name, 1215 remote_parent_span=remote_parent_span, 1216 parent=None, 1217 end_on_exit=end_on_exit, 1218 input=input, 1219 output=output, 1220 metadata=metadata, 1221 version=version, 1222 level=level, 1223 status_message=status_message, 1224 completion_start_time=completion_start_time, 1225 model=model, 1226 model_parameters=model_parameters, 1227 usage_details=usage_details, 1228 cost_details=cost_details, 1229 prompt=prompt, 1230 ), 1231 ) 1232 1233 return cast( 1234 Union[ 1235 _AgnosticContextManager[LangfuseGeneration], 1236 _AgnosticContextManager[LangfuseEmbedding], 1237 ], 1238 self._start_as_current_otel_span_with_processed_media( 1239 as_type=as_type, 1240 name=name, 1241 end_on_exit=end_on_exit, 1242 input=input, 1243 output=output, 1244 metadata=metadata, 1245 version=version, 1246 level=level, 1247 status_message=status_message, 1248 completion_start_time=completion_start_time, 1249 model=model, 1250 model_parameters=model_parameters, 1251 usage_details=usage_details, 1252 cost_details=cost_details, 1253 prompt=prompt, 1254 ), 1255 ) 1256 1257 if as_type in get_observation_types_list(ObservationTypeSpanLike): 1258 if trace_context: 1259 trace_id = trace_context.get("trace_id", None) 1260 parent_span_id = trace_context.get("parent_span_id", None) 1261 1262 if trace_id: 1263 remote_parent_span = self._create_remote_parent_span( 1264 trace_id=trace_id, parent_span_id=parent_span_id 1265 ) 1266 1267 return cast( 1268 Union[ 1269 
_AgnosticContextManager[LangfuseSpan], 1270 _AgnosticContextManager[LangfuseAgent], 1271 _AgnosticContextManager[LangfuseTool], 1272 _AgnosticContextManager[LangfuseChain], 1273 _AgnosticContextManager[LangfuseRetriever], 1274 _AgnosticContextManager[LangfuseEvaluator], 1275 _AgnosticContextManager[LangfuseGuardrail], 1276 ], 1277 self._create_span_with_parent_context( 1278 as_type=as_type, 1279 name=name, 1280 remote_parent_span=remote_parent_span, 1281 parent=None, 1282 end_on_exit=end_on_exit, 1283 input=input, 1284 output=output, 1285 metadata=metadata, 1286 version=version, 1287 level=level, 1288 status_message=status_message, 1289 ), 1290 ) 1291 1292 return cast( 1293 Union[ 1294 _AgnosticContextManager[LangfuseSpan], 1295 _AgnosticContextManager[LangfuseAgent], 1296 _AgnosticContextManager[LangfuseTool], 1297 _AgnosticContextManager[LangfuseChain], 1298 _AgnosticContextManager[LangfuseRetriever], 1299 _AgnosticContextManager[LangfuseEvaluator], 1300 _AgnosticContextManager[LangfuseGuardrail], 1301 ], 1302 self._start_as_current_otel_span_with_processed_media( 1303 as_type=as_type, 1304 name=name, 1305 end_on_exit=end_on_exit, 1306 input=input, 1307 output=output, 1308 metadata=metadata, 1309 version=version, 1310 level=level, 1311 status_message=status_message, 1312 ), 1313 ) 1314 1315 # This should never be reached since all valid types are handled above 1316 langfuse_logger.warning( 1317 f"Unknown observation type: {as_type}, falling back to span" 1318 ) 1319 return self._start_as_current_otel_span_with_processed_media( 1320 as_type="span", 1321 name=name, 1322 end_on_exit=end_on_exit, 1323 input=input, 1324 output=output, 1325 metadata=metadata, 1326 version=version, 1327 level=level, 1328 status_message=status_message, 1329 )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
- The following parameters apply only when as_type is "generation" or "embedding":
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
```python
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
```
1490 def update_current_generation( 1491 self, 1492 *, 1493 name: Optional[str] = None, 1494 input: Optional[Any] = None, 1495 output: Optional[Any] = None, 1496 metadata: Optional[Any] = None, 1497 version: Optional[str] = None, 1498 level: Optional[SpanLevel] = None, 1499 status_message: Optional[str] = None, 1500 completion_start_time: Optional[datetime] = None, 1501 model: Optional[str] = None, 1502 model_parameters: Optional[Dict[str, MapValue]] = None, 1503 usage_details: Optional[Dict[str, int]] = None, 1504 cost_details: Optional[Dict[str, float]] = None, 1505 prompt: Optional[PromptClient] = None, 1506 ) -> None: 1507 """Update the current active generation span with new information. 1508 1509 This method updates the current generation span in the active context with 1510 additional information. It's useful for adding output, usage stats, or other 1511 details that become available during or after model generation. 1512 1513 Args: 1514 name: The generation name 1515 input: Updated input data for the model 1516 output: Output from the model (e.g., completions) 1517 metadata: Additional metadata to associate with the generation 1518 version: Version identifier for the model or component 1519 level: Importance level of the generation (info, warning, error) 1520 status_message: Optional status message for the generation 1521 completion_start_time: When the model started generating the response 1522 model: Name/identifier of the AI model used (e.g., "gpt-4") 1523 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1524 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1525 cost_details: Cost information for the model call 1526 prompt: Associated prompt template from Langfuse prompt management 1527 1528 Example: 1529 ```python 1530 with langfuse.start_as_current_generation(name="answer-query") as generation: 1531 # Initial setup and API call 1532 response = llm.generate(...) 1533 1534 # Update with results that weren't available at creation time 1535 langfuse.update_current_generation( 1536 output=response.text, 1537 usage_details={ 1538 "prompt_tokens": response.usage.prompt_tokens, 1539 "completion_tokens": response.usage.completion_tokens 1540 } 1541 ) 1542 ``` 1543 """ 1544 if not self._tracing_enabled: 1545 langfuse_logger.debug( 1546 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1547 ) 1548 return 1549 1550 current_otel_span = self._get_current_otel_span() 1551 1552 if current_otel_span is not None: 1553 generation = LangfuseGeneration( 1554 otel_span=current_otel_span, langfuse_client=self 1555 ) 1556 1557 if name: 1558 current_otel_span.update_name(name) 1559 1560 generation.update( 1561 input=input, 1562 output=output, 1563 metadata=metadata, 1564 version=version, 1565 level=level, 1566 status_message=status_message, 1567 completion_start_time=completion_start_time, 1568 model=model, 1569 model_parameters=model_parameters, 1570 usage_details=usage_details, 1571 cost_details=cost_details, 1572 prompt=prompt, 1573 )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
```
1575 def update_current_span( 1576 self, 1577 *, 1578 name: Optional[str] = None, 1579 input: Optional[Any] = None, 1580 output: Optional[Any] = None, 1581 metadata: Optional[Any] = None, 1582 version: Optional[str] = None, 1583 level: Optional[SpanLevel] = None, 1584 status_message: Optional[str] = None, 1585 ) -> None: 1586 """Update the current active span with new information. 1587 1588 This method updates the current span in the active context with 1589 additional information. It's useful for adding outputs or metadata 1590 that become available during execution. 1591 1592 Args: 1593 name: The span name 1594 input: Updated input data for the operation 1595 output: Output data from the operation 1596 metadata: Additional metadata to associate with the span 1597 version: Version identifier for the code or component 1598 level: Importance level of the span (info, warning, error) 1599 status_message: Optional status message for the span 1600 1601 Example: 1602 ```python 1603 with langfuse.start_as_current_span(name="process-data") as span: 1604 # Initial processing 1605 result = process_first_part() 1606 1607 # Update with intermediate results 1608 langfuse.update_current_span(metadata={"intermediate_result": result}) 1609 1610 # Continue processing 1611 final_result = process_second_part(result) 1612 1613 # Final update 1614 langfuse.update_current_span(output=final_result) 1615 ``` 1616 """ 1617 if not self._tracing_enabled: 1618 langfuse_logger.debug( 1619 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1620 ) 1621 return 1622 1623 current_otel_span = self._get_current_otel_span() 1624 1625 if current_otel_span is not None: 1626 span = LangfuseSpan( 1627 otel_span=current_otel_span, 1628 langfuse_client=self, 1629 environment=self._environment, 1630 ) 1631 1632 if name: 1633 current_otel_span.update_name(name) 1634 1635 span.update( 1636 input=input, 1637 output=output, 1638 metadata=metadata, 1639 version=version, 1640 level=level, 1641 status_message=status_message, 1642 )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
```python
with langfuse.start_as_current_span(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
```
1644 def update_current_trace( 1645 self, 1646 *, 1647 name: Optional[str] = None, 1648 user_id: Optional[str] = None, 1649 session_id: Optional[str] = None, 1650 version: Optional[str] = None, 1651 input: Optional[Any] = None, 1652 output: Optional[Any] = None, 1653 metadata: Optional[Any] = None, 1654 tags: Optional[List[str]] = None, 1655 public: Optional[bool] = None, 1656 ) -> None: 1657 """Update the current trace with additional information. 1658 1659 Args: 1660 name: Updated name for the Langfuse trace 1661 user_id: ID of the user who initiated the Langfuse trace 1662 session_id: Session identifier for grouping related Langfuse traces 1663 version: Version identifier for the application or service 1664 input: Input data for the overall Langfuse trace 1665 output: Output data from the overall Langfuse trace 1666 metadata: Additional metadata to associate with the Langfuse trace 1667 tags: List of tags to categorize the Langfuse trace 1668 public: Whether the Langfuse trace should be publicly accessible 1669 1670 See Also: 1671 :func:`langfuse.propagate_attributes`: Recommended replacement 1672 """ 1673 if not self._tracing_enabled: 1674 langfuse_logger.debug( 1675 "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode." 1676 ) 1677 return 1678 1679 current_otel_span = self._get_current_otel_span() 1680 1681 if current_otel_span is not None and current_otel_span.is_recording(): 1682 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1683 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1684 ) 1685 # We need to preserve the class to keep the correct observation type 1686 span_class = self._get_span_class(existing_observation_type) 1687 span = span_class( 1688 otel_span=current_otel_span, 1689 langfuse_client=self, 1690 environment=self._environment, 1691 ) 1692 1693 span.update_trace( 1694 name=name, 1695 user_id=user_id, 1696 session_id=session_id, 1697 version=version, 1698 input=input, 1699 output=output, 1700 metadata=metadata, 1701 tags=tags, 1702 public=public, 1703 )
Update the current trace with additional information.
Arguments:
- name: Updated name for the Langfuse trace
- user_id: ID of the user who initiated the Langfuse trace
- session_id: Session identifier for grouping related Langfuse traces
- version: Version identifier for the application or service
- input: Input data for the overall Langfuse trace
- output: Output data from the overall Langfuse trace
- metadata: Additional metadata to associate with the Langfuse trace
- tags: List of tags to categorize the Langfuse trace
- public: Whether the Langfuse trace should be publicly accessible
See Also:
langfuse.propagate_attributes(): Recommended replacement
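Example (illustrative sketch not taken from the source docstring; the span name, user and session identifiers, and handle_request function are placeholders):
```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="handle-chat-request") as span:
    # Attach trace-level attributes that become known during processing
    langfuse.update_current_trace(
        name="chat-request",
        user_id="user-123",        # placeholder user identifier
        session_id="session-abc",  # placeholder session identifier
        tags=["chat", "production"],
        metadata={"client": "web"},
    )

    result = handle_request()  # placeholder for application logic
    span.update(output=result)
```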
1705 def create_event( 1706 self, 1707 *, 1708 trace_context: Optional[TraceContext] = None, 1709 name: str, 1710 input: Optional[Any] = None, 1711 output: Optional[Any] = None, 1712 metadata: Optional[Any] = None, 1713 version: Optional[str] = None, 1714 level: Optional[SpanLevel] = None, 1715 status_message: Optional[str] = None, 1716 ) -> LangfuseEvent: 1717 """Create a new Langfuse observation of type 'EVENT'. 1718 1719 The created Langfuse Event observation will be the child of the current span in the context. 1720 1721 Args: 1722 trace_context: Optional context for connecting to an existing trace 1723 name: Name of the span (e.g., function or operation name) 1724 input: Input data for the operation (can be any JSON-serializable object) 1725 output: Output data from the operation (can be any JSON-serializable object) 1726 metadata: Additional metadata to associate with the span 1727 version: Version identifier for the code or component 1728 level: Importance level of the span (info, warning, error) 1729 status_message: Optional status message for the span 1730 1731 Returns: 1732 The Langfuse Event object 1733 1734 Example: 1735 ```python 1736 event = langfuse.create_event(name="process-event") 1737 ``` 1738 """ 1739 timestamp = time_ns() 1740 1741 if trace_context: 1742 trace_id = trace_context.get("trace_id", None) 1743 parent_span_id = trace_context.get("parent_span_id", None) 1744 1745 if trace_id: 1746 remote_parent_span = self._create_remote_parent_span( 1747 trace_id=trace_id, parent_span_id=parent_span_id 1748 ) 1749 1750 with otel_trace_api.use_span( 1751 cast(otel_trace_api.Span, remote_parent_span) 1752 ): 1753 otel_span = self._otel_tracer.start_span( 1754 name=name, start_time=timestamp 1755 ) 1756 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1757 1758 return cast( 1759 LangfuseEvent, 1760 LangfuseEvent( 1761 otel_span=otel_span, 1762 langfuse_client=self, 1763 environment=self._environment, 1764 input=input, 1765 output=output, 1766 metadata=metadata, 1767 version=version, 1768 level=level, 1769 status_message=status_message, 1770 ).end(end_time=timestamp), 1771 ) 1772 1773 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1774 1775 return cast( 1776 LangfuseEvent, 1777 LangfuseEvent( 1778 otel_span=otel_span, 1779 langfuse_client=self, 1780 environment=self._environment, 1781 input=input, 1782 output=output, 1783 metadata=metadata, 1784 version=version, 1785 level=level, 1786 status_message=status_message, 1787 ).end(end_time=timestamp), 1788 )
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
```python
event = langfuse.create_event(name="process-event")
```
1877 @staticmethod 1878 def create_trace_id(*, seed: Optional[str] = None) -> str: 1879 """Create a unique trace ID for use with Langfuse. 1880 1881 This method generates a unique trace ID for use with various Langfuse APIs. 1882 It can either generate a random ID or create a deterministic ID based on 1883 a seed string. 1884 1885 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1886 This method ensures the generated ID meets this requirement. If you need to 1887 correlate an external ID with a Langfuse trace ID, use the external ID as the 1888 seed to get a valid, deterministic Langfuse trace ID. 1889 1890 Args: 1891 seed: Optional string to use as a seed for deterministic ID generation. 1892 If provided, the same seed will always produce the same ID. 1893 If not provided, a random ID will be generated. 1894 1895 Returns: 1896 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1897 1898 Example: 1899 ```python 1900 # Generate a random trace ID 1901 trace_id = langfuse.create_trace_id() 1902 1903 # Generate a deterministic ID based on a seed 1904 session_trace_id = langfuse.create_trace_id(seed="session-456") 1905 1906 # Correlate an external ID with a Langfuse trace ID 1907 external_id = "external-system-123456" 1908 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1909 1910 # Use the ID with trace context 1911 with langfuse.start_as_current_span( 1912 name="process-request", 1913 trace_context={"trace_id": trace_id} 1914 ) as span: 1915 # Operation will be part of the specific trace 1916 pass 1917 ``` 1918 """ 1919 if not seed: 1920 trace_id_int = RandomIdGenerator().generate_trace_id() 1921 1922 return Langfuse._format_otel_trace_id(trace_id_int) 1923 1924 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
```python
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_span(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
```
2002 def create_score( 2003 self, 2004 *, 2005 name: str, 2006 value: Union[float, str], 2007 session_id: Optional[str] = None, 2008 dataset_run_id: Optional[str] = None, 2009 trace_id: Optional[str] = None, 2010 observation_id: Optional[str] = None, 2011 score_id: Optional[str] = None, 2012 data_type: Optional[ScoreDataType] = None, 2013 comment: Optional[str] = None, 2014 config_id: Optional[str] = None, 2015 metadata: Optional[Any] = None, 2016 timestamp: Optional[datetime] = None, 2017 ) -> None: 2018 """Create a score for a specific trace or observation. 2019 2020 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 2021 used to track quality metrics, user feedback, or automated evaluations. 2022 2023 Args: 2024 name: Name of the score (e.g., "relevance", "accuracy") 2025 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2026 session_id: ID of the Langfuse session to associate the score with 2027 dataset_run_id: ID of the Langfuse dataset run to associate the score with 2028 trace_id: ID of the Langfuse trace to associate the score with 2029 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 2030 score_id: Optional custom ID for the score (auto-generated if not provided) 2031 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2032 comment: Optional comment or explanation for the score 2033 config_id: Optional ID of a score config defined in Langfuse 2034 metadata: Optional metadata to be attached to the score 2035 timestamp: Optional timestamp for the score (defaults to current UTC time) 2036 2037 Example: 2038 ```python 2039 # Create a numeric score for accuracy 2040 langfuse.create_score( 2041 name="accuracy", 2042 value=0.92, 2043 trace_id="abcdef1234567890abcdef1234567890", 2044 data_type="NUMERIC", 2045 comment="High accuracy with minor irrelevant details" 2046 ) 2047 2048 # Create a categorical score for sentiment 2049 langfuse.create_score( 2050 name="sentiment", 2051 value="positive", 2052 trace_id="abcdef1234567890abcdef1234567890", 2053 observation_id="abcdef1234567890", 2054 data_type="CATEGORICAL" 2055 ) 2056 ``` 2057 """ 2058 if not self._tracing_enabled: 2059 return 2060 2061 score_id = score_id or self._create_observation_id() 2062 2063 try: 2064 new_body = ScoreBody( 2065 id=score_id, 2066 sessionId=session_id, 2067 datasetRunId=dataset_run_id, 2068 traceId=trace_id, 2069 observationId=observation_id, 2070 name=name, 2071 value=value, 2072 dataType=data_type, # type: ignore 2073 comment=comment, 2074 configId=config_id, 2075 environment=self._environment, 2076 metadata=metadata, 2077 ) 2078 2079 event = { 2080 "id": self.create_trace_id(), 2081 "type": "score-create", 2082 "timestamp": timestamp or _get_timestamp(), 2083 "body": new_body, 2084 } 2085 2086 if self._resources is not None: 2087 # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar 2088 force_sample = ( 2089 not self._is_valid_trace_id(trace_id) if trace_id else True 2090 ) 2091 2092 self._resources.add_score_task( 2093 event, 2094 force_sample=force_sample, 2095 ) 2096 2097 except Exception as e: 2098 langfuse_logger.exception( 2099 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}" 2100 )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
```python
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
```
2126 def score_current_span( 2127 self, 2128 *, 2129 name: str, 2130 value: Union[float, str], 2131 score_id: Optional[str] = None, 2132 data_type: Optional[ScoreDataType] = None, 2133 comment: Optional[str] = None, 2134 config_id: Optional[str] = None, 2135 ) -> None: 2136 """Create a score for the current active span. 2137 2138 This method scores the currently active span in the context. It's a convenient 2139 way to score the current operation without needing to know its trace and span IDs. 2140 2141 Args: 2142 name: Name of the score (e.g., "relevance", "accuracy") 2143 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2144 score_id: Optional custom ID for the score (auto-generated if not provided) 2145 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2146 comment: Optional comment or explanation for the score 2147 config_id: Optional ID of a score config defined in Langfuse 2148 2149 Example: 2150 ```python 2151 with langfuse.start_as_current_generation(name="answer-query") as generation: 2152 # Generate answer 2153 response = generate_answer(...) 2154 generation.update(output=response) 2155 2156 # Score the generation 2157 langfuse.score_current_span( 2158 name="relevance", 2159 value=0.85, 2160 data_type="NUMERIC", 2161 comment="Mostly relevant but contains some tangential information" 2162 ) 2163 ``` 2164 """ 2165 current_span = self._get_current_otel_span() 2166 2167 if current_span is not None: 2168 trace_id = self._get_otel_trace_id(current_span) 2169 observation_id = self._get_otel_span_id(current_span) 2170 2171 langfuse_logger.info( 2172 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 2173 ) 2174 2175 self.create_score( 2176 trace_id=trace_id, 2177 observation_id=observation_id, 2178 name=name, 2179 value=cast(str, value), 2180 score_id=score_id, 2181 data_type=cast(Literal["CATEGORICAL"], data_type), 2182 comment=comment, 2183 config_id=config_id, 2184 )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information"
    )
```
2210 def score_current_trace( 2211 self, 2212 *, 2213 name: str, 2214 value: Union[float, str], 2215 score_id: Optional[str] = None, 2216 data_type: Optional[ScoreDataType] = None, 2217 comment: Optional[str] = None, 2218 config_id: Optional[str] = None, 2219 ) -> None: 2220 """Create a score for the current trace. 2221 2222 This method scores the trace of the currently active span. Unlike score_current_span, 2223 this method associates the score with the entire trace rather than a specific span. 2224 It's useful for scoring overall performance or quality of the entire operation. 2225 2226 Args: 2227 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2228 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2229 score_id: Optional custom ID for the score (auto-generated if not provided) 2230 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2231 comment: Optional comment or explanation for the score 2232 config_id: Optional ID of a score config defined in Langfuse 2233 2234 Example: 2235 ```python 2236 with langfuse.start_as_current_span(name="process-user-request") as span: 2237 # Process request 2238 result = process_complete_request() 2239 span.update(output=result) 2240 2241 # Score the overall trace 2242 langfuse.score_current_trace( 2243 name="overall_quality", 2244 value=0.95, 2245 data_type="NUMERIC", 2246 comment="High quality end-to-end response" 2247 ) 2248 ``` 2249 """ 2250 current_span = self._get_current_otel_span() 2251 2252 if current_span is not None: 2253 trace_id = self._get_otel_trace_id(current_span) 2254 2255 langfuse_logger.info( 2256 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2257 ) 2258 2259 self.create_score( 2260 trace_id=trace_id, 2261 name=name, 2262 value=cast(str, value), 2263 score_id=score_id, 2264 data_type=cast(Literal["CATEGORICAL"], data_type), 2265 comment=comment, 2266 config_id=config_id, 2267 )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
Example:
```python
with langfuse.start_as_current_span(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response"
    )
```
2269 def flush(self) -> None: 2270 """Force flush all pending spans and events to the Langfuse API. 2271 2272 This method manually flushes any pending spans, scores, and other events to the 2273 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2274 before proceeding, without waiting for the automatic flush interval. 2275 2276 Example: 2277 ```python 2278 # Record some spans and scores 2279 with langfuse.start_as_current_span(name="operation") as span: 2280 # Do work... 2281 pass 2282 2283 # Ensure all data is sent to Langfuse before proceeding 2284 langfuse.flush() 2285 2286 # Continue with other work 2287 ``` 2288 """ 2289 if self._resources is not None: 2290 self._resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
```python
# Record some spans and scores
with langfuse.start_as_current_span(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
```
2292 def shutdown(self) -> None: 2293 """Shut down the Langfuse client and flush all pending data. 2294 2295 This method cleanly shuts down the Langfuse client, ensuring all pending data 2296 is flushed to the API and all background threads are properly terminated. 2297 2298 It's important to call this method when your application is shutting down to 2299 prevent data loss and resource leaks. For most applications, using the client 2300 as a context manager or relying on the automatic shutdown via atexit is sufficient. 2301 2302 Example: 2303 ```python 2304 # Initialize Langfuse 2305 langfuse = Langfuse(public_key="...", secret_key="...") 2306 2307 # Use Langfuse throughout your application 2308 # ... 2309 2310 # When application is shutting down 2311 langfuse.shutdown() 2312 ``` 2313 """ 2314 if self._resources is not None: 2315 self._resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
```python
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
```
2317 def get_current_trace_id(self) -> Optional[str]: 2318 """Get the trace ID of the current active span. 2319 2320 This method retrieves the trace ID from the currently active span in the context. 2321 It can be used to get the trace ID for referencing in logs, external systems, 2322 or for creating related operations. 2323 2324 Returns: 2325 The current trace ID as a 32-character lowercase hexadecimal string, 2326 or None if there is no active span. 2327 2328 Example: 2329 ```python 2330 with langfuse.start_as_current_span(name="process-request") as span: 2331 # Get the current trace ID for reference 2332 trace_id = langfuse.get_current_trace_id() 2333 2334 # Use it for external correlation 2335 log.info(f"Processing request with trace_id: {trace_id}") 2336 2337 # Or pass to another system 2338 external_system.process(data, trace_id=trace_id) 2339 ``` 2340 """ 2341 if not self._tracing_enabled: 2342 langfuse_logger.debug( 2343 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2344 ) 2345 return None 2346 2347 current_otel_span = self._get_current_otel_span() 2348 2349 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
```python
with langfuse.start_as_current_span(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
```
2351 def get_current_observation_id(self) -> Optional[str]: 2352 """Get the observation ID (span ID) of the current active span. 2353 2354 This method retrieves the observation ID from the currently active span in the context. 2355 It can be used to get the observation ID for referencing in logs, external systems, 2356 or for creating scores or other related operations. 2357 2358 Returns: 2359 The current observation ID as a 16-character lowercase hexadecimal string, 2360 or None if there is no active span. 2361 2362 Example: 2363 ```python 2364 with langfuse.start_as_current_span(name="process-user-query") as span: 2365 # Get the current observation ID 2366 observation_id = langfuse.get_current_observation_id() 2367 2368 # Store it for later reference 2369 cache.set(f"query_{query_id}_observation", observation_id) 2370 2371 # Process the query... 2372 ``` 2373 """ 2374 if not self._tracing_enabled: 2375 langfuse_logger.debug( 2376 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2377 ) 2378 return None 2379 2380 current_otel_span = self._get_current_otel_span() 2381 2382 return self._get_otel_span_id(current_otel_span) if current_otel_span else None
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
```python
with langfuse.start_as_current_span(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
```
2395 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2396 """Get the URL to view a trace in the Langfuse UI. 2397 2398 This method generates a URL that links directly to a trace in the Langfuse UI. 2399 It's useful for providing links in logs, notifications, or debugging tools. 2400 2401 Args: 2402 trace_id: Optional trace ID to generate a URL for. If not provided, 2403 the trace ID of the current active span will be used. 2404 2405 Returns: 2406 A URL string pointing to the trace in the Langfuse UI, 2407 or None if the project ID couldn't be retrieved or no trace ID is available. 2408 2409 Example: 2410 ```python 2411 # Get URL for the current trace 2412 with langfuse.start_as_current_span(name="process-request") as span: 2413 trace_url = langfuse.get_trace_url() 2414 log.info(f"Processing trace: {trace_url}") 2415 2416 # Get URL for a specific trace 2417 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2418 send_notification(f"Review needed for trace: {specific_trace_url}") 2419 ``` 2420 """ 2421 project_id = self._get_project_id() 2422 final_trace_id = trace_id or self.get_current_trace_id() 2423 2424 return ( 2425 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2426 if project_id and final_trace_id 2427 else None 2428 )
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
```python
# Get URL for the current trace
with langfuse.start_as_current_span(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
```
2430 def get_dataset( 2431 self, name: str, *, fetch_items_page_size: Optional[int] = 50 2432 ) -> "DatasetClient": 2433 """Fetch a dataset by its name. 2434 2435 Args: 2436 name (str): The name of the dataset to fetch. 2437 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2438 2439 Returns: 2440 DatasetClient: The dataset with the given name. 2441 """ 2442 try: 2443 langfuse_logger.debug(f"Getting datasets {name}") 2444 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2445 2446 dataset_items = [] 2447 page = 1 2448 2449 while True: 2450 new_items = self.api.dataset_items.list( 2451 dataset_name=self._url_encode(name, is_url_param=True), 2452 page=page, 2453 limit=fetch_items_page_size, 2454 ) 2455 dataset_items.extend(new_items.data) 2456 2457 if new_items.meta.total_pages <= page: 2458 break 2459 2460 page += 1 2461 2462 items = [DatasetItemClient(i, langfuse=self) for i in dataset_items] 2463 2464 return DatasetClient(dataset, items=items) 2465 2466 except Error as e: 2467 handle_fern_exception(e) 2468 raise e
Fetch a dataset by its name.
Arguments:
- name (str): The name of the dataset to fetch.
- fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
Returns:
DatasetClient: The dataset with the given name.
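Example (illustrative sketch; the dataset name is a placeholder, and the loop assumes the fetched items expose input and expected_output fields as referenced elsewhere in this documentation):
```python
from langfuse import get_client

langfuse = get_client()

# Fetch the dataset and its items in pages of 100
dataset = langfuse.get_dataset("my-eval-dataset", fetch_items_page_size=100)

for item in dataset.items:
    print(item.input, item.expected_output)
```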
2470 def get_dataset_run( 2471 self, *, dataset_name: str, run_name: str 2472 ) -> DatasetRunWithItems: 2473 """Fetch a dataset run by dataset name and run name. 2474 2475 Args: 2476 dataset_name (str): The name of the dataset. 2477 run_name (str): The name of the run. 2478 2479 Returns: 2480 DatasetRunWithItems: The dataset run with its items. 2481 """ 2482 try: 2483 return self.api.datasets.get_run( 2484 dataset_name=self._url_encode(dataset_name), 2485 run_name=self._url_encode(run_name), 2486 request_options=None, 2487 ) 2488 except Error as e: 2489 handle_fern_exception(e) 2490 raise e
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
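Example (illustrative sketch; dataset and run names are placeholders):
```python
run = langfuse.get_dataset_run(
    dataset_name="my-eval-dataset",
    run_name="baseline-2024-06",
)
print(run.name)
```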
2492 def get_dataset_runs( 2493 self, 2494 *, 2495 dataset_name: str, 2496 page: Optional[int] = None, 2497 limit: Optional[int] = None, 2498 ) -> PaginatedDatasetRuns: 2499 """Fetch all runs for a dataset. 2500 2501 Args: 2502 dataset_name (str): The name of the dataset. 2503 page (Optional[int]): Page number, starts at 1. 2504 limit (Optional[int]): Limit of items per page. 2505 2506 Returns: 2507 PaginatedDatasetRuns: Paginated list of dataset runs. 2508 """ 2509 try: 2510 return self.api.datasets.get_runs( 2511 dataset_name=self._url_encode(dataset_name), 2512 page=page, 2513 limit=limit, 2514 request_options=None, 2515 ) 2516 except Error as e: 2517 handle_fern_exception(e) 2518 raise e
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
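Example (illustrative sketch; the dataset name is a placeholder and the loop assumes the paginated response exposes the current page under `data`, mirroring the pagination handling shown in get_dataset above):
```python
runs = langfuse.get_dataset_runs(dataset_name="my-eval-dataset", page=1, limit=20)

for run in runs.data:
    print(run.name)
```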
2520 def delete_dataset_run( 2521 self, *, dataset_name: str, run_name: str 2522 ) -> DeleteDatasetRunResponse: 2523 """Delete a dataset run and all its run items. This action is irreversible. 2524 2525 Args: 2526 dataset_name (str): The name of the dataset. 2527 run_name (str): The name of the run. 2528 2529 Returns: 2530 DeleteDatasetRunResponse: Confirmation of deletion. 2531 """ 2532 try: 2533 return self.api.datasets.delete_run( 2534 dataset_name=self._url_encode(dataset_name), 2535 run_name=self._url_encode(run_name), 2536 request_options=None, 2537 ) 2538 except Error as e: 2539 handle_fern_exception(e) 2540 raise e
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
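Example (illustrative sketch; dataset and run names are placeholders, and the deletion is irreversible):
```python
langfuse.delete_dataset_run(
    dataset_name="my-eval-dataset",
    run_name="baseline-2024-06",
)
```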
2542 def run_experiment( 2543 self, 2544 *, 2545 name: str, 2546 run_name: Optional[str] = None, 2547 description: Optional[str] = None, 2548 data: ExperimentData, 2549 task: TaskFunction, 2550 evaluators: List[EvaluatorFunction] = [], 2551 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2552 run_evaluators: List[RunEvaluatorFunction] = [], 2553 max_concurrency: int = 50, 2554 metadata: Optional[Dict[str, str]] = None, 2555 ) -> ExperimentResult: 2556 """Run an experiment on a dataset with automatic tracing and evaluation. 2557 2558 This method executes a task function on each item in the provided dataset, 2559 automatically traces all executions with Langfuse for observability, runs 2560 item-level and run-level evaluators on the outputs, and returns comprehensive 2561 results with evaluation metrics. 2562 2563 The experiment system provides: 2564 - Automatic tracing of all task executions 2565 - Concurrent processing with configurable limits 2566 - Comprehensive error handling that isolates failures 2567 - Integration with Langfuse datasets for experiment tracking 2568 - Flexible evaluation framework supporting both sync and async evaluators 2569 2570 Args: 2571 name: Human-readable name for the experiment. Used for identification 2572 in the Langfuse UI. 2573 run_name: Optional exact name for the experiment run. If provided, this will be 2574 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2575 If not provided, this will default to the experiment name appended with an ISO timestamp. 2576 description: Optional description explaining the experiment's purpose, 2577 methodology, or expected outcomes. 2578 data: Array of data items to process. Can be either: 2579 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2580 - List of Langfuse DatasetItem objects from dataset.items 2581 task: Function that processes each data item and returns output. 2582 Must accept 'item' as keyword argument and can return sync or async results. 2583 The task function signature should be: task(*, item, **kwargs) -> Any 2584 evaluators: List of functions to evaluate each item's output individually. 2585 Each evaluator receives input, output, expected_output, and metadata. 2586 Can return single Evaluation dict or list of Evaluation dicts. 2587 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2588 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2589 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2590 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2591 run_evaluators: List of functions to evaluate the entire experiment run. 2592 Each run evaluator receives all item_results and can compute aggregate metrics. 2593 Useful for calculating averages, distributions, or cross-item comparisons. 2594 max_concurrency: Maximum number of concurrent task executions (default: 50). 2595 Controls the number of items processed simultaneously. Adjust based on 2596 API rate limits and system resources. 2597 metadata: Optional metadata dictionary to attach to all experiment traces. 2598 This metadata will be included in every trace created during the experiment. 2599 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2600 2601 Returns: 2602 ExperimentResult containing: 2603 - run_name: The experiment run name. 
This is equal to the dataset run name if experiment was on Langfuse dataset. 2604 - item_results: List of results for each processed item with outputs and evaluations 2605 - run_evaluations: List of aggregate evaluation results for the entire run 2606 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2607 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2608 2609 Raises: 2610 ValueError: If required parameters are missing or invalid 2611 Exception: If experiment setup fails (individual item failures are handled gracefully) 2612 2613 Examples: 2614 Basic experiment with local data: 2615 ```python 2616 def summarize_text(*, item, **kwargs): 2617 return f"Summary: {item['input'][:50]}..." 2618 2619 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2620 return { 2621 "name": "output_length", 2622 "value": len(output), 2623 "comment": f"Output contains {len(output)} characters" 2624 } 2625 2626 result = langfuse.run_experiment( 2627 name="Text Summarization Test", 2628 description="Evaluate summarization quality and length", 2629 data=[ 2630 {"input": "Long article text...", "expected_output": "Expected summary"}, 2631 {"input": "Another article...", "expected_output": "Another summary"} 2632 ], 2633 task=summarize_text, 2634 evaluators=[length_evaluator] 2635 ) 2636 2637 print(f"Processed {len(result.item_results)} items") 2638 for item_result in result.item_results: 2639 print(f"Input: {item_result.item['input']}") 2640 print(f"Output: {item_result.output}") 2641 print(f"Evaluations: {item_result.evaluations}") 2642 ``` 2643 2644 Advanced experiment with async task and multiple evaluators: 2645 ```python 2646 async def llm_task(*, item, **kwargs): 2647 # Simulate async LLM call 2648 response = await openai_client.chat.completions.create( 2649 model="gpt-4", 2650 messages=[{"role": "user", "content": item["input"]}] 2651 ) 2652 return response.choices[0].message.content 2653 2654 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2655 if expected_output and expected_output.lower() in output.lower(): 2656 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2657 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2658 2659 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2660 # Simulate toxicity check 2661 toxicity_score = check_toxicity(output) # Your toxicity checker 2662 return { 2663 "name": "toxicity", 2664 "value": toxicity_score, 2665 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2666 } 2667 2668 def average_accuracy(*, item_results, **kwargs): 2669 accuracies = [ 2670 eval.value for result in item_results 2671 for eval in result.evaluations 2672 if eval.name == "accuracy" 2673 ] 2674 return { 2675 "name": "average_accuracy", 2676 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2677 "comment": f"Average accuracy across {len(accuracies)} items" 2678 } 2679 2680 result = langfuse.run_experiment( 2681 name="LLM Safety and Accuracy Test", 2682 description="Evaluate model accuracy and safety across diverse prompts", 2683 data=test_dataset, # Your dataset items 2684 task=llm_task, 2685 evaluators=[accuracy_evaluator, toxicity_evaluator], 2686 run_evaluators=[average_accuracy], 2687 max_concurrency=5, # Limit concurrent API calls 2688 metadata={"model": "gpt-4", "temperature": 0.7} 2689 ) 2690 ``` 2691 2692 Using with Langfuse datasets: 2693 ```python 2694 # Get dataset from Langfuse 2695 dataset 
= langfuse.get_dataset("my-eval-dataset") 2696 2697 result = dataset.run_experiment( 2698 name="Production Model Evaluation", 2699 description="Monthly evaluation of production model performance", 2700 task=my_production_task, 2701 evaluators=[accuracy_evaluator, latency_evaluator] 2702 ) 2703 2704 # Results automatically linked to dataset in Langfuse UI 2705 print(f"View results: {result['dataset_run_url']}") 2706 ``` 2707 2708 Note: 2709 - Task and evaluator functions can be either synchronous or asynchronous 2710 - Individual item failures are logged but don't stop the experiment 2711 - All executions are automatically traced and visible in Langfuse UI 2712 - When using Langfuse datasets, results are automatically linked for easy comparison 2713 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 2714 - Async execution is handled automatically with smart event loop detection 2715 """ 2716 return cast( 2717 ExperimentResult, 2718 run_async_safely( 2719 self._run_experiment_async( 2720 name=name, 2721 run_name=self._create_experiment_run_name( 2722 name=name, run_name=run_name 2723 ), 2724 description=description, 2725 data=data, 2726 task=task, 2727 evaluators=evaluators or [], 2728 composite_evaluator=composite_evaluator, 2729 run_evaluators=run_evaluators or [], 2730 max_concurrency=max_concurrency, 2731 metadata=metadata, 2732 ), 2733 ), 2734 )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if `data` contains Langfuse dataset items. If not provided, this defaults to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If `data` contains Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
```python
def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")
```
Advanced experiment with async task and multiple evaluators:
```python
async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)
```
Using with Langfuse datasets:
```python
# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
```
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
3078 def run_batched_evaluation( 3079 self, 3080 *, 3081 scope: Literal["traces", "observations"], 3082 mapper: MapperFunction, 3083 filter: Optional[str] = None, 3084 fetch_batch_size: int = 50, 3085 max_items: Optional[int] = None, 3086 max_retries: int = 3, 3087 evaluators: List[EvaluatorFunction], 3088 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 3089 max_concurrency: int = 50, 3090 metadata: Optional[Dict[str, Any]] = None, 3091 resume_from: Optional[BatchEvaluationResumeToken] = None, 3092 verbose: bool = False, 3093 ) -> BatchEvaluationResult: 3094 """Fetch traces or observations and run evaluations on each item. 3095 3096 This method provides a powerful way to evaluate existing data in Langfuse at scale. 3097 It fetches items based on filters, transforms them using a mapper function, runs 3098 evaluators on each item, and creates scores that are linked back to the original 3099 entities. This is ideal for: 3100 3101 - Running evaluations on production traces after deployment 3102 - Backtesting new evaluation metrics on historical data 3103 - Batch scoring of observations for quality monitoring 3104 - Periodic evaluation runs on recent data 3105 3106 The method uses a streaming/pipeline approach to process items in batches, making 3107 it memory-efficient for large datasets. It includes comprehensive error handling, 3108 retry logic, and resume capability for long-running evaluations. 3109 3110 Args: 3111 scope: The type of items to evaluate. Must be one of: 3112 - "traces": Evaluate complete traces with all their observations 3113 - "observations": Evaluate individual observations (spans, generations, events) 3114 mapper: Function that transforms API response objects into evaluator inputs. 3115 Receives a trace/observation object and returns an EvaluatorInputs 3116 instance with input, output, expected_output, and metadata fields. 3117 Can be sync or async. 3118 evaluators: List of evaluation functions to run on each item. Each evaluator 3119 receives the mapped inputs and returns Evaluation object(s). Evaluator 3120 failures are logged but don't stop the batch evaluation. 3121 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 3122 - '{"tags": ["production"]}' 3123 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 3124 Default: None (fetches all items). 3125 fetch_batch_size: Number of items to fetch per API call and hold in memory. 3126 Larger values may be faster but use more memory. Default: 50. 3127 max_items: Maximum total number of items to process. If None, processes all 3128 items matching the filter. Useful for testing or limiting evaluation runs. 3129 Default: None (process all). 3130 max_concurrency: Maximum number of items to evaluate concurrently. Controls 3131 parallelism and resource usage. Default: 50. 3132 composite_evaluator: Optional function that creates a composite score from 3133 item-level evaluations. Receives the original item and its evaluations, 3134 returns a single Evaluation. Useful for weighted averages or combined metrics. 3135 Default: None. 3136 metadata: Optional metadata dict to add to all created scores. Useful for 3137 tracking evaluation runs, versions, or other context. Default: None. 3138 max_retries: Maximum number of retry attempts for failed batch fetches. 3139 Uses exponential backoff (1s, 2s, 4s). Default: 3. 3140 verbose: If True, logs progress information to console. Useful for monitoring 3141 long-running evaluations. Default: False. 
3142 resume_from: Optional resume token from a previous incomplete run. Allows 3143 continuing evaluation after interruption or failure. Default: None. 3144 3145 3146 Returns: 3147 BatchEvaluationResult containing: 3148 - total_items_fetched: Number of items fetched from API 3149 - total_items_processed: Number of items successfully evaluated 3150 - total_items_failed: Number of items that failed evaluation 3151 - total_scores_created: Scores created by item-level evaluators 3152 - total_composite_scores_created: Scores created by composite evaluator 3153 - total_evaluations_failed: Individual evaluator failures 3154 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 3155 - resume_token: Token for resuming if incomplete (None if completed) 3156 - completed: True if all items processed 3157 - duration_seconds: Total execution time 3158 - failed_item_ids: IDs of items that failed 3159 - error_summary: Error types and counts 3160 - has_more_items: True if max_items reached but more exist 3161 3162 Raises: 3163 ValueError: If invalid scope is provided. 3164 3165 Examples: 3166 Basic trace evaluation: 3167 ```python 3168 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3169 3170 client = Langfuse() 3171 3172 # Define mapper to extract fields from traces 3173 def trace_mapper(trace): 3174 return EvaluatorInputs( 3175 input=trace.input, 3176 output=trace.output, 3177 expected_output=None, 3178 metadata={"trace_id": trace.id} 3179 ) 3180 3181 # Define evaluator 3182 def length_evaluator(*, input, output, expected_output, metadata): 3183 return Evaluation( 3184 name="output_length", 3185 value=len(output) if output else 0 3186 ) 3187 3188 # Run batch evaluation 3189 result = client.run_batched_evaluation( 3190 scope="traces", 3191 mapper=trace_mapper, 3192 evaluators=[length_evaluator], 3193 filter='{"tags": ["production"]}', 3194 max_items=1000, 3195 verbose=True 3196 ) 3197 3198 print(f"Processed {result.total_items_processed} traces") 3199 print(f"Created {result.total_scores_created} scores") 3200 ``` 3201 3202 Evaluation with composite scorer: 3203 ```python 3204 def accuracy_evaluator(*, input, output, expected_output, metadata): 3205 # ... evaluation logic 3206 return Evaluation(name="accuracy", value=0.85) 3207 3208 def relevance_evaluator(*, input, output, expected_output, metadata): 3209 # ... 
evaluation logic 3210 return Evaluation(name="relevance", value=0.92) 3211 3212 def composite_evaluator(*, item, evaluations): 3213 # Weighted average of evaluations 3214 weights = {"accuracy": 0.6, "relevance": 0.4} 3215 total = sum( 3216 e.value * weights.get(e.name, 0) 3217 for e in evaluations 3218 if isinstance(e.value, (int, float)) 3219 ) 3220 return Evaluation( 3221 name="composite_score", 3222 value=total, 3223 comment=f"Weighted average of {len(evaluations)} metrics" 3224 ) 3225 3226 result = client.run_batched_evaluation( 3227 scope="traces", 3228 mapper=trace_mapper, 3229 evaluators=[accuracy_evaluator, relevance_evaluator], 3230 composite_evaluator=composite_evaluator, 3231 filter='{"user_id": "important_user"}', 3232 verbose=True 3233 ) 3234 ``` 3235 3236 Handling incomplete runs with resume: 3237 ```python 3238 # Initial run that may fail or timeout 3239 result = client.run_batched_evaluation( 3240 scope="observations", 3241 mapper=obs_mapper, 3242 evaluators=[my_evaluator], 3243 max_items=10000, 3244 verbose=True 3245 ) 3246 3247 # Check if incomplete 3248 if not result.completed and result.resume_token: 3249 print(f"Processed {result.resume_token.items_processed} items before interruption") 3250 3251 # Resume from where it left off 3252 result = client.run_batched_evaluation( 3253 scope="observations", 3254 mapper=obs_mapper, 3255 evaluators=[my_evaluator], 3256 resume_from=result.resume_token, 3257 verbose=True 3258 ) 3259 3260 print(f"Total items processed: {result.total_items_processed}") 3261 ``` 3262 3263 Monitoring evaluator performance: 3264 ```python 3265 result = client.run_batched_evaluation(...) 3266 3267 for stats in result.evaluator_stats: 3268 success_rate = stats.successful_runs / stats.total_runs 3269 print(f"{stats.name}:") 3270 print(f" Success rate: {success_rate:.1%}") 3271 print(f" Scores created: {stats.total_scores_created}") 3272 3273 if stats.failed_runs > 0: 3274 print(f" â ī¸ Failed {stats.failed_runs} times") 3275 ``` 3276 3277 Note: 3278 - Evaluator failures are logged but don't stop the batch evaluation 3279 - Individual item failures are tracked but don't stop processing 3280 - Fetch failures are retried with exponential backoff 3281 - All scores are automatically flushed to Langfuse at the end 3282 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3283 """ 3284 runner = BatchEvaluationRunner(self) 3285 3286 return cast( 3287 BatchEvaluationResult, 3288 run_async_safely( 3289 runner.run_async( 3290 scope=scope, 3291 mapper=mapper, 3292 evaluators=evaluators, 3293 filter=filter, 3294 fetch_batch_size=fetch_batch_size, 3295 max_items=max_items, 3296 max_concurrency=max_concurrency, 3297 composite_evaluator=composite_evaluator, 3298 metadata=metadata, 3299 max_retries=max_retries, 3300 verbose=verbose, 3301 resume_from=resume_from, 3302 ) 3303 ), 3304 )
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 50.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing:
- total_items_fetched: Number of items fetched from API
- total_items_processed: Number of items successfully evaluated
- total_items_failed: Number of items that failed evaluation
- total_scores_created: Scores created by item-level evaluators
- total_composite_scores_created: Scores created by composite evaluator
- total_evaluations_failed: Individual evaluator failures
- evaluator_stats: Per-evaluator statistics (success rate, scores created)
- resume_token: Token for resuming if incomplete (None if completed)
- completed: True if all items processed
- duration_seconds: Total execution time
- failed_item_ids: IDs of items that failed
- error_summary: Error types and counts
- has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
```python
from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")
```
Evaluation with composite scorer:
```python
def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)
```
Handling incomplete runs with resume:
```python
# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")
```
Monitoring evaluator performance:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️ Failed {stats.failed_runs} times")
```
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
3306 def auth_check(self) -> bool: 3307 """Check if the provided credentials (public and secret key) are valid. 3308 3309 Raises: 3310 Exception: If no projects were found for the provided credentials. 3311 3312 Note: 3313 This method is blocking. It is discouraged to use it in production code. 3314 """ 3315 try: 3316 projects = self.api.projects.get() 3317 langfuse_logger.debug( 3318 f"Auth check successful, found {len(projects.data)} projects" 3319 ) 3320 if len(projects.data) == 0: 3321 raise Exception( 3322 "Auth check failed, no project found for the keys provided." 3323 ) 3324 return True 3325 3326 except AttributeError as e: 3327 langfuse_logger.warning( 3328 f"Auth check failed: Client not properly initialized. Error: {e}" 3329 ) 3330 return False 3331 3332 except Error as e: 3333 handle_fern_exception(e) 3334 raise e
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. Using it in production code is discouraged.
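For illustration, a minimal sketch of using this check as a startup guard in a script or CI job (credentials are assumed to come from the standard environment variables):
```python
from langfuse import Langfuse

langfuse = Langfuse()  # reads LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY / LANGFUSE_BASE_URL from the environment

# Fail fast if the keys are missing or wrong; avoid calling this on a hot path since it blocks
if not langfuse.auth_check():
    raise RuntimeError("Langfuse credentials are invalid or not configured")
```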
3336 def create_dataset( 3337 self, 3338 *, 3339 name: str, 3340 description: Optional[str] = None, 3341 metadata: Optional[Any] = None, 3342 input_schema: Optional[Any] = None, 3343 expected_output_schema: Optional[Any] = None, 3344 ) -> Dataset: 3345 """Create a dataset with the given name on Langfuse. 3346 3347 Args: 3348 name: Name of the dataset to create. 3349 description: Description of the dataset. Defaults to None. 3350 metadata: Additional metadata. Defaults to None. 3351 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3352 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3353 3354 Returns: 3355 Dataset: The created dataset as returned by the Langfuse API. 3356 """ 3357 try: 3358 body = CreateDatasetRequest( 3359 name=name, 3360 description=description, 3361 metadata=metadata, 3362 inputSchema=input_schema, 3363 expectedOutputSchema=expected_output_schema, 3364 ) 3365 langfuse_logger.debug(f"Creating datasets {body}") 3366 3367 return self.api.datasets.create(request=body) 3368 3369 except Error as e: 3370 handle_fern_exception(e) 3371 raise e
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
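As a sketch, creating a dataset that validates item inputs against a JSON Schema (the dataset name and schema below are made up):
```python
from langfuse import Langfuse

langfuse = Langfuse()

dataset = langfuse.create_dataset(
    name="qa-eval",  # hypothetical dataset name
    description="Question answering evaluation set",
    metadata={"owner": "ml-team"},
    input_schema={
        "type": "object",
        "properties": {"question": {"type": "string"}},
        "required": ["question"],
    },
)
```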
3373 def create_dataset_item( 3374 self, 3375 *, 3376 dataset_name: str, 3377 input: Optional[Any] = None, 3378 expected_output: Optional[Any] = None, 3379 metadata: Optional[Any] = None, 3380 source_trace_id: Optional[str] = None, 3381 source_observation_id: Optional[str] = None, 3382 status: Optional[DatasetStatus] = None, 3383 id: Optional[str] = None, 3384 ) -> DatasetItem: 3385 """Create a dataset item. 3386 3387 Upserts if an item with id already exists. 3388 3389 Args: 3390 dataset_name: Name of the dataset in which the dataset item should be created. 3391 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3392 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3393 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3394 source_trace_id: Id of the source trace. Defaults to None. 3395 source_observation_id: Id of the source observation. Defaults to None. 3396 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3397 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3398 3399 Returns: 3400 DatasetItem: The created dataset item as returned by the Langfuse API. 3401 3402 Example: 3403 ```python 3404 from langfuse import Langfuse 3405 3406 langfuse = Langfuse() 3407 3408 # Uploading items to the Langfuse dataset named "capital_cities" 3409 langfuse.create_dataset_item( 3410 dataset_name="capital_cities", 3411 input={"input": {"country": "Italy"}}, 3412 expected_output={"expected_output": "Rome"}, 3413 metadata={"foo": "bar"} 3414 ) 3415 ``` 3416 """ 3417 try: 3418 body = CreateDatasetItemRequest( 3419 datasetName=dataset_name, 3420 input=input, 3421 expectedOutput=expected_output, 3422 metadata=metadata, 3423 sourceTraceId=source_trace_id, 3424 sourceObservationId=source_observation_id, 3425 status=status, 3426 id=id, 3427 ) 3428 langfuse_logger.debug(f"Creating dataset item {body}") 3429 return self.api.dataset_items.create(request=body) 3430 except Error as e: 3431 handle_fern_exception(e) 3432 raise e
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
```
3434 def resolve_media_references( 3435 self, 3436 *, 3437 obj: Any, 3438 resolve_with: Literal["base64_data_uri"], 3439 max_depth: int = 10, 3440 content_fetch_timeout_seconds: int = 5, 3441 ) -> Any: 3442 """Replace media reference strings in an object with base64 data URIs. 3443 3444 This method recursively traverses an object (up to max_depth) looking for media reference strings 3445 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3446 the provided Langfuse client and replaces the reference string with a base64 data URI. 3447 3448 If fetching media content fails for a reference string, a warning is logged and the reference 3449 string is left unchanged. 3450 3451 Args: 3452 obj: The object to process. Can be a primitive value, array, or nested object. 3453 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3454 resolve_with: The representation of the media content to replace the media reference string with. 3455 Currently only "base64_data_uri" is supported. 3456 max_depth: int: The maximum depth to traverse the object. Default is 10. 3457 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3458 3459 Returns: 3460 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3461 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3462 3463 Example: 3464 obj = { 3465 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3466 "nested": { 3467 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3468 } 3469 } 3470 3471 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3472 3473 # Result: 3474 # { 3475 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3476 # "nested": { 3477 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 3478 # } 3479 # } 3480 """ 3481 return LangfuseMedia.resolve_media_references( 3482 langfuse_client=self, 3483 obj=obj, 3484 resolve_with=resolve_with, 3485 max_depth=max_depth, 3486 content_fetch_timeout_seconds=content_fetch_timeout_seconds, 3487 )
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
3517 def get_prompt( 3518 self, 3519 name: str, 3520 *, 3521 version: Optional[int] = None, 3522 label: Optional[str] = None, 3523 type: Literal["chat", "text"] = "text", 3524 cache_ttl_seconds: Optional[int] = None, 3525 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None, 3526 max_retries: Optional[int] = None, 3527 fetch_timeout_seconds: Optional[int] = None, 3528 ) -> PromptClient: 3529 """Get a prompt. 3530 3531 This method attempts to fetch the requested prompt from the local cache. If the prompt is not found 3532 in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again 3533 and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will 3534 return the expired prompt as a fallback. 3535 3536 Args: 3537 name (str): The name of the prompt to retrieve. 3538 3539 Keyword Args: 3540 version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3541 label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3542 cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a 3543 keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. 3544 type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". 3545 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. 3546 max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. 3547 fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default. 3548 3549 Returns: 3550 The prompt object retrieved from the cache or directly fetched if not cached or expired of type 3551 - TextPromptClient, if type argument is 'text'. 3552 - ChatPromptClient, if type argument is 'chat'. 3553 3554 Raises: 3555 Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an 3556 expired prompt in the cache, in which case it logs a warning and returns the expired prompt. 3557 """ 3558 if self._resources is None: 3559 raise Error( 3560 "SDK is not correctly initialized. Check the init logs for more details." 3561 ) 3562 if version is not None and label is not None: 3563 raise ValueError("Cannot specify both version and label at the same time.") 3564 3565 if not name: 3566 raise ValueError("Prompt name cannot be empty.") 3567 3568 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3569 bounded_max_retries = self._get_bounded_max_retries( 3570 max_retries, default_max_retries=2, max_retries_upper_bound=4 3571 ) 3572 3573 langfuse_logger.debug(f"Getting prompt '{cache_key}'") 3574 cached_prompt = self._resources.prompt_cache.get(cache_key) 3575 3576 if cached_prompt is None or cache_ttl_seconds == 0: 3577 langfuse_logger.debug( 3578 f"Prompt '{cache_key}' not found in cache or caching disabled." 
3579 ) 3580 try: 3581 return self._fetch_prompt_and_update_cache( 3582 name, 3583 version=version, 3584 label=label, 3585 ttl_seconds=cache_ttl_seconds, 3586 max_retries=bounded_max_retries, 3587 fetch_timeout_seconds=fetch_timeout_seconds, 3588 ) 3589 except Exception as e: 3590 if fallback: 3591 langfuse_logger.warning( 3592 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3593 ) 3594 3595 fallback_client_args: Dict[str, Any] = { 3596 "name": name, 3597 "prompt": fallback, 3598 "type": type, 3599 "version": version or 0, 3600 "config": {}, 3601 "labels": [label] if label else [], 3602 "tags": [], 3603 } 3604 3605 if type == "text": 3606 return TextPromptClient( 3607 prompt=Prompt_Text(**fallback_client_args), 3608 is_fallback=True, 3609 ) 3610 3611 if type == "chat": 3612 return ChatPromptClient( 3613 prompt=Prompt_Chat(**fallback_client_args), 3614 is_fallback=True, 3615 ) 3616 3617 raise e 3618 3619 if cached_prompt.is_expired(): 3620 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3621 try: 3622 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3623 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3624 3625 def refresh_task() -> None: 3626 self._fetch_prompt_and_update_cache( 3627 name, 3628 version=version, 3629 label=label, 3630 ttl_seconds=cache_ttl_seconds, 3631 max_retries=bounded_max_retries, 3632 fetch_timeout_seconds=fetch_timeout_seconds, 3633 ) 3634 3635 self._resources.prompt_cache.add_refresh_prompt_task( 3636 cache_key, 3637 refresh_task, 3638 ) 3639 langfuse_logger.debug( 3640 f"Returning stale prompt '{cache_key}' from cache." 3641 ) 3642 # return stale prompt 3643 return cached_prompt.value 3644 3645 except Exception as e: 3646 langfuse_logger.warning( 3647 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3648 ) 3649 # creation of refresh prompt task failed, return stale prompt 3650 return cached_prompt.value 3651 3652 return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
- version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
- type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
- fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
- max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
- fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the SDK-wide timeout, which is 5 seconds by default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
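A minimal sketch of fetching a text prompt with a fallback template (the prompt name and variable are hypothetical; `compile` fills the double-curly-brace variables mentioned above):
```python
from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.get_prompt(
    "movie-critic",  # hypothetical prompt name
    type="text",
    cache_ttl_seconds=300,
    fallback="Write a short review of {{movie}}.",  # used if the fetch fails and nothing is cached
)

text = prompt.compile(movie="Dune")  # substitutes the {{movie}} variable
```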
3754 def create_prompt( 3755 self, 3756 *, 3757 name: str, 3758 prompt: Union[ 3759 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3760 ], 3761 labels: List[str] = [], 3762 tags: Optional[List[str]] = None, 3763 type: Optional[Literal["chat", "text"]] = "text", 3764 config: Optional[Any] = None, 3765 commit_message: Optional[str] = None, 3766 ) -> PromptClient: 3767 """Create a new prompt in Langfuse. 3768 3769 Keyword Args: 3770 name : The name of the prompt to be created. 3771 prompt : The content of the prompt to be created. 3772 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3773 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3774 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3775 config: Additional structured data to be saved with the prompt. Defaults to None. 3776 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3777 commit_message: Optional string describing the change. 3778 3779 Returns: 3780 TextPromptClient: The prompt if type argument is 'text'. 3781 ChatPromptClient: The prompt if type argument is 'chat'. 3782 """ 3783 try: 3784 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3785 3786 if type == "chat": 3787 if not isinstance(prompt, list): 3788 raise ValueError( 3789 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 3790 ) 3791 request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = ( 3792 CreatePromptRequest_Chat( 3793 name=name, 3794 prompt=cast(Any, prompt), 3795 labels=labels, 3796 tags=tags, 3797 config=config or {}, 3798 commitMessage=commit_message, 3799 type="chat", 3800 ) 3801 ) 3802 server_prompt = self.api.prompts.create(request=request) 3803 3804 if self._resources is not None: 3805 self._resources.prompt_cache.invalidate(name) 3806 3807 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3808 3809 if not isinstance(prompt, str): 3810 raise ValueError("For 'text' type, 'prompt' must be a string.") 3811 3812 request = CreatePromptRequest_Text( 3813 name=name, 3814 prompt=prompt, 3815 labels=labels, 3816 tags=tags, 3817 config=config or {}, 3818 commitMessage=commit_message, 3819 type="text", 3820 ) 3821 3822 server_prompt = self.api.prompts.create(request=request) 3823 3824 if self._resources is not None: 3825 self._resources.prompt_cache.invalidate(name) 3826 3827 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3828 3829 except Error as e: 3830 handle_fern_exception(e) 3831 raise e
Create a new prompt in Langfuse.
Keyword Args:
- name: The name of the prompt to be created.
- prompt: The content of the prompt to be created.
- is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
- labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
- tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
- config: Additional structured data to be saved with the prompt. Defaults to None.
- type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
- commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
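A short sketch of creating a chat prompt that is served by default via the 'production' label (names, message content, and config values are illustrative):
```python
from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.create_prompt(
    name="movie-critic-chat",  # hypothetical prompt name
    type="chat",
    prompt=[
        {"role": "system", "content": "You are a thoughtful movie critic."},
        {"role": "user", "content": "Review {{movie}} in two sentences."},
    ],
    labels=["production"],  # default-served version
    config={"model": "gpt-4", "temperature": 0.7},
    commit_message="Initial version",
)
```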
3833 def update_prompt( 3834 self, 3835 *, 3836 name: str, 3837 version: int, 3838 new_labels: List[str] = [], 3839 ) -> Any: 3840 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 3841 3842 Args: 3843 name (str): The name of the prompt to update. 3844 version (int): The version number of the prompt to update. 3845 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 3846 3847 Returns: 3848 Prompt: The updated prompt from the Langfuse API. 3849 3850 """ 3851 updated_prompt = self.api.prompt_version.update( 3852 name=self._url_encode(name), 3853 version=version, 3854 new_labels=new_labels, 3855 ) 3856 3857 if self._resources is not None: 3858 self._resources.prompt_cache.invalidate(name) 3859 3860 return updated_prompt
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
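For example, a sketch of promoting an existing version to production, reusing a `langfuse` client instance (prompt name and version number are made up):
```python
# Assign the 'production' label to version 3 of a hypothetical prompt;
# the SDK prompt cache for this name is invalidated afterwards
updated = langfuse.update_prompt(
    name="movie-critic-chat",
    version=3,
    new_labels=["production"],
)
```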
3875 def clear_prompt_cache(self) -> None: 3876 """Clear the entire prompt cache, removing all cached prompts. 3877 3878 This method is useful when you want to force a complete refresh of all 3879 cached prompts, for example after major updates or when you need to 3880 ensure the latest versions are fetched from the server. 3881 """ 3882 if self._resources is not None: 3883 self._resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
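For instance, a sketch of forcing fresh fetches after prompts were edited outside the current process (the prompt name is hypothetical):
```python
langfuse.clear_prompt_cache()

# The next get_prompt call fetches from the server again instead of the cache
prompt = langfuse.get_prompt("movie-critic")
```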
61def get_client(*, public_key: Optional[str] = None) -> Langfuse: 62 """Get or create a Langfuse client instance. 63 64 Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, 65 providing a public_key is required. Multi-project support is experimental - see Langfuse docs. 66 67 Behavior: 68 - Single project: Returns existing client or creates new one 69 - Multi-project: Requires public_key to return specific client 70 - No public_key in multi-project: Returns disabled client to prevent data leakage 71 72 The function uses a singleton pattern per public_key to conserve resources and maintain state. 73 74 Args: 75 public_key (Optional[str]): Project identifier 76 - With key: Returns client for that project 77 - Without key: Returns single client or disabled client if multiple exist 78 79 Returns: 80 Langfuse: Client instance in one of three states: 81 1. Client for specified public_key 82 2. Default client for single-project setup 83 3. Disabled client when multiple projects exist without key 84 85 Security: 86 Disables tracing when multiple projects exist without explicit key to prevent 87 cross-project data leakage. Multi-project setups are experimental. 88 89 Example: 90 ```python 91 # Single project 92 client = get_client() # Default client 93 94 # In multi-project usage: 95 client_a = get_client(public_key="project_a_key") # Returns project A's client 96 client_b = get_client(public_key="project_b_key") # Returns project B's client 97 98 # Without specific key in multi-project setup: 99 client = get_client() # Returns disabled client for safety 100 ``` 101 """ 102 with LangfuseResourceManager._lock: 103 active_instances = LangfuseResourceManager._instances 104 105 # If no explicit public_key provided, check execution context 106 if not public_key: 107 public_key = _current_public_key.get(None) 108 109 if not public_key: 110 if len(active_instances) == 0: 111 # No clients initialized yet, create default instance 112 return Langfuse() 113 114 if len(active_instances) == 1: 115 # Only one client exists, safe to use without specifying key 116 instance = list(active_instances.values())[0] 117 118 # Initialize with the credentials bound to the instance 119 # This is important if the original instance was instantiated 120 # via constructor arguments 121 return _create_client_from_instance(instance) 122 123 else: 124 # Multiple clients exist but no key specified - disable tracing 125 # to prevent cross-project data leakage 126 langfuse_logger.warning( 127 "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage." 128 ) 129 return Langfuse( 130 tracing_enabled=False, public_key="fake", secret_key="fake" 131 ) 132 133 else: 134 # Specific key provided, look up existing instance 135 target_instance: Optional[LangfuseResourceManager] = active_instances.get( 136 public_key, None 137 ) 138 139 if target_instance is None: 140 # No instance found with this key - client not initialized properly 141 langfuse_logger.warning( 142 f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function." 143 ) 144 return Langfuse( 145 tracing_enabled=False, public_key="fake", secret_key="fake" 146 ) 147 148 # target_instance is guaranteed to be not None at this point 149 return _create_client_from_instance(target_instance, public_key)
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states:
1. Client for specified public_key
2. Default client for single-project setup
3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
```python
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
```
90 def observe( 91 self, 92 func: Optional[F] = None, 93 *, 94 name: Optional[str] = None, 95 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 96 capture_input: Optional[bool] = None, 97 capture_output: Optional[bool] = None, 98 transform_to_string: Optional[Callable[[Iterable], str]] = None, 99 ) -> Union[F, Callable[[F], F]]: 100 """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions. 101 102 This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates 103 spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator 104 intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints. 105 106 Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, 107 enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details. 108 109 Args: 110 func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None. 111 name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used. 112 as_type (Optional[Literal]): Set the observation type. Supported values: 113 "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". 114 Observation types are highlighted in the Langfuse UI for filtering and visualization. 115 The types "generation" and "embedding" create a span on which additional attributes such as model metrics 116 can be set. 117 118 Returns: 119 Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans. 120 121 Example: 122 For general function tracing with automatic naming: 123 ```python 124 @observe() 125 def process_user_request(user_id, query): 126 # Function is automatically traced with name "process_user_request" 127 return get_response(query) 128 ``` 129 130 For language model generation tracking: 131 ```python 132 @observe(name="answer-generation", as_type="generation") 133 async def generate_answer(query): 134 # Creates a generation-type span with extended LLM metrics 135 response = await openai.chat.completions.create( 136 model="gpt-4", 137 messages=[{"role": "user", "content": query}] 138 ) 139 return response.choices[0].message.content 140 ``` 141 142 For trace context propagation between functions: 143 ```python 144 @observe() 145 def main_process(): 146 # Parent span is created 147 return sub_process() # Child span automatically connected to parent 148 149 @observe() 150 def sub_process(): 151 # Automatically becomes a child span of main_process 152 return "result" 153 ``` 154 155 Raises: 156 Exception: Propagates any exceptions from the wrapped function after logging them in the trace. 157 158 Notes: 159 - The decorator preserves the original function's signature, docstring, and return type. 160 - Proper parent-child relationships between spans are automatically maintained. 
161 - Special keyword arguments can be passed to control tracing: 162 - langfuse_trace_id: Explicitly set the trace ID for this function call 163 - langfuse_parent_observation_id: Explicitly set the parent span ID 164 - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist) 165 - For async functions, the decorator returns an async function wrapper. 166 - For sync functions, the decorator returns a synchronous wrapper. 167 """ 168 valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent)) 169 if as_type is not None and as_type not in valid_types: 170 self._log.warning( 171 f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'." 172 ) 173 as_type = "span" 174 175 function_io_capture_enabled = os.environ.get( 176 LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True" 177 ).lower() not in ("false", "0") 178 179 should_capture_input = ( 180 capture_input if capture_input is not None else function_io_capture_enabled 181 ) 182 183 should_capture_output = ( 184 capture_output 185 if capture_output is not None 186 else function_io_capture_enabled 187 ) 188 189 def decorator(func: F) -> F: 190 return ( 191 self._async_observe( 192 func, 193 name=name, 194 as_type=as_type, 195 capture_input=should_capture_input, 196 capture_output=should_capture_output, 197 transform_to_string=transform_to_string, 198 ) 199 if asyncio.iscoroutinefunction(func) 200 else self._sync_observe( 201 func, 202 name=name, 203 as_type=as_type, 204 capture_input=should_capture_input, 205 capture_output=should_capture_output, 206 transform_to_string=transform_to_string, 207 ) 208 ) 209 210 """Handle decorator with or without parentheses. 211 212 This logic enables the decorator to work both with and without parentheses: 213 - @observe - Python passes the function directly to the decorator 214 - @observe() - Python calls the decorator first, which must return a function decorator 215 216 When called without arguments (@observe), the func parameter contains the function to decorate, 217 so we directly apply the decorator to it. When called with parentheses (@observe()), 218 func is None, so we return the decorator function itself for Python to apply in the next step. 219 """ 220 if func is None: 221 return decorator 222 else: 223 return decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
```python
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)
```
For language model generation tracking:
@observe(name="answer-generation", as_type="generation") async def generate_answer(query): # Creates a generation-type span with extended LLM metrics response = await openai.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": query}] ) return response.choices[0].message.contentFor trace context propagation between functions:
```python
@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
```
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing (see the sketch after this list):
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
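A minimal sketch of the special keyword arguments described above (the trace ID is a made-up 32-character hex value; in practice you would reuse an ID produced elsewhere in your system):
```python
from langfuse import observe

@observe()
def handle_request(query, **kwargs):
    return query.upper()

# The decorator reads langfuse_trace_id and attaches this call to that trace
handle_request(
    "hello",
    langfuse_trace_id="3b2e1f6c8a9d4e0fb7c6d5a4e3f2b1a0",  # hypothetical, pre-computed trace ID
)
```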
76def propagate_attributes( 77 *, 78 user_id: Optional[str] = None, 79 session_id: Optional[str] = None, 80 metadata: Optional[Dict[str, str]] = None, 81 version: Optional[str] = None, 82 tags: Optional[List[str]] = None, 83 trace_name: Optional[str] = None, 84 as_baggage: bool = False, 85) -> _AgnosticContextManager[Any]: 86 """Propagate trace-level attributes to all spans created within this context. 87 88 This context manager sets attributes on the currently active span AND automatically 89 propagates them to all new child spans created within the context. This is the 90 recommended way to set trace-level attributes like user_id, session_id, and metadata 91 dimensions that should be consistently applied across all observations in a trace. 92 93 **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the 94 currently active span and spans created after entering this context will have these 95 attributes. Pre-existing spans will NOT be retroactively updated. 96 97 **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id, 98 filtering by session_id) only include observations that have the attribute set. 99 If you call `propagate_attributes` late in your workflow, earlier spans won't be 100 included in aggregations for that attribute. 101 102 Args: 103 user_id: User identifier to associate with all spans in this context. 104 Must be US-ASCII string, â¤200 characters. Use this to track which user 105 generated each trace and enable e.g. per-user cost/performance analysis. 106 session_id: Session identifier to associate with all spans in this context. 107 Must be US-ASCII string, â¤200 characters. Use this to group related traces 108 within a user session (e.g., a conversation thread, multi-turn interaction). 109 metadata: Additional key-value metadata to propagate to all spans. 110 - Keys and values must be US-ASCII strings 111 - All values must be â¤200 characters 112 - Use for dimensions like internal correlating identifiers 113 - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning) 114 version: Version identfier for parts of your application that are independently versioned, e.g. agents 115 tags: List of tags to categorize the group of observations 116 trace_name: Name to assign to the trace. Must be US-ASCII string, â¤200 characters. 117 Use this to set a consistent trace name for all spans created within this context. 118 as_baggage: If True, propagates attributes using OpenTelemetry baggage for 119 cross-process/service propagation. **Security warning**: When enabled, 120 attribute values are added to HTTP headers on ALL outbound requests. 121 Only enable if values are safe to transmit via HTTP headers and you need 122 cross-service tracing. Default: False. 123 124 Returns: 125 Context manager that propagates attributes to all child spans. 126 127 Example: 128 Basic usage with user and session tracking: 129 130 ```python 131 from langfuse import Langfuse 132 133 langfuse = Langfuse() 134 135 # Set attributes early in the trace 136 with langfuse.start_as_current_span(name="user_workflow") as span: 137 with langfuse.propagate_attributes( 138 user_id="user_123", 139 session_id="session_abc", 140 metadata={"experiment": "variant_a", "environment": "production"} 141 ): 142 # All spans created here will have user_id, session_id, and metadata 143 with langfuse.start_span(name="llm_call") as llm_span: 144 # This span inherits: user_id, session_id, experiment, environment 145 ... 
146 147 with langfuse.start_generation(name="completion") as gen: 148 # This span also inherits all attributes 149 ... 150 ``` 151 152 Late propagation (anti-pattern): 153 154 ```python 155 with langfuse.start_as_current_span(name="workflow") as span: 156 # These spans WON'T have user_id 157 early_span = langfuse.start_span(name="early_work") 158 early_span.end() 159 160 # Set attributes in the middle 161 with langfuse.propagate_attributes(user_id="user_123"): 162 # Only spans created AFTER this point will have user_id 163 late_span = langfuse.start_span(name="late_work") 164 late_span.end() 165 166 # Result: Aggregations by user_id will miss "early_work" span 167 ``` 168 169 Cross-service propagation with baggage (advanced): 170 171 ```python 172 # Service A - originating service 173 with langfuse.start_as_current_span(name="api_request"): 174 with langfuse.propagate_attributes( 175 user_id="user_123", 176 session_id="session_abc", 177 as_baggage=True # Propagate via HTTP headers 178 ): 179 # Make HTTP request to Service B 180 response = requests.get("https://service-b.example.com/api") 181 # user_id and session_id are now in HTTP headers 182 183 # Service B - downstream service 184 # OpenTelemetry will automatically extract baggage from HTTP headers 185 # and propagate to spans in Service B 186 ``` 187 188 Note: 189 - **Validation**: All attribute values (user_id, session_id, metadata values) 190 must be strings â¤200 characters. Invalid values will be dropped with a 191 warning logged. Ensure values meet constraints before calling. 192 - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood, 193 making it compatible with other OTel-instrumented libraries. 194 195 Raises: 196 No exceptions are raised. Invalid values are logged as warnings and dropped. 197 """ 198 return _propagate_attributes( 199 user_id=user_id, 200 session_id=session_id, 201 metadata=metadata, 202 version=version, 203 tags=tags, 204 trace_name=trace_name, 205 as_baggage=as_baggage, 206 )
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys and values must be US-ASCII strings
- All values must be ≤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_span(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_span(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):
with langfuse.start_as_current_span(name="workflow") as span: # These spans WON'T have user_id early_span = langfuse.start_span(name="early_work") early_span.end() # Set attributes in the middle with langfuse.propagate_attributes(user_id="user_123"): # Only spans created AFTER this point will have user_id late_span = langfuse.start_span(name="late_work") late_span.end() # Result: Aggregations by user_id will miss "early_work" spanCross-service propagation with baggage (advanced):
# Service A - originating service
with langfuse.start_as_current_span(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
- Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
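The function is also exported at the package root alongside the `observe` decorator, so attributes can be propagated from inside a decorated function. A minimal sketch, assuming a hypothetical `handle_message` workflow and a hypothetical `run_pipeline` helper (neither is part of the SDK):

```python
from langfuse import observe, propagate_attributes

@observe()  # traces handle_message; nested observations become child spans
def handle_message(user_id: str, session_id: str, message: str) -> str:
    # Set trace-level attributes as early as possible inside the traced function
    with propagate_attributes(user_id=user_id, session_id=session_id):
        return run_pipeline(message)  # hypothetical downstream work, traced as child spans
```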
1166class LangfuseSpan(LangfuseObservationWrapper): 1167 """Standard span implementation for general operations in Langfuse. 1168 1169 This class represents a general-purpose span that can be used to trace 1170 any operation in your application. It extends the base LangfuseObservationWrapper 1171 with specific methods for creating child spans, generations, and updating 1172 span-specific attributes. If possible, use a more specific type for 1173 better observability and insights. 1174 """ 1175 1176 def __init__( 1177 self, 1178 *, 1179 otel_span: otel_trace_api.Span, 1180 langfuse_client: "Langfuse", 1181 input: Optional[Any] = None, 1182 output: Optional[Any] = None, 1183 metadata: Optional[Any] = None, 1184 environment: Optional[str] = None, 1185 version: Optional[str] = None, 1186 level: Optional[SpanLevel] = None, 1187 status_message: Optional[str] = None, 1188 ): 1189 """Initialize a new LangfuseSpan. 1190 1191 Args: 1192 otel_span: The OpenTelemetry span to wrap 1193 langfuse_client: Reference to the parent Langfuse client 1194 input: Input data for the span (any JSON-serializable object) 1195 output: Output data from the span (any JSON-serializable object) 1196 metadata: Additional metadata to associate with the span 1197 environment: The tracing environment 1198 version: Version identifier for the code or component 1199 level: Importance level of the span (info, warning, error) 1200 status_message: Optional status message for the span 1201 """ 1202 super().__init__( 1203 otel_span=otel_span, 1204 as_type="span", 1205 langfuse_client=langfuse_client, 1206 input=input, 1207 output=output, 1208 metadata=metadata, 1209 environment=environment, 1210 version=version, 1211 level=level, 1212 status_message=status_message, 1213 ) 1214 1215 def start_span( 1216 self, 1217 name: str, 1218 input: Optional[Any] = None, 1219 output: Optional[Any] = None, 1220 metadata: Optional[Any] = None, 1221 version: Optional[str] = None, 1222 level: Optional[SpanLevel] = None, 1223 status_message: Optional[str] = None, 1224 ) -> "LangfuseSpan": 1225 """Create a new child span. 1226 1227 This method creates a new child span with this span as the parent. 1228 Unlike start_as_current_span(), this method does not set the new span 1229 as the current span in the context. 
1230 1231 Args: 1232 name: Name of the span (e.g., function or operation name) 1233 input: Input data for the operation 1234 output: Output data from the operation 1235 metadata: Additional metadata to associate with the span 1236 version: Version identifier for the code or component 1237 level: Importance level of the span (info, warning, error) 1238 status_message: Optional status message for the span 1239 1240 Returns: 1241 A new LangfuseSpan that must be ended with .end() when complete 1242 1243 Example: 1244 ```python 1245 parent_span = langfuse.start_span(name="process-request") 1246 try: 1247 # Create a child span 1248 child_span = parent_span.start_span(name="validate-input") 1249 try: 1250 # Do validation work 1251 validation_result = validate(request_data) 1252 child_span.update(output=validation_result) 1253 finally: 1254 child_span.end() 1255 1256 # Continue with parent span 1257 result = process_validated_data(validation_result) 1258 parent_span.update(output=result) 1259 finally: 1260 parent_span.end() 1261 ``` 1262 """ 1263 return self.start_observation( 1264 name=name, 1265 as_type="span", 1266 input=input, 1267 output=output, 1268 metadata=metadata, 1269 version=version, 1270 level=level, 1271 status_message=status_message, 1272 ) 1273 1274 def start_as_current_span( 1275 self, 1276 *, 1277 name: str, 1278 input: Optional[Any] = None, 1279 output: Optional[Any] = None, 1280 metadata: Optional[Any] = None, 1281 version: Optional[str] = None, 1282 level: Optional[SpanLevel] = None, 1283 status_message: Optional[str] = None, 1284 ) -> _AgnosticContextManager["LangfuseSpan"]: 1285 """[DEPRECATED] Create a new child span and set it as the current span in a context manager. 1286 1287 DEPRECATED: This method is deprecated and will be removed in a future version. 1288 Use start_as_current_observation(as_type='span') instead. 1289 1290 This method creates a new child span and sets it as the current span within 1291 a context manager. It should be used with a 'with' statement to automatically 1292 manage the span's lifecycle. 1293 1294 Args: 1295 name: Name of the span (e.g., function or operation name) 1296 input: Input data for the operation 1297 output: Output data from the operation 1298 metadata: Additional metadata to associate with the span 1299 version: Version identifier for the code or component 1300 level: Importance level of the span (info, warning, error) 1301 status_message: Optional status message for the span 1302 1303 Returns: 1304 A context manager that yields a new LangfuseSpan 1305 1306 Example: 1307 ```python 1308 with langfuse.start_as_current_span(name="process-request") as parent_span: 1309 # Parent span is active here 1310 1311 # Create a child span with context management 1312 with parent_span.start_as_current_span(name="validate-input") as child_span: 1313 # Child span is active here 1314 validation_result = validate(request_data) 1315 child_span.update(output=validation_result) 1316 1317 # Back to parent span context 1318 result = process_validated_data(validation_result) 1319 parent_span.update(output=result) 1320 ``` 1321 """ 1322 warnings.warn( 1323 "start_as_current_span is deprecated and will be removed in a future version. 
" 1324 "Use start_as_current_observation(as_type='span') instead.", 1325 DeprecationWarning, 1326 stacklevel=2, 1327 ) 1328 return self.start_as_current_observation( 1329 name=name, 1330 as_type="span", 1331 input=input, 1332 output=output, 1333 metadata=metadata, 1334 version=version, 1335 level=level, 1336 status_message=status_message, 1337 ) 1338 1339 def start_generation( 1340 self, 1341 *, 1342 name: str, 1343 input: Optional[Any] = None, 1344 output: Optional[Any] = None, 1345 metadata: Optional[Any] = None, 1346 version: Optional[str] = None, 1347 level: Optional[SpanLevel] = None, 1348 status_message: Optional[str] = None, 1349 completion_start_time: Optional[datetime] = None, 1350 model: Optional[str] = None, 1351 model_parameters: Optional[Dict[str, MapValue]] = None, 1352 usage_details: Optional[Dict[str, int]] = None, 1353 cost_details: Optional[Dict[str, float]] = None, 1354 prompt: Optional[PromptClient] = None, 1355 ) -> "LangfuseGeneration": 1356 """[DEPRECATED] Create a new child generation span. 1357 1358 DEPRECATED: This method is deprecated and will be removed in a future version. 1359 Use start_observation(as_type='generation') instead. 1360 1361 This method creates a new child generation span with this span as the parent. 1362 Generation spans are specialized for AI/LLM operations and include additional 1363 fields for model information, usage stats, and costs. 1364 1365 Unlike start_as_current_generation(), this method does not set the new span 1366 as the current span in the context. 1367 1368 Args: 1369 name: Name of the generation operation 1370 input: Input data for the model (e.g., prompts) 1371 output: Output from the model (e.g., completions) 1372 metadata: Additional metadata to associate with the generation 1373 version: Version identifier for the model or component 1374 level: Importance level of the generation (info, warning, error) 1375 status_message: Optional status message for the generation 1376 completion_start_time: When the model started generating the response 1377 model: Name/identifier of the AI model used (e.g., "gpt-4") 1378 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1379 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1380 cost_details: Cost information for the model call 1381 prompt: Associated prompt template from Langfuse prompt management 1382 1383 Returns: 1384 A new LangfuseGeneration that must be ended with .end() when complete 1385 1386 Example: 1387 ```python 1388 span = langfuse.start_span(name="process-query") 1389 try: 1390 # Create a generation child span 1391 generation = span.start_generation( 1392 name="generate-answer", 1393 model="gpt-4", 1394 input={"prompt": "Explain quantum computing"} 1395 ) 1396 try: 1397 # Call model API 1398 response = llm.generate(...) 1399 1400 generation.update( 1401 output=response.text, 1402 usage_details={ 1403 "prompt_tokens": response.usage.prompt_tokens, 1404 "completion_tokens": response.usage.completion_tokens 1405 } 1406 ) 1407 finally: 1408 generation.end() 1409 1410 # Continue with parent span 1411 span.update(output={"answer": response.text, "source": "gpt-4"}) 1412 finally: 1413 span.end() 1414 ``` 1415 """ 1416 warnings.warn( 1417 "start_generation is deprecated and will be removed in a future version. 
" 1418 "Use start_observation(as_type='generation') instead.", 1419 DeprecationWarning, 1420 stacklevel=2, 1421 ) 1422 return self.start_observation( 1423 name=name, 1424 as_type="generation", 1425 input=input, 1426 output=output, 1427 metadata=metadata, 1428 version=version, 1429 level=level, 1430 status_message=status_message, 1431 completion_start_time=completion_start_time, 1432 model=model, 1433 model_parameters=model_parameters, 1434 usage_details=usage_details, 1435 cost_details=cost_details, 1436 prompt=prompt, 1437 ) 1438 1439 def start_as_current_generation( 1440 self, 1441 *, 1442 name: str, 1443 input: Optional[Any] = None, 1444 output: Optional[Any] = None, 1445 metadata: Optional[Any] = None, 1446 version: Optional[str] = None, 1447 level: Optional[SpanLevel] = None, 1448 status_message: Optional[str] = None, 1449 completion_start_time: Optional[datetime] = None, 1450 model: Optional[str] = None, 1451 model_parameters: Optional[Dict[str, MapValue]] = None, 1452 usage_details: Optional[Dict[str, int]] = None, 1453 cost_details: Optional[Dict[str, float]] = None, 1454 prompt: Optional[PromptClient] = None, 1455 ) -> _AgnosticContextManager["LangfuseGeneration"]: 1456 """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager. 1457 1458 DEPRECATED: This method is deprecated and will be removed in a future version. 1459 Use start_as_current_observation(as_type='generation') instead. 1460 1461 This method creates a new child generation span and sets it as the current span 1462 within a context manager. Generation spans are specialized for AI/LLM operations 1463 and include additional fields for model information, usage stats, and costs. 1464 1465 Args: 1466 name: Name of the generation operation 1467 input: Input data for the model (e.g., prompts) 1468 output: Output from the model (e.g., completions) 1469 metadata: Additional metadata to associate with the generation 1470 version: Version identifier for the model or component 1471 level: Importance level of the generation (info, warning, error) 1472 status_message: Optional status message for the generation 1473 completion_start_time: When the model started generating the response 1474 model: Name/identifier of the AI model used (e.g., "gpt-4") 1475 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1476 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1477 cost_details: Cost information for the model call 1478 prompt: Associated prompt template from Langfuse prompt management 1479 1480 Returns: 1481 A context manager that yields a new LangfuseGeneration 1482 1483 Example: 1484 ```python 1485 with langfuse.start_as_current_span(name="process-request") as span: 1486 # Prepare data 1487 query = preprocess_user_query(user_input) 1488 1489 # Create a generation span with context management 1490 with span.start_as_current_generation( 1491 name="generate-answer", 1492 model="gpt-4", 1493 input={"query": query} 1494 ) as generation: 1495 # Generation span is active here 1496 response = llm.generate(query) 1497 1498 # Update with results 1499 generation.update( 1500 output=response.text, 1501 usage_details={ 1502 "prompt_tokens": response.usage.prompt_tokens, 1503 "completion_tokens": response.usage.completion_tokens 1504 } 1505 ) 1506 1507 # Back to parent span context 1508 span.update(output={"answer": response.text, "source": "gpt-4"}) 1509 ``` 1510 """ 1511 warnings.warn( 1512 "start_as_current_generation is deprecated and will be 
removed in a future version. " 1513 "Use start_as_current_observation(as_type='generation') instead.", 1514 DeprecationWarning, 1515 stacklevel=2, 1516 ) 1517 return self.start_as_current_observation( 1518 name=name, 1519 as_type="generation", 1520 input=input, 1521 output=output, 1522 metadata=metadata, 1523 version=version, 1524 level=level, 1525 status_message=status_message, 1526 completion_start_time=completion_start_time, 1527 model=model, 1528 model_parameters=model_parameters, 1529 usage_details=usage_details, 1530 cost_details=cost_details, 1531 prompt=prompt, 1532 ) 1533 1534 def create_event( 1535 self, 1536 *, 1537 name: str, 1538 input: Optional[Any] = None, 1539 output: Optional[Any] = None, 1540 metadata: Optional[Any] = None, 1541 version: Optional[str] = None, 1542 level: Optional[SpanLevel] = None, 1543 status_message: Optional[str] = None, 1544 ) -> "LangfuseEvent": 1545 """Create a new Langfuse observation of type 'EVENT'. 1546 1547 Args: 1548 name: Name of the span (e.g., function or operation name) 1549 input: Input data for the operation (can be any JSON-serializable object) 1550 output: Output data from the operation (can be any JSON-serializable object) 1551 metadata: Additional metadata to associate with the span 1552 version: Version identifier for the code or component 1553 level: Importance level of the span (info, warning, error) 1554 status_message: Optional status message for the span 1555 1556 Returns: 1557 The LangfuseEvent object 1558 1559 Example: 1560 ```python 1561 event = langfuse.create_event(name="process-event") 1562 ``` 1563 """ 1564 timestamp = time_ns() 1565 1566 with otel_trace_api.use_span(self._otel_span): 1567 new_otel_span = self._langfuse_client._otel_tracer.start_span( 1568 name=name, start_time=timestamp 1569 ) 1570 1571 return cast( 1572 "LangfuseEvent", 1573 LangfuseEvent( 1574 otel_span=new_otel_span, 1575 langfuse_client=self._langfuse_client, 1576 input=input, 1577 output=output, 1578 metadata=metadata, 1579 environment=self._environment, 1580 version=version, 1581 level=level, 1582 status_message=status_message, 1583 ).end(end_time=timestamp), 1584 )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
1176 def __init__( 1177 self, 1178 *, 1179 otel_span: otel_trace_api.Span, 1180 langfuse_client: "Langfuse", 1181 input: Optional[Any] = None, 1182 output: Optional[Any] = None, 1183 metadata: Optional[Any] = None, 1184 environment: Optional[str] = None, 1185 version: Optional[str] = None, 1186 level: Optional[SpanLevel] = None, 1187 status_message: Optional[str] = None, 1188 ): 1189 """Initialize a new LangfuseSpan. 1190 1191 Args: 1192 otel_span: The OpenTelemetry span to wrap 1193 langfuse_client: Reference to the parent Langfuse client 1194 input: Input data for the span (any JSON-serializable object) 1195 output: Output data from the span (any JSON-serializable object) 1196 metadata: Additional metadata to associate with the span 1197 environment: The tracing environment 1198 version: Version identifier for the code or component 1199 level: Importance level of the span (info, warning, error) 1200 status_message: Optional status message for the span 1201 """ 1202 super().__init__( 1203 otel_span=otel_span, 1204 as_type="span", 1205 langfuse_client=langfuse_client, 1206 input=input, 1207 output=output, 1208 metadata=metadata, 1209 environment=environment, 1210 version=version, 1211 level=level, 1212 status_message=status_message, 1213 )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
1215 def start_span( 1216 self, 1217 name: str, 1218 input: Optional[Any] = None, 1219 output: Optional[Any] = None, 1220 metadata: Optional[Any] = None, 1221 version: Optional[str] = None, 1222 level: Optional[SpanLevel] = None, 1223 status_message: Optional[str] = None, 1224 ) -> "LangfuseSpan": 1225 """Create a new child span. 1226 1227 This method creates a new child span with this span as the parent. 1228 Unlike start_as_current_span(), this method does not set the new span 1229 as the current span in the context. 1230 1231 Args: 1232 name: Name of the span (e.g., function or operation name) 1233 input: Input data for the operation 1234 output: Output data from the operation 1235 metadata: Additional metadata to associate with the span 1236 version: Version identifier for the code or component 1237 level: Importance level of the span (info, warning, error) 1238 status_message: Optional status message for the span 1239 1240 Returns: 1241 A new LangfuseSpan that must be ended with .end() when complete 1242 1243 Example: 1244 ```python 1245 parent_span = langfuse.start_span(name="process-request") 1246 try: 1247 # Create a child span 1248 child_span = parent_span.start_span(name="validate-input") 1249 try: 1250 # Do validation work 1251 validation_result = validate(request_data) 1252 child_span.update(output=validation_result) 1253 finally: 1254 child_span.end() 1255 1256 # Continue with parent span 1257 result = process_validated_data(validation_result) 1258 parent_span.update(output=result) 1259 finally: 1260 parent_span.end() 1261 ``` 1262 """ 1263 return self.start_observation( 1264 name=name, 1265 as_type="span", 1266 input=input, 1267 output=output, 1268 metadata=metadata, 1269 version=version, 1270 level=level, 1271 status_message=status_message, 1272 )
Create a new child span.
This method creates a new child span with this span as the parent. Unlike start_as_current_span(), this method does not set the new span as the current span in the context.
Arguments:
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
A new LangfuseSpan that must be ended with .end() when complete
Example:
parent_span = langfuse.start_span(name="process-request")
try:
    # Create a child span
    child_span = parent_span.start_span(name="validate-input")
    try:
        # Do validation work
        validation_result = validate(request_data)
        child_span.update(output=validation_result)
    finally:
        child_span.end()

    # Continue with parent span
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
finally:
    parent_span.end()
1274 def start_as_current_span( 1275 self, 1276 *, 1277 name: str, 1278 input: Optional[Any] = None, 1279 output: Optional[Any] = None, 1280 metadata: Optional[Any] = None, 1281 version: Optional[str] = None, 1282 level: Optional[SpanLevel] = None, 1283 status_message: Optional[str] = None, 1284 ) -> _AgnosticContextManager["LangfuseSpan"]: 1285 """[DEPRECATED] Create a new child span and set it as the current span in a context manager. 1286 1287 DEPRECATED: This method is deprecated and will be removed in a future version. 1288 Use start_as_current_observation(as_type='span') instead. 1289 1290 This method creates a new child span and sets it as the current span within 1291 a context manager. It should be used with a 'with' statement to automatically 1292 manage the span's lifecycle. 1293 1294 Args: 1295 name: Name of the span (e.g., function or operation name) 1296 input: Input data for the operation 1297 output: Output data from the operation 1298 metadata: Additional metadata to associate with the span 1299 version: Version identifier for the code or component 1300 level: Importance level of the span (info, warning, error) 1301 status_message: Optional status message for the span 1302 1303 Returns: 1304 A context manager that yields a new LangfuseSpan 1305 1306 Example: 1307 ```python 1308 with langfuse.start_as_current_span(name="process-request") as parent_span: 1309 # Parent span is active here 1310 1311 # Create a child span with context management 1312 with parent_span.start_as_current_span(name="validate-input") as child_span: 1313 # Child span is active here 1314 validation_result = validate(request_data) 1315 child_span.update(output=validation_result) 1316 1317 # Back to parent span context 1318 result = process_validated_data(validation_result) 1319 parent_span.update(output=result) 1320 ``` 1321 """ 1322 warnings.warn( 1323 "start_as_current_span is deprecated and will be removed in a future version. " 1324 "Use start_as_current_observation(as_type='span') instead.", 1325 DeprecationWarning, 1326 stacklevel=2, 1327 ) 1328 return self.start_as_current_observation( 1329 name=name, 1330 as_type="span", 1331 input=input, 1332 output=output, 1333 metadata=metadata, 1334 version=version, 1335 level=level, 1336 status_message=status_message, 1337 )
[DEPRECATED] Create a new child span and set it as the current span in a context manager.
DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='span') instead.
This method creates a new child span and sets it as the current span within a context manager. It should be used with a 'with' statement to automatically manage the span's lifecycle.
Arguments:
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
A context manager that yields a new LangfuseSpan
Example:
with langfuse.start_as_current_span(name="process-request") as parent_span: # Parent span is active here # Create a child span with context management with parent_span.start_as_current_span(name="validate-input") as child_span: # Child span is active here validation_result = validate(request_data) child_span.update(output=validation_result) # Back to parent span context result = process_validated_data(validation_result) parent_span.update(output=result)
1339 def start_generation( 1340 self, 1341 *, 1342 name: str, 1343 input: Optional[Any] = None, 1344 output: Optional[Any] = None, 1345 metadata: Optional[Any] = None, 1346 version: Optional[str] = None, 1347 level: Optional[SpanLevel] = None, 1348 status_message: Optional[str] = None, 1349 completion_start_time: Optional[datetime] = None, 1350 model: Optional[str] = None, 1351 model_parameters: Optional[Dict[str, MapValue]] = None, 1352 usage_details: Optional[Dict[str, int]] = None, 1353 cost_details: Optional[Dict[str, float]] = None, 1354 prompt: Optional[PromptClient] = None, 1355 ) -> "LangfuseGeneration": 1356 """[DEPRECATED] Create a new child generation span. 1357 1358 DEPRECATED: This method is deprecated and will be removed in a future version. 1359 Use start_observation(as_type='generation') instead. 1360 1361 This method creates a new child generation span with this span as the parent. 1362 Generation spans are specialized for AI/LLM operations and include additional 1363 fields for model information, usage stats, and costs. 1364 1365 Unlike start_as_current_generation(), this method does not set the new span 1366 as the current span in the context. 1367 1368 Args: 1369 name: Name of the generation operation 1370 input: Input data for the model (e.g., prompts) 1371 output: Output from the model (e.g., completions) 1372 metadata: Additional metadata to associate with the generation 1373 version: Version identifier for the model or component 1374 level: Importance level of the generation (info, warning, error) 1375 status_message: Optional status message for the generation 1376 completion_start_time: When the model started generating the response 1377 model: Name/identifier of the AI model used (e.g., "gpt-4") 1378 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1379 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1380 cost_details: Cost information for the model call 1381 prompt: Associated prompt template from Langfuse prompt management 1382 1383 Returns: 1384 A new LangfuseGeneration that must be ended with .end() when complete 1385 1386 Example: 1387 ```python 1388 span = langfuse.start_span(name="process-query") 1389 try: 1390 # Create a generation child span 1391 generation = span.start_generation( 1392 name="generate-answer", 1393 model="gpt-4", 1394 input={"prompt": "Explain quantum computing"} 1395 ) 1396 try: 1397 # Call model API 1398 response = llm.generate(...) 1399 1400 generation.update( 1401 output=response.text, 1402 usage_details={ 1403 "prompt_tokens": response.usage.prompt_tokens, 1404 "completion_tokens": response.usage.completion_tokens 1405 } 1406 ) 1407 finally: 1408 generation.end() 1409 1410 # Continue with parent span 1411 span.update(output={"answer": response.text, "source": "gpt-4"}) 1412 finally: 1413 span.end() 1414 ``` 1415 """ 1416 warnings.warn( 1417 "start_generation is deprecated and will be removed in a future version. " 1418 "Use start_observation(as_type='generation') instead.", 1419 DeprecationWarning, 1420 stacklevel=2, 1421 ) 1422 return self.start_observation( 1423 name=name, 1424 as_type="generation", 1425 input=input, 1426 output=output, 1427 metadata=metadata, 1428 version=version, 1429 level=level, 1430 status_message=status_message, 1431 completion_start_time=completion_start_time, 1432 model=model, 1433 model_parameters=model_parameters, 1434 usage_details=usage_details, 1435 cost_details=cost_details, 1436 prompt=prompt, 1437 )
[DEPRECATED] Create a new child generation span.
DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.
This method creates a new child generation span with this span as the parent. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.
Unlike start_as_current_generation(), this method does not set the new span as the current span in the context.
Arguments:
- name: Name of the generation operation
- input: Input data for the model (e.g., prompts)
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A new LangfuseGeneration that must be ended with .end() when complete
Example:
span = langfuse.start_span(name="process-query")
try:
    # Create a generation child span
    generation = span.start_generation(
        name="generate-answer",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"}
    )
    try:
        # Call model API
        response = llm.generate(...)

        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )
    finally:
        generation.end()

    # Continue with parent span
    span.update(output={"answer": response.text, "source": "gpt-4"})
finally:
    span.end()
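The deprecation message points to `start_observation(as_type='generation')` as the replacement; a condensed sketch of the same flow under that API (the `llm` client is hypothetical):

```python
span = langfuse.start_span(name="process-query")
try:
    # Recommended replacement for the deprecated start_generation shown above
    generation = span.start_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"},
    )
    try:
        response = llm.generate(...)  # hypothetical model call
        generation.update(output=response.text)
    finally:
        generation.end()
finally:
    span.end()
```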
1439 def start_as_current_generation( 1440 self, 1441 *, 1442 name: str, 1443 input: Optional[Any] = None, 1444 output: Optional[Any] = None, 1445 metadata: Optional[Any] = None, 1446 version: Optional[str] = None, 1447 level: Optional[SpanLevel] = None, 1448 status_message: Optional[str] = None, 1449 completion_start_time: Optional[datetime] = None, 1450 model: Optional[str] = None, 1451 model_parameters: Optional[Dict[str, MapValue]] = None, 1452 usage_details: Optional[Dict[str, int]] = None, 1453 cost_details: Optional[Dict[str, float]] = None, 1454 prompt: Optional[PromptClient] = None, 1455 ) -> _AgnosticContextManager["LangfuseGeneration"]: 1456 """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager. 1457 1458 DEPRECATED: This method is deprecated and will be removed in a future version. 1459 Use start_as_current_observation(as_type='generation') instead. 1460 1461 This method creates a new child generation span and sets it as the current span 1462 within a context manager. Generation spans are specialized for AI/LLM operations 1463 and include additional fields for model information, usage stats, and costs. 1464 1465 Args: 1466 name: Name of the generation operation 1467 input: Input data for the model (e.g., prompts) 1468 output: Output from the model (e.g., completions) 1469 metadata: Additional metadata to associate with the generation 1470 version: Version identifier for the model or component 1471 level: Importance level of the generation (info, warning, error) 1472 status_message: Optional status message for the generation 1473 completion_start_time: When the model started generating the response 1474 model: Name/identifier of the AI model used (e.g., "gpt-4") 1475 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1476 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1477 cost_details: Cost information for the model call 1478 prompt: Associated prompt template from Langfuse prompt management 1479 1480 Returns: 1481 A context manager that yields a new LangfuseGeneration 1482 1483 Example: 1484 ```python 1485 with langfuse.start_as_current_span(name="process-request") as span: 1486 # Prepare data 1487 query = preprocess_user_query(user_input) 1488 1489 # Create a generation span with context management 1490 with span.start_as_current_generation( 1491 name="generate-answer", 1492 model="gpt-4", 1493 input={"query": query} 1494 ) as generation: 1495 # Generation span is active here 1496 response = llm.generate(query) 1497 1498 # Update with results 1499 generation.update( 1500 output=response.text, 1501 usage_details={ 1502 "prompt_tokens": response.usage.prompt_tokens, 1503 "completion_tokens": response.usage.completion_tokens 1504 } 1505 ) 1506 1507 # Back to parent span context 1508 span.update(output={"answer": response.text, "source": "gpt-4"}) 1509 ``` 1510 """ 1511 warnings.warn( 1512 "start_as_current_generation is deprecated and will be removed in a future version. 
" 1513 "Use start_as_current_observation(as_type='generation') instead.", 1514 DeprecationWarning, 1515 stacklevel=2, 1516 ) 1517 return self.start_as_current_observation( 1518 name=name, 1519 as_type="generation", 1520 input=input, 1521 output=output, 1522 metadata=metadata, 1523 version=version, 1524 level=level, 1525 status_message=status_message, 1526 completion_start_time=completion_start_time, 1527 model=model, 1528 model_parameters=model_parameters, 1529 usage_details=usage_details, 1530 cost_details=cost_details, 1531 prompt=prompt, 1532 )
[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.
This method creates a new child generation span and sets it as the current span within a context manager. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.
Arguments:
- name: Name of the generation operation
- input: Input data for the model (e.g., prompts)
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields a new LangfuseGeneration
Example:
with langfuse.start_as_current_span(name="process-request") as span: # Prepare data query = preprocess_user_query(user_input) # Create a generation span with context management with span.start_as_current_generation( name="generate-answer", model="gpt-4", input={"query": query} ) as generation: # Generation span is active here response = llm.generate(query) # Update with results generation.update( output=response.text, usage_details={ "prompt_tokens": response.usage.prompt_tokens, "completion_tokens": response.usage.completion_tokens } ) # Back to parent span context span.update(output={"answer": response.text, "source": "gpt-4"})
1534 def create_event( 1535 self, 1536 *, 1537 name: str, 1538 input: Optional[Any] = None, 1539 output: Optional[Any] = None, 1540 metadata: Optional[Any] = None, 1541 version: Optional[str] = None, 1542 level: Optional[SpanLevel] = None, 1543 status_message: Optional[str] = None, 1544 ) -> "LangfuseEvent": 1545 """Create a new Langfuse observation of type 'EVENT'. 1546 1547 Args: 1548 name: Name of the span (e.g., function or operation name) 1549 input: Input data for the operation (can be any JSON-serializable object) 1550 output: Output data from the operation (can be any JSON-serializable object) 1551 metadata: Additional metadata to associate with the span 1552 version: Version identifier for the code or component 1553 level: Importance level of the span (info, warning, error) 1554 status_message: Optional status message for the span 1555 1556 Returns: 1557 The LangfuseEvent object 1558 1559 Example: 1560 ```python 1561 event = langfuse.create_event(name="process-event") 1562 ``` 1563 """ 1564 timestamp = time_ns() 1565 1566 with otel_trace_api.use_span(self._otel_span): 1567 new_otel_span = self._langfuse_client._otel_tracer.start_span( 1568 name=name, start_time=timestamp 1569 ) 1570 1571 return cast( 1572 "LangfuseEvent", 1573 LangfuseEvent( 1574 otel_span=new_otel_span, 1575 langfuse_client=self._langfuse_client, 1576 input=input, 1577 output=output, 1578 metadata=metadata, 1579 environment=self._environment, 1580 version=version, 1581 level=level, 1582 status_message=status_message, 1583 ).end(end_time=timestamp), 1584 )
Create a new Langfuse observation of type 'EVENT'.
Arguments:
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The LangfuseEvent object
Example:
event = langfuse.create_event(name="process-event")
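Because this method is defined on the span, an event can also be recorded as a child of an existing span. A small sketch; the span name and event payload are illustrative:

```python
with langfuse.start_as_current_span(name="process-request") as span:
    # Point-in-time event attached to the current span; no .end() call is needed,
    # since create_event ends the observation at its creation timestamp
    span.create_event(
        name="input-validated",
        metadata={"fields_checked": "query,user_id"},  # illustrative payload
    )
```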
1587class LangfuseGeneration(LangfuseObservationWrapper): 1588 """Specialized span implementation for AI model generations in Langfuse. 1589 1590 This class represents a generation span specifically designed for tracking 1591 AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized 1592 attributes for model details, token usage, and costs. 1593 """ 1594 1595 def __init__( 1596 self, 1597 *, 1598 otel_span: otel_trace_api.Span, 1599 langfuse_client: "Langfuse", 1600 input: Optional[Any] = None, 1601 output: Optional[Any] = None, 1602 metadata: Optional[Any] = None, 1603 environment: Optional[str] = None, 1604 version: Optional[str] = None, 1605 level: Optional[SpanLevel] = None, 1606 status_message: Optional[str] = None, 1607 completion_start_time: Optional[datetime] = None, 1608 model: Optional[str] = None, 1609 model_parameters: Optional[Dict[str, MapValue]] = None, 1610 usage_details: Optional[Dict[str, int]] = None, 1611 cost_details: Optional[Dict[str, float]] = None, 1612 prompt: Optional[PromptClient] = None, 1613 ): 1614 """Initialize a new LangfuseGeneration span. 1615 1616 Args: 1617 otel_span: The OpenTelemetry span to wrap 1618 langfuse_client: Reference to the parent Langfuse client 1619 input: Input data for the generation (e.g., prompts) 1620 output: Output from the generation (e.g., completions) 1621 metadata: Additional metadata to associate with the generation 1622 environment: The tracing environment 1623 version: Version identifier for the model or component 1624 level: Importance level of the generation (info, warning, error) 1625 status_message: Optional status message for the generation 1626 completion_start_time: When the model started generating the response 1627 model: Name/identifier of the AI model used (e.g., "gpt-4") 1628 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1629 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1630 cost_details: Cost information for the model call 1631 prompt: Associated prompt template from Langfuse prompt management 1632 """ 1633 super().__init__( 1634 as_type="generation", 1635 otel_span=otel_span, 1636 langfuse_client=langfuse_client, 1637 input=input, 1638 output=output, 1639 metadata=metadata, 1640 environment=environment, 1641 version=version, 1642 level=level, 1643 status_message=status_message, 1644 completion_start_time=completion_start_time, 1645 model=model, 1646 model_parameters=model_parameters, 1647 usage_details=usage_details, 1648 cost_details=cost_details, 1649 prompt=prompt, 1650 )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
1595 def __init__( 1596 self, 1597 *, 1598 otel_span: otel_trace_api.Span, 1599 langfuse_client: "Langfuse", 1600 input: Optional[Any] = None, 1601 output: Optional[Any] = None, 1602 metadata: Optional[Any] = None, 1603 environment: Optional[str] = None, 1604 version: Optional[str] = None, 1605 level: Optional[SpanLevel] = None, 1606 status_message: Optional[str] = None, 1607 completion_start_time: Optional[datetime] = None, 1608 model: Optional[str] = None, 1609 model_parameters: Optional[Dict[str, MapValue]] = None, 1610 usage_details: Optional[Dict[str, int]] = None, 1611 cost_details: Optional[Dict[str, float]] = None, 1612 prompt: Optional[PromptClient] = None, 1613 ): 1614 """Initialize a new LangfuseGeneration span. 1615 1616 Args: 1617 otel_span: The OpenTelemetry span to wrap 1618 langfuse_client: Reference to the parent Langfuse client 1619 input: Input data for the generation (e.g., prompts) 1620 output: Output from the generation (e.g., completions) 1621 metadata: Additional metadata to associate with the generation 1622 environment: The tracing environment 1623 version: Version identifier for the model or component 1624 level: Importance level of the generation (info, warning, error) 1625 status_message: Optional status message for the generation 1626 completion_start_time: When the model started generating the response 1627 model: Name/identifier of the AI model used (e.g., "gpt-4") 1628 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1629 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1630 cost_details: Cost information for the model call 1631 prompt: Associated prompt template from Langfuse prompt management 1632 """ 1633 super().__init__( 1634 as_type="generation", 1635 otel_span=otel_span, 1636 langfuse_client=langfuse_client, 1637 input=input, 1638 output=output, 1639 metadata=metadata, 1640 environment=environment, 1641 version=version, 1642 level=level, 1643 status_message=status_message, 1644 completion_start_time=completion_start_time, 1645 model=model, 1646 model_parameters=model_parameters, 1647 usage_details=usage_details, 1648 cost_details=cost_details, 1649 prompt=prompt, 1650 )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
1653class LangfuseEvent(LangfuseObservationWrapper): 1654 """Specialized span implementation for Langfuse Events.""" 1655 1656 def __init__( 1657 self, 1658 *, 1659 otel_span: otel_trace_api.Span, 1660 langfuse_client: "Langfuse", 1661 input: Optional[Any] = None, 1662 output: Optional[Any] = None, 1663 metadata: Optional[Any] = None, 1664 environment: Optional[str] = None, 1665 version: Optional[str] = None, 1666 level: Optional[SpanLevel] = None, 1667 status_message: Optional[str] = None, 1668 ): 1669 """Initialize a new LangfuseEvent span. 1670 1671 Args: 1672 otel_span: The OpenTelemetry span to wrap 1673 langfuse_client: Reference to the parent Langfuse client 1674 input: Input data for the event 1675 output: Output from the event 1676 metadata: Additional metadata to associate with the generation 1677 environment: The tracing environment 1678 version: Version identifier for the model or component 1679 level: Importance level of the generation (info, warning, error) 1680 status_message: Optional status message for the generation 1681 """ 1682 super().__init__( 1683 otel_span=otel_span, 1684 as_type="event", 1685 langfuse_client=langfuse_client, 1686 input=input, 1687 output=output, 1688 metadata=metadata, 1689 environment=environment, 1690 version=version, 1691 level=level, 1692 status_message=status_message, 1693 ) 1694 1695 def update( 1696 self, 1697 *, 1698 name: Optional[str] = None, 1699 input: Optional[Any] = None, 1700 output: Optional[Any] = None, 1701 metadata: Optional[Any] = None, 1702 version: Optional[str] = None, 1703 level: Optional[SpanLevel] = None, 1704 status_message: Optional[str] = None, 1705 completion_start_time: Optional[datetime] = None, 1706 model: Optional[str] = None, 1707 model_parameters: Optional[Dict[str, MapValue]] = None, 1708 usage_details: Optional[Dict[str, int]] = None, 1709 cost_details: Optional[Dict[str, float]] = None, 1710 prompt: Optional[PromptClient] = None, 1711 **kwargs: Any, 1712 ) -> "LangfuseEvent": 1713 """Update is not allowed for LangfuseEvent because events cannot be updated. 1714 1715 This method logs a warning and returns self without making changes. 1716 1717 Returns: 1718 self: Returns the unchanged LangfuseEvent instance 1719 """ 1720 langfuse_logger.warning( 1721 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1722 ) 1723 return self
Specialized span implementation for Langfuse Events.
1656 def __init__( 1657 self, 1658 *, 1659 otel_span: otel_trace_api.Span, 1660 langfuse_client: "Langfuse", 1661 input: Optional[Any] = None, 1662 output: Optional[Any] = None, 1663 metadata: Optional[Any] = None, 1664 environment: Optional[str] = None, 1665 version: Optional[str] = None, 1666 level: Optional[SpanLevel] = None, 1667 status_message: Optional[str] = None, 1668 ): 1669 """Initialize a new LangfuseEvent span. 1670 1671 Args: 1672 otel_span: The OpenTelemetry span to wrap 1673 langfuse_client: Reference to the parent Langfuse client 1674 input: Input data for the event 1675 output: Output from the event 1676 metadata: Additional metadata to associate with the generation 1677 environment: The tracing environment 1678 version: Version identifier for the model or component 1679 level: Importance level of the generation (info, warning, error) 1680 status_message: Optional status message for the generation 1681 """ 1682 super().__init__( 1683 otel_span=otel_span, 1684 as_type="event", 1685 langfuse_client=langfuse_client, 1686 input=input, 1687 output=output, 1688 metadata=metadata, 1689 environment=environment, 1690 version=version, 1691 level=level, 1692 status_message=status_message, 1693 )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- version: Version identifier for the code or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
1695 def update( 1696 self, 1697 *, 1698 name: Optional[str] = None, 1699 input: Optional[Any] = None, 1700 output: Optional[Any] = None, 1701 metadata: Optional[Any] = None, 1702 version: Optional[str] = None, 1703 level: Optional[SpanLevel] = None, 1704 status_message: Optional[str] = None, 1705 completion_start_time: Optional[datetime] = None, 1706 model: Optional[str] = None, 1707 model_parameters: Optional[Dict[str, MapValue]] = None, 1708 usage_details: Optional[Dict[str, int]] = None, 1709 cost_details: Optional[Dict[str, float]] = None, 1710 prompt: Optional[PromptClient] = None, 1711 **kwargs: Any, 1712 ) -> "LangfuseEvent": 1713 """Update is not allowed for LangfuseEvent because events cannot be updated. 1714 1715 This method logs a warning and returns self without making changes. 1716 1717 Returns: 1718 self: Returns the unchanged LangfuseEvent instance 1719 """ 1720 langfuse_logger.warning( 1721 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1722 ) 1723 return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
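Because events are immutable after creation, fields should be supplied when the event is created rather than via update(). A short sketch of the resulting behavior (the parent span and payload are illustrative):

```python
event = span.create_event(name="cache-hit", metadata={"key": "profile:user_123"})
event.update(output="ignored")  # no-op: logs a warning and returns the event unchanged
```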
27class LangfuseOtelSpanAttributes: 28 # Langfuse-Trace attributes 29 TRACE_NAME = "langfuse.trace.name" 30 TRACE_USER_ID = "user.id" 31 TRACE_SESSION_ID = "session.id" 32 TRACE_TAGS = "langfuse.trace.tags" 33 TRACE_PUBLIC = "langfuse.trace.public" 34 TRACE_METADATA = "langfuse.trace.metadata" 35 TRACE_INPUT = "langfuse.trace.input" 36 TRACE_OUTPUT = "langfuse.trace.output" 37 38 # Langfuse-observation attributes 39 OBSERVATION_TYPE = "langfuse.observation.type" 40 OBSERVATION_METADATA = "langfuse.observation.metadata" 41 OBSERVATION_LEVEL = "langfuse.observation.level" 42 OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message" 43 OBSERVATION_INPUT = "langfuse.observation.input" 44 OBSERVATION_OUTPUT = "langfuse.observation.output" 45 46 # Langfuse-observation of type Generation attributes 47 OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time" 48 OBSERVATION_MODEL = "langfuse.observation.model.name" 49 OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters" 50 OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details" 51 OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details" 52 OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name" 53 OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version" 54 55 # General 56 ENVIRONMENT = "langfuse.environment" 57 RELEASE = "langfuse.release" 58 VERSION = "langfuse.version" 59 60 # Internal 61 AS_ROOT = "langfuse.internal.as_root" 62 63 # Experiments 64 EXPERIMENT_ID = "langfuse.experiment.id" 65 EXPERIMENT_NAME = "langfuse.experiment.name" 66 EXPERIMENT_DESCRIPTION = "langfuse.experiment.description" 67 EXPERIMENT_METADATA = "langfuse.experiment.metadata" 68 EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id" 69 EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id" 70 EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output" 71 EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata" 72 EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
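These attribute names can also be set on plain OpenTelemetry spans created by other instrumented code, provided those spans are exported to Langfuse. A minimal sketch; the tracer name and attribute values are illustrative, and whether the attributes take effect depends on the exporter configuration:

```python
from opentelemetry import trace

from langfuse import LangfuseOtelSpanAttributes

tracer = trace.get_tracer("my-instrumentation")  # illustrative instrumentation scope

with tracer.start_as_current_span("checkout") as span:
    # Attach Langfuse trace-level attributes to a plain OTel span
    span.set_attribute(LangfuseOtelSpanAttributes.TRACE_USER_ID, "user_123")
    span.set_attribute(LangfuseOtelSpanAttributes.TRACE_SESSION_ID, "session_abc")
```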
1726class LangfuseAgent(LangfuseObservationWrapper): 1727 """Agent observation for reasoning blocks that act on tools using LLM guidance.""" 1728 1729 def __init__(self, **kwargs: Any) -> None: 1730 """Initialize a new LangfuseAgent span.""" 1731 kwargs["as_type"] = "agent" 1732 super().__init__(**kwargs)
Agent observation for reasoning blocks that act on tools using LLM guidance.
1735class LangfuseTool(LangfuseObservationWrapper): 1736 """Tool observation representing external tool calls, e.g., calling a weather API.""" 1737 1738 def __init__(self, **kwargs: Any) -> None: 1739 """Initialize a new LangfuseTool span.""" 1740 kwargs["as_type"] = "tool" 1741 super().__init__(**kwargs)
Tool observation representing external tool calls, e.g., calling a weather API.
1744class LangfuseChain(LangfuseObservationWrapper): 1745 """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.""" 1746 1747 def __init__(self, **kwargs: Any) -> None: 1748 """Initialize a new LangfuseChain span.""" 1749 kwargs["as_type"] = "chain" 1750 super().__init__(**kwargs)
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
1762class LangfuseEmbedding(LangfuseObservationWrapper): 1763 """Embedding observation for LLM embedding calls, typically used before retrieval.""" 1764 1765 def __init__(self, **kwargs: Any) -> None: 1766 """Initialize a new LangfuseEmbedding span.""" 1767 kwargs["as_type"] = "embedding" 1768 super().__init__(**kwargs)
Embedding observation for LLM embedding calls, typically used before retrieval.
1771class LangfuseEvaluator(LangfuseObservationWrapper): 1772 """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.""" 1773 1774 def __init__(self, **kwargs: Any) -> None: 1775 """Initialize a new LangfuseEvaluator span.""" 1776 kwargs["as_type"] = "evaluator" 1777 super().__init__(**kwargs)
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
1753class LangfuseRetriever(LangfuseObservationWrapper): 1754 """Retriever observation for data retrieval steps, e.g. vector store or database queries.""" 1755 1756 def __init__(self, **kwargs: Any) -> None: 1757 """Initialize a new LangfuseRetriever span.""" 1758 kwargs["as_type"] = "retriever" 1759 super().__init__(**kwargs)
Retriever observation for data retrieval steps, e.g. vector store or database queries.
1780class LangfuseGuardrail(LangfuseObservationWrapper): 1781 """Guardrail observation for protection e.g. against jailbreaks or offensive content.""" 1782 1783 def __init__(self, **kwargs: Any) -> None: 1784 """Initialize a new LangfuseGuardrail span.""" 1785 kwargs["as_type"] = "guardrail" 1786 super().__init__(**kwargs)
Guardrail observation for protection e.g. against jailbreaks or offensive content.
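These typed wrappers are normally obtained from the client rather than instantiated directly; each subclass only pins `as_type` before delegating to the shared observation wrapper. A minimal sketch follows, assuming the v3 client exposes a `start_as_current_observation(as_type=...)` factory and that the `observe` decorator forwards `as_type`; both names are assumptions to verify against the client reference.

```python
# Sketch only: method names below (start_as_current_observation, as_type on
# @observe) are assumptions about the v3 client surface, not confirmed here.
from langfuse import get_client, observe

langfuse = get_client()

with langfuse.start_as_current_observation(as_type="agent", name="planner") as agent:
    agent.update(input={"goal": "book a flight"})

    with langfuse.start_as_current_observation(as_type="tool", name="search-flights") as tool:
        tool.update(output={"results": 3})


@observe(as_type="retriever")  # assumed: decorator forwards as_type to the created observation
def fetch_context(query: str) -> list:
    return ["doc-1", "doc-2"]
```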
97class Evaluation: 98 """Represents an evaluation result for an experiment item or an entire experiment run. 99 100 This class provides a strongly-typed way to create evaluation results in evaluator functions. 101 Users must use keyword arguments when instantiating this class. 102 103 Attributes: 104 name: Unique identifier for the evaluation metric. Should be descriptive 105 and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). 106 Used for aggregation and comparison across experiment runs. 107 value: The evaluation score or result. Can be: 108 - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) 109 - String: For categorical results like "positive", "negative", "neutral" 110 - Boolean: For binary assessments like "passes_safety_check" 111 comment: Optional human-readable explanation of the evaluation result. 112 Useful for providing context, explaining scoring rationale, or noting 113 special conditions. Displayed in Langfuse UI for interpretability. 114 metadata: Optional structured metadata about the evaluation process. 115 Can include confidence scores, intermediate calculations, model versions, 116 or any other relevant technical details. 117 data_type: Optional score data type. Required if value is not NUMERIC. 118 One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC. 119 config_id: Optional Langfuse score config ID. 120 121 Examples: 122 Basic accuracy evaluation: 123 ```python 124 from langfuse import Evaluation 125 126 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 127 if not expected_output: 128 return Evaluation(name="accuracy", value=None, comment="No expected output") 129 130 is_correct = output.strip().lower() == expected_output.strip().lower() 131 return Evaluation( 132 name="accuracy", 133 value=1.0 if is_correct else 0.0, 134 comment="Correct answer" if is_correct else "Incorrect answer" 135 ) 136 ``` 137 138 Multi-metric evaluator: 139 ```python 140 def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): 141 return [ 142 Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), 143 Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), 144 Evaluation( 145 name="quality", 146 value=0.85, 147 comment="High quality response", 148 metadata={"confidence": 0.92, "model": "gpt-4"} 149 ) 150 ] 151 ``` 152 153 Categorical evaluation: 154 ```python 155 def sentiment_evaluator(*, input, output, **kwargs): 156 sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" 157 return Evaluation( 158 name="sentiment", 159 value=sentiment, 160 comment=f"Response expresses {sentiment} sentiment", 161 data_type="CATEGORICAL" 162 ) 163 ``` 164 165 Failed evaluation with error handling: 166 ```python 167 def external_api_evaluator(*, input, output, **kwargs): 168 try: 169 score = external_api.evaluate(output) 170 return Evaluation(name="external_score", value=score) 171 except Exception as e: 172 return Evaluation( 173 name="external_score", 174 value=None, 175 comment=f"API unavailable: {e}", 176 metadata={"error": str(e), "retry_count": 3} 177 ) 178 ``` 179 180 Note: 181 All arguments must be passed as keywords. Positional arguments are not allowed 182 to ensure code clarity and prevent errors from argument reordering. 
183 """ 184 185 def __init__( 186 self, 187 *, 188 name: str, 189 value: Union[int, float, str, bool], 190 comment: Optional[str] = None, 191 metadata: Optional[Dict[str, Any]] = None, 192 data_type: Optional[ScoreDataType] = None, 193 config_id: Optional[str] = None, 194 ): 195 """Initialize an Evaluation with the provided data. 196 197 Args: 198 name: Unique identifier for the evaluation metric. 199 value: The evaluation score or result. 200 comment: Optional human-readable explanation of the result. 201 metadata: Optional structured metadata about the evaluation process. 202 data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). 203 config_id: Optional Langfuse score config ID. 204 205 Note: 206 All arguments must be provided as keywords. Positional arguments will raise a TypeError. 207 """ 208 self.name = name 209 self.value = value 210 self.comment = comment 211 self.metadata = metadata 212 self.data_type = data_type 213 self.config_id = config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
- Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
- String: For categorical results like "positive", "negative", "neutral"
- Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
    from langfuse import Evaluation

    def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
        if not expected_output:
            return Evaluation(name="accuracy", value=None, comment="No expected output")

        is_correct = output.strip().lower() == expected_output.strip().lower()
        return Evaluation(
            name="accuracy",
            value=1.0 if is_correct else 0.0,
            comment="Correct answer" if is_correct else "Incorrect answer"
        )

Multi-metric evaluator:

    def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
        return [
            Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
            Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
            Evaluation(
                name="quality",
                value=0.85,
                comment="High quality response",
                metadata={"confidence": 0.92, "model": "gpt-4"}
            )
        ]

Categorical evaluation:

    def sentiment_evaluator(*, input, output, **kwargs):
        sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
        return Evaluation(
            name="sentiment",
            value=sentiment,
            comment=f"Response expresses {sentiment} sentiment",
            data_type="CATEGORICAL"
        )

Failed evaluation with error handling:

    def external_api_evaluator(*, input, output, **kwargs):
        try:
            score = external_api.evaluate(output)
            return Evaluation(name="external_score", value=score)
        except Exception as e:
            return Evaluation(
                name="external_score",
                value=None,
                comment=f"API unavailable: {e}",
                metadata={"error": str(e), "retry_count": 3}
            )
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
185 def __init__( 186 self, 187 *, 188 name: str, 189 value: Union[int, float, str, bool], 190 comment: Optional[str] = None, 191 metadata: Optional[Dict[str, Any]] = None, 192 data_type: Optional[ScoreDataType] = None, 193 config_id: Optional[str] = None, 194 ): 195 """Initialize an Evaluation with the provided data. 196 197 Args: 198 name: Unique identifier for the evaluation metric. 199 value: The evaluation score or result. 200 comment: Optional human-readable explanation of the result. 201 metadata: Optional structured metadata about the evaluation process. 202 data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). 203 config_id: Optional Langfuse score config ID. 204 205 Note: 206 All arguments must be provided as keywords. Positional arguments will raise a TypeError. 207 """ 208 self.name = name 209 self.value = value 210 self.comment = comment 211 self.metadata = metadata 212 self.data_type = data_type 213 self.config_id = config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
38class EvaluatorInputs: 39 """Input data structure for evaluators, returned by mapper functions. 40 41 This class provides a strongly-typed container for transforming API response 42 objects (traces, observations) into the standardized format expected 43 by evaluator functions. It ensures consistent access to input, output, expected 44 output, and metadata regardless of the source entity type. 45 46 Attributes: 47 input: The input data that was provided to generate the output being evaluated. 48 For traces, this might be the initial prompt or request. For observations, 49 this could be the span's input. The exact meaning depends on your use case. 50 output: The actual output that was produced and needs to be evaluated. 51 For traces, this is typically the final response. For observations, 52 this might be the generation output or span result. 53 expected_output: Optional ground truth or expected result for comparison. 54 Used by evaluators to assess correctness. May be None if no ground truth 55 is available for the entity being evaluated. 56 metadata: Optional structured metadata providing additional context for evaluation. 57 Can include information about the entity, execution context, user attributes, 58 or any other relevant data that evaluators might use. 59 60 Examples: 61 Simple mapper for traces: 62 ```python 63 from langfuse import EvaluatorInputs 64 65 def trace_mapper(trace): 66 return EvaluatorInputs( 67 input=trace.input, 68 output=trace.output, 69 expected_output=None, # No ground truth available 70 metadata={"user_id": trace.user_id, "tags": trace.tags} 71 ) 72 ``` 73 74 Mapper for observations extracting specific fields: 75 ```python 76 def observation_mapper(observation): 77 # Extract input/output from observation's data 78 input_data = observation.input if hasattr(observation, 'input') else None 79 output_data = observation.output if hasattr(observation, 'output') else None 80 81 return EvaluatorInputs( 82 input=input_data, 83 output=output_data, 84 expected_output=None, 85 metadata={ 86 "observation_type": observation.type, 87 "model": observation.model, 88 "latency_ms": observation.end_time - observation.start_time 89 } 90 ) 91 ``` 92 ``` 93 94 Note: 95 All arguments must be passed as keywords when instantiating this class. 96 """ 97 98 def __init__( 99 self, 100 *, 101 input: Any, 102 output: Any, 103 expected_output: Any = None, 104 metadata: Optional[Dict[str, Any]] = None, 105 ): 106 """Initialize EvaluatorInputs with the provided data. 107 108 Args: 109 input: The input data for evaluation. 110 output: The output data to be evaluated. 111 expected_output: Optional ground truth for comparison. 112 metadata: Optional additional context for evaluation. 113 114 Note: 115 All arguments must be provided as keywords. 116 """ 117 self.input = input 118 self.output = output 119 self.expected_output = expected_output 120 self.metadata = metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
    from langfuse import EvaluatorInputs

    def trace_mapper(trace):
        return EvaluatorInputs(
            input=trace.input,
            output=trace.output,
            expected_output=None,  # No ground truth available
            metadata={"user_id": trace.user_id, "tags": trace.tags}
        )

Mapper for observations extracting specific fields:

    def observation_mapper(observation):
        # Extract input/output from observation's data
        input_data = observation.input if hasattr(observation, 'input') else None
        output_data = observation.output if hasattr(observation, 'output') else None

        return EvaluatorInputs(
            input=input_data,
            output=output_data,
            expected_output=None,
            metadata={
                "observation_type": observation.type,
                "model": observation.model,
                "latency_ms": observation.end_time - observation.start_time
            }
        )
Note:
All arguments must be passed as keywords when instantiating this class.
98 def __init__( 99 self, 100 *, 101 input: Any, 102 output: Any, 103 expected_output: Any = None, 104 metadata: Optional[Dict[str, Any]] = None, 105 ): 106 """Initialize EvaluatorInputs with the provided data. 107 108 Args: 109 input: The input data for evaluation. 110 output: The output data to be evaluated. 111 expected_output: Optional ground truth for comparison. 112 metadata: Optional additional context for evaluation. 113 114 Note: 115 All arguments must be provided as keywords. 116 """ 117 self.input = input 118 self.output = output 119 self.expected_output = expected_output 120 self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
123class MapperFunction(Protocol): 124 """Protocol defining the interface for mapper functions in batch evaluation. 125 126 Mapper functions transform API response objects (traces or observations) 127 into the standardized EvaluatorInputs format that evaluators expect. This abstraction 128 allows you to define how to extract and structure evaluation data from different 129 entity types. 130 131 Mapper functions must: 132 - Accept a single item parameter (trace, observation) 133 - Return an EvaluatorInputs instance with input, output, expected_output, metadata 134 - Can be either synchronous or asynchronous 135 - Should handle missing or malformed data gracefully 136 """ 137 138 def __call__( 139 self, 140 *, 141 item: Union["TraceWithFullDetails", "ObservationsView"], 142 **kwargs: Dict[str, Any], 143 ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]: 144 """Transform an API response object into evaluator inputs. 145 146 This method defines how to extract evaluation-relevant data from the raw 147 API response object. The implementation should map entity-specific fields 148 to the standardized input/output/expected_output/metadata structure. 149 150 Args: 151 item: The API response object to transform. The type depends on the scope: 152 - TraceWithFullDetails: When evaluating traces 153 - ObservationsView: When evaluating observations 154 155 Returns: 156 EvaluatorInputs: A structured container with: 157 - input: The input data that generated the output 158 - output: The output to be evaluated 159 - expected_output: Optional ground truth for comparison 160 - metadata: Optional additional context 161 162 Can return either a direct EvaluatorInputs instance or an awaitable 163 (for async mappers that need to fetch additional data). 164 165 Examples: 166 Basic trace mapper: 167 ```python 168 def map_trace(trace): 169 return EvaluatorInputs( 170 input=trace.input, 171 output=trace.output, 172 expected_output=None, 173 metadata={"trace_id": trace.id, "user": trace.user_id} 174 ) 175 ``` 176 177 Observation mapper with conditional logic: 178 ```python 179 def map_observation(observation): 180 # Extract fields based on observation type 181 if observation.type == "GENERATION": 182 input_data = observation.input 183 output_data = observation.output 184 else: 185 # For other types, use different fields 186 input_data = observation.metadata.get("input") 187 output_data = observation.metadata.get("output") 188 189 return EvaluatorInputs( 190 input=input_data, 191 output=output_data, 192 expected_output=None, 193 metadata={"obs_id": observation.id, "type": observation.type} 194 ) 195 ``` 196 197 Async mapper (if additional processing needed): 198 ```python 199 async def map_trace_async(trace): 200 # Could do async processing here if needed 201 processed_output = await some_async_transformation(trace.output) 202 203 return EvaluatorInputs( 204 input=trace.input, 205 output=processed_output, 206 expected_output=None, 207 metadata={"trace_id": trace.id} 208 ) 209 ``` 210 """ 211 ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions (a minimal end-to-end sketch follows below):
- Must accept a single item parameter (a trace or an observation)
- Must return an EvaluatorInputs instance with input, output, expected_output, and metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
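A minimal sketch of a mapper that satisfies this protocol, wired into run_batched_evaluation. The names used here (EvaluatorInputs, Evaluation, run_batched_evaluation and its scope/mapper/evaluators/filter arguments) are taken from this reference; the evaluator itself is a stand-in.

```python
# Minimal sketch: a MapperFunction-style mapper plus a stand-in evaluator,
# passed to run_batched_evaluation as in the examples in this reference.
from langfuse import Evaluation, EvaluatorInputs, get_client


def map_trace(*, item, **kwargs) -> EvaluatorInputs:
    # item is a TraceWithFullDetails when scope="traces"
    return EvaluatorInputs(
        input=item.input,
        output=item.output,
        expected_output=None,
        metadata={"trace_id": item.id},
    )


def non_empty_output(*, input, output, expected_output=None, **kwargs) -> Evaluation:
    # Stand-in evaluator: flags traces whose output is empty
    return Evaluation(name="non_empty_output", value=bool(output), data_type="BOOLEAN")


client = get_client()
result = client.run_batched_evaluation(
    scope="traces",
    mapper=map_trace,
    evaluators=[non_empty_output],
    filter='{"tags": ["production"]}',
)
print(result)
```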
214class CompositeEvaluatorFunction(Protocol): 215 """Protocol defining the interface for composite evaluator functions. 216 217 Composite evaluators create aggregate scores from multiple item-level evaluations. 218 This is commonly used to compute weighted averages, combined metrics, or other 219 composite assessments based on individual evaluation results. 220 221 Composite evaluators: 222 - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) 223 plus the list of evaluations 224 - Return either a single Evaluation, a list of Evaluations, or a dict 225 - Can be either synchronous or asynchronous 226 - Have access to both raw item data and evaluation results 227 """ 228 229 def __call__( 230 self, 231 *, 232 input: Optional[Any] = None, 233 output: Optional[Any] = None, 234 expected_output: Optional[Any] = None, 235 metadata: Optional[Dict[str, Any]] = None, 236 evaluations: List[Evaluation], 237 **kwargs: Dict[str, Any], 238 ) -> Union[ 239 Evaluation, 240 List[Evaluation], 241 Dict[str, Any], 242 Awaitable[Evaluation], 243 Awaitable[List[Evaluation]], 244 Awaitable[Dict[str, Any]], 245 ]: 246 r"""Create a composite evaluation from item-level evaluation results. 247 248 This method combines multiple evaluation scores into a single composite metric. 249 Common use cases include weighted averages, pass/fail decisions based on multiple 250 criteria, or custom scoring logic that considers multiple dimensions. 251 252 Args: 253 input: The input data that was provided to the system being evaluated. 254 output: The output generated by the system being evaluated. 255 expected_output: The expected/reference output for comparison (if available). 256 metadata: Additional metadata about the evaluation context. 257 evaluations: List of evaluation results from item-level evaluators. 258 Each evaluation contains name, value, comment, and metadata. 259 260 Returns: 261 Can return any of: 262 - Evaluation: A single composite evaluation result 263 - List[Evaluation]: Multiple composite evaluations 264 - Dict: A dict that will be converted to an Evaluation 265 - name: Identifier for the composite metric (e.g., "composite_score") 266 - value: The computed composite value 267 - comment: Optional explanation of how the score was computed 268 - metadata: Optional details about the composition logic 269 270 Can return either a direct Evaluation instance or an awaitable 271 (for async composite evaluators). 
272 273 Examples: 274 Simple weighted average: 275 ```python 276 def weighted_composite(*, input, output, expected_output, metadata, evaluations): 277 weights = { 278 "accuracy": 0.5, 279 "relevance": 0.3, 280 "safety": 0.2 281 } 282 283 total_score = 0.0 284 total_weight = 0.0 285 286 for eval in evaluations: 287 if eval.name in weights and isinstance(eval.value, (int, float)): 288 total_score += eval.value * weights[eval.name] 289 total_weight += weights[eval.name] 290 291 final_score = total_score / total_weight if total_weight > 0 else 0.0 292 293 return Evaluation( 294 name="composite_score", 295 value=final_score, 296 comment=f"Weighted average of {len(evaluations)} metrics" 297 ) 298 ``` 299 300 Pass/fail composite based on thresholds: 301 ```python 302 def pass_fail_composite(*, input, output, expected_output, metadata, evaluations): 303 # Must pass all criteria 304 thresholds = { 305 "accuracy": 0.7, 306 "safety": 0.9, 307 "relevance": 0.6 308 } 309 310 passes = True 311 failing_metrics = [] 312 313 for metric, threshold in thresholds.items(): 314 eval_result = next((e for e in evaluations if e.name == metric), None) 315 if eval_result and isinstance(eval_result.value, (int, float)): 316 if eval_result.value < threshold: 317 passes = False 318 failing_metrics.append(metric) 319 320 return Evaluation( 321 name="passes_all_checks", 322 value=passes, 323 comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed", 324 data_type="BOOLEAN" 325 ) 326 ``` 327 328 Async composite with external scoring: 329 ```python 330 async def llm_composite(*, input, output, expected_output, metadata, evaluations): 331 # Use LLM to synthesize multiple evaluation results 332 eval_summary = "\n".join( 333 f"- {e.name}: {e.value}" for e in evaluations 334 ) 335 336 prompt = f"Given these evaluation scores:\n{eval_summary}\n" 337 prompt += f"For the output: {output}\n" 338 prompt += "Provide an overall quality score from 0-1." 339 340 response = await openai.chat.completions.create( 341 model="gpt-4", 342 messages=[{"role": "user", "content": prompt}] 343 ) 344 345 score = float(response.choices[0].message.content.strip()) 346 347 return Evaluation( 348 name="llm_composite_score", 349 value=score, 350 comment="LLM-synthesized composite score" 351 ) 352 ``` 353 354 Context-aware composite: 355 ```python 356 def context_composite(*, input, output, expected_output, metadata, evaluations): 357 # Adjust weighting based on metadata 358 base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2} 359 360 # If metadata indicates high importance, prioritize accuracy 361 if metadata and metadata.get('importance') == 'high': 362 weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1} 363 else: 364 weights = base_weights 365 366 total = sum( 367 e.value * weights.get(e.name, 0) 368 for e in evaluations 369 if isinstance(e.value, (int, float)) 370 ) 371 372 return Evaluation( 373 name="weighted_composite", 374 value=total, 375 comment="Context-aware weighted composite" 376 ) 377 ``` 378 """ 379 ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
382class EvaluatorStats: 383 """Statistics for a single evaluator's performance during batch evaluation. 384 385 This class tracks detailed metrics about how a specific evaluator performed 386 across all items in a batch evaluation run. It helps identify evaluator issues, 387 understand reliability, and optimize evaluation pipelines. 388 389 Attributes: 390 name: The name of the evaluator function (extracted from __name__). 391 total_runs: Total number of times the evaluator was invoked. 392 successful_runs: Number of times the evaluator completed successfully. 393 failed_runs: Number of times the evaluator raised an exception or failed. 394 total_scores_created: Total number of evaluation scores created by this evaluator. 395 Can be higher than successful_runs if the evaluator returns multiple scores. 396 397 Examples: 398 Accessing evaluator stats from batch evaluation result: 399 ```python 400 result = client.run_batched_evaluation(...) 401 402 for stats in result.evaluator_stats: 403 print(f"Evaluator: {stats.name}") 404 print(f" Success rate: {stats.successful_runs / stats.total_runs:.1%}") 405 print(f" Scores created: {stats.total_scores_created}") 406 407 if stats.failed_runs > 0: 408 print(f" â ī¸ Failed {stats.failed_runs} times") 409 ``` 410 411 Identifying problematic evaluators: 412 ```python 413 result = client.run_batched_evaluation(...) 414 415 # Find evaluators with high failure rates 416 for stats in result.evaluator_stats: 417 failure_rate = stats.failed_runs / stats.total_runs 418 if failure_rate > 0.1: # More than 10% failures 419 print(f"â ī¸ {stats.name} has {failure_rate:.1%} failure rate") 420 print(f" Consider debugging or removing this evaluator") 421 ``` 422 423 Note: 424 All arguments must be passed as keywords when instantiating this class. 425 """ 426 427 def __init__( 428 self, 429 *, 430 name: str, 431 total_runs: int = 0, 432 successful_runs: int = 0, 433 failed_runs: int = 0, 434 total_scores_created: int = 0, 435 ): 436 """Initialize EvaluatorStats with the provided metrics. 437 438 Args: 439 name: The evaluator function name. 440 total_runs: Total number of evaluator invocations. 441 successful_runs: Number of successful completions. 442 failed_runs: Number of failures. 443 total_scores_created: Total scores created by this evaluator. 444 445 Note: 446 All arguments must be provided as keywords. 447 """ 448 self.name = name 449 self.total_runs = total_runs 450 self.successful_runs = successful_runs 451 self.failed_runs = failed_runs 452 self.total_scores_created = total_scores_created
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
    result = client.run_batched_evaluation(...)

    for stats in result.evaluator_stats:
        print(f"Evaluator: {stats.name}")
        print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
        print(f"  Scores created: {stats.total_scores_created}")

        if stats.failed_runs > 0:
            print(f"  ⚠️ Failed {stats.failed_runs} times")

Identifying problematic evaluators:

    result = client.run_batched_evaluation(...)

    # Find evaluators with high failure rates
    for stats in result.evaluator_stats:
        failure_rate = stats.failed_runs / stats.total_runs
        if failure_rate > 0.1:  # More than 10% failures
            print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
            print(f"  Consider debugging or removing this evaluator")
Note:
All arguments must be passed as keywords when instantiating this class.
427 def __init__( 428 self, 429 *, 430 name: str, 431 total_runs: int = 0, 432 successful_runs: int = 0, 433 failed_runs: int = 0, 434 total_scores_created: int = 0, 435 ): 436 """Initialize EvaluatorStats with the provided metrics. 437 438 Args: 439 name: The evaluator function name. 440 total_runs: Total number of evaluator invocations. 441 successful_runs: Number of successful completions. 442 failed_runs: Number of failures. 443 total_scores_created: Total scores created by this evaluator. 444 445 Note: 446 All arguments must be provided as keywords. 447 """ 448 self.name = name 449 self.total_runs = total_runs 450 self.successful_runs = successful_runs 451 self.failed_runs = failed_runs 452 self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
455class BatchEvaluationResumeToken: 456 """Token for resuming a failed batch evaluation run. 457 458 This class encapsulates all the information needed to resume a batch evaluation 459 that was interrupted or failed partway through. It uses timestamp-based filtering 460 to avoid re-processing items that were already evaluated, even if the underlying 461 dataset changed between runs. 462 463 Attributes: 464 scope: The type of items being evaluated ("traces", "observations"). 465 filter: The original JSON filter string used to query items. 466 last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. 467 Used to construct a filter that only fetches items after this timestamp. 468 last_processed_id: The ID of the last successfully processed item, for reference. 469 items_processed: Count of items successfully processed before interruption. 470 471 Examples: 472 Resuming a failed batch evaluation: 473 ```python 474 # Initial run that fails partway through 475 try: 476 result = client.run_batched_evaluation( 477 scope="traces", 478 mapper=my_mapper, 479 evaluators=[evaluator1, evaluator2], 480 filter='{"tags": ["production"]}', 481 max_items=10000 482 ) 483 except Exception as e: 484 print(f"Evaluation failed: {e}") 485 486 # Save the resume token 487 if result.resume_token: 488 # Store resume token for later (e.g., in a file or database) 489 import json 490 with open("resume_token.json", "w") as f: 491 json.dump({ 492 "scope": result.resume_token.scope, 493 "filter": result.resume_token.filter, 494 "last_timestamp": result.resume_token.last_processed_timestamp, 495 "last_id": result.resume_token.last_processed_id, 496 "items_done": result.resume_token.items_processed 497 }, f) 498 499 # Later, resume from where it left off 500 with open("resume_token.json") as f: 501 token_data = json.load(f) 502 503 resume_token = BatchEvaluationResumeToken( 504 scope=token_data["scope"], 505 filter=token_data["filter"], 506 last_processed_timestamp=token_data["last_timestamp"], 507 last_processed_id=token_data["last_id"], 508 items_processed=token_data["items_done"] 509 ) 510 511 # Resume the evaluation 512 result = client.run_batched_evaluation( 513 scope="traces", 514 mapper=my_mapper, 515 evaluators=[evaluator1, evaluator2], 516 resume_from=resume_token 517 ) 518 519 print(f"Processed {result.total_items_processed} additional items") 520 ``` 521 522 Handling partial completion: 523 ```python 524 result = client.run_batched_evaluation(...) 525 526 if not result.completed: 527 print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items") 528 print(f"Last item: {result.resume_token.last_processed_id}") 529 print(f"Resume from: {result.resume_token.last_processed_timestamp}") 530 531 # Optionally retry automatically 532 if result.resume_token: 533 print("Retrying...") 534 result = client.run_batched_evaluation( 535 scope=result.resume_token.scope, 536 mapper=my_mapper, 537 evaluators=my_evaluators, 538 resume_from=result.resume_token 539 ) 540 ``` 541 542 Note: 543 All arguments must be passed as keywords when instantiating this class. 544 The timestamp-based approach means that items created after the initial run 545 but before the timestamp will be skipped. This is intentional to avoid 546 duplicates and ensure consistent evaluation. 
547 """ 548 549 def __init__( 550 self, 551 *, 552 scope: str, 553 filter: Optional[str], 554 last_processed_timestamp: str, 555 last_processed_id: str, 556 items_processed: int, 557 ): 558 """Initialize BatchEvaluationResumeToken with the provided state. 559 560 Args: 561 scope: The scope type ("traces", "observations"). 562 filter: The original JSON filter string. 563 last_processed_timestamp: ISO 8601 timestamp of last processed item. 564 last_processed_id: ID of last processed item. 565 items_processed: Count of items processed before interruption. 566 567 Note: 568 All arguments must be provided as keywords. 569 """ 570 self.scope = scope 571 self.filter = filter 572 self.last_processed_timestamp = last_processed_timestamp 573 self.last_processed_id = last_processed_id 574 self.items_processed = items_processed
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
    # Initial run that fails partway through
    try:
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            filter='{"tags": ["production"]}',
            max_items=10000
        )
    except Exception as e:
        print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

    # Later, resume from where it left off
    with open("resume_token.json") as f:
        token_data = json.load(f)

    resume_token = BatchEvaluationResumeToken(
        scope=token_data["scope"],
        filter=token_data["filter"],
        last_processed_timestamp=token_data["last_timestamp"],
        last_processed_id=token_data["last_id"],
        items_processed=token_data["items_done"]
    )

    # Resume the evaluation
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        resume_from=resume_token
    )

    print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

    result = client.run_batched_evaluation(...)

    if not result.completed:
        print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
        print(f"Last item: {result.resume_token.last_processed_id}")
        print(f"Resume from: {result.resume_token.last_processed_timestamp}")

        # Optionally retry automatically
        if result.resume_token:
            print("Retrying...")
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token
            )
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
549 def __init__( 550 self, 551 *, 552 scope: str, 553 filter: Optional[str], 554 last_processed_timestamp: str, 555 last_processed_id: str, 556 items_processed: int, 557 ): 558 """Initialize BatchEvaluationResumeToken with the provided state. 559 560 Args: 561 scope: The scope type ("traces", "observations"). 562 filter: The original JSON filter string. 563 last_processed_timestamp: ISO 8601 timestamp of last processed item. 564 last_processed_id: ID of last processed item. 565 items_processed: Count of items processed before interruption. 566 567 Note: 568 All arguments must be provided as keywords. 569 """ 570 self.scope = scope 571 self.filter = filter 572 self.last_processed_timestamp = last_processed_timestamp 573 self.last_processed_id = last_processed_id 574 self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
577class BatchEvaluationResult: 578 r"""Complete result structure for batch evaluation execution. 579 580 This class encapsulates comprehensive statistics and metadata about a batch 581 evaluation run, including counts, evaluator-specific metrics, timing information, 582 error details, and resume capability. 583 584 Attributes: 585 total_items_fetched: Total number of items fetched from the API. 586 total_items_processed: Number of items successfully evaluated. 587 total_items_failed: Number of items that failed during evaluation. 588 total_scores_created: Total scores created by all item-level evaluators. 589 total_composite_scores_created: Scores created by the composite evaluator. 590 total_evaluations_failed: Number of individual evaluator failures across all items. 591 evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created). 592 resume_token: Token for resuming if evaluation was interrupted (None if completed). 593 completed: True if all items were processed, False if stopped early or failed. 594 duration_seconds: Total time taken to execute the batch evaluation. 595 failed_item_ids: List of IDs for items that failed evaluation. 596 error_summary: Dictionary mapping error types to occurrence counts. 597 has_more_items: True if max_items limit was reached but more items exist. 598 item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite). 599 600 Examples: 601 Basic result inspection: 602 ```python 603 result = client.run_batched_evaluation(...) 604 605 print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}") 606 print(f"Scores created: {result.total_scores_created}") 607 print(f"Duration: {result.duration_seconds:.2f}s") 608 print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}") 609 ``` 610 611 Detailed analysis with evaluator stats: 612 ```python 613 result = client.run_batched_evaluation(...) 614 615 print(f"\nđ Batch Evaluation Results") 616 print(f"{'='*50}") 617 print(f"Items processed: {result.total_items_processed}") 618 print(f"Items failed: {result.total_items_failed}") 619 print(f"Scores created: {result.total_scores_created}") 620 621 if result.total_composite_scores_created > 0: 622 print(f"Composite scores: {result.total_composite_scores_created}") 623 624 print(f"\nđ Evaluator Performance:") 625 for stats in result.evaluator_stats: 626 success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0 627 print(f"\n {stats.name}:") 628 print(f" Success rate: {success_rate:.1%}") 629 print(f" Scores created: {stats.total_scores_created}") 630 if stats.failed_runs > 0: 631 print(f" â ī¸ Failures: {stats.failed_runs}") 632 633 if result.error_summary: 634 print(f"\nâ ī¸ Errors encountered:") 635 for error_type, count in result.error_summary.items(): 636 print(f" {error_type}: {count}") 637 ``` 638 639 Handling incomplete runs: 640 ```python 641 result = client.run_batched_evaluation(...) 
642 643 if not result.completed: 644 print("â ī¸ Evaluation incomplete!") 645 646 if result.resume_token: 647 print(f"Processed {result.resume_token.items_processed} items before failure") 648 print(f"Use resume_from parameter to continue from:") 649 print(f" Timestamp: {result.resume_token.last_processed_timestamp}") 650 print(f" Last ID: {result.resume_token.last_processed_id}") 651 652 if result.has_more_items: 653 print(f"âšī¸ More items available beyond max_items limit") 654 ``` 655 656 Performance monitoring: 657 ```python 658 result = client.run_batched_evaluation(...) 659 660 items_per_second = result.total_items_processed / result.duration_seconds 661 avg_scores_per_item = result.total_scores_created / result.total_items_processed 662 663 print(f"Performance metrics:") 664 print(f" Throughput: {items_per_second:.2f} items/second") 665 print(f" Avg scores/item: {avg_scores_per_item:.2f}") 666 print(f" Total duration: {result.duration_seconds:.2f}s") 667 668 if result.total_evaluations_failed > 0: 669 failure_rate = result.total_evaluations_failed / ( 670 result.total_items_processed * len(result.evaluator_stats) 671 ) 672 print(f" Evaluation failure rate: {failure_rate:.1%}") 673 ``` 674 675 Note: 676 All arguments must be passed as keywords when instantiating this class. 677 """ 678 679 def __init__( 680 self, 681 *, 682 total_items_fetched: int, 683 total_items_processed: int, 684 total_items_failed: int, 685 total_scores_created: int, 686 total_composite_scores_created: int, 687 total_evaluations_failed: int, 688 evaluator_stats: List[EvaluatorStats], 689 resume_token: Optional[BatchEvaluationResumeToken], 690 completed: bool, 691 duration_seconds: float, 692 failed_item_ids: List[str], 693 error_summary: Dict[str, int], 694 has_more_items: bool, 695 item_evaluations: Dict[str, List["Evaluation"]], 696 ): 697 """Initialize BatchEvaluationResult with comprehensive statistics. 698 699 Args: 700 total_items_fetched: Total items fetched from API. 701 total_items_processed: Items successfully evaluated. 702 total_items_failed: Items that failed evaluation. 703 total_scores_created: Scores from item-level evaluators. 704 total_composite_scores_created: Scores from composite evaluator. 705 total_evaluations_failed: Individual evaluator failures. 706 evaluator_stats: Per-evaluator statistics. 707 resume_token: Token for resuming (None if completed). 708 completed: Whether all items were processed. 709 duration_seconds: Total execution time. 710 failed_item_ids: IDs of failed items. 711 error_summary: Error types and counts. 712 has_more_items: Whether more items exist beyond max_items. 713 item_evaluations: Dictionary mapping item IDs to their evaluation results. 714 715 Note: 716 All arguments must be provided as keywords. 
717 """ 718 self.total_items_fetched = total_items_fetched 719 self.total_items_processed = total_items_processed 720 self.total_items_failed = total_items_failed 721 self.total_scores_created = total_scores_created 722 self.total_composite_scores_created = total_composite_scores_created 723 self.total_evaluations_failed = total_evaluations_failed 724 self.evaluator_stats = evaluator_stats 725 self.resume_token = resume_token 726 self.completed = completed 727 self.duration_seconds = duration_seconds 728 self.failed_item_ids = failed_item_ids 729 self.error_summary = error_summary 730 self.has_more_items = has_more_items 731 self.item_evaluations = item_evaluations 732 733 def __str__(self) -> str: 734 """Return a formatted string representation of the batch evaluation results. 735 736 Returns: 737 A multi-line string with a summary of the evaluation results. 738 """ 739 lines = [] 740 lines.append("=" * 60) 741 lines.append("Batch Evaluation Results") 742 lines.append("=" * 60) 743 744 # Summary statistics 745 lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}") 746 lines.append(f"Duration: {self.duration_seconds:.2f}s") 747 lines.append(f"\nItems fetched: {self.total_items_fetched}") 748 lines.append(f"Items processed: {self.total_items_processed}") 749 750 if self.total_items_failed > 0: 751 lines.append(f"Items failed: {self.total_items_failed}") 752 753 # Success rate 754 if self.total_items_fetched > 0: 755 success_rate = self.total_items_processed / self.total_items_fetched * 100 756 lines.append(f"Success rate: {success_rate:.1f}%") 757 758 # Scores created 759 lines.append(f"\nScores created: {self.total_scores_created}") 760 if self.total_composite_scores_created > 0: 761 lines.append(f"Composite scores: {self.total_composite_scores_created}") 762 763 total_scores = self.total_scores_created + self.total_composite_scores_created 764 lines.append(f"Total scores: {total_scores}") 765 766 # Evaluator statistics 767 if self.evaluator_stats: 768 lines.append("\nEvaluator Performance:") 769 for stats in self.evaluator_stats: 770 lines.append(f" {stats.name}:") 771 if stats.total_runs > 0: 772 success_rate = ( 773 stats.successful_runs / stats.total_runs * 100 774 if stats.total_runs > 0 775 else 0 776 ) 777 lines.append( 778 f" Runs: {stats.successful_runs}/{stats.total_runs} " 779 f"({success_rate:.1f}% success)" 780 ) 781 lines.append(f" Scores created: {stats.total_scores_created}") 782 if stats.failed_runs > 0: 783 lines.append(f" Failed runs: {stats.failed_runs}") 784 785 # Performance metrics 786 if self.total_items_processed > 0 and self.duration_seconds > 0: 787 items_per_sec = self.total_items_processed / self.duration_seconds 788 lines.append("\nPerformance:") 789 lines.append(f" Throughput: {items_per_sec:.2f} items/second") 790 if self.total_scores_created > 0: 791 avg_scores = self.total_scores_created / self.total_items_processed 792 lines.append(f" Avg scores per item: {avg_scores:.2f}") 793 794 # Errors and warnings 795 if self.error_summary: 796 lines.append("\nErrors encountered:") 797 for error_type, count in self.error_summary.items(): 798 lines.append(f" {error_type}: {count}") 799 800 # Incomplete run information 801 if not self.completed: 802 lines.append("\nWarning: Evaluation incomplete") 803 if self.resume_token: 804 lines.append( 805 f" Last processed: {self.resume_token.last_processed_timestamp}" 806 ) 807 lines.append(f" Items processed: {self.resume_token.items_processed}") 808 lines.append(" Use resume_from parameter to continue") 
809 810 if self.has_more_items: 811 lines.append("\nNote: More items available beyond max_items limit") 812 813 lines.append("=" * 60) 814 return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
    result = client.run_batched_evaluation(...)

    print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
    print(f"Scores created: {result.total_scores_created}")
    print(f"Duration: {result.duration_seconds:.2f}s")
    print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

    result = client.run_batched_evaluation(...)

    print(f"\n📊 Batch Evaluation Results")
    print(f"{'='*50}")
    print(f"Items processed: {result.total_items_processed}")
    print(f"Items failed: {result.total_items_failed}")
    print(f"Scores created: {result.total_scores_created}")

    if result.total_composite_scores_created > 0:
        print(f"Composite scores: {result.total_composite_scores_created}")

    print(f"\n📊 Evaluator Performance:")
    for stats in result.evaluator_stats:
        success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
        print(f"\n  {stats.name}:")
        print(f"    Success rate: {success_rate:.1%}")
        print(f"    Scores created: {stats.total_scores_created}")
        if stats.failed_runs > 0:
            print(f"    ⚠️ Failures: {stats.failed_runs}")

    if result.error_summary:
        print(f"\n⚠️ Errors encountered:")
        for error_type, count in result.error_summary.items():
            print(f"  {error_type}: {count}")

Handling incomplete runs:

    result = client.run_batched_evaluation(...)

    if not result.completed:
        print("⚠️ Evaluation incomplete!")

        if result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before failure")
            print(f"Use resume_from parameter to continue from:")
            print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
            print(f"  Last ID: {result.resume_token.last_processed_id}")

    if result.has_more_items:
        print(f"ℹ️ More items available beyond max_items limit")

Performance monitoring:

    result = client.run_batched_evaluation(...)

    items_per_second = result.total_items_processed / result.duration_seconds
    avg_scores_per_item = result.total_scores_created / result.total_items_processed

    print(f"Performance metrics:")
    print(f"  Throughput: {items_per_second:.2f} items/second")
    print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
    print(f"  Total duration: {result.duration_seconds:.2f}s")

    if result.total_evaluations_failed > 0:
        failure_rate = result.total_evaluations_failed / (
            result.total_items_processed * len(result.evaluator_stats)
        )
        print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:
All arguments must be passed as keywords when instantiating this class.
679 def __init__( 680 self, 681 *, 682 total_items_fetched: int, 683 total_items_processed: int, 684 total_items_failed: int, 685 total_scores_created: int, 686 total_composite_scores_created: int, 687 total_evaluations_failed: int, 688 evaluator_stats: List[EvaluatorStats], 689 resume_token: Optional[BatchEvaluationResumeToken], 690 completed: bool, 691 duration_seconds: float, 692 failed_item_ids: List[str], 693 error_summary: Dict[str, int], 694 has_more_items: bool, 695 item_evaluations: Dict[str, List["Evaluation"]], 696 ): 697 """Initialize BatchEvaluationResult with comprehensive statistics. 698 699 Args: 700 total_items_fetched: Total items fetched from API. 701 total_items_processed: Items successfully evaluated. 702 total_items_failed: Items that failed evaluation. 703 total_scores_created: Scores from item-level evaluators. 704 total_composite_scores_created: Scores from composite evaluator. 705 total_evaluations_failed: Individual evaluator failures. 706 evaluator_stats: Per-evaluator statistics. 707 resume_token: Token for resuming (None if completed). 708 completed: Whether all items were processed. 709 duration_seconds: Total execution time. 710 failed_item_ids: IDs of failed items. 711 error_summary: Error types and counts. 712 has_more_items: Whether more items exist beyond max_items. 713 item_evaluations: Dictionary mapping item IDs to their evaluation results. 714 715 Note: 716 All arguments must be provided as keywords. 717 """ 718 self.total_items_fetched = total_items_fetched 719 self.total_items_processed = total_items_processed 720 self.total_items_failed = total_items_failed 721 self.total_scores_created = total_scores_created 722 self.total_composite_scores_created = total_composite_scores_created 723 self.total_evaluations_failed = total_evaluations_failed 724 self.evaluator_stats = evaluator_stats 725 self.resume_token = resume_token 726 self.completed = completed 727 self.duration_seconds = duration_seconds 728 self.failed_item_ids = failed_item_ids 729 self.error_summary = error_summary 730 self.has_more_items = has_more_items 731 self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.