@@ -24,17 +24,25 @@ import (
2424 "log/slog"
2525 "net/http"
2626 "os"
27+ "strconv"
2728
2829 "cloud.google.com/go/auth"
2930 "cloud.google.com/go/auth/credentials"
3031 "cloud.google.com/go/auth/internal"
3132 "cloud.google.com/go/auth/internal/transport"
3233 "cloud.google.com/go/auth/internal/transport/headers"
34+ "github.com/googleapis/gax-go/v2"
35+ "github.com/googleapis/gax-go/v2/callctx"
3336 "github.com/googleapis/gax-go/v2/internallog"
3437 "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
38+ "go.opentelemetry.io/otel/attribute"
39+ "go.opentelemetry.io/otel/trace"
3540 "google.golang.org/grpc"
41+ "google.golang.org/grpc/codes"
3642 grpccreds "google.golang.org/grpc/credentials"
3743 grpcinsecure "google.golang.org/grpc/credentials/insecure"
44+ "google.golang.org/grpc/stats"
45+ "google.golang.org/grpc/status"
3846)
3947
4048const (
@@ -47,6 +55,30 @@ const (
4755 quotaProjectHeaderKey = "X-goog-user-project"
4856)
4957
58+ // codeToStr is a reversal of the `strToCode` map in
59+ // https://github.com/grpc/grpc-go/blob/master/codes/codes.go
60+ // The gRPC specification has exactly 17 status codes, defined
61+ // as a contiguous block of integers from 0 to 16.
62+ var codeToStr = [... ]string {
63+ "OK" , // codes.OK = 0
64+ "CANCELED" , // codes.Canceled = 1
65+ "UNKNOWN" , // codes.Unknown = 2
66+ "INVALID_ARGUMENT" , // codes.InvalidArgument = 3
67+ "DEADLINE_EXCEEDED" , // codes.DeadlineExceeded = 4
68+ "NOT_FOUND" , // codes.NotFound = 5
69+ "ALREADY_EXISTS" , // codes.AlreadyExists = 6
70+ "PERMISSION_DENIED" , // codes.PermissionDenied = 7
71+ "RESOURCE_EXHAUSTED" , // codes.ResourceExhausted = 8
72+ "FAILED_PRECONDITION" , // codes.FailedPrecondition = 9
73+ "ABORTED" , // codes.Aborted = 10
74+ "OUT_OF_RANGE" , // codes.OutOfRange = 11
75+ "UNIMPLEMENTED" , // codes.Unimplemented = 12
76+ "INTERNAL" , // codes.Internal = 13
77+ "UNAVAILABLE" , // codes.Unavailable = 14
78+ "DATA_LOSS" , // codes.DataLoss = 15
79+ "UNAUTHENTICATED" , // codes.Unauthenticated = 16
80+ }
81+
5082var (
5183 // Set at init time by dial_socketopt.go. If nil, socketopt is not supported.
5284 timeoutDialerOption grpc.DialOption
@@ -198,7 +230,7 @@ type InternalOptions struct {
198230 // service.
199231 DefaultScopes []string
200232 // SkipValidation bypasses validation on Options. It should only be used
201- // internally for clients that needs more control over their transport.
233+ // internally for clients that need more control over their transport.
202234 SkipValidation bool
203235 // TelemetryAttributes specifies a map of telemetry attributes to be added
204236 // to all OpenTelemetry signals, such as tracing and metrics, for purposes
@@ -430,5 +462,115 @@ func addOpenTelemetryStatsHandler(dialOpts []grpc.DialOption, opts *Options) []g
430462 if opts .DisableTelemetry {
431463 return dialOpts
432464 }
433- return append (dialOpts , grpc .WithStatsHandler (otelgrpc .NewClientHandler ()))
465+ if ! gax .IsFeatureEnabled ("TRACING" ) {
466+ return append (dialOpts , grpc .WithStatsHandler (otelgrpc .NewClientHandler ()))
467+ }
468+ var staticAttrs []attribute.KeyValue
469+ if opts .InternalOptions != nil {
470+ staticAttrs = transport .StaticTelemetryAttributes (opts .InternalOptions .TelemetryAttributes )
471+ }
472+ otelOpts := []otelgrpc.Option {
473+ otelgrpc .WithSpanAttributes (staticAttrs ... ),
474+ }
475+ return append (dialOpts , grpc .WithStatsHandler (& otelHandler {
476+ Handler : otelgrpc .NewClientHandler (otelOpts ... ),
477+ }))
478+ }
479+
480+ // otelHandler is a wrapper around the OpenTelemetry gRPC client handler that
481+ // adds custom Google Cloud-specific attributes to spans and metrics.
482+ type otelHandler struct {
483+ stats.Handler
484+ }
485+
486+ // TagRPC intercepts the RPC start to extract dynamic attributes like resource
487+ // name and retry count from the outgoing context metadata and attach them to
488+ // the current span.
489+ func (h * otelHandler ) TagRPC (ctx context.Context , info * stats.RPCTagInfo ) context.Context {
490+ ctx = h .Handler .TagRPC (ctx , info )
491+ span := trace .SpanFromContext (ctx )
492+ if ! span .IsRecording () {
493+ return ctx
494+ }
495+ var attrs []attribute.KeyValue
496+ if resName , ok := callctx .TelemetryFromContext (ctx , "resource_name" ); ok {
497+ attrs = append (attrs , attribute .String ("gcp.resource.destination.id" , resName ))
498+ }
499+ if resendCountStr , ok := callctx .TelemetryFromContext (ctx , "resend_count" ); ok {
500+ if count , err := strconv .Atoi (resendCountStr ); err == nil {
501+ attrs = append (attrs , attribute .Int ("gcp.grpc.resend_count" , count ))
502+ }
503+ }
504+ if len (attrs ) > 0 {
505+ span .SetAttributes (attrs ... )
506+ }
507+ return ctx
508+ }
509+
510+ // HandleRPC intercepts the RPC completion to capture and format error-related
511+ // attributes ensuring they conform to Google Cloud observability standards.
512+ func (h * otelHandler ) HandleRPC (ctx context.Context , s stats.RPCStats ) {
513+ end , ok := s .(* stats.End )
514+ if ! ok {
515+ h .Handler .HandleRPC (ctx , s )
516+ return
517+ }
518+ span := trace .SpanFromContext (ctx )
519+ if ! span .IsRecording () {
520+ h .Handler .HandleRPC (ctx , s )
521+ return
522+ }
523+
524+ var attrs []attribute.KeyValue
525+ if end .Error != nil {
526+ st , ok := status .FromError (end .Error )
527+ rpcStatusCode := codeToCanonicalStr (st .Code ())
528+
529+ var errorType string
530+ // 1. Check if the local context expired or was cancelled. This is the only
531+ // reliable way to distinguish a local client timeout from a server timeout
532+ // because gRPC does not wrap context errors in its status.Error types.
533+ if errors .Is (ctx .Err (), context .DeadlineExceeded ) {
534+ errorType = "CLIENT_TIMEOUT"
535+ } else if errors .Is (ctx .Err (), context .Canceled ) {
536+ errorType = "CLIENT_CANCELLED"
537+ } else if ! ok || st .Code () == codes .Unknown || st .Code () == codes .Internal {
538+ // 2. If the error isn't a context breakdown and the gRPC framework
539+ // doesn't "understand" it (returning ok=false or a generic catch-all
540+ // bucket like Unknown/Internal), we "pack" the actual Go error type
541+ // name into error.type (e.g., "*net.OpError"). This is per the error.type
542+ // [spec](https://opentelemetry.io/docs/specs/semconv/registry/attributes/error/#error-type).
543+ // "When error.type is set to a type (e.g., an exception type), its canonical
544+ // class name identifying the type within the artifact SHOULD be used."
545+ errorType = fmt .Sprintf ("%T" , end .Error )
546+ } else {
547+ // 3. Otherwise, it is a well-understood gRPC protocol error (e.g.,
548+ // PERMISSION_DENIED) likely returned by the server.
549+ errorType = rpcStatusCode
550+ }
551+
552+ attrs = []attribute.KeyValue {
553+ attribute .String ("error.type" , errorType ),
554+ attribute .String ("status.message" , st .Message ()),
555+ attribute .String ("rpc.response.status_code" , rpcStatusCode ),
556+ attribute .String ("exception.type" , fmt .Sprintf ("%T" , end .Error )),
557+ }
558+ } else {
559+ attrs = []attribute.KeyValue {
560+ attribute .String ("rpc.response.status_code" , "OK" ),
561+ }
562+ }
563+ span .SetAttributes (attrs ... )
564+ h .Handler .HandleRPC (ctx , s )
565+ }
566+
567+ // codeToCanonicalStr returns the canonical name for each of the 17 gRPC
568+ // status codes defined in https://github.com/grpc/grpc-go/blob/master/codes/codes.go.
569+ // For any codes.Code that converts to an out-of-bounds int,
570+ // it returns "UNKNOWN".
571+ func codeToCanonicalStr (code codes.Code ) string {
572+ if int (code ) >= 0 && int (code ) < len (codeToStr ) {
573+ return codeToStr [code ]
574+ }
575+ return "UNKNOWN"
434576}
0 commit comments