smon status update after reconcillation#7767
smon status update after reconcillation#7767simonpasquier merged 35 commits intoprometheus-operator:mainfrom
Conversation
|
@simonpasquier PTAL |
pkg/prometheus/resource_selector.go
Outdated
| // The map's key is the full resource name in the "<namespace>/<name>" form. | ||
| type ResourcesSelection[T configurationResource] map[string]struct { | ||
| // Resource is a generic type that holds a configuration resource with its validation status. | ||
| type Resource[T configurationResource] struct { |
There was a problem hiding this comment.
Resource -> ConfigurationResource?
There was a problem hiding this comment.
and configurationResource -> TypedConfigurationResource?
pkg/prometheus/resource_selector.go
Outdated
| reason string // Reason for rejection; empty if accepted. | ||
| } | ||
|
|
||
| func NewResource[T configurationResource](resource T, err error, reason string) Resource[T] { |
There was a problem hiding this comment.
It doesn't seem to be used?
pkg/prometheus/operator.go
Outdated
| Group: ru.gvr.Group, | ||
| Conditions: []monitoringv1.ConfigResourceCondition{condition}, | ||
| } | ||
| switch r := any(res.resource).(type) { |
There was a problem hiding this comment.
It feels a bit awkward to have a generic type and then fallback to type-casting.
pkg/prometheus/operator.go
Outdated
| // resources selected by the Prometheus or PrometheusAgent. | ||
| func (ru *ConfigResourceSyncer[T]) AddStatus(ctx context.Context, p metav1.Object, resources ResourcesSelection[T]) error { | ||
| for key, res := range resources { | ||
| condition := monitoringv1.ConfigResourceCondition{ |
There was a problem hiding this comment.
(suggestion) can we have a receiver function of Resource which would return a slice of monitoringv1.ConfigResourceCondition? E.g.
func (r *Resource) Conditions() []monitoringv1.ConfigResourceCondition {
...
}
pkg/prometheus/operator.go
Outdated
| binding.Conditions = []monitoringv1.ConfigResourceCondition{condition} | ||
| r.Status.Bindings = append(r.Status.Bindings, binding) | ||
| } | ||
| _, err := ru.mclient.MonitoringV1().ServiceMonitors(r.Namespace).ApplyStatus(ctx, ApplyConfigurationFromServiceMonitor(r), metav1.ApplyOptions{FieldManager: operator.PrometheusOperatorFieldManager, Force: true}) |
There was a problem hiding this comment.
Eventually we should try not updating the status if nothing has changed. But in the scope of this PR, it's ok to blindly apply the new status.
pkg/prometheus/operator.go
Outdated
| } | ||
| _, err := ru.mclient.MonitoringV1().ServiceMonitors(r.Namespace).ApplyStatus(ctx, ApplyConfigurationFromServiceMonitor(r), metav1.ApplyOptions{FieldManager: operator.PrometheusOperatorFieldManager, Force: true}) | ||
| if err != nil { | ||
| ru.logger.Debug("Failed to update serviceMonitor status", "error", err, "key", key) |
There was a problem hiding this comment.
we'll discuss further what's the best approach to deal with such errors (only log or retry?) but at the minimum it should be a warning.
pkg/prometheus/operator.go
Outdated
| if binding.Namespace == p.GetNamespace() && | ||
| binding.Name == p.GetName() && | ||
| binding.Resource == monitoringv1.PrometheusName { | ||
| if binding.Conditions[0].ObservedGeneration != condition.ObservedGeneration { |
There was a problem hiding this comment.
this test isn't needed for now: the resource's generation might have not changed but still the resource's status has changed (e.g. a secret reference which was missing is now valid).
pkg/prometheus/operator.go
Outdated
| _, err := ru.mclient.MonitoringV1().ServiceMonitors(r.Namespace).ApplyStatus(ctx, ApplyConfigurationFromServiceMonitor(r), metav1.ApplyOptions{FieldManager: operator.PrometheusOperatorFieldManager, Force: true}) | ||
| if err != nil { | ||
| ru.logger.Debug("Failed to update serviceMonitor status", "error", err, "key", key) | ||
| return err |
There was a problem hiding this comment.
In general, we should either log the error or return it but not doing both.
…on resource statuses
…ed resource status management
…for clarity and consistency
…ter and updating AddStatus signature
…error returns for cleaner control flow
… update usage in operator
…MonitorStatus implementation
pkg/prometheus/operator.go
Outdated
| return &pStatus, nil | ||
| } | ||
|
|
||
| func (r *ConfigurationResource[T]) condition(observedGeneration int64) []monitoringv1.ConfigResourceCondition { |
There was a problem hiding this comment.
can we move this next to the declaration of ConfigurationResource?
pkg/prometheus/operator.go
Outdated
| return &pStatus, nil | ||
| } | ||
|
|
||
| func (r *ConfigurationResource[T]) condition(observedGeneration int64) []monitoringv1.ConfigResourceCondition { |
There was a problem hiding this comment.
(nit)
| func (r *ConfigurationResource[T]) condition(observedGeneration int64) []monitoringv1.ConfigResourceCondition { | |
| func (r *ConfigurationResource[T]) conditions(observedGeneration int64) []monitoringv1.ConfigResourceCondition { |
pkg/prometheus/operator.go
Outdated
| } | ||
|
|
||
| // AddServiceMonitorStatus add the latest status in serviceMonitors resources selected by the Prometheus or PrometheusAgent. | ||
| func AddServiceMonitorStatus(ctx context.Context, p metav1.Object, c *ConfigResourceSyncer, resources ResourcesSelection[*monitoringv1.ServiceMonitor]) { |
There was a problem hiding this comment.
(suggestion) can we make the function work on a single ServiceMonitor?
pkg/prometheus/resource_selector.go
Outdated
| // The map's key is the full resource name in the "<namespace>/<name>" form. | ||
| type ResourcesSelection[T configurationResource] map[string]struct { | ||
| // ConfigurationResource is a generic type that holds a configuration resource with its validation status. | ||
| type ConfigurationResource[T TypedConfigurationResource] struct { |
There was a problem hiding this comment.
| type ConfigurationResource[T TypedConfigurationResource] struct { | |
| type TypedConfigurationResource[T ConfigurationResource] struct { |
pkg/prometheus/resource_selector.go
Outdated
| // TypedConfigurationResource is a type constraint that permits only the specific pointer types for configuration resources | ||
| // selectable by Prometheus or PrometheusAgent. | ||
| type configurationResource interface { | ||
| type TypedConfigurationResource interface { |
There was a problem hiding this comment.
I suggest that we use the Typed prefix for types which use type parameters (see below).
| type TypedConfigurationResource interface { | |
| type ConfigurationResource interface { |
pkg/prometheus/resource_selector.go
Outdated
| } | ||
|
|
||
| // ResourcesSelection represents a map of configuration resources selected by Prometheus or PrometheusAgent. | ||
| type ResourcesSelection[T TypedConfigurationResource] map[string]ConfigurationResource[T] |
There was a problem hiding this comment.
| type ResourcesSelection[T TypedConfigurationResource] map[string]ConfigurationResource[T] | |
| type TypedResourcesSelection[T ConfigurationResource] map[string]TypedConfigurationResource[T] |
pkg/prometheus/server/operator.go
Outdated
| } | ||
|
|
||
| configResourceSyncer := prompkg.NewConfigResourceSyncer(monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.PrometheusName), c.mclient, logger) | ||
| prompkg.AddServiceMonitorStatus(ctx, p, configResourceSyncer, resources.sMons) |
There was a problem hiding this comment.
Adding to my comment on prompkg.AddServiceMonitorStatus I'd recommend that we iterate over the config resources in this function
| prompkg.AddServiceMonitorStatus(ctx, p, configResourceSyncer, resources.sMons) | |
| for _, sMon := range resources.sMons { | |
| err:= prompkg.UpdateServiceMonitorStatus(ctx, p, configResourceSyncer, sMon) | |
| ... | |
| } |
…clarity and maintainability
…ServiceMonitorStatus
…sistency and clarity
|
@simonpasquier done the changes |
pkg/prometheus/operator.go
Outdated
| } | ||
|
|
||
| // UpdateServiceMonitorStatus add the latest status in serviceMonitor selected by the Prometheus or PrometheusAgent. | ||
| func UpdateServiceMonitorStatus(ctx context.Context, p metav1.Object, c *ConfigResourceSyncer, res TypedConfigurationResource[*monitoringv1.ServiceMonitor]) error { |
There was a problem hiding this comment.
(nit) I would put the resource syncer as the second argument
| func UpdateServiceMonitorStatus(ctx context.Context, p metav1.Object, c *ConfigResourceSyncer, res TypedConfigurationResource[*monitoringv1.ServiceMonitor]) error { | |
| func UpdateServiceMonitorStatus( | |
| ctx context.Context, | |
| c *ConfigResourceSyncer, | |
| p metav1.Object, | |
| res TypedConfigurationResource[*monitoringv1.ServiceMonitor]) error { |
pkg/prometheus/operator.go
Outdated
| binding := &smon.Status.Bindings[i] | ||
| if binding.Namespace == p.GetNamespace() && | ||
| binding.Name == p.GetName() && | ||
| binding.Resource == monitoringv1.PrometheusName { |
There was a problem hiding this comment.
should we also check the group?
| binding.Resource == monitoringv1.PrometheusName { | |
| binding.Resource == c.gvr.Resource { |
There was a problem hiding this comment.
nope, not needed i think
| Conditions: conditions, | ||
| }) | ||
| } | ||
| _, err := c.mclient.MonitoringV1().ServiceMonitors(smon.Namespace).ApplyStatus(ctx, ApplyConfigurationFromServiceMonitor(smon), metav1.ApplyOptions{FieldManager: operator.PrometheusOperatorFieldManager, Force: true}) |
There was a problem hiding this comment.
I suspect that if we set the proper markers on , we could apply the updated binding instead of passing the full list of bindings.
type ConfigResourceStatus struct {
// The list of workload resources (Prometheus or PrometheusAgent) which select the configuration resource.
// +listType=map
// +listMapKey=group
// +listMapKey=resource
// +listMapKey=name
// +listMapKey=namespace
// +optional
Bindings []WorkloadBinding `json:"bindings,omitempty"`
}
pkg/prometheus/server/operator.go
Outdated
| return nil | ||
| } | ||
|
|
||
| // updateConfigResourcesStatus updates the status of the selected configuration resources (serviceMonitor, podMonitor, scrapeClass and podMonitor). |
There was a problem hiding this comment.
| // updateConfigResourcesStatus updates the status of the selected configuration resources (serviceMonitor, podMonitor, scrapeClass and podMonitor). | |
| // updateConfigResourcesStatus updates the status of the selected configuration resources (ServiceMonitor, PodMonitor, ScrapeConfig and PodMonitor). |
pkg/prometheus/server/operator.go
Outdated
| configResourceSyncer := prompkg.NewConfigResourceSyncer(monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.PrometheusName), c.mclient) | ||
| for key, sm := range resources.sMons { | ||
| if err := prompkg.UpdateServiceMonitorStatus(ctx, p, configResourceSyncer, sm); err != nil { | ||
| logger.Warn("Failed to update serviceMonitor status", "error", err, "key", key) |
There was a problem hiding this comment.
| logger.Warn("Failed to update serviceMonitor status", "error", err, "key", key) | |
| logger.Warn("Failed to update ServiceMonitor status", "error", err, "key", key) |
pkg/prometheus/operator.go
Outdated
| mclient monitoringclient.Interface | ||
| } | ||
|
|
||
| func NewConfigResourceSyncer(gvr schema.GroupVersionResource, mclient monitoringclient.Interface) *ConfigResourceSyncer { |
There was a problem hiding this comment.
(suggestion) the workload object could be an input for the constructor since it doesn't change when we call UpdateServiceMonitorStatus().
There was a problem hiding this comment.
didnt get what you are trying to say
There was a problem hiding this comment.
I meant something like:
| func NewConfigResourceSyncer(gvr schema.GroupVersionResource, mclient monitoringclient.Interface) *ConfigResourceSyncer { | |
| func NewConfigResourceSyncer(gvr schema.GroupVersionResource, mclient monitoringclient.Interface, workload metav1.Object) *ConfigResourceSyncer { |
pkg/prometheus/server/operator.go
Outdated
| if len(resources.sMons) == 0 { | ||
| return | ||
| } | ||
|
|
There was a problem hiding this comment.
(nit) we don't need to check if the slice is empty.
| if len(resources.sMons) == 0 { | |
| return | |
| } |
… and updateConfigResourcesStatus functions
pkg/prometheus/operator.go
Outdated
| return &pStatus, nil | ||
| } | ||
|
|
||
| // UpdateServiceMonitorStatus add the latest status in serviceMonitor selected by the Prometheus or PrometheusAgent. |
There was a problem hiding this comment.
| // UpdateServiceMonitorStatus add the latest status in serviceMonitor selected by the Prometheus or PrometheusAgent. | |
| // UpdateServiceMonitorStatus updates the status binding of the serviceMonitor for the given workload. |
pkg/prometheus/operator.go
Outdated
| } | ||
| } | ||
|
|
||
| func (r *TypedConfigurationResource[T]) conditions(observedGeneration int64) []monitoringv1.ConfigResourceCondition { |
There was a problem hiding this comment.
(nit) can we move this close to the TypedConfigurationResource definition?
test/e2e/status_subresource_test.go
Outdated
| _, err = framework.MonClientV1.ServiceMonitors(ns).Create(ctx, smon, v1.CreateOptions{}) | ||
| require.NoError(t, err) | ||
|
|
||
| time.Sleep(10 * time.Second) |
There was a problem hiding this comment.
We need to use a poll-retry construct, sleeping for a fixed duration is too brittle (and also costly). See examples with wait.PollUntilContextTimeout (for instance framework.WaitForResourceAvailable()).
I suggest to add a WaitForConfigurationResourceCondtionmethod to the test/framework package.
test/e2e/status_subresource_test.go
Outdated
| require.NoError(t, err) | ||
| require.Equal(t, 1, len(sm.Status.Bindings)) | ||
| require.Equal(t, name, sm.Status.Bindings[0].Name) | ||
| require.Equal(t, ns, sm.Status.Bindings[0].Namespace) |
There was a problem hiding this comment.
We'd need to check the Accepted condition.
Can we also add a negative test (e.g. configure an invalid secret reference and check that Accepted=false)?
…atus in testServiceMonitorStatusSubresource
|
@simonpasquier ready to merge this |


Description
smon status update after reconcillation
Closes: #7685
Type of change
What type of changes does your code introduce to the Prometheus operator? Put an
xin the box that apply.CHANGE(fix or feature that would cause existing functionality to not work as expected)FEATURE(non-breaking change which adds functionality)BUGFIX(non-breaking change which fixes an issue)ENHANCEMENT(non-breaking change which improves existing functionality)NONE(if none of the other choices apply. Example, tooling, build system, CI, docs, etc.)Verification
Please check the Prometheus-Operator testing guidelines for recommendations about automated tests.
Changelog entry
Please put a one-line changelog entry below. This will be copied to the changelog file during the release process.