Skip to content

Commit d4edf98

Browse files
committed
feat: reject problematic histograms early
Such error will produce debug logs, instead of error log for GCM write (with no mention of series labels). In separate PRs I plan to: * challenge debug level for those errors (should it be error/warn?) * add runtime log level feature prometheus/prometheus#10352 (comment) * pring series that are in the fail send batch in error log. Helps debugging b/387200417 Signed-off-by: bwplotka <bwplotka@google.com>
1 parent 99aa511 commit d4edf98

File tree

2 files changed

+61
-16
lines changed

2 files changed

+61
-16
lines changed

pkg/export/transform.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,12 +304,22 @@ func (d *distribution) build(lset labels.Labels) (*distribution_pb.Distribution,
304304
}
305305

306306
for i, bound := range d.bounds {
307+
if i > 0 && prevBound == bound {
308+
// Bounds has to be higher than the previous one.
309+
// Rarely, but the same bounds can occur due to string to float imprecision
310+
// or invalid representations of the same float e.g. 1 vs 1.0.
311+
// GCM API rejects those, so reject them early.
312+
prometheusSamplesDiscarded.WithLabelValues("duplicate-bucket-boundary").Add(float64(d.inputSampleCount()))
313+
err := fmt.Errorf("invalid histogram with duplicates bounds (le label value) %s: count=%f, sum=%f, dev=%f, index=%d, bucketBound=%f, bucketPrevBound=%f",
314+
lset, d.count, d.sum, dev, i, bound, prevBound)
315+
return nil, err
316+
}
317+
307318
if math.IsInf(bound, 1) {
308319
bound = prevBound
309320
} else {
310321
bounds = append(bounds, bound)
311322
}
312-
313323
val := d.values[i] - prevVal
314324
// val should never be negative and it most likely indicates a bug or a data race in a scraped
315325
// metrics endpoint.

pkg/export/transform_test.go

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,14 @@ func TestSampleBuilder(t *testing.T) {
6464
})
6565

6666
cases := []struct {
67-
doc string
68-
metadata MetadataFunc
69-
series seriesMap
70-
samples [][]record.RefSample
71-
exemplars []map[storage.SeriesRef]record.RefExemplar
72-
matchers Matchers
73-
wantSeries []*monitoring_pb.TimeSeries
74-
wantFail bool
67+
doc string
68+
metadata MetadataFunc
69+
series seriesMap
70+
samples [][]record.RefSample
71+
exemplars []map[storage.SeriesRef]record.RefExemplar
72+
matchers Matchers
73+
wantSeries []*monitoring_pb.TimeSeries
74+
wantFailOnLastSample bool
7575
}{
7676
{
7777
doc: "convert gauge",
@@ -1277,6 +1277,42 @@ func TestSampleBuilder(t *testing.T) {
12771277
},
12781278
},
12791279
},
1280+
{
1281+
// Regression test for b/387200417.
1282+
doc: "histogram with conversion error for boundaries",
1283+
metadata: testMetadataFunc(metricMetadataMap{
1284+
"metric1": {Type: textparse.MetricTypeHistogram, Help: "metric1 help text"},
1285+
}),
1286+
series: seriesMap{
1287+
1: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_sum"),
1288+
2: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_count"),
1289+
3: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_bucket", "le", "0.5"),
1290+
4: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_bucket", "le", "0.50000000000000001"),
1291+
5: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_bucket", "le", "1"),
1292+
6: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_bucket", "le", "1.0"),
1293+
7: labels.FromStrings("job", "job1", "instance", "instance1", "__name__", "metric1_bucket", "le", "+Inf"),
1294+
},
1295+
samples: [][]record.RefSample{
1296+
{
1297+
{Ref: 3, T: 1000, V: 1}, // 0.5
1298+
{Ref: 4, T: 1000, V: 1}, // 0.50000000000000001
1299+
{Ref: 5, T: 1000, V: 5}, // 1
1300+
{Ref: 6, T: 1000, V: 5}, // 1.0
1301+
{Ref: 7, T: 1000, V: 10}, // +Inf
1302+
{Ref: 1, T: 1000, V: 8}, // sum
1303+
{Ref: 2, T: 1000, V: 10}, // count
1304+
}, {
1305+
{Ref: 3, T: 2000, V: 2}, // 0.5
1306+
{Ref: 4, T: 2000, V: 2}, // 0.50000000000000001
1307+
{Ref: 5, T: 2000, V: 6}, // 1
1308+
{Ref: 6, T: 2000, V: 6}, // 1.0
1309+
{Ref: 7, T: 2000, V: 11}, // +Inf
1310+
{Ref: 1, T: 2000, V: 9.5}, // sum
1311+
{Ref: 2, T: 2000, V: 11}, // count
1312+
},
1313+
},
1314+
wantFailOnLastSample: true,
1315+
},
12801316
{
12811317
doc: "convert histogram with exemplars",
12821318
metadata: testMetadataFunc(metricMetadataMap{
@@ -1497,14 +1533,13 @@ func TestSampleBuilder(t *testing.T) {
14971533
exemplars = c.exemplars[i]
14981534
}
14991535
out, tail, err := b.next(c.metadata, externalLabels, batch, exemplars)
1500-
if err == nil && c.wantFail {
1501-
t.Fatal("expected error but got none")
1502-
}
1503-
if err != nil && !c.wantFail {
1504-
t.Fatalf("unexpected error: %s", err)
1505-
}
1506-
if err != nil {
1536+
if c.wantFailOnLastSample && len(c.samples) == i+1 {
1537+
if err == nil {
1538+
t.Fatal("expected error but got none")
1539+
}
15071540
break
1541+
} else if err != nil {
1542+
t.Fatalf("unexpected error: %s %d", err, i)
15081543
}
15091544
if len(tail) >= len(batch) {
15101545
t.Fatalf("no sample was consumed")

0 commit comments

Comments
 (0)