Skip to content

Commit 245d2ea

Browse files

feat(storage): add full object checksum to reader.Attrs (#10538)

1 parent: 29b52dc · commit: 245d2ea

File tree

4 files changed

+123
-67
lines changed

4 files changed

+123
-67
lines changed

storage/grpc_client.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1101,9 +1101,11 @@ func (c *grpcStorageClient) NewRangeReader(ctx context.Context, params *newRange
11011101
wantCRC uint32
11021102
checkCRC bool
11031103
)
1104-
if checksums := msg.GetObjectChecksums(); checksums != nil && checksums.Crc32C != nil && params.offset == 0 && params.length < 0 {
1104+
if checksums := msg.GetObjectChecksums(); checksums != nil && checksums.Crc32C != nil {
1105+
if params.offset == 0 && params.length < 0 {
1106+
checkCRC = true
1107+
}
11051108
wantCRC = checksums.GetCrc32C()
1106-
checkCRC = true
11071109
}
11081110

11091111
r = &Reader{
@@ -1115,6 +1117,7 @@ func (c *grpcStorageClient) NewRangeReader(ctx context.Context, params *newRange
11151117
LastModified: obj.GetUpdateTime().AsTime(),
11161118
Metageneration: obj.GetMetageneration(),
11171119
Generation: obj.GetGeneration(),
1120+
CRC32C: wantCRC,
11181121
},
11191122
reader: &gRPCReader{
11201123
stream: res.stream,

storage/http_client.go

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1415,18 +1415,20 @@ func parseReadResponse(res *http.Response, params *newRangeReaderParams, reopen
14151415
}
14161416
} else {
14171417
size = res.ContentLength
1418-
// Check the CRC iff all of the following hold:
1419-
// - We asked for content (length != 0).
1420-
// - We got all the content (status != PartialContent).
1421-
// - The server sent a CRC header.
1422-
// - The Go http stack did not uncompress the file.
1423-
// - We were not served compressed data that was uncompressed on download.
1424-
// The problem with the last two cases is that the CRC will not match -- GCS
1425-
// computes it on the compressed contents, but we compute it on the
1426-
// uncompressed contents.
1427-
if params.length != 0 && !res.Uncompressed && !uncompressedByServer(res) {
1428-
crc, checkCRC = parseCRC32c(res)
1429-
}
1418+
}
1419+
1420+
// Check the CRC iff all of the following hold:
1421+
// - We asked for content (length != 0).
1422+
// - We got all the content (status != PartialContent).
1423+
// - The server sent a CRC header.
1424+
// - The Go http stack did not uncompress the file.
1425+
// - We were not served compressed data that was uncompressed on download.
1426+
// The problem with the last two cases is that the CRC will not match -- GCS
1427+
// computes it on the compressed contents, but we compute it on the
1428+
// uncompressed contents.
1429+
crc, checkCRC = parseCRC32c(res)
1430+
if params.length == 0 || res.StatusCode == http.StatusPartialContent || res.Uncompressed || uncompressedByServer(res) {
1431+
checkCRC = false
14301432
}
14311433

14321434
remain := res.ContentLength
@@ -1463,6 +1465,8 @@ func parseReadResponse(res *http.Response, params *newRangeReaderParams, reopen
14631465
StartOffset: startOffset,
14641466
Generation: params.gen,
14651467
Metageneration: metaGen,
1468+
CRC32C: crc,
1469+
Decompressed: res.Uncompressed || uncompressedByServer(res),
14661470
}
14671471
return &Reader{
14681472
Attrs: attrs,

storage/integration_test.go

Lines changed: 87 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -3631,78 +3631,87 @@ func TestIntegration_ReadCRC(t *testing.T) {
36313631
offset, length int64
36323632
readCompressed bool // don't decompress a gzipped file
36333633

3634-
wantErr bool
3635-
wantCheck bool // Should Reader try to check the CRC?
3634+
wantErr bool
3635+
wantCheck bool // Should Reader try to check the CRC?
3636+
wantDecompressed bool
36363637
}{
36373638
{
3638-
desc: "uncompressed, entire file",
3639-
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3640-
offset: 0,
3641-
length: -1,
3642-
readCompressed: false,
3643-
wantCheck: true,
3639+
desc: "uncompressed, entire file",
3640+
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3641+
offset: 0,
3642+
length: -1,
3643+
readCompressed: false,
3644+
wantCheck: true,
3645+
wantDecompressed: false,
36443646
},
36453647
{
3646-
desc: "uncompressed, entire file, don't decompress",
3647-
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3648-
offset: 0,
3649-
length: -1,
3650-
readCompressed: true,
3651-
wantCheck: true,
3648+
desc: "uncompressed, entire file, don't decompress",
3649+
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3650+
offset: 0,
3651+
length: -1,
3652+
readCompressed: true,
3653+
wantCheck: true,
3654+
wantDecompressed: false,
36523655
},
36533656
{
3654-
desc: "uncompressed, suffix",
3655-
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3656-
offset: 1,
3657-
length: -1,
3658-
readCompressed: false,
3659-
wantCheck: false,
3657+
desc: "uncompressed, suffix",
3658+
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3659+
offset: 1,
3660+
length: -1,
3661+
readCompressed: false,
3662+
wantCheck: false,
3663+
wantDecompressed: false,
36603664
},
36613665
{
3662-
desc: "uncompressed, prefix",
3663-
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3664-
offset: 0,
3665-
length: 18,
3666-
readCompressed: false,
3667-
wantCheck: false,
3666+
desc: "uncompressed, prefix",
3667+
obj: client.Bucket(uncompressedBucket).Object(uncompressedObject),
3668+
offset: 0,
3669+
length: 18,
3670+
readCompressed: false,
3671+
wantCheck: false,
3672+
wantDecompressed: false,
36683673
},
36693674
{
36703675
// When a gzipped file is unzipped on read, we can't verify the checksum
36713676
// because it was computed against the zipped contents. We can detect
36723677
// this case using http.Response.Uncompressed.
3673-
desc: "compressed, entire file, unzipped",
3674-
obj: client.Bucket(bucket).Object(gzippedObject),
3675-
offset: 0,
3676-
length: -1,
3677-
readCompressed: false,
3678-
wantCheck: false,
3678+
desc: "compressed, entire file, unzipped",
3679+
obj: client.Bucket(bucket).Object(gzippedObject),
3680+
offset: 0,
3681+
length: -1,
3682+
readCompressed: false,
3683+
wantCheck: false,
3684+
wantDecompressed: true,
36793685
},
36803686
{
36813687
// When we read a gzipped file uncompressed, it's like reading a regular file:
36823688
// the served content and the CRC match.
3683-
desc: "compressed, entire file, read compressed",
3684-
obj: client.Bucket(bucket).Object(gzippedObject),
3685-
offset: 0,
3686-
length: -1,
3687-
readCompressed: true,
3688-
wantCheck: true,
3689+
desc: "compressed, entire file, read compressed",
3690+
obj: client.Bucket(bucket).Object(gzippedObject),
3691+
offset: 0,
3692+
length: -1,
3693+
readCompressed: true,
3694+
wantCheck: true,
3695+
wantDecompressed: false,
36893696
},
36903697
{
3691-
desc: "compressed, partial, server unzips",
3692-
obj: client.Bucket(bucket).Object(gzippedObject),
3693-
offset: 1,
3694-
length: 8,
3695-
readCompressed: false,
3696-
wantErr: true, // GCS can't serve part of a gzipped object
3697-
wantCheck: false,
3698+
desc: "compressed, partial, server unzips",
3699+
obj: client.Bucket(bucket).Object(gzippedObject),
3700+
offset: 1,
3701+
length: 8,
3702+
readCompressed: false,
3703+
wantErr: true, // GCS can't serve part of a gzipped object
3704+
wantCheck: false,
3705+
wantDecompressed: true,
36983706
},
36993707
{
3700-
desc: "compressed, partial, read compressed",
3701-
obj: client.Bucket(bucket).Object(gzippedObject),
3702-
offset: 1,
3703-
length: 8,
3704-
readCompressed: true,
3705-
wantCheck: false,
3708+
desc: "compressed, partial, read compressed",
3709+
obj: client.Bucket(bucket).Object(gzippedObject),
3710+
offset: 1,
3711+
length: 8,
3712+
readCompressed: true,
3713+
wantCheck: false,
3714+
wantDecompressed: false,
37063715
},
37073716
} {
37083717
t.Run(test.desc, func(t *testing.T) {
@@ -3720,13 +3729,17 @@ func TestIntegration_ReadCRC(t *testing.T) {
37203729
if got, want := r.checkCRC, test.wantCheck; got != want {
37213730
t.Errorf("%s, checkCRC: got %t, want %t", test.desc, got, want)
37223731
}
3732+
3733+
if got, want := r.Attrs.Decompressed, test.wantDecompressed; got != want {
3734+
t.Errorf("Attrs.Decompressed: got %t, want %t", got, want)
3735+
}
3736+
37233737
_, err = c.readFunc(r)
37243738
_ = r.Close()
37253739
if err != nil {
37263740
t.Fatalf("%s: %v", test.desc, err)
37273741
}
37283742
})
3729-
37303743
}
37313744
})
37323745
}
@@ -4767,6 +4780,10 @@ func TestIntegration_Reader(t *testing.T) {
47674780
if got, want := rc.ContentType(), "text/plain"; got != want {
47684781
t.Errorf("ContentType (%q) = %q; want %q", obj, got, want)
47694782
}
4783+
4784+
if got, want := rc.Attrs.CRC32C, crc32c(contents[obj]); got != want {
4785+
t.Errorf("CRC32C (%q) = %d; want %d", obj, got, want)
4786+
}
47704787
rc.Close()
47714788

47724789
// Check early close.
@@ -4831,6 +4848,15 @@ func TestIntegration_Reader(t *testing.T) {
48314848
if len(slurp) != int(r.want) {
48324849
t.Fatalf("%+v: RangeReader (%d, %d): Read %d bytes, wanted %d bytes", r.desc, r.offset, r.length, len(slurp), r.want)
48334850
}
4851+
// JSON does not return the crc32c on partial reads, so
4852+
// allow got == 0.
4853+
if got, want := rc.Attrs.CRC32C, crc32c(contents[obj]); got != 0 && got != want {
4854+
t.Errorf("RangeReader CRC32C (%q) = %d; want %d", obj, got, want)
4855+
}
4856+
4857+
if rc.Attrs.Decompressed {
4858+
t.Errorf("RangeReader Decompressed (%q) = want false, got %v", obj, rc.Attrs.Decompressed)
4859+
}
48344860

48354861
switch {
48364862
case r.offset < 0: // The case of reading the last N bytes.
@@ -4912,6 +4938,7 @@ func TestIntegration_ReaderAttrs(t *testing.T) {
49124938
LastModified: got.LastModified, // ignored, tested separately
49134939
Generation: attrs.Generation,
49144940
Metageneration: attrs.Metageneration,
4941+
CRC32C: crc32c(c),
49154942
}
49164943
if got != want {
49174944
t.Fatalf("got\t%v,\nwanted\t%v", got, want)
@@ -5211,6 +5238,10 @@ func TestIntegration_NewReaderWithContentEncodingGzip(t *testing.T) {
52115238
if g, w := blob2kBTo3kB, original; !bytes.Equal(g, w) {
52125239
t.Fatalf("Body mismatch\nGot:\n%s\n\nWant:\n%s", g, w)
52135240
}
5241+
5242+
if !r2kBTo3kB.Attrs.Decompressed {
5243+
t.Errorf("Attrs.Decompressed: want true, got %v", r2kBTo3kB.Attrs.Decompressed)
5244+
}
52145245
})
52155246
}
52165247
})
@@ -6350,3 +6381,7 @@ func setUpRequesterPaysBucket(ctx context.Context, t *testing.T, bucket, object
63506381
}
63516382
})
63526383
}
6384+
6385+
func crc32c(b []byte) uint32 {
6386+
return crc32.Checksum(b, crc32.MakeTable(crc32.Castagnoli))
6387+
}

storage/reader.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,19 @@ type ReaderObjectAttrs struct {
6565
// meaningful in the context of a particular generation of a
6666
// particular object.
6767
Metageneration int64
68+
69+
// CRC32C is the CRC32 checksum of the entire object's content using the
70+
// Castagnoli93 polynomial, if available.
71+
CRC32C uint32
72+
73+
// Decompressed is true if the object is stored as a gzip file and was
74+
// decompressed when read.
75+
// Objects are automatically decompressed if the object's metadata property
76+
// "Content-Encoding" is set to "gzip" or satisfies decompressive
77+
// transcoding as per https://cloud.google.com/storage/docs/transcoding.
78+
//
79+
// To prevent decompression on reads, use [ObjectHandle.ReadCompressed].
80+
Decompressed bool
6881
}
6982

7083
// NewReader creates a new Reader to read the contents of the
@@ -91,7 +104,8 @@ func (o *ObjectHandle) NewReader(ctx context.Context) (*Reader, error) {
91104
// If the object's metadata property "Content-Encoding" is set to "gzip" or satisfies
92105
// decompressive transcoding per https://cloud.google.com/storage/docs/transcoding
93106
// that file will be served back whole, regardless of the requested range as
94-
// Google Cloud Storage dictates.
107+
// Google Cloud Storage dictates. If decompressive transcoding occurs,
108+
// [Reader.Attrs.Decompressed] will be true.
95109
//
96110
// By default, reads are made using the Cloud Storage XML API. We recommend
97111
// using the JSON API instead, which can be done by setting [WithJSONReads]

0 commit comments

Comments (0)