Skip to content

Commit 1a018d5

Browse files
yushoyamaguchipchaigno
authored andcommitted
bpf: Refine inner packet L4 checksum detection
The previous commit added handling for errors that occurred when NAT was applied to the inner header of an ICMP error packet too short to include the inner TCP L4 checksum. That implementation decided whether the inner packet had an L4 checksum by using the length attribute defined in RFC 4884: https://datatracker.ietf.org/doc/html/rfc4884. However, the length attribute is optional. For example, even in Linux kernel 6.17, ICMP error messages are sent without it: https://github.com/torvalds/linux/blob/v6.17/net/ipv4/icmp.c#L734-L739. Assuming that the attribute is always present is causing incorrect behavior. When the attribute exists, it is placed in the reserved bits. When it does not, the bits are zero-filled, and the logic wrongly assumes that the inner TCP checksum is missing, in all cases. This commit changes the logic to use the packet length stored in `ctx` instead of relying on the length attribute. Fix: 3c3f80bd48bc ("bpf: nat: Fix short packet MTU path discovery") Signed-off-by: YushoYamaguchi <ysh.824@outlook.jp>
1 parent 6f03aaf commit 1a018d5

File tree

1 file changed

+15
-27
lines changed

1 file changed

+15
-27
lines changed

bpf/lib/nat.h

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -781,9 +781,8 @@ snat_v4_nat_handle_icmp_error(struct __ctx_buff *ctx, __u64 off,
781781
__u32 icmpoff;
782782
__u8 type;
783783
int ret;
784-
__u32 icmp_xlen_off = (__u32)off + offsetof(struct icmphdr, un.frag.__unused) + 1;
785-
__u8 icmp_xlen;
786-
bool icmp_has_full_l4_header;
784+
bool icmp_has_inner_l4_csum = true;
785+
__u32 total_inner_len = (__u32)ctx_full_len(ctx) - inner_l3_off;
787786

788787
/* According to the RFC 5508, any networking equipment that is
789788
* responding with an ICMP Error packet should embed the original
@@ -841,15 +840,10 @@ snat_v4_nat_handle_icmp_error(struct __ctx_buff *ctx, __u64 off,
841840
if (!*state)
842841
return NAT_PUNT_TO_STACK;
843842

844-
/*
845-
* The snat_v4_rewrite_headers() call only rewrites the checksum for
846-
* TCP and UDP. UDP's checksum is covered by the RFC 792 inclusion
847-
* of the 1st 64 bits of the datagram following the IP header, but
848-
* TCP needs an additional 3 32-bit words to include the checksum.
849-
*/
850-
if (ctx_load_bytes(ctx, icmp_xlen_off, &icmp_xlen, sizeof(__u8)) < 0)
851-
return DROP_INVALID;
852-
icmp_has_full_l4_header = icmp_xlen >= ((tuple.nexthdr == IPPROTO_TCP) ? 3 : 0);
843+
/* Check if the inner L4 header has checksum */
844+
if (tuple.nexthdr == IPPROTO_TCP &&
845+
total_inner_len < iphdr.ihl + TCP_CSUM_OFF + sizeof(__u16))
846+
icmp_has_inner_l4_csum = false;
853847

854848
/* We found SNAT entry to NAT embedded packet. The destination addr
855849
* should be NATed according to the entry.
@@ -860,7 +854,7 @@ snat_v4_nat_handle_icmp_error(struct __ctx_buff *ctx, __u64 off,
860854
/* Failing to update the inner L4 checksum is not fatal if the header
861855
* is incomplete.
862856
*/
863-
if (!icmp_has_full_l4_header && ret == DROP_CSUM_L4)
857+
if (!icmp_has_inner_l4_csum && ret == DROP_CSUM_L4)
864858
ret = 0;
865859

866860
return ret;
@@ -1006,11 +1000,10 @@ snat_v4_rev_nat_handle_icmp_error(struct __ctx_buff *ctx,
10061000
__u16 port_off;
10071001
__u32 icmpoff;
10081002
__u8 type;
1009-
__u8 icmp_xlen;
1010-
bool icmp_has_full_l4_header;
1003+
bool icmp_has_inner_l4_csum = true;
10111004
int ret;
1012-
__u32 icmp_xlen_off = (__u32) inner_l3_off - sizeof(struct icmphdr) +
1013-
offsetof(struct icmphdr, un.frag.__unused) + 1;
1005+
__u32 total_inner_len = (__u32)(ctx_full_len(ctx) - inner_l3_off);
1006+
10141007

10151008
/* According to the RFC 5508, any networking equipment that is
10161009
* responding with an ICMP Error packet should embed the original
@@ -1070,15 +1063,10 @@ snat_v4_rev_nat_handle_icmp_error(struct __ctx_buff *ctx,
10701063
if (!*state)
10711064
return NAT_PUNT_TO_STACK;
10721065

1073-
/*
1074-
* The snat_v4_rewrite_headers() call only rewrites the checksum for
1075-
* TCP and UDP. UDP's checksum is covered by the RFC 792 inclusion
1076-
* of the 1st 64 bits of the datagram following the IP header, but
1077-
* TCP needs an additional 3 32-bit words to include the checksum.
1078-
*/
1079-
if (ctx_load_bytes(ctx, icmp_xlen_off, &icmp_xlen, sizeof(__u8)) < 0)
1080-
return DROP_INVALID;
1081-
icmp_has_full_l4_header = icmp_xlen >= ((tuple.nexthdr == IPPROTO_TCP) ? 3 : 0);
1066+
/* Check if the inner L4 header has checksum */
1067+
if (tuple.nexthdr == IPPROTO_TCP &&
1068+
total_inner_len < iphdr.ihl + TCP_CSUM_OFF + sizeof(__u16))
1069+
icmp_has_inner_l4_csum = false;
10821070

10831071
/* The embedded packet was SNATed on egress. Reverse it again: */
10841072
ret = snat_v4_rewrite_headers(ctx, tuple.nexthdr, (int)inner_l3_off, true, icmpoff,
@@ -1087,7 +1075,7 @@ snat_v4_rev_nat_handle_icmp_error(struct __ctx_buff *ctx,
10871075
/* Failing to update the inner L4 checksum is not fatal if the header
10881076
* is incomplete.
10891077
*/
1090-
if (!icmp_has_full_l4_header && ret == DROP_CSUM_L4)
1078+
if (!icmp_has_inner_l4_csum && ret == DROP_CSUM_L4)
10911079
ret = 0;
10921080
return ret;
10931081
}

0 commit comments

Comments
 (0)