Skip to content

Commit 3c3f80b

Browse files
br4243pchaigno
authored andcommitted
bpf: nat: Fix short packet MTU path discovery
Fixes: #33844 Routers that send minimal RFC 1191 ICMP Error packets for MTU path discovery only include the first two 32-bit words of the original L4 header. This is enough to include the L4 checksum for UDP, but not TCP; TCP needs an additional 3 32-bit words of the original datagram. RFC 4884 provides a length field in octet 5 of the ICMP Error packet header to indicate there are additional words of the original datagram included. This pays attention to the RFC 4884 length field and limits the L4 checksum update attempt to cases where there are enough bits of the original datagram included. This updates the current ICMP test packet generation to add RFC 4884 extra length values and adds tests for packets conforming strictly to RFC 1191 which include only 64 bits of the original datagram following the embedded IP header. Signed-off-by: Bill Reese <ReeseW@computer.org>
1 parent 0d21e9a commit 3c3f80b

File tree

2 files changed

+315
-16
lines changed

2 files changed

+315
-16
lines changed

bpf/lib/nat.h

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,9 @@ snat_v4_nat_handle_icmp_error(struct __ctx_buff *ctx, __u64 off)
781781
__u32 icmpoff;
782782
__u8 type;
783783
int ret;
784+
__u32 icmp_xlen_off = (__u32)off + offsetof(struct icmphdr, un.frag.__unused) + 1;
785+
__u8 icmp_xlen;
786+
bool icmp_has_full_l4_header;
784787

785788
/* According to the RFC 5508, any networking equipment that is
786789
* responding with an ICMP Error packet should embed the original
@@ -837,12 +840,28 @@ snat_v4_nat_handle_icmp_error(struct __ctx_buff *ctx, __u64 off)
837840
if (!state)
838841
return NAT_PUNT_TO_STACK;
839842

843+
/*
844+
* The snat_v4_rewrite_headers() call only rewrites the checksum for
845+
* TCP and UDP. UDP's checksum is covered by the RFC 792 inclusion
846+
* of the 1st 64 bits of the datagram following the IP header, but
847+
* TCP needs an additional 3 32-bit words to include the checksum.
848+
*/
849+
if (ctx_load_bytes(ctx, icmp_xlen_off, &icmp_xlen, sizeof(__u8)) < 0)
850+
return DROP_INVALID;
851+
icmp_has_full_l4_header = icmp_xlen >= ((tuple.nexthdr == IPPROTO_TCP) ? 3 : 0);
852+
840853
/* We found SNAT entry to NAT embedded packet. The destination addr
841854
* should be NATed according to the entry.
842855
*/
843856
ret = snat_v4_rewrite_headers(ctx, tuple.nexthdr, inner_l3_off, true, icmpoff,
844857
tuple.saddr, state->to_saddr, IPV4_DADDR_OFF,
845858
tuple.sport, state->to_sport, port_off);
859+
/* Failing to update the inner L4 checksum is not fatal if the header
860+
* is incomplete.
861+
*/
862+
if (!icmp_has_full_l4_header && ret == DROP_CSUM_L4)
863+
ret = 0;
864+
846865
if (IS_ERR(ret))
847866
return ret;
848867

@@ -976,6 +995,11 @@ snat_v4_rev_nat_handle_icmp_error(struct __ctx_buff *ctx,
976995
__u16 port_off;
977996
__u32 icmpoff;
978997
__u8 type;
998+
__u8 icmp_xlen;
999+
bool icmp_has_full_l4_header;
1000+
int ret;
1001+
__u32 icmp_xlen_off = (__u32) inner_l3_off - sizeof(struct icmphdr) +
1002+
offsetof(struct icmphdr, un.frag.__unused) + 1;
9791003

9801004
/* According to the RFC 5508, any networking equipment that is
9811005
* responding with an ICMP Error packet should embed the original
@@ -1035,10 +1059,26 @@ snat_v4_rev_nat_handle_icmp_error(struct __ctx_buff *ctx,
10351059
if (!*state)
10361060
return NAT_PUNT_TO_STACK;
10371061

1062+
/*
1063+
* The snat_v4_rewrite_headers() call only rewrites the checksum for
1064+
* TCP and UDP. UDP's checksum is covered by the RFC 792 inclusion
1065+
* of the 1st 64 bits of the datagram following the IP header, but
1066+
* TCP needs an additional 3 32-bit words to include the checksum.
1067+
*/
1068+
if (ctx_load_bytes(ctx, icmp_xlen_off, &icmp_xlen, sizeof(__u8)) < 0)
1069+
return DROP_INVALID;
1070+
icmp_has_full_l4_header = icmp_xlen >= ((tuple.nexthdr == IPPROTO_TCP) ? 3 : 0);
1071+
10381072
/* The embedded packet was SNATed on egress. Reverse it again: */
1039-
return snat_v4_rewrite_headers(ctx, tuple.nexthdr, (int)inner_l3_off, true, icmpoff,
1040-
tuple.daddr, (*state)->to_daddr, IPV4_SADDR_OFF,
1041-
tuple.dport, (*state)->to_dport, port_off);
1073+
ret = snat_v4_rewrite_headers(ctx, tuple.nexthdr, (int)inner_l3_off, true, icmpoff,
1074+
tuple.daddr, (*state)->to_daddr, IPV4_SADDR_OFF,
1075+
tuple.dport, (*state)->to_dport, port_off);
1076+
/* Failing to update the inner L4 checksum is not fatal if the header
1077+
* is incomplete.
1078+
*/
1079+
if (!icmp_has_full_l4_header && ret == DROP_CSUM_L4)
1080+
ret = 0;
1081+
return ret;
10421082
}
10431083

10441084
static __always_inline __maybe_unused int

0 commit comments

Comments
 (0)