diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 842655d0ca0e9..8e21a93799772 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20771,6 +20771,103 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
                      CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
 }
 
+// Attempt to use REVs for half-rotations of vectors of i16, i32 and i64.
+// Patterns for i32:
+//
+// (OR (SHL_PRED Pg, X, (splat 16)),
+//     (SRL_PRED Pg, X, (splat 16)))
+// =>
+// REVH Pg, X, poison
+//
+// (OR (VSHL X, 16), (VLSHR X, 16))
+// =>
+// NVCAST (REV32 X)
+static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  unsigned EltSize = VT.getScalarSizeInBits();
+  if (EltSize != 16 && EltSize != 32 && EltSize != 64)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (VT.isScalableVector()) {
+    // Canonicalise so that N0 is the left shift.
+    if (N0.getOpcode() == AArch64ISD::SRL_PRED)
+      std::swap(N0, N1);
+    if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+        N1.getOpcode() != AArch64ISD::SRL_PRED)
+      return SDValue();
+
+    // Ensure we have common inputs.
+    if (N0.getOperand(0) != N1.getOperand(0) ||
+        N0.getOperand(1) != N1.getOperand(1) ||
+        N0.getOperand(2) != N1.getOperand(2))
+      return SDValue();
+
+    APInt ShAmt;
+    if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShAmt) ||
+        EltSize / 2 != ShAmt)
+      return SDValue();
+
+    unsigned RevOp;
+    if (EltSize == 16)
+      RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
+    else if (EltSize == 32)
+      RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
+    else /* EltSize == 64 */
+      RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
+
+    return DAG.getNode(RevOp, DL, VT, N0.getOperand(0), N0.getOperand(1),
+                       DAG.getPOISON(VT));
+  }
+
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type");
+
+  // Half rotations of i16 vectors should be combined to bswap, so we shouldn't
+  // need custom code for them here.
+  // Note: This doesn't apply to scalable vectors as we allow arbitrary (but
+  // matching) predicates in the shifts. Predicated rotations aren't matched to
+  // rotl / rotr, and subsequently aren't combined to bswap.
+  if (EltSize == 16)
+    return SDValue();
+
+  if (N0.getOpcode() == AArch64ISD::VLSHR)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != AArch64ISD::VSHL || N1.getOpcode() != AArch64ISD::VLSHR)
+    return SDValue();
+
+  // Ensure common inputs.
+  if (N0.getOperand(0) != N1.getOperand(0) ||
+      N0.getOperand(1) != N1.getOperand(1))
+    return SDValue();
+
+  if (EltSize / 2 != N0.getConstantOperandVal(1))
+    return SDValue();
+
+  EVT HalfVT;
+  unsigned RevOp;
+  if (EltSize == 32) {
+    RevOp = AArch64ISD::REV32;
+    HalfVT = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+  } else /* EltSize == 64 */ {
+    RevOp = AArch64ISD::REV64;
+    HalfVT = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
+  }
+
+  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+                     DAG.getNode(RevOp, DL, HalfVT, N0->getOperand(0)));
+}
+
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget,
                                 const AArch64TargetLowering &TLI) {
@@ -20779,6 +20876,9 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   if (SDValue R = performANDORCSELCombine(N, DAG))
     return R;
 
+  if (SDValue R = tryCombineToREV(N, DAG, DCI))
+    return R;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 8f6f4510d8388..e72745e551dbb 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -296,6 +296,73 @@ define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale
   ret <vscale x 2 x i64> %or
 }
 
+; Don't use XAR if REV[BHW] can be used.
+
+define <vscale x 8 x i16> @revb_nxv8i16(<vscale x 8 x i16> %r) {
+; CHECK-LABEL: revb_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %or = tail call <vscale x 8 x i16> @llvm.fshl(<vscale x 8 x i16> %r, <vscale x 8 x i16> %r, <vscale x 8 x i16> splat (i16 8))
+  ret <vscale x 8 x i16> %or
+}
+
+define <vscale x 4 x i32> @revh_nxv4i32(<vscale x 4 x i32> %r) {
+; CHECK-LABEL: revh_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    revh z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %or = tail call <vscale x 4 x i32> @llvm.fshl(<vscale x 4 x i32> %r, <vscale x 4 x i32> %r, <vscale x 4 x i32> splat (i32 16))
+  ret <vscale x 4 x i32> %or
+}
+
+define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) {
+; CHECK-LABEL: revw_nx2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    revw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %or = tail call <vscale x 2 x i64> @llvm.fshl(<vscale x 2 x i64> %r, <vscale x 2 x i64> %r, <vscale x 2 x i64> splat (i64 32))
+  ret <vscale x 2 x i64> %or
+}
+
+; As above, one test with rotate right.
+define <vscale x 2 x i64> @revw_nx2i64_r(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: revw_nx2i64_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    revw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 2 x i64> @llvm.fshr(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 32))
+  ret <vscale x 2 x i64> %r
+}
+
+; As above, one test with predicated shifts instead of rotate left.
+define <vscale x 4 x i32> @revh_nx4i32_shifts_l(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: revh_nx4i32_shifts_l:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revh z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %shl = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
+  %shr = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
+  %or = or <vscale x 4 x i32> %shl, %shr
+  ret <vscale x 4 x i32> %or
+}
+
+; As above, one test with predicated shifts instead of rotate right.
+define <vscale x 8 x i16> @revb_nx8i16_shifts_r(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: revb_nx8i16_shifts_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %shr = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
+  %shl = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
+  %or = or <vscale x 8 x i16> %shr, %shl
+  ret <vscale x 8 x i16> %or
+}
+
 declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 652617b58eaf3..4f940a39dfe39 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=CHECK,SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=CHECK,NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefixes=CHECK,SVE2 %s
 
 /* 128-bit vectors */
 
@@ -359,6 +359,96 @@ entry:
   ret <8 x i8> %or
 }
 
+; Don't use XAR if REV16/REV32/REV64 can be used.
+
+define <4 x i16> @rev16_v4i16(<4 x i16> %r) {
+; CHECK-LABEL: rev16_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %or = tail call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 8))
+  ret <4 x i16> %or
+}
+
+define <2 x i32> @rev32_v2i32(<2 x i32> %r) {
+; CHECK-LABEL: rev32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %or = tail call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 16))
+  ret <2 x i32> %or
+}
+
+define <1 x i64> @rev64_v1i64(<1 x i64> %r) {
+; CHECK-LABEL: rev64_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %or = tail call <1 x i64> @llvm.fshl(<1 x i64> %r, <1 x i64> %r, <1 x i64> splat (i64 32))
+  ret <1 x i64> %or
+}
+
+define <8 x i16> @rev16_v8i16(<8 x i16> %r) {
+; CHECK-LABEL: rev16_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %or = tail call <8 x i16> @llvm.fshl(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 8))
+  ret <8 x i16> %or
+}
+
+define <4 x i32> @rev32_v4i32(<4 x i32> %r) {
+; CHECK-LABEL: rev32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.8h, v0.8h
+; CHECK-NEXT:    ret
+  %or = tail call <4 x i32> @llvm.fshl(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 16))
+  ret <4 x i32> %or
+}
+
+define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
+; CHECK-LABEL: rev64_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %or = tail call <2 x i64> @llvm.fshl(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 32))
+  ret <2 x i64> %or
+}
+
+; As above, one test with rotate right.
+define <1 x i64> @rev64_v1i64_r(<1 x i64> %a) {
+; CHECK-LABEL: rev64_v1i64_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %r = tail call <1 x i64> @llvm.fshr(<1 x i64> %a, <1 x i64> %a, <1 x i64> splat (i64 32))
+  ret <1 x i64> %r
+}
+
+; As above, one test with individual shifts instead of rotate left.
+define <2 x i32> @rev32_v2i32_shifts_l(<2 x i32> %a) {
+; CHECK-LABEL: rev32_v2i32_shifts_l:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %shl = shl <2 x i32> %a, splat (i32 16)
+  %shr = lshr <2 x i32> %a, splat (i32 16)
+  %or = or <2 x i32> %shl, %shr
+  ret <2 x i32> %or
+}
+
+; As above, one test with individual shifts instead of rotate right.
+define <4 x i16> @rev16_v4i16_shifts_r(<4 x i16> %a) {
+; CHECK-LABEL: rev16_v4i16_shifts_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %shr = lshr <4 x i16> %a, splat (i16 8)
+  %shl = shl <4 x i16> %a, splat (i16 8)
+  %or = or <4 x i16> %shr, %shl
+  ret <4 x i16> %or
+}
+
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)