-
Notifications
You must be signed in to change notification settings - Fork 3.5k
[SIMD] wasm_v8x16_shuffle fails to generate i8x16.shuffle with same indices #9340
Description
Test case
v_load_deinterleave is an OpenCV.js universal intrinsic that is used by popular image processing kernels, for example the color conversion cvtColor.
Inspired by its SSE2 implementation, the proposed WASM SIMD implementation looks like this:
// Byte-wise "unpack low" helper: a single shuffle of a and b with the index
// list 0,16,1,17,...,7,23.
// Assuming the usual shuffle convention (indices 0..15 address bytes of a,
// 16..31 address bytes of b), the result is a[0],b[0],a[1],b[1],...,a[7],b[7].
inline v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
// expect punpcklbw (the x64 instruction this shuffle is hoped to lower to)
return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
}
// 64-bit "unpack high" helper: a single shuffle of a and b with the index
// list 8..15 followed by 24..31.
// Assuming the usual shuffle convention (indices 0..15 address bytes of a,
// 16..31 address bytes of b), the result is the upper 8 bytes of a followed
// by the upper 8 bytes of b.
inline v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
// expect punpckhqdq (the x64 instruction this shuffle is hoped to lower to)
return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}
// Loads three vectors from ptr, ptr + 16 and ptr + 32, runs them through four
// rounds of unpack-low / unpack-high shuffles, and writes the three results
// through the reference parameters a, b and c.
// NOTE(review): going by the name, this separates 3-channel interleaved bytes
// (e.g. pixel data) into one vector per channel - confirm against the OpenCV
// SSE2 version this was modelled on.
// NOTE(review): noinline presumably keeps this function as its own symbol so
// the generated code can be inspected in isolation - confirm intent.
__attribute__((noinline))
inline void v_load_deinterleave(const unsigned char* ptr, __u8x16& a, __u8x16& b, __u8x16& c)
{
// Raw input: three consecutive loads, 16 bytes apart.
v128_t t00 = wasm_v128_load(ptr);
v128_t t01 = wasm_v128_load(ptr + 16);
v128_t t02 = wasm_v128_load(ptr + 32);
// Round 1: each step pairs one vector (or its upper half) with another.
// Calling wasm_unpackhi_i64x2(x, x) with the same operand twice is how the
// upper half of x is brought into the position that unpacklo reads from.
v128_t t10 = wasm_unpacklo_i8x16(t00, wasm_unpackhi_i64x2(t01, t01));
v128_t t11 = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t00, t00), t02);
v128_t t12 = wasm_unpacklo_i8x16(t01, wasm_unpackhi_i64x2(t02, t02));
// Round 2: same three-step pattern applied to the round-1 results.
v128_t t20 = wasm_unpacklo_i8x16(t10, wasm_unpackhi_i64x2(t11, t11));
v128_t t21 = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t10, t10), t12);
v128_t t22 = wasm_unpacklo_i8x16(t11, wasm_unpackhi_i64x2(t12, t12));
// Round 3: same pattern applied to the round-2 results.
v128_t t30 = wasm_unpacklo_i8x16(t20, wasm_unpackhi_i64x2(t21, t21));
v128_t t31 = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t20, t20), t22);
v128_t t32 = wasm_unpacklo_i8x16(t21, wasm_unpackhi_i64x2(t22, t22));
// Round 4: same pattern once more; the results are the three outputs.
a = wasm_unpacklo_i8x16(t30, wasm_unpackhi_i64x2(t31, t31));
b = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t30, t30), t32);
c = wasm_unpacklo_i8x16(t31, wasm_unpackhi_i64x2(t32, t32));
}Expected result
In particular, we expect wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23) to generate i8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23, which matches the kX64S8x16UnpackLow pattern in V8 and makes it emit the punpcklbw instruction. Similarly, we expect wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31) to generate i8x16.shuffle 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31, which matches the kX64S64x2UnpackHigh pattern in V8 and makes it emit the punpckhqdq instruction.
Actual result
However, it turns out that emscripten compiles the above code to different v8x16.shuffle wasm ops, which fail to match these patterns and lead V8 to generate slow pshufb instructions with memory operands.
The wasm-dis output is
;; wasm-dis output for v_load_deinterleave.
;; Params $0..$3 are presumably ptr, a, b, c in the order of the C++ signature
;; encoded in the mangled name; $4..$8 are v128 temporaries.
(func $v_load_deinterleave\28unsigned\20char\20const*\2c\20unsigned\20char\20vector\5b16\5d&\2c\20unsigned\20char\20vector\5b16\5d&\2c\20unsigned\20char\20vector\5b16\5d&\29 (; 10 ;) (type $7) (param $0 i32) (param $1 i32) (param $2 i32) (param $3 i32)
(local $4 v128)
(local $5 v128)
(local $6 v128)
(local $7 v128)
(local $8 v128)
;; First store: writes a v128 to the address in $1.
(v128.store
(local.get $1)
(v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
(local.tee $7
(v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
(local.tee $8
(v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
(local.tee $6
(v8x16.shuffle 0 24 1 25 2 26 3 27 4 28 5 29 6 30 7 31
(local.tee $4
(v128.load align=1
(local.get $0)
)
)
(local.tee $5
(v128.load offset=16 align=1
(local.get $0)
)
)
)
)
(v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
(local.tee $4
(v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
(local.get $4)
(local.tee $7
(v128.load offset=32 align=1
(local.get $0)
)
)
)
)
(local.get $4)
)
)
)
(v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
(local.tee $6
(v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
(local.get $6)
(local.tee $5
(v8x16.shuffle 0 24 1 25 2 26 3 27 4 28 5 29 6 30 7 31
(local.get $5)
(local.get $7)
)
)
)
)
(local.get $4)
)
)
)
(v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
(local.tee $5
(v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
(local.get $8)
(local.tee $4
(v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
(local.get $4)
(v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
(local.get $5)
(local.get $4)
)
)
)
)
)
(local.get $4)
)
)
)
;; Second store: writes a v128 to the address in $2.
(v128.store
(local.get $2)
(v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
(local.get $7)
(local.tee $4
(v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
(local.get $6)
(v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
(local.get $4)
(local.get $4)
)
)
)
)
)
;; Third store: writes a v128 to the address in $3.
(v128.store
(local.get $3)
(v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
(local.get $5)
(v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
(local.get $4)
(local.get $4)
)
)
)
)
In particular, the following three shuffles lead V8 to generate slow pshufb instructions with memory operands: v8x16.shuffle 8,9,10,11,12,13,14,15,0,0,0,0,0,0,0,0; v8x16.shuffle 8,16,9,17,10,18,11,19,12,20,13,21,14,22,15,23; and v8x16.shuffle 0,24,1,25,2,26,3,27,4,28,5,29,6,30,7,31.