Skip to content

[SIMD] wasm_v8x16_shuffle fails to generate i8x16.shuffle with the same indices #9340

@huningxin

Description

@huningxin

Test case

v_load_deinterleave is an OpenCV.js universal intrinsic that is used by popular image processing kernels, for example the color conversion cvtColor.

Inspired by its SSE2 implementation, the proposed WASM SIMD implementation looks like this:

// Interleaves the low eight bytes of `a` and `b`:
//   result = { a[0], b[0], a[1], b[1], ..., a[7], b[7] }
// In the shuffle index list, 0-15 select bytes of `a` and 16-31 select
// bytes of `b`.
inline v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
    // expect punpcklbw (this index pattern is meant to match V8's
    // kX64S8x16UnpackLow shuffle pattern on x64)
    return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
}

// Concatenates the high 64-bit halves of `a` and `b`:
//   result = { a[8..15], b[8..15] }
// In the shuffle index list, 0-15 select bytes of `a` and 16-31 select
// bytes of `b`. When called as wasm_unpackhi_i64x2(x, x), the high half of
// `x` ends up in both halves of the result.
inline v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
    // expect punpckhqdq (this index pattern is meant to match V8's
    // kX64S64x2UnpackHigh shuffle pattern on x64)
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}

// Loads 48 consecutive bytes starting at `ptr` and redistributes them into
// the three output vectors `a`, `b` and `c` through four identical rounds of
// byte-unpack shuffles.
//
// NOTE(review): presumably this splits 16 interleaved 3-channel elements
// (c0 c1 c2 c0 c1 c2 ...) into one vector per channel, mirroring the SSE2
// version it was modelled on; confirm against the OpenCV implementation.
//
// ptr     - source bytes; bytes [ptr, ptr + 48) are read.
// a, b, c - receive the three result vectors.
//
// noinline: presumably keeps this function as a separate symbol so its
// generated wasm can be inspected in isolation; confirm.
__attribute__((noinline))
inline void v_load_deinterleave(const unsigned char* ptr, __u8x16& a, __u8x16& b, __u8x16& c)
{
    // Three 16-byte loads covering the 48 input bytes.
    v128_t t00 = wasm_v128_load(ptr);
    v128_t t01 = wasm_v128_load(ptr + 16);
    v128_t t02 = wasm_v128_load(ptr + 32);

    // Round 1. Every round combines the previous three vectors (x0, x1, x2)
    // the same way:
    //   y0 = unpacklo(x0,     hi(x1))
    //   y1 = unpacklo(hi(x0), x2)
    //   y2 = unpacklo(x1,     hi(x2))
    // where hi(x) = wasm_unpackhi_i64x2(x, x) moves the high 8 bytes of x
    // into the low half so that unpacklo can consume them.
    v128_t t10 = wasm_unpacklo_i8x16(t00, wasm_unpackhi_i64x2(t01, t01));
    v128_t t11 = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t00, t00), t02);
    v128_t t12 = wasm_unpacklo_i8x16(t01, wasm_unpackhi_i64x2(t02, t02));

    // Round 2: same pattern applied to t10..t12.
    v128_t t20 = wasm_unpacklo_i8x16(t10, wasm_unpackhi_i64x2(t11, t11));
    v128_t t21 = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t10, t10), t12);
    v128_t t22 = wasm_unpacklo_i8x16(t11, wasm_unpackhi_i64x2(t12, t12));

    // Round 3: same pattern applied to t20..t22.
    v128_t t30 = wasm_unpacklo_i8x16(t20, wasm_unpackhi_i64x2(t21, t21));
    v128_t t31 = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t20, t20), t22);
    v128_t t32 = wasm_unpacklo_i8x16(t21, wasm_unpackhi_i64x2(t22, t22));

    // Round 4: same pattern applied to t30..t32, written straight to the
    // outputs.
    a = wasm_unpacklo_i8x16(t30, wasm_unpackhi_i64x2(t31, t31));
    b = wasm_unpacklo_i8x16(wasm_unpackhi_i64x2(t30, t30), t32);
    c = wasm_unpacklo_i8x16(t31, wasm_unpackhi_i64x2(t32, t32));
}

Expected result

In particular, we expect wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23) to generate i8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23, which matches the kX64S8x16UnpackLow pattern in V8 and produces the punpcklbw instruction. Similarly, we expect wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31) to generate i8x16.shuffle 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31, which matches the kX64S64x2UnpackHigh pattern in V8 and produces the punpckhqdq instruction.

Actual result

However, it turns out that emscripten compiles the above code to different v8x16.shuffle wasm ops, which fail to match these patterns and lead V8 to generate slow pshufb instructions with memory operands.

The wasm-dis output is

(func $v_load_deinterleave\28unsigned\20char\20const*\2c\20unsigned\20char\20vector\5b16\5d&\2c\20unsigned\20char\20vector\5b16\5d&\2c\20unsigned\20char\20vector\5b16\5d&\29 (; 10 ;) (type $7) (param $0 i32) (param $1 i32) (param $2 i32) (param $3 i32)
  (local $4 v128)
  (local $5 v128)
  (local $6 v128)
  (local $7 v128)
  (local $8 v128)
  (v128.store
   (local.get $1)
   (v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
    (local.tee $7
     (v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
      (local.tee $8
       (v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
        (local.tee $6
         (v8x16.shuffle 0 24 1 25 2 26 3 27 4 28 5 29 6 30 7 31
          (local.tee $4
           (v128.load align=1
            (local.get $0)
           )
          )
          (local.tee $5
           (v128.load offset=16 align=1
            (local.get $0)
           )
          )
         )
        )
        (v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
         (local.tee $4
          (v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
           (local.get $4)
           (local.tee $7
            (v128.load offset=32 align=1
             (local.get $0)
            )
           )
          )
         )
         (local.get $4)
        )
       )
      )
      (v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
       (local.tee $6
        (v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
         (local.get $6)
         (local.tee $5
          (v8x16.shuffle 0 24 1 25 2 26 3 27 4 28 5 29 6 30 7 31
           (local.get $5)
           (local.get $7)
          )
         )
        )
       )
       (local.get $4)
      )
     )
    )
    (v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
     (local.tee $5
      (v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
       (local.get $8)
       (local.tee $4
        (v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
         (local.get $4)
         (v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
          (local.get $5)
          (local.get $4)
         )
        )
       )
      )
     )
     (local.get $4)
    )
   )
  )
  (v128.store
   (local.get $2)
   (v8x16.shuffle 8 16 9 17 10 18 11 19 12 20 13 21 14 22 15 23
    (local.get $7)
    (local.tee $4
     (v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
      (local.get $6)
      (v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
       (local.get $4)
       (local.get $4)
      )
     )
    )
   )
  )
  (v128.store
   (local.get $3)
   (v8x16.shuffle 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
    (local.get $5)
    (v8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
     (local.get $4)
     (local.get $4)
    )
   )
  )
 )

In particular, the following three shuffles lead V8 to generate slow pshufb instructions with memory operands: v8x16.shuffle 8,9,10,11,12,13,14,15,0,0,0,0,0,0,0,0; v8x16.shuffle 8,16,9,17,10,18,11,19,12,20,13,21,14,22,15,23; and v8x16.shuffle 0,24,1,25,2,26,3,27,4,28,5,29,6,30,7,31.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions