
Vectorize in-place comparison operators#33252

Closed
xuhdev wants to merge 1 commit into pytorch:master from xuhdev:vec/comparison

Conversation

@xuhdev
Collaborator

@xuhdev xuhdev commented Feb 12, 2020

Benchmark: (Debian 10, Release build, gcc 8.3, no turbo, Intel(R) Xeon(R) E-2136 CPU @ 3.30GHz)

```python
import timeit
for op in ('gt', 'lt', 'ge', 'le', 'eq', 'ne'):
    for dtype in ('torch.float', 'torch.double', 'torch.int16', 'torch.int32', 'torch.int64'):
        for n, t in [(10_000, 100000),
                     (100_000, 10000)]:
            print(f'a.{op}_(b), numel() == {n} for {t} times, dtype={dtype}')
            print(timeit.timeit(f'a.{op}_(b)', setup=f'import torch; a = torch.arange(1, {n}, dtype={dtype}); b = torch.arange({n}, 1, -1, dtype={dtype})', number=t))
```

Before:

```
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.778998922000028
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6359690249992127
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.double
1.0801493119997758
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.9360321379990637
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7341018620008981
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6345281440007966
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7396387640001194
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6429641230006382
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7759611700003006
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6672059659995284
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7724312530008319
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6392585769990546
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.7917451840003196
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.6455550159989798
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.739991647998977
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6572993859990675
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7627949479992822
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6476544910001394
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7965036850000615
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6780715599998075
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7653547080008138
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6383065829995758
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.7895260240002244
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.6508346030004759
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7409299750015634
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6383492870008922
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7620547579990671
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6474270239996258
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.8070051169997896
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6712598600006459
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7627660060006747
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6406353189995571
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.double
1.0826010620003217
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.9391552950000914
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7427801039993938
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6365172640016681
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7679271510005492
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6453389289999905
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.788032889000533
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6708840760002204
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.float
1.078837263999958
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.9397531720005645
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.double
1.1031508050000411
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.9412319389994082
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7509566959997755
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.638570957000411
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7592877549996047
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6458840529994632
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7984061539991671
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6776346309998189
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7724407899986545
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6581534130000364
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.8303323249983805
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.6954390920000151
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.745512373998281
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6360954970004968
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7569978400006221
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6450422030011396
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7889118379989668
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6693385389989999
```

After:

```
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2444220920006046
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2031730359994981
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.35491806199934217
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3905606850003096
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.16665379499863775
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10095906300011848
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21650469999985944
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18737469400002738
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35481256200000644
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.36696120199849247
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21976138800164335
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.20275393200063263
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3695997209997586
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39441510399956314
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15657078300137073
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.0992998069996247
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.20425128799979575
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20352934599941364
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35883567900054913
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39059587599876977
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21457727400047588
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.18836135499986995
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.35971907199927955
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3688875009993353
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1576009280015569
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.09524034199966991
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2064543649994448
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18726435600001423
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35351785300008487
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.3680737989998306
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2132134399998904
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2140274829998816
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.36539215199991304
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39128020300086064
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15712150600120367
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10149904400168452
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2103407699996751
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.2134442910009966
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35387034300038067
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.38917528399906587
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2190484450002259
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2030815980015177
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3710030169986567
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.36419657899932645
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15986497499943653
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10145393699895067
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21011781599918322
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20121852699958254
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.36681504499938455
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.364472848999867
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2290963309988001
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.21674784300012107
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3829616689999966
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39437660300063726
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1661020749997988
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10052955100036343
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21827425599985872
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.21522501399886096
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.37058242300008715
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39304063900090114
```

@dr-ci

dr-ci Bot commented Feb 13, 2020

💊 CircleCI build failures summary and remediations

As of commit af2fe24 (more details on the Dr. CI page):


  • 1/3 failures introduced in this PR

  • 2/3 broken upstream at merge base e4a883e since Mar 06

Please rebase on the viable/strict branch:

    If your commit is newer than viable/strict, you can try basing on an older, stable commit:

```
git fetch origin viable/strict
git rebase --onto viable/strict $(git merge-base origin/master HEAD)
```

    If your commit is older than viable/strict:

```
git fetch origin viable/strict
git rebase viable/strict
```

    Check out the recency history of this "viable master" tracking branch.


🕵️ 1 new failure recognized by patterns

The following build failures do not appear to be due to upstream breakages (reran 1 job to discount flakiness):

See CircleCI build caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test (1/1)

Step: "Test" (full log | pattern match details) <confirmed not flaky by 2 failures>

```
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_slice_index_exceeds_dim Fatal Python error: Segmentation fault
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns_opset11::test_error_on_data_parallel PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns_opset11::test_is_in_onnx_export PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns_opset11::test_strip_doc_string PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns_opset11::test_validate_dynamic_axes_invalid_input_output_name PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_concat PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_div PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_lstm PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_mul PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_reshape SKIPPED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_slice PASSED [ 99%] 
Mar 06 02:39:31 test/onnx/test_utility_funs.py::TestUtilityFuns::test_constant_fold_slice_index_exceeds_dim Fatal Python error: Segmentation fault 
Mar 06 02:39:31  
Mar 06 02:39:31 Current thread 0x00007f42c1e8d700 (most recent call first): 
Mar 06 02:39:31   File "/usr/local/lib/python3.6/dist-packages/_pytest/_io/saferepr.py", line 43 in repr_instance 
Mar 06 02:39:31   File "/usr/lib/python3.6/reprlib.py", line 65 in repr1 
Mar 06 02:39:31   File "/usr/lib/python3.6/reprlib.py", line 55 in repr 
Mar 06 02:39:31   File "/usr/local/lib/python3.6/dist-packages/_pytest/_io/saferepr.py", line 36 in repr 
Mar 06 02:39:31   File "/usr/local/lib/python3.6/dist-packages/_pytest/_io/saferepr.py", line 67 in saferepr 
Mar 06 02:39:31   File "/usr/local/lib/python3.6/dist-packages/_pytest/_code/code.py", line 655 in repr_args 
Mar 06 02:39:31   File "/usr/local/lib/python3.6/dist-packages/_pytest/_code/code.py", line 736 in repr_traceback_entry 
Mar 06 02:39:31   File "/usr/local/lib/python3.6/dist-packages/_pytest/_code/code.py", line 777 in repr_traceback
```

🚧 2 upstream failures recognized by patterns:

These were probably caused by upstream breakages:


This comment was automatically generated by Dr. CI. Follow this link to opt out of these comments for your Pull Requests.

Please report bugs/suggestions on the GitHub issue tracker.

This comment has been revised 75 times.

@xuhdev xuhdev force-pushed the vec/comparison branch 3 times, most recently from 8a21c1d to 5f5ba8d Compare February 18, 2020 21:49
@xuhdev
Collaborator Author

xuhdev commented Feb 19, 2020

This is currently failing because the vectorized versions of == (and others) fill in the results with 0xff (when true) and 0x00 (when false), while the unvectorized versions fill in 1 (when true) and 0 (when false):

```cpp
return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ);
```

The question is whether we should align these two behaviors. I lean towards adding another equality function to the vectorized version that is consistent with the serial version.
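To make the mismatch concrete, here is a minimal standalone sketch (illustrative only, not PyTorch code; it assumes an AVX-capable x86-64 target, compiled with e.g. `g++ -mavx demo.cpp`). A "true" lane out of `_mm256_cmp_pd` is the all-ones bit pattern, which reads back as NaN when reinterpreted as a double rather than the 1 the scalar path stores:

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  __m256d a = _mm256_setr_pd(1, 2, 3, 4);
  __m256d b = _mm256_setr_pd(1, 0, 3, 0);
  // Lanes 0 and 2 compare equal: those lanes become 0xFFFFFFFFFFFFFFFF,
  // the others 0x0000000000000000.
  __m256d mask = _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
  double lanes[4];
  _mm256_storeu_pd(lanes, mask);
  for (double d : lanes) {
    uint64_t bits;
    std::memcpy(&bits, &d, sizeof bits);
    std::printf("value=%g bits=0x%016llx\n", d, (unsigned long long)bits);
  }
  // "true" lanes print value=-nan bits=0xffffffffffffffff, whereas the
  // unvectorized kernel stores plain 1.0 (bit pattern 0x3ff0000000000000).
}
```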

@yf225 yf225 added the module: vectorization (Related to SIMD vectorization, e.g., Vec256) and triaged (This issue has been looked at by a team member, and triaged and prioritized into an appropriate module) labels Feb 19, 2020
@VitalyFedyunin
Contributor

Are you suggesting something instead of `_mm256_cmp_pd`?
As an option, maybe you can do a bitwise & with 0x01 and benchmark it.

@xuhdev
Collaborator Author

xuhdev commented Feb 20, 2020

@VitalyFedyunin Yes. What I meant is that we can do the bitwise AND either in the low-level vectorization functions or only in the comparison operators. Which one should we pursue? The former seems more elegant but could break backward compatibility.

@VitalyFedyunin
Contributor

I think we can't break BC, so let's do custom ops and comment inside the code why we made this decision.
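Sketched with a toy 4-lane wrapper (an illustration under assumptions — the hypothetical `Vec4d` stands in for the real `Vec256<double>`), the agreed design keeps `operator==` on the raw mask semantics for backward compatibility and adds a named `eq()` that ANDs the mask with a vector of ones, so the in-place kernels store 1/0 like the scalar path; the real definitions appear in the diff further down:

```cpp
#include <immintrin.h>
#include <cstdio>

// Toy stand-in for Vec256<double>.
struct Vec4d {
  __m256d v;
  // Backward-compatible mask semantics: all-one bits where equal.
  Vec4d operator==(const Vec4d& o) const { return {_mm256_cmp_pd(v, o.v, _CMP_EQ_OQ)}; }
  Vec4d operator&(const Vec4d& o) const { return {_mm256_and_pd(v, o.v)}; }
  static Vec4d ones() { return {_mm256_set1_pd(1.0)}; }
  // New named op: 1.0 where equal, 0.0 elsewhere, matching the scalar kernel.
  Vec4d eq(const Vec4d& o) const { return (*this == o) & ones(); }
};

int main() {
  Vec4d a{_mm256_setr_pd(1, 2, 3, 4)};
  Vec4d b{_mm256_setr_pd(1, 0, 3, 0)};
  double out[4];
  _mm256_storeu_pd(out, a.eq(b).v);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 1 0 1 0
}
```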

@xuhdev xuhdev force-pushed the vec/comparison branch 2 times, most recently from 3b04572 to 6e024b0 Compare February 21, 2020 22:32
@xuhdev xuhdev changed the title Vectorize in-place comparison operators [WIP] Vectorize in-place comparison operators Feb 21, 2020
@xuhdev xuhdev force-pushed the vec/comparison branch 9 times, most recently from e6430f7 to 814db46 Compare March 2, 2020 18:59
@xuhdev xuhdev force-pushed the vec/comparison branch 2 times, most recently from be71639 to 919a3fd Compare March 3, 2020 22:10
@xuhdev xuhdev changed the title [WIP] Vectorize in-place comparison operators Vectorize in-place comparison operators Mar 3, 2020
@VitalyFedyunin
Contributor

Code looks good. Can you please redo the benchmarks with the additional `&`?

@xuhdev
Collaborator Author

xuhdev commented Mar 4, 2020

> Code looks good. Can you please redo the benchmarks with the additional `&`?

What do you mean? Could you explain?

@VitalyFedyunin
Contributor

If I remember correctly, the first (benchmarked) implementation had no `& Vec256<double>::ones;`.

@xuhdev
Collaborator Author

xuhdev commented Mar 4, 2020

@VitalyFedyunin Here you go!

```diff
diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h
index 6b05b2848a6d..2b2f2cbbfdd0 100644
--- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h
+++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h
@@ -430,11 +430,11 @@ Vec256<std::complex<double>> inline operator^(const Vec256<std::complex<double>>
 const Vec256<std::complex<double>> Vec256<std::complex<double>>::ones(_mm256_set1_pd(1.0));
 
 Vec256<std::complex<double>> Vec256<std::complex<double>>::eq(const Vec256<std::complex<double>>& other) const {
-  return (*this == other) & Vec256<std::complex<double>>::ones;
+  return (*this == other) ;
 }
 
 Vec256<std::complex<double>> Vec256<std::complex<double>>::ne(const Vec256<std::complex<double>>& other) const {
-  return (*this != other) & Vec256<std::complex<double>>::ones;
+  return (*this != other) ;
 }
 
 #ifdef __AVX2__
diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h
index ddcf16ec9a39..e16250b8b784 100644
--- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h
+++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h
@@ -471,12 +471,12 @@ const Vec256<std::complex<float>> Vec256<std::complex<float>>::ones(_mm256_set1_
 
 Vec256<std::complex<float>> Vec256<std::complex<float>>::eq(
     const Vec256<std::complex<float>>& other) const {
-  return (*this == other) & Vec256<std::complex<float>>::ones;
+  return (*this == other) ;
 }
 
 Vec256<std::complex<float>> Vec256<std::complex<float>>::ne(
     const Vec256<std::complex<float>>& other) const {
-  return (*this != other) & Vec256<std::complex<float>>::ones;
+  return (*this != other) ;
 }
 
 #ifdef __AVX2__
diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h
index 97b9111b6977..9db1e19940f7 100644
--- a/aten/src/ATen/cpu/vec256/vec256_double.h
+++ b/aten/src/ATen/cpu/vec256/vec256_double.h
@@ -319,27 +319,27 @@ Vec256<double> inline operator^(const Vec256<double>& a, const Vec256<double>& b
 const Vec256<double> Vec256<double>::ones(1.0);
 
 Vec256<double> Vec256<double>::eq(const Vec256<double>& other) const {
-  return (*this == other) & Vec256<double>::ones;
+  return (*this == other) ;
 }
 
 Vec256<double> Vec256<double>::ne(const Vec256<double>& other) const {
-  return (*this != other) & Vec256<double>::ones;
+  return (*this != other) ;
 }
 
 Vec256<double> Vec256<double>::gt(const Vec256<double>& other) const {
-  return (*this > other) & Vec256<double>::ones;
+  return (*this > other) ;
 }
 
 Vec256<double> Vec256<double>::ge(const Vec256<double>& other) const {
-  return (*this >= other) & Vec256<double>::ones;
+  return (*this >= other) ;
 }
 
 Vec256<double> Vec256<double>::lt(const Vec256<double>& other) const {
-  return (*this < other) & Vec256<double>::ones;
+  return (*this < other) ;
 }
 
 Vec256<double> Vec256<double>::le(const Vec256<double>& other) const {
-  return (*this <= other) & Vec256<double>::ones;
+  return (*this <= other) ;
 }
 
 template <>
diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h
index f28404dcc692..6545ee663080 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float.h
@@ -326,27 +326,27 @@ Vec256<float> inline operator^(const Vec256<float>& a, const Vec256<float>& b) {
 const Vec256<float> Vec256<float>::ones(1.0f);
 
 Vec256<float> Vec256<float>::eq(const Vec256<float>& other) const {
-  return (*this == other) & Vec256<float>::ones;
+  return (*this == other) ;
 }
 
 Vec256<float> Vec256<float>::ne(const Vec256<float>& other) const {
-  return (*this != other) & Vec256<float>::ones;
+  return (*this != other) ;
 }
 
 Vec256<float> Vec256<float>::gt(const Vec256<float>& other) const {
-  return (*this > other) & Vec256<float>::ones;
+  return (*this > other) ;
 }
 
 Vec256<float> Vec256<float>::ge(const Vec256<float>& other) const {
-  return (*this >= other) & Vec256<float>::ones;
+  return (*this >= other) ;
 }
 
 Vec256<float> Vec256<float>::lt(const Vec256<float>& other) const {
-  return (*this < other) & Vec256<float>::ones;
+  return (*this < other) ;
 }
 
 Vec256<float> Vec256<float>::le(const Vec256<float>& other) const {
-  return (*this <= other) & Vec256<float>::ones;
+  return (*this <= other) ;
 }
 
 template <>
diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h
index 09e88ade413d..0a8d6f3a0e81 100644
--- a/aten/src/ATen/cpu/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec256/vec256_int.h
@@ -710,79 +710,79 @@ DEFINE_INTEGER_BINARY_OP(^, _mm256_xor_si256)
 const Vec256<int64_t> Vec256<int64_t>::ones(1);
 
 Vec256<int64_t> Vec256<int64_t>::eq(const Vec256<int64_t>& other) const {
-  return (*this == other) & Vec256<int64_t>::ones;
+  return (*this == other) ;
 }
 
 Vec256<int64_t> Vec256<int64_t>::ne(const Vec256<int64_t>& other) const {
-  return (*this != other) & Vec256<int64_t>::ones;
+  return (*this != other) ;
 }
 
 Vec256<int64_t> Vec256<int64_t>::gt(const Vec256<int64_t>& other) const {
-  return (*this > other) & Vec256<int64_t>::ones;
+  return (*this > other) ;
 }
 
 Vec256<int64_t> Vec256<int64_t>::ge(const Vec256<int64_t>& other) const {
-  return (*this >= other) & Vec256<int64_t>::ones;
+  return (*this >= other) ;
 }
 
 Vec256<int64_t> Vec256<int64_t>::lt(const Vec256<int64_t>& other) const {
-  return (*this < other) & Vec256<int64_t>::ones;
+  return (*this < other) ;
 }
 
 Vec256<int64_t> Vec256<int64_t>::le(const Vec256<int64_t>& other) const {
-  return (*this <= other) & Vec256<int64_t>::ones;
+  return (*this <= other) ;
 }
 
 const Vec256<int32_t> Vec256<int32_t>::ones(1);
 
 Vec256<int32_t> Vec256<int32_t>::eq(const Vec256<int32_t>& other) const {
-  return (*this == other) & Vec256<int32_t>::ones;
+  return (*this == other) ;
 }
 
 Vec256<int32_t> Vec256<int32_t>::ne(const Vec256<int32_t>& other) const {
-  return (*this != other) & Vec256<int32_t>::ones;
+  return (*this != other) ;
 }
 
 Vec256<int32_t> Vec256<int32_t>::gt(const Vec256<int32_t>& other) const {
-  return (*this > other) & Vec256<int32_t>::ones;
+  return (*this > other) ;
 }
 
 Vec256<int32_t> Vec256<int32_t>::ge(const Vec256<int32_t>& other) const {
-  return (*this >= other) & Vec256<int32_t>::ones;
+  return (*this >= other) ;
 }
 
 Vec256<int32_t> Vec256<int32_t>::lt(const Vec256<int32_t>& other) const {
-  return (*this < other) & Vec256<int32_t>::ones;
+  return (*this < other) ;
 }
 
 Vec256<int32_t> Vec256<int32_t>::le(const Vec256<int32_t>& other) const {
-  return (*this <= other) & Vec256<int32_t>::ones;
+  return (*this <= other) ;
 }
 
 const Vec256<int16_t> Vec256<int16_t>::ones(1);
 
 Vec256<int16_t> Vec256<int16_t>::eq(const Vec256<int16_t>& other) const {
-  return (*this == other) & Vec256<int16_t>::ones;
+  return (*this == other) ;
 }
 
 Vec256<int16_t> Vec256<int16_t>::ne(const Vec256<int16_t>& other) const {
-  return (*this != other) & Vec256<int16_t>::ones;
+  return (*this != other) ;
 }
 
 Vec256<int16_t> Vec256<int16_t>::gt(const Vec256<int16_t>& other) const {
-  return (*this > other) & Vec256<int16_t>::ones;
+  return (*this > other) ;
 }
 
 Vec256<int16_t> Vec256<int16_t>::ge(const Vec256<int16_t>& other) const {
-  return (*this >= other) & Vec256<int16_t>::ones;
+  return (*this >= other) ;
 }
 
 Vec256<int16_t> Vec256<int16_t>::lt(const Vec256<int16_t>& other) const {
-  return (*this < other) & Vec256<int16_t>::ones;
+  return (*this < other) ;
 }
 
 Vec256<int16_t> Vec256<int16_t>::le(const Vec256<int16_t>& other) const {
-  return (*this <= other) & Vec256<int16_t>::ones;
+  return (*this <= other) ;
 }
 
 #endif
```

Result:

```
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2229550569973071
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2011424100055592
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.34440684400033206
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.38504007900337456
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15573242899699835
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.09772641499876045
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2121838620005292
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18629408700508066
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.34447496099892305
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.36077459599619033
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2247381690031034
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2032312669980456
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3614415110059781
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39583342100377195
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15801144400029443
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10085486699972535
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21445474599750014
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20042651599942474
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.3482468610018259
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39153504199930467
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.22223757000028854
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.1886956989983446
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.36404590000165626
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.36251684599847067
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15952981900045415
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10026752799603855
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21399201000167523
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.19001240999932634
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.36203529799968237
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.3670815019941074
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2214342170045711
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.21155796599487076
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.362026146001881
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3906234059977578
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15747111800010316
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10151359900191892
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21104250400094315
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.2123121810000157
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.3563075529964408
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39280333199712913
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21559363300184486
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.19949310999800218
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3625585100016906
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.36049080299562775
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15516175899392692
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.09993253200082108
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.20718558200314874
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.19748599200102035
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.36137622900423594
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.35864436499832664
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21281901600013953
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.21216571300465148
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3591362709994428
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3906971070027794
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1549924770006328
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10312153099948773
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.20544030599558027
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.21353890600585146
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35717749699688284
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.3951241620015935
```

@VitalyFedyunin
Contributor

Super, it looks like this memory-bound operation takes no perf hit at all from the change.
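One way to sanity-check the memory-bound reading (a back-of-the-envelope sketch with an assumed traffic model, not a measurement from this PR): an in-place `a.gt_(b)` on float64 moves roughly 3 × numel × 8 bytes per call (read `a`, read `b`, write `a`). Plugging in the ~0.39 s "After" timing for `numel() == 100000` doubles over 10000 calls:

```cpp
#include <cstdio>

int main() {
  // Assumed traffic model for in-place a.gt_(b): read a, read b, write a.
  const double bytes_per_call = 3.0 * 100000 * 8;     // float64, numel == 100000
  const double total_bytes = bytes_per_call * 10000;  // 10000 timed calls
  const double seconds = 0.39;                        // "After" double timing above
  std::printf("%.0f GB/s effective\n", total_bytes / seconds / 1e9);  // ~62 GB/s
}
```

The two tensors (~1.6 MB total) fit in the last-level cache, so ~60 GB/s of effective traffic is consistent with a loop limited by cache bandwidth rather than by compute, and one extra AND per vector does not move that bound.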

Contributor

@facebook-github-bot facebook-github-bot left a comment

@VitalyFedyunin has imported this pull request. If you are a Facebook employee, you can view this diff on Phabricator.

@xuhdev
Collaborator Author

xuhdev commented Mar 6, 2020

@pytorchbot retest please

@xuhdev
Collaborator Author

xuhdev commented Mar 17, 2020

@VitalyFedyunin Any chance to get this merged? :)

@xuhdev
Collaborator Author

xuhdev commented Mar 18, 2020

@pytorchbot merge this please

@pytorchbot pytorchbot added the merge-this-please (Was marked for merge with @pytorchbot merge this please) label Mar 18, 2020
Contributor

@facebook-github-bot facebook-github-bot left a comment

@ezyang is landing this pull request. If you are a Facebook employee, you can view this diff on Phabricator.

@facebook-github-bot
Contributor

@ezyang merged this pull request in 91d39de.

@ezyang
Contributor

ezyang commented Mar 20, 2020

ruh oh, MSVC 2017 hates it:

```
[1622/3270] C:\Users\circleci\project\build\win_tmp\bin\sccache-cl.exe   /TP -DAT_PARALLEL_OPENMP=1 -DCPUINFO_SUPPORTED_PLATFORM=1 -DIDEEP_USE_MKL -DMAGMA_V2 -DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS -DONNX_ML=1 -DONNX_NAMESPACE=onnx_torch -DTH_BLAS_MKL -DWIN32_LEAN_AND_MEAN -D_CRT_SECURE_NO_DEPRECATE=1 -D_OPENMP_NOFORCE_MANIFEST -Dtorch_cpu_EXPORTS -Iaten\src -I..\aten\src -I. -I..\ -I..\cmake\..\third_party\benchmark\include -Icaffe2\contrib\aten -I..\third_party\onnx -Ithird_party\onnx -I..\third_party\foxi -Ithird_party\foxi -I..\caffe2\..\torch\csrc\api -I..\caffe2\..\torch\csrc\api\include -I..\caffe2\aten\src\TH -Icaffe2\aten\src\TH -I..\caffe2\..\torch\..\aten\src -Icaffe2\aten\src -Icaffe2\..\aten\src -Icaffe2\..\aten\src\ATen -I..\caffe2\..\torch\csrc -I..\caffe2\..\torch\..\third_party\miniz-2.0.8 -I..\aten\src\TH -I..\aten\..\third_party\catch\single_include -I..\aten\src\ATen\.. -Icaffe2\aten\src\ATen -I..\third_party\miniz-2.0.8 -I..\caffe2\core\nomnigraph\include -I..\c10\.. -Ithird_party\ideep\mkl-dnn\include -I..\third_party\ideep\mkl-dnn\src\..\include -I..\third_party\cpuinfo\include -I..\third_party\fbgemm\include -I..\third_party\fbgemm -I..\third_party\fbgemm\third_party\asmjit\src -I..\third_party\FP16\include -I..\cmake\..\third_party\googletest\googlemock\include -I..\cmake\..\third_party\googletest\googletest\include -I..\third_party\protobuf\src -Iwin_tmp\mkl\include -I..\third_party -I..\cmake\..\third_party\eigen -IC:\Jenkins\Miniconda3\include -IC:\Jenkins\Miniconda3\lib\site-packages\numpy\core\include -I..\cmake\..\third_party\pybind11\include -I\opt\rocm\hip\include -I\include -I..\cmake\..\third_party\cub -Iwin_tmp\magma\include -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -I..\third_party\ideep\mkl-dnn\include -I..\third_party\ideep\include -I..\caffe2 /DWIN32 /D_WINDOWS /GR  /w /EHa /bigobj -openmp -DNDEBUG -DUSE_FBGEMM -DHAVE_AVX_CPU_DEFINITION -DHAVE_AVX2_CPU_DEFINITION /MD /O2 /Ob2 /DNDEBUG /w /EHa /bigobj -DNDEBUG   -DCUDA_HAS_FP16=1 -DUSE_GCC_GET_CPUID -DUSE_AVX -DUSE_AVX2 -DTH_HAVE_THREAD /Z7 /EHa /DNOMINMAX /wd4267 /wd4251 /wd4522 /wd4838 /wd4305 /wd4244 /wd4190 /wd4101 /wd4996 /wd4275 /bigobj -O2 -DCAFFE2_BUILD_MAIN_LIB -DONNX_BUILD_MAIN_LIB -std:c++14 /fp:strict /arch:AVX /DCPU_CAPABILITY=AVX /DCPU_CAPABILITY_AVX /showIncludes /Focaffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp.obj /Fdcaffe2\CMakeFiles\torch_cpu.dir\ /FS -c aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp
FAILED: caffe2/CMakeFiles/torch_cpu.dir/__/aten/src/ATen/native/cpu/GridSamplerKernel.cpp.AVX.cpp.obj 
C:\Users\circleci\project\build\win_tmp\bin\sccache-cl.exe   /TP -DAT_PARALLEL_OPENMP=1 -DCPUINFO_SUPPORTED_PLATFORM=1 -DIDEEP_USE_MKL -DMAGMA_V2 -DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS -DONNX_ML=1 -DONNX_NAMESPACE=onnx_torch -DTH_BLAS_MKL -DWIN32_LEAN_AND_MEAN -D_CRT_SECURE_NO_DEPRECATE=1 -D_OPENMP_NOFORCE_MANIFEST -Dtorch_cpu_EXPORTS -Iaten\src -I..\aten\src -I. -I..\ -I..\cmake\..\third_party\benchmark\include -Icaffe2\contrib\aten -I..\third_party\onnx -Ithird_party\onnx -I..\third_party\foxi -Ithird_party\foxi -I..\caffe2\..\torch\csrc\api -I..\caffe2\..\torch\csrc\api\include -I..\caffe2\aten\src\TH -Icaffe2\aten\src\TH -I..\caffe2\..\torch\..\aten\src -Icaffe2\aten\src -Icaffe2\..\aten\src -Icaffe2\..\aten\src\ATen -I..\caffe2\..\torch\csrc -I..\caffe2\..\torch\..\third_party\miniz-2.0.8 -I..\aten\src\TH -I..\aten\..\third_party\catch\single_include -I..\aten\src\ATen\.. -Icaffe2\aten\src\ATen -I..\third_party\miniz-2.0.8 -I..\caffe2\core\nomnigraph\include -I..\c10\.. -Ithird_party\ideep\mkl-dnn\include -I..\third_party\ideep\mkl-dnn\src\..\include -I..\third_party\cpuinfo\include -I..\third_party\fbgemm\include -I..\third_party\fbgemm -I..\third_party\fbgemm\third_party\asmjit\src -I..\third_party\FP16\include -I..\cmake\..\third_party\googletest\googlemock\include -I..\cmake\..\third_party\googletest\googletest\include -I..\third_party\protobuf\src -Iwin_tmp\mkl\include -I..\third_party -I..\cmake\..\third_party\eigen -IC:\Jenkins\Miniconda3\include -IC:\Jenkins\Miniconda3\lib\site-packages\numpy\core\include -I..\cmake\..\third_party\pybind11\include -I\opt\rocm\hip\include -I\include -I..\cmake\..\third_party\cub -Iwin_tmp\magma\include -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -I..\third_party\ideep\mkl-dnn\include -I..\third_party\ideep\include -I..\caffe2 /DWIN32 /D_WINDOWS /GR  /w /EHa /bigobj -openmp -DNDEBUG -DUSE_FBGEMM -DHAVE_AVX_CPU_DEFINITION -DHAVE_AVX2_CPU_DEFINITION /MD /O2 /Ob2 /DNDEBUG /w /EHa /bigobj -DNDEBUG   -DCUDA_HAS_FP16=1 -DUSE_GCC_GET_CPUID -DUSE_AVX -DUSE_AVX2 -DTH_HAVE_THREAD /Z7 /EHa /DNOMINMAX /wd4267 /wd4251 /wd4522 /wd4838 /wd4305 /wd4244 /wd4190 /wd4101 /wd4996 /wd4275 /bigobj -O2 -DCAFFE2_BUILD_MAIN_LIB -DONNX_BUILD_MAIN_LIB -std:c++14 /fp:strict /arch:AVX /DCPU_CAPABILITY=AVX /DCPU_CAPABILITY_AVX /showIncludes /Focaffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp.obj /Fdcaffe2\CMakeFiles\torch_cpu.dir\ /FS -c aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(737): error C2672: 'convert_to_int_of_same_size': no matching overloaded function found
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(730): note: while compiling class template member function 'void at::native::`anonymous-namespace'::ApplyGridSample<scalar_t,2,at::native::detail::GridSamplerInterpolation::Nearest,at::native::detail::GridSamplerPadding::Reflection,false>::backward(at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,const at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,const at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,int64_t,const at::vec256::`anonymous-namespace'::Vec256<scalar_t> &,const at::vec256::`anonymous-namespace'::Vec256<scalar_t> &,int64_t) const'
        with
        [
            scalar_t=scalar_t
        ]
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(1004): note: see reference to function template instantiation 'void at::native::`anonymous-namespace'::ApplyGridSample<scalar_t,2,at::native::detail::GridSamplerInterpolation::Nearest,at::native::detail::GridSamplerPadding::Reflection,false>::backward(at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,const at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,const at::TensorAccessor<scalar_t,3,at::DefaultPtrTraits,int64_t> &,int64_t,const at::vec256::`anonymous-namespace'::Vec256<scalar_t> &,const at::vec256::`anonymous-namespace'::Vec256<scalar_t> &,int64_t) const' being compiled
        with
        [
            scalar_t=scalar_t
        ]
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(947): note: see reference to class template instantiation 'at::native::`anonymous-namespace'::ApplyGridSample<scalar_t,2,at::native::detail::GridSamplerInterpolation::Nearest,at::native::detail::GridSamplerPadding::Reflection,false>' being compiled
C:\Users\circleci\project\aten\src\ATen/cpu/vec256/vec256_qint.h(1277): note: see reference to class template instantiation 'at::vec256::`anonymous-namespace'::Vec256QuantizedConverter<c10::quint8,std::array<at::vec256::`anonymous-namespace'::Vec256<float>,4>,std::array<at::vec256::`anonymous-namespace'::Vec256<c10::qint32>,4>,32>' being compiled
C:\Users\circleci\project\aten\src\ATen/cpu/vec256/vec256_qint.h(1157): note: see reference to class template instantiation 'at::vec256::`anonymous-namespace'::Vec256QuantizedConverter<c10::qint8,std::array<at::vec256::`anonymous-namespace'::Vec256<float>,4>,std::array<at::vec256::`anonymous-namespace'::Vec256<c10::qint32>,4>,32>' being compiled
C:\Users\circleci\project\aten\src\ATen/cpu/vec256/vec256_qint.h(1036): note: see reference to class template instantiation 'at::vec256::`anonymous-namespace'::Vec256QuantizedConverter<c10::qint32,std::array<at::vec256::`anonymous-namespace'::Vec256<float>,1>,std::array<at::vec256::`anonymous-namespace'::Vec256<c10::qint32>,1>,8>' being compiled
C:\Users\circleci\project\c10/core/MemoryFormat.h(56): note: see reference to class template instantiation 'c10::ArrayRef<int64_t>' being compiled
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(737): error C2770: invalid explicit template argument(s) for 'at::vec256::`anonymous-namespace'::Vec256<int_of_size<sizeof(T)>::type> at::vec256::`anonymous-namespace'::convert_to_int_of_same_size(const at::vec256::`anonymous-namespace'::Vec256<T> &)'
C:\Users\circleci\project\aten\src\ATen/cpu/vec256/vec256_base.h(725): note: see declaration of 'at::vec256::`anonymous-namespace'::convert_to_int_of_same_size'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(741): error C3536: 'i_x_nearest': cannot be used before it is initialized
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(741): error C2678: binary '>': no operator found which takes a left-hand operand of type 'int' (or there is no acceptable conversion)
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: could be 'at::Tensor at::operator >(c10::Scalar,const at::Tensor &)'
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: or       'at::Tensor at::operator >(const at::Tensor &,c10::Scalar)'
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: or       'at::Tensor at::operator >(const at::Tensor &,const at::Tensor &)'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(741): note: while trying to match the argument list '(int, at::vec256::`anonymous-namespace'::Vec256<__int64>)'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(741): error C2678: binary '<': no operator found which takes a left-hand operand of type 'int' (or there is no acceptable conversion)
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: could be 'at::Tensor at::operator <(c10::Scalar,const at::Tensor &)'
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: or       'at::Tensor at::operator <(const at::Tensor &,c10::Scalar)'
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: or       'at::Tensor at::operator <(const at::Tensor &,const at::Tensor &)'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(741): note: while trying to match the argument list '(int, at::vec256::`anonymous-namespace'::Vec256<__int64>)'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(744): error C2678: binary '+': no operator found which takes a left-hand operand of type 'at::vec256::`anonymous-namespace'::Vec256<__int64>' (or there is no acceptable conversion)
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: could be 'at::Tensor at::operator +(c10::Scalar,const at::Tensor &)'
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: or       'at::Tensor at::operator +(const at::Tensor &,c10::Scalar)'
C:\Users\circleci\project\aten\src\ATen/TensorOperators.h(95): note: or       'at::Tensor at::operator +(const at::Tensor &,const at::Tensor &)'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(744): note: while trying to match the argument list '(at::vec256::`anonymous-namespace'::Vec256<__int64>, int)'
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(747): error C3536: 'i_mask': cannot be used before it is initialized
aten\src\ATen\native\cpu\GridSamplerKernel.cpp.AVX.cpp(749): error C3536: 'i_gInp_offset': cannot be used before it is initialized
Microsoft (R) C/C++ Optimizing Compiler Version 19.16.27038 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.
```

https://app.circleci.com/pipelines/github/pytorch/pytorch/143494/workflows/ab21e8aa-9100-4b96-bcab-ef9bc229bf2d/jobs/4893272

@xuhdev xuhdev deleted the vec/comparison branch March 20, 2020 20:13
@xuhdev
Collaborator Author

xuhdev commented Mar 20, 2020

@jamesr66a Seems the error is also related to your previous PR #25450. Could you explain a bit about what happened there?

@jamesr66a
Collaborator

@xuhdev I believe it's just a strange MSVC bug. I found a few threads like this: https://developercommunity.visualstudio.com/content/problem/151473/wrong-error-c2672-no-matching-overloaded-function.html

But it looks like my workaround of adding an explicit template argument is not actually durable. Not sure what the fix would be; it seems to be very much an action-at-a-distance phenomenon.
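For context, a reduced reconstruction of the pattern named in the C2672/C2770 errors above (my reading of the error text, not the actual PyTorch declaration): the return type of `convert_to_int_of_same_size` is computed from `sizeof(T)` through a trait. Conforming compilers accept code of this shape, which supports the compiler-bug reading:

```cpp
#include <cstddef>
#include <cstdint>

template <int N> struct int_of_size;
template <> struct int_of_size<4> { using type = int32_t; };
template <> struct int_of_size<8> { using type = int64_t; };

template <typename T>
struct Vec256 { T values[32 / sizeof(T)]; };

// The return type depends on sizeof(T): Vec256<double> -> Vec256<int64_t>.
template <typename T>
Vec256<typename int_of_size<sizeof(T)>::type>
convert_to_int_of_same_size(const Vec256<T>& src) {
  Vec256<typename int_of_size<sizeof(T)>::type> dst{};
  for (std::size_t i = 0; i < 32 / sizeof(T); ++i)
    dst.values[i] = static_cast<typename int_of_size<sizeof(T)>::type>(src.values[i]);
  return dst;
}

int main() {
  Vec256<double> v{};
  // The workaround mentioned above: spell the template argument explicitly.
  auto iv = convert_to_int_of_same_size<double>(v);
  (void)iv;
}
```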

@edelsohn edelsohn mentioned this pull request Aug 9, 2020
laurentdupin pushed a commit to laurentdupin/pytorch that referenced this pull request Apr 24, 2026
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2031730359994981
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.35491806199934217
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3905606850003096
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.16665379499863775
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10095906300011848
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21650469999985944
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18737469400002738
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35481256200000644
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.36696120199849247
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21976138800164335
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.20275393200063263
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3695997209997586
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39441510399956314
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15657078300137073
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.0992998069996247
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.20425128799979575
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20352934599941364
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35883567900054913
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39059587599876977
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21457727400047588
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.18836135499986995
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.35971907199927955
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3688875009993353
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1576009280015569
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.09524034199966991
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2064543649994448
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18726435600001423
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35351785300008487
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.3680737989998306
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2132134399998904
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2140274829998816
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.36539215199991304
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39128020300086064
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15712150600120367
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10149904400168452
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2103407699996751
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.2134442910009966
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35387034300038067
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.38917528399906587
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2190484450002259
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2030815980015177
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3710030169986567
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.36419657899932645
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15986497499943653
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10145393699895067
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21011781599918322
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20121852699958254
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.36681504499938455
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.364472848999867
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2290963309988001
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.21674784300012107
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3829616689999966
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39437660300063726
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1661020749997988
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10052955100036343
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21827425599985872
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.21522501399886096
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.37058242300008715
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39304063900090114
```
Pull Request resolved: pytorch#33252

Differential Revision: D20254663

Pulled By: ezyang

fbshipit-source-id: 68b7109ec4359434afbeb96df372e29608f501bb
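As a back-of-the-envelope reading of the commit summary above (not part of the PR): taking the gt_/torch.float, numel() == 10000 rows as a representative pair, the quoted totals imply roughly a 3x speedup.

```cpp
// Back-of-the-envelope speedup check using the totals quoted in the
// summary above (gt_, torch.float, numel() == 10000, 100000 calls per run).
#include <cstdio>

int main() {
  const double before_s = 0.778998922000028;   // total seconds, before
  const double after_s  = 0.2444220920006046;  // total seconds, after
  std::printf("per call before: %.2f us\n", before_s / 100000 * 1e6);  // ~7.79 us
  std::printf("per call after:  %.2f us\n", after_s / 100000 * 1e6);   // ~2.44 us
  std::printf("speedup: %.2fx\n", before_s / after_s);                 // ~3.19x
  return 0;
}
```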

Labels

merge-this-please: Was marked for merge with @pytorchbot merge this please
Merged
module: vectorization: Related to SIMD vectorization, e.g., Vec256
open source
triaged: This issue has been looked at by a team member, and triaged and prioritized into an appropriate module


8 participants