[CUDA] Abate `SoftMax.cu` compiler warning spam by eqy · Pull Request #128468 · pytorch/pytorch

eqy · 2024-06-12T00:00:34Z

Avoids excessively spammy warnings such as

pytorch/aten/src/ATen/native/cuda/SoftMax.cu(844): warning #191-D: type qualifier is meaningless on cast type
        [&] { const auto& the_type = input.scalar_type(); constexpr const char* at_dispatch_name = "host_softmax"; at::ScalarType _st = ::detail::scalar_type(the_type); ; switch (_st) { case at::ScalarType::Double: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Double)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Double), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Double>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::Float: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Float)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Float), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Float>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::Half: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Half)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Half), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Half>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::BFloat16: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::BFloat16)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::BFloat16), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::BFloat16>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } default: do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", ::c10::str('"', at_dispatch_name, "\" not implemented for '", toString(_st), "'")))); }; } while (false); } }()

and

SoftMax.cu:844: warning: comparison of integer expressions of different signedness: ‘int64_t’ {aka ‘long int’} and ‘long unsigned int’ [-Wsign-compare]

cc @ptrblck @msaroufim

pytorch-bot · 2024-06-12T00:00:37Z

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128468

📄 Preview Python docs built from this PR
📄 Preview C++ docs built from this PR
❓ Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours

Note: Links to docs will display an error until the docs builds have been completed.

✅ No Failures

As of commit 0e9196b with merge base 70a1e85 ():
💚 Looks good so far! There are no failures yet. 💚

This comment was automatically generated by Dr. CI and updates every 15 minutes.

valentinandrei

Sorry for the spam. Thanks for fixing it.

eqy · 2024-06-12T05:35:41Z

@pytorchmergebot merge

pytorchmergebot · 2024-06-12T05:37:29Z

Merge started

Your change will be merged once all checks pass (ETA 0-4 Hours).

Learn more about merging in the wiki.

Questions? Feedback? Please reach out to the PyTorch DevX Team

Advanced Debugging

Check the merge workflow status
here

Avoids excessively spammy warnings such as ``` pytorch/aten/src/ATen/native/cuda/SoftMax.cu(844): warning pytorch#191-D: type qualifier is meaningless on cast type [&] { const auto& the_type = input.scalar_type(); constexpr const char* at_dispatch_name = "host_softmax"; at::ScalarType _st = ::detail::scalar_type(the_type); ; switch (_st) { case at::ScalarType::Double: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Double)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Double), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Double>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::Float: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Float)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Float), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Float>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::Half: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Half)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Half), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Half>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::BFloat16: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::BFloat16)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::BFloat16), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::BFloat16>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } default: do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str('"', at_dispatch_name, "\" not implemented for '", toString(_st), "'")))); }; } while (false); } }() ``` and ``` SoftMax.cu:844: warning: comparison of integer expressions of different signedness: ‘int64_t’ {aka ‘long int’} and ‘long unsigned int’ [-Wsign-compare] ``` Pull Request resolved: pytorch#128468 Approved by: https://github.com/valentinandrei

check in

0e9196b

eqy added module: cuda Related to torch.cuda, and CUDA support in general module: build warnings Related to warnings during build process open source topic: not user facing topic category labels Jun 12, 2024

eqy requested a review from valentinandrei June 12, 2024 00:00

colesbury added the triaged This issue has been looked at a team member, and triaged and prioritized into an appropriate module label Jun 12, 2024

valentinandrei approved these changes Jun 12, 2024

View reviewed changes

pytorch-bot bot added the ciflow/trunk Trigger trunk jobs on your pull request label Jun 12, 2024

pytorchmergebot added the merging label Jun 12, 2024

pytorchmergebot added the Merged label Jun 12, 2024

pytorchmergebot closed this in 0b331fd Jun 12, 2024

pytorchmergebot removed the merging label Jun 12, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[CUDA] Abate `SoftMax.cu` compiler warning spam#128468

[CUDA] Abate `SoftMax.cu` compiler warning spam#128468
eqy wants to merge 1 commit intopytorch:mainfrom
eqy:softmaxwarning

eqy commented Jun 12, 2024 •

edited

Loading

Uh oh!

pytorch-bot bot commented Jun 12, 2024 •

edited

Loading

Uh oh!

valentinandrei left a comment

Uh oh!

eqy commented Jun 12, 2024

Uh oh!

pytorchmergebot commented Jun 12, 2024

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

Conversation

eqy commented Jun 12, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

pytorch-bot bot commented Jun 12, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128468

✅ No Failures

Uh oh!

valentinandrei left a comment

Choose a reason for hiding this comment

Uh oh!

eqy commented Jun 12, 2024

Uh oh!

pytorchmergebot commented Jun 12, 2024

Merge started

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

eqy commented Jun 12, 2024 •

edited

Loading

pytorch-bot bot commented Jun 12, 2024 •

edited

Loading