[FEAT] Improved PagedAttention FP8 (faster kvcache dequant v1)#346
[FEAT] Improved PagedAttention FP8 (faster kvcache dequant v1) #346 — tjtanaa wants to merge 2 commits into ROCm:llama_fp8_12062024 from
Conversation
hongxiayang
left a comment
There was a problem hiding this comment.
Thank you very much for the integration. Left some comments and suggestions about coding style.
| float old; | ||
| old = (value >= 0) | ||
| ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) | ||
| : __uint_as_float( | ||
| atomicMin((unsigned int*)addr, __float_as_uint(value))); | ||
|
|
||
| return old; |
There was a problem hiding this comment.
| float old; | |
| old = (value >= 0) | |
| ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) | |
| : __uint_as_float( | |
| atomicMin((unsigned int*)addr, __float_as_uint(value))); | |
| return old; | |
| return (value >= 0) | |
| ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) | |
| : __uint_as_float( | |
| atomicMin((unsigned int*)addr, __float_as_uint(value))); | |
| template <bool is_scale_inverted> | ||
| __device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, | ||
| float const scale) { | ||
| float x = 0.0f; |
There was a problem hiding this comment.
Kindly give the variable a meaningful name, such as scaledValue.
| if constexpr (is_scale_inverted) { | ||
| x = val * scale; | ||
| } else { | ||
| x = val / scale; |
| x = val / scale; | ||
| } | ||
|
|
||
| float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); |
There was a problem hiding this comment.
Consider naming `r` something more descriptive, such as `result`.
| const scalar_t* __restrict__ input, | ||
| int64_t num_elems) { | ||
| __shared__ float cache[1024]; | ||
| int64_t i = blockDim.x * blockIdx.x + threadIdx.x; |
There was a problem hiding this comment.
| int64_t i = blockDim.x * blockIdx.x + threadIdx.x; | |
| int64_t index = blockDim.x * blockIdx.x + threadIdx.x; |
| num_query_heads, num_kv_heads = num_heads | ||
| query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) | ||
| query.uniform_(-scale, scale) | ||
| #query = torch.ones_like(query) |
There was a problem hiding this comment.
| #query = torch.ones_like(query) |
| #print('>>> ref qkout shape',attn_weights.shape) | ||
| #print('>>> ref qkout',attn_weights) | ||
| #global REF_TENSOR | ||
| #REF_TENSOR = attn_weights |
There was a problem hiding this comment.
| #print('>>> ref qkout shape',attn_weights.shape) | |
| #print('>>> ref qkout',attn_weights) | |
| #global REF_TENSOR | |
| #REF_TENSOR = attn_weights |
| SEEDS = [0] | ||
| CUDA_DEVICES = [ | ||
| f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) | ||
| f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 1) |
There was a problem hiding this comment.
this seems to change the multi-gpu test to only single gpu test. are you sure you want to have this change committed?
| NUM_BLOCKS = 1024 * 1024 | ||
| PARTITION_SIZE = 512 | ||
| NUM_BLOCKS = 256 * 1024 | ||
| PARTITION_SIZE = 256 |
There was a problem hiding this comment.
Can you explain what is the reason changing the values of the two constants? and is this change ROCm specific?
|
|
||
| # Using default kv_scale | ||
| k_scale = v_scale = 1.0 | ||
| k_scale = v_scale = 0.1 |
There was a problem hiding this comment.
also, can you explain the default kv-scale change?
Description
This is a PR to merge https://github.com/ROCm/vllm/tree/shsanyal_devpa_308_opt optimized
`attention.cu` kernel into the `llama_fp8_12062024` branch.

CAVEAT
Currently the
`attention.cu` kernel does not support a block size of 32 or a head size of 64. The vLLM model unit tests are failing, as they use small models (e.g. Gemma, Llama) which have a head size of 64.

Performance
The following are `benchmark_throughput` results for Llama-3.1-70B with fp8 dynamic quantization and a kv-cache-dtype of fp8_e4m3, using a sequence input token length of 2048 and an output token length of 2048: