improve unsafe Decompression 0-8%

PSeitz · PSeitz · commit 7362373da147 · 2023-05-27T23:15:31.000+08:00
improve unsafe Decompression by 0-8% by replacing memcpy calls with
a custom function
diff --git a/src/block/decompress.rs b/src/block/decompress.rs
@@ -1,5 +1,6 @@
 //! The block decompression algorithm.
 use crate::block::{DecompressError, MINMATCH};
+use crate::fastcpy_unsafe;
 use crate::sink::SliceSink;
 use crate::sink::{PtrSink, Sink};
 use alloc::vec::Vec;
@@ -90,6 +91,7 @@ unsafe fn copy_from_dict(
     let dict_offset = ext_dict.len() + output_ptr.offset_from(output_base) as usize - offset;
     // Can't copy past ext_dict len, the match may cross dict and output
     let dict_match_length = match_length.min(ext_dict.len() - dict_offset);
+    // TODO test fastcpy_unsafe
     core::ptr::copy_nonoverlapping(
         ext_dict.as_ptr().add(dict_offset),
         *output_ptr,
@@ -297,6 +299,7 @@ pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
             if offset >= match_length {
                 unsafe {
                     // _copy_, not copy_non_overlaping, as it may overlap.
+                    // Compiles to the same assembly on x86_64.
                     core::ptr::copy(start_ptr, output_ptr, 18);
                     output_ptr = output_ptr.add(match_length);
                 }
@@ -337,7 +340,7 @@ pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
                 }
             }
             unsafe {
-                core::ptr::copy_nonoverlapping(input_ptr, output_ptr, literal_length);
+                fastcpy_unsafe::slice_copy(input_ptr, output_ptr, literal_length);
                 output_ptr = output_ptr.add(literal_length);
                 input_ptr = input_ptr.add(literal_length);
             }
diff --git a/src/fastcpy_unsafe.rs b/src/fastcpy_unsafe.rs
@@ -0,0 +1,165 @@
+//! # FastCpy
+//!
+//! The Rust compiler calls `memcpy` for slices of unknown length.
+//! This crate provides a faster implementation of `memcpy` for slices up to 32 bytes (64 bytes with `avx`).
+//! If you know most of your copy operations are not too big, you can use `fastcpy` to speed up your program.
+//!
+//! `fastcpy` is designed to contain not too much assembly, so the overhead is low.
+//!
+//! As a fallback for larger sizes, the copy is done in 16-byte steps instead of calling the standard `memcpy`.
+//!
+//! ## Double Copy Trick
+//! `fastcpy` employs a double copy trick to copy slices of length 4-32 bytes (64 bytes with `avx`).
+//! E.g. a slice of length 6 can be copied with two unconditional copy operations.
+//!
+//! /// [1, 2, 3, 4, 5, 6]
+//! /// [1, 2, 3, 4]
+//! ///       [3, 4, 5, 6]
+//!
+
+#[inline]
+pub fn slice_copy(src: *const u8, dst: *mut u8, num_bytes: usize) {
+    if num_bytes < 4 {
+        short_copy(src, dst, num_bytes);
+        return;
+    }
+
+    if num_bytes < 8 {
+        double_copy_trick::<4>(src, dst, num_bytes);
+        return;
+    }
+
+    if num_bytes <= 16 {
+        double_copy_trick::<8>(src, dst, num_bytes);
+        return;
+    }
+
+    // Lengths up to 16 are fully handled above: 0..=3 bytes by `short_copy`, 4..=7 and
+    // 8..=16 bytes by `double_copy_trick` with a 4- and 8-byte copy width respectively.
+    // NOTE(review): `short_copy` always copies the first byte, so `num_bytes == 0` still
+    // touches one byte of each buffer; the tests in this file skip the empty case.
+
+    // Disabled experiments, described here instead of kept as commented-out code:
+    //   * a further tier for `num_bytes <= 32` using `double_copy_trick::<16>`;
+    //   * under `#[cfg(target_feature = "avx")]`, a tier for `num_bytes <= 64` using
+    //     `double_copy_trick::<32>`, meant to copy 32 bytes at a time with the
+    //     `vmovdqu` instruction.
+    // Since neither tier is active, every length above 16 reaches the chunked copy at
+    // the end of this function.
+    // NOTE(review): not an `unsafe fn`, yet `src`/`dst` are dereferenced by the callees.
+
+    // Larger sizes go through `wild_copy_from_src::<16>`: 16-byte steps followed by one
+    // final 16-byte copy of the last 16 bytes.
+    //
+    // The theory should be that the checks above don't cost much relative to the copy
+    // itself for larger copies.
+    // The plain `copy_nonoverlapping` call (i.e. `memcpy`) is left below for reference.
+
+    //unsafe { core::ptr::copy_nonoverlapping(src, dst, num_bytes) }
+    wild_copy_from_src::<16>(src, dst, num_bytes)
+}
+
+// NOTE(review): said "Inline never" (to avoid a memcpy call), but the attribute is `#[inline]`.
+#[inline]
+fn wild_copy_from_src<const SIZE: usize>(
+    mut source: *const u8,
+    mut dst: *mut u8,
+    num_bytes: usize,
+) {
+    // Copies `num_bytes` bytes in `SIZE`-byte steps; requires `num_bytes >= SIZE`.
+    // Author's note: auto-vectorization hurts here; it happens with 8-byte steps, not 16.
+    let l_last = unsafe { source.add(num_bytes - SIZE) }; // final SIZE bytes of the source
+    let r_last = unsafe { dst.add(num_bytes - SIZE) }; // final SIZE bytes of the destination
+    let num_bytes = (num_bytes / SIZE) * SIZE; // rounded down to a multiple of SIZE
+    // The loop body runs at least once and stops once `dst` reaches the rounded-down end.
+    unsafe {
+        let dst_ptr_end = dst.add(num_bytes);
+        loop {
+            core::ptr::copy_nonoverlapping(source, dst, SIZE);
+            source = source.add(SIZE);
+            dst = dst.add(SIZE);
+            if dst >= dst_ptr_end {
+                break;
+            }
+        }
+    }
+    // Copy the final SIZE bytes; this may rewrite bytes the loop has already copied.
+    unsafe {
+        core::ptr::copy_nonoverlapping(l_last, r_last, SIZE);
+    }
+}
+
+#[inline]
+fn short_copy(src: *const u8, dst: *mut u8, len: usize) { // used by `slice_copy` for `len < 4`
+    unsafe {
+        *dst = *src; // unconditional: the first byte is copied whatever `len` is, even 0
+    }
+    if len >= 2 {
+        double_copy_trick::<2>(src, dst, len); // copies the first 2 and the last 2 bytes
+    }
+}
+
+#[inline(always)]
+/// Copies `len` bytes (`SIZE <= len <= 2 * SIZE`) as two `SIZE`-byte copies. For [1, 2, 3, 4, 5, 6]:
+///   first copy:  [1, 2, 3, 4]
+///   second copy:       [3, 4, 5, 6]
+fn double_copy_trick<const SIZE: usize>(src: *const u8, dst: *mut u8, len: usize) {
+    let l_end = unsafe { src.add(len - SIZE) }; // start of the last SIZE source bytes
+    let r_end = unsafe { dst.add(len - SIZE) }; // start of the last SIZE destination bytes
+    // Both copies are unconditional; for `len < 2 * SIZE` they overlap in the middle.
+    unsafe {
+        core::ptr::copy_nonoverlapping(src, dst, SIZE);
+        core::ptr::copy_nonoverlapping(l_end, r_end, SIZE);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::slice_copy;
+    use proptest::prelude::*;
+    // Property test over arbitrary byte vectors; empty input is skipped, not copied.
+    proptest! {
+        #[test]
+        fn test_fast_short_slice_copy(left: Vec<u8>) {
+            if left.is_empty() {
+                return Ok(());
+            }
+            let mut right = vec![0u8; left.len()];
+            slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
+            prop_assert_eq!(&left, &right);
+        }
+    }
+    // Deterministic sweep over every length from 1 to 1023.
+    #[test]
+    fn test_fast_short_slice_copy_edge_cases() {
+        for len in 1..(512 * 2) {
+            let left = (0..len).map(|i| i as u8).collect::<Vec<_>>();
+            let mut right = vec![0u8; len];
+            slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
+            assert_eq!(left, right);
+        }
+    }
+    // Fixed 33-byte input holding the distinct values 0..=32.
+    #[test]
+    fn test_fail2() {
+        let left = vec![
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31, 32,
+        ];
+        let mut right = vec![0u8; left.len()];
+        slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
+        assert_eq!(left, right);
+    }
+    // Fixed all-zero input except for a single 1 byte.
+    #[test]
+    fn test_fail() {
+        let left = vec![
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        ];
+        let mut right = vec![0u8; left.len()];
+        slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
+        assert_eq!(left, right);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -91,6 +91,8 @@ pub mod frame;
 
 #[allow(dead_code)]
 mod fastcpy;
+#[allow(dead_code)]
+mod fastcpy_unsafe;
 
 pub use block::{compress, compress_into, compress_prepend_size};
 pub use block::{decompress, decompress_into, decompress_size_prepended};