Skip to content

Commit 7362373

Browse files
committed
improve unsafe Decompression 0-8%
improve unsafe Decompression by 0-8% by replacing memcmp calls with a custom function
1 parent 4d36f98 commit 7362373

3 files changed

Lines changed: 171 additions & 1 deletion

File tree

src/block/decompress.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! The block decompression algorithm.
22
use crate::block::{DecompressError, MINMATCH};
3+
use crate::fastcpy_unsafe;
34
use crate::sink::SliceSink;
45
use crate::sink::{PtrSink, Sink};
56
use alloc::vec::Vec;
@@ -90,6 +91,7 @@ unsafe fn copy_from_dict(
9091
let dict_offset = ext_dict.len() + output_ptr.offset_from(output_base) as usize - offset;
9192
// Can't copy past ext_dict len, the match may cross dict and output
9293
let dict_match_length = match_length.min(ext_dict.len() - dict_offset);
94+
// TODO test fastcpy_unsafe
9395
core::ptr::copy_nonoverlapping(
9496
ext_dict.as_ptr().add(dict_offset),
9597
*output_ptr,
@@ -297,6 +299,7 @@ pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
297299
if offset >= match_length {
298300
unsafe {
299301
// _copy_, not copy_non_overlaping, as it may overlap.
302+
// Compiles to the same assembly on x68_64.
300303
core::ptr::copy(start_ptr, output_ptr, 18);
301304
output_ptr = output_ptr.add(match_length);
302305
}
@@ -337,7 +340,7 @@ pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
337340
}
338341
}
339342
unsafe {
340-
core::ptr::copy_nonoverlapping(input_ptr, output_ptr, literal_length);
343+
fastcpy_unsafe::slice_copy(input_ptr, output_ptr, literal_length);
341344
output_ptr = output_ptr.add(literal_length);
342345
input_ptr = input_ptr.add(literal_length);
343346
}

src/fastcpy_unsafe.rs

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
//! # FastCpy
2+
//!
3+
//! The Rust Compiler calls `memcpy` for slices of unknown length.
4+
//! This crate provides a faster implementation of `memcpy` for slices up to 32bytes (64bytes with `avx`).
5+
//! If you know most of you copy operations are not too big you can use `fastcpy` to speed up your program.
6+
//!
7+
//! `fastcpy` is designed to contain not too much assembly, so the overhead is low.
8+
//!
9+
//! As fall back the standard `memcpy` is called
10+
//!
11+
//! ## Double Copy Trick
12+
//! `fastcpy` employs a double copy trick to copy slices of length 4-32bytes (64bytes with `avx`).
13+
//! E.g. Slice of length 6 can be copied with two uncoditional copy operations.
14+
//!
15+
//! /// [1, 2, 3, 4, 5, 6]
16+
//! /// [1, 2, 3, 4]
17+
//! /// [3, 4, 5, 6]
18+
//!
19+
20+
#[inline]
21+
pub fn slice_copy(src: *const u8, dst: *mut u8, num_bytes: usize) {
22+
if num_bytes < 4 {
23+
short_copy(src, dst, num_bytes);
24+
return;
25+
}
26+
27+
if num_bytes < 8 {
28+
double_copy_trick::<4>(src, dst, num_bytes);
29+
return;
30+
}
31+
32+
if num_bytes <= 16 {
33+
double_copy_trick::<8>(src, dst, num_bytes);
34+
return;
35+
}
36+
37+
//if num_bytes <= 32 {
38+
//double_copy_trick::<16>(src, dst, num_bytes);
39+
//return;
40+
//}
41+
42+
// /// The code will use the vmovdqu instruction to copy 32 bytes at a time.
43+
//#[cfg(target_feature = "avx")]
44+
//{
45+
//if num_bytes <= 64 {
46+
//double_copy_trick::<32>(src, dst, num_bytes);
47+
//return;
48+
//}
49+
//}
50+
51+
// For larger sizes we use the default, which calls memcpy
52+
// memcpy does some virtual memory tricks to copy large chunks of memory.
53+
//
54+
// The theory should be that the checks above don't cost much relative to the copy call for
55+
// larger copies.
56+
// The bounds checks in `copy_from_slice` are elided.
57+
58+
//unsafe { core::ptr::copy_nonoverlapping(src, dst, num_bytes) }
59+
wild_copy_from_src::<16>(src, dst, num_bytes)
60+
}
61+
62+
// Inline never because otherwise we get a call to memcpy -.-
63+
#[inline]
64+
fn wild_copy_from_src<const SIZE: usize>(
65+
mut source: *const u8,
66+
mut dst: *mut u8,
67+
num_bytes: usize,
68+
) {
69+
// Note: if the compiler auto-vectorizes this it'll hurt performance!
70+
// It's not the case for 16 bytes stepsize, but for 8 bytes.
71+
let l_last = unsafe { source.add(num_bytes - SIZE) };
72+
let r_last = unsafe { dst.add(num_bytes - SIZE) };
73+
let num_bytes = (num_bytes / SIZE) * SIZE;
74+
75+
unsafe {
76+
let dst_ptr_end = dst.add(num_bytes);
77+
loop {
78+
core::ptr::copy_nonoverlapping(source, dst, SIZE);
79+
source = source.add(SIZE);
80+
dst = dst.add(SIZE);
81+
if dst >= dst_ptr_end {
82+
break;
83+
}
84+
}
85+
}
86+
87+
unsafe {
88+
core::ptr::copy_nonoverlapping(l_last, r_last, SIZE);
89+
}
90+
}
91+
92+
#[inline]
93+
fn short_copy(src: *const u8, dst: *mut u8, len: usize) {
94+
unsafe {
95+
*dst = *src;
96+
}
97+
if len >= 2 {
98+
double_copy_trick::<2>(src, dst, len);
99+
}
100+
}
101+
102+
#[inline(always)]
103+
/// [1, 2, 3, 4, 5, 6]
104+
/// [1, 2, 3, 4]
105+
/// [3, 4, 5, 6]
106+
fn double_copy_trick<const SIZE: usize>(src: *const u8, dst: *mut u8, len: usize) {
107+
let l_end = unsafe { src.add(len - SIZE) };
108+
let r_end = unsafe { dst.add(len - SIZE) };
109+
110+
unsafe {
111+
core::ptr::copy_nonoverlapping(src, dst, SIZE);
112+
core::ptr::copy_nonoverlapping(l_end, r_end, SIZE);
113+
}
114+
}
115+
116+
#[cfg(test)]
117+
mod tests {
118+
use super::slice_copy;
119+
use proptest::prelude::*;
120+
121+
proptest! {
122+
#[test]
123+
fn test_fast_short_slice_copy(left: Vec<u8>) {
124+
if left.is_empty() {
125+
return Ok(());
126+
}
127+
let mut right = vec![0u8; left.len()];
128+
slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
129+
prop_assert_eq!(&left, &right);
130+
}
131+
}
132+
133+
#[test]
134+
fn test_fast_short_slice_copy_edge_cases() {
135+
for len in 1..(512 * 2) {
136+
let left = (0..len).map(|i| i as u8).collect::<Vec<_>>();
137+
let mut right = vec![0u8; len];
138+
slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
139+
assert_eq!(left, right);
140+
}
141+
}
142+
143+
#[test]
144+
fn test_fail2() {
145+
let left = vec![
146+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
147+
24, 25, 26, 27, 28, 29, 30, 31, 32,
148+
];
149+
let mut right = vec![0u8; left.len()];
150+
slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
151+
assert_eq!(left, right);
152+
}
153+
154+
#[test]
155+
fn test_fail() {
156+
let left = vec![
157+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
158+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
159+
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160+
];
161+
let mut right = vec![0u8; left.len()];
162+
slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len());
163+
assert_eq!(left, right);
164+
}
165+
}

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ pub mod frame;
9191

9292
#[allow(dead_code)]
9393
mod fastcpy;
94+
#[allow(dead_code)]
95+
mod fastcpy_unsafe;
9496

9597
pub use block::{compress, compress_into, compress_prepend_size};
9698
pub use block::{decompress, decompress_into, decompress_size_prepended};

0 commit comments

Comments
 (0)