Hi, this issue follows a conversation with @vchuravy and @Keno, who advised I open an issue.
Here's my version info:
Julia Version 1.5.0-beta1.0
Commit 6443f6c95a (2020-05-28 17:42 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin19.4.0)
CPU: Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, skylake)
So, I've been trying to implement a type using NTuples, instead of ever wider primitive types.
I successfully made a function called shiftright that shifts all of the 2-bit encoded elements in a NTuple one position to the right, and it is as performant as the equivalent shifting of a UInt128:
using BioSequences, BenchmarkTools
# In BioSequences.jl kmers are called Mer, not Kmer so there is no name clash.
struct Kmer{A<:NucleicAcidAlphabet{2},K,N}
data::NTuple{N,UInt64}
end
# Shortcut
const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N}
@inline capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = div(64N, 2)
function (::Type{Kmer{A,K,N}})(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet{2},K,N}
seqlen = length(seq)
if seqlen != K
throw(ArgumentError("seq does not contain the correct number of nucleotides ($seqlen ≠ $K)"))
end
if seqlen > capacity(Kmer{A,K,N})
throw(ArgumentError("Cannot build a mer longer than $(capacity(Kmer{A,K,N}))bp long"))
end
# Construct the head.
bases_in_head = div(64 - (64N - 2K), 2)
head = zero(UInt64)
@inbounds for i in 1:bases_in_head
nt = convert(eltype(typeof(seq)), seq[i])
head = (head << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
end
# And the rest of the sequence
idx = Ref(bases_in_head + 1)
tail = ntuple(Val{N - 1}()) do i
Base.@_inline_meta
body = zero(UInt64)
@inbounds for i in 1:32
nt = convert(eltype(typeof(seq)), seq[idx[]])
body = (body << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
idx[] += 1
end
return body
end
return Kmer{A,K,N}((head, tail...))
end
# For old primitive type based kmer.
@inline shiftright(x::BigDNAMer{K}) where {K} = BigDNAMer{K}(reinterpret(UInt128, x) >> 2)
# For NTuple based kmer.
@inline function shiftright(x::Kmer{A,K,N}) where {A,K,N}
return _shiftright(zero(UInt64), x.data...)
end
@inline function _shiftright(carry::UInt64, head::UInt64, tail...)
return ((head >> 2) | carry, _shiftright((head & UInt64(3)) << 62, tail...)...)
end
@inline _shiftright(carry::UInt64) = ()
dnaseq = LongSequence{DNAAlphabet{2}}(randdnaseq(63))
m = DNAKmer{63, 2}(dnaseq)
oldm = BigDNAMer{63}(dnaseq)
@benchmark shiftright($m)
@benchmark shiftright($oldm)
julia> @benchmark shiftright(m)
BenchmarkTools.Trial:
memory estimate: 32 bytes
allocs estimate: 1
--------------
minimum time: 23.807 ns (0.00% GC)
median time: 24.983 ns (0.00% GC)
mean time: 27.075 ns (3.33% GC)
maximum time: 1.573 μs (98.16% GC)
--------------
samples: 10000
evals/sample: 994
julia> @benchmark shiftright(oldm)
BenchmarkTools.Trial:
memory estimate: 32 bytes
allocs estimate: 1
--------------
minimum time: 23.241 ns (0.00% GC)
median time: 23.809 ns (0.00% GC)
mean time: 26.048 ns (3.40% GC)
maximum time: 1.539 μs (96.76% GC)
--------------
samples: 10000
evals/sample: 994
julia> @code_llvm shiftright(m)
; @ REPL[31]:1 within `shiftright'
define void @julia_shiftright_1226([2 x i64]* noalias nocapture sret, [1 x [2 x i64]]* nocapture nonnull readonly dereferenceable(16)) {
top:
; @ REPL[31]:2 within `shiftright'
%2 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 0
%3 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 1
; ┌ @ REPL[32]:2 within `_shiftright'
; │┌ @ int.jl:461 within `>>' @ int.jl:455
%4 = load i64, i64* %2, align 8
%5 = lshr i64 %4, 2
; │└
; │┌ @ int.jl:463 within `<<' @ int.jl:456
%6 = shl i64 %4, 62
; │└
; │ @ REPL[32]:2 within `_shiftright' @ REPL[32]:2
; │┌ @ int.jl:461 within `>>' @ int.jl:455
%7 = load i64, i64* %3, align 8
%8 = lshr i64 %7, 2
; │└
; │┌ @ int.jl:331 within `|'
%9 = or i64 %8, %6
; └└
%.sroa.0.0..sroa_idx = getelementptr inbounds [2 x i64], [2 x i64]* %0, i64 0, i64 0
store i64 %5, i64* %.sroa.0.0..sroa_idx, align 8
%.sroa.2.0..sroa_idx1 = getelementptr inbounds [2 x i64], [2 x i64]* %0, i64 0, i64 1
store i64 %9, i64* %.sroa.2.0..sroa_idx1, align 8
ret void
}
julia> @code_llvm shiftright(oldm)
; @ REPL[30]:1 within `shiftright'
define void @julia_shiftright_1228(i128* noalias nocapture sret, i128) {
top:
; ┌ @ int.jl:461 within `>>' @ int.jl:455
%2 = lshr i128 %1, 2
; └
store i128 %2, i128* %0, align 8
ret void
}
So far so good, I'm liking the Ntuples.
But when it comes to implementing a shiftleft it's a different story.
First here's the shiftleft implementation:
# For old primitive type based kmer.
@inline shiftleft(x::BigDNAMer{K}) where {K} = BigDNAMer{K}(reinterpret(UInt128, x) << 2)
# For new NTuple based kmer.
@inline function shiftleft(x::Kmer{A,K,N}) where {A,K,N}
_, newbits = _shiftleft(x.data...)
return Kmer{A,K,N}(_cliphead(64N - 2K, newbits...))
end
@inline function _cliphead(by::Integer, head::UInt64, tail...)
return (head & (typemax(UInt64) >> by), tail...)
end
@inline function _shiftleft(head::UInt64, tail...)
carry, newtail = _shiftleft(tail...)
return head >> 62, ((head << 2) | carry, newtail...)
end
@inline _shiftleft(head::UInt64) = (head & 0xC000000000000000) >> 62, head << 2
Seems fine to me at first, but I see allocations on benchmarking and a large performance penalty:
julia> @benchmark shiftleft($m)
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 6
--------------
minimum time: 228.094 ns (0.00% GC)
median time: 230.368 ns (0.00% GC)
mean time: 241.913 ns (1.64% GC)
maximum time: 4.400 μs (94.66% GC)
--------------
samples: 10000
evals/sample: 438
julia> @benchmark shiftleft($oldm)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 0.039 ns (0.00% GC)
median time: 0.043 ns (0.00% GC)
mean time: 0.043 ns (0.00% GC)
maximum time: 0.051 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
Looking at the @code_typed and @code_warntype I don't see type instability:
julia> @code_typed shiftleft(m)
CodeInfo(
1 ─ %1 = Base.getfield(x, :data)::Tuple{UInt64,UInt64}
│ %2 = (getfield)(%1, 1)::UInt64
│ %3 = (getfield)(%1, 2)::UInt64
│ %4 = Base.and_int(%3, 0xc000000000000000)::UInt64
│ %5 = Base.sle_int(0, 62)::Bool
│ %6 = Base.bitcast(UInt64, 62)::UInt64
│ %7 = Base.lshr_int(%4, %6)::UInt64
│ %8 = Base.neg_int(62)::Int64
│ %9 = Base.bitcast(UInt64, %8)::UInt64
│ %10 = Base.shl_int(%4, %9)::UInt64
│ %11 = Base.ifelse(%5, %7, %10)::UInt64
│ %12 = Base.sle_int(0, 2)::Bool
│ %13 = Base.bitcast(UInt64, 2)::UInt64
│ %14 = Base.shl_int(%3, %13)::UInt64
│ %15 = Base.neg_int(2)::Int64
│ %16 = Base.bitcast(UInt64, %15)::UInt64
│ %17 = Base.lshr_int(%3, %16)::UInt64
│ %18 = Base.ifelse(%12, %14, %17)::UInt64
│ %19 = Base.sle_int(0, 2)::Bool
│ %20 = Base.bitcast(UInt64, 2)::UInt64
│ %21 = Base.shl_int(%2, %20)::UInt64
│ %22 = Base.neg_int(2)::Int64
│ %23 = Base.bitcast(UInt64, %22)::UInt64
│ %24 = Base.lshr_int(%2, %23)::UInt64
│ %25 = Base.ifelse(%19, %21, %24)::UInt64
│ %26 = Base.or_int(%25, %11)::UInt64
│ %27 = Core.tuple(%26)::Tuple{UInt64}
│ %28 = Core._apply_iterate(Base.iterate, Core.tuple, %27, %18)::Tuple{UInt64,UInt64}
│ %29 = (getfield)(%28, 1)::UInt64
│ %30 = (getfield)(%28, 2)::UInt64
│ %31 = Base.and_int(%29, 0x3fffffffffffffff)::UInt64
│ %32 = Core.tuple(%31, %30)::Tuple{UInt64,UInt64}
│ %33 = %new(Kmer{DNAAlphabet{2},63,2}, %32)::Kmer{DNAAlphabet{2},63,2}
└── return %33
) => Kmer{DNAAlphabet{2},63,2}
julia> @code_warntype shiftleft(m)
Variables
#self#::Core.Compiler.Const(shiftleft, false)
x::Kmer{DNAAlphabet{2},63,2}
@_3::Int64
newbits::Tuple{UInt64,UInt64}
Body::Kmer{DNAAlphabet{2},63,2}
1 ─ nothing
│ %2 = Base.getproperty(x, :data)::Tuple{UInt64,UInt64}
│ %3 = Core._apply_iterate(Base.iterate, Main._shiftleft, %2)::Tuple{UInt64,Tuple{UInt64,UInt64}}
│ %4 = Base.indexed_iterate(%3, 1)::Core.Compiler.PartialStruct(Tuple{UInt64,Int64}, Any[UInt64, Core.Compiler.Const(2, false)])
│ Core.getfield(%4, 1)
│ (@_3 = Core.getfield(%4, 2))
│ %7 = Base.indexed_iterate(%3, 2, @_3::Core.Compiler.Const(2, false))::Core.Compiler.PartialStruct(Tuple{Tuple{UInt64,UInt64},Int64}, Any[Tuple{UInt64,UInt64}, Core.Compiler.Const(3, false)])
│ (newbits = Core.getfield(%7, 1))
│ %9 = Core.apply_type(Main.Kmer, $(Expr(:static_parameter, 1)), $(Expr(:static_parameter, 2)), $(Expr(:static_parameter, 3)))::Core.Compiler.Const(Kmer{DNAAlphabet{2},63,2}, false)
│ %10 = (64 * $(Expr(:static_parameter, 3)))::Core.Compiler.Const(128, false)
│ %11 = (2 * $(Expr(:static_parameter, 2)))::Core.Compiler.Const(126, false)
│ %12 = (%10 - %11)::Core.Compiler.Const(2, false)
│ %13 = Core.tuple(%12)::Core.Compiler.Const((2,), false)
│ %14 = Core._apply_iterate(Base.iterate, Main._cliphead, %13, newbits)::Tuple{UInt64,UInt64}
│ %15 = (%9)(%14)::Kmer{DNAAlphabet{2},63,2}
└── return %15
But on showing the generated llvm to Valentin and Keno they concluded something wasn't right:
julia> @code_llvm shiftleft(m)
; @ REPL[47]:1 within `shiftleft'
define void @julia_shiftleft_1768([1 x [2 x i64]]* noalias nocapture sret, [1 x [2 x i64]]* nocapture nonnull readonly dereferenceable(16)) {
top:
%2 = alloca %jl_value_t*, i32 4
%gcframe = alloca %jl_value_t*, i32 4, align 16
%3 = bitcast %jl_value_t** %gcframe to i8*
call void @llvm.memset.p0i8.i32(i8* align 16 %3, i8 0, i32 32, i1 false)
%4 = call %jl_value_t*** inttoptr (i64 4333723376 to %jl_value_t*** ()*)() #4
; @ REPL[47]:2 within `shiftleft'
%5 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
%6 = bitcast %jl_value_t** %5 to i64*
store i64 8, i64* %6
%7 = getelementptr %jl_value_t**, %jl_value_t*** %4, i32 0
%8 = load %jl_value_t**, %jl_value_t*** %7
%9 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
%10 = bitcast %jl_value_t** %9 to %jl_value_t***
store %jl_value_t** %8, %jl_value_t*** %10
%11 = bitcast %jl_value_t*** %7 to %jl_value_t***
store %jl_value_t** %gcframe, %jl_value_t*** %11
%12 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 0
%13 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 1
; ┌ @ REPL[49]:2 within `_shiftleft' @ REPL[50]:1
; │┌ @ int.jl:308 within `&'
%14 = load i64, i64* %13, align 8
; │└
; │┌ @ int.jl:461 within `>>' @ int.jl:455
%15 = lshr i64 %14, 62
; │└
; │┌ @ int.jl:463 within `<<' @ int.jl:456
%16 = shl i64 %14, 2
; │└
; │ @ REPL[49]:3 within `_shiftleft'
; │┌ @ int.jl:463 within `<<' @ int.jl:456
%17 = load i64, i64* %12, align 8
%18 = shl i64 %17, 2
; │└
; │┌ @ int.jl:331 within `|'
%19 = or i64 %18, %15
; │└
%20 = bitcast %jl_value_t*** %4 to i8*
%21 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %20, i32 1400, i32 16) #1
%22 = bitcast %jl_value_t* %21 to %jl_value_t**
%23 = getelementptr %jl_value_t*, %jl_value_t** %22, i64 -1
store %jl_value_t* inttoptr (i64 4441405376 to %jl_value_t*), %jl_value_t** %23
%24 = bitcast %jl_value_t* %21 to i64*
store i64 %19, i64* %24, align 8
%25 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
store %jl_value_t* %21, %jl_value_t** %25
%26 = call %jl_value_t* @jl_box_uint64(i64 zeroext %16)
%27 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
store %jl_value_t* %26, %jl_value_t** %27
%28 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 0
store %jl_value_t* inttoptr (i64 4488660112 to %jl_value_t*), %jl_value_t** %28
%29 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 1
store %jl_value_t* inttoptr (i64 4438135904 to %jl_value_t*), %jl_value_t** %29
%30 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 2
store %jl_value_t* %21, %jl_value_t** %30
%31 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 3
store %jl_value_t* %26, %jl_value_t** %31
%32 = call nonnull %jl_value_t* @jl_f__apply_iterate(%jl_value_t* null, %jl_value_t** %2, i32 4)
; └
; @ REPL[47]:3 within `shiftleft'
%33 = bitcast %jl_value_t* %32 to i64*
%34 = bitcast %jl_value_t* %32 to [2 x i64]*
%35 = getelementptr inbounds [2 x i64], [2 x i64]* %34, i64 0, i64 1
; ┌ @ REPL[48]:2 within `_cliphead'
; │┌ @ int.jl:308 within `&'
%36 = load i64, i64* %33, align 8
%37 = and i64 %36, 4611686018427387903
; │└
%38 = load i64, i64* %35, align 8
; └
%.sroa.0.sroa.0.0..sroa.0.0..sroa_cast1.sroa_idx = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %0, i64 0, i64 0, i64 0
store i64 %37, i64* %.sroa.0.sroa.0.0..sroa.0.0..sroa_cast1.sroa_idx, align 8
%.sroa.0.sroa.2.0..sroa.0.0..sroa_cast1.sroa_idx5 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %0, i64 0, i64 0, i64 1
store i64 %38, i64* %.sroa.0.sroa.2.0..sroa.0.0..sroa_cast1.sroa_idx5, align 8
%39 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
%40 = load %jl_value_t*, %jl_value_t** %39
%41 = getelementptr %jl_value_t**, %jl_value_t*** %4, i32 0
%42 = bitcast %jl_value_t*** %41 to %jl_value_t**
store %jl_value_t* %40, %jl_value_t** %42
ret void
}
Namely "the apply_iterate that ruins your parade", but it should be optimised away: Keno concluded an issue needed to be opened, saying "Looks like splat of integer needs to be special cased".
Hi, this issue follows a conversation with @vchuravy and @Keno, who advised I open an issue.
Here's my version info:
So, I've been trying to implement a type using NTuples, instead of ever wider primitive types.
I successfully made a function called
shiftrightthat shifts all of the 2-bit encoded elements in a NTuple one position to the right, and it is as performant as the equivalent shifting of a UInt128:So far so good, I'm liking the Ntuples.
But when it comes to implementing a
shiftleftit's a different story.First here's the
shiftleftimplementation:Seems fine to me at first, but I see allocations on benchmarking and a large performance penalty:
Looking at the @code_typed and @code_warntype I don't see type instability:
But on showing the generated llvm to Valentin and Keno they concluded something wasn't right:
Namely "the apply_iterate that ruins your parade", but it should be optimised away: Keno concluded an issue needed to be opened, saying "Looks like splat of integer needs to be special cased".