Skip to content

Splat of Integer needs to be special cased #36087

@TransGirlCodes

Description

@TransGirlCodes

Hi, this issue follows a conversation with @vchuravy and @Keno, who advised I open an issue.

Here's my version info:

Julia Version 1.5.0-beta1.0
Commit 6443f6c95a (2020-05-28 17:42 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin19.4.0)
  CPU: Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, skylake)

So, I've been trying to implement a type using NTuples, instead of ever wider primitive types.
I successfully made a function called shiftright that shifts all of the 2-bit encoded elements in a NTuple one position to the right, and it is as performant as the equivalent shifting of a UInt128:

using BioSequences, BenchmarkTools
# In BioSequences.jl kmers are called Mer, not Kmer so there is no name clash.
struct Kmer{A<:NucleicAcidAlphabet{2},K,N}
    data::NTuple{N,UInt64}
end
# Shortcut
const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N}

@inline capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = div(64N, 2)

function (::Type{Kmer{A,K,N}})(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet{2},K,N}
    seqlen = length(seq)
    if seqlen != K
        throw(ArgumentError("seq does not contain the correct number of nucleotides ($seqlen$K)"))
    end
    if seqlen > capacity(Kmer{A,K,N})
        throw(ArgumentError("Cannot build a mer longer than $(capacity(Kmer{A,K,N}))bp long"))
    end

    # Construct the head.
    bases_in_head = div(64 - (64N - 2K), 2)
    head = zero(UInt64)
    @inbounds for i in 1:bases_in_head
        nt = convert(eltype(typeof(seq)), seq[i])
        head = (head << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
    end
    
    # And the rest of the sequence
    idx = Ref(bases_in_head + 1)
    
    tail = ntuple(Val{N - 1}()) do i
        Base.@_inline_meta
        body = zero(UInt64)
        @inbounds for i in 1:32
            nt = convert(eltype(typeof(seq)), seq[idx[]])
            body = (body << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
            idx[] += 1
        end
        return body
    end

    return Kmer{A,K,N}((head, tail...))
end

# For old primitive type based kmer.
@inline shiftright(x::BigDNAMer{K}) where {K} = BigDNAMer{K}(reinterpret(UInt128, x) >> 2)

# For NTuple based kmer.
@inline function shiftright(x::Kmer{A,K,N}) where {A,K,N}
    return _shiftright(zero(UInt64), x.data...)
end
@inline function _shiftright(carry::UInt64, head::UInt64, tail...)
    return ((head >> 2) | carry, _shiftright((head & UInt64(3)) << 62, tail...)...)
end
@inline _shiftright(carry::UInt64) = ()
dnaseq = LongSequence{DNAAlphabet{2}}(randdnaseq(63))
m = DNAKmer{63, 2}(dnaseq)
oldm = BigDNAMer{63}(dnaseq)
@benchmark shiftright($m)
@benchmark shiftright($oldm)
julia> @benchmark shiftright(m)
BenchmarkTools.Trial: 
  memory estimate:  32 bytes
  allocs estimate:  1
  --------------
  minimum time:     23.807 ns (0.00% GC)
  median time:      24.983 ns (0.00% GC)
  mean time:        27.075 ns (3.33% GC)
  maximum time:     1.573 μs (98.16% GC)
  --------------
  samples:          10000
  evals/sample:     994

julia> @benchmark shiftright(oldm)
BenchmarkTools.Trial: 
  memory estimate:  32 bytes
  allocs estimate:  1
  --------------
  minimum time:     23.241 ns (0.00% GC)
  median time:      23.809 ns (0.00% GC)
  mean time:        26.048 ns (3.40% GC)
  maximum time:     1.539 μs (96.76% GC)
  --------------
  samples:          10000
  evals/sample:     994

julia> @code_llvm shiftright(m)

;  @ REPL[31]:1 within `shiftright'
define void @julia_shiftright_1226([2 x i64]* noalias nocapture sret, [1 x [2 x i64]]* nocapture nonnull readonly dereferenceable(16)) {
top:
;  @ REPL[31]:2 within `shiftright'
  %2 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 0
  %3 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 1
; ┌ @ REPL[32]:2 within `_shiftright'
; │┌ @ int.jl:461 within `>>' @ int.jl:455
    %4 = load i64, i64* %2, align 8
    %5 = lshr i64 %4, 2
; │└
; │┌ @ int.jl:463 within `<<' @ int.jl:456
    %6 = shl i64 %4, 62
; │└
; │ @ REPL[32]:2 within `_shiftright' @ REPL[32]:2
; │┌ @ int.jl:461 within `>>' @ int.jl:455
    %7 = load i64, i64* %3, align 8
    %8 = lshr i64 %7, 2
; │└
; │┌ @ int.jl:331 within `|'
    %9 = or i64 %8, %6
; └└
  %.sroa.0.0..sroa_idx = getelementptr inbounds [2 x i64], [2 x i64]* %0, i64 0, i64 0
  store i64 %5, i64* %.sroa.0.0..sroa_idx, align 8
  %.sroa.2.0..sroa_idx1 = getelementptr inbounds [2 x i64], [2 x i64]* %0, i64 0, i64 1
  store i64 %9, i64* %.sroa.2.0..sroa_idx1, align 8
  ret void
}

julia> @code_llvm shiftright(oldm)

;  @ REPL[30]:1 within `shiftright'
define void @julia_shiftright_1228(i128* noalias nocapture sret, i128) {
top:
; ┌ @ int.jl:461 within `>>' @ int.jl:455
   %2 = lshr i128 %1, 2
; └
  store i128 %2, i128* %0, align 8
  ret void
}

So far so good, I'm liking the Ntuples.

But when it comes to implementing a shiftleft it's a different story.
First here's the shiftleft implementation:

# For old primitive type based kmer.
@inline shiftleft(x::BigDNAMer{K}) where {K} = BigDNAMer{K}(reinterpret(UInt128, x) << 2)

# For new NTuple based kmer.
@inline function shiftleft(x::Kmer{A,K,N}) where {A,K,N}
    _, newbits = _shiftleft(x.data...)
    return Kmer{A,K,N}(_cliphead(64N - 2K, newbits...))
end

@inline function _cliphead(by::Integer, head::UInt64, tail...)
    return (head & (typemax(UInt64) >> by), tail...)
end

@inline function _shiftleft(head::UInt64, tail...)
    carry, newtail = _shiftleft(tail...)
    return head >> 62, ((head << 2) | carry, newtail...)
end

@inline _shiftleft(head::UInt64) = (head & 0xC000000000000000) >> 62, head << 2

Seems fine to me at first, but I see allocations on benchmarking and a large performance penalty:

julia> @benchmark shiftleft($m)
BenchmarkTools.Trial: 
  memory estimate:  112 bytes
  allocs estimate:  6
  --------------
  minimum time:     228.094 ns (0.00% GC)
  median time:      230.368 ns (0.00% GC)
  mean time:        241.913 ns (1.64% GC)
  maximum time:     4.400 μs (94.66% GC)
  --------------
  samples:          10000
  evals/sample:     438

julia> @benchmark shiftleft($oldm)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     0.039 ns (0.00% GC)
  median time:      0.043 ns (0.00% GC)
  mean time:        0.043 ns (0.00% GC)
  maximum time:     0.051 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

Looking at the @code_typed and @code_warntype I don't see type instability:

julia> @code_typed shiftleft(m)
CodeInfo(
1%1  = Base.getfield(x, :data)::Tuple{UInt64,UInt64}%2  = (getfield)(%1, 1)::UInt64%3  = (getfield)(%1, 2)::UInt64%4  = Base.and_int(%3, 0xc000000000000000)::UInt64%5  = Base.sle_int(0, 62)::Bool%6  = Base.bitcast(UInt64, 62)::UInt64%7  = Base.lshr_int(%4, %6)::UInt64%8  = Base.neg_int(62)::Int64%9  = Base.bitcast(UInt64, %8)::UInt64%10 = Base.shl_int(%4, %9)::UInt64%11 = Base.ifelse(%5, %7, %10)::UInt64%12 = Base.sle_int(0, 2)::Bool%13 = Base.bitcast(UInt64, 2)::UInt64%14 = Base.shl_int(%3, %13)::UInt64%15 = Base.neg_int(2)::Int64%16 = Base.bitcast(UInt64, %15)::UInt64%17 = Base.lshr_int(%3, %16)::UInt64%18 = Base.ifelse(%12, %14, %17)::UInt64%19 = Base.sle_int(0, 2)::Bool%20 = Base.bitcast(UInt64, 2)::UInt64%21 = Base.shl_int(%2, %20)::UInt64%22 = Base.neg_int(2)::Int64%23 = Base.bitcast(UInt64, %22)::UInt64%24 = Base.lshr_int(%2, %23)::UInt64%25 = Base.ifelse(%19, %21, %24)::UInt64%26 = Base.or_int(%25, %11)::UInt64%27 = Core.tuple(%26)::Tuple{UInt64}%28 = Core._apply_iterate(Base.iterate, Core.tuple, %27, %18)::Tuple{UInt64,UInt64}%29 = (getfield)(%28, 1)::UInt64%30 = (getfield)(%28, 2)::UInt64%31 = Base.and_int(%29, 0x3fffffffffffffff)::UInt64%32 = Core.tuple(%31, %30)::Tuple{UInt64,UInt64}%33 = %new(Kmer{DNAAlphabet{2},63,2}, %32)::Kmer{DNAAlphabet{2},63,2}
└──       return %33
) => Kmer{DNAAlphabet{2},63,2}

julia> @code_warntype shiftleft(m)
Variables
  #self#::Core.Compiler.Const(shiftleft, false)
  x::Kmer{DNAAlphabet{2},63,2}
  @_3::Int64
  newbits::Tuple{UInt64,UInt64}

Body::Kmer{DNAAlphabet{2},63,2}
1nothing%2  = Base.getproperty(x, :data)::Tuple{UInt64,UInt64}%3  = Core._apply_iterate(Base.iterate, Main._shiftleft, %2)::Tuple{UInt64,Tuple{UInt64,UInt64}}%4  = Base.indexed_iterate(%3, 1)::Core.Compiler.PartialStruct(Tuple{UInt64,Int64}, Any[UInt64, Core.Compiler.Const(2, false)])
│         Core.getfield(%4, 1)
│         (@_3 = Core.getfield(%4, 2))
│   %7  = Base.indexed_iterate(%3, 2, @_3::Core.Compiler.Const(2, false))::Core.Compiler.PartialStruct(Tuple{Tuple{UInt64,UInt64},Int64}, Any[Tuple{UInt64,UInt64}, Core.Compiler.Const(3, false)])
│         (newbits = Core.getfield(%7, 1))
│   %9  = Core.apply_type(Main.Kmer, $(Expr(:static_parameter, 1)), $(Expr(:static_parameter, 2)), $(Expr(:static_parameter, 3)))::Core.Compiler.Const(Kmer{DNAAlphabet{2},63,2}, false)
│   %10 = (64 * $(Expr(:static_parameter, 3)))::Core.Compiler.Const(128, false)
│   %11 = (2 * $(Expr(:static_parameter, 2)))::Core.Compiler.Const(126, false)
│   %12 = (%10 - %11)::Core.Compiler.Const(2, false)
│   %13 = Core.tuple(%12)::Core.Compiler.Const((2,), false)
│   %14 = Core._apply_iterate(Base.iterate, Main._cliphead, %13, newbits)::Tuple{UInt64,UInt64}%15 = (%9)(%14)::Kmer{DNAAlphabet{2},63,2}
└──       return %15

But on showing the generated llvm to Valentin and Keno they concluded something wasn't right:

julia> @code_llvm shiftleft(m)

;  @ REPL[47]:1 within `shiftleft'
define void @julia_shiftleft_1768([1 x [2 x i64]]* noalias nocapture sret, [1 x [2 x i64]]* nocapture nonnull readonly dereferenceable(16)) {
top:
  %2 = alloca %jl_value_t*, i32 4
  %gcframe = alloca %jl_value_t*, i32 4, align 16
  %3 = bitcast %jl_value_t** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* align 16 %3, i8 0, i32 32, i1 false)
  %4 = call %jl_value_t*** inttoptr (i64 4333723376 to %jl_value_t*** ()*)() #4
;  @ REPL[47]:2 within `shiftleft'
  %5 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
  %6 = bitcast %jl_value_t** %5 to i64*
  store i64 8, i64* %6
  %7 = getelementptr %jl_value_t**, %jl_value_t*** %4, i32 0
  %8 = load %jl_value_t**, %jl_value_t*** %7
  %9 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %10 = bitcast %jl_value_t** %9 to %jl_value_t***
  store %jl_value_t** %8, %jl_value_t*** %10
  %11 = bitcast %jl_value_t*** %7 to %jl_value_t***
  store %jl_value_t** %gcframe, %jl_value_t*** %11
  %12 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 0
  %13 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %1, i64 0, i64 0, i64 1
; ┌ @ REPL[49]:2 within `_shiftleft' @ REPL[50]:1
; │┌ @ int.jl:308 within `&'
    %14 = load i64, i64* %13, align 8
; │└
; │┌ @ int.jl:461 within `>>' @ int.jl:455
    %15 = lshr i64 %14, 62
; │└
; │┌ @ int.jl:463 within `<<' @ int.jl:456
    %16 = shl i64 %14, 2
; │└
; │ @ REPL[49]:3 within `_shiftleft'
; │┌ @ int.jl:463 within `<<' @ int.jl:456
    %17 = load i64, i64* %12, align 8
    %18 = shl i64 %17, 2
; │└
; │┌ @ int.jl:331 within `|'
    %19 = or i64 %18, %15
; │└
   %20 = bitcast %jl_value_t*** %4 to i8*
   %21 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %20, i32 1400, i32 16) #1
   %22 = bitcast %jl_value_t* %21 to %jl_value_t**
   %23 = getelementptr %jl_value_t*, %jl_value_t** %22, i64 -1
   store %jl_value_t* inttoptr (i64 4441405376 to %jl_value_t*), %jl_value_t** %23
   %24 = bitcast %jl_value_t* %21 to i64*
   store i64 %19, i64* %24, align 8
   %25 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
   store %jl_value_t* %21, %jl_value_t** %25
   %26 = call %jl_value_t* @jl_box_uint64(i64 zeroext %16)
   %27 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
   store %jl_value_t* %26, %jl_value_t** %27
   %28 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 0
   store %jl_value_t* inttoptr (i64 4488660112 to %jl_value_t*), %jl_value_t** %28
   %29 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 1
   store %jl_value_t* inttoptr (i64 4438135904 to %jl_value_t*), %jl_value_t** %29
   %30 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 2
   store %jl_value_t* %21, %jl_value_t** %30
   %31 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 3
   store %jl_value_t* %26, %jl_value_t** %31
   %32 = call nonnull %jl_value_t* @jl_f__apply_iterate(%jl_value_t* null, %jl_value_t** %2, i32 4)
; └
;  @ REPL[47]:3 within `shiftleft'
  %33 = bitcast %jl_value_t* %32 to i64*
  %34 = bitcast %jl_value_t* %32 to [2 x i64]*
  %35 = getelementptr inbounds [2 x i64], [2 x i64]* %34, i64 0, i64 1
; ┌ @ REPL[48]:2 within `_cliphead'
; │┌ @ int.jl:308 within `&'
    %36 = load i64, i64* %33, align 8
    %37 = and i64 %36, 4611686018427387903
; │└
   %38 = load i64, i64* %35, align 8
; └
  %.sroa.0.sroa.0.0..sroa.0.0..sroa_cast1.sroa_idx = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %0, i64 0, i64 0, i64 0
  store i64 %37, i64* %.sroa.0.sroa.0.0..sroa.0.0..sroa_cast1.sroa_idx, align 8
  %.sroa.0.sroa.2.0..sroa.0.0..sroa_cast1.sroa_idx5 = getelementptr inbounds [1 x [2 x i64]], [1 x [2 x i64]]* %0, i64 0, i64 0, i64 1
  store i64 %38, i64* %.sroa.0.sroa.2.0..sroa.0.0..sroa_cast1.sroa_idx5, align 8
  %39 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %40 = load %jl_value_t*, %jl_value_t** %39
  %41 = getelementptr %jl_value_t**, %jl_value_t*** %4, i32 0
  %42 = bitcast %jl_value_t*** %41 to %jl_value_t**
  store %jl_value_t* %40, %jl_value_t** %42
  ret void
}

Namely "the apply_iterate that ruins your parade", but it should be optimised away: Keno concluded an issue needed to be opened, saying "Looks like splat of integer needs to be special cased".

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions