Skip to content

Commit 20ba29d

Browse files
resistor and facebook-github-bot
authored and committed
Add support for reductions on CPU in tensorexpr (#37333)
Summary: Pull Request resolved: #37333 Differential Revision: D21290289 Pulled By: resistor fbshipit-source-id: ebba11f7af9e22b48c47e2eefb9497fa77acd17d
1 parent d3d10cc commit 20ba29d

4 files changed

Lines changed: 211 additions & 12 deletions

File tree

test/cpp/tensorexpr/test_llvm.cpp

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,6 +1201,152 @@ void testLLVMEliminatedStmt() {
12011201
cg.call({aData, cData});
12021202
}
12031203

1204+
void testLLVMSimpleReduction() {
1205+
KernelScope kernel_scope;
1206+
1207+
int M = 128;
1208+
int N = 64;
1209+
const int kTotalSize = M * N;
1210+
1211+
Buffer a("a", kFloat, {1, M, N});
1212+
1213+
// TODO: why doesn't implicit vector<DimArg> work?
1214+
std::vector<DimArg> axis = {DimArg(1)};
1215+
std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
1216+
Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
1217+
LoopNest loop({b});
1218+
1219+
loop.prepareForCodegen();
1220+
Stmt* s = loop.root_stmt();
1221+
s = IRSimplifier::simplify(s);
1222+
1223+
LLVMCodeGen cg(s, {a, b});
1224+
1225+
PaddedBuffer<float> a_v(1, M, N, "a_v");
1226+
PaddedBuffer<float> b_v(1, "b_v");
1227+
PaddedBuffer<float> b_ref(1, "b_ref");
1228+
1229+
b_ref(0) = 0;
1230+
for (int i = 0; i < M; i++) {
1231+
for (int j = 0; j < N; j++) {
1232+
int v = i + j;
1233+
a_v(0, i, j) = v;
1234+
b_ref(0) += v;
1235+
}
1236+
}
1237+
1238+
cg.call({a_v, b_v});
1239+
1240+
ExpectAllNear(b_v, b_ref, 1e-5);
1241+
}
1242+
1243+
void testLLVMRFactorReduction() {
1244+
KernelScope kernel_scope;
1245+
1246+
int M = 128;
1247+
int N = 64;
1248+
const int kTotalSize = M * N;
1249+
1250+
Buffer a("a", kFloat, {1, M, N});
1251+
1252+
// TODO: why doesn't implicit vector<DimArg> work?
1253+
std::vector<DimArg> axis = {DimArg(1)};
1254+
std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
1255+
Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
1256+
LoopNest loop({b});
1257+
1258+
std::vector<For*> loops = loop.getLoopStmtsFor(b);
1259+
For* loop_m = loops.at(1);
1260+
For* loop_n = loops.at(2);
1261+
loop.reorderAxis(b, loop_m, loop_n);
1262+
1263+
loops = loop.getLoopStmtsFor(b);
1264+
loop_m = loops.at(2);
1265+
loop_n = loops.at(1);
1266+
loop.rfactor(b->body(), loop_n->var(), loop_n->body());
1267+
1268+
loop.prepareForCodegen();
1269+
Stmt* s = loop.root_stmt();
1270+
s = IRSimplifier::simplify(s);
1271+
1272+
LLVMCodeGen cg(s, {a, b});
1273+
1274+
PaddedBuffer<float> a_v(1, M, N, "a_v");
1275+
PaddedBuffer<float> b_v(1, "b_v");
1276+
PaddedBuffer<float> b_ref(1, "b_ref");
1277+
1278+
b_ref(0) = 0;
1279+
for (int i = 0; i < M; i++) {
1280+
for (int j = 0; j < N; j++) {
1281+
int v = i + j;
1282+
a_v(0, i, j) = v;
1283+
b_ref(0) += v;
1284+
}
1285+
}
1286+
1287+
cg.call({a_v, b_v});
1288+
1289+
ExpectAllNear(b_v, b_ref, 1e-5);
1290+
}
1291+
1292+
void testLLVMRFactorVectorizedReduction() {
1293+
KernelScope kernel_scope;
1294+
1295+
int M = 128;
1296+
int N = 64;
1297+
const int kTotalSize = M * N;
1298+
1299+
Buffer a("a", kFloat, {1, M, N});
1300+
1301+
// TODO: why doesn't implicit vector<DimArg> work?
1302+
std::vector<DimArg> axis = {DimArg(1)};
1303+
std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
1304+
Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
1305+
LoopNest loopnest({b});
1306+
std::vector<For*> loops = loopnest.getLoopStmtsFor(b);
1307+
For* loop_k = loops.at(0);
1308+
For* loop_m = loops.at(1);
1309+
For* loop_n = loops.at(2);
1310+
loopnest.reorderAxis(b, loop_n, loop_m);
1311+
loops = loopnest.getLoopStmtsFor(b);
1312+
loop_k = loops.at(0);
1313+
loop_n = loops.at(1);
1314+
loop_m = loops.at(2);
1315+
// Case-III reductions
1316+
loopnest.rfactor(b->body(), loop_n->var());
1317+
loopnest.prepareForCodegen();
1318+
Stmt* s = loopnest.root_stmt();
1319+
s = IRSimplifier::simplify(s);
1320+
1321+
Block* root_block = dynamic_cast<Block*>(s);
1322+
auto stmt_list = root_block->stmts();
1323+
auto I = stmt_list.begin();
1324+
++I;
1325+
1326+
For* outer_loop = dynamic_cast<For*>(*I);
1327+
loopnest.vectorize(outer_loop);
1328+
1329+
s = IRSimplifier::simplify(s);
1330+
LLVMCodeGen cg(s, {a, b});
1331+
1332+
PaddedBuffer<float> a_v(1, M, N, "a_v");
1333+
PaddedBuffer<float> b_v(1, "b_v");
1334+
PaddedBuffer<float> b_ref(1, "b_ref");
1335+
1336+
b_ref(0) = 0;
1337+
for (int i = 0; i < M; i++) {
1338+
for (int j = 0; j < N; j++) {
1339+
int v = i + j;
1340+
a_v(0, i, j) = v;
1341+
b_ref(0) += v;
1342+
}
1343+
}
1344+
1345+
cg.call({a_v, b_v});
1346+
1347+
ExpectAllNear(b_v, b_ref, 1e-5);
1348+
}
1349+
12041350
} // namespace jit
12051351
} // namespace torch
12061352

test/cpp/tensorexpr/tests.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,10 @@ namespace jit {
296296
_(LLVMEmptyStmt) \
297297
_(LLVMEliminatedStmt) \
298298
_(LLVMIfThenElseTest) \
299-
_(LLVMVectorizerLoadStoreTest)
299+
_(LLVMVectorizerLoadStoreTest) \
300+
_(LLVMSimpleReduction) \
301+
_(LLVMRFactorReduction) \
302+
_(LLVMRFactorVectorizedReduction)
300303

301304
#define TH_FORALL_TENSOREXPR_TESTS_CUDA(_) \
302305
_(CudaTestVectorAdd01) \

torch/csrc/jit/tensorexpr/llvm_codegen.cpp

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class LLVMCodeGenImpl : public IRVisitor {
4242
llvm::BasicBlock* bb_;
4343
llvm::Value* value_{nullptr};
4444
llvm::JITTargetAddress kernelAddress_;
45+
std::unique_ptr<void* []> argv_ { nullptr };
4546

4647
#define LLVM_TYPE_DECLARE(_1, Name) llvm::Type* Name##Ty_;
4748
AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, LLVM_TYPE_DECLARE);
@@ -66,6 +67,7 @@ class LLVMCodeGenImpl : public IRVisitor {
6667
~LLVMCodeGenImpl() = default;
6768

6869
llvm::JITTargetAddress getKernelAddress() const;
70+
void** getArgvAddress() const;
6971

7072
void visit(const Add* v) override;
7173
void visit(const Sub* v) override;
@@ -184,15 +186,16 @@ static void* argToPtr(
184186
}
185187

186188
void LLVMCodeGen::call(const std::vector<CallArg>& args) {
187-
if (args.size() != buffer_args().size()) {
189+
const auto& buf_args = buffer_args();
190+
if (args.size() != buf_args.size()) {
188191
throw malformed_input("wrong number of args in call");
189192
}
190193

191-
std::vector<void*> argv;
192-
for (size_t i = 0; i < buffer_args().size(); i++) {
193-
auto const& bufferArg = buffer_args()[i];
194+
void** argv = impl_->getArgvAddress();
195+
for (size_t i = 0, e = buf_args.size(); i < e; i++) {
196+
auto const& bufferArg = buf_args[i];
194197
auto const& callArg = args[i];
195-
argv.push_back(argToPtr(bufferArg, callArg));
198+
argv[i] = argToPtr(bufferArg, callArg);
196199
}
197200
value<float>(argv);
198201
USE_TRIGGER(llvm_codegen_executed);
@@ -206,6 +209,10 @@ llvm::JITTargetAddress LLVMCodeGenImpl::getKernelAddress() const {
206209
return kernelAddress_;
207210
}
208211

212+
// Returns the pre-allocated argument array (argv_) that callers fill with
// per-buffer pointers before invoking the generated kernel.
void** LLVMCodeGenImpl::getArgvAddress() const {
  return argv_.get();
}
215+
209216
LLVMCodeGenImpl::LLVMCodeGenImpl(
210217
Stmt* stmt,
211218
const std::vector<CodeGen::BufferArg>& args,
@@ -261,6 +268,7 @@ LLVMCodeGenImpl::LLVMCodeGenImpl(
261268
llvm::orc::ThreadSafeModule(std::move(module_), context_)));
262269
auto sym = jit_->findSymbol("wrapper");
263270
kernelAddress_ = cantFail(sym.getAddress());
271+
argv_ = std::make_unique<void*[]>(params.size());
264272

265273
USE_TRIGGER(llvm_codegen_created);
266274
}
@@ -919,7 +927,11 @@ void LLVMCodeGenImpl::visit(const For* v) {
919927
// Set up phi node for index variable.
920928
auto idx = irb_.CreatePHI(IntTy_, 2);
921929
idx->addIncoming(start, preheader);
922-
varToVal_.emplace(v->var(), idx);
930+
if (!varToVal_.count(v->var())) {
931+
varToVal_.emplace(v->var(), idx);
932+
} else {
933+
throw std::runtime_error("var should not exist before");
934+
}
923935

924936
// Create the body and exit blocks.
925937
auto body = llvm::BasicBlock::Create(getContext(), "body", fn_);
@@ -944,6 +956,8 @@ void LLVMCodeGenImpl::visit(const For* v) {
944956

945957
// Exit the loop.
946958
irb_.SetInsertPoint(exit);
959+
960+
varToVal_.erase(v->var());
947961
value_ = llvm::ConstantInt::get(IntTy_, 0);
948962
}
949963

@@ -1454,11 +1468,43 @@ void LLVMCodeGenImpl::visit(const FunctionCall* v) {
14541468
}
14551469

14561470
// Lowers an Allocate statement: computes the buffer's size in bytes and
// binds the buffer variable to freshly obtained storage in varToVal_.
void LLVMCodeGenImpl::visit(const Allocate* v) {
  // Total bytes = element byte size * product of all dimension extents.
  llvm::Value* size =
      llvm::ConstantInt::getSigned(LongTy_, v->dtype().byte_size());
  for (const Expr* e : v->dims()) {
    e->accept(this);
    size = irb_.CreateMul(size, irb_.CreateZExt(value_, LongTy_));
  }

  // Allocate is a statement; its "value" is a dummy 0.
  value_ = llvm::ConstantInt::get(IntTy_, 0);

  // Small buffers with a compile-time-constant size (< 512 bytes) are
  // placed on the stack via alloca; visit(Free) skips alloca pointers.
  // NOTE(review): `size` is in bytes but CreateAlloca treats its array-size
  // operand as an element count, so this over-allocates for multi-byte
  // dtypes — safe but wasteful; confirm intended.
  if (llvm::ConstantInt* CI = llvm::dyn_cast<llvm::ConstantInt>(size)) {
    if (CI->getSExtValue() < 512) {
      llvm::Value* alloca = irb_.CreateAlloca(dtypeToLLVM(v->dtype()), size);
      varToVal_[v->buffer_var()] = alloca;
      return;
    }
  }

  // Larger or dynamically-sized buffers go on the heap via malloc; the
  // matching Free statement emits the free call.
  llvm::Instruction* I = llvm::CallInst::CreateMalloc(
      irb_.GetInsertBlock(),
      LongTy_,
      dtypeToLLVM(v->dtype()),
      size,
      nullptr,
      nullptr);

  // Insert the bitcast into the block.
  irb_.SetInsertPoint(irb_.GetInsertBlock());
  llvm::Value* malloc = irb_.Insert(I);
  varToVal_[v->buffer_var()] = malloc;
}
14591501

14601502
// Lowers a Free statement: emits a free() call for heap buffers created in
// visit(Allocate). Stack (alloca) buffers are reclaimed automatically on
// function exit and are skipped.
void LLVMCodeGenImpl::visit(const Free* v) {
  // Free is a statement; its "value" is a dummy 0.
  value_ = llvm::ConstantInt::get(IntTy_, 0);
  llvm::Value* ptr = varToVal_.at(v->buffer_var());
  if (!llvm::isa<llvm::AllocaInst>(ptr)) {
    irb_.Insert(llvm::CallInst::CreateFree(ptr, irb_.GetInsertBlock()));
  }
}
14631509

14641510
void LLVMCodeGenImpl::visit(const Cond* v) {

torch/csrc/jit/tensorexpr/llvm_codegen.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,18 @@ class TORCH_API LLVMCodeGen : public CodeGen {
3232

3333
template <typename T>
3434
T value() {
35-
std::vector<void*> args;
36-
return value<T>(args);
35+
return value<T>(nullptr);
3736
}
3837

3938
template <typename T>
4039
T value(std::vector<void*>& args) {
40+
return value<T>(args.data());
41+
}
42+
43+
template <typename T>
44+
T value(void** args) {
4145
T (*fp)(void**) = (T(*)(void**))getKernelAddress(impl_.get());
42-
T rv = fp(args.data());
46+
T rv = fp(args);
4347
return rv;
4448
}
4549

0 commit comments

Comments
 (0)