
Commit 6e13146

Mikhail Zolotukhin authored and facebook-github-bot committed
[TensorExpr] TensorExprKernel: don't do any compilation or lowering in run(). (#37948)
Summary:
Pull Request resolved: #37948

The input JIT graph has all the information we need to perform the entire compilation at construction time, so we don't need to postpone any steps until execution time. Also, from the graph we always know what device we will be executing on, and thus we don't need a CodeGen cache in TensorExprKernel - we always have one and only one CodeGen.

Test Plan: Imported from OSS

Reviewed By: protonu

Differential Revision: D21432145

Pulled By: ZolotukhinM

fbshipit-source-id: 8dc86b891713056b2c62f30170cd4a168912f027
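In other words, the commit replaces a lazy, per-device codegen cache with a single CodeGen built in the constructor. A minimal sketch of the before/after shape of that change, using a simplified stand-in CodeGen (the LazyKernel, EagerKernel, and compileFor names below are illustrative, not PyTorch APIs):

    #include <iostream>
    #include <memory>
    #include <unordered_map>

    // Hypothetical stand-ins for the real CodeGen and device types.
    struct CodeGen {
      explicit CodeGen(int device) : device_(device) {}
      void call() { std::cout << "running kernel on device " << device_ << "\n"; }
      int device_;
    };

    static std::unique_ptr<CodeGen> compileFor(int device) {
      std::cout << "compiling for device " << device << "\n";
      return std::make_unique<CodeGen>(device);
    }

    // Before: compilation is deferred to run(); a per-device cache is needed
    // because the device is only discovered from the runtime inputs.
    struct LazyKernel {
      std::unordered_map<int, std::unique_ptr<CodeGen>> codegenCache_;
      void run(int device) {
        if (!codegenCache_.count(device)) {
          codegenCache_.emplace(device, compileFor(device));  // first run pays for compilation
        }
        codegenCache_.at(device)->call();
      }
    };

    // After: the graph already names the device, so the constructor compiles
    // exactly once and run() only dispatches to the single CodeGen.
    struct EagerKernel {
      explicit EagerKernel(int deviceFromGraph)
          : device_(deviceFromGraph), codegen_(compileFor(deviceFromGraph)) {}
      void run() { codegen_->call(); }
      int device_;
      std::unique_ptr<CodeGen> codegen_;
    };

    int main() {
      LazyKernel lazy;
      lazy.run(/*device=*/0);  // compiles here, at first execution

      EagerKernel eager(/*deviceFromGraph=*/0);  // compiles here, at construction
      eager.run();
    }

The trade-off is that compilation cost moves from the first run() call to construction time, and each kernel is pinned to the single device recorded in its graph.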
1 parent eac54f1 commit 6e13146

3 files changed: 32 additions & 56 deletions


test/cpp/tensorexpr/test_kernel.cpp

Lines changed: 3 additions & 3 deletions
@@ -35,7 +35,7 @@ void testKernel_1() {
   auto ref = a * (a * b);
   TensorExprKernel k(graph);
   std::vector<at::Tensor> inputs = {a, b};
-  Stmt* s = k.getStmtForInputs(fmap<IValue>(inputs));
+  Stmt* s = k.getCodeGenStmt();
   // TODO: verify stmt
 
   std::vector<IValue> stack = fmap<IValue>(inputs);
@@ -65,7 +65,7 @@ void testKernel_2() {
   auto ref = a * (a * b);
   TensorExprKernel k(graph);
   std::vector<at::Tensor> inputs = {a, b};
-  Stmt* s = k.getStmtForInputs(fmap<IValue>(inputs));
+  Stmt* s = k.getCodeGenStmt();
   // TODO: verify stmt
 
   std::vector<IValue> stack = fmap<IValue>(inputs);
@@ -95,7 +95,7 @@ void testKernel_3() {
   auto ref = a * (a * b);
   TensorExprKernel k(graph);
   std::vector<at::Tensor> inputs = {a, b};
-  Stmt* s = k.getStmtForInputs(fmap<IValue>(inputs));
+  Stmt* s = k.getCodeGenStmt();
   // TODO: verify stmt
 
   std::vector<IValue> stack = fmap<IValue>(inputs);
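For callers, the practical effect is that the lowered statement exists as soon as the kernel is constructed; a minimal sketch reusing the names from the tests above:

    TensorExprKernel k(graph);     // the entire compilation happens here
    Stmt* s = k.getCodeGenStmt();  // no runtime inputs needed to inspect the IR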

torch/csrc/jit/tensorexpr/kernel.cpp

Lines changed: 23 additions & 43 deletions
@@ -1158,7 +1158,7 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) {
   return stmt;
 }
 
-std::string TensorExprKernel::getCodegenName(BackendType backendType) {
+std::string TensorExprKernel::getCodeGenName(BackendType backendType) {
   switch (backendType) {
     case kCudaCodeGen:
       return "cuda_codegen";
@@ -1272,10 +1272,11 @@ static void checkInputs(
 }
 
 at::Device TensorExprKernel::pickDeviceType(
-    const at::ArrayRef<IValue>& inputs) {
+    const at::ArrayRef<torch::jit::Value*>& inputs) {
   for (auto const& input : inputs) {
-    if (input.isTensor()) {
-      return input.toTensor().device();
+    auto tt = input->type()->cast<TensorType>();
+    if (tt && tt->device()) {
+      return *tt->device();
     }
   }
   throw std::runtime_error("No tensor inputs");
@@ -1390,6 +1391,16 @@ void TensorExprKernel::compile() {
     tensorOutputs_.emplace_back(tensors_.at(output->unique()));
     tensors_.erase(output->unique());
   }
+
+  device_ = pickDeviceType(graph_->inputs());
+  BackendType backendType = inferBackendTypeFromDevice(device_);
+  Stmt* stmt = generateStmt(backendType);
+
+  // Set up formal params (inputs, then outputs) for kernel.
+  std::vector<CodeGen::BufferArg> params = prepareBufferArgs();
+
+  // Generate code.
+  codegen_ = CreateCodeGen(getCodeGenName(backendType), stmt, params, device_);
 }
 
 TensorExprKernel::TensorExprKernel(const std::shared_ptr<Graph>& subgraph)
@@ -1426,8 +1437,7 @@ void TensorExprKernel::run(Stack& stack) {
 
 std::vector<CodeGen::CallArg> TensorExprKernel::prepareRunArgs(
     const at::ArrayRef<IValue>& inputs,
-    std::vector<at::Tensor>& outputs,
-    at::Device device) {
+    std::vector<at::Tensor>& outputs) {
   std::map<const Expr*, int32_t> varToSize;
 
   std::vector<CodeGen::CallArg> runArgs;
@@ -1468,57 +1478,27 @@ std::vector<CodeGen::CallArg> TensorExprKernel::prepareRunArgs(
     }
 
     outputs.push_back(at::empty(
-        tensorSize, c10::TensorOptions(tensorType(o)).device(device)));
+        tensorSize, c10::TensorOptions(tensorType(o)).device(device_)));
     runArgs.emplace_back(outputs.back().data_ptr());
   }
   return runArgs;
 }
 
-void TensorExprKernel::lowerToBackend(const at::ArrayRef<IValue>& inputs) {
-  checkInputs(inputs, inputTypes_);
-
-  at::Device device = pickDeviceType(inputs);
-  if (!codegenCache_.count(torch::get_hash(device))) {
-    BackendType backendType = inferBackendTypeFromDevice(device);
-    Stmt* stmt = generateStmt(backendType);
-
-    // Set up formal params (inputs, then outputs) for kernel.
-    std::vector<CodeGen::BufferArg> params = prepareBufferArgs();
-
-    // Generate code.
-    codegenCache_.emplace(
-        torch::get_hash(device),
-        CreateCodeGen(getCodegenName(backendType), stmt, params, device));
-  }
-}
-
-void TensorExprKernel::codegenRun(
-    at::Device device,
-    const std::vector<CodeGen::CallArg>& runArgs) {
-  codegenCache_.at(torch::get_hash(device))->call(runArgs);
-}
-
-Stmt* TensorExprKernel::getStmtForInputs(const at::ArrayRef<IValue>& inputs) {
-  lowerToBackend(inputs);
-  at::Device device = pickDeviceType(inputs);
-  return codegenCache_.at(torch::get_hash(device))->stmt();
+Stmt* TensorExprKernel::getCodeGenStmt() {
+  return codegen_->stmt();
 }
 
 void TensorExprKernel::runKernel(Stack& stack) {
   KernelScope kernelScope(&kernelArena_);
+
   // Set up arguments (inputs, then outputs) for kernel call.
   auto inputs = last(stack, nInputs_);
-
-  lowerToBackend(inputs);
-
-  at::Device device = pickDeviceType(inputs);
-
   std::vector<at::Tensor> outputs;
-  std::vector<CodeGen::CallArg> runArgs =
-      prepareRunArgs(inputs, outputs, device);
+
+  std::vector<CodeGen::CallArg> runArgs = prepareRunArgs(inputs, outputs);
 
   // Call the kernel.
-  codegenRun(device, runArgs);
+  codegen_->call(runArgs);
 
   // Update the stack.
   drop(stack, nInputs_);
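What makes the eager compilation possible is the reworked pickDeviceType: instead of asking runtime tensors for their device, it reads the device from the static TensorType annotations on the graph's Value* inputs, so it can run inside compile() before any inputs exist. The heart of that lookup, as it appears in the diff above:

    auto tt = input->type()->cast<TensorType>();
    if (tt && tt->device()) {
      return *tt->device();  // device is known statically from the graph
    }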

torch/csrc/jit/tensorexpr/kernel.h

Lines changed: 6 additions & 10 deletions
@@ -52,7 +52,7 @@ class TORCH_API TensorExprKernel {
     InterpreterState(code_).run(stack);
   }
 
-  Stmt* getStmtForInputs(const at::ArrayRef<IValue>& inputs);
+  Stmt* getCodeGenStmt();
 
  private:
   enum BackendType {
@@ -63,7 +63,6 @@
   };
 
   void compile();
-  void lowerToBackend(const at::ArrayRef<IValue>& inputs);
 
   void runKernel(Stack& stack);
 
@@ -160,17 +159,13 @@
   Stmt* generateStmt(BackendType backendType);
   std::vector<CodeGen::BufferArg> prepareBufferArgs();
 
-  std::string getCodegenName(BackendType backendType);
-  void codegenRun(
-      at::Device device,
-      const std::vector<CodeGen::CallArg>& runArgs);
+  std::string getCodeGenName(BackendType backendType);
 
   std::vector<CodeGen::CallArg> prepareRunArgs(
      const at::ArrayRef<IValue>& inputs,
-      std::vector<at::Tensor>& outputs,
-      at::Device device);
+      std::vector<at::Tensor>& outputs);
   BackendType inferBackendTypeFromDevice(at::Device device);
-  at::Device pickDeviceType(const at::ArrayRef<IValue>& inputs);
+  at::Device pickDeviceType(const at::ArrayRef<torch::jit::Value*>& inputs);
 
   void bindInput(const torch::jit::Value* input);
 
@@ -215,7 +210,8 @@
   std::vector<Tensor*> flatTensorOutputs_;
   std::unordered_map<int64_t, Tensor*> tensors_;
   std::unordered_map<int64_t, VarHandle> scalars_;
-  std::unordered_map<size_t, std::unique_ptr<CodeGen>> codegenCache_;
+  std::unique_ptr<CodeGen> codegen_;
+  at::Device device_ = at::kCPU;
   KernelArena kernelArena_;
   std::vector<TypePtr> inputTypes_;
   std::shared_ptr<Graph> graph_;