
Commit be10ffb

Author: lixinyu (committed)

Update on "DataPipe naming convention update"
This PR changes the naming convention for what was previously called a 'dataset': we now name them DataPipes. This PR specifically covers ListDirFilesIterableDataset and LoadFilesFromDiskIterableDataset, and provides the following ways to import them.

1. Partial import via the `datapipes` module:
```
import torch.utils.data.datapipes as dp
dp.iter.ListDirFiles
dp.iter.LoadFilesFromDisk
```
2. Direct import of the DataPipe classes:
```
from torch.utils.data.datapipes import ListDirFiles, LoadFilesFromDisk
```

This PR also adds support for recursively scanning folders.

Next step: Tar/Zip/Gz dataset -> datapipe.

Differential Revision: [D26120628](https://our.internmc.facebook.com/intern/diff/D26120628)

[ghstack-poisoned]
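For illustration, a minimal sketch of chaining the two DataPipes together under the `iter` namespace shown above. The constructor arguments (`root=...`, `recursive=True`) and the yielded `(path, stream)` pairs are assumptions for this sketch and are not spelled out in the commit message:

```python
import torch.utils.data.datapipes as dp

# Recursively list files under a directory (recursive scanning is the new
# support added in this PR), then load each listed file from disk.
# NOTE: argument names and the (path, stream) output format are assumptions
# for illustration only; check the datapipes source for the exact signatures.
file_lister = dp.iter.ListDirFiles(root="data/", recursive=True)
file_loader = dp.iter.LoadFilesFromDisk(file_lister)

for path, stream in file_loader:
    print(path, stream.read(16))
```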
2 parents 48c97f9 + 8b27c2c commit be10ffb

19 files changed: 475 additions & 133 deletions


Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+name: 'Close stale pull requests'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+  workflow_dispatch:
+
+jobs:
+  stale:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/stale@v3
+      with:
+        stale-pr-message: >
+          Looks like this PR hasn't been updated in a while! Going to go ahead and mark this as `stale`.
+          Feel free to update / remove the `stale` label if you feel this is a mistake
+          `stale` pull requests will automatically be closed 30 days after being marked `stale`
+        exempt-pr-labels: "no-stale,open source,high priority"
+        days-before-stale: 60
+        days-before-close: 90
+  stale-open-source:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/stale@v3
+      with:
+        stale-pr-message: >
+          Looks like this PR hasn't been updated in a while! Going to go ahead and mark this as `stale`.
+          Feel free to update / remove the `stale` label if you feel this is a mistake
+          `stale` pull requests will automatically be closed 30 days after being marked `stale`
+        exempt-pr-labels: "no-stale,high priority"
+        only-labels: "open source"
+        days-before-stale: 150
+        days-before-close: 180

aten/src/ATen/native/BatchLinearAlgebraKernel.cpp

Lines changed: 3 additions & 0 deletions
@@ -101,9 +101,12 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau, Tensor& infos, int6
 REGISTER_ARCH_DISPATCH(eig_stub, DEFAULT, &eig_kernel_impl);
 REGISTER_AVX_DISPATCH(eig_stub, &eig_kernel_impl);
 REGISTER_AVX2_DISPATCH(eig_stub, &eig_kernel_impl);
+REGISTER_VSX_DISPATCH(eig_stub, &eig_kernel_impl);
 
 REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl);
 REGISTER_AVX_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
 REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
+REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
+
 
 }} // namespace at::native

benchmarks/distributed/pipeline/pipe.py

Lines changed: 3 additions & 3 deletions
@@ -3,14 +3,14 @@
 import os
 import time
 
-from .benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
+from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
 import torch
 from torch.distributed import rpc
 import torch.nn as nn
 from torch.utils.data import DataLoader
 
 from torch.distributed.pipeline.sync import Pipe
-from torch.testing._internal.distributed.pipeline.utils import convert_to_balance
+from torch.distributed.pipeline.sync.utils import partition_model
 from torch.optim import Adam  # type: ignore
 
 def sizeof_fmt(num, suffix='B'):
@@ -248,7 +248,7 @@ def bench_single_process(args):
     model = blob["model"]
 
     balance = generate_balance(num_devices, len(model))
-    model = convert_to_balance(model, balance)
+    model = partition_model(model, balance)
     p = Pipe(
         model, chunks=args.chunks, checkpoint=args.checkpoint
     )
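For context, a minimal sketch of the renamed helper in use, assuming the `partition_model(module, balance)` behavior implied by this diff (split an `nn.Sequential` across devices according to `balance`, then wrap it in `Pipe`). The layer sizes, the balance of `[2, 2]`, and the single-process RPC setup are illustrative assumptions:

```python
import os
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe
from torch.distributed.pipeline.sync.utils import partition_model

# Pipe requires the RPC framework to be initialized, even for a single process.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
rpc.init_rpc("worker", rank=0, world_size=1)

# A toy 4-layer model; balance=[2, 2] asks partition_model to place two layers
# on each of two devices (assumes 2 GPUs), after which Pipe drives the pipeline.
model = nn.Sequential(*[nn.Linear(16, 16) for _ in range(4)])
model = partition_model(model, [2, 2])
pipe = Pipe(model, chunks=4)
```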

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 72 additions & 55 deletions
@@ -57,12 +57,12 @@ namespace {
 
 using stream_set = std::unordered_set<cuda::CUDAStream>;
 
-constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes
-constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
-constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks
-constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks
+constexpr size_t kMinBlockSize = 512;       // all sizes are rounded to at least 512 bytes
+constexpr size_t kSmallSize = 1048576;      // largest "small" allocation is 1 MiB
+constexpr size_t kSmallBuffer = 2097152;    // "small" allocations are packed in 2 MiB blocks
+constexpr size_t kLargeBuffer = 20971520;   // "large" allocations may be packed in 20 MiB blocks
 constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
-constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
+constexpr size_t kRoundLarge = 2097152;     // round up large allocations to 2 MiB
 
 typedef std::bitset<static_cast<size_t>(StatType::NUM_TYPES)> StatTypes;
 
@@ -242,56 +242,57 @@ class DeviceCachingAllocator {
       // Free all non-split cached blocks and retry alloc.
       || (free_cached_blocks() && alloc_block(params, true));
 
-    TORCH_INTERNAL_ASSERT((!block_found && params.err != cudaSuccess) || params.block);
     if (!block_found) {
-      if (params.err == cudaErrorMemoryAllocation) {
-        size_t device_free;
-        size_t device_total;
-        C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
-        std::string allowed_info;
-
-        if (set_fraction) {
-          allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
-        }
+      // For any error code other than cudaErrorMemoryAllocation,
+      // alloc_block should have thrown an exception already.
+      TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation);
 
-        stats.num_ooms += 1;
-
-        // "total capacity": total global memory on GPU
-        // "allowed": memory is allowed to use, which set by fraction.
-        // "already allocated": memory allocated by the program using the
-        //                      caching allocator
-        // "free": free memory as reported by the CUDA API
-        // "cached": memory held by the allocator but not used by the program
-        //
-        // The "allocated" amount does not include memory allocated outside
-        // of the caching allocator, such as memory allocated by other programs
-        // or memory held by the driver.
-        //
-        // The sum of "allocated" + "free" + "cached" may be less than the
-        // total capacity due to memory held by the driver and usage by other
-        // programs.
-        //
-        // Note that at this point free_cached_blocks has already returned all
-        // possible "cached" memory to the driver. The only remaining "cached"
-        // memory is split from a larger block that is partially in-use.
-        TORCH_CHECK_WITH(CUDAOutOfMemoryError, false,
-          "CUDA out of memory. Tried to allocate ", format_size(alloc_size),
-          " (GPU ", device, "; ",
-          format_size(device_total), " total capacity; ",
-          format_size(stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
-          " already allocated; ",
-          format_size(device_free), " free; ",
-          allowed_info,
-          format_size(stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
-          " reserved in total by PyTorch)");
-      } else {
-        C10_CUDA_CHECK(params.err);
+      size_t device_free;
+      size_t device_total;
+      C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
+      std::string allowed_info;
+
+      if (set_fraction) {
+        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
       }
+
+      stats.num_ooms += 1;
+
+      // "total capacity": total global memory on GPU
+      // "allowed": memory is allowed to use, which set by fraction.
+      // "already allocated": memory allocated by the program using the
+      //                      caching allocator
+      // "free": free memory as reported by the CUDA API
+      // "cached": memory held by the allocator but not used by the program
+      //
+      // The "allocated" amount does not include memory allocated outside
+      // of the caching allocator, such as memory allocated by other programs
+      // or memory held by the driver.
+      //
+      // The sum of "allocated" + "free" + "cached" may be less than the
+      // total capacity due to memory held by the driver and usage by other
+      // programs.
+      //
+      // Note that at this point free_cached_blocks has already returned all
+      // possible "cached" memory to the driver. The only remaining "cached"
+      // memory is split from a larger block that is partially in-use.
+      TORCH_CHECK_WITH(CUDAOutOfMemoryError, false,
+        "CUDA out of memory. Tried to allocate ", format_size(alloc_size),
+        " (GPU ", device, "; ",
+        format_size(device_total), " total capacity; ",
+        format_size(stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
+        " already allocated; ",
+        format_size(device_free), " free; ",
+        allowed_info,
+        format_size(stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
+        " reserved in total by PyTorch)");
     }
 
+    TORCH_INTERNAL_ASSERT(params.err == cudaSuccess &&
+                          params.block != nullptr &&
+                          params.block->ptr != nullptr);
     Block* block = params.block;
     Block* remaining = nullptr;
-    TORCH_INTERNAL_ASSERT(block);
 
     const bool already_split = block->is_split();
     if (should_split(block, size)) {
@@ -647,30 +648,46 @@ class DeviceCachingAllocator {
   }
 
   bool alloc_block(AllocParams& p, bool isRetry) {
+    // Defensively checks for preexisting CUDA error state.
+    C10_CUDA_CHECK(cudaGetLastError());
+
     size_t size = p.alloc_size;
     void* ptr;
 
     if (isRetry) {
       stats.num_alloc_retries += 1;
     }
+
     if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) {
       p.err = cudaErrorMemoryAllocation;
+      return false;
     } else {
       p.err = cudaMalloc(&ptr, size);
-    }
-
-    if (p.err != cudaSuccess) {
-      if (!isRetry || p.err == cudaErrorMemoryAllocation)
-        cudaGetLastError(); // clear CUDA error
-      return false;
+      if (p.err != cudaSuccess) {
+        if (p.err == cudaErrorMemoryAllocation) {
+          // If this is the first attempt (!isRetry), we can forgive and clear CUDA's
+          // internal error state.
+          // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH will take
+          // over to throw a helpful exception. The user can choose to catch the exception,
+          // free some stuff in their script, and attempt their allocation again.
+          // In this case, we can also forgive and clear CUDA's internal error state.
+          cudaGetLastError();
+        } else {
+          // If the error's unrelated to memory allocation, we should throw immediately.
+          C10_CUDA_CHECK(p.err);
+        }
+        return false;
+      }
     }
 
     total_allocated_memory += size;
     p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
     update_stat_array(stats.segment, 1, p.stat_types);
     update_stat_array(stats.reserved_bytes, size, p.stat_types);
 
-    return (p.block != nullptr);
+    // p.block came from new, not cudaMalloc. It should not be nullptr here.
+    TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
+    return true;
   }
 
   bool free_cached_blocks()
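The comment added to alloc_block describes the intended user-facing behavior: on the retry path the allocator raises a catchable out-of-memory error rather than leaving CUDA in a bad state, so the user can free memory and try again. A minimal Python sketch of that pattern, assuming the error surfaces as a RuntimeError whose message starts with "CUDA out of memory" (matching the TORCH_CHECK_WITH string above); the tensor shape and the single retry are placeholders:

```python
import torch

def allocate_with_retry(shape, device="cuda"):
    # First attempt; may raise the "CUDA out of memory" error produced by
    # TORCH_CHECK_WITH(CUDAOutOfMemoryError, ...) in the caching allocator.
    try:
        return torch.empty(shape, device=device)
    except RuntimeError as e:
        if "CUDA out of memory" not in str(e):
            raise  # unrelated CUDA error: re-raise immediately
        # Release cached blocks held by the allocator, then retry once,
        # mirroring the "catch, free some stuff, and attempt again" flow
        # described in the alloc_block comment.
        torch.cuda.empty_cache()
        return torch.empty(shape, device=device)
```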

test/distributed/pipeline/sync/skip/test_gpipe.py

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@
 from torch.distributed.pipeline.sync import Pipe
 from torch.distributed.pipeline.sync.skip import pop, skippable, stash
 from torch.distributed.pipeline.sync.skip.portal import PortalBlue, PortalCopy, PortalOrange
-from torch.testing._internal.distributed.pipeline.utils import convert_to_balance
+from torch.distributed.pipeline.sync.utils import partition_model
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@@ -53,7 +53,7 @@ def forward(self, input):
             return output
 
     model = nn.Sequential(Layer1(), Layer2(), Layer3())
-    model = convert_to_balance(model, balance)
+    model = partition_model(model, balance)
     model = Pipe(model, chunks=3, checkpoint=checkpoint)
 
     in_device = model.devices[0]

test/quantization/test_quantize_fx.py

Lines changed: 105 additions & 2 deletions
@@ -347,10 +347,11 @@ def forward(self, x):
         qconfig_dict = {'': qconfig}
         prepared = prepare_fx(m, qconfig_dict)
         quantized = convert_fx(prepared, debug=True)
-        qparams = (quantized._scale_0, quantized._zero_point_0)
+        qparams = (quantized._input_scale_0, quantized._input_zero_point_0)
         weight_obs = qconfig.weight()
         weight_obs(quantized.weight)
-        ref_qparams = weight_obs.calculate_qparams()
+        # Get the actual value to avoid tensor size mismatch error, torch.Size([]) vs torch.Size([1])
+        ref_qparams = (weight_obs.calculate_qparams()[0].item(), weight_obs.calculate_qparams()[1].item())
         self.assertEqual(qparams, ref_qparams)
 
     def test_conv_bn_relu(self):
@@ -983,6 +984,46 @@ def forward(self, x):
         # make sure it runs
         m(torch.randn(2, 1, 3, 3))
 
+    def test_qconfig_for_call_func(self):
+        class Linear(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w = torch.ones(5, 5)
+                self.b = torch.zeros(5)
+
+            def forward(self, x):
+                return torch.nn.functional.linear(x, self.w, self.b)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mods1 = torch.nn.Sequential(
+                    Linear(),
+                    Linear()
+                )
+                self.mods2 = Linear()
+
+            def forward(self, x):
+                x = self.mods1(x)
+                x = self.mods2(x)
+                return x
+
+        model = M().eval()
+        qconfig_dict = {"": default_qconfig, "module_name": [("mods2", None)]}
+        m = prepare_fx(model, qconfig_dict)
+        m(torch.rand(5, 5))
+
+        m = convert_fx(m)
+        node_list = [
+            ns.call_function(torch.quantize_per_tensor),
+            ns.call_function(torch.ops.quantized.linear),
+            ns.call_function(torch.ops.quantized.linear),
+            ns.call_method('dequantize'),
+            ns.call_function(torch.nn.functional.linear)
+        ]
+        self.checkGraphModuleNodes(m, expected_node_list=node_list)
+        m(torch.rand(5, 5))
+
     def test_preserve_attributes(self):
         class M(torch.nn.Module):
             def __init__(self):
@@ -1455,6 +1496,68 @@ def test_convtranspose_per_channel_fails_early(self):
             str(context.exception) ==
             'Per channel weight observer is not supported yet for ConvTranspose{n}d.')
 
+    @skipIfNoFBGEMM
+    def test_qparams_buffers(self):
+        class Linear(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w = torch.ones(5, 5)
+                self.b = torch.zeros(5)
+
+            def forward(self, x):
+                return torch.nn.functional.linear(x, self.w, self.b)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mods1 = torch.nn.Sequential(
+                    Linear(),
+                    Linear()
+                )
+                self.mods2 = Linear()
+
+            def forward(self, x):
+                x = self.mods1(x)
+                x = self.mods2(x)
+                return x
+
+        model = M().eval()
+        qconfig_dict = {"": default_qconfig}
+        m = prepare_fx(model, qconfig_dict)
+        m(torch.rand(5, 5))
+
+        m = convert_fx(m)
+        keys = m.state_dict().keys()
+        quant_scale_count = quant_zero_point = scale_count = zero_point_count = 0
+        for k in keys:
+            if 'input_scale' in k:
+                quant_scale_count = quant_scale_count + 1
+            elif 'input_zero_point' in k:
+                quant_zero_point = quant_zero_point + 1
+            elif 'scale' in k:
+                scale_count = scale_count + 1
+            elif 'zero_point' in k:
+                zero_point_count = zero_point_count + 1
+
+        # Expect each quantized linear op to have a scale and zero point
+        self.assertTrue(scale_count == 3, "Expect each quantized linear op to have a scale in state_dict")
+        self.assertTrue(zero_point_count == 3, "Expect each quantized linear op to have a zero_point in state_dict")
+        # ensure it runs
+        m(torch.rand(5, 5))
+        # ensure it is scriptable
+        scripted = torch.jit.script(m)
+        scripted_keys = scripted.state_dict().keys()
+        self.assertTrue(scripted_keys == keys, "Expected the scripted model to preserve the state_dict")
+        assert hasattr(m, "mods1_0_input_scale_0")
+        assert hasattr(m, "mods1_0_input_zero_point_0")
+        assert hasattr(m, "mods1_0_scale_0")
+        assert hasattr(m, "mods1_0_zero_point_0")
+        assert hasattr(m, "mods1_1_scale_0")
+        assert hasattr(m, "mods1_1_zero_point_0")
+        assert hasattr(m, "mods2_scale_0")
+        assert hasattr(m, "mods2_zero_point_0")
+
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     """Unit tests for individual ops
