Skip to content

Commit e845b4f

Browse files
committed
Update on "Make triton debug util reusable"
cc mlazos soumith voznesenskym yanboliang penguinwu anijain2305 EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 desertfire [ghstack-poisoned]
2 parents 3a7a368 + 49e7327 commit e845b4f

426 files changed

Lines changed: 5521 additions & 2026 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/pytorch/test.sh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -828,10 +828,6 @@ test_executorch() {
828828
assert_git_not_dirty
829829
}
830830

831-
test_smoke() {
832-
time python test/run_test.py --include test_fx test_jit test_schema_check test_foreach test_weak --verbose
833-
}
834-
835831
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* || "${BUILD_ENVIRONMENT}" == *-tsan* ]]; then
836832
(cd test && python -c "import torch; print(torch.__config__.show())")
837833
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@@ -992,9 +988,6 @@ elif [[ "${TEST_CONFIG}" = docs_test ]]; then
992988
test_docs_test
993989
elif [[ "${TEST_CONFIG}" == *functorch* ]]; then
994990
test_functorch
995-
elif [[ "${TEST_CONFIG}" == *smoke* ]]; then
996-
# TODO: Delete me once we get more 3.11 testing
997-
test_smoke
998991
else
999992
install_torchvision
1000993
install_triton

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
85983a57e8986cf4a9afc34704bbacb9e6206ec9
1+
2d6e663afc15f878e6ff7ff52a1eaf0ee3e5a081

.github/scripts/gql_mocks.json

Lines changed: 883 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.github/scripts/test_trymerge.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,19 @@ def test_get_author_null(self, mocked_gql: Any, *args: Any) -> None:
247247
author = pr.get_author()
248248
self.assertTrue(author is not None)
249249

250+
@mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
251+
def test_last_pushed_at(self, mocked_gql: Any, *args: Any) -> None:
252+
""" Tests that last_pushed_at will return None on merge commits.
253+
"""
254+
pr = GitHubPR("pytorch", "pytorch", 71759)
255+
self.assertIsNotNone(pr.last_pushed_at())
256+
257+
# 307120d6d3f7fcc3f92cfd26be891d360ad6a92a is a merge commit
258+
# and as such does not have a pushedDate
259+
# See https://github.com/pytorch/pytorch/pull/94146#issuecomment-1421647117
260+
pr = GitHubPR("pytorch", "pytorch", 94146)
261+
self.assertIsNone(pr.last_pushed_at())
262+
250263
@mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
251264
def test_large_diff(self, mocked_gql: Any, *args: Any) -> None:
252265
"Tests that PR with 100+ files can be fetched"

.github/scripts/trymerge.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -726,8 +726,11 @@ def is_base_repo_private(self) -> bool:
726726
def get_changed_files_count(self) -> int:
727727
return int(self.info["changedFiles"])
728728

729-
def last_pushed_at(self) -> datetime:
730-
return datetime.fromisoformat(self.last_commit()['pushedDate'][:-1])
729+
def last_pushed_at(self) -> Optional[datetime]:
730+
pushed_date = self.last_commit()["pushedDate"]
731+
if pushed_date is None:
732+
return None
733+
return datetime.fromisoformat(pushed_date[:-1])
731734

732735
def last_commit(self) -> Any:
733736
return self.info["commits"]["nodes"][-1]["commit"]
@@ -849,7 +852,7 @@ def get_checkrun_conclusions(self) -> JobNameToStateDict:
849852
""" Returns dict of checkrun -> [conclusion, url] """
850853
if self.conclusions is not None:
851854
return self.conclusions
852-
orig_last_commit = self.info["commits"]["nodes"][-1]["commit"]
855+
orig_last_commit = self.last_commit()
853856

854857
def get_pr_next_check_runs(edges: List[Dict[str, Dict[str, Any]]], edge_idx: int, checkruns: Any) -> Any:
855858
rc = gh_graphql(GH_GET_PR_NEXT_CHECK_RUNS,
@@ -1622,7 +1625,9 @@ def merge(pr_num: int, repo: GitRepo,
16221625
)
16231626

16241627
gh_post_pr_comment(org, project, pr.pr_num, explainer.get_merge_message(land_check_commit), dry_run=dry_run)
1625-
if (datetime.utcnow() - pr.last_pushed_at()).days > stale_pr_days:
1628+
if pr.last_pushed_at() is None:
1629+
print(f"Can't get commit {pr.last_commit()['oid']} pushed date. Is it merge commit by chance?")
1630+
elif (datetime.utcnow() - cast(datetime, pr.last_pushed_at())).days > stale_pr_days:
16261631
if land_checks and not dry_run:
16271632
pr.delete_land_time_check_branch(repo)
16281633
raise RuntimeError(f"This PR is too stale; the last push date was more than {stale_pr_days} days ago. "

.github/workflows/_win-test.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@ jobs:
7878
}
7979
Catch {
8080
Write-Output "No leftover $process process, continuing"
81+
Write-Output $_
82+
}
83+
}
84+
85+
# Try it again https://stackoverflow.com/questions/40585754/powershell-wont-terminate-hung-process
86+
# for hung processes
87+
Foreach ($process In $processes) {
88+
Try {
89+
(Get-WmiObject -Class Win32_Process -Filter "Name LIKE '${process}%'").terminate()
90+
}
91+
Catch {
92+
Write-Output $_
8193
}
8294
}
8395

.github/workflows/pull.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,13 @@ jobs:
146146
docker-image-name: pytorch-linux-bionic-py3.11-clang9
147147
test-matrix: |
148148
{ include: [
149-
{ config: "smoke", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
149+
{ config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
150+
{ config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
151+
{ config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
152+
{ config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
153+
{ config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
154+
{ config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
155+
{ config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
150156
]}
151157
152158
linux-bionic-py3_11-clang9-test:

CMakeLists.txt

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -477,14 +477,6 @@ if(MSVC)
477477
# Turn off warnings on Windows. In an ideal world we'd be warning
478478
# clean on Windows too, but this is too much work for our
479479
# non-Windows developers.
480-
#
481-
# NB: Technically, this is not necessary if CMP0092 was applied
482-
# properly, but only cmake >= 3.15 has this policy, so we nail
483-
# it one more time just be safe.
484-
#
485-
# NB2: This is NOT enough to prevent warnings from nvcc on MSVC. At the
486-
# moment only CMP0092 is enough to prevent those warnings too.
487-
string(REPLACE "/W3" "" ${flag_var} "${${flag_var}}")
488480

489481
# Turn off warnings (Windows build is currently extremely warning
490482
# unclean and the warnings aren't telling us anything useful.)
@@ -1120,7 +1112,6 @@ if(BUILD_SHARED_LIBS)
11201112
${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake
11211113
${PROJECT_SOURCE_DIR}/cmake/public/mkldnn.cmake
11221114
${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
1123-
${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake
11241115
${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
11251116
${PROJECT_SOURCE_DIR}/cmake/public/LoadHIP.cmake
11261117
DESTINATION share/cmake/Caffe2/public

aten/src/ATen/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ set_bool(AT_BLAS_F2C BLAS_F2C)
3232
set_bool(AT_BLAS_USE_CBLAS_DOT BLAS_USE_CBLAS_DOT)
3333
set_bool(AT_MAGMA_ENABLED USE_MAGMA)
3434
set_bool(CAFFE2_STATIC_LINK_CUDA_INT CAFFE2_STATIC_LINK_CUDA)
35+
set_bool(AT_CUDNN_ENABLED CAFFE2_USE_CUDNN)
3536

3637
configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
3738
# TODO: Do not generate CUDAConfig.h for ROCm BUILDS
@@ -622,4 +623,4 @@ set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE)
622623
set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
623624
set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
624625
set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
625-
set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
626+
set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)

aten/src/ATen/mps/MPSAllocator.h

Lines changed: 26 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
// Copyright © 2022 Apple Inc.
22

3+
#pragma once
4+
5+
#include <ATen/mps/MPSAllocatorInterface.h>
36
#include <ATen/mps/MPSStream.h>
47
#include <cstdio>
58
#include <mutex>
@@ -9,27 +12,10 @@
912

1013
// this implementation is based on CUDACachingAllocator.
1114
// It utilizes Metal Heaps to improve the performance with buffer allocation.
15+
// Do not include this header. Use MPSAllocatorInterface.h instead.
1216
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
1317
namespace at {
1418
namespace mps {
15-
16-
class IMpsAllocatorCallback {
17-
public:
18-
enum class EventType {
19-
ALLOCATED, // buffer got allocated to be used immediately
20-
RECYCLED, // buffer pulled from free list to be reused
21-
FREED, // buffer put to free list for future recycling
22-
RELEASED, // buffer memory released
23-
};
24-
virtual ~IMpsAllocatorCallback() = default;
25-
virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
26-
};
27-
28-
// MPS allocator will execute every registered callback when a block of memory is freed.
29-
C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
30-
#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
31-
C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
32-
3319
namespace HeapAllocator {
3420

3521
#define MB(x) round_page(x * 1048576UL)
@@ -263,27 +249,44 @@ class MPSHeapAllocatorImpl
263249

264250
// interface exposed to at::Allocator
265251
id<MTLBuffer> malloc(size_t size, uint32_t usage);
252+
// frees a buffer and returns it into buffer pool
266253
void free(void* ptr);
254+
// releases all the cached buffers and their associated heaps
267255
void emptyCache();
268-
// interface exposed to internal MPS operations
256+
// returns true if buffer was allocated from the shared pool
269257
bool isSharedBuffer(void* ptr);
270-
ssize_t getRequestedBufferSize(void* ptr);
258+
// get the requested unaligned size of an MTLBuffer
259+
ssize_t getUnalignedBufferSize(void* ptr);
260+
// set the shape of a base tensor from a view tensor
271261
void setBufferShape(void* ptr, const IntArrayRef& shape);
262+
// retrieve the shape of a base tensor from a view tensor
272263
IntArrayRef getBufferShape(void* ptr);
264+
// allocate a buffer from a specialized pool to import CPU scalars into GPU
273265
id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
274266
// this indicates how far (in Megabytes) the current total allocations are from the
275267
// low watermark limit which is used to detect if we're under memory pressure
276268
// This returns zero if we've reached the low watermark limit
277269
ssize_t getLowWatermarkValue();
278-
279-
bool getDebugVerbosity() const { return m_debug_verbosity; }
280-
size_t getMaxTotalAllowedSize() const { return m_max_total_allowed_size; }
270+
// (see m_low_watermark_ratio for description)
271+
void setLowWatermarkRatio(double ratio);
272+
// (see m_high_watermark_ratio for description)
273+
void setHighWatermarkRatio(double ratio);
274+
// (see m_low_watermark_limit for description)
281275
size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
276+
// (see m_max_total_allowed_size for description)
277+
size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
278+
// (see m_total_allocated_memory for description)
279+
size_t getTotalAllocatedMemory() const {return m_total_allocated_memory; }
280+
// (see enum DebugVerbosity for description)
281+
uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
282+
// returns the device that we allocate from
282283
inline id<MTLDevice> Device() const { return m_device; }
283284

284285
private:
285286
// (see m_high_watermark_ratio for description)
286287
constexpr static double default_high_watermark_ratio = 1.7;
288+
// we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
289+
constexpr static double default_high_watermark_upper_bound = 2.0;
287290
// (see m_low_watermark_ratio for description)
288291
// on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
289292
constexpr static double default_low_watermark_ratio_unified = 1.4;
@@ -375,17 +378,5 @@ class MPSHeapAllocatorImpl
375378
};
376379

377380
} // namespace HeapAllocator
378-
379-
// interface exposed to internal MPS operations
380-
381-
// get the requested non-aligned size of an MTL buffer
382-
ssize_t get_requested_buffer_size(void* ptr);
383-
// retrieve the shape of a base tensor from a view tensor
384-
IntArrayRef get_buffer_shape(void* ptr);
385-
// set the shape of a base tensor from a view tensor
386-
void set_buffer_shape(void* ptr, const IntArrayRef& shape);
387-
// allocate a buffer from a specialized pool to import CPU scalars into GPU
388-
DataPtr allocate_scalar_buffer(void* value, size_t size);
389-
390381
} // namespace mps
391382
} // namespace at

0 commit comments

Comments
 (0)