Skip to content

Commit cd546d4

Browse files
committed
Update on "Add 'noarch' tests which only run in one CI config"
Fixes #53743 Signed-off-by: Edward Z. Yang <ezyang@fb.com> Differential Revision: [D26971343](https://our.internmc.facebook.com/intern/diff/D26971343) [ghstack-poisoned]
2 parents 861a4e4 + 625a412 commit cd546d4

16 files changed

Lines changed: 617 additions & 373 deletions

File tree

aten/src/ATen/native/BatchLinearAlgebra.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -848,7 +848,8 @@ static void apply_solve(Tensor& b, Tensor& A, Tensor& infos) {
848848
std::tuple<Tensor, Tensor> _solve_helper_cpu(const Tensor& self, const Tensor& A) {
849849
auto self_working_copy = cloneBatchedColumnMajor(self);
850850
auto A_working_copy = cloneBatchedColumnMajor(A);
851-
auto infos = at::empty({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
851+
// infos might not get filled for empty inputs therefore at::zeros is used instead of at::empty
852+
auto infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
852853
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cpu", [&]{
853854
apply_solve<scalar_t>(self_working_copy, A_working_copy, infos);
854855
});
@@ -1074,7 +1075,7 @@ static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) {
10741075
int lwork = -1;
10751076
scalar_t wkopt;
10761077
lapackGetri<scalar_t>(n, self_data, lda, ipiv_data, &wkopt, lwork, &info);
1077-
lwork = static_cast<int>(real_impl<scalar_t, value_t>(wkopt));
1078+
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
10781079
Tensor work = at::empty({lwork}, self.options());
10791080
auto work_data = work.data_ptr<scalar_t>();
10801081

@@ -1626,7 +1627,7 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
16261627
int lwork = -1;
16271628
scalar_t wkopt;
16281629
lapackGeqrf<scalar_t>(m, n, self_data, m, tau_data, &wkopt, lwork, &info);
1629-
lwork = static_cast<int>(real_impl<scalar_t, value_t>(wkopt));
1630+
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
16301631
Tensor work = at::empty({lwork}, self.options());
16311632

16321633
for (const auto i : c10::irange(batch_size)) {
@@ -2041,7 +2042,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool
20412042
}
20422043

20432044
lapackSymeig<scalar_t, value_t>(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, rwork_data, &info);
2044-
lwork = static_cast<int>(real_impl<scalar_t, value_t>(wkopt));
2045+
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
20452046
Tensor work = at::empty({lwork}, self.options());
20462047

20472048
for (const auto i : c10::irange(batch_size)) {
@@ -2197,7 +2198,7 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT,
21972198
int lwork = -1;
21982199
scalar_t wkopt;
21992200
lapackSvd<scalar_t, value_t>(jobz, m, n, self_data, lda, S_data, U_data, lda, VT_data, ldvt, &wkopt, lwork, rwork_data, iwork_data, &info);
2200-
lwork = static_cast<int>(real_impl<scalar_t, value_t>(wkopt));
2201+
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
22012202
Tensor work = at::empty({lwork}, self.options());
22022203
auto work_data = work.data_ptr<scalar_t>();
22032204

aten/src/ATen/native/BatchLinearAlgebra.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau, Tensor& infos, int64_t
8585
int lwork = -1;
8686
scalar_t wkopt;
8787
lapackOrgqr<scalar_t>(m, n_columns, k, self_data, lda, tau_data, &wkopt, lwork, &infos_data[0]);
88-
lwork = static_cast<int>(real_impl<scalar_t, value_t>(wkopt));
88+
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
8989
Tensor work = at::empty({lwork}, self.options());
9090

9191
for (int64_t i = 0; i < batch_size; i++) {

aten/src/ATen/native/BatchLinearAlgebraKernel.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vec
117117
int info;
118118
lapackEig<scalar_t, value_t>('N', jobvr, n, self_data, n, wr,
119119
nullptr, 1, vecs_data, ldvr, &wkopt, -1, rwork_data, &info);
120-
int lwork = static_cast<int>(real_impl<scalar_t, value_t>(wkopt));
120+
int lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
121121

122122
// call again to do the actual work
123123
Tensor work = at::empty({lwork}, self.dtype());

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,10 @@ Tensor linalg_pinv(const Tensor& input, const Tensor& rcond, bool hermitian) {
142142
if (input.numel() == 0) {
143143
// The implementation below uses operations that do not work for zero numel tensors
144144
// therefore we need this early return for 'input.numel() == 0' case
145-
auto input_sizes = input.sizes().vec();
146-
std::swap(input_sizes[input.dim() - 1], input_sizes[input.dim() - 2]);
147-
return at::empty(input_sizes, input.options());
145+
Tensor U, S, V;
146+
// TODO: replace input.svd with linalg_svd when torch/xla can work with at::linalg_svd
147+
std::tie(U, S, V) = input.svd();
148+
return at::matmul(V * S.reciprocal().unsqueeze(-2), U.conj().transpose(-2, -1));
148149
}
149150

150151
// If not Hermitian use singular value decomposition, else use eigenvalue decomposition

aten/src/ATen/native/cuda/BatchLinearAlgebra.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1272,7 +1272,8 @@ AT_ERROR("solve: MAGMA library not found in "
12721272
std::tuple<Tensor, Tensor> _solve_helper_cuda(const Tensor& self, const Tensor& A) {
12731273
auto self_working_copy = cloneBatchedColumnMajor(self);
12741274
auto A_working_copy = cloneBatchedColumnMajor(A);
1275-
auto infos = at::empty({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
1275+
// infos might not get filled for empty inputs therefore at::zeros is used instead of at::empty
1276+
auto infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
12761277
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cuda", [&]{
12771278
apply_solve<scalar_t>(self_working_copy, A_working_copy, infos);
12781279
});

benchmarks/sparse/dlmc/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Sparse benchmarks
2+
3+
These benchmarks exercise the sparse matrix functionality using a popular real dataset collection called the Deep Learning Matrix Collection (DLMC), which was used in recent studies [1, 2].
4+
5+
Performance benchmark scripts for matrix-matrix and matrix-vector ops (dense-sparse, sparse-sparse, and a dense-dense baseline for comparison) are implemented here.
6+
7+
- `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` is for Sparse matrix-matrix multiplication (SPMM) performance test. It can run in forward and backward mode with `--backward_test`, on CPU or CUDA with `--with_cuda`, using different datasets from the dataset collection DLMC. For more details see `test.sh` file.
8+
9+
- `matmul_bench.py` with `--operation sparse@vector` is for Sparse matrix-vector multiplication (SPMV) performance test.
10+
11+
References:
12+
13+
1. Trevor Gale, Matei Zaharia, Cliff Young, Erich Elsen. Sparse GPU Kernels for Deep Learning. Proceedings of the International Conference for High Performance Computing, 2020. https://github.com/google-research/google-research/tree/master/sgk
14+
15+
2. Trevor Gale, Erich Elsen, Sara Hooker. The State of Sparsity in Deep Neural Networks. https://github.com/google-research/google-research/tree/master/state_of_sparsity

benchmarks/sparse/dlmc/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
if __name__ == "__main__":
3+
pass
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Sparse benchmarks
2+
3+
# This benchmark is for sparse matmul performance test.
4+
# They exist for comparing the performance of sparse matrix routines
5+
# `sparse @ vector`, `sparse @ sparse` and `sparse @ dense` with different backends (CPU/CUDA)
6+
# and with other frameworks such as scipy.
7+
8+
import sys
9+
import argparse
10+
import torch
11+
import torch.utils.benchmark as benchmark_utils
12+
from .utils import load_dlmc_dataset
13+
from scipy.sparse import isspmatrix
14+
import os
15+
16+
17+
def scipy_matmul(mat1, mat2):
18+
if isspmatrix(mat1) and isspmatrix(mat2):
19+
return mat1.dot(mat2).tocoo()
20+
return mat1.dot(mat2)
21+
22+
def matmul_backward(a_dense, b_dense, grad_output):
23+
r1 = a_dense.matmul(b_dense)
24+
r1.backward(grad_output)
25+
26+
27+
def sparse_matmul_backward(a, b, grad_output):
28+
c = torch.sparse.mm(a, b)
29+
c.backward(grad_output)
30+
31+
32+
OPS_MAP = {
33+
"sparse@sparse": "torch.sparse.mm",
34+
"sparse@dense": "torch.matmul",
35+
"sparse@vector": "torch.matmul",
36+
}
37+
38+
39+
# also get the arguments as input from the user using `argparse`
40+
def parse_args():
41+
parser = argparse.ArgumentParser(description='matmul benchmark')
42+
parser.add_argument('--path', type=str, help='DLMC dataset path')
43+
parser.add_argument('--dataset', type=str, default='magnitude_pruning')
44+
parser.add_argument('--hidden_size', default=2048, type=int)
45+
parser.add_argument('--backward_test', action="store_true")
46+
parser.add_argument('--operation', type=str, help="|".join(OPS_MAP.keys()), default=next(iter(OPS_MAP)))
47+
parser.add_argument('--with_cuda', action='store_true')
48+
parser.add_argument('--timer_min_run_time', default=1, type=float)
49+
return parser
50+
51+
52+
def get_tasks(op, backward_test, device):
53+
def filter_ops(operation):
54+
if backward_test:
55+
test_name = device + ":matmul-backward"
56+
return [
57+
(test_name, device, "torch:" + operation.replace("sparse", "dense"),
58+
"matmul_backward(dx, dy, grad_output)"),
59+
(test_name, device, "torch:" + operation, "sparse_matmul_backward(x, y, sparse_grad_output)")
60+
]
61+
else:
62+
test_name = device + ":matmul-forward"
63+
return list(filter(None, [
64+
(test_name, device, "torch:" + operation.replace("sparse", "dense"),
65+
"{}(dx, dy)".format(OPS_MAP[operation])),
66+
(test_name, device, "torch:" + operation, "{}(x, y)".format(OPS_MAP[operation])),
67+
(test_name, device, "scipy:" + operation, "scipy_matmul(sx, sy)") if device == "cpu" else None
68+
]))
69+
70+
all_operations = {
71+
"sparse@sparse": filter_ops("sparse@sparse"),
72+
"sparse@dense": filter_ops("sparse@dense"),
73+
"sparse@vector": filter_ops("sparse@vector"),
74+
}
75+
return all_operations[op]
76+
77+
78+
if __name__ == '__main__':
79+
parser = parse_args()
80+
args = parser.parse_args()
81+
82+
if args.with_cuda and not torch.cuda.is_available():
83+
raise RuntimeError("No CUDA available")
84+
85+
dataset_path = args.path
86+
dataset_name = args.dataset
87+
dataset_path = os.path.join(dataset_path, dataset_name)
88+
device = 'cuda' if args.with_cuda else 'cpu'
89+
90+
tasks = get_tasks(args.operation, args.backward_test, device)
91+
repeats = 3
92+
timers = [
93+
benchmark_utils.Timer(
94+
stmt=stmt,
95+
globals={
96+
"scipy_matmul": scipy_matmul,
97+
"matmul_backward": matmul_backward,
98+
"sparse_matmul_backward": sparse_matmul_backward,
99+
**variables
100+
},
101+
label=label,
102+
sub_label=sub_label,
103+
description=f"{sparsity}",
104+
env=device,
105+
)
106+
for sparsity in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]
107+
for label, device, sub_label, stmt in tasks
108+
for variables in
109+
load_dlmc_dataset(dataset_path, args.operation, args.hidden_size, sparsity, device, args.backward_test)
110+
]
111+
measurements = []
112+
113+
for i, timer in enumerate(timers * repeats):
114+
m = timer.blocked_autorange(min_run_time=args.timer_min_run_time)
115+
m.metadata = {
116+
"device": 'cuda' if m.task_spec.env.find("cuda") >= 0 else 'cpu'
117+
}
118+
measurements.append(m)
119+
print(f"\r{i + 1} / {len(timers) * repeats}", end="")
120+
sys.stdout.flush()
121+
print()
122+
123+
comparison = benchmark_utils.Compare(measurements)
124+
125+
print("== Results " + "=" * 80 + "\n" + "/" * 95 + "\n")
126+
comparison.print()

benchmarks/sparse/dlmc/test.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
2+
3+
DATASET_ROOT_DIR=$HOME/datasets/
4+
5+
# wget https://storage.googleapis.com/sgk-sc2020/dlmc.tar.gz -P $DATASET_ROOT_DIR
6+
# tar -xvf $DATASET_ROOT_DIR/dlmc.tar.gz
7+
8+
echo "!! SPARSE SPMS TIME BENCHMARK!! "
9+
10+
# cpu
11+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse
12+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --backward_test
13+
14+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense
15+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --backward_test
16+
17+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector
18+
19+
20+
# cuda
21+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with_cuda
22+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with_cuda --backward_test
23+
24+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with_cuda
25+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with_cuda --backward_test
26+
27+
python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector --with_cuda

0 commit comments

Comments
 (0)