Skip to content

Commit 547b0fc

Browse files
committed
Merge branch 'main' into keep-bwd
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
2 parents fda56bd + 8cf3c16 commit 547b0fc

82 files changed

Lines changed: 7388 additions & 2508 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

3rdparty/cudnn-frontend

Submodule cudnn-frontend updated 91 files

qa/L0_pytorch_unittest/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_grouped_tensor.xml $TE_
4141
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
4242
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
4343
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
44-
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
44+
NVTE_CUTEDSL_FUSED_GROUPED_MLP=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
4545
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_backward_override.xml $TE_PATH/tests/pytorch/test_backward_override.py || test_fail "test_backward_override.py"
4646
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
4747
python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"

qa/L1_pytorch_thunder_integration/test.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

tests/cpp/operator/test_cast_mxfp8.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ std::vector<std::vector<size_t>> matrix_sizes = {
535535
{1024},
536536
{8, 32, 1024},
537537
{16, 8, 4, 512},
538+
{8192, 7168},
538539
};
539540

540541
std::vector<std::pair<size_t, size_t>> block_sizes = {

tests/cpp/operator/test_cast_mxfp8_grouped.cu

Lines changed: 49 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ void performTest(const ProcessingMethod processing_method,
371371

372372
NVTEShape logical_shape_ = nvte_make_shape(logical_shape_vec.data(), logical_shape_vec.size());
373373

374-
std::vector<size_t> dbias_logical_shape_vec= {num_tensors, cols};
374+
std::vector<size_t> dbias_logical_shape_vec = {num_tensors, cols};
375375
NVTEShape dbias_logical_shape_ = nvte_make_shape(dbias_logical_shape_vec.data(),
376376
dbias_logical_shape_vec.size());
377377

@@ -499,11 +499,13 @@ void performTest(const ProcessingMethod processing_method,
499499
scales_stride_colwise);
500500
}
501501

502+
QuantizationConfigWrapper quant_config;
503+
502504
// GPU
503505
Tensor workspace;
504506
switch (processing_method) {
505507
case ProcessingMethod::CAST_ONLY: {
506-
nvte_group_quantize(in_group_tensor, out_group_tensor, 0);
508+
nvte_group_quantize(in_group_tensor, out_group_tensor, quant_config, 0);
507509
break;
508510
}
509511
case ProcessingMethod::CAST_DBIAS: {
@@ -554,6 +556,11 @@ void performTest(const ProcessingMethod processing_method,
554556
const double abs_tolerable_mismatches_limit = 0.0;
555557
const double rel_tolerable_mismatches_limit = 0.0;
556558

559+
// Compare only allocated contiguous output range.
560+
// In graph-safe mode logical shape may include trailing garbage beyond offsets_h.back().
561+
const size_t compare_rows = 1;
562+
const size_t compare_cols = elts_num;
563+
557564
if (rowwise) {
558565
cudaMemcpy(out_data_rowwise_h.data(), out_data_rowwise_d, out_data_size, cudaMemcpyDeviceToHost);
559566
cudaMemcpy(out_scales_rowwise_h.data(), out_scales_rowwise_d, rowwise_scales_size, cudaMemcpyDeviceToHost);
@@ -566,7 +573,8 @@ void performTest(const ProcessingMethod processing_method,
566573
const size_t mismatches_elts = 32 * mismatches_scales;
567574

568575
compare_scaled_elts<OutputType>("rowwise_output", out_data_rowwise_ref.data(),
569-
out_data_rowwise_h.data(), rows, cols, true, mismatches_elts);
576+
out_data_rowwise_h.data(), compare_rows, compare_cols,
577+
true, mismatches_elts);
570578
}
571579

572580
if (colwise) {
@@ -581,7 +589,8 @@ void performTest(const ProcessingMethod processing_method,
581589
const size_t mismatches_elts = 32 * mismatches_scales;
582590

583591
compare_scaled_elts<OutputType>("colwise_output", out_data_colwise_ref.data(),
584-
out_data_colwise_h.data(), rows, cols, false, mismatches_elts);
592+
out_data_colwise_h.data(), compare_rows, compare_cols,
593+
false, mismatches_elts);
585594
}
586595

587596
if (compute_dbias) {
@@ -652,9 +661,13 @@ std::vector<std::vector<size_t>> input_config = {
652661
{VARYING_FIRST_DIM, 4, 1024,144, 128,384,0,512},
653662
{VARYING_FIRST_DIM, 4, 1536,160, 128,384,512,512},
654663
{VARYING_FIRST_DIM, 5, 4096,512, 128,256,384,1024,2304},
664+
{VARYING_FIRST_DIM, 5, 16 * 4096,512, 128,256,384,1024,2304},
655665
{VARYING_LAST_DIM, 3, 256,896, 128,256,512},
656666
{VARYING_BOTH_DIMS, 2, 1,(128*128)+(256*256), 128,256, 128,256},
657667
{VARYING_BOTH_DIMS, 2, 1,(256*128)+(512*640), 256,512, 128,640},
668+
// Empty tensor in the middle of the group must not terminate the persistent work loop.
669+
{VARYING_FIRST_DIM, 4, 512,160, 128,0,0,256},
670+
{VARYING_BOTH_DIMS, 3, 1,(128*128)+(128*128), 128,0,128, 128,0,128},
658671
};
659672

660673
} // namespace
@@ -808,6 +821,37 @@ std::string to_string(const ActivationKind activation) {
808821
}
809822
}
810823

824+
std::string MakeGroupedFusedCastMXFP8TestName(
825+
const testing::TestParamInfo<GroupedFusedCastMXFP8TestSuite::ParamType>& info) {
826+
const ProcessingMethod method = std::get<0>(info.param);
827+
std::string name = to_string(method);
828+
name += "X" + to_string(std::get<1>(info.param));
829+
830+
switch (std::get<2>(info.param)) {
831+
case ScalingDirection::ROWWISE: name += "_ROWWISE_"; break;
832+
case ScalingDirection::COLWISE: name += "_COLWISE_"; break;
833+
case ScalingDirection::BOTH: name += "_BIDIMENSIONAL_"; break;
834+
}
835+
836+
const std::vector<size_t> input = std::get<3>(info.param);
837+
838+
switch (static_cast<ShapeRepresentation>(input[0])) {
839+
case ShapeRepresentation::SAME_BOTH_DIMS: name += "SAME_BOTH_DIMS"; break;
840+
case ShapeRepresentation::VARYING_FIRST_DIM: name += "VARYING_FIRST_DIM"; break;
841+
case ShapeRepresentation::VARYING_LAST_DIM: name += "VARYING_LAST_DIM"; break;
842+
case ShapeRepresentation::VARYING_BOTH_DIMS: name += "VARYING_BOTH_DIMS"; break;
843+
}
844+
845+
name += "_N_" + std::to_string(input[1]);
846+
847+
name += "_SHAPE_" + std::to_string(input[2]) + "X" + std::to_string(input[3]);
848+
849+
name += "_" + test::typeName(std::get<4>(info.param)) +
850+
"_" + test::typeName(std::get<5>(info.param));
851+
852+
return name;
853+
}
854+
811855
INSTANTIATE_TEST_SUITE_P(
812856
OperatorTest,
813857
GroupedFusedCastMXFP8TestSuite,
@@ -818,33 +862,4 @@ INSTANTIATE_TEST_SUITE_P(
818862
::testing::ValuesIn(input_config),
819863
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
820864
::testing::Values(DType::kFloat8E4M3, DType::kFloat8E5M2)),
821-
[](const testing::TestParamInfo<GroupedFusedCastMXFP8TestSuite::ParamType>& info) {
822-
const ProcessingMethod method = std::get<0>(info.param);
823-
std::string name = to_string(method);
824-
name += "X" + to_string(std::get<1>(info.param));
825-
826-
switch (std::get<2>(info.param)) {
827-
case ScalingDirection::ROWWISE: name += "_ROWWISE_"; break;
828-
case ScalingDirection::COLWISE: name += "_COLWISE_"; break;
829-
case ScalingDirection::BOTH: name += "_BIDIMENSIONAL_"; break;
830-
}
831-
832-
const std::vector<size_t> input = std::get<3>(info.param);
833-
834-
switch(static_cast<ShapeRepresentation>(input[0])) {
835-
case ShapeRepresentation::SAME_BOTH_DIMS: name += "SAME_BOTH_DIMS"; break;
836-
case ShapeRepresentation::VARYING_FIRST_DIM: name += "VARYING_FIRST_DIM"; break;
837-
case ShapeRepresentation::VARYING_LAST_DIM: name += "VARYING_LAST_DIM"; break;
838-
case ShapeRepresentation::VARYING_BOTH_DIMS: name += "VARYING_BOTH_DIMS"; break;
839-
};
840-
841-
name += "_N_" + std::to_string(input[1]);
842-
843-
name += "_SHAPE_" +
844-
std::to_string(input[2]) +
845-
"X" + std::to_string(input[3]);
846-
847-
name += "_" + test::typeName(std::get<4>(info.param)) +
848-
"_" + test::typeName(std::get<5>(info.param));
849-
return name;
850-
});
865+
MakeGroupedFusedCastMXFP8TestName);

0 commit comments

Comments (0)