-
Notifications
You must be signed in to change notification settings - Fork 14
Closed
Labels
bug: Something isn't working
Description
Describe the bug
Unit test TestTrainer.test_cuda reports failure "{parameter} is not cuda" when testing CUDA support.
Environment:
- Operating system: Ubuntu 22.04
- Version: 2.1.0
- Branch commit ID: 3203d7c
- Inputs: built-in test data
To Reproduce
Steps/commands/screenshots to reproduce the behaviour:
cd ${path_to}/deeprank2 && python3 -m pytest
Expected Results
The unit test TestTrainer.test_cuda passes.
Actual Results or Error Info
============================================================================================================================================= FAILURES ==============================================================================================================================================
_______________________________________________________________________________________________________________________________________ TestTrainer.test_cuda _______________________________________________________________________________________________________________________________________
self = <tests.test_trainer.TestTrainer testMethod=test_cuda>
def test_cuda(self): # test_ginet, but with cuda
if torch.cuda.is_available():
files = glob.glob(self.work_directory + '/*')
for f in files:
os.remove(f)
assert len(os.listdir(self.work_directory)) == 0
> _model_base_test(
self.save_path,
GINet,
"tests/data/hdf5/1ATN_ppi.hdf5",
"tests/data/hdf5/1ATN_ppi.hdf5",
"tests/data/hdf5/1ATN_ppi.hdf5",
default_features,
[Efeat.DISTANCE],
targets.REGRESS,
targets.IRMSD,
False,
[HDF5OutputExporter(self.work_directory)],
"mcl",
True
)
tests/test_trainer.py:466:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
save_path = '/tmp/tmp_jtkrcq0test.tar', model_class = <class 'deeprank2.neuralnets.gnn.ginet.GINet'>, train_hdf5_path = 'tests/data/hdf5/1ATN_ppi.hdf5', val_hdf5_path = 'tests/data/hdf5/1ATN_ppi.hdf5', test_hdf5_path = 'tests/data/hdf5/1ATN_ppi.hdf5'
node_features = ['res_type', 'polarity', 'bsa', 'res_depth', 'hse', 'info_content', ...], edge_features = ['distance'], task = 'regress', target = 'irmsd', target_transform = False, output_exporters = [<deeprank2.utils.exporters.HDF5OutputExporter object at 0x7fb7f4fc4ac0>]
clustering_method = 'mcl', use_cuda = True
def _model_base_test( # pylint: disable=too-many-arguments, too-many-locals
save_path,
model_class,
train_hdf5_path,
val_hdf5_path,
test_hdf5_path,
node_features,
edge_features,
task,
target,
target_transform,
output_exporters,
clustering_method,
use_cuda = False
):
dataset_train = GraphDataset(
hdf5_path = train_hdf5_path,
node_features = node_features,
edge_features = edge_features,
clustering_method = clustering_method,
target = target,
target_transform = target_transform,
task = task
)
if val_hdf5_path is not None:
dataset_val = GraphDataset(
hdf5_path = val_hdf5_path,
train = False,
dataset_train = dataset_train,
clustering_method = clustering_method,
)
else:
dataset_val = None
if test_hdf5_path is not None:
dataset_test = GraphDataset(
hdf5_path = test_hdf5_path,
train = False,
dataset_train = dataset_train,
clustering_method = clustering_method,
)
else:
dataset_test = None
trainer = Trainer(
model_class,
dataset_train,
dataset_val,
dataset_test,
output_exporters=output_exporters,
)
if use_cuda:
_log.debug("cuda is available, testing that the model is cuda")
for parameter in trainer.model.parameters():
> assert parameter.is_cuda, f"{parameter} is not cuda"
E AssertionError: Parameter containing:
E tensor([[-0.0144, 0.0992, -0.0512, -0.0195, 0.1068, -0.1003, 0.0110, 0.0879,
E -0.0062, -0.0924, -0.0776, -0.0117, -0.0493, -0.1114, -0.1404, 0.1040,
E -0.0337, 0.0640, 0.0213, -0.0325, -0.0301, -0.0737, 0.1044, -0.1126,
E 0.0262, 0.0157, -0.0592, 0.0683, 0.0010, -0.0143, 0.0695, 0.0390,
E 0.0035, -0.0878, -0.0623, -0.0325, 0.0215, 0.0790, -0.0169, 0.0888,
E -0.0170, 0.0415, 0.1259, 0.1174, 0.0592, 0.0053, 0.1238, 0.0093,
E 0.0933, -0.0784],
E [-0.0636, 0.0521, 0.0685, 0.0085, -0.0176, -0.0022, -0.1396, -0.1115,
E -0.0423, 0.0597, 0.1176, 0.0292, 0.0522, 0.0748, -0.1094, -0.0483,
E -0.1350, 0.0730, 0.1028, 0.0208, -0.0054, -0.1384, 0.0446, -0.0862,
E -0.0873, 0.0802, 0.1212, 0.0552, 0.0870, 0.0722, 0.0470, -0.0942,
E -0.0946, -0.0125, 0.0174, 0.0141, 0.0426, 0.0405, 0.0153, -0.0257,
E -0.0634, -0.0135, -0.0591, -0.1072, 0.1289, 0.0492, 0.1118, 0.0114,
E 0.0758, 0.0117],
E [-0.0017, 0.0442, 0.0633, -0.0930, -0.0056, 0.1407, 0.0575, -0.0822,
E -0.0447, 0.0099, -0.0081, -0.0749, 0.0465, -0.0741, 0.1163, -0.0915,
E -0.0145, -0.0514, -0.0999, -0.0365, 0.1299, -0.1124, -0.0814, 0.1308,
E -0.0645, 0.1129, 0.0700, -0.1241, -0.0065, -0.1021, 0.1295, 0.0965,
E 0.1080, 0.0636, -0.1398, -0.0658, -0.0985, -0.0753, -0.0281, -0.0322,
E 0.0777, -0.0604, -0.1003, -0.1393, 0.1313, -0.0098, -0.1252, 0.0505,
E -0.0575, 0.0081],
E [ 0.0520, 0.0575, -0.0958, 0.0526, -0.0666, 0.0282, -0.0729, -0.0115,
E -0.0955, -0.1093, -0.0536, 0.0385, -0.0114, 0.1243, 0.1015, 0.1147,
E -0.1217, -0.0966, -0.0771, 0.0440, 0.0798, 0.1308, -0.0617, 0.0651,
E 0.0510, 0.0020, 0.0599, 0.1122, -0.0945, -0.0763, 0.0062, 0.1325,
E 0.1274, 0.0900, 0.1234, 0.0200, -0.0688, -0.1334, 0.1412, -0.1149,
E -0.0501, 0.1206, 0.0491, 0.0253, -0.0708, 0.0197, 0.1040, 0.0610,
E -0.0715, -0.1175],
E [-0.0822, -0.1260, -0.0521, 0.0120, -0.1407, 0.0361, -0.0430, -0.0256,
E 0.0725, -0.0614, 0.1259, -0.0703, 0.0273, -0.1157, 0.1234, -0.0091,
E -0.1137, -0.1078, 0.1147, 0.0508, -0.0769, -0.0187, 0.0972, -0.0944,
E 0.0069, 0.0057, -0.0039, -0.0094, -0.0548, -0.0139, 0.0678, -0.0181,
E 0.0602, -0.1162, 0.0426, -0.0692, 0.0511, 0.1086, -0.0887, 0.0231,
E 0.0222, -0.0728, 0.0763, -0.0486, -0.1119, 0.1193, -0.1024, -0.0205,
E -0.1126, 0.0480],
E [ 0.0479, 0.0253, -0.0691, -0.0518, -0.1335, 0.1016, -0.0699, -0.1113,
E -0.0269, 0.1003, 0.1241, 0.0168, 0.0273, -0.1267, -0.0303, -0.0313,
E 0.0080, -0.0807, 0.0885, 0.1291, -0.0525, -0.0733, 0.0704, -0.0245,
E 0.1250, -0.0754, 0.1224, -0.1165, -0.1291, -0.1365, 0.1412, 0.0068,
E 0.1128, -0.0286, 0.0092, 0.1310, -0.0834, 0.0890, -0.1287, 0.0112,
E -0.1109, -0.0088, -0.0812, 0.1078, 0.0685, -0.0095, -0.0836, 0.0313,
E 0.1368, 0.0146],
E [ 0.0500, 0.0443, 0.0487, -0.0627, -0.0461, -0.0323, -0.0025, 0.1373,
E -0.0851, -0.1374, -0.0431, -0.0513, -0.1102, -0.0971, -0.0632, -0.0291,
E -0.0575, 0.0877, -0.0693, -0.1353, 0.1394, -0.1395, 0.0458, -0.1131,
E -0.1330, 0.1307, -0.0527, 0.0849, -0.0669, 0.0005, 0.0558, 0.1317,
E -0.0124, -0.0017, -0.0891, -0.0400, 0.0913, -0.0388, -0.1388, 0.1312,
E -0.1197, 0.0558, 0.0600, -0.0582, 0.0584, 0.1312, -0.0508, 0.0919,
E 0.1182, 0.1147],
E [-0.1222, -0.1013, -0.0610, -0.1083, -0.1210, 0.1192, 0.0572, 0.0695,
E -0.0187, 0.1165, -0.0955, 0.0304, 0.0016, -0.0234, 0.1243, -0.1406,
E 0.1306, 0.0680, -0.0004, 0.0132, -0.0340, -0.0102, -0.0915, 0.1055,
E -0.1227, -0.0169, 0.0259, -0.1102, -0.0759, -0.1361, -0.0054, 0.1261,
E 0.0073, -0.0100, 0.0322, 0.0973, 0.0708, 0.0559, 0.0077, 0.0543,
E 0.0334, -0.0325, -0.0603, -0.0918, -0.0795, 0.0560, 0.0049, 0.0866,
E 0.1065, 0.1078],
E [ 0.1092, 0.1269, 0.0134, -0.1032, -0.0049, 0.0282, 0.0931, -0.0253,
E 0.0412, -0.0906, -0.0672, 0.0091, 0.1323, 0.0587, 0.0360, -0.0703,
E 0.1313, 0.0297, 0.0447, 0.0360, -0.1141, -0.1368, 0.1186, 0.0353,
E -0.0919, -0.0699, -0.1057, -0.1098, 0.1210, 0.1006, 0.0123, -0.1256,
E -0.1222, -0.1255, -0.1179, 0.0677, 0.0978, -0.1083, -0.0355, -0.0549,
E -0.0907, 0.0839, 0.0452, 0.0291, 0.0283, -0.0048, 0.1246, -0.0418,
E -0.0591, -0.0274],
E [-0.1327, 0.0765, -0.0711, -0.1398, -0.0851, 0.1156, 0.0890, -0.0477,
E -0.0240, -0.0513, 0.0639, 0.1241, -0.0770, 0.1357, 0.0375, 0.1185,
E 0.1413, 0.0844, -0.1152, -0.0550, -0.1389, 0.1353, -0.1109, -0.0761,
E 0.0663, -0.0293, -0.0479, -0.0303, -0.1002, -0.0813, -0.0583, 0.0320,
E 0.1161, -0.1387, 0.0423, -0.0680, -0.0568, 0.0894, -0.0183, -0.0503,
E -0.1405, -0.0993, -0.0882, -0.1191, 0.1114, -0.0468, 0.0368, 0.0227,
E 0.0651, -0.0205],
E [ 0.1009, -0.0631, -0.1068, 0.0724, -0.0133, 0.0564, 0.0791, -0.0070,
E 0.0009, -0.0135, -0.0610, 0.0973, 0.1119, -0.0408, -0.0797, -0.0021,
E 0.0446, -0.0459, -0.1371, 0.0652, -0.0538, -0.0543, -0.0242, 0.0633,
E -0.0280, 0.1385, 0.0986, 0.0812, -0.1076, 0.0379, 0.0937, -0.0940,
E -0.1136, -0.0697, 0.0721, -0.0743, -0.0771, 0.1386, 0.1303, -0.1327,
E 0.0323, -0.0199, 0.0193, 0.0895, 0.0244, -0.1336, -0.0821, -0.0775,
E -0.1054, -0.0005],
E [-0.0295, -0.0221, 0.0662, 0.1065, -0.0002, 0.0728, 0.1215, 0.0668,
E -0.1366, 0.0145, 0.1010, 0.0161, -0.0257, -0.0898, 0.0798, 0.0973,
E 0.0049, -0.0954, 0.0240, 0.0369, 0.0266, 0.0397, -0.0143, 0.0236,
E -0.1392, 0.0787, -0.1148, 0.0287, -0.1143, 0.0567, -0.0628, 0.0160,
E 0.1145, 0.0410, -0.1319, -0.1286, 0.1374, -0.1058, 0.1170, -0.0820,
E -0.1027, -0.1046, -0.0465, -0.0729, -0.0217, -0.0667, 0.1137, 0.0785,
E -0.1335, 0.1190],
E [ 0.0873, 0.1289, 0.1008, 0.0651, -0.0016, 0.0879, -0.1286, 0.0997,
E 0.1082, -0.0550, -0.1390, 0.0548, -0.0512, 0.1251, -0.1207, -0.0973,
E 0.1283, 0.1013, 0.0551, -0.0764, -0.1218, 0.0891, -0.0083, 0.1066,
E -0.0433, 0.0089, -0.0525, 0.0013, 0.0796, -0.0171, -0.0638, 0.0202,
E -0.0058, -0.1353, -0.0457, 0.0375, -0.1220, 0.0873, 0.0344, 0.1114,
E -0.0062, 0.0261, 0.0646, 0.0878, -0.0672, -0.0354, -0.0005, -0.0943,
E 0.1204, -0.0514],
E [ 0.1370, 0.0596, 0.0884, -0.0072, 0.0613, 0.0784, -0.0527, 0.0690,
E 0.0207, -0.0750, -0.0029, -0.0798, 0.0468, 0.0465, 0.1398, -0.1194,
E -0.0052, 0.1198, 0.1223, -0.1026, -0.0057, -0.1317, -0.0610, -0.0538,
E -0.0841, -0.0575, 0.1217, 0.0133, 0.0716, -0.0591, 0.1211, -0.0257,
E -0.1076, -0.0498, 0.0370, -0.0114, 0.1026, -0.1254, -0.0156, 0.0659,
E -0.0141, 0.0058, 0.1209, 0.0012, -0.0203, 0.1266, 0.1006, 0.0321,
E -0.0851, -0.1275],
E [ 0.1227, 0.0314, 0.0957, 0.0305, 0.0202, -0.1169, -0.1151, 0.0561,
E 0.0961, -0.0957, 0.0379, -0.0112, -0.1300, -0.0072, -0.0458, 0.0793,
E -0.1215, -0.0925, 0.1092, -0.0005, 0.0490, -0.0276, -0.0205, -0.0099,
E 0.1052, -0.0677, -0.0681, 0.0170, -0.1177, -0.0974, -0.1098, 0.1244,
E -0.0490, 0.1389, 0.1146, -0.0376, 0.0137, -0.1134, 0.0005, 0.1086,
E 0.0986, 0.1198, -0.0884, 0.1235, -0.1289, 0.0453, -0.1128, 0.0033,
E -0.0906, 0.1216],
E [ 0.1068, -0.0941, -0.0682, 0.0095, -0.0106, 0.0498, 0.1295, -0.1184,
E 0.0237, -0.0496, 0.1077, 0.0860, 0.1216, -0.0844, 0.0757, -0.0386,
E -0.0315, -0.0638, 0.0495, -0.0977, 0.1180, -0.0454, -0.0758, -0.1192,
E 0.0653, -0.0957, -0.0706, -0.0741, 0.0852, 0.0572, -0.1270, -0.0058,
E 0.0269, -0.1248, -0.1341, -0.1125, 0.0359, 0.0379, -0.0336, -0.0048,
E -0.0811, -0.0981, -0.0629, -0.0711, -0.1290, -0.0611, 0.0995, 0.0079,
E 0.0471, 0.1289]], requires_grad=True) is not cuda
E assert False
E + where False = Parameter containing:\ntensor([[-0.0144, 0.0992, -0.0512, -0.0195, 0.1068, -0.1003, 0.0110, 0.0879,\n -0.006...0.0811, -0.0981, -0.0629, -0.0711, -0.1290, -0.0611, 0.0995, 0.0079,\n 0.0471, 0.1289]], requires_grad=True).is_cuda
tests/test_trainer.py:87: AssertionError
Additional Context
Test failures observed when running in a Linux container with GPU passthrough.
- Host operating system: Windows 10 (22H2), Docker Desktop 4.25.0 (WSL 2 backend)
- GPU: NVIDIA RTX 2080 Super (driver version: 531.79)
- CUDA Version: 11.8.0 (docker image: nvidia/cuda:11.8.0-runtime-ubuntu22.04)
- `$ env | grep CUDA_VERSION`: `CUDA_VERSION=11.8.0`
- `$ nvidia-smi`:
vscode@9026efaa1337:/workspaces/deeprank2$ nvidia-smi
Wed Nov 1 18:04:50 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.50 Driver Version: 531.79 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 2080 S... On | 00000000:08:00.0 On | N/A |
| 26% 31C P8 18W / 250W| 2601MiB / 8192MiB | 7% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug: Something isn't working
Type
Projects
Status
Done