-
Notifications
You must be signed in to change notification settings - Fork 172
Expand file tree
/
Copy pathmemory.cpp
More file actions
465 lines (337 loc) · 16.2 KB
/
memory.cpp
File metadata and controls
465 lines (337 loc) · 16.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
/** @file
* Internal functions which query available CPU memory (in an
* attemptedly OS-agnostic way), and provide needed memory
* querents. Note GPU memory querying is performed by
* the dedicated GPU backend, though this file is always
* compiled (even in GPU mode) because GPU-acceleration still
* requires accompanying CPU memory arrays. This file does not
* perform any allocation of memory; that is instead performed
* by cpu_config.cpp, to be symmetric with the GPU-memory
* allocators in gpu_config.cpp, and use NUMA strategies.
*
* @author Tyson Jones
* @author Mai Đức Khang (CPU memory query)
*/
#include "quest/include/types.h"
#include "quest/include/paulis.h"

#include "quest/src/core/memory.hpp"
#include "quest/src/core/bitwise.hpp"
#include "quest/src/core/errors.hpp"

#include <cmath>
#include <cstdlib>
#include <limits>
// Platform-specific includes for RAM querying
#if defined(__linux__)
#include <sys/sysinfo.h>
#elif defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#elif defined(_WIN32)
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
/*
* MEMORY BOUNDS
*/
// returns the maximum total (i.e. global, across all nodes) number of qubits for
// which the given data structure fits in memBytesPerNode bytes per node
int getMaxNumQubitsWhichCanFitInMemory(bool isDensMatr, int numNodes, bool hasBuffer, bool isSuperOp, qindex memBytesPerNode) {

    // distribution requires communication buffers, doubling costs, halving fittable amps-per-qureg
    qindex maxLocalNumAmps = memBytesPerNode / sizeof(qcomp); // floors
    if (hasBuffer && numNodes > 1)
        maxLocalNumAmps /= 2; // floors

    // defensively avoid log2(0) below (negative-infinity, with undefined conversion
    // to int) in the edge-case that not even a single local amplitude can fit
    if (maxLocalNumAmps < 1)
        return 0;

    // density matrices require square more memory, so halve (flooring) the number of qubits
    int maxLocalNumQubits = std::floor(std::log2(maxLocalNumAmps));
    if (isDensMatr)
        maxLocalNumQubits /= 2; // floors

    // superoperators require square more memory still, so halve (flooring) the number of qubits
    if (isSuperOp)
        maxLocalNumQubits /= 2; // floors

    // doubling nodes permits 1 additional qubit
    int maxGlobalNumQubits = maxLocalNumQubits + logBase2(numNodes);
    return maxGlobalNumQubits;
}
// returns the maximum total number of Qureg qubits fittable in the given per-node memory
int mem_getMaxNumQuregQubitsWhichCanFitInMemory(bool isDensityMatrix, int numNodes, qindex memBytesPerNode) {

    // Quregs reserve room for communication buffers whenever distributed,
    // and are never superoperators
    return getMaxNumQubitsWhichCanFitInMemory(
        isDensityMatrix, numNodes,
        true,  // hasBuffer
        false, // isSuperOp
        memBytesPerNode);
}
// returns the maximum number of matrix qubits fittable in the given per-node memory
int mem_getMaxNumMatrixQubitsWhichCanFitInMemory(bool isDenseMatrix, int numNodes, qindex memBytesPerNode) {

    // matrix types never allocate communication buffers; when distributed, they
    // borrow those of the Quregs they are applied to
    constexpr bool hasBuffer = false;
    constexpr bool isSuperOp = false;
    return getMaxNumQubitsWhichCanFitInMemory(isDenseMatrix, numNodes, hasBuffer, isSuperOp, memBytesPerNode);
}
// returns the maximum number of superoperator qubits fittable in the given memory
int mem_getMaxNumSuperOpQubitsWhichCanFitInMemory(qindex memBytesPerNode) {

    // superoperators are square-bigger than their corresponding dense matrices,
    // are never distributed, and never store communication buffers
    return getMaxNumQubitsWhichCanFitInMemory(
        true,  // isDense
        1,     // numNodes
        false, // hasBuffer
        true,  // isSuperOp
        memBytesPerNode);
}
// returns the fewest qubits a structure can contain while still being distributable
// over numNodes; each node must store at least one amplitude, so we need at least
// as many qubits as it takes to index the (power-of-2 many) nodes
int mem_getMinNumQubitsForDistribution(int numNodes) {
    return logBase2(numNodes);
}
int mem_getMaxNumQuregQubitsBeforeIndexOverflow(bool isDensityMatrix) {
// cannot store more amplitudes than can be counted by the qindex type (even when distributed)
qindex maxNumAmps = std::numeric_limits<qindex>::max();
int maxNumQubits = std::floor(std::log2(maxNumAmps) / static_cast<qreal>((isDensityMatrix)? 2 : 1));
return maxNumQubits;
}
// returns the maximum number of matrix qubits before the element count overflows qindex
int mem_getMaxNumMatrixQubitsBeforeIndexOverflow(bool isDenseMatrix) {
    // matrices have the same number of amplitudes as a same-dimension Qureg
    // (dense matrix <-> density matrix, diagonal matrix <-> statevector)
    return mem_getMaxNumQuregQubitsBeforeIndexOverflow(isDenseMatrix);
}
// returns the maximum number of superoperator qubits before element indices overflow qindex
int mem_getMaxNumSuperOpQubitsBeforeIndexOverflow() {

    // an N-qubit superoperator is as big as a dense matrix upon 2N qubits,
    // so halve (flooring) the dense-matrix limit
    constexpr bool isDense = true;
    return mem_getMaxNumMatrixQubitsBeforeIndexOverflow(isDense) / 2; // floors
}
// returns the maximum number of numQubits-qubit Kraus matrices for which the
// total element count (across all matrices) remains representable by qindex
qindex mem_getMaxNumKrausMapMatricesBeforeIndexOverflow(int numQubits) {

    // each constituent matrix stores 2^(2*numQubits) elements
    qindex elemsPerMatrix = powerOf2(2 * numQubits);
    qindex maxTotalElems = std::numeric_limits<qindex>::max();
    return maxTotalElems / elemsPerMatrix; // floors
}
// returns the maximum total number of qubits before the structure's aggregate
// (all-node) memory in bytes overflows a size_t; see the defensive rationale below
int getMaxNumQubitsBeforeGlobalMemSizeofOverflow(bool isDensMatr, int numNodes, bool hasBuffer, bool isSuperOp) {

    // this function assumes we must be able to store the total
    // CPU memory (in bytes) used by a single data structure,
    // aggregate across all nodes, in a single size_t primitive.
    // This is a defensively-designed constraint; we do not ever
    // actually need to know the full memory, but assuring that
    // we could principally store it in a size_t will futureproof
    // reporter functions against future overflows etc. Note it
    // does not meaningfully restrict the maximum simulatable size
    // except on ~8 EiB supercomputers. Looking at you, Jupiter!
    size_t maxSizeof = std::numeric_limits<size_t>::max();
    size_t maxGlobalNumAmps = maxSizeof / sizeof(qcomp); // floors
    size_t maxGlobalNumQubits = std::floor(std::log2(maxGlobalNumAmps)); // floors

    // distributing Quregs requires communication buffers, doubling memory, decreasing qubits by 1
    if (hasBuffer && numNodes > 1)
        maxGlobalNumQubits -= 1;

    // density matrices have square-more amps, halving the number of qubits (AFTER buffer subtraction)
    if (isDensMatr)
        maxGlobalNumQubits /= 2; // floors

    // superoperators are square-bigger than their constituent dense matrices
    if (isSuperOp)
        maxGlobalNumQubits /= 2; // floors

    /// @todo
    /// above sometimes overestimates by one; suggesting N can fit
    /// in fact only N-1 can fit without causing overflows. It's
    /// a chore to correct this precision-agnostically, and we cannot
    /// just try get the total-memory and provoke the overflow because
    /// a call to mem_getLocalQuregMemoryRequired() would recurse! As
    /// this function is only needed for ridiculously overzealous
    /// validation, and does not risk any logical error, we simply
    /// subtract one to avoid the overflowing edge-case. The returned
    /// max-size remains completely unreachable by users of course!
    maxGlobalNumQubits -= 1;

    return maxGlobalNumQubits;
}
// returns the maximum number of Qureg qubits before global memory overflows size_t
int mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(bool isDensityMatrix, int numNodes) {

    // Quregs store communication buffers when distributed, and are never superoperators
    return getMaxNumQubitsBeforeGlobalMemSizeofOverflow(
        isDensityMatrix, numNodes,
        true,   // hasBuffer
        false); // isSuperOp
}
// returns the maximum number of matrix qubits before global memory overflows size_t
int mem_getMaxNumMatrixQubitsBeforeGlobalMemSizeofOverflow(bool isDenseMatrix, int numNodes) {

    // matrix types never store their own communication buffers; when distributed,
    // they borrow those of the Quregs they are applied to
    constexpr bool hasBuffer = false;
    constexpr bool isSuperOp = false;
    return getMaxNumQubitsBeforeGlobalMemSizeofOverflow(isDenseMatrix, numNodes, hasBuffer, isSuperOp);
}
// returns the maximum number of superoperator qubits before memory overflows size_t
int mem_getMaxNumSuperOpQubitsBeforeGlobalMemSizeofOverflow() {

    // superoperators are square-bigger than their corresponding dense matrices,
    // are never distributed, and never store communication buffers
    return getMaxNumQubitsBeforeGlobalMemSizeofOverflow(
        true,   // isDense
        1,      // numNodes
        false,  // hasBuffer
        true);  // isSuperOp
}
// returns the maximum number of numQubits-qubit Kraus matrices for which the total
// memory (in bytes) across all matrices remains representable; since total bytes
// = total elements * sizeof(qcomp), we simply divide the max-element-count matrix
// limit by sizeof(qcomp)
qindex mem_getMaxNumKrausMapMatricesBeforeLocalMemSizeofOverflow(int numQubits) {
    qindex numMatrWithMaxTotalElems = mem_getMaxNumKrausMapMatricesBeforeIndexOverflow(numQubits);
    qindex numMatrWithMaxTotalMem = numMatrWithMaxTotalElems / sizeof(qcomp); // floors
    return numMatrWithMaxTotalMem;
}
/*
* HARDWARE QUERYING
*/
// attempts to query the total physical RAM (in bytes) of the local machine,
// using the platform-specific API chosen at compile-time below; throws a
// mem::COULD_NOT_QUERY_RAM exception (declared in memory.hpp) if the platform
// is unrecognised or its query fails
qindex mem_tryGetLocalRamCapacityInBytes() {

#if defined(__linux__)

    // Linux: sysinfo() reports total RAM as a count of mem_unit-sized blocks
    struct sysinfo info;
    if (sysinfo(&info) == 0)
        return (qindex) info.totalram * info.mem_unit;

#elif defined(__APPLE__)

    // macOS: the hw.memsize sysctl reports total physical memory in bytes
    int mib[2] = {CTL_HW, HW_MEMSIZE};
    int64_t memsize = 0;
    size_t len = sizeof(memsize);
    if (sysctl(mib, 2, &memsize, &len, NULL, 0) == 0 && memsize > 0)
        return (qindex) memsize;

#elif defined(_WIN32)

    // Windows: GlobalMemoryStatusEx() reports total physical memory in bytes
    MEMORYSTATUSEX statex;
    statex.dwLength = sizeof(statex);
    if (GlobalMemoryStatusEx(&statex))
        return (qindex) statex.ullTotalPhys;

#endif

    // fallback (unknown platform, or the above query failed): throw exception
    throw (mem::COULD_NOT_QUERY_RAM) false;
}
/*
* MEMORY USAGE
*/
int mem_getEffectiveNumStateVecQubitsPerNode(int numQubits, bool isDensMatr, int numNodes) {
// compute logs directly to avoid overflows (even though validation should preclude them)
qindex logNumAmpsTotal = ((isDensMatr)? 2 : 1) * numQubits;
qindex logNumAmpsPerNode = logNumAmpsTotal - logBase2(numNodes);
return logNumAmpsPerNode;
}
// returns the total bytes consumed by the Qureg's allocated arrays across all
// nodes (CPU amps, GPU amps, and their communication buffers), or 0 if the
// total would overflow qindex (or if nothing is allocated)
qindex mem_getTotalGlobalMemoryUsed(Qureg qureg) {

    /// @todo
    /// if sizeof(qcomp) is a power of 2 (which it almost always is, c'mon now),
    /// then we could instead return the LOG of the total memory and always
    /// avoid overflow, permitting reporters to display mem=2^exp.
    /// it would also make changing units (e.g. to GB) easier.

    // work out individual array costs
    qindex memLocalArray = (qindex) mem_getLocalQuregMemoryRequired(qureg.numAmpsPerNode); // never overflows
    int numLocalArrays =
        mem_isAllocated(qureg.cpuAmps) + mem_isAllocated(qureg.cpuCommBuffer) +
        mem_isAllocated(qureg.gpuAmps) + mem_isAllocated(qureg.gpuCommBuffer); // but 4*memLocalArray might overflow

    // no allocated arrays means no memory used (and guards the division below from zero)
    if (numLocalArrays == 0)
        return 0;

    // if total local costs would overflow qindex, return 0
    qindex maxQindex = std::numeric_limits<qindex>::max();
    qindex maxLocalArrayMem = maxQindex / numLocalArrays; // floors
    if (memLocalArray > maxLocalArrayMem)
        return 0;

    // if qureg is non-distributed, compute local CPU+GPU+buffers costs and return
    qindex memLocalTotal = numLocalArrays * memLocalArray;
    if (!qureg.isDistributed)
        return memLocalTotal;

    // else if total global costs would overflow qindex, return 0
    qindex maxLocalTotalMem = maxQindex / qureg.numNodes; // floors
    if (memLocalTotal > maxLocalTotalMem)
        return 0;

    // else compute total costs between all nodes
    qindex memGlobalTotal = memLocalTotal * qureg.numNodes;
    return memGlobalTotal;
}
/*
* MEMORY REQUIRED
*/
// returns the bytes required per node to store the given structure, including
// any communication buffer; errors if the global total would overflow size_t
size_t getLocalMemoryRequired(int numQubits, int numNodes, bool isDenseMatrix, bool hasBuffers) {

    // assert no-overflow precondition (isSuperOp=false)
    if (numQubits > getMaxNumQubitsBeforeGlobalMemSizeofOverflow(isDenseMatrix, numNodes, hasBuffers, false))
        error_memSizeQueriedButWouldOverflow();

    // no risk of overflow; we have already validated numAmpsTotal fits in qindex.
    // dense structures (density matrices, dense matrices) have square-many amps
    int logNumAmpsTotal = (isDenseMatrix)? 2*numQubits : numQubits;
    qindex numAmpsPerNode = powerOf2(logNumAmpsTotal) / numNodes; // divides evenly

    // communication buffers double costs
    if (hasBuffers && numNodes > 1)
        numAmpsPerNode *= 2;

    // beware that we must cast to a size_t (which can be greater
    // than qindex) BEFORE multiplying, to avoid overflows
    return static_cast<size_t>(numAmpsPerNode) * sizeof(qcomp);
}
// returns the bytes needed per node by a Qureg of the given dimension
size_t mem_getLocalQuregMemoryRequired(int numQubits, bool isDensityMatr, int numNodes) {

    // Quregs include communication buffers whenever distributed (numNodes > 1)
    return getLocalMemoryRequired(numQubits, numNodes, isDensityMatr, true); // hasBuffers=true
}
// returns the bytes needed to store the given number of local amplitudes,
// EXCLUDING any communication buffer; errors if the result would overflow size_t
size_t mem_getLocalQuregMemoryRequired(qindex numAmpsPerNode) {

    // assert no-overflow precondition
    qindex maxNumAmpsPerNode = std::numeric_limits<size_t>::max() / sizeof(qcomp); // floors
    if (numAmpsPerNode > maxNumAmpsPerNode)
        error_memSizeQueriedButWouldOverflow();

    // cast to size_t (which can be greater than qindex) BEFORE multiplying, consistent
    // with getLocalMemoryRequired(), so the product is computed in size_t arithmetic
    return static_cast<size_t>(numAmpsPerNode) * sizeof(qcomp);
}
// returns the bytes needed per node by a matrix of the given dimension
size_t mem_getLocalMatrixMemoryRequired(int numQubits, bool isDenseMatrix, int numNodes) {

    // matrix types never allocate communication buffers; when distributed,
    // they borrow those of the Quregs they are applied to
    return getLocalMemoryRequired(numQubits, numNodes, isDenseMatrix, false); // hasBuffers=false
}
// returns the bytes needed by an N-qubit superoperator
size_t mem_getLocalSuperOpMemoryRequired(int numQubits) {

    // an N-qubit superoperator is stored as a never-distributed dense matrix upon 2N qubits
    return mem_getLocalMatrixMemoryRequired(
        2 * numQubits,
        true, // isDense
        1);   // numNodes
}
/*
* SUFFICIENT MEMORY QUERYING
*/
// returns whether a numQubits-qubit Qureg (including any communication buffers
// needed when distributed) fits within memBytesPerNode bytes on each node
bool mem_canQuregFitInMemory(int numQubits, bool isDensMatr, int numNodes, qindex memBytesPerNode) {
    return numQubits <= mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, memBytesPerNode);
}
// returns whether a numQubits-qubit matrix fits within memBytesPerNode bytes on each node
bool mem_canMatrixFitInMemory(int numQubits, bool isDense, int numNodes, qindex memBytesPerNode) {

    // this mirrors mem_canQuregFitInMemory() (diagonal matrix <-> statevector,
    // dense matrix <-> density matrix), except that distributed matrices
    // (numNodes > 1) do not store (nor need to account for) communication buffers

    // distribution reduces the number of qubits each node effectively stores
    int numLocalQubits = numQubits - logBase2(numNodes);

    // find the largest per-node qubit count the given memory accommodates
    qindex maxLocalNumElems = memBytesPerNode / sizeof(qcomp); // floors
    int maxLocalNumQubits = std::floor(std::log2(maxLocalNumElems));

    // dense matrices (as opposed to diagonals) are square-bigger
    if (isDense)
        maxLocalNumQubits /= 2; // floors

    return numLocalQubits <= maxLocalNumQubits;
}
// returns whether an N-qubit superoperator fits within the given memory
bool mem_canSuperOpFitInMemory(int numQubits, qindex numBytesPerNode) {

    // an N-qubit superoperator is equivalent to a never-distributed dense matrix upon 2N qubits
    return mem_canMatrixFitInMemory(
        2 * numQubits,
        true, // isDense
        1,    // numNodes
        numBytesPerNode);
}
// returns whether numTerms (PauliStr, coefficient) pairs fit within the given memory;
// we divide the byte budget rather than multiply numTerms, so that an enormous
// numTerms cannot overflow qindex
bool mem_canPauliStrSumFitInMemory(qindex numTerms, qindex numBytesPerNode) {
    // awkwardly arranged to avoid overflow when numTerms is too large
    size_t numBytesPerTerm = sizeof(PauliStr) + sizeof(qcomp);
    qindex maxNumTerms = numBytesPerNode / numBytesPerTerm; // floors
    return numTerms <= maxNumTerms;
}
/*
* MEMORY ALLOCATION SUCCESS
*
* which check that some or all nested pointers are
* non-NULL, indicating that all allocations involved
* in a multidimensional data structure were successful.
* Some of these functions are trivial NULL checks, but
* are still abstracted here so that data structures can
* later be adjusted (e.g. CPU matrices may be flattened).
*/
// returns whether the given pointer is non-null; trivially abstracted so that the
// mem_isAllocated() / mem_isOuterAllocated() family below reads uniformly
template <typename T>
bool isNonNull(T ptr) {
    return nullptr != ptr;
}
// fast checks, usable when validating existing heap objects, which
// inspect only whether the outermost allocation is valid. These are
// useful for detecting that a user has manually NULL'd a heap pointer
// (e.g. because they are tracking freed objects), but they cannot
// verify that the initial memory mallocs were successful.

bool mem_isOuterAllocated(qcomp*   ptr) { return ptr != nullptr; }
bool mem_isOuterAllocated(qcomp**  ptr) { return ptr != nullptr; }
bool mem_isOuterAllocated(qcomp*** ptr) { return ptr != nullptr; }
// thorough checks that every nested pointer in a heap structure is
// non-NULL, implying each points to valid, existing heap memory.
// These are used by validation immediately after allocation.

bool mem_isAllocated(int*      heapflag) { return heapflag != nullptr; }
bool mem_isAllocated(qcomp*    array)    { return array    != nullptr; }
bool mem_isAllocated(PauliStr* array)    { return array    != nullptr; }
// returns whether the matrix's outer allocation and every one of its numRows rows are non-NULL
bool mem_isAllocated(qcomp** matrix, qindex numRows) {

    // the outer row-pointer array must itself exist...
    if (!matrix)
        return false;

    // ...as must every row therein (checked flatly, rather than
    // recursing per-row, for an insignificant speedup)
    for (qindex row=0; row<numRows; row++)
        if (!matrix[row])
            return false;

    return true;
}
// returns whether the list, and every nested pointer of its numMatrices matrices, are non-NULL
bool mem_isAllocated(qcomp*** matrixList, qindex numRows, int numMatrices) {

    // the outer list allocation must itself exist...
    if (!matrixList)
        return false;

    // ...as must every matrix within (recursing is fine; we expect few matrices)
    for (qindex n=0; n<numMatrices; n++)
        if (!mem_isAllocated(matrixList[n], numRows))
            return false;

    return true;
}