QuEST_gpu.cu
// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details

# include "QuEST.h"
# include "QuEST_precision.h"
# include "QuEST_internal.h"    // purely to resolve getQuESTDefaultSeedKey
# include "mt19937ar.h"

# include <stdlib.h>
# include <stdio.h>
# include <math.h>
# ifdef _OPENMP
# include <omp.h>               // for omp_get_max_threads() (missing from the original listing)
# endif

# define REDUCE_SHARED_SIZE 512
# define DEBUG 0


/*
 * struct types for concisely passing unitaries to kernels
 */

// hide these from doxygen

typedef struct ArgMatrix2 {
    Complex r0c0, r0c1;
    Complex r1c0, r1c1;
} ArgMatrix2;

typedef struct ArgMatrix4 {
    Complex r0c0, r0c1, r0c2, r0c3;
    Complex r1c0, r1c1, r1c2, r1c3;
    Complex r2c0, r2c1, r2c2, r2c3;
    Complex r3c0, r3c1, r3c2, r3c3;
} ArgMatrix4;

ArgMatrix2 argifyMatrix2(ComplexMatrix2 m) {
    ArgMatrix2 a;
    a.r0c0.real=m.real[0][0]; a.r0c0.imag=m.imag[0][0];
    a.r0c1.real=m.real[0][1]; a.r0c1.imag=m.imag[0][1];
    a.r1c0.real=m.real[1][0]; a.r1c0.imag=m.imag[1][0];
    a.r1c1.real=m.real[1][1]; a.r1c1.imag=m.imag[1][1];
    return a;
}

ArgMatrix4 argifyMatrix4(ComplexMatrix4 m) {
    ArgMatrix4 a;
    a.r0c0.real=m.real[0][0]; a.r0c0.imag=m.imag[0][0];
    a.r0c1.real=m.real[0][1]; a.r0c1.imag=m.imag[0][1];
    a.r0c2.real=m.real[0][2]; a.r0c2.imag=m.imag[0][2];
    a.r0c3.real=m.real[0][3]; a.r0c3.imag=m.imag[0][3];
    a.r1c0.real=m.real[1][0]; a.r1c0.imag=m.imag[1][0];
    a.r1c1.real=m.real[1][1]; a.r1c1.imag=m.imag[1][1];
    a.r1c2.real=m.real[1][2]; a.r1c2.imag=m.imag[1][2];
    a.r1c3.real=m.real[1][3]; a.r1c3.imag=m.imag[1][3];
    a.r2c0.real=m.real[2][0]; a.r2c0.imag=m.imag[2][0];
    a.r2c1.real=m.real[2][1]; a.r2c1.imag=m.imag[2][1];
    a.r2c2.real=m.real[2][2]; a.r2c2.imag=m.imag[2][2];
    a.r2c3.real=m.real[2][3]; a.r2c3.imag=m.imag[2][3];
    a.r3c0.real=m.real[3][0]; a.r3c0.imag=m.imag[3][0];
    a.r3c1.real=m.real[3][1]; a.r3c1.imag=m.imag[3][1];
    a.r3c2.real=m.real[3][2]; a.r3c2.imag=m.imag[3][2];
    a.r3c3.real=m.real[3][3]; a.r3c3.imag=m.imag[3][3];
    return a;
}
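
/* Illustrative usage (see the kernel launches later in this file): the Arg
 * structs above are plain pass-by-value kernel arguments, converted from the
 * public matrix types at each launch, e.g.
 *
 *     statevec_unitaryKernel<<<blocks, threads>>>(qureg, targ, argifyMatrix2(u));
 */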


/*
 * in-kernel bit twiddling functions
 */

__forceinline__ __device__ int extractBit (const int locationOfBitFromRight, const long long int theEncodedNumber) {
    return (theEncodedNumber & ( 1LL << locationOfBitFromRight )) >> locationOfBitFromRight;
}

__forceinline__ __device__ int getBitMaskParity(long long int mask) {
    int parity = 0;
    while (mask) {
        parity = !parity;
        mask = mask & (mask-1);
    }
    return parity;
}
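
/* Illustrative example: mask = 0b1011 has three set bits; the loop clears the
 * lowest set bit each iteration (0b1011 -> 0b1010 -> 0b1000 -> 0), toggling
 * parity three times, so getBitMaskParity(0b1011) == 1.
 */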

__forceinline__ __device__ long long int flipBit(const long long int number, const int bitInd) {
    return (number ^ (1LL << bitInd));
}

__forceinline__ __device__ long long int insertZeroBit(const long long int number, const int index) {
    long long int left, right;
    left = (number >> index) << index;
    right = number - left;
    return (left << 1) ^ right;
}
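
/* Illustrative example: insertZeroBit(0b1011, 2) == 0b10011. Bits at positions
 * >= 2 are shifted up by one, a zero is left at position 2, and bits below
 * position 2 are unchanged.
 */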

__forceinline__ __device__ long long int insertTwoZeroBits(const long long int number, const int bit1, const int bit2) {
    int small = (bit1 < bit2)? bit1 : bit2;
    int big = (bit1 < bit2)? bit2 : bit1;
    return insertZeroBit(insertZeroBit(number, small), big);
}

__forceinline__ __device__ long long int insertZeroBits(long long int number, int* inds, const int numInds) {
    /* inserted bit inds must strictly increase, so that their final indices are correct.
     * In lieu of sorting (avoided since there are no C++ variable-size arrays, and since we're
     * already memory bottle-necked so overhead hides this slowdown), we find the next-smallest
     * index at each insert. Recall every element of inds (a zero or positive number) is unique.
     * This function won't appear in the CPU code, which can use C99 variable-size arrays and
     * ought to build a sorted array before threading.
     */
    int curMin = inds[0];
    int prevMin = -1;
    for (int n=0; n < numInds; n++) {

        // find next min
        for (int t=0; t < numInds; t++)
            if (inds[t]>prevMin && inds[t]<curMin)
                curMin = inds[t];

        number = insertZeroBit(number, curMin);

        // set curMin to an arbitrary non-visited elem
        prevMin = curMin;
        for (int t=0; t < numInds; t++)
            if (inds[t] > curMin) {
                curMin = inds[t];
                break;
            }
    }
    return number;
}
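
/* Illustrative example: with inds = {1, 3}, insertZeroBits(0b111, inds, 2)
 * == 0b10101. The zeros are inserted in increasing position order (first at
 * position 1, then at 3), so the original bits land at positions 0, 2 and 4.
 */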


/*
 * state vector and density matrix operations
 */

#ifdef __cplusplus
extern "C" {
#endif


void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) {

    cudaDeviceSynchronize();
    cudaMemcpy(
        qureg.deviceStateVec.real + startInd,
        reals,
        numAmps * sizeof(*(qureg.deviceStateVec.real)),
        cudaMemcpyHostToDevice);
    cudaMemcpy(
        qureg.deviceStateVec.imag + startInd,
        imags,
        numAmps * sizeof(*(qureg.deviceStateVec.imag)),
        cudaMemcpyHostToDevice);
}

void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) {

    // copy copyQureg's GPU statevec to targetQureg's GPU statevec
    cudaDeviceSynchronize();
    cudaMemcpy(
        targetQureg.deviceStateVec.real,
        copyQureg.deviceStateVec.real,
        targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.real)),
        cudaMemcpyDeviceToDevice);
    cudaMemcpy(
        targetQureg.deviceStateVec.imag,
        copyQureg.deviceStateVec.imag,
        targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.imag)),
        cudaMemcpyDeviceToDevice);
}

__global__ void densmatr_initPureStateKernel(
    long long int numPureAmps,
    qreal *targetVecReal, qreal *targetVecImag,
    qreal *copyVecReal, qreal *copyVecImag)
{
    // this is a particular index of the pure copyQureg
    long long int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=numPureAmps) return;

    qreal realRow = copyVecReal[index];
    qreal imagRow = copyVecImag[index];
    for (long long int col=0; col < numPureAmps; col++) {
        qreal realCol =   copyVecReal[col];
        qreal imagCol = - copyVecImag[col]; // minus for conjugation
        targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
        targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
    }
}

void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
    densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        copyQureg.numAmpsPerChunk,
        targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
        copyQureg.deviceStateVec.real,   copyQureg.deviceStateVec.imag);
}
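
/* Illustrative note: the kernel above writes the outer product rho = |psi><psi|,
 * placing element rho(row, col) = psi_row * conj(psi_col) at flat index
 * col*numPureAmps + row of the density matrix's statevector storage.
 */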

__global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){
    long long int index;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    stateVecReal[index] = probFactor;
    stateVecImag[index] = 0.0;
}

void densmatr_initPlusState(Qureg qureg)
{
    qreal probFactor = 1.0/((qreal) (1LL << qureg.numQubitsRepresented));
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    densmatr_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        probFactor,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag);
}

__global__ void densmatr_initClassicalStateKernel(
    long long int densityNumElems,
    qreal *densityReal, qreal *densityImag,
    long long int densityInd)
{
    // initialise the state to all zeros
    long long int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index >= densityNumElems) return;

    densityReal[index] = 0.0;
    densityImag[index] = 0.0;

    if (index==densityInd){
        // classical state has probability 1
        densityReal[densityInd] = 1.0;
        densityImag[densityInd] = 0.0;
    }
}

void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);

    // index of the desired state in the flat density matrix
    long long int densityDim = 1LL << qureg.numQubitsRepresented;
    long long int densityInd = (densityDim + 1)*stateInd;

    // identical to pure version
    densmatr_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag, densityInd);
}
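
/* Illustrative example: in the flat density-matrix layout above, the diagonal
 * element (s, s) of an n-qubit density matrix sits at flat index
 * s*2^n + s = (2^n + 1)*s; e.g. n = 2, stateInd = 3 gives densityInd = 15.
 */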

void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env)
{
    // allocate CPU memory
    long long int numAmps = 1LL << numQubits;
    long long int numAmpsPerRank = numAmps/env.numRanks;
    qureg->stateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->stateVec.real)));
    qureg->stateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->stateVec.imag)));
    if (env.numRanks>1){
        qureg->pairStateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->pairStateVec.real)));
        qureg->pairStateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->pairStateVec.imag)));
    }

    // check cpu memory allocation was successful
    if ( (!(qureg->stateVec.real) || !(qureg->stateVec.imag))
            && numAmpsPerRank ) {
        printf("Could not allocate memory!\n");
        exit (EXIT_FAILURE);
    }
    if ( env.numRanks>1 && (!(qureg->pairStateVec.real) || !(qureg->pairStateVec.imag))
            && numAmpsPerRank ) {
        printf("Could not allocate memory!\n");
        exit (EXIT_FAILURE);
    }

    qureg->numQubitsInStateVec = numQubits;
    qureg->numAmpsPerChunk = numAmpsPerRank;
    qureg->numAmpsTotal = numAmps;
    qureg->chunkId = env.rank;
    qureg->numChunks = env.numRanks;
    qureg->isDensityMatrix = 0;

    // allocate GPU memory
    cudaMalloc(&(qureg->deviceStateVec.real), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.real)));
    cudaMalloc(&(qureg->deviceStateVec.imag), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.imag)));
    cudaMalloc(&(qureg->firstLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)REDUCE_SHARED_SIZE)*sizeof(qreal));
    cudaMalloc(&(qureg->secondLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)(REDUCE_SHARED_SIZE*REDUCE_SHARED_SIZE))*
            sizeof(qreal));

    // check gpu memory allocation was successful
    if (!(qureg->deviceStateVec.real) || !(qureg->deviceStateVec.imag)){
        printf("Could not allocate memory on GPU!\n");
        exit (EXIT_FAILURE);
    }
}

void statevec_destroyQureg(Qureg qureg, QuESTEnv env)
{
    // Free CPU memory
    free(qureg.stateVec.real);
    free(qureg.stateVec.imag);
    if (env.numRanks>1){
        free(qureg.pairStateVec.real);
        free(qureg.pairStateVec.imag);
    }

    // Free GPU memory
    cudaFree(qureg.deviceStateVec.real);
    cudaFree(qureg.deviceStateVec.imag);
    cudaFree(qureg.firstLevelReduction);
    cudaFree(qureg.secondLevelReduction);
}

DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {

    DiagonalOp op;
    op.numQubits = numQubits;
    op.numElemsPerChunk = (1LL << numQubits) / env.numRanks;
    op.chunkId = env.rank;
    op.numChunks = env.numRanks;

    // allocate CPU memory (initialised to zero)
    op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
    op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
    // @TODO no handling of rank>1 allocation (no distributed GPU)

    // check cpu memory allocation was successful
    if ( !op.real || !op.imag ) {
        printf("Could not allocate memory!\n");
        exit(EXIT_FAILURE);
    }

    // allocate GPU memory
    size_t arrSize = op.numElemsPerChunk * sizeof(qreal);
    cudaMalloc(&(op.deviceOperator.real), arrSize);
    cudaMalloc(&(op.deviceOperator.imag), arrSize);

    // check gpu memory allocation was successful
    if (!op.deviceOperator.real || !op.deviceOperator.imag) {
        printf("Could not allocate memory on GPU!\n");
        exit(EXIT_FAILURE);
    }

    // initialise GPU memory to zero
    cudaMemset(op.deviceOperator.real, 0, arrSize);
    cudaMemset(op.deviceOperator.imag, 0, arrSize);

    return op;
}

void agnostic_destroyDiagonalOp(DiagonalOp op) {
    free(op.real);
    free(op.imag);
    cudaFree(op.deviceOperator.real);
    cudaFree(op.deviceOperator.imag);
}

void agnostic_syncDiagonalOp(DiagonalOp op) {

    size_t arrSize = (1LL << op.numQubits) * sizeof(qreal);
    cudaDeviceSynchronize();
    cudaMemcpy(op.deviceOperator.real, op.real, arrSize, cudaMemcpyHostToDevice);
    cudaMemcpy(op.deviceOperator.imag, op.imag, arrSize, cudaMemcpyHostToDevice);
}

int GPUExists(void){
    int deviceCount, device;
    int gpuDeviceCount = 0;
    struct cudaDeviceProp properties;
    cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
    if (cudaResultCode != cudaSuccess) deviceCount = 0;
    /* machines with no GPUs can still report one emulation device */
    for (device = 0; device < deviceCount; ++device) {
        cudaGetDeviceProperties(&properties, device);
        if (properties.major != 9999) { /* 9999 means emulation only */
            ++gpuDeviceCount;
        }
    }
    if (gpuDeviceCount) return 1;
    else return 0;
}

QuESTEnv createQuESTEnv(void) {

    if (!GPUExists()){
        printf("Trying to run GPU code with no GPU available\n");
        exit(EXIT_FAILURE);
    }

    QuESTEnv env;
    env.rank=0;
    env.numRanks=1;

    seedQuESTDefault();

    return env;
}

void syncQuESTEnv(QuESTEnv env){
    cudaDeviceSynchronize();
}

int syncQuESTSuccess(int successCode){
    return successCode;
}

void destroyQuESTEnv(QuESTEnv env){
    // MPI finalize goes here in MPI version. Call this function anyway for consistency
}

void reportQuESTEnv(QuESTEnv env){
    printf("EXECUTION ENVIRONMENT:\n");
    printf("Running locally on one node with GPU\n");
    printf("Number of ranks is %d\n", env.numRanks);
# ifdef _OPENMP
    printf("OpenMP enabled\n");
    printf("Number of threads available is %d\n", omp_get_max_threads());
# else
    printf("OpenMP disabled\n");
# endif
}

void getEnvironmentString(QuESTEnv env, Qureg qureg, char str[200]){
    sprintf(str, "%dqubits_GPU_noMpi_noOMP", qureg.numQubitsInStateVec);
}

void copyStateToGPU(Qureg qureg)
{
    if (DEBUG) printf("Copying data to GPU\n");
    cudaMemcpy(qureg.deviceStateVec.real, qureg.stateVec.real,
            qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyHostToDevice);
    cudaMemcpy(qureg.deviceStateVec.imag, qureg.stateVec.imag,
            qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyHostToDevice);
    if (DEBUG) printf("Finished copying data to GPU\n");
}

void copyStateFromGPU(Qureg qureg)
{
    cudaDeviceSynchronize();
    if (DEBUG) printf("Copying data from GPU\n");
    cudaMemcpy(qureg.stateVec.real, qureg.deviceStateVec.real,
            qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyDeviceToHost);
    cudaMemcpy(qureg.stateVec.imag, qureg.deviceStateVec.imag,
            qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyDeviceToHost);
    if (DEBUG) printf("Finished copying data from GPU\n");
}

void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){
    long long int index;
    int rank;
    copyStateFromGPU(qureg);
    if (qureg.numQubitsInStateVec<=5){
        for (rank=0; rank<qureg.numChunks; rank++){
            if (qureg.chunkId==rank){
                if (reportRank) {
                    printf("Reporting state from rank %d [\n", qureg.chunkId);
                    //printf("\trank, index, real, imag\n");
                    printf("real, imag\n");
                } else if (rank==0) {
                    printf("Reporting state [\n");
                    printf("real, imag\n");
                }

                for(index=0; index<qureg.numAmpsPerChunk; index++){
                    printf(REAL_STRING_FORMAT ", " REAL_STRING_FORMAT "\n", qureg.stateVec.real[index], qureg.stateVec.imag[index]);
                }
                if (reportRank || rank==qureg.numChunks-1) printf("]\n");
            }
            syncQuESTEnv(env);
        }
    }
}

qreal statevec_getRealAmp(Qureg qureg, long long int index){
    qreal el=0;
    cudaMemcpy(&el, &(qureg.deviceStateVec.real[index]),
            sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyDeviceToHost);
    return el;
}

qreal statevec_getImagAmp(Qureg qureg, long long int index){
    qreal el=0;
    cudaMemcpy(&el, &(qureg.deviceStateVec.imag[index]),
            sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyDeviceToHost);
    return el;
}

__global__ void statevec_initBlankStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
    long long int index;

    // initialise the statevector to be all-zeros
    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;
    stateVecReal[index] = 0.0;
    stateVecImag[index] = 0.0;
}

void statevec_initBlankState(Qureg qureg)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_initBlankStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag);
}

__global__ void statevec_initZeroStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
    long long int index;

    // initialise the state to |0000..0000>
    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;
    stateVecReal[index] = 0.0;
    stateVecImag[index] = 0.0;

    if (index==0){
        // zero state |0000..0000> has probability 1
        stateVecReal[0] = 1.0;
        stateVecImag[0] = 0.0;
    }
}

void statevec_initZeroState(Qureg qureg)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_initZeroStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag);
}

__global__ void statevec_initPlusStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
    long long int index;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    qreal normFactor = 1.0/sqrt((qreal)stateVecSize);
    stateVecReal[index] = normFactor;
    stateVecImag[index] = 0.0;
}

void statevec_initPlusState(Qureg qureg)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag);
}

__global__ void statevec_initClassicalStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, long long int stateInd){
    long long int index;

    // initialise the state to |stateInd>
    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;
    stateVecReal[index] = 0.0;
    stateVecImag[index] = 0.0;

    if (index==stateInd){
        // classical state has probability 1
        stateVecReal[stateInd] = 1.0;
        stateVecImag[stateInd] = 0.0;
    }
}

void statevec_initClassicalState(Qureg qureg, long long int stateInd)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag, stateInd);
}

__global__ void statevec_initDebugStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
    long long int index;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    stateVecReal[index] = (index*2.0)/10.0;
    stateVecImag[index] = (index*2.0+1.0)/10.0;
}

void statevec_initDebugState(Qureg qureg)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_initDebugStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg.numAmpsPerChunk,
        qureg.deviceStateVec.real,
        qureg.deviceStateVec.imag);
}

__global__ void statevec_initStateOfSingleQubitKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, int qubitId, int outcome){
    long long int index;
    int bit;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2);
    bit = extractBit(qubitId, index);
    if (bit==outcome) {
        stateVecReal[index] = normFactor;
        stateVecImag[index] = 0.0;
    } else {
        stateVecReal[index] = 0.0;
        stateVecImag[index] = 0.0;
    }
}

void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg->numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_initStateOfSingleQubitKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg->numAmpsPerChunk, qureg->deviceStateVec.real, qureg->deviceStateVec.imag, qubitId, outcome);
}

// returns 1 if successful, else 0
int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env){
    long long int chunkSize, stateVecSize;
    long long int indexInChunk, totalIndex;

    chunkSize = qureg->numAmpsPerChunk;
    stateVecSize = chunkSize*qureg->numChunks;

    qreal *stateVecReal = qureg->stateVec.real;
    qreal *stateVecImag = qureg->stateVec.imag;

    FILE *fp;
    char line[200];

    fp = fopen(filename, "r");
    if (fp == NULL)
        return 0;

    indexInChunk = 0; totalIndex = 0;
    while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndex<stateVecSize){
        if (line[0]!='#'){
            int chunkId = totalIndex/chunkSize;
            if (chunkId==qureg->chunkId){
                # if QuEST_PREC==1
                sscanf(line, "%f, %f", &(stateVecReal[indexInChunk]),
                        &(stateVecImag[indexInChunk]));
                # elif QuEST_PREC==2
                sscanf(line, "%lf, %lf", &(stateVecReal[indexInChunk]),
                        &(stateVecImag[indexInChunk]));
                # elif QuEST_PREC==4
                sscanf(line, "%Lf, %Lf", &(stateVecReal[indexInChunk]),
                        &(stateVecImag[indexInChunk]));
                # endif
                indexInChunk += 1;
            }
            totalIndex += 1;
        }
    }
    fclose(fp);
    copyStateToGPU(*qureg);

    // indicate success
    return 1;
}

int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision){
    qreal diff;
    long long int chunkSize = mq1.numAmpsPerChunk;

    copyStateFromGPU(mq1);
    copyStateFromGPU(mq2);

    for (long long int i=0; i<chunkSize; i++){
        diff = mq1.stateVec.real[i] - mq2.stateVec.real[i];
        if (diff<0) diff *= -1;
        if (diff>precision) return 0;
        diff = mq1.stateVec.imag[i] - mq2.stateVec.imag[i];
        if (diff<0) diff *= -1;
        if (diff>precision) return 0;
    }
    return 1;
}

__global__ void statevec_compactUnitaryKernel (Qureg qureg, int rotQubit, Complex alpha, Complex beta){
    // ----- sizes
    long long int sizeBlock,         // size of blocks
         sizeHalfBlock;              // size of blocks halved
    // ----- indices
    long long int thisBlock,         // current block
         indexUp,indexLo;            // current index and corresponding index in lower half block

    // ----- temp variables
    qreal stateRealUp,stateRealLo,   // storage for previous state values
         stateImagUp,stateImagLo;    // (used in updates)
    // ----- temp variables
    long long int thisTask;          // task-based approach for exposing loop parallelism at small granularity
    long long int numTasks=qureg.numAmpsPerChunk>>1;

    sizeHalfBlock = 1LL << rotQubit;     // size of blocks halved
    sizeBlock     = 2LL * sizeHalfBlock; // size of blocks

    // ---------------------------------------------------------------- //
    //            rotate                                                //
    // ---------------------------------------------------------------- //

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;
    qreal alphaImag=alpha.imag, alphaReal=alpha.real;
    qreal betaImag=beta.imag, betaReal=beta.real;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    // store current state vector values in temp variables
    stateRealUp = stateVecReal[indexUp];
    stateImagUp = stateVecImag[indexUp];

    stateRealLo = stateVecReal[indexLo];
    stateImagLo = stateVecImag[indexLo];

    // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
    stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
        - betaReal*stateRealLo - betaImag*stateImagLo;
    stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
        - betaReal*stateImagLo + betaImag*stateRealLo;

    // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
    stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
        + alphaReal*stateRealLo + alphaImag*stateImagLo;
    stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
        + alphaReal*stateImagLo - alphaImag*stateRealLo;
}

void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_compactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, alpha, beta);
}
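
/* Illustrative example of the block indexing above: for rotQubit = 1,
 * sizeHalfBlock = 2 and sizeBlock = 4, so tasks 0..3 update the amplitude
 * pairs (0,2), (1,3), (4,6), (5,7) -- each pair differing only in bit 1.
 */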

__global__ void statevec_controlledCompactUnitaryKernel (Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta){
    // ----- sizes
    long long int sizeBlock,         // size of blocks
         sizeHalfBlock;              // size of blocks halved
    // ----- indices
    long long int thisBlock,         // current block
         indexUp,indexLo;            // current index and corresponding index in lower half block

    // ----- temp variables
    qreal stateRealUp,stateRealLo,   // storage for previous state values
         stateImagUp,stateImagLo;    // (used in updates)
    // ----- temp variables
    long long int thisTask;          // task-based approach for exposing loop parallelism at small granularity
    long long int numTasks=qureg.numAmpsPerChunk>>1;
    int controlBit;

    sizeHalfBlock = 1LL << targetQubit;  // size of blocks halved
    sizeBlock     = 2LL * sizeHalfBlock; // size of blocks

    // ---------------------------------------------------------------- //
    //            rotate                                                //
    // ---------------------------------------------------------------- //

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;
    qreal alphaImag=alpha.imag, alphaReal=alpha.real;
    qreal betaImag=beta.imag, betaReal=beta.real;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    controlBit = extractBit(controlQubit, indexUp);
    if (controlBit){
        // store current state vector values in temp variables
        stateRealUp = stateVecReal[indexUp];
        stateImagUp = stateVecImag[indexUp];

        stateRealLo = stateVecReal[indexLo];
        stateImagLo = stateVecImag[indexLo];

        // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
        stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
            - betaReal*stateRealLo - betaImag*stateImagLo;
        stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
            - betaReal*stateImagLo + betaImag*stateRealLo;

        // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
        stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
            + alphaReal*stateRealLo + alphaImag*stateImagLo;
        stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
            + alphaReal*stateImagLo - alphaImag*stateRealLo;
    }
}

void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_controlledCompactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, alpha, beta);
}

__global__ void statevec_unitaryKernel(Qureg qureg, int targetQubit, ArgMatrix2 u){
    // ----- sizes
    long long int sizeBlock,         // size of blocks
         sizeHalfBlock;              // size of blocks halved
    // ----- indices
    long long int thisBlock,         // current block
         indexUp,indexLo;            // current index and corresponding index in lower half block

    // ----- temp variables
    qreal stateRealUp,stateRealLo,   // storage for previous state values
         stateImagUp,stateImagLo;    // (used in updates)
    // ----- temp variables
    long long int thisTask;          // task-based approach for exposing loop parallelism at small granularity
    long long int numTasks=qureg.numAmpsPerChunk>>1;

    sizeHalfBlock = 1LL << targetQubit;  // size of blocks halved
    sizeBlock     = 2LL * sizeHalfBlock; // size of blocks

    // ---------------------------------------------------------------- //
    //            rotate                                                //
    // ---------------------------------------------------------------- //

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    // store current state vector values in temp variables
    stateRealUp = stateVecReal[indexUp];
    stateImagUp = stateVecImag[indexUp];

    stateRealLo = stateVecReal[indexLo];
    stateImagLo = stateVecImag[indexLo];

    // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
    stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
        + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
    stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
        + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;

    // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
    stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
        + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
    stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
        + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
}

void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_unitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, argifyMatrix2(u));
}

__global__ void statevec_multiControlledMultiQubitUnitaryKernel(
    Qureg qureg, long long int ctrlMask, int* targs, int numTargs,
    qreal* uRe, qreal* uIm, long long int* ampInds, qreal* reAmps, qreal* imAmps, long long int numTargAmps)
{
    // decide the amplitudes this thread will modify
    long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    long long int numTasks = qureg.numAmpsPerChunk >> numTargs; // kernel called on every 1 in 2^numTargs amplitudes
    if (thisTask>=numTasks) return;

    // find this task's start index (where all targs are 0)
    long long int ind00 = insertZeroBits(thisTask, targs, numTargs);

    // this task only modifies amplitudes if control qubits are 1 for this state
    if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
        return;

    qreal *reVec = qureg.deviceStateVec.real;
    qreal *imVec = qureg.deviceStateVec.imag;

    /*
    each thread needs:
        long long int ampInds[numAmps];
        qreal reAmps[numAmps];
        qreal imAmps[numAmps];
    but instead has access to global arrays shared by all threads, with the
    stride and offset below
    */
    size_t stride = gridDim.x*blockDim.x;
    size_t offset = blockIdx.x*blockDim.x + threadIdx.x;

    // determine the indices and record values of target amps
    long long int ind;
    for (int i=0; i < numTargAmps; i++) {

        // get global index of current target qubit assignment
        ind = ind00;
        for (int t=0; t < numTargs; t++)
            if (extractBit(t, i))
                ind = flipBit(ind, targs[t]);

        ampInds[i*stride+offset] = ind;
        reAmps [i*stride+offset] = reVec[ind];
        imAmps [i*stride+offset] = imVec[ind];
    }

    // update the amplitudes
    for (int r=0; r < numTargAmps; r++) {
        ind = ampInds[r*stride+offset];
        reVec[ind] = 0;
        imVec[ind] = 0;
        for (int c=0; c < numTargAmps; c++) {
            qreal uReElem = uRe[c + r*numTargAmps];
            qreal uImElem = uIm[c + r*numTargAmps];
            reVec[ind] += reAmps[c*stride+offset]*uReElem - imAmps[c*stride+offset]*uImElem;
            imVec[ind] += reAmps[c*stride+offset]*uImElem + imAmps[c*stride+offset]*uReElem;
        }
    }
}
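
/* Illustrative note on the strided layout above: a thread with global id
 * `offset` keeps its i-th amplitude at ampInds[i*stride + offset], where
 * stride = gridDim.x*blockDim.x is the total thread count. For a fixed i,
 * consecutive threads then touch consecutive addresses, keeping the global
 * memory accesses coalesced.
 */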

void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int* targs, int numTargs, ComplexMatrixN u)
{
    int threadsPerCUDABlock = 128;
    int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>numTargs)/threadsPerCUDABlock);

    // allocate device space for global {targs} (length: numTargs) and populate
    int *d_targs;
    size_t targMemSize = numTargs * sizeof *d_targs;
    cudaMalloc(&d_targs, targMemSize);
    cudaMemcpy(d_targs, targs, targMemSize, cudaMemcpyHostToDevice);

    // flatten out the u.real and u.imag lists
    int uNumRows = (1 << u.numQubits);
    qreal* uReFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uReFlat);
    qreal* uImFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uImFlat);
    long long int i = 0;
    for (int r=0; r < uNumRows; r++)
        for (int c=0; c < uNumRows; c++) {
            uReFlat[i] = u.real[r][c];
            uImFlat[i] = u.imag[r][c];
            i++;
        }

    // allocate device space for global u.real and u.imag (flattened by concatenating rows) and populate
    qreal* d_uRe;
    qreal* d_uIm;
    size_t uMemSize = uNumRows*uNumRows * sizeof *d_uRe; // size of each of d_uRe and d_uIm
    cudaMalloc(&d_uRe, uMemSize);
    cudaMalloc(&d_uIm, uMemSize);
    cudaMemcpy(d_uRe, uReFlat, uMemSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_uIm, uImFlat, uMemSize, cudaMemcpyHostToDevice);

    // allocate device workspace for thread-local {ampInds}, {reAmps}, {imAmps} (length: 1<<numTargs)
    long long int *d_ampInds;
    qreal *d_reAmps;
    qreal *d_imAmps;
    size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
    int numTargAmps = uNumRows;
    cudaMalloc(&d_ampInds, numTargAmps*gridSize * sizeof *d_ampInds);
    cudaMalloc(&d_reAmps,  numTargAmps*gridSize * sizeof *d_reAmps);
    cudaMalloc(&d_imAmps,  numTargAmps*gridSize * sizeof *d_imAmps);

    // call kernel
    statevec_multiControlledMultiQubitUnitaryKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
        qureg, ctrlMask, d_targs, numTargs, d_uRe, d_uIm, d_ampInds, d_reAmps, d_imAmps, numTargAmps);

    // free kernel memory
    free(uReFlat);
    free(uImFlat);
    cudaFree(d_targs);
    cudaFree(d_uRe);
    cudaFree(d_uIm);
    cudaFree(d_ampInds);
    cudaFree(d_reAmps);
    cudaFree(d_imAmps);
}

__global__ void statevec_multiControlledTwoQubitUnitaryKernel(Qureg qureg, long long int ctrlMask, int q1, int q2, ArgMatrix4 u){

    // decide the 4 amplitudes this thread will modify
    long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    long long int numTasks = qureg.numAmpsPerChunk >> 2; // kernel called on every 1 in 4 amplitudes
    if (thisTask>=numTasks) return;

    qreal *reVec = qureg.deviceStateVec.real;
    qreal *imVec = qureg.deviceStateVec.imag;

    // find indices of amplitudes to modify (treat q1 as the least significant bit)
    long long int ind00, ind01, ind10, ind11;
    ind00 = insertTwoZeroBits(thisTask, q1, q2);

    // modify only if control qubits are 1 for this state
    if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
        return;

    ind01 = flipBit(ind00, q1);
    ind10 = flipBit(ind00, q2);
    ind11 = flipBit(ind01, q2);

    // extract statevec amplitudes
    qreal re00, re01, re10, re11;
    qreal im00, im01, im10, im11;
    re00 = reVec[ind00]; im00 = imVec[ind00];
    re01 = reVec[ind01]; im01 = imVec[ind01];
    re10 = reVec[ind10]; im10 = imVec[ind10];
    re11 = reVec[ind11]; im11 = imVec[ind11];

    // apply u * {amp00, amp01, amp10, amp11}
    reVec[ind00] =
        u.r0c0.real*re00 - u.r0c0.imag*im00 +
        u.r0c1.real*re01 - u.r0c1.imag*im01 +
        u.r0c2.real*re10 - u.r0c2.imag*im10 +
        u.r0c3.real*re11 - u.r0c3.imag*im11;
    imVec[ind00] =
        u.r0c0.imag*re00 + u.r0c0.real*im00 +
        u.r0c1.imag*re01 + u.r0c1.real*im01 +
        u.r0c2.imag*re10 + u.r0c2.real*im10 +
        u.r0c3.imag*re11 + u.r0c3.real*im11;

    reVec[ind01] =
        u.r1c0.real*re00 - u.r1c0.imag*im00 +
        u.r1c1.real*re01 - u.r1c1.imag*im01 +
        u.r1c2.real*re10 - u.r1c2.imag*im10 +
        u.r1c3.real*re11 - u.r1c3.imag*im11;
    imVec[ind01] =
        u.r1c0.imag*re00 + u.r1c0.real*im00 +
        u.r1c1.imag*re01 + u.r1c1.real*im01 +
        u.r1c2.imag*re10 + u.r1c2.real*im10 +
        u.r1c3.imag*re11 + u.r1c3.real*im11;

    reVec[ind10] =
        u.r2c0.real*re00 - u.r2c0.imag*im00 +
        u.r2c1.real*re01 - u.r2c1.imag*im01 +
        u.r2c2.real*re10 - u.r2c2.imag*im10 +
        u.r2c3.real*re11 - u.r2c3.imag*im11;
    imVec[ind10] =
        u.r2c0.imag*re00 + u.r2c0.real*im00 +
        u.r2c1.imag*re01 + u.r2c1.real*im01 +
        u.r2c2.imag*re10 + u.r2c2.real*im10 +
        u.r2c3.imag*re11 + u.r2c3.real*im11;

    reVec[ind11] =
        u.r3c0.real*re00 - u.r3c0.imag*im00 +
        u.r3c1.real*re01 - u.r3c1.imag*im01 +
        u.r3c2.real*re10 - u.r3c2.imag*im10 +
        u.r3c3.real*re11 - u.r3c3.imag*im11;
    imVec[ind11] =
        u.r3c0.imag*re00 + u.r3c0.real*im00 +
        u.r3c1.imag*re01 + u.r3c1.real*im01 +
        u.r3c2.imag*re10 + u.r3c2.real*im10 +
        u.r3c3.imag*re11 + u.r3c3.real*im11;
}

void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u)
{
    int threadsPerCUDABlock = 128;
    int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock); // one kernel eval for every 4 amplitudes
    statevec_multiControlledTwoQubitUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, ctrlMask, q1, q2, argifyMatrix4(u));
}
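
/* Illustrative example of the index choice above (q1 least significant):
 * for q1 = 0, q2 = 1 and thisTask = 1, ind00 = insertTwoZeroBits(1, 0, 1) = 4,
 * so the four modified amplitudes are ind00..ind11 = 4, 5, 6, 7.
 */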

__global__ void statevec_controlledUnitaryKernel(Qureg qureg, int controlQubit, int targetQubit, ArgMatrix2 u){
    // ----- sizes
    long long int sizeBlock,         // size of blocks
         sizeHalfBlock;              // size of blocks halved
    // ----- indices
    long long int thisBlock,         // current block
         indexUp,indexLo;            // current index and corresponding index in lower half block

    // ----- temp variables
    qreal stateRealUp,stateRealLo,   // storage for previous state values
         stateImagUp,stateImagLo;    // (used in updates)
    // ----- temp variables
    long long int thisTask;          // task-based approach for exposing loop parallelism at small granularity
    long long int numTasks=qureg.numAmpsPerChunk>>1;

    int controlBit;

    sizeHalfBlock = 1LL << targetQubit;  // size of blocks halved
    sizeBlock     = 2LL * sizeHalfBlock; // size of blocks

    // ---------------------------------------------------------------- //
    //            rotate                                                //
    // ---------------------------------------------------------------- //

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    // store current state vector values in temp variables
    stateRealUp = stateVecReal[indexUp];
    stateImagUp = stateVecImag[indexUp];

    stateRealLo = stateVecReal[indexLo];
    stateImagLo = stateVecImag[indexLo];

    controlBit = extractBit(controlQubit, indexUp);
    if (controlBit){
        // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
        stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
            + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
        stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
            + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;

        // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
        stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
            + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
        stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
            + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
    }
}

void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_controlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, argifyMatrix2(u));
}

__global__ void statevec_multiControlledUnitaryKernel(
    Qureg qureg,
    long long int ctrlQubitsMask, long long int ctrlFlipMask,
    int targetQubit, ArgMatrix2 u
){
    // ----- sizes
    long long int sizeBlock,         // size of blocks
         sizeHalfBlock;              // size of blocks halved
    // ----- indices
    long long int thisBlock,         // current block
         indexUp,indexLo;            // current index and corresponding index in lower half block

    // ----- temp variables
    qreal stateRealUp,stateRealLo,   // storage for previous state values
         stateImagUp,stateImagLo;    // (used in updates)
    // ----- temp variables
    long long int thisTask;          // task-based approach for exposing loop parallelism at small granularity
    long long int numTasks=qureg.numAmpsPerChunk>>1;

    sizeHalfBlock = 1LL << targetQubit;  // size of blocks halved
    sizeBlock     = 2LL * sizeHalfBlock; // size of blocks

    // ---------------------------------------------------------------- //
    //            rotate                                                //
    // ---------------------------------------------------------------- //

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    if (ctrlQubitsMask == (ctrlQubitsMask & (indexUp ^ ctrlFlipMask))) {
        // store current state vector values in temp variables
        stateRealUp = stateVecReal[indexUp];
        stateImagUp = stateVecImag[indexUp];

        stateRealLo = stateVecReal[indexLo];
        stateImagLo = stateVecImag[indexLo];

        // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
        stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
            + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
        stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
            + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;

        // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
        stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
            + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
        stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
            + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
    }
}

void statevec_multiControlledUnitary(
    Qureg qureg,
    long long int ctrlQubitsMask, long long int ctrlFlipMask,
    int targetQubit, ComplexMatrix2 u
){
    int threadsPerCUDABlock = 128;
    int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_multiControlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        qureg, ctrlQubitsMask, ctrlFlipMask, targetQubit, argifyMatrix2(u));
}
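
/* Illustrative example of the mask semantics above: controls in ctrlFlipMask
 * are conditioned on |0> rather than |1>, since their bits are flipped before
 * the mask test. E.g. ctrlQubitsMask = 0b101 with ctrlFlipMask = 0b100 applies
 * u only to states where qubit 0 is 1 and qubit 2 is 0.
 */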

__global__ void statevec_pauliXKernel(Qureg qureg, int targetQubit){
    // ----- sizes
    long long int sizeBlock,         // size of blocks
         sizeHalfBlock;              // size of blocks halved
    // ----- indices
    long long int thisBlock,         // current block
         indexUp,indexLo;            // current index and corresponding index in lower half block

    // ----- temp variables
    qreal stateRealUp,               // storage for previous state values
         stateImagUp;                // (used in updates)
    // ----- temp variables
    long long int thisTask;          // task-based approach for exposing loop parallelism at small granularity
    long long int numTasks=qureg.numAmpsPerChunk>>1;

    sizeHalfBlock = 1LL << targetQubit;  // size of blocks halved
    sizeBlock     = 2LL * sizeHalfBlock; // size of blocks

    // ---------------------------------------------------------------- //
    //            rotate                                                //
    // ---------------------------------------------------------------- //

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    // store current state vector values in temp variables
    stateRealUp = stateVecReal[indexUp];
    stateImagUp = stateVecImag[indexUp];

    stateVecReal[indexUp] = stateVecReal[indexLo];
    stateVecImag[indexUp] = stateVecImag[indexLo];

    stateVecReal[indexLo] = stateRealUp;
    stateVecImag[indexLo] = stateImagUp;
}

void statevec_pauliX(Qureg qureg, int targetQubit)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_pauliXKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
}

__global__ void statevec_pauliYKernel(Qureg qureg, int targetQubit, int conjFac){

    long long int sizeHalfBlock = 1LL << targetQubit;
    long long int sizeBlock     = 2LL * sizeHalfBlock;
    long long int numTasks      = qureg.numAmpsPerChunk >> 1;
    long long int thisTask      = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;

    long long int thisBlock = thisTask / sizeHalfBlock;
    long long int indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    long long int indexLo   = indexUp + sizeHalfBlock;
    qreal stateRealUp, stateImagUp;

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;
    stateRealUp = stateVecReal[indexUp];
    stateImagUp = stateVecImag[indexUp];

    // update under +-{{0, -i}, {i, 0}}
    stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
    stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
    stateVecReal[indexLo] = conjFac * -stateImagUp;
    stateVecImag[indexLo] = conjFac * stateRealUp;
}

void statevec_pauliY(Qureg qureg, int targetQubit)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, 1);
}

void statevec_pauliYConj(Qureg qureg, int targetQubit)
{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, -1);
}

__global__ void statevec_controlledPauliYKernel(Qureg qureg, int controlQubit, int targetQubit, int conjFac)
{
    long long int index;
    long long int sizeBlock, sizeHalfBlock;
    long long int stateVecSize;
    int controlBit;

    qreal stateRealUp, stateImagUp;
    long long int thisBlock, indexUp, indexLo;
    sizeHalfBlock = 1LL << targetQubit;
    sizeBlock     = 2LL * sizeHalfBlock;

    stateVecSize = qureg.numAmpsPerChunk;
    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=(stateVecSize>>1)) return;
    thisBlock = index / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + index%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    controlBit = extractBit(controlQubit, indexUp);
    if (controlBit){

        stateRealUp = stateVecReal[indexUp];
        stateImagUp = stateVecImag[indexUp];

        // update under +-{{0, -i}, {i, 0}}
        stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
        stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
        stateVecReal[indexLo] = conjFac * -stateImagUp;
        stateVecImag[indexLo] = conjFac * stateRealUp;
    }
}

void statevec_controlledPauliY(Qureg qureg, int controlQubit, int targetQubit)
{
    int conjFactor = 1;
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
}

void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubit)
{
    int conjFactor = -1;
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
}

__global__ void statevec_phaseShiftByTermKernel(Qureg qureg, int targetQubit, qreal cosAngle, qreal sinAngle) {

    long long int sizeBlock, sizeHalfBlock;
    long long int thisBlock, indexUp,indexLo;

    qreal stateRealLo, stateImagLo;
    long long int thisTask;
    long long int numTasks = qureg.numAmpsPerChunk >> 1;

    sizeHalfBlock = 1LL << targetQubit;
    sizeBlock     = 2LL * sizeHalfBlock;

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;
    thisBlock = thisTask / sizeHalfBlock;
    indexUp   = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    indexLo   = indexUp + sizeHalfBlock;

    stateRealLo = stateVecReal[indexLo];
    stateImagLo = stateVecImag[indexLo];

    stateVecReal[indexLo] = cosAngle*stateRealLo - sinAngle*stateImagLo;
    stateVecImag[indexLo] = sinAngle*stateRealLo + cosAngle*stateImagLo;
}

void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term)
{
    qreal cosAngle = term.real;
    qreal sinAngle = term.imag;

    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
    statevec_phaseShiftByTermKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, cosAngle, sinAngle);
}

__global__ void statevec_controlledPhaseShiftKernel(Qureg qureg, int idQubit1, int idQubit2, qreal cosAngle, qreal sinAngle)
{
    long long int index;
    long long int stateVecSize;
    int bit1, bit2;
    qreal stateRealLo, stateImagLo;

    stateVecSize = qureg.numAmpsPerChunk;
    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    bit1 = extractBit (idQubit1, index);
    bit2 = extractBit (idQubit2, index);
    if (bit1 && bit2) {
        stateRealLo = stateVecReal[index];
        stateImagLo = stateVecImag[index];

        stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
        stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
    }
}

void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle)
{
    qreal cosAngle = cos(angle);
    qreal sinAngle = sin(angle);

    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_controlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2, cosAngle, sinAngle);
}

__global__ void statevec_multiControlledPhaseShiftKernel(Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle) {
    qreal stateRealLo, stateImagLo;
    long long int index;
    long long int stateVecSize;

    stateVecSize = qureg.numAmpsPerChunk;
    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    if (mask == (mask & index) ){
        stateRealLo = stateVecReal[index];
        stateImagLo = stateVecImag[index];
        stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
        stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
    }
}

void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle)
{
    qreal cosAngle = cos(angle);
    qreal sinAngle = sin(angle);

    long long int mask = getQubitBitMask(controlQubits, numControlQubits);

    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_multiControlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
}

__global__ void statevec_multiRotateZKernel(Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle) {

    long long int stateVecSize = qureg.numAmpsPerChunk;
    long long int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index>=stateVecSize) return;

    qreal *stateVecReal = qureg.deviceStateVec.real;
    qreal *stateVecImag = qureg.deviceStateVec.imag;

    int fac = getBitMaskParity(mask & index)? -1 : 1;
    qreal stateReal = stateVecReal[index];
    qreal stateImag = stateVecImag[index];

    stateVecReal[index] = cosAngle*stateReal + fac * sinAngle*stateImag;
    stateVecImag[index] = - fac * sinAngle*stateReal + cosAngle*stateImag;
}

void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle)
{
    qreal cosAngle = cos(angle/2.0);
    qreal sinAngle = sin(angle/2.0);

    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    statevec_multiRotateZKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
}
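
/* Illustrative note: the kernel above multiplies each basis amplitude by
 * exp(-i*angle/2) when the masked qubits have even parity and exp(+i*angle/2)
 * when odd, i.e. it effects exp(-i*angle/2 * Z x...x Z) on the masked qubits.
 */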

qreal densmatr_calcTotalProb(Qureg qureg) {

    // computes the trace using Kahan summation
    qreal pTotal=0;
    qreal y, t, c;
    c = 0;

    long long int numCols = 1LL << qureg.numQubitsRepresented;
    long long int diagIndex;

    copyStateFromGPU(qureg);

    for (long long int col=0; col< numCols; col++) {
        diagIndex = col*(numCols + 1);
        y = qureg.stateVec.real[diagIndex] - c;
        t = pTotal + y;
        c = ( t - pTotal ) - y; // brackets are important
        pTotal = t;
    }

    return pTotal;
}
1553 
1555  /* IJB - implemented using Kahan summation for greater accuracy at a slight floating
1556  point operation overhead. For more details see https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
1557  /* Don't change the bracketing in this routine! */
1558  qreal pTotal=0;
1559  qreal y, t, c;
1560  long long int index;
1561  long long int numAmpsPerRank = qureg.numAmpsPerChunk;
1562 
1563  copyStateFromGPU(qureg);
1564 
1565  c = 0.0;
1566  for (index=0; index<numAmpsPerRank; index++){
1567  /* Perform pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index]; by Kahan */
1568  // pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index];
1569  y = qureg.stateVec.real[index]*qureg.stateVec.real[index] - c;
1570  t = pTotal + y;
1571  c = ( t - pTotal ) - y;
1572  pTotal = t;
1573 
1574  /* Perform pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index]; by Kahan */
1575  //pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index];
1576  y = qureg.stateVec.imag[index]*qureg.stateVec.imag[index] - c;
1577  t = pTotal + y;
1578  c = ( t - pTotal ) - y;
1579  pTotal = t;
1580 
1581 
1582  }
1583  return pTotal;
1584 }
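/* Editor's note (added; not in the original source): a step-by-step trace of
 * the Kahan compensation used in both routines above, with sum = 1.0, c = 0:
 *   add 1e-17:  t = 1.0 + 1e-17 rounds to 1.0, so the term is lost from sum,
 *               but c = (t - sum) - 1e-17 = -1e-17 captures the lost part;
 *   add 1e-17:  y = 1e-17 - c = 2e-17 feeds the captured error back in.
 * Repeated tiny terms thus accumulate in c until they are large enough to
 * register in sum, instead of being rounded away one at a time.
 */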
1585 
1586 __global__ void statevec_controlledPhaseFlipKernel(Qureg qureg, int idQubit1, int idQubit2)
1587 {
1588  long long int index;
1589  long long int stateVecSize;
1590  int bit1, bit2;
1591 
1592  stateVecSize = qureg.numAmpsPerChunk;
1593  qreal *stateVecReal = qureg.deviceStateVec.real;
1594  qreal *stateVecImag = qureg.deviceStateVec.imag;
1595 
1596  index = blockIdx.x*blockDim.x + threadIdx.x;
1597  if (index>=stateVecSize) return;
1598 
1599  bit1 = extractBit (idQubit1, index);
1600  bit2 = extractBit (idQubit2, index);
1601  if (bit1 && bit2) {
1602  stateVecReal [index] = - stateVecReal [index];
1603  stateVecImag [index] = - stateVecImag [index];
1604  }
1605 }
1606 
1607 void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2)
1608 {
1609  int threadsPerCUDABlock, CUDABlocks;
1610  threadsPerCUDABlock = 128;
1611  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1612  statevec_controlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2);
1613 }
1614 
1615 __global__ void statevec_multiControlledPhaseFlipKernel(Qureg qureg, long long int mask)
1616 {
1617  long long int index;
1618  long long int stateVecSize;
1619 
1620  stateVecSize = qureg.numAmpsPerChunk;
1621  qreal *stateVecReal = qureg.deviceStateVec.real;
1622  qreal *stateVecImag = qureg.deviceStateVec.imag;
1623 
1624  index = blockIdx.x*blockDim.x + threadIdx.x;
1625  if (index>=stateVecSize) return;
1626 
1627  if (mask == (mask & index) ){
1628  stateVecReal [index] = - stateVecReal [index];
1629  stateVecImag [index] = - stateVecImag [index];
1630  }
1631 }
1632 
1633 void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits)
1634 {
1635  int threadsPerCUDABlock, CUDABlocks;
1636  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1637  threadsPerCUDABlock = 128;
1638  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1639  statevec_multiControlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask);
1640 }
1641 
1642 __global__ void statevec_swapQubitAmpsKernel(Qureg qureg, int qb1, int qb2) {
1643 
1644  qreal *reVec = qureg.deviceStateVec.real;
1645  qreal *imVec = qureg.deviceStateVec.imag;
1646 
1647  long long int numTasks = qureg.numAmpsPerChunk >> 2; // each iteration updates 2 amps and skips 2 amps
1648  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1649  if (thisTask>=numTasks) return;
1650 
1651  long long int ind00, ind01, ind10;
1652  qreal re01, re10, im01, im10;
1653 
1654  // determine ind00 of |..0..0..>, |..0..1..> and |..1..0..>
1655  ind00 = insertTwoZeroBits(thisTask, qb1, qb2);
1656  ind01 = flipBit(ind00, qb1);
1657  ind10 = flipBit(ind00, qb2);
1658 
1659  // extract statevec amplitudes
1660  re01 = reVec[ind01]; im01 = imVec[ind01];
1661  re10 = reVec[ind10]; im10 = imVec[ind10];
1662 
1663  // swap 01 and 10 amps
1664  reVec[ind01] = re10; reVec[ind10] = re01;
1665  imVec[ind01] = im10; imVec[ind10] = im01;
1666 }
1667 
1668 void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2)
1669 {
1670  int threadsPerCUDABlock, CUDABlocks;
1671  threadsPerCUDABlock = 128;
1672  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock);
1673  statevec_swapQubitAmpsKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, qb1, qb2);
1674 }
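/* Editor's note (added; not in the original source): a worked example of the
 * indexing in the swap kernel above, for qb1 = 0, qb2 = 2, thisTask = 3:
 *   ind00 = insertTwoZeroBits(3, 0, 2) = 0b1010   (bits 0 and 2 cleared)
 *   ind01 = flipBit(ind00, 0)          = 0b1011   (qb1 = 1, qb2 = 0)
 *   ind10 = flipBit(ind00, 2)          = 0b1110   (qb1 = 0, qb2 = 1)
 * The kernel exchanges the amplitudes at ind01 and ind10; the |..0..0..> and
 * |..1..1..> amplitudes are fixed points of SWAP and are never touched, which
 * is why only a quarter of the amplitudes need a thread.
 */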
1675 
1676 __global__ void statevec_hadamardKernel (Qureg qureg, int targetQubit){
1677  // ----- sizes
1678  long long int sizeBlock, // size of blocks
1679  sizeHalfBlock; // size of blocks halved
1680  // ----- indices
1681  long long int thisBlock, // current block
1682  indexUp,indexLo; // current index and corresponding index in lower half block
1683 
1684  // ----- temp variables
1685  qreal stateRealUp,stateRealLo, // storage for previous state values
1686  stateImagUp,stateImagLo; // (used in updates)
1687  // ----- temp variables
1688  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
1689  long long int numTasks=qureg.numAmpsPerChunk>>1;
1690 
1691  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1692  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1693 
1694  // ---------------------------------------------------------------- //
1695  // rotate //
1696  // ---------------------------------------------------------------- //
1697 
1699  qreal *stateVecReal = qureg.deviceStateVec.real;
1700  qreal *stateVecImag = qureg.deviceStateVec.imag;
1701 
1702  qreal recRoot2 = 1.0/sqrt(2.0);
1703 
1704  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1705  if (thisTask>=numTasks) return;
1706 
1707  thisBlock = thisTask / sizeHalfBlock;
1708  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1709  indexLo = indexUp + sizeHalfBlock;
1710 
1711  // store current state vector values in temp variables
1712  stateRealUp = stateVecReal[indexUp];
1713  stateImagUp = stateVecImag[indexUp];
1714 
1715  stateRealLo = stateVecReal[indexLo];
1716  stateImagLo = stateVecImag[indexLo];
1717 
1718  stateVecReal[indexUp] = recRoot2*(stateRealUp + stateRealLo);
1719  stateVecImag[indexUp] = recRoot2*(stateImagUp + stateImagLo);
1720 
1721  stateVecReal[indexLo] = recRoot2*(stateRealUp - stateRealLo);
1722  stateVecImag[indexLo] = recRoot2*(stateImagUp - stateImagLo);
1723 }
1724 
1725 void statevec_hadamard(Qureg qureg, int targetQubit)
1726 {
1727  int threadsPerCUDABlock, CUDABlocks;
1728  threadsPerCUDABlock = 128;
1729  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1730  statevec_hadamardKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1731 }
1732 
1733 __global__ void statevec_controlledNotKernel(Qureg qureg, int controlQubit, int targetQubit)
1734 {
1735  long long int index;
1736  long long int sizeBlock, // size of blocks
1737  sizeHalfBlock; // size of blocks halved
1738  long long int stateVecSize;
1739  int controlBit;
1740 
1741  // ----- temp variables
1742  qreal stateRealUp, // storage for previous state values
1743  stateImagUp; // (used in updates)
1744  long long int thisBlock, // current block
1745  indexUp,indexLo; // current index and corresponding index in lower half block
1746  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1747  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1748 
1749  stateVecSize = qureg.numAmpsPerChunk;
1750  qreal *stateVecReal = qureg.deviceStateVec.real;
1751  qreal *stateVecImag = qureg.deviceStateVec.imag;
1752 
1753  index = blockIdx.x*blockDim.x + threadIdx.x;
1754  if (index>=(stateVecSize>>1)) return;
1755  thisBlock = index / sizeHalfBlock;
1756  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1757  indexLo = indexUp + sizeHalfBlock;
1758 
1759  controlBit = extractBit(controlQubit, indexUp);
1760  if (controlBit){
1761  stateRealUp = stateVecReal[indexUp];
1762  stateImagUp = stateVecImag[indexUp];
1763 
1764  stateVecReal[indexUp] = stateVecReal[indexLo];
1765  stateVecImag[indexUp] = stateVecImag[indexLo];
1766 
1767  stateVecReal[indexLo] = stateRealUp;
1768  stateVecImag[indexLo] = stateImagUp;
1769  }
1770 }
1771 
1772 void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit)
1773 {
1774  int threadsPerCUDABlock, CUDABlocks;
1775  threadsPerCUDABlock = 128;
1776  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1777  statevec_controlledNotKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit);
1778 }
1779 
1780 __device__ __host__ unsigned int log2Int( unsigned int x )
1781 {
1782  unsigned int ans = 0 ;
1783  while( x>>=1 ) ans++;
1784  return ans ;
1785 }
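/* Editor's note (added; not in the original source): log2Int returns
 * floor(log2(x)) by counting right-shifts, e.g. log2Int(1) = 0,
 * log2Int(8) = 3, log2Int(13) = 3. In this file it appears to only receive
 * powers of two (halved CUDA block sizes), for which the floor is exact.
 */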
1786 
1787 __device__ void reduceBlock(qreal *arrayIn, qreal *reducedArray, int length){
1788  int i, l, r;
1789  int threadMax, maxDepth;
1790  threadMax = length/2;
1791  maxDepth = log2Int(length/2);
1792 
1793  for (i=0; i<maxDepth+1; i++){
1794  if (threadIdx.x<threadMax){
1795  l = threadIdx.x;
1796  r = l + threadMax;
1797  arrayIn[l] = arrayIn[r] + arrayIn[l];
1798  }
1799  threadMax = threadMax >> 1;
1800  __syncthreads(); // optimise -- use warp shuffle instead
1801  }
1802 
1803  if (threadIdx.x==0) reducedArray[blockIdx.x] = arrayIn[0];
1804 }
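/* Editor's note (added; not in the original source): a trace of reduceBlock
 * for length = 8, so threadMax = 4 and maxDepth = 2 (3 iterations):
 *   i = 0:  a[0]+=a[4]  a[1]+=a[5]  a[2]+=a[6]  a[3]+=a[7]
 *   i = 1:  a[0]+=a[2]  a[1]+=a[3]
 *   i = 2:  a[0]+=a[1]
 * after which thread 0 publishes the block total a[0] to
 * reducedArray[blockIdx.x].
 */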
1805 
1806 __global__ void copySharedReduceBlock(qreal*arrayIn, qreal *reducedArray, int length){
1807  extern __shared__ qreal tempReductionArray[];
1808  int blockOffset = blockIdx.x*length;
1809  tempReductionArray[threadIdx.x*2] = arrayIn[blockOffset + threadIdx.x*2];
1810  tempReductionArray[threadIdx.x*2+1] = arrayIn[blockOffset + threadIdx.x*2+1];
1811  __syncthreads();
1812  reduceBlock(tempReductionArray, reducedArray, length);
1813 }
1814 
1815 __global__ void densmatr_findProbabilityOfZeroKernel(
1816  Qureg qureg, int measureQubit, qreal *reducedArray
1817 ) {
1818  // run by each thread
1819  // use of block here refers to contiguous amplitudes where measureQubit = 0,
1820  // (then =1) and NOT the CUDA block, which is the partitioning of CUDA threads
1821 
1822  long long int densityDim = 1LL << qureg.numQubitsRepresented;
1823  long long int numTasks = densityDim >> 1;
1824  long long int sizeHalfBlock = 1LL << (measureQubit);
1825  long long int sizeBlock = 2LL * sizeHalfBlock;
1826 
1827  long long int thisBlock; // which block this thread is processing
1828  long long int thisTask; // which part of the block this thread is processing
1829  long long int basisIndex; // index of this thread's computational basis state
1830  long long int densityIndex; // " " index of |basis><basis| in the flat density matrix
1831 
1832  // array of each thread's collected probability, to be summed
1833  extern __shared__ qreal tempReductionArray[];
1834 
1835  // figure out which density-matrix probability this thread is assigned
1836  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1837  if (thisTask>=numTasks) return;
1838  thisBlock = thisTask / sizeHalfBlock;
1839  basisIndex = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1840  densityIndex = (densityDim + 1) * basisIndex;
1841 
1842  // record the probability in the CUDA-BLOCK-wide array
1843  qreal prob = qureg.deviceStateVec.real[densityIndex]; // im[densityIndex] assumed ~ 0
1844  tempReductionArray[threadIdx.x] = prob;
1845 
1846  // sum the probs collected by this CUDA-BLOCK's threads into a per-CUDA-BLOCK array
1847  __syncthreads();
1848  if (threadIdx.x<blockDim.x/2){
1849  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
1850  }
1851 }
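/* Editor's note (added; not in the original source): the density matrix is
 * stored as one flat array in which element (row, col) sits at index
 * col*densityDim + row (cf. densmatr_calcFidelityKernel below), so the
 * diagonal entry |basis><basis| sits at basisIndex*(densityDim + 1) -- hence
 * the densityIndex computed above. Summing the real diagonal entries whose
 * measureQubit bit is 0 gives the outcome-0 probability.
 */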
1852 
1853 __global__ void statevec_findProbabilityOfZeroKernel(
1854  Qureg qureg, int measureQubit, qreal *reducedArray
1855 ) {
1856  // ----- sizes
1857  long long int sizeBlock, // size of blocks
1858  sizeHalfBlock; // size of blocks halved
1859  // ----- indices
1860  long long int thisBlock, // current block
1861  index; // current index for first half block
1862  // ----- temp variables
1863  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
1864  long long int numTasks=qureg.numAmpsPerChunk>>1;
1865  // (good for shared memory parallelism)
1866 
1867  extern __shared__ qreal tempReductionArray[];
1868 
1869  // ---------------------------------------------------------------- //
1870  // dimensions //
1871  // ---------------------------------------------------------------- //
1872  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
1873  // and then the number to skip
1874  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
1875 
1876  // ---------------------------------------------------------------- //
1877  // find probability //
1878  // ---------------------------------------------------------------- //
1879 
1880  //
1881  // --- task-based shared-memory parallel implementation
1882  //
1883 
1884  qreal *stateVecReal = qureg.deviceStateVec.real;
1885  qreal *stateVecImag = qureg.deviceStateVec.imag;
1886 
1887  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1888  if (thisTask>=numTasks) return;
1889 
1890  thisBlock = thisTask / sizeHalfBlock;
1891  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1892  qreal realVal, imagVal;
1893  realVal = stateVecReal[index];
1894  imagVal = stateVecImag[index];
1895  tempReductionArray[threadIdx.x] = realVal*realVal + imagVal*imagVal;
1896  __syncthreads();
1897 
1898  if (threadIdx.x<blockDim.x/2){
1899  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
1900  }
1901 }
1902 
1903 int getNumReductionLevels(long long int numValuesToReduce, int numReducedPerLevel){
1904  int levels=0;
1905  while (numValuesToReduce){
1906  numValuesToReduce = numValuesToReduce/numReducedPerLevel;
1907  levels++;
1908  }
1909  return levels;
1910 }
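/* Editor's note (added; not in the original source): e.g. for
 * numValuesToReduce = 2^20 and numReducedPerLevel = REDUCE_SHARED_SIZE = 512,
 * the loop sees 2^20 -> 2^11 -> 2^2 -> 0 and returns 3: three kernel passes,
 * each shrinking the array of partial sums by a factor of 512.
 */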
1911 
1912 void swapDouble(qreal **a, qreal **b){
1913  qreal *temp;
1914  temp = *a;
1915  *a = *b;
1916  *b = temp;
1917 }
1918 
1919 qreal densmatr_findProbabilityOfZero(Qureg qureg, int measureQubit)
1920 {
1921  long long int densityDim = 1LL << qureg.numQubitsRepresented;
1922  long long int numValuesToReduce = densityDim >> 1; // half of the diagonal has measureQubit=0
1923 
1924  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
1925  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
1926  int firstTime = 1;
1927 
1928  while (numValuesToReduce > 1) {
1929 
1930  // need less than one CUDA-BLOCK to reduce
1931  if (numValuesToReduce < maxReducedPerLevel) {
1932  valuesPerCUDABlock = numValuesToReduce;
1933  numCUDABlocks = 1;
1934  }
1935  // otherwise use only full CUDA-BLOCKS
1936  else {
1937  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
1938  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
1939  }
1940 
1941  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
1942 
1943  // spawn threads to sum the probs in each block
1944  if (firstTime) {
1945  densmatr_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
1946  qureg, measureQubit, qureg.firstLevelReduction);
1947  firstTime = 0;
1948 
1949  // sum the block probs
1950  } else {
1951  cudaDeviceSynchronize();
1952  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
1953  qureg.firstLevelReduction,
1954  qureg.secondLevelReduction, valuesPerCUDABlock);
1955  cudaDeviceSynchronize();
1956  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
1957  }
1958 
1959  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
1960  }
1961 
1962  qreal zeroProb;
1963  cudaMemcpy(&zeroProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
1964  return zeroProb;
1965 }
1966 
1967 qreal statevec_findProbabilityOfZero(Qureg qureg, int measureQubit)
1968 {
1969  long long int numValuesToReduce = qureg.numAmpsPerChunk>>1;
1970  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
1971  qreal stateProb=0;
1972  int firstTime=1;
1973  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
1974 
1975  while(numValuesToReduce>1){
1976  if (numValuesToReduce<maxReducedPerLevel){
1977  // Need less than one CUDA block to reduce values
1978  valuesPerCUDABlock = numValuesToReduce;
1979  numCUDABlocks = 1;
1980  } else {
1981  // Use full CUDA blocks, with block size constrained by shared mem usage
1982  valuesPerCUDABlock = maxReducedPerLevel;
1983  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
1984  }
1985  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
1986 
1987  if (firstTime){
1988  statevec_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
1989  qureg, measureQubit, qureg.firstLevelReduction);
1990  firstTime=0;
1991  } else {
1992  cudaDeviceSynchronize();
1993  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
1994  qureg.firstLevelReduction,
1995  qureg.secondLevelReduction, valuesPerCUDABlock);
1996  cudaDeviceSynchronize();
1997  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
1998  }
1999  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2000  }
2001  cudaMemcpy(&stateProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2002  return stateProb;
2003 }
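/* Editor's note (added; not in the original source): both find-probability
 * routines above use the same ping-pong reduction: the first pass writes one
 * partial sum per CUDA block into firstLevelReduction; every later pass sums
 * up to REDUCE_SHARED_SIZE partials into secondLevelReduction, then
 * swapDouble exchanges the two pointers so the freshly written array feeds
 * the next pass. The final scalar therefore always lands in
 * firstLevelReduction, which the closing cudaMemcpy copies to the host.
 */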
2004 
2005 qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)
2006 {
2007  qreal outcomeProb = statevec_findProbabilityOfZero(qureg, measureQubit);
2008  if (outcome==1)
2009  outcomeProb = 1.0 - outcomeProb;
2010  return outcomeProb;
2011 }
2012 
2013 qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)
2014 {
2015  qreal outcomeProb = densmatr_findProbabilityOfZero(qureg, measureQubit);
2016  if (outcome==1)
2017  outcomeProb = 1.0 - outcomeProb;
2018  return outcomeProb;
2019 }
2020 
2022 __global__ void densmatr_calcInnerProductKernel(
2023  Qureg a, Qureg b, long long int numTermsToSum, qreal* reducedArray
2024 ) {
2025  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2026  if (index >= numTermsToSum) return;
2027 
2028  // Re{ conj(a) b } = Re{ (aRe - i aIm)(bRe + i bIm) } = aRe bRe + aIm bIm
2029  qreal prod = (
2030  a.deviceStateVec.real[index]*b.deviceStateVec.real[index]
2031  + a.deviceStateVec.imag[index]*b.deviceStateVec.imag[index]);
2032 
2033  // array of each thread's collected sum term, to be summed
2034  extern __shared__ qreal tempReductionArray[];
2035  tempReductionArray[threadIdx.x] = prod;
2036  __syncthreads();
2037 
2038  // every second thread reduces
2039  if (threadIdx.x<blockDim.x/2)
2040  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2041 }
2042 
2043 qreal densmatr_calcInnerProduct(Qureg a, Qureg b) {
2044 
2045  // we're summing Re{ conj(a_ij) b_ij } over every term of the two density matrices
2046  long long int numValuesToReduce = a.numAmpsTotal;
2047 
2048  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2049  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2050  int firstTime = 1;
2051 
2052  while (numValuesToReduce > 1) {
2053 
2054  // need less than one CUDA-BLOCK to reduce
2055  if (numValuesToReduce < maxReducedPerLevel) {
2056  valuesPerCUDABlock = numValuesToReduce;
2057  numCUDABlocks = 1;
2058  }
2059  // otherwise use only full CUDA-BLOCKS
2060  else {
2061  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2062  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2063  }
2064  // dictates size of reduction array
2065  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2066 
2067  // spawn threads to sum the terms in each block
2068  // arbitrarily store the reduction in the b qureg's array
2069  if (firstTime) {
2070  densmatr_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2071  a, b, a.numAmpsTotal, b.firstLevelReduction);
2072  firstTime = 0;
2073  }
2074  // sum the block terms
2075  else {
2076  cudaDeviceSynchronize();
2077  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2078  b.firstLevelReduction,
2079  b.secondLevelReduction, valuesPerCUDABlock);
2080  cudaDeviceSynchronize();
2081  swapDouble(&(b.firstLevelReduction), &(b.secondLevelReduction));
2082  }
2083 
2084  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2085  }
2086 
2087  qreal innerprod;
2088  cudaMemcpy(&innerprod, b.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2089  return innerprod;
2090 }
2091 
2093 __global__ void statevec_calcInnerProductKernel(
2094  int getRealComp,
2095  qreal* vecReal1, qreal* vecImag1, qreal* vecReal2, qreal* vecImag2,
2096  long long int numTermsToSum, qreal* reducedArray)
2097 {
2098  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2099  if (index >= numTermsToSum) return;
2100 
2101  // choose whether to calculate the real or imaginary term of the inner product
2102  qreal innerProdTerm;
2103  if (getRealComp)
2104  innerProdTerm = vecReal1[index]*vecReal2[index] + vecImag1[index]*vecImag2[index];
2105  else
2106  innerProdTerm = vecReal1[index]*vecImag2[index] - vecImag1[index]*vecReal2[index];
2107 
2108  // array of each thread's collected sum term, to be summed
2109  extern __shared__ qreal tempReductionArray[];
2110  tempReductionArray[threadIdx.x] = innerProdTerm;
2111  __syncthreads();
2112 
2113  // every second thread reduces
2114  if (threadIdx.x<blockDim.x/2)
2115  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2116 }
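/* Editor's note (added; not in the original source): the getRealComp switch
 * above follows from expanding one term of <bra|ket>:
 *   conj(a + i b)(c + i d) = (ac + bd) + i (ad - bc)
 * with a,b the bra amplitude's components and c,d the ket's; the kernel sums
 * ac + bd terms when getRealComp is true and ad - bc terms otherwise.
 */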
2117 
2123 Complex statevec_calcInnerProduct(Qureg bra, Qureg ket) {
2124 
2125  qreal innerProdReal, innerProdImag;
2126 
2127  int getRealComp;
2128  long long int numValuesToReduce;
2129  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2130  int maxReducedPerLevel;
2131  int firstTime;
2132 
2133  // compute real component of inner product
2134  getRealComp = 1;
2135  numValuesToReduce = bra.numAmpsPerChunk;
2136  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2137  firstTime = 1;
2138  while (numValuesToReduce > 1) {
2139  if (numValuesToReduce < maxReducedPerLevel) {
2140  valuesPerCUDABlock = numValuesToReduce;
2141  numCUDABlocks = 1;
2142  }
2143  else {
2144  valuesPerCUDABlock = maxReducedPerLevel;
2145  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2146  }
2147  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2148  if (firstTime) {
2149  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2150  getRealComp,
2151  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2152  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2153  numValuesToReduce,
2154  bra.firstLevelReduction);
2155  firstTime = 0;
2156  } else {
2157  cudaDeviceSynchronize();
2158  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2159  bra.firstLevelReduction,
2160  bra.secondLevelReduction, valuesPerCUDABlock);
2161  cudaDeviceSynchronize();
2162  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
2163  }
2164  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2165  }
2166  cudaMemcpy(&innerProdReal, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2167 
2168  // compute imag component of inner product
2169  getRealComp = 0;
2170  numValuesToReduce = bra.numAmpsPerChunk;
2171  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2172  firstTime = 1;
2173  while (numValuesToReduce > 1) {
2174  if (numValuesToReduce < maxReducedPerLevel) {
2175  valuesPerCUDABlock = numValuesToReduce;
2176  numCUDABlocks = 1;
2177  }
2178  else {
2179  valuesPerCUDABlock = maxReducedPerLevel;
2180  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2181  }
2182  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2183  if (firstTime) {
2184  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2185  getRealComp,
2186  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2187  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2188  numValuesToReduce,
2189  bra.firstLevelReduction);
2190  firstTime = 0;
2191  } else {
2192  cudaDeviceSynchronize();
2193  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2194  bra.firstLevelReduction,
2195  bra.secondLevelReduction, valuesPerCUDABlock);
2196  cudaDeviceSynchronize();
2197  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
2198  }
2199  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2200  }
2201  cudaMemcpy(&innerProdImag, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2202 
2203  // return complex
2204  Complex innerProd;
2205  innerProd.real = innerProdReal;
2206  innerProd.imag = innerProdImag;
2207  return innerProd;
2208 }
2209 
2211 __global__ void densmatr_calcFidelityKernel(Qureg dens, Qureg vec, long long int dim, qreal* reducedArray) {
2212 
2213  // figure out which density matrix row to consider
2214  long long int col;
2215  long long int row = blockIdx.x*blockDim.x + threadIdx.x;
2216  if (row >= dim) return;
2217 
2218  qreal* densReal = dens.deviceStateVec.real;
2219  qreal* densImag = dens.deviceStateVec.imag;
2220  qreal* vecReal = vec.deviceStateVec.real;
2221  qreal* vecImag = vec.deviceStateVec.imag;
2222 
2223  // compute the row-th element of the product dens*vec
2224  qreal prodReal = 0;
2225  qreal prodImag = 0;
2226  for (col=0LL; col < dim; col++) {
2227  qreal densElemReal = densReal[dim*col + row];
2228  qreal densElemImag = densImag[dim*col + row];
2229 
2230  prodReal += densElemReal*vecReal[col] - densElemImag*vecImag[col];
2231  prodImag += densElemReal*vecImag[col] + densElemImag*vecReal[col];
2232  }
2233 
2234  // multiply with row-th elem of (vec^*)
2235  qreal termReal = prodImag*vecImag[row] + prodReal*vecReal[row];
2236 
2237  // imag of every term should be zero, because each is a valid fidelity calc of an eigenstate
2238  //qreal termImag = prodImag*vecReal[row] - prodReal*vecImag[row];
2239 
2240  extern __shared__ qreal tempReductionArray[];
2241  tempReductionArray[threadIdx.x] = termReal;
2242  __syncthreads();
2243 
2244  // every second thread reduces
2245  if (threadIdx.x<blockDim.x/2)
2246  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2247 }
2248 
2249 qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState) {
2250 
2251  // one term of <pureState| qureg |pureState> is summed per density-matrix row
2252  long long int densityDim = 1LL << qureg.numQubitsRepresented;
2253  long long int numValuesToReduce = densityDim;
2254 
2255  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2256  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2257  int firstTime = 1;
2258 
2259  while (numValuesToReduce > 1) {
2260 
2261  // need less than one CUDA-BLOCK to reduce
2262  if (numValuesToReduce < maxReducedPerLevel) {
2263  valuesPerCUDABlock = numValuesToReduce;
2264  numCUDABlocks = 1;
2265  }
2266  // otherwise use only full CUDA-BLOCKS
2267  else {
2268  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2269  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2270  }
2271  // dictates size of reduction array
2272  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2273 
2274  // spawn threads to sum the probs in each block
2275  // store the reduction in the pureState array
2276  if (firstTime) {
2277  densmatr_calcFidelityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2278  qureg, pureState, densityDim, pureState.firstLevelReduction);
2279  firstTime = 0;
2280 
2281  // sum the block probs
2282  } else {
2283  cudaDeviceSynchronize();
2284  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2285  pureState.firstLevelReduction,
2286  pureState.secondLevelReduction, valuesPerCUDABlock);
2287  cudaDeviceSynchronize();
2288  swapDouble(&(pureState.firstLevelReduction), &(pureState.secondLevelReduction));
2289  }
2290 
2291  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2292  }
2293 
2294  qreal fidelity;
2295  cudaMemcpy(&fidelity, pureState.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2296  return fidelity;
2297 }
2298 
2299 __global__ void densmatr_calcHilbertSchmidtDistanceSquaredKernel(
2300  qreal* aRe, qreal* aIm, qreal* bRe, qreal* bIm,
2301  long long int numAmpsToSum, qreal *reducedArray
2302 ) {
2303  // figure out which density matrix term this thread is assigned
2304  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2305  if (index >= numAmpsToSum) return;
2306 
2307  // compute this thread's sum term
2308  qreal difRe = aRe[index] - bRe[index];
2309  qreal difIm = aIm[index] - bIm[index];
2310  qreal term = difRe*difRe + difIm*difIm;
2311 
2312  // array of each thread's collected term, to be summed
2313  extern __shared__ qreal tempReductionArray[];
2314  tempReductionArray[threadIdx.x] = term;
2315  __syncthreads();
2316 
2317  // every second thread reduces
2318  if (threadIdx.x<blockDim.x/2)
2319  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2320 }
2321 
2322 /* computes sqrt( Tr( (a-b) conjTrans(a-b) ) ) = sqrt( sum of |a_ij - b_ij|^2 ) */
2323 qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b) {
2324 
2325  // we're summing the square of every term in (a-b)
2326  long long int numValuesToReduce = a.numAmpsPerChunk;
2327 
2328  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2329  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2330  int firstTime = 1;
2331 
2332  while (numValuesToReduce > 1) {
2333 
2334  // need less than one CUDA-BLOCK to reduce
2335  if (numValuesToReduce < maxReducedPerLevel) {
2336  valuesPerCUDABlock = numValuesToReduce;
2337  numCUDABlocks = 1;
2338  }
2339  // otherwise use only full CUDA-BLOCKS
2340  else {
2341  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2342  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2343  }
2344  // dictates size of reduction array
2345  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2346 
2347  // spawn threads to sum the probs in each block (store reduction temp values in a's reduction array)
2348  if (firstTime) {
2349  densmatr_calcHilbertSchmidtDistanceSquaredKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2350  a.deviceStateVec.real, a.deviceStateVec.imag,
2351  b.deviceStateVec.real, b.deviceStateVec.imag,
2352  numValuesToReduce, a.firstLevelReduction);
2353  firstTime = 0;
2354 
2355  // sum the block probs
2356  } else {
2357  cudaDeviceSynchronize();
2358  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2359  a.firstLevelReduction,
2360  a.secondLevelReduction, valuesPerCUDABlock);
2361  cudaDeviceSynchronize();
2362  swapDouble(&(a.firstLevelReduction), &(a.secondLevelReduction));
2363  }
2364 
2365  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2366  }
2367 
2368  qreal trace;
2369  cudaMemcpy(&trace, a.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2370 
2371  qreal sqrtTrace = sqrt(trace);
2372  return sqrtTrace;
2373 }
2374 
2375 __global__ void densmatr_calcPurityKernel(qreal* vecReal, qreal* vecImag, long long int numAmpsToSum, qreal *reducedArray) {
2376 
2377  // figure out which density matrix term this thread is assigned
2378  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2379  if (index >= numAmpsToSum) return;
2380 
2381  qreal term = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
2382 
2383  // array of each thread's collected probability, to be summed
2384  extern __shared__ qreal tempReductionArray[];
2385  tempReductionArray[threadIdx.x] = term;
2386  __syncthreads();
2387 
2388  // every second thread reduces
2389  if (threadIdx.x<blockDim.x/2)
2390  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2391 }
2392 
2394 qreal densmatr_calcPurity(Qureg qureg) {
2395 
2396  // we're summing the square of every term in the density matrix
2397  long long int numValuesToReduce = qureg.numAmpsPerChunk;
2398 
2399  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2400  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2401  int firstTime = 1;
2402 
2403  while (numValuesToReduce > 1) {
2404 
2405  // need less than one CUDA-BLOCK to reduce
2406  if (numValuesToReduce < maxReducedPerLevel) {
2407  valuesPerCUDABlock = numValuesToReduce;
2408  numCUDABlocks = 1;
2409  }
2410  // otherwise use only full CUDA-BLOCKS
2411  else {
2412  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2413  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2414  }
2415  // dictates size of reduction array
2416  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2417 
2418  // spawn threads to sum the probs in each block
2419  if (firstTime) {
2420  densmatr_calcPurityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2421  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
2422  numValuesToReduce, qureg.firstLevelReduction);
2423  firstTime = 0;
2424 
2425  // sum the block probs
2426  } else {
2427  cudaDeviceSynchronize();
2428  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2429  qureg.firstLevelReduction,
2430  qureg.secondLevelReduction, valuesPerCUDABlock);
2431  cudaDeviceSynchronize();
2432  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
2433  }
2434 
2435  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2436  }
2437 
2438  qreal traceDensSquared;
2439  cudaMemcpy(&traceDensSquared, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2440  return traceDensSquared;
2441 }
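/* Editor's note (added; not in the original source): the purity kernel sums
 * re^2 + im^2 of every amplitude because, for Hermitian rho,
 *   Tr(rho^2) = sum_ij rho_ij rho_ji = sum_ij |rho_ij|^2
 * so a single pass over the flat density matrix suffices.
 */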
2442 
2443 __global__ void statevec_collapseToKnownProbOutcomeKernel(Qureg qureg, int measureQubit, int outcome, qreal totalProbability)
2444 {
2445  // ----- sizes
2446  long long int sizeBlock, // size of blocks
2447  sizeHalfBlock; // size of blocks halved
2448  // ----- indices
2449  long long int thisBlock, // current block
2450  index; // current index for first half block
2451  // ----- measured probability
2452  qreal renorm; // renormalising factor, 1/sqrt(outcome probability)
2453  // ----- temp variables
2454  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
2455  // (good for shared memory parallelism)
2456  long long int numTasks=qureg.numAmpsPerChunk>>1;
2457 
2458  // ---------------------------------------------------------------- //
2459  // dimensions //
2460  // ---------------------------------------------------------------- //
2461  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
2462  // and then the number to skip
2463  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
2464 
2465  // ---------------------------------------------------------------- //
2466  // find probability //
2467  // ---------------------------------------------------------------- //
2468 
2469  //
2470  // --- task-based shared-memory parallel implementation
2471  //
2472  renorm=1/sqrt(totalProbability);
2473  qreal *stateVecReal = qureg.deviceStateVec.real;
2474  qreal *stateVecImag = qureg.deviceStateVec.imag;
2475 
2476  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2477  if (thisTask>=numTasks) return;
2478  thisBlock = thisTask / sizeHalfBlock;
2479  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
2480 
2481  if (outcome==0){
2482  stateVecReal[index]=stateVecReal[index]*renorm;
2483  stateVecImag[index]=stateVecImag[index]*renorm;
2484 
2485  stateVecReal[index+sizeHalfBlock]=0;
2486  stateVecImag[index+sizeHalfBlock]=0;
2487  } else if (outcome==1){
2488  stateVecReal[index]=0;
2489  stateVecImag[index]=0;
2490 
2491  stateVecReal[index+sizeHalfBlock]=stateVecReal[index+sizeHalfBlock]*renorm;
2492  stateVecImag[index+sizeHalfBlock]=stateVecImag[index+sizeHalfBlock]*renorm;
2493  }
2494 }
2495 
2496 /*
2497  * outcomeProb must accurately be the probability of that qubit outcome in the state-vector, or
2498  * else the state-vector will lose normalisation
2499  */
2500 void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
2501 {
2502  int threadsPerCUDABlock, CUDABlocks;
2503  threadsPerCUDABlock = 128;
2504  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
2505  statevec_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, measureQubit, outcome, outcomeProb);
2506 }
2507 
2509 __global__ void densmatr_collapseToKnownProbOutcomeKernel(
2510  qreal outcomeProb, qreal* vecReal, qreal *vecImag, long long int numBasesToVisit,
2511  long long int part1, long long int part2, long long int part3,
2512  long long int rowBit, long long int colBit, long long int desired, long long int undesired)
2513 {
2514  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2515  if (scanInd >= numBasesToVisit) return;
2516 
2517  long long int base = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2518 
2519  // renormalise desired outcome
2520  vecReal[base + desired] /= outcomeProb;
2521  vecImag[base + desired] /= outcomeProb;
2522 
2523  // kill undesired outcome
2524  vecReal[base + undesired] = 0;
2525  vecImag[base + undesired] = 0;
2526 
2527  // kill |..0..><..1..| states
2528  vecReal[base + colBit] = 0;
2529  vecImag[base + colBit] = 0;
2530  vecReal[base + rowBit] = 0;
2531  vecImag[base + rowBit] = 0;
2532 }
2533 
2535 void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) {
2536 
2537  int rowQubit = measureQubit + qureg.numQubitsRepresented;
2538 
2539  long long int colBit = 1LL << measureQubit; // long long, so the shifted bit is never truncated
2540  long long int rowBit = 1LL << rowQubit;
2541 
2542  long long int numBasesToVisit = qureg.numAmpsPerChunk/4;
2543  long long int part1 = colBit -1;
2544  long long int part2 = (rowBit >> 1) - colBit;
2545  long long int part3 = numBasesToVisit - (rowBit >> 1);
2546 
2547  long long int desired, undesired;
2548  if (outcome == 0) {
2549  desired = 0;
2550  undesired = colBit | rowBit;
2551  } else {
2552  desired = colBit | rowBit;
2553  undesired = 0;
2554  }
2555 
2556  int threadsPerCUDABlock, CUDABlocks;
2557  threadsPerCUDABlock = 128;
2558  CUDABlocks = ceil(numBasesToVisit / (qreal) threadsPerCUDABlock);
2559  densmatr_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2560  outcomeProb, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBasesToVisit,
2561  part1, part2, part3, rowBit, colBit, desired, undesired);
2562 }
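/* Editor's note (added; not in the original source): part1/part2/part3
 * scatter a dense thread index around the two fixed bits. E.g. with
 * measureQubit = 1 and numQubitsRepresented = 2 (colBit = 0b0010,
 * rowBit = 0b1000):
 *   part1 = 0b0001 keeps bit 0 of scanInd in place,
 *   part2 = 0b0010 lifts bit 1 of scanInd past the column bit,
 *   part3 (zero in this tiny case) would lift any higher bits past rowBit,
 * so `base` enumerates exactly the indices with colBit = rowBit = 0, i.e.
 * one |..0..><..0..| entry per thread, from which the kernel reaches the
 * other three entries of the measured qubit's 2x2 sub-block.
 */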
2563 
2564 __global__ void densmatr_mixDensityMatrixKernel(Qureg combineQureg, qreal otherProb, Qureg otherQureg, long long int numAmpsToVisit) {
2565 
2566  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2567  if (ampInd >= numAmpsToVisit) return;
2568 
2569  combineQureg.deviceStateVec.real[ampInd] *= 1-otherProb;
2570  combineQureg.deviceStateVec.imag[ampInd] *= 1-otherProb;
2571 
2572  combineQureg.deviceStateVec.real[ampInd] += otherProb*otherQureg.deviceStateVec.real[ampInd];
2573  combineQureg.deviceStateVec.imag[ampInd] += otherProb*otherQureg.deviceStateVec.imag[ampInd];
2574 }
2575 
2576 void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg) {
2577 
2578  long long int numAmpsToVisit = combineQureg.numAmpsPerChunk;
2579 
2580  int threadsPerCUDABlock, CUDABlocks;
2581  threadsPerCUDABlock = 128;
2582  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2583  densmatr_mixDensityMatrixKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2584  combineQureg, otherProb, otherQureg, numAmpsToVisit
2585  );
2586 }
2587 
2593 __global__ void densmatr_mixDephasingKernel(
2594  qreal fac, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
2595  long long int part1, long long int part2, long long int part3,
2596  long long int colBit, long long int rowBit)
2597 {
2598  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2599  if (scanInd >= numAmpsToVisit) return;
2600 
2601  long long int ampInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2602  vecReal[ampInd + colBit] *= fac;
2603  vecImag[ampInd + colBit] *= fac;
2604  vecReal[ampInd + rowBit] *= fac;
2605  vecImag[ampInd + rowBit] *= fac;
2606 }
2607 
2608 
2609 void densmatr_oneQubitDegradeOffDiagonal(Qureg qureg, int targetQubit, qreal dephFac) {
2610 
2611  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2612 
2613  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2614  long long int colBit = 1LL << targetQubit;
2615  long long int rowBit = 1LL << rowQubit;
2616 
2617  long long int part1 = colBit - 1;
2618  long long int part2 = (rowBit >> 1) - colBit;
2619  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2620 
2621  int threadsPerCUDABlock, CUDABlocks;
2622  threadsPerCUDABlock = 128;
2623  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2624  densmatr_mixDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2625  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2626  part1, part2, part3, colBit, rowBit);
2627 }
2628 
2629 void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) {
2630 
2631  if (dephase == 0)
2632  return;
2633 
2634  qreal dephFac = 1 - dephase;
2635  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephFac);
2636 }
2637 
2644 __global__ void densmatr_mixTwoQubitDephasingKernel(
2645  qreal fac, qreal* vecReal, qreal *vecImag, long long int numBackgroundStates, long long int numAmpsToVisit,
2646  long long int part1, long long int part2, long long int part3, long long int part4, long long int part5,
2647  long long int colBit1, long long int rowBit1, long long int colBit2, long long int rowBit2)
2648 {
2649  long long int outerInd = blockIdx.x*blockDim.x + threadIdx.x;
2650  if (outerInd >= numAmpsToVisit) return;
2651 
2652  // sets meta in 1...14 excluding 5, 10, creating bit string DCBA for |..D..C..><..B..A|
2653  int meta = 1 + (outerInd/numBackgroundStates);
2654  if (meta > 4) meta++;
2655  if (meta > 9) meta++;
2656 
2657  long long int shift = rowBit2*((meta>>3)%2) + rowBit1*((meta>>2)%2) + colBit2*((meta>>1)%2) + colBit1*(meta%2);
2658  long long int scanInd = outerInd % numBackgroundStates;
2659  long long int stateInd = (
2660  shift +
2661  (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4));
2662 
2663  vecReal[stateInd] *= fac;
2664  vecImag[stateInd] *= fac;
2665 }
2666 
2667 // @TODO is separating these 12 amplitudes really faster than letting every 16th base modify 12 elems?
2668 void densmatr_mixTwoQubitDephasing(Qureg qureg, int qubit1, int qubit2, qreal dephase) {
2669 
2670  if (dephase == 0)
2671  return;
2672 
2673  // assumes qubit2 > qubit1
2674 
2675  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
2676  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
2677 
2678  long long int colBit1 = 1LL << qubit1;
2679  long long int rowBit1 = 1LL << rowQubit1;
2680  long long int colBit2 = 1LL << qubit2;
2681  long long int rowBit2 = 1LL << rowQubit2;
2682 
2683  long long int part1 = colBit1 - 1;
2684  long long int part2 = (colBit2 >> 1) - colBit1;
2685  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
2686  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
2687  long long int part5 = (qureg.numAmpsPerChunk/16) - (rowBit2 >> 3);
2688  qreal dephFac = 1 - dephase;
2689 
2690  // refers to states |a 0 b 0 c><d 0 e 0 f| (target qubits are fixed)
2691  long long int numBackgroundStates = qureg.numAmpsPerChunk/16;
2692 
2693  // 12 of these states experience dephasing
2694  long long int numAmpsToVisit = 12 * numBackgroundStates;
2695 
2696  int threadsPerCUDABlock, CUDABlocks;
2697  threadsPerCUDABlock = 128;
2698  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2699  densmatr_mixTwoQubitDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2700  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBackgroundStates, numAmpsToVisit,
2701  part1, part2, part3, part4, part5, colBit1, rowBit1, colBit2, rowBit2);
2702 }
2703 
2705 __global__ void densmatr_mixDepolarisingKernel(
2706  qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
2707  long long int part1, long long int part2, long long int part3,
2708  long long int bothBits)
2709 {
2710  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2711  if (scanInd >= numAmpsToVisit) return;
2712 
2713  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2714  long long int targetInd = baseInd + bothBits;
2715 
2716  qreal realAvDepol = depolLevel * 0.5 * (vecReal[baseInd] + vecReal[targetInd]);
2717  qreal imagAvDepol = depolLevel * 0.5 * (vecImag[baseInd] + vecImag[targetInd]);
2718 
2719  vecReal[baseInd] *= 1 - depolLevel;
2720  vecImag[baseInd] *= 1 - depolLevel;
2721  vecReal[targetInd] *= 1 - depolLevel;
2722  vecImag[targetInd] *= 1 - depolLevel;
2723 
2724  vecReal[baseInd] += realAvDepol;
2725  vecImag[baseInd] += imagAvDepol;
2726  vecReal[targetInd] += realAvDepol;
2727  vecImag[targetInd] += imagAvDepol;
2728 }
2729 
2731 __global__ void densmatr_mixDampingKernel(
2732  qreal damping, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
2733  long long int part1, long long int part2, long long int part3,
2734  long long int bothBits)
2735 {
2736  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2737  if (scanInd >= numAmpsToVisit) return;
2738 
2739  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2740  long long int targetInd = baseInd + bothBits;
2741 
2742  qreal realAvDepol = damping * ( vecReal[targetInd]);
2743  qreal imagAvDepol = damping * ( vecImag[targetInd]);
2744 
2745  vecReal[targetInd] *= 1 - damping;
2746  vecImag[targetInd] *= 1 - damping;
2747 
2748  vecReal[baseInd] += realAvDepol;
2749  vecImag[baseInd] += imagAvDepol;
2750 }
2751 
2752 void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depolLevel) {
2753 
2754  if (depolLevel == 0)
2755  return;
2756 
2757  densmatr_mixDephasing(qureg, targetQubit, depolLevel);
2758 
2759  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2760  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2761 
2762  long long int colBit = 1LL << targetQubit;
2763  long long int rowBit = 1LL << rowQubit;
2764  long long int bothBits = colBit | rowBit;
2765 
2766  long long int part1 = colBit - 1;
2767  long long int part2 = (rowBit >> 1) - colBit;
2768  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2769 
2770  int threadsPerCUDABlock, CUDABlocks;
2771  threadsPerCUDABlock = 128;
2772  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2773  densmatr_mixDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2774  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2775  part1, part2, part3, bothBits);
2776 }
2777 
2778 void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) {
2779 
2780  if (damping == 0)
2781  return;
2782 
2783  qreal dephase = sqrt(1-damping);
2784  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephase);
2785 
2786  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2787  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2788 
2789  long long int colBit = 1LL << targetQubit;
2790  long long int rowBit = 1LL << rowQubit;
2791  long long int bothBits = colBit | rowBit;
2792 
2793  long long int part1 = colBit - 1;
2794  long long int part2 = (rowBit >> 1) - colBit;
2795  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2796 
2797  int threadsPerCUDABlock, CUDABlocks;
2798  threadsPerCUDABlock = 128;
2799  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2800  densmatr_mixDampingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2801  damping, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2802  part1, part2, part3, bothBits);
2803 }
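/* Editor's note (added; not in the original source): densmatr_mixDamping
 * realises the single-qubit amplitude-damping channel, whose Kraus operators
 * are K0 = [[1, 0], [0, sqrt(1-damping)]] and K1 = [[0, sqrt(damping)], [0, 0]]:
 * the off-diagonal |0><1| and |1><0| blocks shrink by sqrt(1-damping) (done
 * by densmatr_oneQubitDegradeOffDiagonal above), while the kernel shifts a
 * `damping` fraction of the |1><1| population onto |0><0|.
 */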
2804 
2806 __global__ void densmatr_mixTwoQubitDepolarisingKernel(
2807  qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
2808  long long int part1, long long int part2, long long int part3,
2809  long long int part4, long long int part5,
2810  long long int rowCol1, long long int rowCol2)
2811 {
2812  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2813  if (scanInd >= numAmpsToVisit) return;
2814 
2815  // index of |..0..0..><..0..0|
2816  long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4);
2817  long long int ind01 = ind00 + rowCol1;
2818  long long int ind10 = ind00 + rowCol2;
2819  long long int ind11 = ind00 + rowCol1 + rowCol2;
2820 
2821  qreal realAvDepol = depolLevel * 0.25 * (
2822  vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]);
2823  qreal imagAvDepol = depolLevel * 0.25 * (
2824  vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]);
2825 
2826  qreal retain = 1 - depolLevel;
2827  vecReal[ind00] *= retain; vecImag[ind00] *= retain;
2828  vecReal[ind01] *= retain; vecImag[ind01] *= retain;
2829  vecReal[ind10] *= retain; vecImag[ind10] *= retain;
2830  vecReal[ind11] *= retain; vecImag[ind11] *= retain;
2831 
2832  vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol;
2833  vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol;
2834  vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol;
2835  vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol;
2836 }
2837 
2838 void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) {
2839 
2840  if (depolLevel == 0)
2841  return;
2842 
2843  // assumes qubit2 > qubit1
2844 
2845  densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel);
2846 
2847  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
2848  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
2849 
2850  long long int colBit1 = 1LL << qubit1;
2851  long long int rowBit1 = 1LL << rowQubit1;
2852  long long int colBit2 = 1LL << qubit2;
2853  long long int rowBit2 = 1LL << rowQubit2;
2854 
2855  long long int rowCol1 = colBit1 | rowBit1;
2856  long long int rowCol2 = colBit2 | rowBit2;
2857 
2858  long long int numAmpsToVisit = qureg.numAmpsPerChunk/16;
2859  long long int part1 = colBit1 - 1;
2860  long long int part2 = (colBit2 >> 1) - colBit1;
2861  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
2862  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
2863  long long int part5 = numAmpsToVisit - (rowBit2 >> 3);
2864 
2865  int threadsPerCUDABlock, CUDABlocks;
2866  threadsPerCUDABlock = 128;
2867  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2868  densmatr_mixTwoQubitDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2869  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2870  part1, part2, part3, part4, part5, rowCol1, rowCol2);
2871 }
2872 
2873 __global__ void statevec_setWeightedQuregKernel(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) {
2874 
2875  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2876  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
2877  if (ampInd >= numAmpsToVisit) return;
2878 
2879  qreal *vecRe1 = qureg1.deviceStateVec.real;
2880  qreal *vecIm1 = qureg1.deviceStateVec.imag;
2881  qreal *vecRe2 = qureg2.deviceStateVec.real;
2882  qreal *vecIm2 = qureg2.deviceStateVec.imag;
2883  qreal *vecReOut = out.deviceStateVec.real;
2884  qreal *vecImOut = out.deviceStateVec.imag;
2885 
2886  qreal facRe1 = fac1.real;
2887  qreal facIm1 = fac1.imag;
2888  qreal facRe2 = fac2.real;
2889  qreal facIm2 = fac2.imag;
2890  qreal facReOut = facOut.real;
2891  qreal facImOut = facOut.imag;
2892 
2893  qreal re1,im1, re2,im2, reOut,imOut;
2894  long long int index = ampInd;
2895 
2896  re1 = vecRe1[index]; im1 = vecIm1[index];
2897  re2 = vecRe2[index]; im2 = vecIm2[index];
2898  reOut = vecReOut[index];
2899  imOut = vecImOut[index];
2900 
2901  vecReOut[index] = (facReOut*reOut - facImOut*imOut) + (facRe1*re1 - facIm1*im1) + (facRe2*re2 - facIm2*im2);
2902  vecImOut[index] = (facReOut*imOut + facImOut*reOut) + (facRe1*im1 + facIm1*re1) + (facRe2*im2 + facIm2*re2);
2903 }
2904 
2905 void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) {
2906 
2907  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
2908 
2909  int threadsPerCUDABlock, CUDABlocks;
2910  threadsPerCUDABlock = 128;
2911  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2912  statevec_setWeightedQuregKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2913  fac1, qureg1, fac2, qureg2, facOut, out
2914  );
2915 }
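/* Editor's note (added; not in the original source): a hypothetical host-side
 * use of statevec_setWeightedQureg, forming out = (q1 + q2)/2 amplitude-wise
 * (q1, q2, out are assumed compatible, already-created state-vectors; no
 * normalisation is performed):
 *
 *   Complex half; half.real = 0.5; half.imag = 0;
 *   Complex zero; zero.real = 0;   zero.imag = 0;
 *   statevec_setWeightedQureg(half, q1, half, q2, zero, out);
 */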
2916 
2917 __global__ void statevec_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op) {
2918 
2919  // each thread modifies one value; a wasteful and inefficient strategy
2920  long long int numTasks = qureg.numAmpsPerChunk;
2921  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2922  if (thisTask >= numTasks) return;
2923 
2924  qreal* stateRe = qureg.deviceStateVec.real;
2925  qreal* stateIm = qureg.deviceStateVec.imag;
2926  qreal* opRe = op.deviceOperator.real;
2927  qreal* opIm = op.deviceOperator.imag;
2928 
2929  qreal a = stateRe[thisTask];
2930  qreal b = stateIm[thisTask];
2931  qreal c = opRe[thisTask];
2932  qreal d = opIm[thisTask];
2933 
2934  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
2935  stateRe[thisTask] = a*c - b*d;
2936  stateIm[thisTask] = a*d + b*c;
2937 }
2938 
2939 void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op)
2940 {
2941  int threadsPerCUDABlock, CUDABlocks;
2942  threadsPerCUDABlock = 128;
2943  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
2944  statevec_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
2945 }
2946 
2947 __global__ void densmatr_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op) {
2948 
2949  // each thread modifies one value; a wasteful and inefficient strategy
2950  long long int numTasks = qureg.numAmpsPerChunk;
2951  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2952  if (thisTask >= numTasks) return;
2953 
2954  qreal* stateRe = qureg.deviceStateVec.real;
2955  qreal* stateIm = qureg.deviceStateVec.imag;
2956  qreal* opRe = op.deviceOperator.real;
2957  qreal* opIm = op.deviceOperator.imag;
2958 
2959  int opDim = (1 << op.numQubits);
2960  qreal a = stateRe[thisTask];
2961  qreal b = stateIm[thisTask];
2962  qreal c = opRe[thisTask % opDim];
2963  qreal d = opIm[thisTask % opDim];
2964 
2965  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
2966  stateRe[thisTask] = a*c - b*d;
2967  stateIm[thisTask] = a*d + b*c;
2968 }
2969 
2970 void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) {
2971 
2972  int threadsPerCUDABlock, CUDABlocks;
2973  threadsPerCUDABlock = 128;
2974  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
2975  densmatr_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
2976 }
2977 
2979 __global__ void statevec_calcExpecDiagonalOpKernel(
2980  int getRealComp,
2981  qreal* vecReal, qreal* vecImag, qreal* opReal, qreal* opImag,
2982  long long int numTermsToSum, qreal* reducedArray)
2983 {
2984  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2985  if (index >= numTermsToSum) return;
2986 
2987  qreal vecAbs = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
2988 
2989  // choose whether to calculate the real or imaginary term of the expec term
2990  qreal expecVal;
2991  if (getRealComp)
2992  expecVal = vecAbs * opReal[index];
2993  else
2994  expecVal = vecAbs * opImag[index];
2995 
2996  // array of each thread's collected sum term, to be summed
2997  extern __shared__ qreal tempReductionArray[];
2998  tempReductionArray[threadIdx.x] = expecVal;
2999  __syncthreads();
3000 
3001  // every second thread reduces
3002  if (threadIdx.x<blockDim.x/2)
3003  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3004 }
3005 
3006 Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) {
3007 
3008  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3009  * (e.g. a func which accepts a pointer to do every-value reduction?)
3010  */
3011 
3012  qreal expecReal, expecImag;
3013 
3014  int getRealComp;
3015  long long int numValuesToReduce;
3016  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3017  int maxReducedPerLevel;
3018  int firstTime;
3019 
3020  // compute real component of the expectation value
3021  getRealComp = 1;
3022  numValuesToReduce = qureg.numAmpsPerChunk;
3023  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3024  firstTime = 1;
3025  while (numValuesToReduce > 1) {
3026  if (numValuesToReduce < maxReducedPerLevel) {
3027  valuesPerCUDABlock = numValuesToReduce;
3028  numCUDABlocks = 1;
3029  }
3030  else {
3031  valuesPerCUDABlock = maxReducedPerLevel;
3032  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3033  }
3034  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3035  if (firstTime) {
3036  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3037  getRealComp,
3038  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3039  op.deviceOperator.real, op.deviceOperator.imag,
3040  numValuesToReduce,
3041  qureg.firstLevelReduction);
3042  firstTime = 0;
3043  } else {
3044  cudaDeviceSynchronize();
3045  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3046  qureg.firstLevelReduction,
3047  qureg.secondLevelReduction, valuesPerCUDABlock);
3048  cudaDeviceSynchronize();
3049  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3050  }
3051  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3052  }
3053  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3054 
3055  // compute imag component of the expectation value
3056  getRealComp = 0;
3057  numValuesToReduce = qureg.numAmpsPerChunk;
3058  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3059  firstTime = 1;
3060  while (numValuesToReduce > 1) {
3061  if (numValuesToReduce < maxReducedPerLevel) {
3062  valuesPerCUDABlock = numValuesToReduce;
3063  numCUDABlocks = 1;
3064  }
3065  else {
3066  valuesPerCUDABlock = maxReducedPerLevel;
3067  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3068  }
3069  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3070  if (firstTime) {
3071  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3072  getRealComp,
3073  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3074  op.deviceOperator.real, op.deviceOperator.imag,
3075  numValuesToReduce,
3076  qureg.firstLevelReduction);
3077  firstTime = 0;
3078  } else {
3079  cudaDeviceSynchronize();
3080  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3081  qureg.firstLevelReduction,
3082  qureg.secondLevelReduction, valuesPerCUDABlock);
3083  cudaDeviceSynchronize();
3084  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3085  }
3086  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3087  }
3088  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3089 
3090  // return complex
3091  Complex expecVal;
3092  expecVal.real = expecReal;
3093  expecVal.imag = expecImag;
3094  return expecVal;
3095 }
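/* As the @TODO above notes, this reduction loop is duplicated throughout the
 * file. A minimal sketch of the helper it hints at, reusing the existing
 * copySharedReduceBlock kernel and the qureg's two reduction buffers. The
 * name reduceAllTerms is hypothetical (not part of QuEST), and the sketch
 * assumes the per-thread terms already sit in a device array whose length is
 * a power of 2, as qureg.numAmpsPerChunk always is.
 */
qreal reduceAllTerms(Qureg qureg, qreal* deviceTerms, long long int numValuesToReduce) {
    qreal* in = deviceTerms;
    qreal* out = qureg.firstLevelReduction;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    while (numValuesToReduce > 1) {
        int valuesPerCUDABlock, numCUDABlocks;
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = (int) numValuesToReduce;
            numCUDABlocks = 1;
        } else {
            valuesPerCUDABlock = maxReducedPerLevel;
            numCUDABlocks = ceil((qreal) numValuesToReduce / valuesPerCUDABlock);
        }
        int sharedMemSize = valuesPerCUDABlock * sizeof(qreal);

        // each block reduces valuesPerCUDABlock values of 'in' into one value of 'out'
        cudaDeviceSynchronize();
        copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
            in, out, valuesPerCUDABlock);
        cudaDeviceSynchronize();

        // the outputs of this level become the inputs of the next
        in = out;
        out = (out == qureg.firstLevelReduction)?
            qureg.secondLevelReduction : qureg.firstLevelReduction;
        numValuesToReduce = numValuesToReduce / maxReducedPerLevel;
    }
    qreal total;
    cudaMemcpy(&total, in, sizeof(qreal), cudaMemcpyDeviceToHost);
    return total;
}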
3096 
3097 __global__ void densmatr_calcExpecDiagonalOpKernel(
3098  int getRealComp,
3099  qreal* matReal, qreal* matImag, qreal* opReal, qreal* opImag,
3100  int numQubits, long long int numTermsToSum, qreal* reducedArray)
3101 {
3102  /** if the thread represents a diagonal op, then it computes either a
3103  * real or imag term of matr_{ii} op_i. Otherwise, it writes a 0 to the
3104  * reduction array
3105  */
3106 
3107  // index will identify one of the 2^Q diagonals to be summed
3108  long long int matInd = blockIdx.x*blockDim.x + threadIdx.x;
3109  if (matInd >= numTermsToSum) return;
3110 
3111  long long int diagSpacing = (1LL << numQubits) + 1LL;
3112  int isDiag = ((matInd % diagSpacing) == 0);
3113 
3114  long long int opInd = matInd / diagSpacing;
3115 
3116  qreal val = 0;
3117  if (isDiag) {
3118 
3119  qreal matRe = matReal[matInd];
3120  qreal matIm = matImag[matInd];
3121  qreal opRe = opReal[opInd];
3122  qreal opIm = opImag[opInd];
3123 
3124  // (matRe + matIm i)(opRe + opIm i) =
3125  // (matRe opRe - matIm opIm) + i (matRe opIm + matIm opRe)
3126  if (getRealComp)
3127  val = matRe * opRe - matIm * opIm;
3128  else
3129  val = matRe * opIm + matIm * opRe;
3130  }
3131 
3132  // array of each thread's collected sum term, to be summed
3133  extern __shared__ qreal tempReductionArray[];
3134  tempReductionArray[threadIdx.x] = val;
3135  __syncthreads();
3136 
3137  // the first half of the threads performs the tree reduction
3138  if (threadIdx.x<blockDim.x/2)
3139  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3140 }
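// In summary: a 2^Q x 2^Q density matrix is stored as a flat array of 2^(2Q)
// amplitudes, so its diagonal entries occur every 2^Q + 1 elements; this is
// the matInd % diagSpacing test above. E.g. for Q = 2, diagSpacing = 5 and
// the diagonal elements sit at matInd = 0, 5, 10, 15, mapping to
// opInd = 0, 1, 2, 3. The kernel thereby computes one (real or imaginary)
// term of Tr(rho D) = sum_i rho_ii D_i.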
3141 
3142 Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) {
3143 
3144  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3145  * (e.g. a func which accepts a pointer to do every-value reduction?)
3146  */
3147 
3148  qreal expecReal, expecImag;
3149 
3150  int getRealComp;
3151  long long int numValuesToReduce;
3152  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3153  int maxReducedPerLevel;
3154  int firstTime;
3155 
3156  // compute real component of the expectation value
3157  getRealComp = 1;
3158  numValuesToReduce = qureg.numAmpsPerChunk;
3159  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3160  firstTime = 1;
3161  while (numValuesToReduce > 1) {
3162  if (numValuesToReduce < maxReducedPerLevel) {
3163  valuesPerCUDABlock = numValuesToReduce;
3164  numCUDABlocks = 1;
3165  }
3166  else {
3167  valuesPerCUDABlock = maxReducedPerLevel;
3168  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3169  }
3170  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3171  if (firstTime) {
3172  densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3173  getRealComp,
3174  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3175  op.deviceOperator.real, op.deviceOperator.imag,
3176  op.numQubits, numValuesToReduce,
3177  qureg.firstLevelReduction);
3178  firstTime = 0;
3179  } else {
3180  cudaDeviceSynchronize();
3181  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3182  qureg.firstLevelReduction,
3183  qureg.secondLevelReduction, valuesPerCUDABlock);
3184  cudaDeviceSynchronize();
3185  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3186  }
3187  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3188  }
3189  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3190 
3191  // compute imag component of the expectation value
3192  getRealComp = 0;
3193  numValuesToReduce = qureg.numAmpsPerChunk;
3194  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3195  firstTime = 1;
3196  while (numValuesToReduce > 1) {
3197  if (numValuesToReduce < maxReducedPerLevel) {
3198  valuesPerCUDABlock = numValuesToReduce;
3199  numCUDABlocks = 1;
3200  }
3201  else {
3202  valuesPerCUDABlock = maxReducedPerLevel;
3203  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3204  }
3205  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3206  if (firstTime) {
3207  densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3208  getRealComp,
3209  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3210  op.deviceOperator.real, op.deviceOperator.imag,
3211  op.numQubits, numValuesToReduce,
3212  qureg.firstLevelReduction);
3213  firstTime = 0;
3214  } else {
3215  cudaDeviceSynchronize();
3216  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3217  qureg.firstLevelReduction,
3218  qureg.secondLevelReduction, valuesPerCUDABlock);
3219  cudaDeviceSynchronize();
3220  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3221  }
3222  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3223  }
3224  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3225 
3226  // return complex
3227  Complex expecVal;
3228  expecVal.real = expecReal;
3229  expecVal.imag = expecImag;
3230  return expecVal;
3231 }
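// Note: with a helper like the reduceAllTerms sketch above, this routine and
// its statevector twin would each shrink to a kernel launch writing the
// per-thread terms to a device array, followed by one helper call per
// component.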
3232 
3233 void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) {
3234 
3235  // update both RAM and VRAM, for consistency
3236  memcpy(&op.real[startInd], real, numElems * sizeof(qreal));
3237  memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal));
3238 
3239  cudaDeviceSynchronize();
3240  cudaMemcpy(
3241  op.deviceOperator.real + startInd,
3242  real,
3243  numElems * sizeof(*(op.deviceOperator.real)),
3244  cudaMemcpyHostToDevice);
3245  cudaMemcpy(
3246  op.deviceOperator.imag + startInd,
3247  imag,
3248  numElems * sizeof(*(op.deviceOperator.imag)),
3249  cudaMemcpyHostToDevice);
3250 }
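/* Example usage, as a sketch (demoSetPhases and its variables are
 * illustrative only, not part of the QuEST API): overwrite the first four
 * elements of an existing diagonal op with the unit phases 1, i, -1, -i,
 * updating the RAM and VRAM copies together as above.
 */
void demoSetPhases(DiagonalOp op) {
    qreal re[] = {1, 0, -1, 0};
    qreal im[] = {0, 1, 0, -1};
    agnostic_setDiagonalOpElems(op, 0, re, im, 4);
}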
3251 
3252 void seedQuESTDefault(){
3253  // init MT random number generator with two keys -- time and pid
3254  // for the MPI version, it is ok that all procs will get the same seed as random numbers will only be
3255  // used by the master process
3256 
3257  unsigned long int key[2];
3258  getQuESTDefaultSeedKey(key);
3259  init_by_array(key, 2);
3260 }
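/* For reproducible runs, the same Mersenne Twister can instead be seeded
 * with fixed keys; a minimal sketch (seedQuESTFixed and its key values are
 * illustrative, not part of the QuEST API):
 */
void seedQuESTFixed(void) {
    unsigned long int keys[2] = {12345UL, 67890UL};
    init_by_array(keys, 2);
}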
3261 
3262 
3263 
3264 
3265 #ifdef __cplusplus
3266 }
3267 #endif