QuEST_gpu.cu File Reference
#include "QuEST.h"
#include "QuEST_precision.h"
#include "QuEST_internal.h"
#include "mt19937ar.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

Go to the source code of this file.

Macros

#define DEBUG   0
 
#define REDUCE_SHARED_SIZE   512
 

Functions

DiagonalOp agnostic_createDiagonalOp (int numQubits, QuESTEnv env)
 
void agnostic_destroyDiagonalOp (DiagonalOp op)
 
void agnostic_setDiagonalOpElems (DiagonalOp op, long long int startInd, qreal *real, qreal *imag, long long int numElems)
 
void agnostic_syncDiagonalOp (DiagonalOp op)
 
__global__ void copySharedReduceBlock (qreal *arrayIn, qreal *reducedArray, int length)
 
void copyStateFromGPU (Qureg qureg)
 In GPU mode, this copies the state-vector (or density matrix) from GPU memory (qureg.deviceStateVec) to RAM (qureg.stateVec), where it can be accessed/modified by the user. More...
 
void copyStateToGPU (Qureg qureg)
 In GPU mode, this copies the state-vector (or density matrix) from RAM (qureg.stateVec) to VRAM / GPU-memory (qureg.deviceStateVec), which is the version operated upon by other calls to the API. More...
 
QuESTEnv createQuESTEnv (void)
 Create the QuEST execution environment. More...
 
void densmatr_applyDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void densmatr_applyDiagonalOpKernel (Qureg qureg, DiagonalOp op)
 
Complex densmatr_calcExpecDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void densmatr_calcExpecDiagonalOpKernel (int getRealComp, qreal *matReal, qreal *matImag, qreal *opReal, qreal *opImag, int numQubits, long long int numTermsToSum, qreal *reducedArray)
 
qreal densmatr_calcFidelity (Qureg qureg, Qureg pureState)
 
__global__ void densmatr_calcFidelityKernel (Qureg dens, Qureg vec, long long int dim, qreal *reducedArray)
 computes one term of (vec^*T) dens * vec More...
 
qreal densmatr_calcHilbertSchmidtDistance (Qureg a, Qureg b)
 
__global__ void densmatr_calcHilbertSchmidtDistanceSquaredKernel (qreal *aRe, qreal *aIm, qreal *bRe, qreal *bIm, long long int numAmpsToSum, qreal *reducedArray)
 
qreal densmatr_calcInnerProduct (Qureg a, Qureg b)
 
__global__ void densmatr_calcInnerProductKernel (Qureg a, Qureg b, long long int numTermsToSum, qreal *reducedArray)
 computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number More...
 
qreal densmatr_calcProbOfOutcome (Qureg qureg, int measureQubit, int outcome)
 
qreal densmatr_calcPurity (Qureg qureg)
 Computes the trace of the density matrix squared. More...
 
__global__ void densmatr_calcPurityKernel (qreal *vecReal, qreal *vecImag, long long int numAmpsToSum, qreal *reducedArray)
 
qreal densmatr_calcTotalProb (Qureg qureg)
 
void densmatr_collapseToKnownProbOutcome (Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
 This involves finding |...i...><...j...| states and killing those where i!=j. More...
 
__global__ void densmatr_collapseToKnownProbOutcomeKernel (qreal outcomeProb, qreal *vecReal, qreal *vecImag, long long int numBasesToVisit, long long int part1, long long int part2, long long int part3, long long int rowBit, long long int colBit, long long int desired, long long int undesired)
 Maps thread ID to a |..0..><..0..| state and then locates |0><1|, |1><0| and |1><1|. More...
 
qreal densmatr_findProbabilityOfZero (Qureg qureg, int measureQubit)
 
__global__ void densmatr_findProbabilityOfZeroKernel (Qureg qureg, int measureQubit, qreal *reducedArray)
 
void densmatr_initClassicalState (Qureg qureg, long long int stateInd)
 
__global__ void densmatr_initClassicalStateKernel (long long int densityNumElems, qreal *densityReal, qreal *densityImag, long long int densityInd)
 
void densmatr_initPlusState (Qureg qureg)
 
__global__ void densmatr_initPlusStateKernel (long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag)
 
void densmatr_initPureState (Qureg targetQureg, Qureg copyQureg)
 
__global__ void densmatr_initPureStateKernel (long long int numPureAmps, qreal *targetVecReal, qreal *targetVecImag, qreal *copyVecReal, qreal *copyVecImag)
 
void densmatr_mixDamping (Qureg qureg, int targetQubit, qreal damping)
 
__global__ void densmatr_mixDampingKernel (qreal damping, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int bothBits)
 Works like mixDephasing but modifies every other element, and elements are averaged in pairs. More...
 
void densmatr_mixDensityMatrix (Qureg combineQureg, qreal otherProb, Qureg otherQureg)
 
__global__ void densmatr_mixDensityMatrixKernel (Qureg combineQureg, qreal otherProb, Qureg otherQureg, long long int numAmpsToVisit)
 
void densmatr_mixDephasing (Qureg qureg, int targetQubit, qreal dephase)
 
__global__ void densmatr_mixDephasingKernel (qreal fac, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int colBit, long long int rowBit)
 Called once for every 4 amplitudes in the density matrix. Works by establishing the |..0..><..0..| state (for its given index) then visiting |..1..><..0..| and |..0..><..1..|. More...
 
void densmatr_mixDepolarising (Qureg qureg, int targetQubit, qreal depolLevel)
 
__global__ void densmatr_mixDepolarisingKernel (qreal depolLevel, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int bothBits)
 Works like mixDephasing but modifies every other element, and elements are averaged in pairs. More...
 
void densmatr_mixTwoQubitDephasing (Qureg qureg, int qubit1, int qubit2, qreal dephase)
 
__global__ void densmatr_mixTwoQubitDephasingKernel (qreal fac, qreal *vecReal, qreal *vecImag, long long int numBackgroundStates, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int part4, long long int part5, long long int colBit1, long long int rowBit1, long long int colBit2, long long int rowBit2)
 Called 12 times for every 16 amplitudes in the density matrix. Each sums from the |..0..0..><..0..0..| index to visit either |..0..0..><..0..1..|, |..0..0..><..1..0..|, |..0..0..><..1..1..|, |..0..1..><..0..0..| and so on up to |..1..1..><..1..0..|. More...
 
void densmatr_mixTwoQubitDepolarising (Qureg qureg, int qubit1, int qubit2, qreal depolLevel)
 
__global__ void densmatr_mixTwoQubitDepolarisingKernel (qreal depolLevel, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int part4, long long int part5, long long int rowCol1, long long int rowCol2)
 Called once for every 16 amplitudes. More...
 
void densmatr_oneQubitDegradeOffDiagonal (Qureg qureg, int targetQubit, qreal dephFac)
 
void destroyQuESTEnv (QuESTEnv env)
 Destroy the QuEST environment. More...
 
__forceinline__ __device__ int extractBit (const int locationOfBitFromRight, const long long int theEncodedNumber)
 
__forceinline__ __device__ long long int flipBit (const long long int number, const int bitInd)
 
__forceinline__ __device__ int getBitMaskParity (long long int mask)
 
void getEnvironmentString (QuESTEnv env, Qureg qureg, char str[200])
 Sets str to a string containing the number of qubits in qureg, and the hardware facilities used (e.g. More...
 
int getNumReductionLevels (long long int numValuesToReduce, int numReducedPerLevel)
 
int GPUExists (void)
 
__forceinline__ __device__ long long int insertTwoZeroBits (const long long int number, const int bit1, const int bit2)
 
__forceinline__ __device__ long long int insertZeroBit (const long long int number, const int index)
 
__forceinline__ __device__ long long int insertZeroBits (long long int number, int *inds, const int numInds)
 
__device__ __host__ unsigned int log2Int (unsigned int x)
 
__device__ void reduceBlock (qreal *arrayIn, qreal *reducedArray, int length)
 
void reportQuESTEnv (QuESTEnv env)
 Report information about the QuEST environment. More...
 
void seedQuESTDefault ()
 Seed the Mersenne Twister used for random number generation in the QuEST environment with an example default seed. More...
 
void statevec_applyDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void statevec_applyDiagonalOpKernel (Qureg qureg, DiagonalOp op)
 
Complex statevec_calcExpecDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void statevec_calcExpecDiagonalOpKernel (int getRealComp, qreal *vecReal, qreal *vecImag, qreal *opReal, qreal *opImag, long long int numTermsToSum, qreal *reducedArray)
 computes either a real or imag term of |vec_i|^2 op_i More...
 
Complex statevec_calcInnerProduct (Qureg bra, Qureg ket)
 Terrible code which unnecessarily individually computes and sums the real and imaginary components of the inner product, so as to not have to worry about keeping the sums separated during reduction. More...
 
__global__ void statevec_calcInnerProductKernel (int getRealComp, qreal *vecReal1, qreal *vecImag1, qreal *vecReal2, qreal *vecImag2, long long int numTermsToSum, qreal *reducedArray)
 computes either a real or imag term in the inner product More...
 
qreal statevec_calcProbOfOutcome (Qureg qureg, int measureQubit, int outcome)
 
qreal statevec_calcTotalProb (Qureg qureg)
 
void statevec_cloneQureg (Qureg targetQureg, Qureg copyQureg)
 works for both statevectors and density matrices More...
 
void statevec_collapseToKnownProbOutcome (Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
 
__global__ void statevec_collapseToKnownProbOutcomeKernel (Qureg qureg, int measureQubit, int outcome, qreal totalProbability)
 
void statevec_compactUnitary (Qureg qureg, int targetQubit, Complex alpha, Complex beta)
 
__global__ void statevec_compactUnitaryKernel (Qureg qureg, int rotQubit, Complex alpha, Complex beta)
 
int statevec_compareStates (Qureg mq1, Qureg mq2, qreal precision)
 
void statevec_controlledCompactUnitary (Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
 
__global__ void statevec_controlledCompactUnitaryKernel (Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
 
void statevec_controlledNot (Qureg qureg, int controlQubit, int targetQubit)
 
__global__ void statevec_controlledNotKernel (Qureg qureg, int controlQubit, int targetQubit)
 
void statevec_controlledPauliY (Qureg qureg, int controlQubit, int targetQubit)
 
void statevec_controlledPauliYConj (Qureg qureg, int controlQubit, int targetQubit)
 
__global__ void statevec_controlledPauliYKernel (Qureg qureg, int controlQubit, int targetQubit, int conjFac)
 
void statevec_controlledPhaseFlip (Qureg qureg, int idQubit1, int idQubit2)
 
__global__ void statevec_controlledPhaseFlipKernel (Qureg qureg, int idQubit1, int idQubit2)
 
void statevec_controlledPhaseShift (Qureg qureg, int idQubit1, int idQubit2, qreal angle)
 
__global__ void statevec_controlledPhaseShiftKernel (Qureg qureg, int idQubit1, int idQubit2, qreal cosAngle, qreal sinAngle)
 
void statevec_controlledUnitary (Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u)
 
__global__ void statevec_controlledUnitaryKernel (Qureg qureg, int controlQubit, int targetQubit, ArgMatrix2 u)
 
void statevec_createQureg (Qureg *qureg, int numQubits, QuESTEnv env)
 
void statevec_destroyQureg (Qureg qureg, QuESTEnv env)
 
qreal statevec_findProbabilityOfZero (Qureg qureg, int measureQubit)
 
__global__ void statevec_findProbabilityOfZeroKernel (Qureg qureg, int measureQubit, qreal *reducedArray)
 
qreal statevec_getImagAmp (Qureg qureg, long long int index)
 
qreal statevec_getRealAmp (Qureg qureg, long long int index)
 
void statevec_hadamard (Qureg qureg, int targetQubit)
 
__global__ void statevec_hadamardKernel (Qureg qureg, int targetQubit)
 
void statevec_initBlankState (Qureg qureg)
 
__global__ void statevec_initBlankStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
void statevec_initClassicalState (Qureg qureg, long long int stateInd)
 
__global__ void statevec_initClassicalStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, long long int stateInd)
 
void statevec_initDebugState (Qureg qureg)
 Initialise the state vector of probability amplitudes to an (unphysical) state with each component of each probability amplitude a unique floating point value. More...
 
__global__ void statevec_initDebugStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
void statevec_initPlusState (Qureg qureg)
 
__global__ void statevec_initPlusStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
int statevec_initStateFromSingleFile (Qureg *qureg, char filename[200], QuESTEnv env)
 
void statevec_initStateOfSingleQubit (Qureg *qureg, int qubitId, int outcome)
 Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one. More...
 
__global__ void statevec_initStateOfSingleQubitKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, int qubitId, int outcome)
 
void statevec_initZeroState (Qureg qureg)
 
__global__ void statevec_initZeroStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
void statevec_multiControlledMultiQubitUnitary (Qureg qureg, long long int ctrlMask, int *targs, int numTargs, ComplexMatrixN u)
 This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks already fit in the node, it applies the unitary directly. More...
 
__global__ void statevec_multiControlledMultiQubitUnitaryKernel (Qureg qureg, long long int ctrlMask, int *targs, int numTargs, qreal *uRe, qreal *uIm, long long int *ampInds, qreal *reAmps, qreal *imAmps, long long int numTargAmps)
 
void statevec_multiControlledPhaseFlip (Qureg qureg, int *controlQubits, int numControlQubits)
 
__global__ void statevec_multiControlledPhaseFlipKernel (Qureg qureg, long long int mask)
 
void statevec_multiControlledPhaseShift (Qureg qureg, int *controlQubits, int numControlQubits, qreal angle)
 
__global__ void statevec_multiControlledPhaseShiftKernel (Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle)
 
void statevec_multiControlledTwoQubitUnitary (Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u)
 This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks already fit in the node, it applies the unitary directly. More...
 
__global__ void statevec_multiControlledTwoQubitUnitaryKernel (Qureg qureg, long long int ctrlMask, int q1, int q2, ArgMatrix4 u)
 
void statevec_multiControlledUnitary (Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ComplexMatrix2 u)
 
__global__ void statevec_multiControlledUnitaryKernel (Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ArgMatrix2 u)
 
void statevec_multiRotateZ (Qureg qureg, long long int mask, qreal angle)
 
__global__ void statevec_multiRotateZKernel (Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle)
 
void statevec_pauliX (Qureg qureg, int targetQubit)
 
__global__ void statevec_pauliXKernel (Qureg qureg, int targetQubit)
 
void statevec_pauliY (Qureg qureg, int targetQubit)
 
void statevec_pauliYConj (Qureg qureg, int targetQubit)
 
__global__ void statevec_pauliYKernel (Qureg qureg, int targetQubit, int conjFac)
 
void statevec_phaseShiftByTerm (Qureg qureg, int targetQubit, Complex term)
 
__global__ void statevec_phaseShiftByTermKernel (Qureg qureg, int targetQubit, qreal cosAngle, qreal sinAngle)
 
void statevec_reportStateToScreen (Qureg qureg, QuESTEnv env, int reportRank)
 Print the current state vector of probability amplitudes for a set of qubits to standard out. More...
 
void statevec_setAmps (Qureg qureg, long long int startInd, qreal *reals, qreal *imags, long long int numAmps)
 
void statevec_setWeightedQureg (Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
 
__global__ void statevec_setWeightedQuregKernel (Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
 
void statevec_swapQubitAmps (Qureg qureg, int qb1, int qb2)
 
__global__ void statevec_swapQubitAmpsKernel (Qureg qureg, int qb1, int qb2)
 
void statevec_unitary (Qureg qureg, int targetQubit, ComplexMatrix2 u)
 
__global__ void statevec_unitaryKernel (Qureg qureg, int targetQubit, ArgMatrix2 u)
 
void swapDouble (qreal **a, qreal **b)
 
void syncQuESTEnv (QuESTEnv env)
 Guarantees that all code up to the given point has been executed on all nodes (if running in distributed mode) More...
 
int syncQuESTSuccess (int successCode)
 Performs a logical AND on all successCodes held by all processes. More...
 

Detailed Description

An implementation of the backend in ../QuEST_internal.h for a GPU environment.

Author
Ania Brown
Tyson Jones

Definition in file QuEST_gpu.cu.

Macro Definition Documentation

◆ DEBUG

#define DEBUG   0

Definition at line 20 of file QuEST_gpu.cu.

◆ REDUCE_SHARED_SIZE

#define REDUCE_SHARED_SIZE   512

Definition at line 19 of file QuEST_gpu.cu.

Function Documentation

◆ agnostic_createDiagonalOp()

DiagonalOp agnostic_createDiagonalOp ( int  numQubits,
QuESTEnv  env 
)

Definition at line 338 of file QuEST_gpu.cu.

338  {
339 
340  DiagonalOp op;
341  op.numQubits = numQubits;
342  op.numElemsPerChunk = (1LL << numQubits) / env.numRanks;
343  op.chunkId = env.rank;
344  op.numChunks = env.numRanks;
345 
346  // allocate CPU memory (initialised to zero)
347  op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
348  op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
349  // @TODO no handling of rank>1 allocation (no distributed GPU)
350 
351  // check cpu memory allocation was successful
352  if ( !op.real || !op.imag ) {
353  printf("Could not allocate memory!\n");
354  exit(EXIT_FAILURE);
355  }
356 
357  // allocate GPU memory
358  size_t arrSize = op.numElemsPerChunk * sizeof(qreal);
359  cudaMalloc(&(op.deviceOperator.real), arrSize);
360  cudaMalloc(&(op.deviceOperator.imag), arrSize);
361 
362  // check gpu memory allocation was successful
363  if (!op.deviceOperator.real || !op.deviceOperator.imag) {
364  printf("Could not allocate memory on GPU!\n");
365  exit(EXIT_FAILURE);
366  }
367 
368  // initialise GPU memory to zero
369  cudaMemset(op.deviceOperator.real, 0, arrSize);
370  cudaMemset(op.deviceOperator.imag, 0, arrSize);
371 
372  return op;
373 }

References DiagonalOp::chunkId, DiagonalOp::deviceOperator, DiagonalOp::imag, DiagonalOp::numChunks, DiagonalOp::numElemsPerChunk, DiagonalOp::numQubits, QuESTEnv::numRanks, qreal, QuESTEnv::rank, and DiagonalOp::real.

Referenced by createDiagonalOp().

◆ agnostic_destroyDiagonalOp()

void agnostic_destroyDiagonalOp ( DiagonalOp  op)

Definition at line 375 of file QuEST_gpu.cu.

375  {
376  free(op.real);
377  free(op.imag);
378  cudaFree(op.deviceOperator.real);
379  cudaFree(op.deviceOperator.imag);
380 }

References DiagonalOp::deviceOperator, DiagonalOp::imag, and DiagonalOp::real.

Referenced by destroyDiagonalOp().

◆ agnostic_setDiagonalOpElems()

void agnostic_setDiagonalOpElems ( DiagonalOp  op,
long long int  startInd,
qreal *  real,
qreal *  imag,
long long int  numElems 
)

Definition at line 3233 of file QuEST_gpu.cu.

3233  {
3234 
3235  // update both RAM and VRAM, for consistency
3236  memcpy(&op.real[startInd], real, numElems * sizeof(qreal));
3237  memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal));
3238 
3239  cudaDeviceSynchronize();
3240  cudaMemcpy(
3241  op.deviceOperator.real + startInd,
3242  real,
3243  numElems * sizeof(*(op.deviceOperator.real)),
3244  cudaMemcpyHostToDevice);
3245  cudaMemcpy(
3246  op.deviceOperator.imag + startInd,
3247  imag,
3248  numElems * sizeof(*(op.deviceOperator.imag)),
3249  cudaMemcpyHostToDevice);
3250 }

References DiagonalOp::deviceOperator, DiagonalOp::imag, qreal, and DiagonalOp::real.

Referenced by initDiagonalOp(), and setDiagonalOpElems().

◆ agnostic_syncDiagonalOp()

void agnostic_syncDiagonalOp ( DiagonalOp  op)

Definition at line 382 of file QuEST_gpu.cu.

382  {
383 
384  size_t arrSize = (1LL << op.numQubits) * sizeof(qreal);
385  cudaDeviceSynchronize();
386  cudaMemcpy(op.deviceOperator.real, op.real, arrSize, cudaMemcpyHostToDevice);
387  cudaMemcpy(op.deviceOperator.imag, op.imag, arrSize, cudaMemcpyHostToDevice);
388 }

References DiagonalOp::deviceOperator, DiagonalOp::imag, DiagonalOp::numQubits, qreal, and DiagonalOp::real.

Referenced by syncDiagonalOp().
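
A minimal host-side sketch of how these agnostic_* routines are reached through the public API named in the "Referenced by" notes (createDiagonalOp(), setDiagonalOpElems(), syncDiagonalOp()). The exact public signatures are assumed from the standard QuEST v3 API and are shown for orientation only:

    #include "QuEST.h"

    int main() {
        QuESTEnv env = createQuESTEnv();
        Qureg qureg = createQureg(3, env);        // 3-qubit state-vector
        initPlusState(qureg);

        // allocates matching RAM and VRAM copies (agnostic_createDiagonalOp)
        DiagonalOp op = createDiagonalOp(3, env);

        // option A: write a contiguous range of elements; RAM and VRAM are
        // updated together (agnostic_setDiagonalOpElems)
        qreal re[8] = {1, 1, 1, 1, 1, 1, 1, -1};
        qreal im[8] = {0, 0, 0, 0, 0, 0, 0,  0};
        setDiagonalOpElems(op, 0, re, im, 8);

        // option B: edit op.real / op.imag directly in RAM, then push the whole
        // operator to the GPU (agnostic_syncDiagonalOp)
        op.real[0] = -1;
        syncDiagonalOp(op);

        applyDiagonalOp(qureg, op);

        destroyDiagonalOp(op, env);               // agnostic_destroyDiagonalOp
        destroyQureg(qureg, env);
        destroyQuESTEnv(env);
        return 0;
    }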

◆ copySharedReduceBlock()

__global__ void copySharedReduceBlock ( qreal *  arrayIn,
qreal *  reducedArray,
int  length 
)

Definition at line 1806 of file QuEST_gpu.cu.

1806  {
1807  extern __shared__ qreal tempReductionArray[];
1808  int blockOffset = blockIdx.x*length;
1809  tempReductionArray[threadIdx.x*2] = arrayIn[blockOffset + threadIdx.x*2];
1810  tempReductionArray[threadIdx.x*2+1] = arrayIn[blockOffset + threadIdx.x*2+1];
1811  __syncthreads();
1812  reduceBlock(tempReductionArray, reducedArray, length);
1813 }

References qreal, and reduceBlock().

Referenced by densmatr_calcExpecDiagonalOp(), densmatr_calcFidelity(), densmatr_calcHilbertSchmidtDistance(), densmatr_calcInnerProduct(), densmatr_calcPurity(), densmatr_findProbabilityOfZero(), statevec_calcExpecDiagonalOp(), statevec_calcInnerProduct(), and statevec_findProbabilityOfZero().

◆ densmatr_applyDiagonalOp()

void densmatr_applyDiagonalOp ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 2970 of file QuEST_gpu.cu.

2970  {
2971 
2972  int threadsPerCUDABlock, CUDABlocks;
2973  threadsPerCUDABlock = 128;
2974  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
2975  densmatr_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
2976 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyDiagonalOp().

◆ densmatr_applyDiagonalOpKernel()

__global__ void densmatr_applyDiagonalOpKernel ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 2947 of file QuEST_gpu.cu.

2947  {
2948 
2949  // each thread modifies one value; a wasteful and inefficient strategy
2950  long long int numTasks = qureg.numAmpsPerChunk;
2951  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2952  if (thisTask >= numTasks) return;
2953 
2954  qreal* stateRe = qureg.deviceStateVec.real;
2955  qreal* stateIm = qureg.deviceStateVec.imag;
2956  qreal* opRe = op.deviceOperator.real;
2957  qreal* opIm = op.deviceOperator.imag;
2958 
2959  int opDim = (1 << op.numQubits);
2960  qreal a = stateRe[thisTask];
2961  qreal b = stateIm[thisTask];
2962  qreal c = opRe[thisTask % opDim];
2963  qreal d = opIm[thisTask % opDim];
2964 
2965  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
2966  stateRe[thisTask] = a*c - b*d;
2967  stateIm[thisTask] = a*d + b*c;
2968 }

References DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::numAmpsPerChunk, DiagonalOp::numQubits, and qreal.

◆ densmatr_calcExpecDiagonalOp()

Complex densmatr_calcExpecDiagonalOp ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 3142 of file QuEST_gpu.cu.

3142  {
3143 
3144  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3145  * (e.g. a func which accepts a pointer to do every-value reduction?)
3146  */
3147 
3148  qreal expecReal, expecImag;
3149 
3150  int getRealComp;
3151  long long int numValuesToReduce;
3152  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3153  int maxReducedPerLevel;
3154  int firstTime;
3155 
3156  // compute real component of inner product
3157  getRealComp = 1;
3158  numValuesToReduce = qureg.numAmpsPerChunk;
3159  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3160  firstTime = 1;
3161  while (numValuesToReduce > 1) {
3162  if (numValuesToReduce < maxReducedPerLevel) {
3163  valuesPerCUDABlock = numValuesToReduce;
3164  numCUDABlocks = 1;
3165  }
3166  else {
3167  valuesPerCUDABlock = maxReducedPerLevel;
3168  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3169  }
3170  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3171  if (firstTime) {
3172  densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3173  getRealComp,
3174  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3175  op.deviceOperator.real, op.deviceOperator.imag,
3176  op.numQubits, numValuesToReduce,
3177  qureg.firstLevelReduction);
3178  firstTime = 0;
3179  } else {
3180  cudaDeviceSynchronize();
3181  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3182  qureg.firstLevelReduction,
3183  qureg.secondLevelReduction, valuesPerCUDABlock);
3184  cudaDeviceSynchronize();
3185  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3186  }
3187  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3188  }
3189  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3190 
3191  // compute imag component of inner product
3192  getRealComp = 0;
3193  numValuesToReduce = qureg.numAmpsPerChunk;
3194  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3195  firstTime = 1;
3196  while (numValuesToReduce > 1) {
3197  if (numValuesToReduce < maxReducedPerLevel) {
3198  valuesPerCUDABlock = numValuesToReduce;
3199  numCUDABlocks = 1;
3200  }
3201  else {
3202  valuesPerCUDABlock = maxReducedPerLevel;
3203  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3204  }
3205  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3206  if (firstTime) {
3207  densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3208  getRealComp,
3209  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3210  op.deviceOperator.real, op.deviceOperator.imag,
3211  op.numQubits, numValuesToReduce,
3212  qureg.firstLevelReduction);
3213  firstTime = 0;
3214  } else {
3215  cudaDeviceSynchronize();
3216  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3217  qureg.firstLevelReduction,
3218  qureg.secondLevelReduction, valuesPerCUDABlock);
3219  cudaDeviceSynchronize();
3220  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3221  }
3222  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3223  }
3224  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3225 
3226  // return complex
3227  Complex expecVal;
3228  expecVal.real = expecReal;
3229  expecVal.imag = expecImag;
3230  return expecVal;
3231 }

References copySharedReduceBlock(), DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::firstLevelReduction, Complex::imag, Qureg::numAmpsPerChunk, DiagonalOp::numQubits, qreal, Complex::real, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcExpecDiagonalOp().
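
The @TODO above asks for a helper that removes this repeated reduction boilerplate. One possible shape is sketched below; it is hypothetical (not part of QuEST_gpu.cu) and assumes a first-level kernel has already written numPartialSums per-block partial sums into qureg.firstLevelReduction, with all counts powers of two as in the existing callers:

    // finish a tree reduction whose first level has already been computed,
    // returning the final scalar (hypothetical file-local helper)
    qreal finishReduction(Qureg qureg, long long int numPartialSums) {

        int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
        int maxReducedPerLevel = REDUCE_SHARED_SIZE;

        while (numPartialSums > 1) {
            if (numPartialSums < maxReducedPerLevel) {
                valuesPerCUDABlock = numPartialSums;
                numCUDABlocks = 1;
            } else {
                valuesPerCUDABlock = maxReducedPerLevel;
                numCUDABlocks = ceil((qreal) numPartialSums / valuesPerCUDABlock);
            }
            sharedMemSize = valuesPerCUDABlock * sizeof(qreal);

            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                qureg.firstLevelReduction, qureg.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));

            // each pass collapses every CUDA block's values into one partial sum
            numPartialSums = numCUDABlocks;
        }

        qreal result;
        cudaMemcpy(&result, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
        return result;
    }

With such a helper, densmatr_calcExpecDiagonalOp() would launch densmatr_calcExpecDiagonalOpKernel() once per component and finish each reduction with a single call, rather than duplicating the while-loop for the real and imaginary parts.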

◆ densmatr_calcExpecDiagonalOpKernel()

__global__ void densmatr_calcExpecDiagonalOpKernel ( int  getRealComp,
qreal *  matReal,
qreal *  matImag,
qreal *  opReal,
qreal *  opImag,
int  numQubits,
long long int  numTermsToSum,
qreal *  reducedArray 
)

If the thread represents a diagonal element, it computes either a real or imag term of matr_{ii} op_i; otherwise, it writes a 0 to the reduction array.

Definition at line 3097 of file QuEST_gpu.cu.

3101 {
3107  // index will identify one of the 2^Q diagonals to be summed
3108  long long int matInd = blockIdx.x*blockDim.x + threadIdx.x;
3109  if (matInd >= numTermsToSum) return;
3110 
3111  long long int diagSpacing = (1LL << numQubits) + 1LL;
3112  int isDiag = ((matInd % diagSpacing) == 0);
3113 
3114  long long int opInd = matInd / diagSpacing;
3115 
3116  qreal val = 0;
3117  if (isDiag) {
3118 
3119  qreal matRe = matReal[matInd];
3120  qreal matIm = matImag[matInd];
3121  qreal opRe = opReal[opInd];
3122  qreal opIm = opImag[opInd];
3123 
3124  // (matRe + matIm i)(opRe + opIm i) =
3125  // (matRe opRe - matIm opIm) + i (matRe opIm + matIm opRe)
3126  if (getRealComp)
3127  val = matRe * opRe - matIm * opIm;
3128  else
3129  val = matRe * opIm + matIm * opRe;
3130  }
3131 
3132  // array of each thread's collected sum term, to be summed
3133  extern __shared__ qreal tempReductionArray[];
3134  tempReductionArray[threadIdx.x] = val;
3135  __syncthreads();
3136 
3137  // every second thread reduces
3138  if (threadIdx.x<blockDim.x/2)
3139  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3140 }

References qreal, and reduceBlock().

◆ densmatr_calcFidelity()

qreal densmatr_calcFidelity ( Qureg  qureg,
Qureg  pureState 
)

Definition at line 2249 of file QuEST_gpu.cu.

2249  {
2250 
2251  // we're summing the square of every term in the density matrix
2252  long long int densityDim = 1LL << qureg.numQubitsRepresented;
2253  long long int numValuesToReduce = densityDim;
2254 
2255  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2256  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2257  int firstTime = 1;
2258 
2259  while (numValuesToReduce > 1) {
2260 
2261  // need less than one CUDA-BLOCK to reduce
2262  if (numValuesToReduce < maxReducedPerLevel) {
2263  valuesPerCUDABlock = numValuesToReduce;
2264  numCUDABlocks = 1;
2265  }
2266  // otherwise use only full CUDA-BLOCKS
2267  else {
2268  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2269  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2270  }
2271  // dictates size of reduction array
2272  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2273 
2274  // spawn threads to sum the probs in each block
2275  // store the reduction in the pureState array
2276  if (firstTime) {
2277  densmatr_calcFidelityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2278  qureg, pureState, densityDim, pureState.firstLevelReduction);
2279  firstTime = 0;
2280 
2281  // sum the block probs
2282  } else {
2283  cudaDeviceSynchronize();
2284  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2285  pureState.firstLevelReduction,
2286  pureState.secondLevelReduction, valuesPerCUDABlock);
2287  cudaDeviceSynchronize();
2288  swapDouble(&(pureState.firstLevelReduction), &(pureState.secondLevelReduction));
2289  }
2290 
2291  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2292  }
2293 
2294  qreal fidelity;
2295  cudaMemcpy(&fidelity, pureState.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2296  return fidelity;
2297 }

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numQubitsRepresented, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcFidelity().

◆ densmatr_calcFidelityKernel()

__global__ void densmatr_calcFidelityKernel ( Qureg  dens,
Qureg  vec,
long long int  dim,
qreal *  reducedArray 
)

computes one term of (vec^*T) dens * vec

Definition at line 2211 of file QuEST_gpu.cu.

2211  {
2212 
2213  // figure out which density matrix row to consider
2214  long long int col;
2215  long long int row = blockIdx.x*blockDim.x + threadIdx.x;
2216  if (row >= dim) return;
2217 
2218  qreal* densReal = dens.deviceStateVec.real;
2219  qreal* densImag = dens.deviceStateVec.imag;
2220  qreal* vecReal = vec.deviceStateVec.real;
2221  qreal* vecImag = vec.deviceStateVec.imag;
2222 
2223  // compute the row-th element of the product dens*vec
2224  qreal prodReal = 0;
2225  qreal prodImag = 0;
2226  for (col=0LL; col < dim; col++) {
2227  qreal densElemReal = densReal[dim*col + row];
2228  qreal densElemImag = densImag[dim*col + row];
2229 
2230  prodReal += densElemReal*vecReal[col] - densElemImag*vecImag[col];
2231  prodImag += densElemReal*vecImag[col] + densElemImag*vecReal[col];
2232  }
2233 
2234  // multiply with row-th elem of (vec^*)
2235  qreal termReal = prodImag*vecImag[row] + prodReal*vecReal[row];
2236 
2237  // imag of every term should be zero, because each is a valid fidelity calc of an eigenstate
2238  //qreal termImag = prodImag*vecReal[row] - prodReal*vecImag[row];
2239 
2240  extern __shared__ qreal tempReductionArray[];
2241  tempReductionArray[threadIdx.x] = termReal;
2242  __syncthreads();
2243 
2244  // every second thread reduces
2245  if (threadIdx.x<blockDim.x/2)
2246  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2247 }

References Qureg::deviceStateVec, qreal, and reduceBlock().
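
In the notation of the brief above, thread row computes Re( vec[row]^* * sum_col dens[row][col] * vec[col] ), i.e. one term of (vec^*T) dens vec; the host-side reduction in densmatr_calcFidelity() then sums these terms over all rows to give the fidelity of the pure state against the density matrix.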

◆ densmatr_calcHilbertSchmidtDistance()

qreal densmatr_calcHilbertSchmidtDistance ( Qureg  a,
Qureg  b 
)

Definition at line 2323 of file QuEST_gpu.cu.

2323  {
2324 
2325  // we're summing the square of every term in (a-b)
2326  long long int numValuesToReduce = a.numAmpsPerChunk;
2327 
2328  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2329  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2330  int firstTime = 1;
2331 
2332  while (numValuesToReduce > 1) {
2333 
2334  // need less than one CUDA-BLOCK to reduce
2335  if (numValuesToReduce < maxReducedPerLevel) {
2336  valuesPerCUDABlock = numValuesToReduce;
2337  numCUDABlocks = 1;
2338  }
2339  // otherwise use only full CUDA-BLOCKS
2340  else {
2341  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2342  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2343  }
2344  // dictates size of reduction array
2345  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2346 
2347  // spawn threads to sum the probs in each block (store reduction temp values in a's reduction array)
2348  if (firstTime) {
2349  densmatr_calcHilbertSchmidtDistanceSquaredKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2350  a.deviceStateVec.real, a.deviceStateVec.imag,
2351  b.deviceStateVec.real, b.deviceStateVec.imag,
2352  numValuesToReduce, a.firstLevelReduction);
2353  firstTime = 0;
2354 
2355  // sum the block probs
2356  } else {
2357  cudaDeviceSynchronize();
2358  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2359  a.firstLevelReduction,
2360  a.secondLevelReduction, valuesPerCUDABlock);
2361  cudaDeviceSynchronize();
2362  swapDouble(&(a.firstLevelReduction), &(a.secondLevelReduction));
2363  }
2364 
2365  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2366  }
2367 
2368  qreal trace;
2369  cudaMemcpy(&trace, a.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2370 
2371  qreal sqrtTrace = sqrt(trace);
2372  return sqrtTrace;
2373 }

References copySharedReduceBlock(), Qureg::deviceStateVec, Qureg::firstLevelReduction, Qureg::numAmpsPerChunk, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcHilbertSchmidtDistance().
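
The kernel below sums |a_ij - b_ij|^2 = (aRe - bRe)^2 + (aIm - bIm)^2 over every amplitude, and this host function takes the square root of the total, so the returned value is the Hilbert-Schmidt (Frobenius-norm) distance sqrt( sum_ij |a_ij - b_ij|^2 ) between the two density matrices.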

◆ densmatr_calcHilbertSchmidtDistanceSquaredKernel()

__global__ void densmatr_calcHilbertSchmidtDistanceSquaredKernel ( qreal *  aRe,
qreal *  aIm,
qreal *  bRe,
qreal *  bIm,
long long int  numAmpsToSum,
qreal *  reducedArray 
)

Definition at line 2299 of file QuEST_gpu.cu.

2302  {
2303  // figure out which density matrix term this thread is assigned
2304  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2305  if (index >= numAmpsToSum) return;
2306 
2307  // compute this thread's sum term
2308  qreal difRe = aRe[index] - bRe[index];
2309  qreal difIm = aIm[index] - bIm[index];
2310  qreal term = difRe*difRe + difIm*difIm;
2311 
2312  // array of each thread's collected term, to be summed
2313  extern __shared__ qreal tempReductionArray[];
2314  tempReductionArray[threadIdx.x] = term;
2315  __syncthreads();
2316 
2317  // every second thread reduces
2318  if (threadIdx.x<blockDim.x/2)
2319  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2320 }

References qreal, and reduceBlock().

◆ densmatr_calcInnerProduct()

qreal densmatr_calcInnerProduct ( Qureg  a,
Qureg  b 
)

Definition at line 2043 of file QuEST_gpu.cu.

2043  {
2044 
2045  // we're summing the square of every term in the density matrix
2046  long long int numValuesToReduce = a.numAmpsTotal;
2047 
2048  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2049  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2050  int firstTime = 1;
2051 
2052  while (numValuesToReduce > 1) {
2053 
2054  // need less than one CUDA-BLOCK to reduce
2055  if (numValuesToReduce < maxReducedPerLevel) {
2056  valuesPerCUDABlock = numValuesToReduce;
2057  numCUDABlocks = 1;
2058  }
2059  // otherwise use only full CUDA-BLOCKS
2060  else {
2061  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2062  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2063  }
2064  // dictates size of reduction array
2065  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2066 
2067  // spawn threads to sum the terms in each block
2068  // arbitrarily store the reduction in the b qureg's array
2069  if (firstTime) {
2070  densmatr_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2071  a, b, a.numAmpsTotal, b.firstLevelReduction);
2072  firstTime = 0;
2073  }
2074  // sum the block terms
2075  else {
2076  cudaDeviceSynchronize();
2077  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2078  b.firstLevelReduction,
2079  b.secondLevelReduction, valuesPerCUDABlock);
2080  cudaDeviceSynchronize();
2081  swapDouble(&(b.firstLevelReduction), &(b.secondLevelReduction));
2082  }
2083 
2084  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2085  }
2086 
2087  qreal innerprod;
2088  cudaMemcpy(&innerprod, b.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2089  return innerprod;
2090 }

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numAmpsTotal, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcDensityInnerProduct().

◆ densmatr_calcInnerProductKernel()

__global__ void densmatr_calcInnerProductKernel ( Qureg  a,
Qureg  b,
long long int  numTermsToSum,
qreal *  reducedArray 
)

computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number

Definition at line 2022 of file QuEST_gpu.cu.

2024  {
2025  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2026  if (index >= numTermsToSum) return;
2027 
2028  // Re{ conj(a) b } = Re{ (aRe - i aIm)(bRe + i bIm) } = aRe bRe + aIm bIm
2029  qreal prod = (
2030  a.deviceStateVec.real[index]*b.deviceStateVec.real[index]
2031  + a.deviceStateVec.imag[index]*b.deviceStateVec.imag[index]);
2032 
2033  // array of each thread's collected sum term, to be summed
2034  extern __shared__ qreal tempReductionArray[];
2035  tempReductionArray[threadIdx.x] = prod;
2036  __syncthreads();
2037 
2038  // every second thread reduces
2039  if (threadIdx.x<blockDim.x/2)
2040  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2041 }

References Qureg::deviceStateVec, qreal, and reduceBlock().

◆ densmatr_calcProbOfOutcome()

qreal densmatr_calcProbOfOutcome ( Qureg  qureg,
int  measureQubit,
int  outcome 
)

Definition at line 2013 of file QuEST_gpu.cu.

2014 {
2015  qreal outcomeProb = densmatr_findProbabilityOfZero(qureg, measureQubit);
2016  if (outcome==1)
2017  outcomeProb = 1.0 - outcomeProb;
2018  return outcomeProb;
2019 }

References densmatr_findProbabilityOfZero(), and qreal.

Referenced by calcProbOfOutcome(), collapseToOutcome(), and densmatr_measureWithStats().

◆ densmatr_calcPurity()

qreal densmatr_calcPurity ( Qureg  qureg)

Computes the trace of the density matrix squared.

Definition at line 2394 of file QuEST_gpu.cu.

2394  {
2395 
2396  // we're summing the square of every term in the density matrix
2397  long long int numValuesToReduce = qureg.numAmpsPerChunk;
2398 
2399  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2400  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2401  int firstTime = 1;
2402 
2403  while (numValuesToReduce > 1) {
2404 
2405  // need less than one CUDA-BLOCK to reduce
2406  if (numValuesToReduce < maxReducedPerLevel) {
2407  valuesPerCUDABlock = numValuesToReduce;
2408  numCUDABlocks = 1;
2409  }
2410  // otherwise use only full CUDA-BLOCKS
2411  else {
2412  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2413  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2414  }
2415  // dictates size of reduction array
2416  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2417 
2418  // spawn threads to sum the probs in each block
2419  if (firstTime) {
2420  densmatr_calcPurityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2421  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
2422  numValuesToReduce, qureg.firstLevelReduction);
2423  firstTime = 0;
2424 
2425  // sum the block probs
2426  } else {
2427  cudaDeviceSynchronize();
2428  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2429  qureg.firstLevelReduction,
2430  qureg.secondLevelReduction, valuesPerCUDABlock);
2431  cudaDeviceSynchronize();
2432  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
2433  }
2434 
2435  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2436  }
2437 
2438  qreal traceDensSquared;
2439  cudaMemcpy(&traceDensSquared, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2440  return traceDensSquared;
2441 }

References copySharedReduceBlock(), Qureg::deviceStateVec, Qureg::firstLevelReduction, Qureg::numAmpsPerChunk, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcPurity().
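
Since a density matrix is Hermitian, Tr(rho^2) = sum_ij |rho_ij|^2; densmatr_calcPurityKernel() therefore only needs the per-amplitude terms vecReal^2 + vecImag^2, which this function reduces to a single scalar.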

◆ densmatr_calcPurityKernel()

__global__ void densmatr_calcPurityKernel ( qreal *  vecReal,
qreal *  vecImag,
long long int  numAmpsToSum,
qreal *  reducedArray 
)

Definition at line 2375 of file QuEST_gpu.cu.

2375  {
2376 
2377  // figure out which density matrix term this thread is assigned
2378  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2379  if (index >= numAmpsToSum) return;
2380 
2381  qreal term = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
2382 
2383  // array of each thread's collected probability, to be summed
2384  extern __shared__ qreal tempReductionArray[];
2385  tempReductionArray[threadIdx.x] = term;
2386  __syncthreads();
2387 
2388  // every second thread reduces
2389  if (threadIdx.x<blockDim.x/2)
2390  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2391 }

References qreal, and reduceBlock().

◆ densmatr_calcTotalProb()

qreal densmatr_calcTotalProb ( Qureg  qureg)

Definition at line 1531 of file QuEST_gpu.cu.

1531  {
1532 
1533  // computes the trace using Kahan summation
1534  qreal pTotal=0;
1535  qreal y, t, c;
1536  c = 0;
1537 
1538  long long int numCols = 1LL << qureg.numQubitsRepresented;
1539  long long diagIndex;
1540 
1541  copyStateFromGPU(qureg);
1542 
1543  for (int col=0; col< numCols; col++) {
1544  diagIndex = col*(numCols + 1);
1545  y = qureg.stateVec.real[diagIndex] - c;
1546  t = pTotal + y;
1547  c = ( t - pTotal ) - y; // brackets are important
1548  pTotal = t;
1549  }
1550 
1551  return pTotal;
1552 }

References copyStateFromGPU(), Qureg::numQubitsRepresented, qreal, and Qureg::stateVec.

Referenced by calcTotalProb(), and statevec_calcExpecPauliProd().
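
The loop body is the standard Kahan recurrence: y = x - c; t = pTotal + y; c = (t - pTotal) - y; pTotal = t. The compensation c recovers the low-order bits lost when y is added to the running total, which is why the bracketing flagged in the source comment must be preserved.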

◆ densmatr_collapseToKnownProbOutcome()

void densmatr_collapseToKnownProbOutcome ( Qureg  qureg,
int  measureQubit,
int  outcome,
qreal  outcomeProb 
)

This involves finding |...i...><...j...| states and killing those where i!=j.

Renorms (/prob) every |outcome><outcome| state, setting all others to zero.

Definition at line 2535 of file QuEST_gpu.cu.

2535  {
2536 
2537  int rowQubit = measureQubit + qureg.numQubitsRepresented;
2538 
2539  int colBit = 1LL << measureQubit;
2540  int rowBit = 1LL << rowQubit;
2541 
2542  long long int numBasesToVisit = qureg.numAmpsPerChunk/4;
2543  long long int part1 = colBit -1;
2544  long long int part2 = (rowBit >> 1) - colBit;
2545  long long int part3 = numBasesToVisit - (rowBit >> 1);
2546 
2547  long long int desired, undesired;
2548  if (outcome == 0) {
2549  desired = 0;
2550  undesired = colBit | rowBit;
2551  } else {
2552  desired = colBit | rowBit;
2553  undesired = 0;
2554  }
2555 
2556  int threadsPerCUDABlock, CUDABlocks;
2557  threadsPerCUDABlock = 128;
2558  CUDABlocks = ceil(numBasesToVisit / (qreal) threadsPerCUDABlock);
2559  densmatr_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2560  outcomeProb, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBasesToVisit,
2561  part1, part2, part3, rowBit, colBit, desired, undesired);
2562 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by collapseToOutcome(), and densmatr_measureWithStats().
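
This is the density-matrix post-measurement update rho -> (P_outcome rho P_outcome) / outcomeProb: the kernel divides the surviving |outcome><outcome| block by outcomeProb (not by its square root, as the state-vector version would) and zeroes the discarded diagonal block along with the |0><1| and |1><0| coherences.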

◆ densmatr_collapseToKnownProbOutcomeKernel()

__global__ void densmatr_collapseToKnownProbOutcomeKernel ( qreal  outcomeProb,
qreal *  vecReal,
qreal *  vecImag,
long long int  numBasesToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  rowBit,
long long int  colBit,
long long int  desired,
long long int  undesired 
)

Maps thread ID to a |..0..><..0..| state and then locates |0><1|, |1><0| and |1><1|.

Definition at line 2509 of file QuEST_gpu.cu.

2513 {
2514  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2515  if (scanInd >= numBasesToVisit) return;
2516 
2517  long long int base = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2518 
2519  // renormalise desired outcome
2520  vecReal[base + desired] /= outcomeProb;
2521  vecImag[base + desired] /= outcomeProb;
2522 
2523  // kill undesired outcome
2524  vecReal[base + undesired] = 0;
2525  vecImag[base + undesired] = 0;
2526 
2527  // kill |..0..><..1..| states
2528  vecReal[base + colBit] = 0;
2529  vecImag[base + colBit] = 0;
2530  vecReal[base + rowBit] = 0;
2531  vecImag[base + rowBit] = 0;
2532 }

◆ densmatr_findProbabilityOfZero()

qreal densmatr_findProbabilityOfZero ( Qureg  qureg,
int  measureQubit 
)

Definition at line 1919 of file QuEST_gpu.cu.

1920 {
1921  long long int densityDim = 1LL << qureg.numQubitsRepresented;
1922  long long int numValuesToReduce = densityDim >> 1; // half of the diagonal has measureQubit=0
1923 
1924  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
1925  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
1926  int firstTime = 1;
1927 
1928  while (numValuesToReduce > 1) {
1929 
1930  // need less than one CUDA-BLOCK to reduce
1931  if (numValuesToReduce < maxReducedPerLevel) {
1932  valuesPerCUDABlock = numValuesToReduce;
1933  numCUDABlocks = 1;
1934  }
1935  // otherwise use only full CUDA-BLOCKS
1936  else {
1937  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
1938  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
1939  }
1940 
1941  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
1942 
1943  // spawn threads to sum the probs in each block
1944  if (firstTime) {
1945  densmatr_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
1946  qureg, measureQubit, qureg.firstLevelReduction);
1947  firstTime = 0;
1948 
1949  // sum the block probs
1950  } else {
1951  cudaDeviceSynchronize();
1952  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
1953  qureg.firstLevelReduction,
1954  qureg.secondLevelReduction, valuesPerCUDABlock);
1955  cudaDeviceSynchronize();
1956  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
1957  }
1958 
1959  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
1960  }
1961 
1962  qreal zeroProb;
1963  cudaMemcpy(&zeroProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
1964  return zeroProb;
1965 }

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numQubitsRepresented, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by densmatr_calcProbOfOutcome().

◆ densmatr_findProbabilityOfZeroKernel()

__global__ void densmatr_findProbabilityOfZeroKernel ( Qureg  qureg,
int  measureQubit,
qreal *  reducedArray 
)

Definition at line 1815 of file QuEST_gpu.cu.

1817  {
1818  // run by each thread
1819  // use of block here refers to contiguous amplitudes where measureQubit = 0,
1820  // (then =1) and NOT the CUDA block, which is the partitioning of CUDA threads
1821 
1822  long long int densityDim = 1LL << qureg.numQubitsRepresented;
1823  long long int numTasks = densityDim >> 1;
1824  long long int sizeHalfBlock = 1LL << (measureQubit);
1825  long long int sizeBlock = 2LL * sizeHalfBlock;
1826 
1827  long long int thisBlock; // which block this thread is processing
1828  long long int thisTask; // which part of the block this thread is processing
1829  long long int basisIndex; // index of this thread's computational basis state
1830  long long int densityIndex; // " " index of |basis><basis| in the flat density matrix
1831 
1832  // array of each thread's collected probability, to be summed
1833  extern __shared__ qreal tempReductionArray[];
1834 
1835  // figure out which density matrix prob that this thread is assigned
1836  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1837  if (thisTask>=numTasks) return;
1838  thisBlock = thisTask / sizeHalfBlock;
1839  basisIndex = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1840  densityIndex = (densityDim + 1) * basisIndex;
1841 
1842  // record the probability in the CUDA-BLOCK-wide array
1843  qreal prob = qureg.deviceStateVec.real[densityIndex]; // im[densityIndex] assumed ~ 0
1844  tempReductionArray[threadIdx.x] = prob;
1845 
1846  // sum the probs collected by this CUDA-BLOCK's threads into a per-CUDA-BLOCK array
1847  __syncthreads();
1848  if (threadIdx.x<blockDim.x/2){
1849  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
1850  }
1851 }

References Qureg::deviceStateVec, Qureg::numQubitsRepresented, qreal, and reduceBlock().

◆ densmatr_initClassicalState()

void densmatr_initClassicalState ( Qureg  qureg,
long long int  stateInd 
)

Definition at line 258 of file QuEST_gpu.cu.

259 {
260  int threadsPerCUDABlock, CUDABlocks;
261  threadsPerCUDABlock = 128;
262  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
263 
264  // index of the desired state in the flat density matrix
265  long long int densityDim = 1LL << qureg.numQubitsRepresented;
266  long long int densityInd = (densityDim + 1)*stateInd;
267 
268  // identical to pure version
269  densmatr_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
270  qureg.numAmpsPerChunk,
271  qureg.deviceStateVec.real,
272  qureg.deviceStateVec.imag, densityInd);
273 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by initClassicalState().

◆ densmatr_initClassicalStateKernel()

__global__ void densmatr_initClassicalStateKernel ( long long int  densityNumElems,
qreal *  densityReal,
qreal *  densityImag,
long long int  densityInd 
)

Definition at line 239 of file QuEST_gpu.cu.

243 {
244  // initialise the state to all zeros
245  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
246  if (index >= densityNumElems) return;
247 
248  densityReal[index] = 0.0;
249  densityImag[index] = 0.0;
250 
251  if (index==densityInd){
252  // classical state has probability 1
253  densityReal[densityInd] = 1.0;
254  densityImag[densityInd] = 0.0;
255  }
256 }

◆ densmatr_initPlusState()

void densmatr_initPlusState ( Qureg  qureg)

Definition at line 226 of file QuEST_gpu.cu.

227 {
228  qreal probFactor = 1.0/((qreal) (1LL << qureg.numQubitsRepresented));
229  int threadsPerCUDABlock, CUDABlocks;
230  threadsPerCUDABlock = 128;
231  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
232  densmatr_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
233  qureg.numAmpsPerChunk,
234  probFactor,
235  qureg.deviceStateVec.real,
236  qureg.deviceStateVec.imag);
237 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by initPlusState().

◆ densmatr_initPlusStateKernel()

__global__ void densmatr_initPlusStateKernel ( long long int  stateVecSize,
qreal  probFactor,
qreal *  stateVecReal,
qreal *  stateVecImag 
)

Definition at line 216 of file QuEST_gpu.cu.

216  {
217  long long int index;
218 
219  index = blockIdx.x*blockDim.x + threadIdx.x;
220  if (index>=stateVecSize) return;
221 
222  stateVecReal[index] = probFactor;
223  stateVecImag[index] = 0.0;
224 }

◆ densmatr_initPureState()

void densmatr_initPureState ( Qureg  targetQureg,
Qureg  copyQureg 
)

Definition at line 205 of file QuEST_gpu.cu.

206 {
207  int threadsPerCUDABlock, CUDABlocks;
208  threadsPerCUDABlock = 128;
209  CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
210  densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
211  copyQureg.numAmpsPerChunk,
212  targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
213  copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag);
214 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initPureState().

◆ densmatr_initPureStateKernel()

__global__ void densmatr_initPureStateKernel ( long long int  numPureAmps,
qreal *  targetVecReal,
qreal *  targetVecImag,
qreal *  copyVecReal,
qreal *  copyVecImag 
)

Definition at line 186 of file QuEST_gpu.cu.

190 {
191  // this is a particular index of the pure copyQureg
192  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
193  if (index>=numPureAmps) return;
194 
195  qreal realRow = copyVecReal[index];
196  qreal imagRow = copyVecImag[index];
197  for (long long int col=0; col < numPureAmps; col++) {
198  qreal realCol = copyVecReal[col];
199  qreal imagCol = - copyVecImag[col]; // minus for conjugation
200  targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
201  targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
202  }
203 }

References qreal.
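
The element written at flat index col*numPureAmps + index is copy[index] * conj(copy[col]), so the kernel fills targetQureg with the outer product |psi><psi| of the pure copyQureg, one full row (fixed index, all col) per thread.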

◆ densmatr_mixDamping()

void densmatr_mixDamping ( Qureg  qureg,
int  targetQubit,
qreal  damping 
)

Definition at line 2778 of file QuEST_gpu.cu.

2778  {
2779 
2780  if (damping == 0)
2781  return;
2782 
2783  qreal dephase = sqrt(1-damping);
2784  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephase);
2785 
2786  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2787  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2788 
2789  long long int colBit = 1LL << targetQubit;
2790  long long int rowBit = 1LL << rowQubit;
2791  long long int bothBits = colBit | rowBit;
2792 
2793  long long int part1 = colBit - 1;
2794  long long int part2 = (rowBit >> 1) - colBit;
2795  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2796 
2797  int threadsPerCUDABlock, CUDABlocks;
2798  threadsPerCUDABlock = 128;
2799  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2800  densmatr_mixDampingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2801  damping, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2802  part1, part2, part3, bothBits);
2803 }

References densmatr_oneQubitDegradeOffDiagonal(), Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by mixDamping().
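
Together the two steps realise single-qubit amplitude damping: densmatr_oneQubitDegradeOffDiagonal() scales the target qubit's coherences by sqrt(1 - damping), and densmatr_mixDampingKernel() then moves a fraction damping of each |1><1| population into the corresponding |0><0| element. (For reference, and not stated in this file, this matches the usual Kraus pair K0 = diag(1, sqrt(1-damping)), K1 = sqrt(damping) |0><1|.)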

◆ densmatr_mixDampingKernel()

__global__ void densmatr_mixDampingKernel ( qreal  damping,
qreal *  vecReal,
qreal *  vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  bothBits 
)

Works like mixDephasing but modifies every other element, and elements are averaged in pairs.

Definition at line 2731 of file QuEST_gpu.cu.

2735 {
2736  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2737  if (scanInd >= numAmpsToVisit) return;
2738 
2739  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2740  long long int targetInd = baseInd + bothBits;
2741 
2742  qreal realAvDepol = damping * ( vecReal[targetInd]);
2743  qreal imagAvDepol = damping * ( vecImag[targetInd]);
2744 
2745  vecReal[targetInd] *= 1 - damping;
2746  vecImag[targetInd] *= 1 - damping;
2747 
2748  vecReal[baseInd] += realAvDepol;
2749  vecImag[baseInd] += imagAvDepol;
2750 }

References qreal.

◆ densmatr_mixDensityMatrix()

void densmatr_mixDensityMatrix ( Qureg  combineQureg,
qreal  otherProb,
Qureg  otherQureg 
)

Definition at line 2576 of file QuEST_gpu.cu.

2576  {
2577 
2578  long long int numAmpsToVisit = combineQureg.numAmpsPerChunk;
2579 
2580  int threadsPerCUDABlock, CUDABlocks;
2581  threadsPerCUDABlock = 128;
2582  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2583  densmatr_mixDensityMatrixKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2584  combineQureg, otherProb, otherQureg, numAmpsToVisit
2585  );
2586 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by mixDensityMatrix().

◆ densmatr_mixDensityMatrixKernel()

__global__ void densmatr_mixDensityMatrixKernel ( Qureg  combineQureg,
qreal  otherProb,
Qureg  otherQureg,
long long int  numAmpsToVisit 
)

Definition at line 2564 of file QuEST_gpu.cu.

2564  {
2565 
2566  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2567  if (ampInd >= numAmpsToVisit) return;
2568 
2569  combineQureg.deviceStateVec.real[ampInd] *= 1-otherProb;
2570  combineQureg.deviceStateVec.imag[ampInd] *= 1-otherProb;
2571 
2572  combineQureg.deviceStateVec.real[ampInd] += otherProb*otherQureg.deviceStateVec.real[ampInd];
2573  combineQureg.deviceStateVec.imag[ampInd] += otherProb*otherQureg.deviceStateVec.imag[ampInd];
2574 }

References Qureg::deviceStateVec.
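
Element-wise, this is the convex combination rho_combine -> (1 - otherProb) * rho_combine + otherProb * rho_other, applied independently to the real and imaginary parts of every amplitude.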

◆ densmatr_mixDephasing()

void densmatr_mixDephasing ( Qureg  qureg,
int  targetQubit,
qreal  dephase 
)

Definition at line 2629 of file QuEST_gpu.cu.

2629  {
2630 
2631  if (dephase == 0)
2632  return;
2633 
2634  qreal dephFac = 1 - dephase;
2635  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephFac);
2636 }

References densmatr_oneQubitDegradeOffDiagonal(), and qreal.

Referenced by densmatr_mixDepolarising(), and mixDephasing().

◆ densmatr_mixDephasingKernel()

__global__ void densmatr_mixDephasingKernel ( qreal  fac,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  colBit,
long long int  rowBit 
)

Called once for every 4 amplitudes in the density matrix. Works by establishing the |..0..><..0..| state (for its given index), then visiting |..1..><..0..| and |..0..><..1..|.

Labels |part1 X pa><rt2 NOT(X) part3|. From the brain of Simon Benjamin.

Definition at line 2593 of file QuEST_gpu.cu.

2597 {
2598  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2599  if (scanInd >= numAmpsToVisit) return;
2600 
2601  long long int ampInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2602  vecReal[ampInd + colBit] *= fac;
2603  vecImag[ampInd + colBit] *= fac;
2604  vecReal[ampInd + rowBit] *= fac;
2605  vecImag[ampInd + rowBit] *= fac;
2606 }

◆ densmatr_mixDepolarising()

void densmatr_mixDepolarising ( Qureg  qureg,
int  targetQubit,
qreal  depolLevel 
)

Definition at line 2752 of file QuEST_gpu.cu.

2752  {
2753 
2754  if (depolLevel == 0)
2755  return;
2756 
2757  densmatr_mixDephasing(qureg, targetQubit, depolLevel);
2758 
2759  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2760  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2761 
2762  long long int colBit = 1LL << targetQubit;
2763  long long int rowBit = 1LL << rowQubit;
2764  long long int bothBits = colBit | rowBit;
2765 
2766  long long int part1 = colBit - 1;
2767  long long int part2 = (rowBit >> 1) - colBit;
2768  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2769 
2770  int threadsPerCUDABlock, CUDABlocks;
2771  threadsPerCUDABlock = 128;
2772  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2773  densmatr_mixDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2774  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2775  part1, part2, part3, bothBits);
2776 }

References densmatr_mixDephasing(), Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by mixDepolarising().
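
As an added summary of the net effect: writing λ for depolLevel, densmatr_mixDephasing() scales the target qubit's off-diagonal elements by 1-λ, and the kernel then pulls each diagonal pair towards its average,

\rho_{00} \to (1-\tfrac{\lambda}{2})\,\rho_{00} + \tfrac{\lambda}{2}\,\rho_{11}, \qquad \rho_{11} \to \tfrac{\lambda}{2}\,\rho_{00} + (1-\tfrac{\lambda}{2})\,\rho_{11}.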

◆ densmatr_mixDepolarisingKernel()

__global__ void densmatr_mixDepolarisingKernel ( qreal  depolLevel,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  bothBits 
)

Works like mixDephasing but modifies every other element, and elements are averaged in pairs.

Definition at line 2705 of file QuEST_gpu.cu.

2709 {
2710  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2711  if (scanInd >= numAmpsToVisit) return;
2712 
2713  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2714  long long int targetInd = baseInd + bothBits;
2715 
2716  qreal realAvDepol = depolLevel * 0.5 * (vecReal[baseInd] + vecReal[targetInd]);
2717  qreal imagAvDepol = depolLevel * 0.5 * (vecImag[baseInd] + vecImag[targetInd]);
2718 
2719  vecReal[baseInd] *= 1 - depolLevel;
2720  vecImag[baseInd] *= 1 - depolLevel;
2721  vecReal[targetInd] *= 1 - depolLevel;
2722  vecImag[targetInd] *= 1 - depolLevel;
2723 
2724  vecReal[baseInd] += realAvDepol;
2725  vecImag[baseInd] += imagAvDepol;
2726  vecReal[targetInd] += realAvDepol;
2727  vecImag[targetInd] += imagAvDepol;
2728 }

References qreal.

◆ densmatr_mixTwoQubitDephasing()

void densmatr_mixTwoQubitDephasing ( Qureg  qureg,
int  qubit1,
int  qubit2,
qreal  dephase 
)

Definition at line 2668 of file QuEST_gpu.cu.

2668  {
2669 
2670  if (dephase == 0)
2671  return;
2672 
2673  // assumes qubit2 > qubit1
2674 
2675  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
2676  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
2677 
2678  long long int colBit1 = 1LL << qubit1;
2679  long long int rowBit1 = 1LL << rowQubit1;
2680  long long int colBit2 = 1LL << qubit2;
2681  long long int rowBit2 = 1LL << rowQubit2;
2682 
2683  long long int part1 = colBit1 - 1;
2684  long long int part2 = (colBit2 >> 1) - colBit1;
2685  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
2686  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
2687  long long int part5 = (qureg.numAmpsPerChunk/16) - (rowBit2 >> 3);
2688  qreal dephFac = 1 - dephase;
2689 
2690  // refers to states |a 0 b 0 c><d 0 e 0 f| (target qubits are fixed)
2691  long long int numBackgroundStates = qureg.numAmpsPerChunk/16;
2692 
2693  // 12 of these states experience dephasing
2694  long long int numAmpsToVisit = 12 * numBackgroundStates;
2695 
2696  int threadsPerCUDABlock, CUDABlocks;
2697  threadsPerCUDABlock = 128;
2698  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2699  densmatr_mixTwoQubitDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2700  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBackgroundStates, numAmpsToVisit,
2701  part1, part2, part3, part4, part5, colBit1, rowBit1, colBit2, rowBit2);
2702 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by densmatr_mixTwoQubitDepolarising(), and mixTwoQubitDephasing().
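
An added note on the 12-of-16 counting: of the 16 possible bit combinations (i1, i2; j1, j2) of the two target qubits in |..i2..i1..><..j2..j1..|, the 4 combinations with i1 = j1 and i2 = j2 index elements that are diagonal with respect to the targets and are left untouched; the remaining 12 are scaled by dephFac = 1 - dephase, hence numAmpsToVisit = 12 * numBackgroundStates.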

◆ densmatr_mixTwoQubitDephasingKernel()

__global__ void densmatr_mixTwoQubitDephasingKernel ( qreal  fac,
qreal vecReal,
qreal vecImag,
long long int  numBackgroundStates,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  part4,
long long int  part5,
long long int  colBit1,
long long int  rowBit1,
long long int  colBit2,
long long int  rowBit2 
)

Called 12 times for every 16 amplitudes in the density matrix. Each thread works from the |..0..0..><..0..0..| index to visit either |..0..0..><..0..1..|, |..0..0..><..1..0..|, |..0..0..><..1..1..|, |..0..1..><..0..0..| and so on, up to |..1..1..><..1..0..|.

Labels |part1 0 part2 0 par><t3 0 part4 0 part5|. From the brain of Simon Benjamin

Definition at line 2644 of file QuEST_gpu.cu.

2648 {
2649  long long int outerInd = blockIdx.x*blockDim.x + threadIdx.x;
2650  if (outerInd >= numAmpsToVisit) return;
2651 
2652  // sets meta in 1...14 excluding 5, 10, creating bit string DCBA for |..D..C..><..B..A|
2653  int meta = 1 + (outerInd/numBackgroundStates);
2654  if (meta > 4) meta++;
2655  if (meta > 9) meta++;
2656 
2657  long long int shift = rowBit2*((meta>>3)%2) + rowBit1*((meta>>2)%2) + colBit2*((meta>>1)%2) + colBit1*(meta%2);
2658  long long int scanInd = outerInd % numBackgroundStates;
2659  long long int stateInd = (
2660  shift +
2661  (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4));
2662 
2663  vecReal[stateInd] *= fac;
2664  vecImag[stateInd] *= fac;
2665 }

◆ densmatr_mixTwoQubitDepolarising()

void densmatr_mixTwoQubitDepolarising ( Qureg  qureg,
int  qubit1,
int  qubit2,
qreal  depolLevel 
)

Definition at line 2838 of file QuEST_gpu.cu.

2838  {
2839 
2840  if (depolLevel == 0)
2841  return;
2842 
2843  // assumes qubit2 > qubit1
2844 
2845  densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel);
2846 
2847  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
2848  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
2849 
2850  long long int colBit1 = 1LL << qubit1;
2851  long long int rowBit1 = 1LL << rowQubit1;
2852  long long int colBit2 = 1LL << qubit2;
2853  long long int rowBit2 = 1LL << rowQubit2;
2854 
2855  long long int rowCol1 = colBit1 | rowBit1;
2856  long long int rowCol2 = colBit2 | rowBit2;
2857 
2858  long long int numAmpsToVisit = qureg.numAmpsPerChunk/16;
2859  long long int part1 = colBit1 - 1;
2860  long long int part2 = (colBit2 >> 1) - colBit1;
2861  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
2862  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
2863  long long int part5 = numAmpsToVisit - (rowBit2 >> 3);
2864 
2865  int threadsPerCUDABlock, CUDABlocks;
2866  threadsPerCUDABlock = 128;
2867  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2868  densmatr_mixTwoQubitDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2869  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2870  part1, part2, part3, part4, part5, rowCol1, rowCol2);
2871 }

References densmatr_mixTwoQubitDephasing(), Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by mixTwoQubitDepolarising().

◆ densmatr_mixTwoQubitDepolarisingKernel()

__global__ void densmatr_mixTwoQubitDepolarisingKernel ( qreal  depolLevel,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  part4,
long long int  part5,
long long int  rowCol1,
long long int  rowCol2 
)

Called once for every 16 amplitudes.

Definition at line 2806 of file QuEST_gpu.cu.

2811 {
2812  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2813  if (scanInd >= numAmpsToVisit) return;
2814 
2815  // index of |..0..0..><..0..0|
2816  long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4);
2817  long long int ind01 = ind00 + rowCol1;
2818  long long int ind10 = ind00 + rowCol2;
2819  long long int ind11 = ind00 + rowCol1 + rowCol2;
2820 
2821  qreal realAvDepol = depolLevel * 0.25 * (
2822  vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]);
2823  qreal imagAvDepol = depolLevel * 0.25 * (
2824  vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]);
2825 
2826  qreal retain = 1 - depolLevel;
2827  vecReal[ind00] *= retain; vecImag[ind00] *= retain;
2828  vecReal[ind01] *= retain; vecImag[ind01] *= retain;
2829  vecReal[ind10] *= retain; vecImag[ind10] *= retain;
2830  vecReal[ind11] *= retain; vecImag[ind11] *= retain;
2831 
2832  vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol;
2833  vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol;
2834  vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol;
2835  vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol;
2836 }

References qreal.

◆ densmatr_oneQubitDegradeOffDiagonal()

void densmatr_oneQubitDegradeOffDiagonal ( Qureg  qureg,
int  targetQubit,
qreal  dephFac 
)

Definition at line 2609 of file QuEST_gpu.cu.

2609  {
2610 
2611  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2612 
2613  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2614  long long int colBit = 1LL << targetQubit;
2615  long long int rowBit = 1LL << rowQubit;
2616 
2617  long long int part1 = colBit - 1;
2618  long long int part2 = (rowBit >> 1) - colBit;
2619  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2620 
2621  int threadsPerCUDABlock, CUDABlocks;
2622  threadsPerCUDABlock = 128;
2623  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2624  densmatr_mixDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2625  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2626  part1, part2, part3, colBit, rowBit);
2627 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by densmatr_mixDamping(), and densmatr_mixDephasing().
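
A worked example of the part1/part2/part3 index decomposition (added for illustration): for a 2-qubit density matrix (numAmpsPerChunk = 16) with targetQubit = 0, we get numAmpsToVisit = 4, colBit = 1, rowBit = 4 and (part1, part2, part3) = (0, 1, 2). The kernel's mapping ampInd = (s&part1) + ((s&part2)<<1) + ((s&part3)<<2) sends scan indices s = 0, 1, 2, 3 to ampInd = 0, 2, 8, 10, so the scaled entries ampInd+colBit = {1, 3, 9, 11} and ampInd+rowBit = {4, 6, 12, 14} are exactly the eight elements of the 4x4 density matrix whose column and row bits for qubit 0 differ.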

◆ extractBit()

◆ flipBit()

◆ getBitMaskParity()

__forceinline__ __device__ int getBitMaskParity ( long long int  mask)

Definition at line 86 of file QuEST_gpu.cu.

86  {
87  int parity = 0;
88  while (mask) {
89  parity = !parity;
90  mask = mask & (mask-1);
91  }
92  return parity;
93 }

Referenced by statevec_multiRotateZKernel().
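
For example (added): mask = 0b1101 has three set bits; each iteration clears the lowest set bit via mask & (mask-1) (0b1101 -> 0b1100 -> 0b1000 -> 0), toggling parity three times and returning 1.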

◆ getNumReductionLevels()

int getNumReductionLevels ( long long int  numValuesToReduce,
int  numReducedPerLevel 
)

Definition at line 1903 of file QuEST_gpu.cu.

1903  {
1904  int levels=0;
1905  while (numValuesToReduce){
1906  numValuesToReduce = numValuesToReduce/numReducedPerLevel;
1907  levels++;
1908  }
1909  return levels;
1910 }

◆ GPUExists()

int GPUExists ( void  )

Definition at line 390 of file QuEST_gpu.cu.

390  {
391  int deviceCount, device;
392  int gpuDeviceCount = 0;
393  struct cudaDeviceProp properties;
394  cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
395  if (cudaResultCode != cudaSuccess) deviceCount = 0;
396  /* machines with no GPUs can still report one emulation device */
397  for (device = 0; device < deviceCount; ++device) {
398  cudaGetDeviceProperties(&properties, device);
399  if (properties.major != 9999) { /* 9999 means emulation only */
400  ++gpuDeviceCount;
401  }
402  }
403  if (gpuDeviceCount) return 1;
404  else return 0;
405 }

Referenced by createQuESTEnv().

◆ insertTwoZeroBits()

__forceinline__ __device__ long long int insertTwoZeroBits ( const long long int  number,
const int  bit1,
const int  bit2 
)

Definition at line 106 of file QuEST_gpu.cu.

106  {
107  int small = (bit1 < bit2)? bit1 : bit2;
108  int big = (bit1 < bit2)? bit2 : bit1;
109  return insertZeroBit(insertZeroBit(number, small), big);
110 }

References insertZeroBit().

Referenced by statevec_multiControlledTwoQubitUnitaryKernel(), statevec_multiControlledTwoQubitUnitaryLocal(), statevec_swapQubitAmpsKernel(), and statevec_swapQubitAmpsLocal().

◆ insertZeroBit()

__forceinline__ __device__ long long int insertZeroBit ( const long long int  number,
const int  index 
)

Definition at line 99 of file QuEST_gpu.cu.

99  {
100  long long int left, right;
101  left = (number >> index) << index;
102  right = number - left;
103  return (left << 1) ^ right;
104 }

Referenced by insertTwoZeroBits(), insertZeroBits(), and statevec_multiControlledMultiQubitUnitaryLocal().
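
A worked example (added): insertZeroBit(0b1011, 2) computes left = 0b1000 and right = 0b011, returning (left << 1) ^ right = 0b10011, i.e. the original bits with a zero inserted at (zero-based) position 2. insertTwoZeroBits() applies this twice, inserting at the smaller index first so that both zeros land at the requested final positions.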

◆ insertZeroBits()

__forceinline__ __device__ long long int insertZeroBits ( long long int  number,
int *  inds,
const int  numInds 
)

Definition at line 112 of file QuEST_gpu.cu.

112  {
113  /* inserted bit inds must strictly increase, so that their final indices are correct.
114  * in-lieu of sorting (avoided since no C++ variable-size arrays, and since we're already
115  * memory bottle-necked so overhead eats this slowdown), we find the next-smallest index each
116  * at each insert. recall every element of inds (a positive or zero number) is unique.
117  * This function won't appear in the CPU code, which can use C99 variable-size arrays and
118  * ought to make a sorted array before threading
119  */
120  int curMin = inds[0];
121  int prevMin = -1;
122  for (int n=0; n < numInds; n++) {
123 
124  // find next min
125  for (int t=0; t < numInds; t++)
126  if (inds[t]>prevMin && inds[t]<curMin)
127  curMin = inds[t];
128 
129  number = insertZeroBit(number, curMin);
130 
131  // set curMin to an arbitrary non-visited elem
132  prevMin = curMin;
133  for (int t=0; t < numInds; t++)
134  if (inds[t] > curMin) {
135  curMin = inds[t];
136  break;
137  }
138  }
139  return number;
140 }

References insertZeroBit().

Referenced by statevec_multiControlledMultiQubitUnitaryKernel().

◆ log2Int()

__device__ __host__ unsigned int log2Int ( unsigned int  x)

Definition at line 1780 of file QuEST_gpu.cu.

1781 {
1782  unsigned int ans = 0 ;
1783  while( x>>=1 ) ans++;
1784  return ans ;
1785 }

Referenced by reduceBlock().

◆ reduceBlock()

__device__ void reduceBlock ( qreal arrayIn,
qreal reducedArray,
int  length 
)

Definition at line 1787 of file QuEST_gpu.cu.

1787  {
1788  int i, l, r;
1789  int threadMax, maxDepth;
1790  threadMax = length/2;
1791  maxDepth = log2Int(length/2);
1792 
1793  for (i=0; i<maxDepth+1; i++){
1794  if (threadIdx.x<threadMax){
1795  l = threadIdx.x;
1796  r = l + threadMax;
1797  arrayIn[l] = arrayIn[r] + arrayIn[l];
1798  }
1799  threadMax = threadMax >> 1;
1800  __syncthreads(); // optimise -- use warp shuffle instead
1801  }
1802 
1803  if (threadIdx.x==0) reducedArray[blockIdx.x] = arrayIn[0];
1804 }

References log2Int().

Referenced by copySharedReduceBlock(), densmatr_calcExpecDiagonalOpKernel(), densmatr_calcFidelityKernel(), densmatr_calcHilbertSchmidtDistanceSquaredKernel(), densmatr_calcInnerProductKernel(), densmatr_calcPurityKernel(), densmatr_findProbabilityOfZeroKernel(), statevec_calcExpecDiagonalOpKernel(), statevec_calcInnerProductKernel(), and statevec_findProbabilityOfZeroKernel().

◆ statevec_applyDiagonalOp()

void statevec_applyDiagonalOp ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 2939 of file QuEST_gpu.cu.

2940 {
2941  int threadsPerCUDABlock, CUDABlocks;
2942  threadsPerCUDABlock = 128;
2943  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
2944  statevec_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
2945 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyDiagonalOp().

◆ statevec_applyDiagonalOpKernel()

__global__ void statevec_applyDiagonalOpKernel ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 2917 of file QuEST_gpu.cu.

2917  {
2918 
2919  // each thread modifies one value; a wasteful and inefficient strategy
2920  long long int numTasks = qureg.numAmpsPerChunk;
2921  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2922  if (thisTask >= numTasks) return;
2923 
2924  qreal* stateRe = qureg.deviceStateVec.real;
2925  qreal* stateIm = qureg.deviceStateVec.imag;
2926  qreal* opRe = op.deviceOperator.real;
2927  qreal* opIm = op.deviceOperator.imag;
2928 
2929  qreal a = stateRe[thisTask];
2930  qreal b = stateIm[thisTask];
2931  qreal c = opRe[thisTask];
2932  qreal d = opIm[thisTask];
2933 
2934  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
2935  stateRe[thisTask] = a*c - b*d;
2936  stateIm[thisTask] = a*d + b*c;
2937 }

References DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_calcExpecDiagonalOp()

Complex statevec_calcExpecDiagonalOp ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 3006 of file QuEST_gpu.cu.

3006  {
3007 
3008  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3009  * (e.g. a func which accepts a pointer to do every-value reduction?)
3010  */
3011 
3012  qreal expecReal, expecImag;
3013 
3014  int getRealComp;
3015  long long int numValuesToReduce;
3016  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3017  int maxReducedPerLevel;
3018  int firstTime;
3019 
3020  // compute real component of inner product
3021  getRealComp = 1;
3022  numValuesToReduce = qureg.numAmpsPerChunk;
3023  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3024  firstTime = 1;
3025  while (numValuesToReduce > 1) {
3026  if (numValuesToReduce < maxReducedPerLevel) {
3027  valuesPerCUDABlock = numValuesToReduce;
3028  numCUDABlocks = 1;
3029  }
3030  else {
3031  valuesPerCUDABlock = maxReducedPerLevel;
3032  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3033  }
3034  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3035  if (firstTime) {
3036  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3037  getRealComp,
3038  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3039  op.deviceOperator.real, op.deviceOperator.imag,
3040  numValuesToReduce,
3041  qureg.firstLevelReduction);
3042  firstTime = 0;
3043  } else {
3044  cudaDeviceSynchronize();
3045  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3046  qureg.firstLevelReduction,
3047  qureg.secondLevelReduction, valuesPerCUDABlock);
3048  cudaDeviceSynchronize();
3049  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3050  }
3051  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3052  }
3053  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3054 
3055  // compute imag component of inner product
3056  getRealComp = 0;
3057  numValuesToReduce = qureg.numAmpsPerChunk;
3058  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3059  firstTime = 1;
3060  while (numValuesToReduce > 1) {
3061  if (numValuesToReduce < maxReducedPerLevel) {
3062  valuesPerCUDABlock = numValuesToReduce;
3063  numCUDABlocks = 1;
3064  }
3065  else {
3066  valuesPerCUDABlock = maxReducedPerLevel;
3067  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3068  }
3069  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3070  if (firstTime) {
3071  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3072  getRealComp,
3073  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3074  op.deviceOperator.real, op.deviceOperator.imag,
3075  numValuesToReduce,
3076  qureg.firstLevelReduction);
3077  firstTime = 0;
3078  } else {
3079  cudaDeviceSynchronize();
3080  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3081  qureg.firstLevelReduction,
3082  qureg.secondLevelReduction, valuesPerCUDABlock);
3083  cudaDeviceSynchronize();
3084  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3085  }
3086  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3087  }
3088  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3089 
3090  // return complex
3091  Complex expecVal;
3092  expecVal.real = expecReal;
3093  expecVal.imag = expecImag;
3094  return expecVal;
3095 }

References copySharedReduceBlock(), DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::firstLevelReduction, Complex::imag, Qureg::numAmpsPerChunk, qreal, Complex::real, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcExpecDiagonalOp().
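
The in-code @TODO above asks for this per-function reduction boilerplate to be factored out. A minimal host-side sketch of such a helper (not part of the QuEST API), assuming the surrounding file's qreal, REDUCE_SHARED_SIZE, copySharedReduceBlock() and swapDouble() definitions, and the power-of-two partial-sum counts guaranteed by numAmpsPerChunk:

// Hypothetical helper: folds per-block partial sums (already written into reduc1 by a
// problem-specific kernel) down to a single value, alternating between the two buffers.
qreal finishReduction(qreal* reduc1, qreal* reduc2, long long int numPartialSums) {
    while (numPartialSums > 1) {
        int valuesPerCUDABlock = (numPartialSums < REDUCE_SHARED_SIZE)?
            (int) numPartialSums : REDUCE_SHARED_SIZE;
        int numCUDABlocks = ceil(numPartialSums / (qreal) valuesPerCUDABlock);
        int sharedMemSize = valuesPerCUDABlock * sizeof(qreal);

        cudaDeviceSynchronize();
        copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
            reduc1, reduc2, valuesPerCUDABlock);
        cudaDeviceSynchronize();

        swapDouble(&reduc1, &reduc2);   // the freshly reduced values become the next input
        numPartialSums = numCUDABlocks;
    }
    qreal finalValue;
    cudaMemcpy(&finalValue, reduc1, sizeof(qreal), cudaMemcpyDeviceToHost);
    return finalValue;
}

statevec_calcExpecDiagonalOp() could then launch statevec_calcExpecDiagonalOpKernel() once per component and pass qureg.firstLevelReduction / qureg.secondLevelReduction to the helper, instead of repeating the loop for the real and imaginary parts.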

◆ statevec_calcExpecDiagonalOpKernel()

__global__ void statevec_calcExpecDiagonalOpKernel ( int  getRealComp,
qreal vecReal,
qreal vecImag,
qreal opReal,
qreal opImag,
long long int  numTermsToSum,
qreal reducedArray 
)

computes either a real or imag term of |vec_i|^2 op_i

Definition at line 2979 of file QuEST_gpu.cu.

2983 {
2984  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2985  if (index >= numTermsToSum) return;
2986 
2987  qreal vecAbs = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
2988 
2989  // choose whether to calculate the real or imaginary term of the expec term
2990  qreal expecVal;
2991  if (getRealComp)
2992  expecVal = vecAbs * opReal[index];
2993  else
2994  expecVal = vecAbs * opImag[index];
2995 
2996  // array of each thread's collected sum term, to be summed
2997  extern __shared__ qreal tempReductionArray[];
2998  tempReductionArray[threadIdx.x] = expecVal;
2999  __syncthreads();
3000 
3001  // every second thread reduces
3002  if (threadIdx.x<blockDim.x/2)
3003  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3004 }

References qreal, and reduceBlock().

◆ statevec_calcInnerProduct()

Complex statevec_calcInnerProduct ( Qureg  bra,
Qureg  ket 
)

Terrible code which unnecessarily individually computes and sums the real and imaginary components of the inner product, so as to not have to worry about keeping the sums separated during reduction.

Truly disgusting, probably doubles runtime, please fix. @TODO could even do the kernel twice, storing real in bra.reduc and imag in ket.reduc?

Definition at line 2123 of file QuEST_gpu.cu.

2123  {
2124 
2125  qreal innerProdReal, innerProdImag;
2126 
2127  int getRealComp;
2128  long long int numValuesToReduce;
2129  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2130  int maxReducedPerLevel;
2131  int firstTime;
2132 
2133  // compute real component of inner product
2134  getRealComp = 1;
2135  numValuesToReduce = bra.numAmpsPerChunk;
2136  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2137  firstTime = 1;
2138  while (numValuesToReduce > 1) {
2139  if (numValuesToReduce < maxReducedPerLevel) {
2140  valuesPerCUDABlock = numValuesToReduce;
2141  numCUDABlocks = 1;
2142  }
2143  else {
2144  valuesPerCUDABlock = maxReducedPerLevel;
2145  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2146  }
2147  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2148  if (firstTime) {
2149  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2150  getRealComp,
2151  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2152  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2153  numValuesToReduce,
2154  bra.firstLevelReduction);
2155  firstTime = 0;
2156  } else {
2157  cudaDeviceSynchronize();
2158  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2159  bra.firstLevelReduction,
2160  bra.secondLevelReduction, valuesPerCUDABlock);
2161  cudaDeviceSynchronize();
2162  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
2163  }
2164  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2165  }
2166  cudaMemcpy(&innerProdReal, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2167 
2168  // compute imag component of inner product
2169  getRealComp = 0;
2170  numValuesToReduce = bra.numAmpsPerChunk;
2171  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2172  firstTime = 1;
2173  while (numValuesToReduce > 1) {
2174  if (numValuesToReduce < maxReducedPerLevel) {
2175  valuesPerCUDABlock = numValuesToReduce;
2176  numCUDABlocks = 1;
2177  }
2178  else {
2179  valuesPerCUDABlock = maxReducedPerLevel;
2180  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2181  }
2182  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2183  if (firstTime) {
2184  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2185  getRealComp,
2186  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2187  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2188  numValuesToReduce,
2189  bra.firstLevelReduction);
2190  firstTime = 0;
2191  } else {
2192  cudaDeviceSynchronize();
2193  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2194  bra.firstLevelReduction,
2195  bra.secondLevelReduction, valuesPerCUDABlock);
2196  cudaDeviceSynchronize();
2197  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
2198  }
2199  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2200  }
2201  cudaMemcpy(&innerProdImag, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2202 
2203  // return complex
2204  Complex innerProd;
2205  innerProd.real = innerProdReal;
2206  innerProd.imag = innerProdImag;
2207  return innerProd;
2208 }

References copySharedReduceBlock(), Qureg::deviceStateVec, Qureg::firstLevelReduction, Complex::imag, Qureg::numAmpsPerChunk, qreal, Complex::real, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcInnerProduct(), statevec_calcExpecPauliProd(), and statevec_calcFidelity().

◆ statevec_calcInnerProductKernel()

__global__ void statevec_calcInnerProductKernel ( int  getRealComp,
qreal vecReal1,
qreal vecImag1,
qreal vecReal2,
qreal vecImag2,
long long int  numTermsToSum,
qreal reducedArray 
)

computes either a real or imag term in the inner product

Definition at line 2093 of file QuEST_gpu.cu.

2097 {
2098  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2099  if (index >= numTermsToSum) return;
2100 
2101  // choose whether to calculate the real or imaginary term of the inner product
2102  qreal innerProdTerm;
2103  if (getRealComp)
2104  innerProdTerm = vecReal1[index]*vecReal2[index] + vecImag1[index]*vecImag2[index];
2105  else
2106  innerProdTerm = vecReal1[index]*vecImag2[index] - vecImag1[index]*vecReal2[index];
2107 
2108  // array of each thread's collected sum term, to be summed
2109  extern __shared__ qreal tempReductionArray[];
2110  tempReductionArray[threadIdx.x] = innerProdTerm;
2111  __syncthreads();
2112 
2113  // every second thread reduces
2114  if (threadIdx.x<blockDim.x/2)
2115  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2116 }

References qreal, and reduceBlock().
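
In the kernel's notation (added note), with bra amplitudes a_k = vecReal1[k] + i vecImag1[k] and ket amplitudes b_k = vecReal2[k] + i vecImag2[k], the two summed quantities are the real and imaginary parts of

\langle \text{bra} | \text{ket} \rangle = \sum_k a_k^* b_k, \qquad \text{Re: } a_{k,r} b_{k,r} + a_{k,i} b_{k,i}, \qquad \text{Im: } a_{k,r} b_{k,i} - a_{k,i} b_{k,r},

matching the two branches on getRealComp.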

◆ statevec_calcProbOfOutcome()

qreal statevec_calcProbOfOutcome ( Qureg  qureg,
int  measureQubit,
int  outcome 
)

Definition at line 2005 of file QuEST_gpu.cu.

2006 {
2007  qreal outcomeProb = statevec_findProbabilityOfZero(qureg, measureQubit);
2008  if (outcome==1)
2009  outcomeProb = 1.0 - outcomeProb;
2010  return outcomeProb;
2011 }

References qreal, and statevec_findProbabilityOfZero().

Referenced by calcProbOfOutcome(), collapseToOutcome(), and statevec_measureWithStats().

◆ statevec_calcTotalProb()

qreal statevec_calcTotalProb ( Qureg  qureg)

Definition at line 1554 of file QuEST_gpu.cu.

1554  {
1555  /* IJB - implemented using Kahan summation for greater accuracy at a slight floating
1556  point operation overhead. For more details see https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
1557  /* Don't change the bracketing in this routine! */
1558  qreal pTotal=0;
1559  qreal y, t, c;
1560  long long int index;
1561  long long int numAmpsPerRank = qureg.numAmpsPerChunk;
1562 
1563  copyStateFromGPU(qureg);
1564 
1565  c = 0.0;
1566  for (index=0; index<numAmpsPerRank; index++){
1567  /* Perform pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index]; by Kahan */
1568  // pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index];
1569  y = qureg.stateVec.real[index]*qureg.stateVec.real[index] - c;
1570  t = pTotal + y;
1571  c = ( t - pTotal ) - y;
1572  pTotal = t;
1573 
1574  /* Perform pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index]; by Kahan */
1575  //pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index];
1576  y = qureg.stateVec.imag[index]*qureg.stateVec.imag[index] - c;
1577  t = pTotal + y;
1578  c = ( t - pTotal ) - y;
1579  pTotal = t;
1580 
1581 
1582  }
1583  return pTotal;
1584 }

References copyStateFromGPU(), Qureg::numAmpsPerChunk, qreal, and Qureg::stateVec.

Referenced by calcTotalProb().

◆ statevec_cloneQureg()

void statevec_cloneQureg ( Qureg  targetQureg,
Qureg  copyQureg 
)

works for both statevectors and density matrices

Definition at line 170 of file QuEST_gpu.cu.

170  {
171 
172  // copy copyQureg's GPU statevec to targetQureg's GPU statevec
173  cudaDeviceSynchronize();
174  cudaMemcpy(
175  targetQureg.deviceStateVec.real,
176  copyQureg.deviceStateVec.real,
177  targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.real)),
178  cudaMemcpyDeviceToDevice);
179  cudaMemcpy(
180  targetQureg.deviceStateVec.imag,
181  copyQureg.deviceStateVec.imag,
182  targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.imag)),
183  cudaMemcpyDeviceToDevice);
184 }

References Qureg::deviceStateVec, and Qureg::numAmpsPerChunk.

Referenced by cloneQureg(), createCloneQureg(), initPureState(), and statevec_calcExpecPauliProd().

◆ statevec_collapseToKnownProbOutcome()

void statevec_collapseToKnownProbOutcome ( Qureg  qureg,
int  measureQubit,
int  outcome,
qreal  outcomeProb 
)

Definition at line 2500 of file QuEST_gpu.cu.

2501 {
2502  int threadsPerCUDABlock, CUDABlocks;
2503  threadsPerCUDABlock = 128;
2504  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
2505  statevec_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, measureQubit, outcome, outcomeProb);
2506 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by collapseToOutcome(), and statevec_measureWithStats().

◆ statevec_collapseToKnownProbOutcomeKernel()

__global__ void statevec_collapseToKnownProbOutcomeKernel ( Qureg  qureg,
int  measureQubit,
int  outcome,
qreal  totalProbability 
)

Definition at line 2443 of file QuEST_gpu.cu.

2444 {
2445  // ----- sizes
2446  long long int sizeBlock, // size of blocks
2447  sizeHalfBlock; // size of blocks halved
2448  // ----- indices
2449  long long int thisBlock, // current block
2450  index; // current index for first half block
2451  // ----- measured probability
2452  qreal renorm; // probability (returned) value
2453  // ----- temp variables
2454  long long int thisTask; // task based approach for expose loop with small granularity
2455  // (good for shared memory parallelism)
2456  long long int numTasks=qureg.numAmpsPerChunk>>1;
2457 
2458  // ---------------------------------------------------------------- //
2459  // dimensions //
2460  // ---------------------------------------------------------------- //
2461  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
2462  // and then the number to skip
2463  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
2464 
2465  // ---------------------------------------------------------------- //
2466  // find probability //
2467  // ---------------------------------------------------------------- //
2468 
2469  //
2470  // --- task-based shared-memory parallel implementation
2471  //
2472  renorm=1/sqrt(totalProbability);
2473  qreal *stateVecReal = qureg.deviceStateVec.real;
2474  qreal *stateVecImag = qureg.deviceStateVec.imag;
2475 
2476  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2477  if (thisTask>=numTasks) return;
2478  thisBlock = thisTask / sizeHalfBlock;
2479  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
2480 
2481  if (outcome==0){
2482  stateVecReal[index]=stateVecReal[index]*renorm;
2483  stateVecImag[index]=stateVecImag[index]*renorm;
2484 
2485  stateVecReal[index+sizeHalfBlock]=0;
2486  stateVecImag[index+sizeHalfBlock]=0;
2487  } else if (outcome==1){
2488  stateVecReal[index]=0;
2489  stateVecImag[index]=0;
2490 
2491  stateVecReal[index+sizeHalfBlock]=stateVecReal[index+sizeHalfBlock]*renorm;
2492  stateVecImag[index+sizeHalfBlock]=stateVecImag[index+sizeHalfBlock]*renorm;
2493  }
2494 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.
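
The kernel implements the post-measurement renormalisation (added note): with m = measureQubit, o = outcome and p = totalProbability, every amplitude is updated as

\alpha_j \to \alpha_j / \sqrt{p} \ \text{ if bit } m \text{ of } j \text{ equals } o, \qquad \alpha_j \to 0 \ \text{ otherwise}.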

◆ statevec_compactUnitary()

void statevec_compactUnitary ( Qureg  qureg,
int  targetQubit,
Complex  alpha,
Complex  beta 
)

Definition at line 776 of file QuEST_gpu.cu.

777 {
778  int threadsPerCUDABlock, CUDABlocks;
779  threadsPerCUDABlock = 128;
780  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
781  statevec_compactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, alpha, beta);
782 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by compactUnitary(), statevec_multiRotatePauli(), statevec_rotateAroundAxis(), and statevec_rotateAroundAxisConj().

◆ statevec_compactUnitaryKernel()

__global__ void statevec_compactUnitaryKernel ( Qureg  qureg,
int  rotQubit,
Complex  alpha,
Complex  beta 
)

fix – not necessary for GPU version

Definition at line 721 of file QuEST_gpu.cu.

721  {
722  // ----- sizes
723  long long int sizeBlock, // size of blocks
724  sizeHalfBlock; // size of blocks halved
725  // ----- indices
726  long long int thisBlock, // current block
727  indexUp,indexLo; // current index and corresponding index in lower half block
728 
729  // ----- temp variables
730  qreal stateRealUp,stateRealLo, // storage for previous state values
731  stateImagUp,stateImagLo; // (used in updates)
732  // ----- temp variables
733  long long int thisTask; // task based approach for expose loop with small granularity
734  long long int numTasks=qureg.numAmpsPerChunk>>1;
735 
736  sizeHalfBlock = 1LL << rotQubit; // size of blocks halved
737  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
738 
739  // ---------------------------------------------------------------- //
740  // rotate //
741  // ---------------------------------------------------------------- //
742 
744  qreal *stateVecReal = qureg.deviceStateVec.real;
745  qreal *stateVecImag = qureg.deviceStateVec.imag;
746  qreal alphaImag=alpha.imag, alphaReal=alpha.real;
747  qreal betaImag=beta.imag, betaReal=beta.real;
748 
749  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
750  if (thisTask>=numTasks) return;
751 
752  thisBlock = thisTask / sizeHalfBlock;
753  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
754  indexLo = indexUp + sizeHalfBlock;
755 
756  // store current state vector values in temp variables
757  stateRealUp = stateVecReal[indexUp];
758  stateImagUp = stateVecImag[indexUp];
759 
760  stateRealLo = stateVecReal[indexLo];
761  stateImagLo = stateVecImag[indexLo];
762 
763  // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
764  stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
765  - betaReal*stateRealLo - betaImag*stateImagLo;
766  stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
767  - betaReal*stateImagLo + betaImag*stateRealLo;
768 
769  // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
770  stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
771  + alphaReal*stateRealLo + alphaImag*stateImagLo;
772  stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
773  + alphaReal*stateImagLo - alphaImag*stateRealLo;
774 }

References Qureg::deviceStateVec, Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.
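
For clarity (added): the four update lines expand the complex products of the compact single-qubit unitary

U = \begin{pmatrix} \alpha & -\beta^* \\ \beta & \alpha^* \end{pmatrix}, \qquad |\alpha|^2 + |\beta|^2 = 1,

applied to each (indexUp, indexLo) amplitude pair, exactly as stated in the in-code comments.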

◆ statevec_compareStates()

int statevec_compareStates ( Qureg  mq1,
Qureg  mq2,
qreal  precision 
)

Definition at line 703 of file QuEST_gpu.cu.

703  {
704  qreal diff;
705  int chunkSize = mq1.numAmpsPerChunk;
706 
707  copyStateFromGPU(mq1);
708  copyStateFromGPU(mq2);
709 
710  for (int i=0; i<chunkSize; i++){
711  diff = mq1.stateVec.real[i] - mq2.stateVec.real[i];
712  if (diff<0) diff *= -1;
713  if (diff>precision) return 0;
714  diff = mq1.stateVec.imag[i] - mq2.stateVec.imag[i];
715  if (diff<0) diff *= -1;
716  if (diff>precision) return 0;
717  }
718  return 1;
719 }

References copyStateFromGPU(), Qureg::numAmpsPerChunk, qreal, and Qureg::stateVec.

Referenced by compareStates().

◆ statevec_controlledCompactUnitary()

void statevec_controlledCompactUnitary ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
Complex  alpha,
Complex  beta 
)

Definition at line 843 of file QuEST_gpu.cu.

844 {
845  int threadsPerCUDABlock, CUDABlocks;
846  threadsPerCUDABlock = 128;
847  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
848  statevec_controlledCompactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, alpha, beta);
849 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledCompactUnitary(), statevec_controlledRotateAroundAxis(), and statevec_controlledRotateAroundAxisConj().

◆ statevec_controlledCompactUnitaryKernel()

__global__ void statevec_controlledCompactUnitaryKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
Complex  alpha,
Complex  beta 
)

fix – not necessary for GPU version

Definition at line 784 of file QuEST_gpu.cu.

784  {
785  // ----- sizes
786  long long int sizeBlock, // size of blocks
787  sizeHalfBlock; // size of blocks halved
788  // ----- indices
789  long long int thisBlock, // current block
790  indexUp,indexLo; // current index and corresponding index in lower half block
791 
792  // ----- temp variables
793  qreal stateRealUp,stateRealLo, // storage for previous state values
794  stateImagUp,stateImagLo; // (used in updates)
795  // ----- temp variables
796  long long int thisTask; // task based approach for expose loop with small granularity
797  long long int numTasks=qureg.numAmpsPerChunk>>1;
798  int controlBit;
799 
800  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
801  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
802 
803  // ---------------------------------------------------------------- //
804  // rotate //
805  // ---------------------------------------------------------------- //
806 
808  qreal *stateVecReal = qureg.deviceStateVec.real;
809  qreal *stateVecImag = qureg.deviceStateVec.imag;
810  qreal alphaImag=alpha.imag, alphaReal=alpha.real;
811  qreal betaImag=beta.imag, betaReal=beta.real;
812 
813  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
814  if (thisTask>=numTasks) return;
815 
816  thisBlock = thisTask / sizeHalfBlock;
817  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
818  indexLo = indexUp + sizeHalfBlock;
819 
820  controlBit = extractBit(controlQubit, indexUp);
821  if (controlBit){
822  // store current state vector values in temp variables
823  stateRealUp = stateVecReal[indexUp];
824  stateImagUp = stateVecImag[indexUp];
825 
826  stateRealLo = stateVecReal[indexLo];
827  stateImagLo = stateVecImag[indexLo];
828 
829  // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
830  stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
831  - betaReal*stateRealLo - betaImag*stateImagLo;
832  stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
833  - betaReal*stateImagLo + betaImag*stateRealLo;
834 
835  // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
836  stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
837  + alphaReal*stateRealLo + alphaImag*stateImagLo;
838  stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
839  + alphaReal*stateImagLo - alphaImag*stateRealLo;
840  }
841 }

References Qureg::deviceStateVec, extractBit(), Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

◆ statevec_controlledNot()

void statevec_controlledNot ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1772 of file QuEST_gpu.cu.

1773 {
1774  int threadsPerCUDABlock, CUDABlocks;
1775  threadsPerCUDABlock = 128;
1776  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1777  statevec_controlledNotKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit);
1778 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledNot().

◆ statevec_controlledNotKernel()

__global__ void statevec_controlledNotKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1733 of file QuEST_gpu.cu.

1734 {
1735  long long int index;
1736  long long int sizeBlock, // size of blocks
1737  sizeHalfBlock; // size of blocks halved
1738  long long int stateVecSize;
1739  int controlBit;
1740 
1741  // ----- temp variables
1742  qreal stateRealUp, // storage for previous state values
1743  stateImagUp; // (used in updates)
1744  long long int thisBlock, // current block
1745  indexUp,indexLo; // current index and corresponding index in lower half block
1746  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1747  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1748 
1749  stateVecSize = qureg.numAmpsPerChunk;
1750  qreal *stateVecReal = qureg.deviceStateVec.real;
1751  qreal *stateVecImag = qureg.deviceStateVec.imag;
1752 
1753  index = blockIdx.x*blockDim.x + threadIdx.x;
1754  if (index>=(stateVecSize>>1)) return;
1755  thisBlock = index / sizeHalfBlock;
1756  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1757  indexLo = indexUp + sizeHalfBlock;
1758 
1759  controlBit = extractBit(controlQubit, indexUp);
1760  if (controlBit){
1761  stateRealUp = stateVecReal[indexUp];
1762  stateImagUp = stateVecImag[indexUp];
1763 
1764  stateVecReal[indexUp] = stateVecReal[indexLo];
1765  stateVecImag[indexUp] = stateVecImag[indexLo];
1766 
1767  stateVecReal[indexLo] = stateRealUp;
1768  stateVecImag[indexLo] = stateImagUp;
1769  }
1770 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledPauliY()

void statevec_controlledPauliY ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1377 of file QuEST_gpu.cu.

1378 {
1379  int conjFactor = 1;
1380  int threadsPerCUDABlock, CUDABlocks;
1381  threadsPerCUDABlock = 128;
1382  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1383  statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
1384 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPauliY().

◆ statevec_controlledPauliYConj()

void statevec_controlledPauliYConj ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1386 of file QuEST_gpu.cu.

1387 {
1388  int conjFactor = -1;
1389  int threadsPerCUDABlock, CUDABlocks;
1390  threadsPerCUDABlock = 128;
1391  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1392  statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
1393 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPauliY().

◆ statevec_controlledPauliYKernel()

__global__ void statevec_controlledPauliYKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
int  conjFac 
)

Definition at line 1341 of file QuEST_gpu.cu.

1342 {
1343  long long int index;
1344  long long int sizeBlock, sizeHalfBlock;
1345  long long int stateVecSize;
1346  int controlBit;
1347 
1348  qreal stateRealUp, stateImagUp;
1349  long long int thisBlock, indexUp, indexLo;
1350  sizeHalfBlock = 1LL << targetQubit;
1351  sizeBlock = 2LL * sizeHalfBlock;
1352 
1353  stateVecSize = qureg.numAmpsPerChunk;
1354  qreal *stateVecReal = qureg.deviceStateVec.real;
1355  qreal *stateVecImag = qureg.deviceStateVec.imag;
1356 
1357  index = blockIdx.x*blockDim.x + threadIdx.x;
1358  if (index>=(stateVecSize>>1)) return;
1359  thisBlock = index / sizeHalfBlock;
1360  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1361  indexLo = indexUp + sizeHalfBlock;
1362 
1363  controlBit = extractBit(controlQubit, indexUp);
1364  if (controlBit){
1365 
1366  stateRealUp = stateVecReal[indexUp];
1367  stateImagUp = stateVecImag[indexUp];
1368 
1369  // update under +-{{0, -i}, {i, 0}}
1370  stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
1371  stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
1372  stateVecReal[indexLo] = conjFac * -stateImagUp;
1373  stateVecImag[indexLo] = conjFac * stateRealUp;
1374  }
1375 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledPhaseFlip()

void statevec_controlledPhaseFlip ( Qureg  qureg,
int  idQubit1,
int  idQubit2 
)

Definition at line 1607 of file QuEST_gpu.cu.

1608 {
1609  int threadsPerCUDABlock, CUDABlocks;
1610  threadsPerCUDABlock = 128;
1611  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1612  statevec_controlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2);
1613 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPhaseFlip().

◆ statevec_controlledPhaseFlipKernel()

__global__ void statevec_controlledPhaseFlipKernel ( Qureg  qureg,
int  idQubit1,
int  idQubit2 
)

Definition at line 1586 of file QuEST_gpu.cu.

1587 {
1588  long long int index;
1589  long long int stateVecSize;
1590  int bit1, bit2;
1591 
1592  stateVecSize = qureg.numAmpsPerChunk;
1593  qreal *stateVecReal = qureg.deviceStateVec.real;
1594  qreal *stateVecImag = qureg.deviceStateVec.imag;
1595 
1596  index = blockIdx.x*blockDim.x + threadIdx.x;
1597  if (index>=stateVecSize) return;
1598 
1599  bit1 = extractBit (idQubit1, index);
1600  bit2 = extractBit (idQubit2, index);
1601  if (bit1 && bit2) {
1602  stateVecReal [index] = - stateVecReal [index];
1603  stateVecImag [index] = - stateVecImag [index];
1604  }
1605 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledPhaseShift()

void statevec_controlledPhaseShift ( Qureg  qureg,
int  idQubit1,
int  idQubit2,
qreal  angle 
)

Definition at line 1459 of file QuEST_gpu.cu.

1460 {
1461  qreal cosAngle = cos(angle);
1462  qreal sinAngle = sin(angle);
1463 
1464  int threadsPerCUDABlock, CUDABlocks;
1465  threadsPerCUDABlock = 128;
1466  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1467  statevec_controlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2, cosAngle, sinAngle);
1468 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPhaseShift().

◆ statevec_controlledPhaseShiftKernel()

__global__ void statevec_controlledPhaseShiftKernel ( Qureg  qureg,
int  idQubit1,
int  idQubit2,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1434 of file QuEST_gpu.cu.

1435 {
1436  long long int index;
1437  long long int stateVecSize;
1438  int bit1, bit2;
1439  qreal stateRealLo, stateImagLo;
1440 
1441  stateVecSize = qureg.numAmpsPerChunk;
1442  qreal *stateVecReal = qureg.deviceStateVec.real;
1443  qreal *stateVecImag = qureg.deviceStateVec.imag;
1444 
1445  index = blockIdx.x*blockDim.x + threadIdx.x;
1446  if (index>=stateVecSize) return;
1447 
1448  bit1 = extractBit (idQubit1, index);
1449  bit2 = extractBit (idQubit2, index);
1450  if (bit1 && bit2) {
1451  stateRealLo = stateVecReal[index];
1452  stateImagLo = stateVecImag[index];
1453 
1454  stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1455  stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1456  }
1457 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.
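
Equivalently (added note): when both qubits are 1, the amplitude is multiplied by e^{i\theta} = \cos\theta + i\sin\theta, so

(a_r + i\,a_i) \;\to\; (\cos\theta\,a_r - \sin\theta\,a_i) + i\,(\sin\theta\,a_r + \cos\theta\,a_i),

which is what the cosAngle/sinAngle lines compute.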

◆ statevec_controlledUnitary()

void statevec_controlledUnitary ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
ComplexMatrix2  u 
)

Definition at line 1169 of file QuEST_gpu.cu.

1170 {
1171  int threadsPerCUDABlock, CUDABlocks;
1172  threadsPerCUDABlock = 128;
1173  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1174  statevec_controlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, argifyMatrix2(u));
1175 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledUnitary().

◆ statevec_controlledUnitaryKernel()

__global__ void statevec_controlledUnitaryKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
ArgMatrix2  u 
)

fix – not necessary for GPU version

Definition at line 1111 of file QuEST_gpu.cu.

1111  {
1112  // ----- sizes
1113  long long int sizeBlock, // size of blocks
1114  sizeHalfBlock; // size of blocks halved
1115  // ----- indices
1116  long long int thisBlock, // current block
1117  indexUp,indexLo; // current index and corresponding index in lower half block
1118 
1119  // ----- temp variables
1120  qreal stateRealUp,stateRealLo, // storage for previous state values
1121  stateImagUp,stateImagLo; // (used in updates)
1122  // ----- temp variables
1123  long long int thisTask; // task based approach for expose loop with small granularity
1124  long long int numTasks=qureg.numAmpsPerChunk>>1;
1125 
1126  int controlBit;
1127 
1128  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1129  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1130 
1131  // ---------------------------------------------------------------- //
1132  // rotate //
1133  // ---------------------------------------------------------------- //
1134 
1136  qreal *stateVecReal = qureg.deviceStateVec.real;
1137  qreal *stateVecImag = qureg.deviceStateVec.imag;
1138 
1139  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1140  if (thisTask>=numTasks) return;
1141 
1142  thisBlock = thisTask / sizeHalfBlock;
1143  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1144  indexLo = indexUp + sizeHalfBlock;
1145 
1146  // store current state vector values in temp variables
1147  stateRealUp = stateVecReal[indexUp];
1148  stateImagUp = stateVecImag[indexUp];
1149 
1150  stateRealLo = stateVecReal[indexLo];
1151  stateImagLo = stateVecImag[indexLo];
1152 
1153  controlBit = extractBit(controlQubit, indexUp);
1154  if (controlBit){
1155  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
1156  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
1157  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
1158  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
1159  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
1160 
1161  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
1162  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
1163  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
1164  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
1165  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
1166  }
1167 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_createQureg()

void statevec_createQureg ( Qureg qureg,
int  numQubits,
QuESTEnv  env 
)

Definition at line 275 of file QuEST_gpu.cu.

276 {
277  // allocate CPU memory
278  long long int numAmps = 1L << numQubits;
279  long long int numAmpsPerRank = numAmps/env.numRanks;
280  qureg->stateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(qureg->stateVec.real));
281  qureg->stateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(qureg->stateVec.imag));
282  if (env.numRanks>1){
283  qureg->pairStateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(qureg->pairStateVec.real));
284  qureg->pairStateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(qureg->pairStateVec.imag));
285  }
286 
287  // check cpu memory allocation was successful
288  if ( (!(qureg->stateVec.real) || !(qureg->stateVec.imag))
289  && numAmpsPerRank ) {
290  printf("Could not allocate memory!\n");
291  exit (EXIT_FAILURE);
292  }
293  if ( env.numRanks>1 && (!(qureg->pairStateVec.real) || !(qureg->pairStateVec.imag))
294  && numAmpsPerRank ) {
295  printf("Could not allocate memory!\n");
296  exit (EXIT_FAILURE);
297  }
298 
299  qureg->numQubitsInStateVec = numQubits;
300  qureg->numAmpsPerChunk = numAmpsPerRank;
301  qureg->numAmpsTotal = numAmps;
302  qureg->chunkId = env.rank;
303  qureg->numChunks = env.numRanks;
304  qureg->isDensityMatrix = 0;
305 
306  // allocate GPU memory
307  cudaMalloc(&(qureg->deviceStateVec.real), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.real)));
308  cudaMalloc(&(qureg->deviceStateVec.imag), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.imag)));
309  cudaMalloc(&(qureg->firstLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)REDUCE_SHARED_SIZE)*sizeof(qreal));
310  cudaMalloc(&(qureg->secondLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)(REDUCE_SHARED_SIZE*REDUCE_SHARED_SIZE))*
311  sizeof(qreal));
312 
313  // check gpu memory allocation was successful
314  if (!(qureg->deviceStateVec.real) || !(qureg->deviceStateVec.imag)){
315  printf("Could not allocate memory on GPU!\n");
316  exit (EXIT_FAILURE);
317  }
318 
319 }

References Qureg::chunkId, Qureg::deviceStateVec, Qureg::firstLevelReduction, Qureg::isDensityMatrix, Qureg::numAmpsPerChunk, Qureg::numAmpsTotal, Qureg::numChunks, Qureg::numQubitsInStateVec, QuESTEnv::numRanks, Qureg::pairStateVec, qreal, QuESTEnv::rank, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and Qureg::stateVec.

Referenced by createCloneQureg(), createDensityQureg(), and createQureg().
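
For context, an illustrative call sequence through the public API that reaches this allocator (a sketch, not taken from the source):

QuESTEnv env = createQuESTEnv();     // checks a GPU exists via GPUExists()
Qureg qureg = createQureg(8, env);   // calls statevec_createQureg(): allocates RAM and VRAM buffers
initZeroState(qureg);                // subsequent operations act on qureg.deviceStateVec
// ... apply gates, measure, etc. ...
destroyQureg(qureg, env);            // calls statevec_destroyQureg(): frees RAM and VRAM
destroyQuESTEnv(env);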

◆ statevec_destroyQureg()

void statevec_destroyQureg ( Qureg  qureg,
QuESTEnv  env 
)

Definition at line 321 of file QuEST_gpu.cu.

322 {
323  // Free CPU memory
324  free(qureg.stateVec.real);
325  free(qureg.stateVec.imag);
326  if (env.numRanks>1){
327  free(qureg.pairStateVec.real);
328  free(qureg.pairStateVec.imag);
329  }
330 
331  // Free GPU memory
332  cudaFree(qureg.deviceStateVec.real);
333  cudaFree(qureg.deviceStateVec.imag);
334  cudaFree(qureg.firstLevelReduction);
335  cudaFree(qureg.secondLevelReduction);
336 }

References Qureg::deviceStateVec, Qureg::firstLevelReduction, QuESTEnv::numRanks, Qureg::pairStateVec, Qureg::secondLevelReduction, and Qureg::stateVec.

Referenced by destroyQureg().

◆ statevec_findProbabilityOfZero()

qreal statevec_findProbabilityOfZero ( Qureg  qureg,
int  measureQubit 
)

Definition at line 1967 of file QuEST_gpu.cu.

1968 {
1969  long long int numValuesToReduce = qureg.numAmpsPerChunk>>1;
1970  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
1971  qreal stateProb=0;
1972  int firstTime=1;
1973  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
1974 
1975  while(numValuesToReduce>1){
1976  if (numValuesToReduce<maxReducedPerLevel){
1977  // Need less than one CUDA block to reduce values
1978  valuesPerCUDABlock = numValuesToReduce;
1979  numCUDABlocks = 1;
1980  } else {
1981  // Use full CUDA blocks, with block size constrained by shared mem usage
1982  valuesPerCUDABlock = maxReducedPerLevel;
1983  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
1984  }
1985  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
1986 
1987  if (firstTime){
1988  statevec_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
1989  qureg, measureQubit, qureg.firstLevelReduction);
1990  firstTime=0;
1991  } else {
1992  cudaDeviceSynchronize();
1993  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
1994  qureg.firstLevelReduction,
1995  qureg.secondLevelReduction, valuesPerCUDABlock);
1996  cudaDeviceSynchronize();
1997  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
1998  }
1999  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2000  }
2001  cudaMemcpy(&stateProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2002  return stateProb;
2003 }

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numAmpsPerChunk, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by statevec_calcProbOfOutcome().

◆ statevec_findProbabilityOfZeroKernel()

__global__ void statevec_findProbabilityOfZeroKernel ( Qureg  qureg,
int  measureQubit,
qreal reducedArray 
)

Definition at line 1853 of file QuEST_gpu.cu.

1855  {
1856  // ----- sizes
1857  long long int sizeBlock, // size of blocks
1858  sizeHalfBlock; // size of blocks halved
1859  // ----- indices
1860  long long int thisBlock, // current block
1861  index; // current index for first half block
1862  // ----- temp variables
1863  long long int thisTask; // task based approach for expose loop with small granularity
1864  long long int numTasks=qureg.numAmpsPerChunk>>1;
1865  // (good for shared memory parallelism)
1866 
1867  extern __shared__ qreal tempReductionArray[];
1868 
1869  // ---------------------------------------------------------------- //
1870  // dimensions //
1871  // ---------------------------------------------------------------- //
1872  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
1873  // and then the number to skip
1874  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
1875 
1876  // ---------------------------------------------------------------- //
1877  // find probability //
1878  // ---------------------------------------------------------------- //
1879 
1880  //
1881  // --- task-based shared-memory parallel implementation
1882  //
1883 
1884  qreal *stateVecReal = qureg.deviceStateVec.real;
1885  qreal *stateVecImag = qureg.deviceStateVec.imag;
1886 
1887  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1888  if (thisTask>=numTasks) return;
1889 
1890  thisBlock = thisTask / sizeHalfBlock;
1891  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1892  qreal realVal, imagVal;
1893  realVal = stateVecReal[index];
1894  imagVal = stateVecImag[index];
1895  tempReductionArray[threadIdx.x] = realVal*realVal + imagVal*imagVal;
1896  __syncthreads();
1897 
1898  if (threadIdx.x<blockDim.x/2){
1899  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
1900  }
1901 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, qreal, and reduceBlock().

◆ statevec_getImagAmp()

qreal statevec_getImagAmp ( Qureg  qureg,
long long int  index 
)

Definition at line 508 of file QuEST_gpu.cu.

508  {
509  qreal el=0;
510  cudaMemcpy(&el, &(qureg.deviceStateVec.imag[index]),
511  sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyDeviceToHost);
512  return el;
513 }

References Qureg::deviceStateVec, and qreal.

Referenced by getAmp(), getDensityAmp(), getImagAmp(), and statevec_getProbAmp().

◆ statevec_getRealAmp()

qreal statevec_getRealAmp ( Qureg  qureg,
long long int  index 
)

Definition at line 501 of file QuEST_gpu.cu.

501  {
502  qreal el=0;
503  cudaMemcpy(&el, &(qureg.deviceStateVec.real[index]),
504  sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyDeviceToHost);
505  return el;
506 }

References Qureg::deviceStateVec, and qreal.

Referenced by getAmp(), getDensityAmp(), getRealAmp(), and statevec_getProbAmp().

◆ statevec_hadamard()

void statevec_hadamard ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1725 of file QuEST_gpu.cu.

1726 {
1727  int threadsPerCUDABlock, CUDABlocks;
1728  threadsPerCUDABlock = 128;
1729  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1730  statevec_hadamardKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1731 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by hadamard().

◆ statevec_hadamardKernel()

__global__ void statevec_hadamardKernel ( Qureg  qureg,
int  targetQubit 
)

fix – not necessary for the GPU version

Definition at line 1676 of file QuEST_gpu.cu.

1676  {
1677  // ----- sizes
1678  long long int sizeBlock, // size of blocks
1679  sizeHalfBlock; // size of blocks halved
1680  // ----- indices
1681  long long int thisBlock, // current block
1682  indexUp,indexLo; // current index and corresponding index in lower half block
1683 
1684  // ----- temp variables
1685  qreal stateRealUp,stateRealLo, // storage for previous state values
1686  stateImagUp,stateImagLo; // (used in updates)
1687  // ----- temp variables
1688  long long int thisTask; // task based approach for expose loop with small granularity
1689  long long int numTasks=qureg.numAmpsPerChunk>>1;
1690 
1691  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1692  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1693 
1694  // ---------------------------------------------------------------- //
1695  // rotate //
1696  // ---------------------------------------------------------------- //
1697 
1699  qreal *stateVecReal = qureg.deviceStateVec.real;
1700  qreal *stateVecImag = qureg.deviceStateVec.imag;
1701 
1702  qreal recRoot2 = 1.0/sqrt(2.0);
1703 
1704  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1705  if (thisTask>=numTasks) return;
1706 
1707  thisBlock = thisTask / sizeHalfBlock;
1708  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1709  indexLo = indexUp + sizeHalfBlock;
1710 
1711  // store current state vector values in temp variables
1712  stateRealUp = stateVecReal[indexUp];
1713  stateImagUp = stateVecImag[indexUp];
1714 
1715  stateRealLo = stateVecReal[indexLo];
1716  stateImagLo = stateVecImag[indexLo];
1717 
1718  stateVecReal[indexUp] = recRoot2*(stateRealUp + stateRealLo);
1719  stateVecImag[indexUp] = recRoot2*(stateImagUp + stateImagLo);
1720 
1721  stateVecReal[indexLo] = recRoot2*(stateRealUp - stateRealLo);
1722  stateVecImag[indexLo] = recRoot2*(stateImagUp - stateImagLo);
1723 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_initBlankState()

void statevec_initBlankState ( Qureg  qureg)

Definition at line 525 of file QuEST_gpu.cu.

526 {
527  int threadsPerCUDABlock, CUDABlocks;
528  threadsPerCUDABlock = 128;
529  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
530  statevec_initBlankStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
531  qureg.numAmpsPerChunk,
532  qureg.deviceStateVec.real,
533  qureg.deviceStateVec.imag);
534 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initBlankState(), and statevec_applyPauliSum().

◆ statevec_initBlankStateKernel()

__global__ void statevec_initBlankStateKernel ( long long int  stateVecSize,
qreal stateVecReal,
qreal stateVecImag 
)

Definition at line 515 of file QuEST_gpu.cu.

515  {
516  long long int index;
517 
518  // initialise the statevector to be all-zeros
519  index = blockIdx.x*blockDim.x + threadIdx.x;
520  if (index>=stateVecSize) return;
521  stateVecReal[index] = 0.0;
522  stateVecImag[index] = 0.0;
523 }

◆ statevec_initClassicalState()

void statevec_initClassicalState ( Qureg  qureg,
long long int  stateInd 
)

Definition at line 600 of file QuEST_gpu.cu.

601 {
602  int threadsPerCUDABlock, CUDABlocks;
603  threadsPerCUDABlock = 128;
604  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
605  statevec_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
606  qureg.numAmpsPerChunk,
607  qureg.deviceStateVec.real,
608  qureg.deviceStateVec.imag, stateInd);
609 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initClassicalState().

◆ statevec_initClassicalStateKernel()

__global__ void statevec_initClassicalStateKernel ( long long int  stateVecSize,
qreal stateVecReal,
qreal stateVecImag,
long long int  stateInd 
)

Definition at line 585 of file QuEST_gpu.cu.

585  {
586  long long int index;
587 
588  // initialise the state to |stateInd>
589  index = blockIdx.x*blockDim.x + threadIdx.x;
590  if (index>=stateVecSize) return;
591  stateVecReal[index] = 0.0;
592  stateVecImag[index] = 0.0;
593 
594  if (index==stateInd){
595  // classical state has probability 1
596  stateVecReal[stateInd] = 1.0;
597  stateVecImag[stateInd] = 0.0;
598  }
599 }

◆ statevec_initDebugState()

void statevec_initDebugState ( Qureg  qureg)

Initialise the state vector of probability amplitudes to an (unphysical) state in which each component of each probability amplitude is a unique floating-point value.

For debugging purposes.

Parameters
[in,out]  qureg  object representing the set of qubits to be initialised

Definition at line 621 of file QuEST_gpu.cu.

622 {
623  int threadsPerCUDABlock, CUDABlocks;
624  threadsPerCUDABlock = 128;
625  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
626  statevec_initDebugStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
627  qureg.numAmpsPerChunk,
628  qureg.deviceStateVec.real,
629  qureg.deviceStateVec.imag);
630 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initDebugState().
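
For orientation only, a minimal host-side sketch (it assumes the usual QuEST front-end calls createQuESTEnv(), createQureg(), initDebugState(), getAmp() and destroyQureg(), which are not part of this file) showing the values the debug state takes; amplitude k becomes (2k)/10 + i(2k+1)/10:

    #include "QuEST.h"
    #include <stdio.h>

    int main() {
        QuESTEnv env = createQuESTEnv();
        Qureg qureg = createQureg(3, env);      // 8 amplitudes

        initDebugState(qureg);                  // amplitude k becomes (2k)/10 + i(2k+1)/10

        Complex amp = getAmp(qureg, 3);         // expected: 0.6 + 0.7i
        printf("amp[3] = %g + %gi\n", (double) amp.real, (double) amp.imag);

        destroyQureg(qureg, env);
        destroyQuESTEnv(env);
        return 0;
    }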

◆ statevec_initDebugStateKernel()

__global__ void statevec_initDebugStateKernel ( long long int  stateVecSize,
qreal stateVecReal,
qreal stateVecImag 
)

Definition at line 611 of file QuEST_gpu.cu.

611  {
612  long long int index;
613 
614  index = blockIdx.x*blockDim.x + threadIdx.x;
615  if (index>=stateVecSize) return;
616 
617  stateVecReal[index] = (index*2.0)/10.0;
618  stateVecImag[index] = (index*2.0+1.0)/10.0;
619 }

◆ statevec_initPlusState()

void statevec_initPlusState ( Qureg  qureg)

Definition at line 574 of file QuEST_gpu.cu.

575 {
576  int threadsPerCUDABlock, CUDABlocks;
577  threadsPerCUDABlock = 128;
578  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
579  statevec_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
580  qureg.numAmpsPerChunk,
581  qureg.deviceStateVec.real,
582  qureg.deviceStateVec.imag);
583 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initPlusState().

◆ statevec_initPlusStateKernel()

__global__ void statevec_initPlusStateKernel ( long long int  stateVecSize,
qreal stateVecReal,
qreal stateVecImag 
)

Definition at line 563 of file QuEST_gpu.cu.

563  {
564  long long int index;
565 
566  index = blockIdx.x*blockDim.x + threadIdx.x;
567  if (index>=stateVecSize) return;
568 
569  qreal normFactor = 1.0/sqrt((qreal)stateVecSize);
570  stateVecReal[index] = normFactor;
571  stateVecImag[index] = 0.0;
572 }

References qreal.

◆ statevec_initStateFromSingleFile()

int statevec_initStateFromSingleFile ( Qureg qureg,
char  filename[200],
QuESTEnv  env 
)

Definition at line 659 of file QuEST_gpu.cu.

659  {
660  long long int chunkSize, stateVecSize;
661  long long int indexInChunk, totalIndex;
662 
663  chunkSize = qureg->numAmpsPerChunk;
664  stateVecSize = chunkSize*qureg->numChunks;
665 
666  qreal *stateVecReal = qureg->stateVec.real;
667  qreal *stateVecImag = qureg->stateVec.imag;
668 
669  FILE *fp;
670  char line[200];
671 
672  fp = fopen(filename, "r");
673  if (fp == NULL)
674  return 0;
675 
676  indexInChunk = 0; totalIndex = 0;
677  while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndex<stateVecSize){
678  if (line[0]!='#'){
679  int chunkId = totalIndex/chunkSize;
680  if (chunkId==qureg->chunkId){
681  # if QuEST_PREC==1
682  sscanf(line, "%f, %f", &(stateVecReal[indexInChunk]),
683  &(stateVecImag[indexInChunk]));
684  # elif QuEST_PREC==2
685  sscanf(line, "%lf, %lf", &(stateVecReal[indexInChunk]),
686  &(stateVecImag[indexInChunk]));
687  # elif QuEST_PREC==4
688  sscanf(line, "%Lf, %Lf", &(stateVecReal[indexInChunk]),
689  &(stateVecImag[indexInChunk]));
690  # endif
691  indexInChunk += 1;
692  }
693  totalIndex += 1;
694  }
695  }
696  fclose(fp);
697  copyStateToGPU(*qureg);
698 
699  // indicate success
700  return 1;
701 }

References Qureg::chunkId, copyStateToGPU(), Qureg::numAmpsPerChunk, Qureg::numChunks, qreal, and Qureg::stateVec.

Referenced by initStateFromSingleFile().

◆ statevec_initStateOfSingleQubit()

void statevec_initStateOfSingleQubit ( Qureg qureg,
int  qubitId,
int  outcome 
)

Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one.

Parameters
[in,out]  qureg    object representing the set of qubits to be initialised
[in]      qubitId  id of the qubit to set to state 'outcome'
[in]      outcome  value to which qubit 'qubitId' is set

Definition at line 650 of file QuEST_gpu.cu.

651 {
652  int threadsPerCUDABlock, CUDABlocks;
653  threadsPerCUDABlock = 128;
654  CUDABlocks = ceil((qreal)(qureg->numAmpsPerChunk)/threadsPerCUDABlock);
655  statevec_initStateOfSingleQubitKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg->numAmpsPerChunk, qureg->deviceStateVec.real, qureg->deviceStateVec.imag, qubitId, outcome);
656 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initStateOfSingleQubit().
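
Restated as an equation (a summary of the kernel below, not additional specification): for an N-qubit register, qubit q and outcome o, the resulting state is

    |\psi\rangle \,=\, \frac{1}{\sqrt{2^{\,N-1}}} \sum_{x \,:\, x_q = o} |x\rangle

i.e. every basis state whose q-th bit equals o receives amplitude 1/\sqrt{2^{N-1}} and all other amplitudes are zero, matching normFactor = 1.0/sqrt((qreal)stateVecSize/2) in statevec_initStateOfSingleQubitKernel().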

◆ statevec_initStateOfSingleQubitKernel()

__global__ void statevec_initStateOfSingleQubitKernel ( long long int  stateVecSize,
qreal stateVecReal,
qreal stateVecImag,
int  qubitId,
int  outcome 
)

Definition at line 632 of file QuEST_gpu.cu.

632  {
633  long long int index;
634  int bit;
635 
636  index = blockIdx.x*blockDim.x + threadIdx.x;
637  if (index>=stateVecSize) return;
638 
639  qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2);
640  bit = extractBit(qubitId, index);
641  if (bit==outcome) {
642  stateVecReal[index] = normFactor;
643  stateVecImag[index] = 0.0;
644  } else {
645  stateVecReal[index] = 0.0;
646  stateVecImag[index] = 0.0;
647  }
648 }

References extractBit(), and qreal.

◆ statevec_initZeroState()

void statevec_initZeroState ( Qureg  qureg)

Definition at line 552 of file QuEST_gpu.cu.

553 {
554  int threadsPerCUDABlock, CUDABlocks;
555  threadsPerCUDABlock = 128;
556  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
557  statevec_initZeroStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
558  qureg.numAmpsPerChunk,
559  qureg.deviceStateVec.real,
560  qureg.deviceStateVec.imag);
561 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initZeroState().

◆ statevec_initZeroStateKernel()

__global__ void statevec_initZeroStateKernel ( long long int  stateVecSize,
qreal stateVecReal,
qreal stateVecImag 
)

Definition at line 536 of file QuEST_gpu.cu.

536  {
537  long long int index;
538 
539  // initialise the state to |0000..0000>
540  index = blockIdx.x*blockDim.x + threadIdx.x;
541  if (index>=stateVecSize) return;
542  stateVecReal[index] = 0.0;
543  stateVecImag[index] = 0.0;
544 
545  if (index==0){
546  // zero state |0000..0000> has probability 1
547  stateVecReal[0] = 1.0;
548  stateVecImag[0] = 0.0;
549  }
550 }

◆ statevec_multiControlledMultiQubitUnitary()

void statevec_multiControlledMultiQubitUnitary ( Qureg  qureg,
long long int  ctrlMask,
int *  targs,
int  numTargs,
ComplexMatrixN  u 
)

This calls swapQubitAmps only when it would involve distributed communication; if the qubit chunks already fit on the node, it applies the unitary directly.

It is already guaranteed here that all target qubits can fit on each node (this is validated in the front-end).

@TODO: refactor so that the 'swap back' isn't performed; instead, the qubit locations are updated.

Definition at line 971 of file QuEST_gpu.cu.

972 {
973  int threadsPerCUDABlock = 128;
974  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>numTargs)/threadsPerCUDABlock);
975 
976  // allocate device space for global {targs} (length: numTargs) and populate
977  int *d_targs;
978  size_t targMemSize = numTargs * sizeof *d_targs;
979  cudaMalloc(&d_targs, targMemSize);
980  cudaMemcpy(d_targs, targs, targMemSize, cudaMemcpyHostToDevice);
981 
982  // flatten out the u.real and u.imag lists
983  int uNumRows = (1 << u.numQubits);
984  qreal* uReFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uReFlat);
985  qreal* uImFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uImFlat);
986  long long int i = 0;
987  for (int r=0; r < uNumRows; r++)
988  for (int c=0; c < uNumRows; c++) {
989  uReFlat[i] = u.real[r][c];
990  uImFlat[i] = u.imag[r][c];
991  i++;
992  }
993 
994  // allocate device space for global u.real and u.imag (flatten by concatenating rows) and populate
995  qreal* d_uRe;
996  qreal* d_uIm;
997  size_t uMemSize = uNumRows*uNumRows * sizeof *d_uRe; // size of each of d_uRe and d_uIm
998  cudaMalloc(&d_uRe, uMemSize);
999  cudaMalloc(&d_uIm, uMemSize);
1000  cudaMemcpy(d_uRe, uReFlat, uMemSize, cudaMemcpyHostToDevice);
1001  cudaMemcpy(d_uIm, uImFlat, uMemSize, cudaMemcpyHostToDevice);
1002 
1003  // allocate device Wspace for thread-local {ampInds}, {reAmps}, {imAmps} (length: 1<<numTargs)
1004  long long int *d_ampInds;
1005  qreal *d_reAmps;
1006  qreal *d_imAmps;
1007  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
1008  int numTargAmps = uNumRows;
1009  cudaMalloc(&d_ampInds, numTargAmps*gridSize * sizeof *d_ampInds);
1010  cudaMalloc(&d_reAmps, numTargAmps*gridSize * sizeof *d_reAmps);
1011  cudaMalloc(&d_imAmps, numTargAmps*gridSize * sizeof *d_imAmps);
1012 
1013  // call kernel
1014  statevec_multiControlledMultiQubitUnitaryKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
1015  qureg, ctrlMask, d_targs, numTargs, d_uRe, d_uIm, d_ampInds, d_reAmps, d_imAmps, numTargAmps);
1016 
1017  // free kernel memory
1018  free(uReFlat);
1019  free(uImFlat);
1020  cudaFree(d_targs);
1021  cudaFree(d_uRe);
1022  cudaFree(d_uIm);
1023  cudaFree(d_ampInds);
1024  cudaFree(d_reAmps);
1025  cudaFree(d_imAmps);
1026 }

References ComplexMatrixN::imag, Qureg::numAmpsPerChunk, ComplexMatrixN::numQubits, qreal, and ComplexMatrixN::real.

Referenced by applyMultiControlledMatrixN(), densmatr_applyMultiQubitKrausSuperoperator(), densmatr_applyTwoQubitKrausSuperoperator(), multiControlledMultiQubitUnitary(), statevec_controlledMultiQubitUnitary(), and statevec_multiQubitUnitary().
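
As a usage sketch only (it assumes the public helpers createComplexMatrixN() and destroyComplexMatrixN(), and that createComplexMatrixN() zero-initialises its elements; verify against QuEST.h before relying on this), a single-control, two-target SWAP could be applied through the front-end multiControlledMultiQubitUnitary() listed above:

    #include "QuEST.h"

    void applyControlledSwapOfTargets(Qureg qureg) {
        int ctrls[] = {0};        // one control qubit
        int targs[] = {1, 2};     // two target qubits

        // 4x4 SWAP on the targets: |01> <-> |10>
        ComplexMatrixN u = createComplexMatrixN(2);
        u.real[0][0] = 1;  u.real[3][3] = 1;
        u.real[1][2] = 1;  u.real[2][1] = 1;
        // remaining real/imag entries are assumed zero-initialised

        multiControlledMultiQubitUnitary(qureg, ctrls, 1, targs, 2, u);
        destroyComplexMatrixN(u);
    }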

◆ statevec_multiControlledMultiQubitUnitaryKernel()

__global__ void statevec_multiControlledMultiQubitUnitaryKernel ( Qureg  qureg,
long long int  ctrlMask,
int *  targs,
int  numTargs,
qreal uRe,
qreal uIm,
long long int *  ampInds,
qreal reAmps,
qreal imAmps,
long long int  numTargAmps 
)

Definition at line 912 of file QuEST_gpu.cu.

915 {
916 
917  // decide the amplitudes this thread will modify
918  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
919  long long int numTasks = qureg.numAmpsPerChunk >> numTargs; // kernel called on every 1 in 2^numTargs amplitudes
920  if (thisTask>=numTasks) return;
921 
922  // find this task's start index (where all targs are 0)
923  long long int ind00 = insertZeroBits(thisTask, targs, numTargs);
924 
925  // this task only modifies amplitudes if control qubits are 1 for this state
926  if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
927  return;
928 
929  qreal *reVec = qureg.deviceStateVec.real;
930  qreal *imVec = qureg.deviceStateVec.imag;
931 
932  /*
933  each thread needs:
934  long long int ampInds[numAmps];
935  qreal reAmps[numAmps];
936  qreal imAmps[numAmps];
937  but instead has access to shared arrays, with below stride and offset
938  */
939  size_t stride = gridDim.x*blockDim.x;
940  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
941 
942  // determine the indices and record values of target amps
943  long long int ind;
944  for (int i=0; i < numTargAmps; i++) {
945 
946  // get global index of current target qubit assignment
947  ind = ind00;
948  for (int t=0; t < numTargs; t++)
949  if (extractBit(t, i))
950  ind = flipBit(ind, targs[t]);
951 
952  ampInds[i*stride+offset] = ind;
953  reAmps [i*stride+offset] = reVec[ind];
954  imAmps [i*stride+offset] = imVec[ind];
955  }
956 
957  // update the amplitudes
958  for (int r=0; r < numTargAmps; r++) {
959  ind = ampInds[r*stride+offset];
960  reVec[ind] = 0;
961  imVec[ind] = 0;
962  for (int c=0; c < numTargAmps; c++) {
963  qreal uReElem = uRe[c + r*numTargAmps];
964  qreal uImElem = uIm[c + r*numTargAmps];
965  reVec[ind] += reAmps[c*stride+offset]*uReElem - imAmps[c*stride+offset]*uImElem;
966  imVec[ind] += reAmps[c*stride+offset]*uImElem + imAmps[c*stride+offset]*uReElem;
967  }
968  }
969 }

References Qureg::deviceStateVec, extractBit(), flipBit(), insertZeroBits(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledPhaseFlip()

void statevec_multiControlledPhaseFlip ( Qureg  qureg,
int *  controlQubits,
int  numControlQubits 
)

Definition at line 1633 of file QuEST_gpu.cu.

1634 {
1635  int threadsPerCUDABlock, CUDABlocks;
1636  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1637  threadsPerCUDABlock = 128;
1638  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1639  statevec_multiControlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask);
1640 }

References getQubitBitMask(), Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledPhaseFlip().

◆ statevec_multiControlledPhaseFlipKernel()

__global__ void statevec_multiControlledPhaseFlipKernel ( Qureg  qureg,
long long int  mask 
)

Definition at line 1615 of file QuEST_gpu.cu.

1616 {
1617  long long int index;
1618  long long int stateVecSize;
1619 
1620  stateVecSize = qureg.numAmpsPerChunk;
1621  qreal *stateVecReal = qureg.deviceStateVec.real;
1622  qreal *stateVecImag = qureg.deviceStateVec.imag;
1623 
1624  index = blockIdx.x*blockDim.x + threadIdx.x;
1625  if (index>=stateVecSize) return;
1626 
1627  if (mask == (mask & index) ){
1628  stateVecReal [index] = - stateVecReal [index];
1629  stateVecImag [index] = - stateVecImag [index];
1630  }
1631 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledPhaseShift()

void statevec_multiControlledPhaseShift ( Qureg  qureg,
int *  controlQubits,
int  numControlQubits,
qreal  angle 
)

Definition at line 1490 of file QuEST_gpu.cu.

1491 {
1492  qreal cosAngle = cos(angle);
1493  qreal sinAngle = sin(angle);
1494 
1495  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1496 
1497  int threadsPerCUDABlock, CUDABlocks;
1498  threadsPerCUDABlock = 128;
1499  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1500  statevec_multiControlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
1501 }

References getQubitBitMask(), Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledPhaseShift().

◆ statevec_multiControlledPhaseShiftKernel()

__global__ void statevec_multiControlledPhaseShiftKernel ( Qureg  qureg,
long long int  mask,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1470 of file QuEST_gpu.cu.

1470  {
1471  qreal stateRealLo, stateImagLo;
1472  long long int index;
1473  long long int stateVecSize;
1474 
1475  stateVecSize = qureg.numAmpsPerChunk;
1476  qreal *stateVecReal = qureg.deviceStateVec.real;
1477  qreal *stateVecImag = qureg.deviceStateVec.imag;
1478 
1479  index = blockIdx.x*blockDim.x + threadIdx.x;
1480  if (index>=stateVecSize) return;
1481 
1482  if (mask == (mask & index) ){
1483  stateRealLo = stateVecReal[index];
1484  stateImagLo = stateVecImag[index];
1485  stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1486  stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1487  }
1488 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledTwoQubitUnitary()

void statevec_multiControlledTwoQubitUnitary ( Qureg  qureg,
long long int  ctrlMask,
int  q1,
int  q2,
ComplexMatrix4  u 
)

This calls swapQubitAmps only when it would involve distributed communication; if the qubit chunks already fit on the node, it applies the unitary directly.

Note that the order of q1 and q2 in the call to twoQubitUnitaryLocal is important.

@TODO: refactor so that the 'swap back' isn't performed; instead, the qubit locations are updated. @TODO: the double swap (q1,q2 to 0,1) may be achievable simultaneously by a bespoke swap routine.

Definition at line 1104 of file QuEST_gpu.cu.

1105 {
1106  int threadsPerCUDABlock = 128;
1107  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock); // one kernel eval for every 4 amplitudes
1108  statevec_multiControlledTwoQubitUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, ctrlMask, q1, q2, argifyMatrix4(u));
1109 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by densmatr_applyKrausSuperoperator(), multiControlledTwoQubitUnitary(), statevec_controlledTwoQubitUnitary(), and statevec_twoQubitUnitary().
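
A comparable sketch for the fixed-size two-qubit front-end (assuming ComplexMatrix4 exposes real[4][4] and imag[4][4] fields, as in the public QuEST v3 API; the function and parameter names here are illustrative):

    #include "QuEST.h"

    void applyControlledIswap(Qureg qureg, int ctrl, int q1, int q2) {
        // iSWAP on (q1, q2): |01> -> i|10>, |10> -> i|01>, |00> and |11> unchanged
        ComplexMatrix4 u = {{{0}}};            // zero-initialise all entries
        u.real[0][0] = 1;  u.real[3][3] = 1;
        u.imag[1][2] = 1;  u.imag[2][1] = 1;

        int ctrls[] = {ctrl};
        multiControlledTwoQubitUnitary(qureg, ctrls, 1, q1, q2, u);
    }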

◆ statevec_multiControlledTwoQubitUnitaryKernel()

__global__ void statevec_multiControlledTwoQubitUnitaryKernel ( Qureg  qureg,
long long int  ctrlMask,
int  q1,
int  q2,
ArgMatrix4  u 
)

Definition at line 1028 of file QuEST_gpu.cu.

1028  {
1029 
1030  // decide the 4 amplitudes this thread will modify
1031  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1032  long long int numTasks = qureg.numAmpsPerChunk >> 2; // kernel called on every 1 in 4 amplitudes
1033  if (thisTask>=numTasks) return;
1034 
1035  qreal *reVec = qureg.deviceStateVec.real;
1036  qreal *imVec = qureg.deviceStateVec.imag;
1037 
1038  // find indices of amplitudes to modify (treat q1 as the least significant bit)
1039  long long int ind00, ind01, ind10, ind11;
1040  ind00 = insertTwoZeroBits(thisTask, q1, q2);
1041 
1042  // modify only if control qubits are 1 for this state
1043  if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
1044  return;
1045 
1046  ind01 = flipBit(ind00, q1);
1047  ind10 = flipBit(ind00, q2);
1048  ind11 = flipBit(ind01, q2);
1049 
1050  // extract statevec amplitudes
1051  qreal re00, re01, re10, re11;
1052  qreal im00, im01, im10, im11;
1053  re00 = reVec[ind00]; im00 = imVec[ind00];
1054  re01 = reVec[ind01]; im01 = imVec[ind01];
1055  re10 = reVec[ind10]; im10 = imVec[ind10];
1056  re11 = reVec[ind11]; im11 = imVec[ind11];
1057 
1058  // apply u * {amp00, amp01, amp10, amp11}
1059  reVec[ind00] =
1060  u.r0c0.real*re00 - u.r0c0.imag*im00 +
1061  u.r0c1.real*re01 - u.r0c1.imag*im01 +
1062  u.r0c2.real*re10 - u.r0c2.imag*im10 +
1063  u.r0c3.real*re11 - u.r0c3.imag*im11;
1064  imVec[ind00] =
1065  u.r0c0.imag*re00 + u.r0c0.real*im00 +
1066  u.r0c1.imag*re01 + u.r0c1.real*im01 +
1067  u.r0c2.imag*re10 + u.r0c2.real*im10 +
1068  u.r0c3.imag*re11 + u.r0c3.real*im11;
1069 
1070  reVec[ind01] =
1071  u.r1c0.real*re00 - u.r1c0.imag*im00 +
1072  u.r1c1.real*re01 - u.r1c1.imag*im01 +
1073  u.r1c2.real*re10 - u.r1c2.imag*im10 +
1074  u.r1c3.real*re11 - u.r1c3.imag*im11;
1075  imVec[ind01] =
1076  u.r1c0.imag*re00 + u.r1c0.real*im00 +
1077  u.r1c1.imag*re01 + u.r1c1.real*im01 +
1078  u.r1c2.imag*re10 + u.r1c2.real*im10 +
1079  u.r1c3.imag*re11 + u.r1c3.real*im11;
1080 
1081  reVec[ind10] =
1082  u.r2c0.real*re00 - u.r2c0.imag*im00 +
1083  u.r2c1.real*re01 - u.r2c1.imag*im01 +
1084  u.r2c2.real*re10 - u.r2c2.imag*im10 +
1085  u.r2c3.real*re11 - u.r2c3.imag*im11;
1086  imVec[ind10] =
1087  u.r2c0.imag*re00 + u.r2c0.real*im00 +
1088  u.r2c1.imag*re01 + u.r2c1.real*im01 +
1089  u.r2c2.imag*re10 + u.r2c2.real*im10 +
1090  u.r2c3.imag*re11 + u.r2c3.real*im11;
1091 
1092  reVec[ind11] =
1093  u.r3c0.real*re00 - u.r3c0.imag*im00 +
1094  u.r3c1.real*re01 - u.r3c1.imag*im01 +
1095  u.r3c2.real*re10 - u.r3c2.imag*im10 +
1096  u.r3c3.real*re11 - u.r3c3.imag*im11;
1097  imVec[ind11] =
1098  u.r3c0.imag*re00 + u.r3c0.real*im00 +
1099  u.r3c1.imag*re01 + u.r3c1.real*im01 +
1100  u.r3c2.imag*re10 + u.r3c2.real*im10 +
1101  u.r3c3.imag*re11 + u.r3c3.real*im11;
1102 }

References Qureg::deviceStateVec, flipBit(), insertTwoZeroBits(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledUnitary()

void statevec_multiControlledUnitary ( Qureg  qureg,
long long int  ctrlQubitsMask,
long long int  ctrlFlipMask,
int  targetQubit,
ComplexMatrix2  u 
)

Definition at line 1237 of file QuEST_gpu.cu.

1241  {
1242  int threadsPerCUDABlock = 128;
1243  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1244  statevec_multiControlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
1245  qureg, ctrlQubitsMask, ctrlFlipMask, targetQubit, argifyMatrix2(u));
1246 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledUnitary(), and multiStateControlledUnitary().

◆ statevec_multiControlledUnitaryKernel()

__global__ void statevec_multiControlledUnitaryKernel ( Qureg  qureg,
long long int  ctrlQubitsMask,
long long int  ctrlFlipMask,
int  targetQubit,
ArgMatrix2  u 
)

fix – not necessary for the GPU version

Definition at line 1177 of file QuEST_gpu.cu.

1181  {
1182  // ----- sizes
1183  long long int sizeBlock, // size of blocks
1184  sizeHalfBlock; // size of blocks halved
1185  // ----- indices
1186  long long int thisBlock, // current block
1187  indexUp,indexLo; // current index and corresponding index in lower half block
1188 
1189  // ----- temp variables
1190  qreal stateRealUp,stateRealLo, // storage for previous state values
1191  stateImagUp,stateImagLo; // (used in updates)
1192  // ----- temp variables
1193  long long int thisTask; // task based approach for expose loop with small granularity
1194  long long int numTasks=qureg.numAmpsPerChunk>>1;
1195 
1196 
1197  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1198  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1199 
1200  // ---------------------------------------------------------------- //
1201  // rotate //
1202  // ---------------------------------------------------------------- //
1203 
1205  qreal *stateVecReal = qureg.deviceStateVec.real;
1206  qreal *stateVecImag = qureg.deviceStateVec.imag;
1207 
1208  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1209  if (thisTask>=numTasks) return;
1210 
1211  thisBlock = thisTask / sizeHalfBlock;
1212  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1213  indexLo = indexUp + sizeHalfBlock;
1214 
1215  if (ctrlQubitsMask == (ctrlQubitsMask & (indexUp ^ ctrlFlipMask))) {
1216  // store current state vector values in temp variables
1217  stateRealUp = stateVecReal[indexUp];
1218  stateImagUp = stateVecImag[indexUp];
1219 
1220  stateRealLo = stateVecReal[indexLo];
1221  stateImagLo = stateVecImag[indexLo];
1222 
1223  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
1224  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
1225  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
1226  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
1227  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
1228 
1229  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
1230  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
1231  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
1232  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
1233  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
1234  }
1235 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiRotateZ()

void statevec_multiRotateZ ( Qureg  qureg,
long long int  mask,
qreal  angle 
)

Definition at line 1520 of file QuEST_gpu.cu.

1521 {
1522  qreal cosAngle = cos(angle/2.0);
1523  qreal sinAngle = sin(angle/2.0);
1524 
1525  int threadsPerCUDABlock, CUDABlocks;
1526  threadsPerCUDABlock = 128;
1527  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1528  statevec_multiRotateZKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
1529 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by multiRotateZ(), and statevec_multiRotatePauli().

◆ statevec_multiRotateZKernel()

__global__ void statevec_multiRotateZKernel ( Qureg  qureg,
long long int  mask,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1503 of file QuEST_gpu.cu.

1503  {
1504 
1505  long long int stateVecSize = qureg.numAmpsPerChunk;
1506  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
1507  if (index>=stateVecSize) return;
1508 
1509  qreal *stateVecReal = qureg.deviceStateVec.real;
1510  qreal *stateVecImag = qureg.deviceStateVec.imag;
1511 
1512  int fac = getBitMaskParity(mask & index)? -1 : 1;
1513  qreal stateReal = stateVecReal[index];
1514  qreal stateImag = stateVecImag[index];
1515 
1516  stateVecReal[index] = cosAngle*stateReal + fac * sinAngle*stateImag;
1517  stateVecImag[index] = - fac * sinAngle*stateReal + cosAngle*stateImag;
1518 }

References Qureg::deviceStateVec, getBitMaskParity(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_pauliX()

void statevec_pauliX ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1292 of file QuEST_gpu.cu.

1293 {
1294  int threadsPerCUDABlock, CUDABlocks;
1295  threadsPerCUDABlock = 128;
1296  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1297  statevec_pauliXKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1298 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by pauliX(), and statevec_applyPauliProd().

◆ statevec_pauliXKernel()

__global__ void statevec_pauliXKernel ( Qureg  qureg,
int  targetQubit 
)

fix – not necessary for the GPU version

Definition at line 1248 of file QuEST_gpu.cu.

1248  {
1249  // ----- sizes
1250  long long int sizeBlock, // size of blocks
1251  sizeHalfBlock; // size of blocks halved
1252  // ----- indices
1253  long long int thisBlock, // current block
1254  indexUp,indexLo; // current index and corresponding index in lower half block
1255 
1256  // ----- temp variables
1257  qreal stateRealUp, // storage for previous state values
1258  stateImagUp; // (used in updates)
1259  // ----- temp variables
1260  long long int thisTask; // task based approach for expose loop with small granularity
1261  long long int numTasks=qureg.numAmpsPerChunk>>1;
1262 
1263  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1264  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1265 
1266  // ---------------------------------------------------------------- //
1267  // rotate //
1268  // ---------------------------------------------------------------- //
1269 
1271  qreal *stateVecReal = qureg.deviceStateVec.real;
1272  qreal *stateVecImag = qureg.deviceStateVec.imag;
1273 
1274  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1275  if (thisTask>=numTasks) return;
1276 
1277  thisBlock = thisTask / sizeHalfBlock;
1278  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1279  indexLo = indexUp + sizeHalfBlock;
1280 
1281  // store current state vector values in temp variables
1282  stateRealUp = stateVecReal[indexUp];
1283  stateImagUp = stateVecImag[indexUp];
1284 
1285  stateVecReal[indexUp] = stateVecReal[indexLo];
1286  stateVecImag[indexUp] = stateVecImag[indexLo];
1287 
1288  stateVecReal[indexLo] = stateRealUp;
1289  stateVecImag[indexLo] = stateImagUp;
1290 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_pauliY()

void statevec_pauliY ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1325 of file QuEST_gpu.cu.

1326 {
1327  int threadsPerCUDABlock, CUDABlocks;
1328  threadsPerCUDABlock = 128;
1329  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1330  statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, 1);
1331 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by pauliY(), and statevec_applyPauliProd().

◆ statevec_pauliYConj()

void statevec_pauliYConj ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1333 of file QuEST_gpu.cu.

1334 {
1335  int threadsPerCUDABlock, CUDABlocks;
1336  threadsPerCUDABlock = 128;
1337  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1338  statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, -1);
1339 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by pauliY().

◆ statevec_pauliYKernel()

__global__ void statevec_pauliYKernel ( Qureg  qureg,
int  targetQubit,
int  conjFac 
)

Definition at line 1300 of file QuEST_gpu.cu.

1300  {
1301 
1302  long long int sizeHalfBlock = 1LL << targetQubit;
1303  long long int sizeBlock = 2LL * sizeHalfBlock;
1304  long long int numTasks = qureg.numAmpsPerChunk >> 1;
1305  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1306  if (thisTask>=numTasks) return;
1307 
1308  long long int thisBlock = thisTask / sizeHalfBlock;
1309  long long int indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1310  long long int indexLo = indexUp + sizeHalfBlock;
1311  qreal stateRealUp, stateImagUp;
1312 
1313  qreal *stateVecReal = qureg.deviceStateVec.real;
1314  qreal *stateVecImag = qureg.deviceStateVec.imag;
1315  stateRealUp = stateVecReal[indexUp];
1316  stateImagUp = stateVecImag[indexUp];
1317 
1318  // update under +-{{0, -i}, {i, 0}}
1319  stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
1320  stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
1321  stateVecReal[indexLo] = conjFac * -stateImagUp;
1322  stateVecImag[indexLo] = conjFac * stateRealUp;
1323 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_phaseShiftByTerm()

void statevec_phaseShiftByTerm ( Qureg  qureg,
int  targetQubit,
Complex  term 
)

Definition at line 1423 of file QuEST_gpu.cu.

1424 {
1425  qreal cosAngle = term.real;
1426  qreal sinAngle = term.imag;
1427 
1428  int threadsPerCUDABlock, CUDABlocks;
1429  threadsPerCUDABlock = 128;
1430  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1431  statevec_phaseShiftByTermKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, cosAngle, sinAngle);
1432 }

References Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

Referenced by statevec_pauliZ(), statevec_phaseShift(), statevec_sGate(), statevec_sGateConj(), statevec_tGate(), and statevec_tGateConj().

◆ statevec_phaseShiftByTermKernel()

__global__ void statevec_phaseShiftByTermKernel ( Qureg  qureg,
int  targetQubit,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1395 of file QuEST_gpu.cu.

1395  {
1396 
1397  long long int sizeBlock, sizeHalfBlock;
1398  long long int thisBlock, indexUp,indexLo;
1399 
1400  qreal stateRealLo, stateImagLo;
1401  long long int thisTask;
1402  long long int numTasks = qureg.numAmpsPerChunk >> 1;
1403 
1404  sizeHalfBlock = 1LL << targetQubit;
1405  sizeBlock = 2LL * sizeHalfBlock;
1406 
1407  qreal *stateVecReal = qureg.deviceStateVec.real;
1408  qreal *stateVecImag = qureg.deviceStateVec.imag;
1409 
1410  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1411  if (thisTask>=numTasks) return;
1412  thisBlock = thisTask / sizeHalfBlock;
1413  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1414  indexLo = indexUp + sizeHalfBlock;
1415 
1416  stateRealLo = stateVecReal[indexLo];
1417  stateImagLo = stateVecImag[indexLo];
1418 
1419  stateVecReal[indexLo] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1420  stateVecImag[indexLo] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1421 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_reportStateToScreen()

void statevec_reportStateToScreen ( Qureg  qureg,
QuESTEnv  env,
int  reportRank 
)

Print the current state vector of probability amplitudes for a set of qubits to standard out.

For debugging purposes. Each rank prints its output serially. Output is only printed for systems of <= 5 qubits.

Definition at line 475 of file QuEST_gpu.cu.

475  {
476  long long int index;
477  int rank;
478  copyStateFromGPU(qureg);
479  if (qureg.numQubitsInStateVec<=5){
480  for (rank=0; rank<qureg.numChunks; rank++){
481  if (qureg.chunkId==rank){
482  if (reportRank) {
483  printf("Reporting state from rank %d [\n", qureg.chunkId);
484  //printf("\trank, index, real, imag\n");
485  printf("real, imag\n");
486  } else if (rank==0) {
487  printf("Reporting state [\n");
488  printf("real, imag\n");
489  }
490 
491  for(index=0; index<qureg.numAmpsPerChunk; index++){
492  printf(REAL_STRING_FORMAT ", " REAL_STRING_FORMAT "\n", qureg.stateVec.real[index], qureg.stateVec.imag[index]);
493  }
494  if (reportRank || rank==qureg.numChunks-1) printf("]\n");
495  }
496  syncQuESTEnv(env);
497  }
498  }
499 }

References Qureg::chunkId, copyStateFromGPU(), Qureg::numAmpsPerChunk, Qureg::numChunks, Qureg::numQubitsInStateVec, Qureg::stateVec, and syncQuESTEnv().

Referenced by reportStateToScreen().
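
A short usage sketch of the public wrapper listed under 'Referenced by' (reportStateToScreen(); the surrounding setup calls are the standard QuEST front-end):

    #include "QuEST.h"

    int main() {
        QuESTEnv env = createQuESTEnv();
        Qureg qureg = createQureg(2, env);    // <= 5 qubits, so the state is printed

        initPlusState(qureg);
        reportStateToScreen(qureg, env, 0);   // 0: do not prefix each chunk with its rank

        destroyQureg(qureg, env);
        destroyQuESTEnv(env);
        return 0;
    }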

◆ statevec_setAmps()

void statevec_setAmps ( Qureg  qureg,
long long int  startInd,
qreal reals,
qreal imags,
long long int  numAmps 
)

Definition at line 153 of file QuEST_gpu.cu.

153  {
154 
155  cudaDeviceSynchronize();
156  cudaMemcpy(
157  qureg.deviceStateVec.real + startInd,
158  reals,
159  numAmps * sizeof(*(qureg.deviceStateVec.real)),
160  cudaMemcpyHostToDevice);
161  cudaMemcpy(
162  qureg.deviceStateVec.imag + startInd,
163  imags,
164  numAmps * sizeof(*(qureg.deviceStateVec.imag)),
165  cudaMemcpyHostToDevice);
166 }

References Qureg::deviceStateVec.

Referenced by initStateFromAmps(), setAmps(), and setDensityAmps().

◆ statevec_setWeightedQureg()

void statevec_setWeightedQureg ( Complex  fac1,
Qureg  qureg1,
Complex  fac2,
Qureg  qureg2,
Complex  facOut,
Qureg  out 
)

Definition at line 2905 of file QuEST_gpu.cu.

2905  {
2906 
2907  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
2908 
2909  int threadsPerCUDABlock, CUDABlocks;
2910  threadsPerCUDABlock = 128;
2911  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2912  statevec_setWeightedQuregKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2913  fac1, qureg1, fac2, qureg2, facOut, out
2914  );
2915 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by setWeightedQureg(), and statevec_applyPauliSum().

◆ statevec_setWeightedQuregKernel()

__global__ void statevec_setWeightedQuregKernel ( Complex  fac1,
Qureg  qureg1,
Complex  fac2,
Qureg  qureg2,
Complex  facOut,
Qureg  out 
)

Definition at line 2873 of file QuEST_gpu.cu.

2873  {
2874 
2875  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2876  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
2877  if (ampInd >= numAmpsToVisit) return;
2878 
2879  qreal *vecRe1 = qureg1.deviceStateVec.real;
2880  qreal *vecIm1 = qureg1.deviceStateVec.imag;
2881  qreal *vecRe2 = qureg2.deviceStateVec.real;
2882  qreal *vecIm2 = qureg2.deviceStateVec.imag;
2883  qreal *vecReOut = out.deviceStateVec.real;
2884  qreal *vecImOut = out.deviceStateVec.imag;
2885 
2886  qreal facRe1 = fac1.real;
2887  qreal facIm1 = fac1.imag;
2888  qreal facRe2 = fac2.real;
2889  qreal facIm2 = fac2.imag;
2890  qreal facReOut = facOut.real;
2891  qreal facImOut = facOut.imag;
2892 
2893  qreal re1,im1, re2,im2, reOut,imOut;
2894  long long int index = ampInd;
2895 
2896  re1 = vecRe1[index]; im1 = vecIm1[index];
2897  re2 = vecRe2[index]; im2 = vecIm2[index];
2898  reOut = vecReOut[index];
2899  imOut = vecImOut[index];
2900 
2901  vecReOut[index] = (facReOut*reOut - facImOut*imOut) + (facRe1*re1 - facIm1*im1) + (facRe2*re2 - facIm2*im2);
2902  vecImOut[index] = (facReOut*imOut + facImOut*reOut) + (facRe1*im1 + facIm1*re1) + (facRe2*im2 + facIm2*re2);
2903 }

References Qureg::deviceStateVec, Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

◆ statevec_swapQubitAmps()

void statevec_swapQubitAmps ( Qureg  qureg,
int  qb1,
int  qb2 
)

Definition at line 1668 of file QuEST_gpu.cu.

1669 {
1670  int threadsPerCUDABlock, CUDABlocks;
1671  threadsPerCUDABlock = 128;
1672  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock);
1673  statevec_swapQubitAmpsKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, qb1, qb2);
1674 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by swapGate().

◆ statevec_swapQubitAmpsKernel()

__global__ void statevec_swapQubitAmpsKernel ( Qureg  qureg,
int  qb1,
int  qb2 
)

Definition at line 1642 of file QuEST_gpu.cu.

1642  {
1643 
1644  qreal *reVec = qureg.deviceStateVec.real;
1645  qreal *imVec = qureg.deviceStateVec.imag;
1646 
1647  long long int numTasks = qureg.numAmpsPerChunk >> 2; // each iteration updates 2 amps and skips 2 amps
1648  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1649  if (thisTask>=numTasks) return;
1650 
1651  long long int ind00, ind01, ind10;
1652  qreal re01, re10, im01, im10;
1653 
1654  // determine ind00 of |..0..0..>, |..0..1..> and |..1..0..>
1655  ind00 = insertTwoZeroBits(thisTask, qb1, qb2);
1656  ind01 = flipBit(ind00, qb1);
1657  ind10 = flipBit(ind00, qb2);
1658 
1659  // extract statevec amplitudes
1660  re01 = reVec[ind01]; im01 = imVec[ind01];
1661  re10 = reVec[ind10]; im10 = imVec[ind10];
1662 
1663  // swap 01 and 10 amps
1664  reVec[ind01] = re10; reVec[ind10] = re01;
1665  imVec[ind01] = im10; imVec[ind10] = im01;
1666 }

References Qureg::deviceStateVec, flipBit(), insertTwoZeroBits(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_unitary()

void statevec_unitary ( Qureg  qureg,
int  targetQubit,
ComplexMatrix2  u 
)

Definition at line 904 of file QuEST_gpu.cu.

905 {
906  int threadsPerCUDABlock, CUDABlocks;
907  threadsPerCUDABlock = 128;
908  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
909  statevec_unitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, argifyMatrix2(u));
910 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyMatrix2(), and unitary().

◆ statevec_unitaryKernel()

__global__ void statevec_unitaryKernel ( Qureg  qureg,
int  targetQubit,
ArgMatrix2  u 
)

fix – not necessary for the GPU version

Definition at line 851 of file QuEST_gpu.cu.

851  {
852  // ----- sizes
853  long long int sizeBlock, // size of blocks
854  sizeHalfBlock; // size of blocks halved
855  // ----- indices
856  long long int thisBlock, // current block
857  indexUp,indexLo; // current index and corresponding index in lower half block
858 
859  // ----- temp variables
860  qreal stateRealUp,stateRealLo, // storage for previous state values
861  stateImagUp,stateImagLo; // (used in updates)
862  // ----- temp variables
863  long long int thisTask; // task based approach for expose loop with small granularity
864  long long int numTasks=qureg.numAmpsPerChunk>>1;
865 
866  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
867  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
868 
869  // ---------------------------------------------------------------- //
870  // rotate //
871  // ---------------------------------------------------------------- //
872 
874  qreal *stateVecReal = qureg.deviceStateVec.real;
875  qreal *stateVecImag = qureg.deviceStateVec.imag;
876 
877  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
878  if (thisTask>=numTasks) return;
879 
880  thisBlock = thisTask / sizeHalfBlock;
881  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
882  indexLo = indexUp + sizeHalfBlock;
883 
884  // store current state vector values in temp variables
885  stateRealUp = stateVecReal[indexUp];
886  stateImagUp = stateVecImag[indexUp];
887 
888  stateRealLo = stateVecReal[indexLo];
889  stateImagLo = stateVecImag[indexLo];
890 
891  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
892  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
893  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
894  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
895  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
896 
897  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
898  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
899  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
900  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
901  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
902 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ swapDouble()

void swapDouble ( qreal **  a,
qreal **  b 
)
Definition at line 1912 of file QuEST_gpu.cu.
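
The source listing for swapDouble() is not reproduced on this page. Below is a minimal sketch consistent with its use in the reduction loops above (exchanging the first- and second-level reduction buffers between passes); it is an illustration, not necessarily the verbatim QuEST source:

    void swapDouble(qreal **a, qreal **b) {
        // exchange the two device-array pointers (not their contents)
        qreal *temp = *a;
        *a = *b;
        *b = temp;
    }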