diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 0b71cc2f..372cb73a 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -11,7 +11,7 @@ extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size); -//extern "C" void FreeDeviceMemory(void** address); +extern "C" void ScaLBL_FreeDeviceMemory(void* pointer); extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size); @@ -21,7 +21,7 @@ extern "C" void ScaLBL_DeviceBarrier(); extern "C" void ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N); -extern "C" void ScaLBL_D3Q19_Unpack(int q, int Cqx, int Cqy, int Cqz, int *list, int start, int count,double *recvbuf, double *dist, int Nx, int Ny, int Nz); +extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N); extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N); @@ -41,6 +41,8 @@ extern "C" void ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, int N extern "C" void ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz); +extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, double *distodd, int Np); + extern "C" void ScaLBL_D3Q19_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,int Nx, int Ny, int Nz); @@ -56,6 +58,13 @@ extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, do extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, int Nx, int Ny, int Nz, int outlet); + +extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux, + int Nx, int Ny, int Nz); + +extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, double flux, + int Nx, int Ny, int Nz, int outlet); + extern "C" void ScaLBL_Color_Init(char *ID, double *Den, double *Phi, double das, double dbs, int Nx, int Ny, int Nz); extern "C" void ScaLBL_ColorDistance_Init(char *ID, double *Den, double *Phi, double *Distance, @@ -69,7 +78,7 @@ extern "C" void ScaLBL_D3Q19_ColorCollide( char *ID, double *disteven, double *d extern "C" void ScaLBL_D3Q19_ColorCollide_gen( char *ID, double *disteven, double *distodd, double *phi, double *ColorGrad, double *Velocity, int Nx, int Ny, int Nz, double tau1, double tau2, - double alpha, double beta, double Fx, double Fy, double Fz); + double alpha, double beta, double Fx, double Fy, double Fz); extern "C" void ScaLBL_D3Q7_ColorCollideMass(char *ID, double *A_even, double *A_odd, double *B_even, double *B_odd, double *Den, double *Phi, double *ColorGrad, double *Velocity, double beta, int N, bool pBC); @@ -91,17 +100,11 @@ extern "C" void ScaLBL_D3Q19_Velocity_BC_Z(double *disteven, double *distodd, do extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice); - -extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux, - int Nx, int Ny, int Nz); - -extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, double flux, - int Nx, int Ny, int Nz, int outlet); - class ScaLBL_Communicator{ public: //...................................................................................... ScaLBL_Communicator(Domain &Dm); + //ScaLBL_Communicator(Domain &Dm, IntArray &Map); ~ScaLBL_Communicator(); //...................................................................................... unsigned long int CommunicationCount,SendCount,RecvCount; @@ -119,6 +122,7 @@ public: double *recvbuf_xY, *recvbuf_yZ, *recvbuf_Xz, *recvbuf_XY, *recvbuf_YZ, *recvbuf_XZ; //...................................................................................... + void MemoryOptimizedLayout(IntArray &Map, int *neighborList, char *id, int Np); void SendD3Q19(double *f_even, double *f_odd); void RecvD3Q19(double *f_even, double *f_odd); void BiSendD3Q7(double *A_even, double *A_odd, double *B_even, double *B_odd); @@ -126,7 +130,12 @@ public: void SendHalo(double *data); void RecvHalo(double *data); + // Debugging and unit testing functions + void PrintD3Q19(); + private: + void D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int *list, int start, int count, int *d3q19_recvlist); + bool Lock; // use Lock to make sure only one call at a time to protect data in transit // only one set of Send requests can be active at any time (per instance) int i,j,k,n; @@ -166,10 +175,16 @@ private: int *dvcRecvList_x, *dvcRecvList_y, *dvcRecvList_z, *dvcRecvList_X, *dvcRecvList_Y, *dvcRecvList_Z; int *dvcRecvList_xy, *dvcRecvList_yz, *dvcRecvList_xz, *dvcRecvList_Xy, *dvcRecvList_Yz, *dvcRecvList_xZ; int *dvcRecvList_xY, *dvcRecvList_yZ, *dvcRecvList_Xz, *dvcRecvList_XY, *dvcRecvList_YZ, *dvcRecvList_XZ; + // Recieve buffers for the distributions + int *dvcRecvDist_x, *dvcRecvDist_y, *dvcRecvDist_z, *dvcRecvDist_X, *dvcRecvDist_Y, *dvcRecvDist_Z; + int *dvcRecvDist_xy, *dvcRecvDist_yz, *dvcRecvDist_xz, *dvcRecvDist_Xy, *dvcRecvDist_Yz, *dvcRecvDist_xZ; + int *dvcRecvDist_xY, *dvcRecvDist_yZ, *dvcRecvDist_Xz, *dvcRecvDist_XY, *dvcRecvDist_YZ, *dvcRecvDist_XZ; //...................................................................................... }; + + ScaLBL_Communicator::ScaLBL_Communicator(Domain &Dm){ //...................................................................................... Lock=false; // unlock the communicator @@ -315,6 +330,25 @@ ScaLBL_Communicator::ScaLBL_Communicator(Domain &Dm){ ScaLBL_AllocateDeviceMemory((void **) &dvcRecvList_Yz, recvCount_Yz*sizeof(int)); // Allocate device memory ScaLBL_AllocateDeviceMemory((void **) &dvcRecvList_YZ, recvCount_YZ*sizeof(int)); // Allocate device memory //...................................................................................... + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_x, 5*recvCount_x*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_X, 5*recvCount_X*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_y, 5*recvCount_y*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_Y, 5*recvCount_Y*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_z, 5*recvCount_z*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_Z, 5*recvCount_Z*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_xy, recvCount_xy*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_xY, recvCount_xY*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_Xy, recvCount_Xy*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_XY, recvCount_XY*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_xz, recvCount_xz*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_xZ, recvCount_xZ*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_Xz, recvCount_Xz*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_XZ, recvCount_XZ*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_yz, recvCount_yz*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_yZ, recvCount_yZ*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_Yz, recvCount_Yz*sizeof(int)); // Allocate device memory + ScaLBL_AllocateDeviceMemory((void **) &dvcRecvDist_YZ, recvCount_YZ*sizeof(int)); // Allocate device memory + //...................................................................................... ScaLBL_CopyToDevice(dvcSendList_x,Dm.sendList_x,sendCount_x*sizeof(int)); ScaLBL_CopyToDevice(dvcSendList_X,Dm.sendList_X,sendCount_X*sizeof(int)); ScaLBL_CopyToDevice(dvcSendList_y,Dm.sendList_y,sendCount_y*sizeof(int)); @@ -352,6 +386,81 @@ ScaLBL_Communicator::ScaLBL_Communicator(Domain &Dm){ ScaLBL_CopyToDevice(dvcRecvList_YZ,Dm.recvList_YZ,recvCount_YZ*sizeof(int)); ScaLBL_CopyToDevice(dvcRecvList_yZ,Dm.recvList_yZ,recvCount_yZ*sizeof(int)); ScaLBL_CopyToDevice(dvcRecvList_Yz,Dm.recvList_Yz,recvCount_Yz*sizeof(int)); + //...................................................................................... + + MPI_Barrier(MPI_COMM_SCALBL); + if (rank==0) printf("Mapping recieve distributions \n"); + + //................................................................................... + // Set up the recieve distribution lists + //................................................................................... + //...Map recieve list for the X face: q=2,8,10,12,13 ................................. + D3Q19_MapRecv(0,-1,0,0,Dm.recvList_X,0,recvCount_X,dvcRecvDist_X); + D3Q19_MapRecv(3,-1,-1,0,Dm.recvList_X,recvCount_X,recvCount_X,dvcRecvDist_X); + D3Q19_MapRecv(4,-1,1,0,Dm.recvList_X,2*recvCount_X,recvCount_X,dvcRecvDist_X); + D3Q19_MapRecv(5,-1,0,-1,Dm.recvList_X,3*recvCount_X,recvCount_X,dvcRecvDist_X); + D3Q19_MapRecv(6,-1,0,1,Dm.recvList_X,4*recvCount_X,recvCount_X,dvcRecvDist_X); + //................................................................................... + //...Map recieve list for the x face: q=1,7,9,11,13.................................. + D3Q19_MapRecv(1,1,0,0,Dm.recvList_x,0,recvCount_x,dvcRecvDist_x); + D3Q19_MapRecv(4,1,1,0,Dm.recvList_x,recvCount_x,recvCount_x,dvcRecvDist_x); + D3Q19_MapRecv(5,1,-1,0,Dm.recvList_x,2*recvCount_x,recvCount_x,dvcRecvDist_x); + D3Q19_MapRecv(6,1,0,1,Dm.recvList_x,3*recvCount_x,recvCount_x,dvcRecvDist_x); + D3Q19_MapRecv(7,1,0,-1,Dm.recvList_x,4*recvCount_x,recvCount_x,dvcRecvDist_x); + //................................................................................... + //...Map recieve list for the y face: q=4,8,9,16,18 ................................... + D3Q19_MapRecv(1,0,-1,0,Dm.recvList_Y,0,recvCount_Y,dvcRecvDist_Y); + D3Q19_MapRecv(3,-1,-1,0,Dm.recvList_Y,recvCount_Y,recvCount_Y,dvcRecvDist_Y); + D3Q19_MapRecv(5,1,-1,0,Dm.recvList_Y,2*recvCount_Y,recvCount_Y,dvcRecvDist_Y); + D3Q19_MapRecv(7,0,-1,-1,Dm.recvList_Y,3*recvCount_Y,recvCount_Y,dvcRecvDist_Y); + D3Q19_MapRecv(8,0,-1,1,Dm.recvList_Y,4*recvCount_Y,recvCount_Y,dvcRecvDist_Y); + //................................................................................... + //...Map recieve list for the Y face: q=3,7,10,15,17 .................................. + D3Q19_MapRecv(2,0,1,0,Dm.recvList_y,0,recvCount_y,dvcRecvDist_y); + D3Q19_MapRecv(4,1,1,0,Dm.recvList_y,recvCount_y,recvCount_y,dvcRecvDist_y); + D3Q19_MapRecv(4,-1,1,0,Dm.recvList_y,2*recvCount_y,recvCount_y,dvcRecvDist_y); + D3Q19_MapRecv(8,0,1,1,Dm.recvList_y,3*recvCount_y,recvCount_y,dvcRecvDist_y); + D3Q19_MapRecv(9,0,1,-1,Dm.recvList_y,4*recvCount_y,recvCount_y,dvcRecvDist_y); + //................................................................................... + //...Map recieve list for the z face<<<6,12,13,16,17).............................................. + D3Q19_MapRecv(2,0,0,-1,Dm.recvList_Z,0,recvCount_Z,dvcRecvDist_Z); + D3Q19_MapRecv(5,-1,0,-1,Dm.recvList_Z,recvCount_Z,recvCount_Z,dvcRecvDist_Z); + D3Q19_MapRecv(7,1,0,-1,Dm.recvList_Z,2*recvCount_Z,recvCount_Z,dvcRecvDist_Z); + D3Q19_MapRecv(7,0,-1,-1,Dm.recvList_Z,3*recvCount_Z,recvCount_Z,dvcRecvDist_Z); + D3Q19_MapRecv(9,0,1,-1,Dm.recvList_Z,4*recvCount_Z,recvCount_Z,dvcRecvDist_Z); + //...Map recieve list for the Z face<<<5,11,14,15,18).............................................. + D3Q19_MapRecv(3,0,0,1,Dm.recvList_z,0,recvCount_z,dvcRecvDist_z); + D3Q19_MapRecv(6,1,0,1,Dm.recvList_z,recvCount_z,recvCount_z,dvcRecvDist_z); + D3Q19_MapRecv(6,-1,0,1,Dm.recvList_z,2*recvCount_z,recvCount_z,dvcRecvDist_z); + D3Q19_MapRecv(8,0,1,1,Dm.recvList_z,3*recvCount_z,recvCount_z,dvcRecvDist_z); + D3Q19_MapRecv(8,0,-1,1,Dm.recvList_z,4*recvCount_z,recvCount_z,dvcRecvDist_z); + //.................................................................................. + //...Map recieve list for the xy edge <<<8)................................ + D3Q19_MapRecv(3,-1,-1,0,Dm.recvList_XY,0,recvCount_XY,dvcRecvDist_XY); + //...Map recieve list for the Xy edge <<<9)................................ + D3Q19_MapRecv(5,1,-1,0,Dm.recvList_xY,0,recvCount_xY,dvcRecvDist_xY); + //...Map recieve list for the xY edge <<<10)................................ + D3Q19_MapRecv(4,-1,1,0,Dm.recvList_Xy,0,recvCount_Xy,dvcRecvDist_Xy); + //...Map recieve list for the XY edge <<<7)................................ + D3Q19_MapRecv(4,1,1,0,Dm.recvList_xy,0,recvCount_xy,dvcRecvDist_xy); + //...Map recieve list for the xz edge <<<12)................................ + D3Q19_MapRecv(5,-1,0,-1,Dm.recvList_XZ,0,recvCount_XZ,dvcRecvDist_XZ); + //...Map recieve list for the xZ edge <<<14)................................ + D3Q19_MapRecv(6,-1,0,1,Dm.recvList_Xz,0,recvCount_Xz,dvcRecvDist_Xz); + //...Map recieve list for the Xz edge <<<13)................................ + D3Q19_MapRecv(7,1,0,-1,Dm.recvList_xZ,0,recvCount_xZ,dvcRecvDist_xZ); + //...Map recieve list for the XZ edge <<<11)................................ + D3Q19_MapRecv(6,1,0,1,Dm.recvList_xz,0,recvCount_xz,dvcRecvDist_xz); + //...Map recieve list for the yz edge <<<16)................................ + D3Q19_MapRecv(7,0,-1,-1,Dm.recvList_YZ,0,recvCount_YZ,dvcRecvDist_YZ); + //...Map recieve list for the yZ edge <<<18)................................ + D3Q19_MapRecv(8,0,-1,1,Dm.recvList_Yz,0,recvCount_Yz,dvcRecvDist_Yz); + //...Map recieve list for the Yz edge <<<17)................................ + D3Q19_MapRecv(9,0,1,-1,Dm.recvList_yZ,0,recvCount_yZ,dvcRecvDist_yZ); + //...Map recieve list for the YZ edge <<<15)................................ + D3Q19_MapRecv(8,0,1,1,Dm.recvList_yz,0,recvCount_yz,dvcRecvDist_yz); + //................................................................................... + //...................................................................................... MPI_Barrier(MPI_COMM_SCALBL); ScaLBL_DeviceBarrier(); @@ -376,6 +485,527 @@ ScaLBL_Communicator::~ScaLBL_Communicator(){ // -- note that there needs to be a way to free memory allocated on the device!!! } +void ScaLBL_Communicator::D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int *list, int start, int count, + int *d3q19_recvlist){ + //.................................................................................... + // Map the recieve distributions to + // Distribution q matche Cqx, Cqy, Cqz + // swap rule means that the distributions in recvbuf are OPPOSITE of q + // dist may be even or odd distributions stored by stream layout + //.................................................................................... + + int i,j,k,n,nn,idx; + //int N = Nx*Ny*Nz; + // Create work arrays to store the values from the device (so that host can do this work) + int * ReturnDist; + ReturnDist=new int [count]; + + // Copy the list from the device + //ScaLBL_CopyToHost(List, list, count*sizeof(int)); + + for (idx=0; idx Np ){ + ERROR("ScaLBL_Communicator::MemoryOptimizedLayout: Failed to create memory efficient layout!\n"); + } +/* + for (k=1;k Np) printf("ScaLBL_Communicator::MemoryOptimizedLayout: Map(%i,%i,%i) = %i > %i \n",i,j,k,Map(i,j,k),Np); + else if (!(idx<0)){ + // store the idx associated with each neighbor + // store idx for self if neighbor is in solid or out of domain + //D3Q19 = {{1,0,0},{-1,0,0} + // {0,1,0},{0,-1,0} + // {0,0,1},{0,0,-1}, + // {1,1,0},{-1,-1,0}, + // {1,-1,0},{-1,1,0}, + // {1,0,1},{-1,0,-1}, + // {1,0,-1},{-1,0,1}, + // {0,1,1},{0,-1,-1}, + // {0,1,-1},{0,-1,1}}; + // note that only odd distributions need to be stored to execute the swap algorithm + int neighbor; // cycle through the neighbors of lattice site idx + neighbor=Map(i+1,j,k); + if (neighbor==-2) neighborList[idx]=-1; + else if (neighbor<0) neighborList[idx]=idx; + else neighborList[idx]=neighbor; + + neighbor=Map(i,j+1,k); + if (neighbor==-2) neighborList[Np+idx]=-1; + else if (neighbor<0) neighborList[Np+idx]=idx; + else neighborList[Np+idx]=neighbor; + + neighbor=Map(i,j,k+1); + if (neighbor==-2) neighborList[2*Np+idx]=-1; + else if (neighbor<0) neighborList[2*Np+idx]=idx; + else neighborList[2*Np+idx]=neighbor; + + neighbor=Map(i+1,j+1,k); + if (neighbor==-2) neighborList[3*Np+idx]=-1; + else if (neighbor<0) neighborList[3*Np+idx]=idx; + else neighborList[3*Np+idx]=neighbor; + + neighbor=Map(i+1,j-1,k); + if (neighbor==-2) neighborList[4*Np+idx]=-1; + else if (neighbor<0) neighborList[4*Np+idx]=idx; + else neighborList[4*Np+idx]=neighbor; + + neighbor=Map(i+1,j,k+1); + if (neighbor==-2) neighborList[5*Np+idx]=-1; + else if (neighbor<0) neighborList[5*Np+idx]=idx; + else neighborList[5*Np+idx]=neighbor; + + neighbor=Map(i+1,j,k-1); + if (neighbor==-2) neighborList[6*Np+idx]=-1; + else if (neighbor<0) neighborList[6*Np+idx]=idx; + else neighborList[6*Np+idx]=neighbor; + + neighbor=Map(i,j+1,k+1); + if (neighbor==-2) neighborList[7*Np+idx]=-1; + else if (neighbor<0) neighborList[7*Np+idx]=idx; + else neighborList[7*Np+idx]=neighbor; + + neighbor=Map(i,j+1,k-1); + if (neighbor==-2) neighborList[8*Np+idx]=-1; + else if (neighbor<0) neighborList[8*Np+idx]=idx; + else neighborList[8*Np+idx]=neighbor; + } + } + } + } + + //for (idx=0; idx 0; offset /= 2) - val += __shfl_down(val, offset); - return val; + for (int offset = warpSize/2; offset > 0; offset /= 2) + val += __shfl_down(val, offset); + return val; } __inline__ __device__ double blockReduceSum(double val) { - static __shared__ double shared[32]; // Shared mem for 32 partial sums - int lane = threadIdx.x % warpSize; - int wid = threadIdx.x / warpSize; + static __shared__ double shared[32]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; - val = warpReduceSum(val); // Each warp performs partial reduction + val = warpReduceSum(val); // Each warp performs partial reduction - if (lane==0) shared[wid]=val; // Write reduced value to shared memory + if (lane==0) shared[wid]=val; // Write reduced value to shared memory - __syncthreads(); // Wait for all partial reductions + __syncthreads(); // Wait for all partial reductions - //read from shared memory only if that warp existed - val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; - if (wid==0) val = warpReduceSum(val); //Final reduce within first warp + if (wid==0) val = warpReduceSum(val); //Final reduce within first warp - return val; + return val; } __global__ void deviceReduceKernel(double *in, double* out, int N) { - double sum = 0; - //reduce multiple elements per thread - for (int i = blockIdx.x * blockDim.x + threadIdx.x; - i < N; - i += blockDim.x * gridDim.x) { - sum += in[i]; - } - sum = blockReduceSum(sum); - if (threadIdx.x==0) - out[blockIdx.x]=sum; + double sum = 0; + //reduce multiple elements per thread + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < N; + i += blockDim.x * gridDim.x) { + sum += in[i]; + } + sum = blockReduceSum(sum); + if (threadIdx.x==0) + out[blockIdx.x]=sum; } __global__ void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N){ @@ -76,9 +62,9 @@ __global__ void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, d } __global__ void dvc_ScaLBL_D3Q19_Unpack(int q, int *list, int start, int count, - double *recvbuf, double *dist, int N){ + double *recvbuf, double *dist, int N){ //.................................................................................... - // Unpack distribution from the recv buffer + // Unack distribution from the recv buffer // Distribution q matche Cqx, Cqy, Cqz // swap rule means that the distributions in recvbuf are OPPOSITE of q // dist may be even or odd distributions stored by stream layout @@ -120,7 +106,7 @@ __global__ void dvc_ScaLBL_D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int * d3q19_recvlist[start+idx]=nn; } } - */ +*/ __global__ void dvc_ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz) @@ -133,35 +119,35 @@ __global__ void dvc_ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, i //........Get 1-D index for this thread.................... n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x; if (n 0 ){ - f_even[n] = 0.3333333333333333; - f_odd[n] = 0.055555555555555555; //double(100*n)+1.f; - f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f; - f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f; - f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f; - f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f; - f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f; - f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f; - f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f; - f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f; - f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f; - f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f; - f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f; - f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f; - f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f; - f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f; - f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f; - f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f; - f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f; - } - else{ - for(int q=0; q<9; q++){ - f_even[q*N+n] = -1.0; - f_odd[q*N+n] = -1.0; - } - f_even[9*N+n] = -1.0; + id = ID[n]; + if (id > 0 ){ + f_even[n] = 0.3333333333333333; + f_odd[n] = 0.055555555555555555; //double(100*n)+1.f; + f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f; + f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f; + f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f; + f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f; + f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f; + f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f; + f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f; + f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f; + f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f; + f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f; + f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f; + f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f; + f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f; + f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f; + f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f; + f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f; + f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f; + } + else{ + for(int q=0; q<9; q++){ + f_even[q*N+n] = -1.0; + f_odd[q*N+n] = -1.0; } + f_even[9*N+n] = -1.0; + } } } } @@ -185,451 +171,6 @@ __global__ void dvc_ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *distev } } } - -__global__ void dvc_ScaLBL_AAodd_Compact(char * ID, int *d_neighborList, double *dist, int Np) { - - int n, nn, nnn; - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; - char id; - int nread,nwrite,naccess; - int S = Np/NBLOCKS/NTHREADS+1; - - for (int s=0; s 0) { - - f0 = dist[n]; - - nread = d_neighborList[n]; naccess = Np; - if (nread<0) { nread=n; naccess = 10*Np; } - f2 = dist[nread + naccess]; - - nread = d_neighborList[n+2*Np]; naccess = 2*Np; - if (nread<0) { nread=n; naccess = 11*Np; } - f4 = dist[nread + naccess]; - - nread = d_neighborList[n+4*Np]; naccess = 3*Np; - if (nread<0) { nread=n; naccess = 12*Np; } - f6 = dist[nread + naccess]; - - nread = d_neighborList[n+6*Np]; naccess = 4*Np; - if (nread<0) { nread=n; naccess = 13*Np; } - f8 = dist[nread + naccess]; - - nread = d_neighborList[n+8*Np]; naccess = 5*Np; - if (nread<0) { nread=n; naccess = 14*Np; } - f10 = dist[nread + naccess]; - - nread = d_neighborList[n+10*Np]; naccess = 6*Np; - if (nread<0) { nread=n; naccess = 15*Np; } - f12 = dist[nread + naccess]; - - nread = d_neighborList[n+12*Np]; naccess = 7*Np; - if (nread<0) { nread=n; naccess = 16*Np; } - f14 = dist[nread + naccess]; - - nread = d_neighborList[n+14*Np]; naccess = 8*Np; - if (nread<0) { nread=n; naccess = 17*Np; } - f16 = dist[nread + naccess]; - - nread = d_neighborList[n+16*Np]; naccess = 9*Np; - if (nread<0) { nread=n; naccess = 18*Np; } - f18 = dist[nread + naccess]; - - - - nread = d_neighborList[n+Np]; naccess = 10*Np; - if (nread<0) { nread=n; naccess = Np; } - f1 = dist[nread + naccess]; - - nread = d_neighborList[n+3*Np]; naccess = 11*Np; - if (nread<0) { nread=n; naccess = 2*Np; } - f3 = dist[nread + naccess]; - - nread = d_neighborList[n+5*Np]; naccess = 12*Np; - if (nread<0) { nread=n; naccess = 3*Np; } - f5 = dist[nread + naccess]; - - nread = d_neighborList[n+7*Np]; naccess = 13*Np; - if (nread<0) { nread=n; naccess = 4*Np; } - f7 = dist[nread + naccess]; - - nread = d_neighborList[n+9*Np]; naccess = 14*Np; - if (nread<0) { nread=n; naccess = 5*Np; } - f9 = dist[nread + naccess]; - - nread = d_neighborList[n+11*Np]; naccess = 15*Np; - if (nread<0) { nread=n; naccess = 6*Np; } - f11 = dist[nread + naccess]; - - nread = d_neighborList[n+13*Np]; naccess = 16*Np; - if (nread<0) { nread=n; naccess = 7*Np; } - f13 = dist[nread + naccess]; - - nread = d_neighborList[n+15*Np]; naccess = 17*Np; - if (nread<0) { nread=n; naccess = 8*Np; } - f15 = dist[nread + naccess]; - - nread = d_neighborList[n+17*Np]; naccess = 18*Np; - if (nread<0) { nread=n; naccess = 9*Np; } - f17 = dist[nread + naccess]; - - - - - // ORIGINAL CORRECT WRITES - // nwrite = d_neighborList[n]; naccess = 10*Np; - // if (nwrite<0) { nwrite=n; naccess = Np; } - // dist[nwrite + naccess] = f1; - - // nwrite = d_neighborList[n+Np]; naccess = Np; - // if (nwrite<0) { nwrite=n; naccess = 10*Np; } - // dist[nwrite + naccess] = f2; - - nread = d_neighborList[n]; naccess = Np; - if (nread<0) { nread=n; naccess = 10*Np; } - dist[nread + naccess] = f1; - - nread = d_neighborList[n+2*Np]; naccess = 2*Np; - if (nread<0) { nread=n; naccess = 11*Np; } - dist[nread + naccess] = f3; - - nread = d_neighborList[n+4*Np]; naccess = 3*Np; - if (nread<0) { nread=n; naccess = 12*Np; } - dist[nread + naccess] = f5; - - nread = d_neighborList[n+6*Np]; naccess = 4*Np; - if (nread<0) { nread=n; naccess = 13*Np; } - dist[nread + naccess] = f7; - - nread = d_neighborList[n+8*Np]; naccess = 5*Np; - if (nread<0) { nread=n; naccess = 14*Np; } - dist[nread + naccess] = f9; - - nread = d_neighborList[n+10*Np]; naccess = 6*Np; - if (nread<0) { nread=n; naccess = 15*Np; } - dist[nread + naccess] = f11; - - nread = d_neighborList[n+12*Np]; naccess = 7*Np; - if (nread<0) { nread=n; naccess = 16*Np; } - dist[nread + naccess] = f13; - - nread = d_neighborList[n+14*Np]; naccess = 8*Np; - if (nread<0) { nread=n; naccess = 17*Np; } - dist[nread + naccess] = f15; - - nread = d_neighborList[n+16*Np]; naccess = 9*Np; - if (nread<0) { nread=n; naccess = 18*Np; } - dist[nread + naccess] = f17; - - - - nread = d_neighborList[n+Np]; naccess = 10*Np; - if (nread<0) { nread=n; naccess = Np; } - dist[nread + naccess] = f2; - - nread = d_neighborList[n+3*Np]; naccess = 11*Np; - if (nread<0) { nread=n; naccess = 2*Np; } - dist[nread + naccess] = f4; - - nread = d_neighborList[n+5*Np]; naccess = 12*Np; - if (nread<0) { nread=n; naccess = 3*Np; } - dist[nread + naccess] = f6; - - nread = d_neighborList[n+7*Np]; naccess = 13*Np; - if (nread<0) { nread=n; naccess = 4*Np; } - dist[nread + naccess] = f8; - - nread = d_neighborList[n+9*Np]; naccess = 14*Np; - if (nread<0) { nread=n; naccess = 5*Np; } - dist[nread + naccess] = f10; - - nread = d_neighborList[n+11*Np]; naccess = 15*Np; - if (nread<0) { nread=n; naccess = 6*Np; } - dist[nread + naccess]= f12; - - nread = d_neighborList[n+13*Np]; naccess = 16*Np; - if (nread<0) { nread=n; naccess = 7*Np; } - dist[nread + naccess] = f14; - - nread = d_neighborList[n+15*Np]; naccess = 17*Np; - if (nread<0) { nread=n; naccess = 8*Np; } - dist[nread + naccess] = f16; - - nread = d_neighborList[n+17*Np]; naccess = 18*Np; - if (nread<0) { nread=n; naccess = 9*Np; } - dist[nread + naccess] = f18; - } - } - } -} - - -__global__ void dvc_ScaLBL_AAeven_Compact(char * ID, double *dist, int Np) { - - - int q,n,nn; - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; - - char id; - int S = Np/NBLOCKS/NTHREADS + 1; - for (int s=0; s 0) { - - //........................................................................ - // READ THE DISTRIBUTIONS - // (read from opposite array due to previous swap operation) - //........................................................................ - // even - f2 = dist[10*Np+n]; - f4 = dist[11*Np+n]; - f6 = dist[12*Np+n]; - f8 = dist[13*Np+n]; - f10 = dist[14*Np+n]; - f12 = dist[15*Np+n]; - f14 = dist[16*Np+n]; - f16 = dist[17*Np+n]; - f18 = dist[18*Np+n]; - f0 = dist[n]; - // odd - f1 = dist[Np+n]; - f3 = dist[2*Np+n]; - f5 = dist[3*Np+n]; - f7 = dist[4*Np+n]; - f9 = dist[5*Np+n]; - f11 = dist[6*Np+n]; - f13 = dist[7*Np+n]; - f15 = dist[8*Np+n]; - f17 = dist[9*Np+n]; - - //........................................................................ - // WRITE THE DISTRIBUTIONS - // even - //disteven[n] = f0; - dist[Np+n] = f2; - dist[2*Np+n] = f4; - dist[3*Np+n] = f6; - dist[4*Np+n] = f8; - dist[5*Np+n] = f10; - dist[6*Np+n] = f12; - dist[7*Np+n] = f14; - dist[8*Np+n] = f16; - dist[9*Np+n] = f18; - // odd - dist[10*Np+n] = f1; - dist[11*Np+n] = f3; - dist[12*Np+n] = f5; - dist[13*Np+n] = f7; - dist[14*Np+n] = f9; - dist[15*Np+n] = f11; - dist[16*Np+n] = f13; - dist[17*Np+n] = f15; - dist[18*Np+n] = f17; - //........................................................................ - } - } - } -} - -//__global__ void dvc_ScaLBL_AAodd_Compact_OLD(char * ID, int *d_neighborList, double *d_disteven, double *d_distodd, int Np) { -// -// int n, nn, nnn; -// double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; -// double f10,f11,f12,f13,f14,f15,f16,f17,f18; -// char id; -// int nread,nwrite,naccess; -// int S = Np/NBLOCKS/NTHREADS+1; -// -// for (int s=0; s 0) { -// // Convention -// // nread - read neighbor location for distribution q -// // nwrite - write neighbor location for distribution q -// // read f1 -// -// // Compact: -// // nn = d_neighborList[n+2*0*Np]; // nread -// // nnn=d_neighborList[n+(2*0+1)*Np]; // nwrite -// // // if ((!(nn<0)) && (!(nnn<0))) { -// // //printf("n=%d: nn=%d, nnn=%d\n",n,nn,nnn); -// // f1 = d_distodd[nnn + 0*Np]; -// // f2 = d_disteven[nn + (0+1)*Np]; -// // d_distodd[nnn+0*Np] = f2; -// // d_disteven[nn+(0+1)*Np] = f1; -// -// // still need f0 -// //f0 = d_disteven[n]; // ? -// -// -// nread = d_neighborList[n]; naccess = 10*Np; // 1 "neighbor 0" -// if (nread<0) { nread=n; naccess = Np; } -// f2 = dist[nread + naccess]; -// -// nread = d_neighborList[n+Np]; naccess = Np; // 2 -// if (nread<0) { nread=n; naccess = 10*Np; } -// f1 = dist[nread + nacess]; -// -//// nread = d_neighborList[n+2*Np]; // 3 -//// if (nread<0) nread=n; -//// f4 = d_disteven[2*Np+nread]; -//// -//// nread = d_neighborList[n+3*Np]; // 4 -//// if (nread<0) nread=n; -//// f3 = d_distodd[Np+nread]; -//// -//// nread = d_neighborList[n+4*Np]; // 5 -//// if (nread<0) nread=n; -//// f6 = d_disteven[3*Np+nread]; -//// -//// nread = d_neighborList[n+5*Np]; // 6 -//// if (nread<0) nread=n; -//// f5 = d_distodd[2*Np+nread]; -//// -//// nread = d_neighborList[n+6*Np]; // 7 -//// if (nread<0) nread=n; -//// f8 = d_disteven[4*Np+nread]; -//// -//// nread = d_neighborList[n+7*Np]; // 8 -//// if (nread<0) nread=n; -//// f7 = d_distodd[3*Np+nread]; -//// -//// nread = d_neighborList[n+8*Np]; // 9 -//// if (nread<0) nread=n; -//// f10 = d_disteven[5*Np+nread]; -//// -//// nread = d_neighborList[n+9*Np]; // 10 -//// if (nread<0) nread=n; -//// f9 = d_distodd[4*Np+nread]; -//// -//// nread = d_neighborList[n+10*Np]; // 11 -//// if (nread<0) nread=n; -//// f12 = d_disteven[6*Np+nread]; -//// -//// nread = d_neighborList[n+11*Np]; // 12 -//// if (nread<0) nread=n; -//// f11 = d_distodd[5*Np+nread]; -//// -//// nread = d_neighborList[n+12*Np]; // 13 -//// if (nread<0) nread=n; -//// f14 = d_disteven[7*Np+nread]; -//// -//// nread = d_neighborList[n+13*Np]; // 14 -//// if (nread<0) nread=n; -//// f13 = d_distodd[6*Np+nread]; -//// -//// nread = d_neighborList[n+14*Np]; //15 -//// if (nread<0) nread=n; -//// f16 = d_disteven[8*Np+nread]; -//// -//// nread = d_neighborList[n+15*Np]; //16 -//// if (nread<0) nread=n; -//// f15 = d_distodd[7*Np+nread]; -//// -//// nread = d_neighborList[n+16*Np]; //17 -//// if (nread<0) nread=n; -//// f18 = d_disteven[9*Np+nread]; -//// -//// nread = d_neighborList[n+17*Np]; // 18 -//// if (nread<0) nread=n; -//// f17 = d_distodd[8*Np+nread]; -// -// // At this point we need all f0, f1, f2, f3, f4 ..., f18 -// // COLLISION -// -// -// -// // write -// nwrite = d_neighborList[n]; // 1 -// if (nwrite<0) nwrite=n; -// d_disteven[Np+nwrite] = f1; -// -// nwrite = d_neighborList[Np+n]; // 2 -// if (nwrite<0) nwrite=n; -// d_distodd[nwrite] = f2; -// -//// nwrite = d_neighborList[2*Np+n]; // 3 -//// if (nwrite<0) nwrite=n; -//// d_disteven[2*Np+nwrite] = f3; -//// -//// nwrite = d_neighborList[3*Np+n]; // 4 -//// if (nwrite<0) nwrite=n; -//// d_distodd[Np+nwrite] = f4; -//// -//// nwrite = d_neighborList[4*Np+n]; // 5 -//// if (nwrite<0) nwrite=n; -//// d_disteven[3*Np+nwrite] = f5; -//// -//// nwrite = d_neighborList[5*Np+n]; // 6 -//// if (nwrite<0) nwrite=n; -//// d_distodd[2*Np+nwrite] = f6; -//// -//// nwrite = d_neighborList[6*Np+n]; // 7 -//// if (nwrite<0) nwrite=n; -//// d_disteven[4*Np+nwrite] = f7; -//// -//// nwrite = d_neighborList[7*Np+n]; // 8 -//// if (nwrite<0) nwrite=n; -//// d_distodd[3*Np+nwrite] = f8; -//// -//// nwrite = d_neighborList[8*Np+n]; // 9 -//// if (nwrite<0) nwrite=n; -//// d_disteven[5*Np+nwrite] = f9; -//// -//// nwrite = d_neighborList[9*Np+n]; // 10 -//// if (nwrite<0) nwrite=n; -//// d_distodd[4*Np+nwrite] = f10; -//// -//// nwrite = d_neighborList[10*Np+n]; // 11 -//// if (nwrite<0) nwrite=n; -//// d_disteven[6*Np+nwrite] = f11; -//// -//// nwrite = d_neighborList[11*Np+n]; // 12 -//// if (nwrite<0) nwrite=n; -//// d_distodd[5*Np+nwrite] = f12; -//// -//// nwrite = d_neighborList[12*Np+n]; // 13 -//// if (nwrite<0) nwrite=n; -//// d_disteven[7*Np+nwrite] = f13; -//// -//// nwrite = d_neighborList[13*Np+n]; // 14 -//// if (nwrite<0) nwrite=n; -//// d_distodd[6*Np+nwrite] = f14; -//// -//// nwrite = d_neighborList[14*Np+n]; // 15 -//// if (nwrite<0) nwrite=n; -//// d_disteven[8*Np+nwrite] = f15; -//// -//// nwrite = d_neighborList[15*Np+n]; // 16 -//// if (nwrite<0) nwrite=n; -//// d_distodd[7*Np+nwrite] = f16; -//// -//// nwrite = d_neighborList[16*Np+n]; // 17 -//// if (nwrite<0) nwrite=n; -//// d_disteven[9*Np+nwrite] = f17; -//// -//// nwrite = d_neighborList[17*Np+n]; // 18 -//// if (nwrite<0) nwrite=n; -//// d_distodd[8*Np+nwrite] = f18; -// -// } -// } -// } -//} - __global__ void dvc_ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz) { int i,j,k,n,nn,N; @@ -637,135 +178,135 @@ __global__ void dvc_ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *disto char id; double f1,f2,f3,f4,f5,f6,f7,f8,f9; double f10,f11,f12,f13,f14,f15,f16,f17,f18; - + N = Nx*Ny*Nz; - + int S = N/NBLOCKS/NTHREADS + 1; for (int s=0; s 0){ - //.......Back out the 3-D indices for node n.............. - k = n/(Nx*Ny); - j = (n-Nx*Ny*k)/Nx; - i = n-Nx*Ny*k-Nx*j; - //........................................................................ - // Retrieve even distributions from the local node (swap convention) - // f0 = disteven[n]; // Does not particupate in streaming - f1 = distodd[n]; - f3 = distodd[N+n]; - f5 = distodd[2*N+n]; - f7 = distodd[3*N+n]; - f9 = distodd[4*N+n]; - f11 = distodd[5*N+n]; - f13 = distodd[6*N+n]; - f15 = distodd[7*N+n]; - f17 = distodd[8*N+n]; - //........................................................................ - - //........................................................................ - // Retrieve odd distributions from neighboring nodes (swap convention) - //........................................................................ - nn = n+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[n] = f2; - disteven[N+nn] = f1; - } - //} - //........................................................................ - nn = n+Nx; // neighbor index (pull convention) - if (!(j+1 0.0){ - distodd[N+n] = f4; - disteven[2*N+nn] = f3; - // } - } - //........................................................................ - nn = n+Nx*Ny; // neighbor index (pull convention) - if (!(k+1 0.0){ - distodd[2*N+n] = f6; - disteven[3*N+nn] = f5; - // } - } - //........................................................................ - nn = n+Nx+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[3*N+n] = f8; - disteven[4*N+nn] = f7; - // } - } - //........................................................................ - nn = n-Nx+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[4*N+n] = f10; - disteven[5*N+nn] = f9; - // } - } - //........................................................................ - nn = n+Nx*Ny+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[5*N+n] = f12; - disteven[6*N+nn] = f11; - // } - } - //........................................................................ - nn = n-Nx*Ny+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[6*N+n] = f14; - disteven[7*N+nn] = f13; - // } - } - //........................................................................ - nn = n+Nx*Ny+Nx; // neighbor index (pull convention) - if (!(j+1 0.0){ - distodd[7*N+n] = f16; - disteven[8*N+nn] = f15; - // } - } - //........................................................................ - nn = n-Nx*Ny+Nx; // neighbor index (pull convention) - if (!(j+1 0.0){ - distodd[8*N+n] = f18; - disteven[9*N+nn] = f17; - // } - } - //........................................................................ - + id = ID[n]; + if (id > 0){ + //.......Back out the 3-D indices for node n.............. + k = n/(Nx*Ny); + j = (n-Nx*Ny*k)/Nx; + i = n-Nx*Ny*k-Nx*j; + //........................................................................ + // Retrieve even distributions from the local node (swap convention) + // f0 = disteven[n]; // Does not particupate in streaming + f1 = distodd[n]; + f3 = distodd[N+n]; + f5 = distodd[2*N+n]; + f7 = distodd[3*N+n]; + f9 = distodd[4*N+n]; + f11 = distodd[5*N+n]; + f13 = distodd[6*N+n]; + f15 = distodd[7*N+n]; + f17 = distodd[8*N+n]; + //........................................................................ + + //........................................................................ + // Retrieve odd distributions from neighboring nodes (swap convention) + //........................................................................ + nn = n+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[n] = f2; + disteven[N+nn] = f1; } + //} + //........................................................................ + nn = n+Nx; // neighbor index (pull convention) + if (!(j+1 0.0){ + distodd[N+n] = f4; + disteven[2*N+nn] = f3; + // } + } + //........................................................................ + nn = n+Nx*Ny; // neighbor index (pull convention) + if (!(k+1 0.0){ + distodd[2*N+n] = f6; + disteven[3*N+nn] = f5; + // } + } + //........................................................................ + nn = n+Nx+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[3*N+n] = f8; + disteven[4*N+nn] = f7; + // } + } + //........................................................................ + nn = n-Nx+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[4*N+n] = f10; + disteven[5*N+nn] = f9; + // } + } + //........................................................................ + nn = n+Nx*Ny+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[5*N+n] = f12; + disteven[6*N+nn] = f11; + // } + } + //........................................................................ + nn = n-Nx*Ny+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[6*N+n] = f14; + disteven[7*N+nn] = f13; + // } + } + //........................................................................ + nn = n+Nx*Ny+Nx; // neighbor index (pull convention) + if (!(j+1 0.0){ + distodd[7*N+n] = f16; + disteven[8*N+nn] = f15; + // } + } + //........................................................................ + nn = n-Nx*Ny+Nx; // neighbor index (pull convention) + if (!(j+1 0.0){ + distodd[8*N+n] = f18; + disteven[9*N+nn] = f17; + // } + } + //........................................................................ + + } } } } @@ -786,54 +327,54 @@ __global__ void dvc_ScaLBL_D3Q19_Velocity(char *ID, double *disteven, double *d //........Get 1-D index for this thread.................... n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x; if (n 0 ){ - f_even[n] = 0 + 0.01*0; - f_odd[n] = 0+ 0.01*1; //double(100*n)+1.f; - f_even[N+n] = 1+ 0.01*2; //double(100*n)+2.f; - f_odd[N+n] = 1+ 0.01*3; //double(100*n)+3.f; - f_even[2*N+n] = 2+ 0.01*4; //double(100*n)+4.f; - f_odd[2*N+n] = 2+ 0.01*5; //double(100*n)+5.f; - f_even[3*N+n] = 3+ 0.01*6; //double(100*n)+6.f; - f_odd[3*N+n] = 3+ 0.01*7; //double(100*n)+7.f; - f_even[4*N+n] = 4+ 0.01*8; //double(100*n)+8.f; - f_odd[4*N+n] = 4+ 0.01*9; //double(100*n)+9.f; - f_even[5*N+n] = 5+ 0.01*10; //double(100*n)+10.f; - f_odd[5*N+n] = 5+ 0.01*11; //double(100*n)+11.f; - f_even[6*N+n] = 6+ 0.01*12; //double(100*n)+12.f; - f_odd[6*N+n] = 6+ 0.01*13; //double(100*n)+13.f; - f_even[7*N+n] = 7+ 0.01*14; //double(100*n)+14.f; - f_odd[7*N+n] = 7+ 0.01*15; //double(100*n)+15.f; - f_even[8*N+n] = 8+ 0.01*16; //double(100*n)+16.f; - f_odd[8*N+n] = 8+ 0.01*17; //double(100*n)+17.f; - f_even[9*N+n] = 9+ 0.01*18; //double(100*n)+18.f; - } - else{ - for(int q=0; q<9; q++){ - f_even[q*N+n] = -1.0; - f_odd[q*N+n] = -1.0; - } - f_even[9*N+n] = -1.0; - } + f5 = f6 + 0.33333333333333338*uz; + f11 = f12 + 0.16666666666666678*(uz+ux)-Cxz; + f14 = f13 + 0.16666666666666678*(uz-ux)+Cxz; + f15 = f16 + 0.16666666666666678*(uy+uz)-Cyz; + f18 = f17 + 0.16666666666666678*(uz-uy)+Cyz; + //........Store in "opposite" memory location.......... + distodd[2*N+n] = f5; + distodd[5*N+n] = f11; + disteven[7*N+n] = f14; + distodd[7*N+n] = f15; + disteven[9*N+n] = f18; } } -} - -//__global__ void dvc_ScaLBL_AAeven_Compact(char * ID, double *disteven, double *distodd, int Np) { -// -// -// int q,n,nn; -// double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; -// double f10,f11,f12,f13,f14,f15,f16,f17,f18; -// -// char id; -// int S = Np/NBLOCKS/NTHREADS + 1; -// for (int s=0; s 0) { -// -// //........................................................................ -// // READ THE DISTRIBUTIONS -// // (read from opposite array due to previous swap operation) -// //........................................................................ -// // even -// f2 = distodd[n]; -// f4 = distodd[Np+n]; -// f6 = distodd[2*Np+n]; -// f8 = distodd[3*Np+n]; -// f10 = distodd[4*Np+n]; -// f12 = distodd[5*Np+n]; -// f14 = distodd[6*Np+n]; -// f16 = distodd[7*Np+n]; -// f18 = distodd[8*Np+n]; -// f0 = disteven[n]; -// // odd -// f1 = disteven[Np+n]; -// f3 = disteven[2*Np+n]; -// f5 = disteven[3*Np+n]; -// f7 = disteven[4*Np+n]; -// f9 = disteven[5*Np+n]; -// f11 = disteven[6*Np+n]; -// f13 = disteven[7*Np+n]; -// f15 = disteven[8*Np+n]; -// f17 = disteven[9*Np+n]; -// -// //........................................................................ -// // WRITE THE DISTRIBUTIONS -// // even -// //disteven[n] = f0; -// disteven[Np+n] = f2; -// disteven[2*Np+n] = f4; -// disteven[3*Np+n] = f6; -// disteven[4*Np+n] = f8; -// disteven[5*Np+n] = f10; -// disteven[6*Np+n] = f12; -// disteven[7*Np+n] = f14; -// disteven[8*Np+n] = f16; -// disteven[9*Np+n] = f18; -// // odd -// distodd[n] = f1; -// distodd[Np+n] = f3; -// distodd[2*Np+n] = f5; -// distodd[3*Np+n] = f7; -// distodd[4*Np+n] = f9; -// distodd[5*Np+n] = f11; -// distodd[6*Np+n] = f13; -// distodd[7*Np+n] = f15; -// distodd[8*Np+n] = f17; -// //........................................................................ -// } -// } -// } -//} - - __global__ void dvc_ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, - int Nx, int Ny, int Nz, int outlet) + int Nx, int Ny, int Nz, int outlet) { int n,N; // distributions @@ -1385,27 +817,26 @@ extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list, int start, int count, dou } //************************************************************************* extern "C" void ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz){ - // dvc_ScaLBL_D3Q19_Init<<>>(ID, f_even, f_odd, Nx, Ny, Nz); - dvc_ScaLBL_D3Q19_Init_Simple<<>>(ID, f_even, f_odd, Nx, Ny, Nz); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); - } + dvc_ScaLBL_D3Q19_Init<<>>(ID, f_even, f_odd, Nx, Ny, Nz); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); + } } extern "C" void ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz){ dvc_ScaLBL_D3Q19_Swap<<>>(ID, disteven, distodd, Nx, Ny, Nz); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_Swap: %s \n",cudaGetErrorString(err)); - } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Swap: %s \n",cudaGetErrorString(err)); + } } extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, double *distodd, int Np) { const int Q = 9; - // cudaStream_t streams[Q]; +// cudaStream_t streams[Q]; // Launch the swap operation as different kernels for (int q=0; q>>(neighborList, disteven, distodd, Np, q); @@ -1418,39 +849,14 @@ extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, d } } -//extern "C" void ScaLBL_D3Q19_AAeven_Compact(char * ID, double *d_disteven, double *d_distodd, int Np) { -// dvc_ScaLBL_AAeven_Compact<<>>(ID, d_disteven, d_distodd,Np); -// cudaError_t err = cudaGetLastError(); -// if (cudaSuccess != err){ -// printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); -// } -//} - -extern "C" void ScaLBL_D3Q19_AAeven_Compact(char * ID, double *d_dist, int Np) { - dvc_ScaLBL_AAeven_Compact<<>>(ID, d_dist, Np); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); - } -} - -extern "C" void ScaLBL_D3Q19_AAodd_Compact(char * ID, int *d_neighborList, double *d_dist, int Np) { - dvc_ScaLBL_AAodd_Compact<<>>(ID,d_neighborList, d_dist,Np); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); - } -} - - extern "C" void ScaLBL_D3Q19_Velocity(char *ID, double *disteven, double *distodd, double *vel, int Nx, int \ - Ny, int Nz){ +Ny, int Nz){ - dvc_ScaLBL_D3Q19_Velocity<<>>(ID, disteven, distodd, vel, Nx, Ny, Nz); + dvc_ScaLBL_D3Q19_Velocity<<>>(ID, disteven, distodd, vel, Nx, Ny, Nz); } extern "C" void ScaLBL_D3Q19_Pressure(char *ID, double *disteven, double *distodd, double *Pressure, - int Nx, int Ny, int Nz){ - dvc_ScaLBL_D3Q19_Pressure<<< NBLOCKS,NTHREADS >>>(ID, disteven, distodd, Pressure, Nx, Ny, Nz); + int Nx, int Ny, int Nz){ + dvc_ScaLBL_D3Q19_Pressure<<< NBLOCKS,NTHREADS >>>(ID, disteven, distodd, Pressure, Nx, Ny, Nz); } extern "C" void ScaLBL_D3Q19_Velocity_BC_z(double *disteven, double *distodd, double uz,int Nx, int Ny, int Nz){ @@ -1463,9 +869,6 @@ extern "C" void ScaLBL_D3Q19_Velocity_BC_Z(double *disteven, double *distodd, do dvc_D3Q19_Velocity_BC_Z<<>>(disteven, distodd, uz, Nx, Ny, Nz, outlet); } - - - extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux,int Nx, int Ny, int Nz){ int GRID = Nx*Ny / 512 + 1; @@ -1529,13 +932,13 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, doub extern "C" double deviceReduce(double *in, double* out, int N) { - int threads = 512; - int blocks = min((N + threads - 1) / threads, 1024); + int threads = 512; + int blocks = min((N + threads - 1) / threads, 1024); - double sum = 0.f; - deviceReduceKernel<<>>(in, out, N); - deviceReduceKernel<<<1, 1024>>>(out, out, blocks); - return sum; + double sum = 0.f; + deviceReduceKernel<<>>(in, out, N); + deviceReduceKernel<<<1, 1024>>>(out, out, blocks); + return sum; } extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, double din, int Nx, int Ny, int Nz){ @@ -1543,7 +946,7 @@ extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, do dvc_ScaLBL_D3Q19_Pressure_BC_z<<>>(disteven, distodd, din, Nx, Ny, Nz); } extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, - int Nx, int Ny, int Nz, int outlet){ + int Nx, int Ny, int Nz, int outlet){ int GRID = Nx*Ny / 512 + 1; dvc_ScaLBL_D3Q19_Pressure_BC_Z<<>>(disteven, distodd, dout, Nx, Ny, Nz, outlet); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 77e08021..de7cb14b 100755 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -20,7 +20,7 @@ ADD_LBPM_EXECUTABLE( lbpm_inkbottle_pp ) ADD_LBPM_EXECUTABLE( lbpm_plates_pp ) ADD_LBPM_EXECUTABLE( lbpm_squaretube_pp ) ADD_LBPM_EXECUTABLE( lbpm_BlobAnalysis ) -ADD_LBPM_EXECUTABLE( TestBubble ) +#ADD_LBPM_EXECUTABLE( TestBubble ) ADD_LBPM_EXECUTABLE( ComponentLabel ) ADD_LBPM_EXECUTABLE( ColorToBinary ) ADD_LBPM_EXECUTABLE( BlobAnalysis ) @@ -33,7 +33,7 @@ CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/cylindertest ${CMAKE_CURRENT_BINARY_ # Add the tests ADD_LBPM_TEST( pmmc_cylinder ) -ADD_LBPM_TEST( TestBubble ) +#ADD_LBPM_TEST( TestBubble ) ADD_LBPM_TEST( TestTorus ) ADD_LBPM_TEST( TestFluxBC ) ADD_LBPM_TEST( TestInterfaceSpeed ) diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index df1b1fb2..409debb7 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -662,7 +662,6 @@ int main(int argc, char **argv) //ScaLBL_Color_BC_Z(Phi,Den,A_even,A_odd,B_even,B_odd,Nx,Ny,Nz); ScaLBL_SetSlice_z(Phi,-1.0,Nx,Ny,Nz,Nz-1); } - // Set dynamic pressure boundary conditions double dp, slope; dp = slope = 0.0; @@ -687,18 +686,6 @@ int main(int argc, char **argv) ScaLBL_D3Q19_Pressure(ID,f_even,f_odd,Pressure,Nx,Ny,Nz); ScaLBL_D3Q19_Velocity(ID,f_even,f_odd,Velocity,Nx,Ny,Nz); - /* if (BoundaryCondition==1 && Mask.kproc == 0){ - for (n=Nx*Ny; n<2*Nx*Ny; n++){ - if (Mask.id[n]>0 && 3.0*Pressure[n] != din) printf("Inlet pBC error: %f != %f \n",3.0*Pressure[n],din); - } - } - - if (BoundaryCondition==1 && Mask.kproc == nprocz-1){ - for (n=Nx*Ny*(Nz-2); n0 && 3.0*Pressure[n] != dout) printf("Outlet pBC error: %f != %f \n",3.0*Pressure[n],din); - } - } - */ //........................................................................... // Copy the phase indicator field for the earlier timestep ScaLBL_DeviceBarrier();