/*
Copyright 2013--2018 James E. McClure, Virginia Polytechnic & State University
Copyright Equnior ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see .
*/
#include
#include "hip/hip_runtime.h"
#include "hip/hip_cooperative_groups.h"
#define NBLOCKS 1024
#define NTHREADS 512
/*
1. constants that are known at compile time should be defined using preprocessor macros (e.g. #define) or via C/C++ const variables at global/file scope.
2. Usage of __constant__ memory may be beneficial for programs who use certain values that don't change for the duration of the kernel and for which certain access patterns are present (e.g. all threads access the same value at the same time). This is not better or faster than constants that satisfy the requirements of item 1 above.
3. If the number of choices to be made by a program are relatively small in number, and these choices affect kernel execution, one possible approach for additional compile-time optimization would be to use templated code/kernels
*/
__constant__ __device__ double mrt_V1=0.05263157894736842;
__constant__ __device__ double mrt_V2=0.012531328320802;
__constant__ __device__ double mrt_V3=0.04761904761904762;
__constant__ __device__ double mrt_V4=0.004594820384294068;
__constant__ __device__ double mrt_V5=0.01587301587301587;
__constant__ __device__ double mrt_V6=0.0555555555555555555555555;
__constant__ __device__ double mrt_V7=0.02777777777777778;
__constant__ __device__ double mrt_V8=0.08333333333333333;
__constant__ __device__ double mrt_V9=0.003341687552213868;
__constant__ __device__ double mrt_V10=0.003968253968253968;
__constant__ __device__ double mrt_V11=0.01388888888888889;
__constant__ __device__ double mrt_V12=0.04166666666666666;
// functionality for parallel reduction in Flux BC routines -- probably should be re-factored to another location
// functions copied from https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
//__shared__ double Transform[722]=
// {};
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
using namespace cooperative_groups;
__device__ double reduce_sum(thread_group g, double *temp, double val)
{
int lane = g.thread_rank();
// Each iteration halves the number of active threads
// Each thread adds its partial sum[i] to sum[lane+i]
for (int i = g.size() / 2; i > 0; i /= 2)
{
temp[lane] = val;
g.sync(); // wait for all threads to store
if(lane 0; offset /= 2)
val += __shfl_down_sync(0xFFFFFFFF, val, offset, 32);
return val;
#else
short int id = threadIdx.x % warpSize;
__shared__ double tmp[64];
tmp[id] = val;
__syncthreads();
if ( warpSize == 64) {
tmp[id] += tmp[id+32]; __syncthreads();
}
tmp[id] += tmp[id+16]; __syncthreads();
tmp[id] += tmp[id+8]; __syncthreads();
tmp[id] += tmp[id+4]; __syncthreads();
tmp[id] += tmp[id+2]; __syncthreads();
tmp[id] += tmp[id+1]; __syncthreads();
return tmp[0];
#endif
}
__inline__ __device__
double blockReduceSum(double val) {
static __shared__ double shared[32]; // Shared mem for 32 partial sums
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val); // Each warp performs partial reduction
if (lane==0) shared[wid]=val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
return val;
}
__global__ void deviceReduceKernel(double *in, double* out, int N) {
double sum = 0;
//reduce multiple elements per thread
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < N;
i += blockDim.x * gridDim.x) {
sum += in[i];
}
sum = blockReduceSum(sum);
if (threadIdx.x==0)
out[blockIdx.x]=sum;
}
__global__ void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N){
//....................................................................................
// Pack distribution q into the send buffer for the listed lattice sites
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int idx,n;
idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx 0 ){
f_even[n] = 0.3333333333333333;
f_odd[n] = 0.055555555555555555; //double(100*n)+1.f;
f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f;
f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f;
f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f;
f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f;
f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f;
f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f;
f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f;
f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f;
f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f;
f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f;
f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f;
f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f;
f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f;
f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f;
f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f;
f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f;
f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f;
}
else{
for(int q=0; q<9; q++){
f_even[q*N+n] = -1.0;
f_odd[q*N+n] = -1.0;
}
f_even[9*N+n] = -1.0;
}
}
}
}
__global__ void dvc_ScaLBL_D3Q19_AA_Init(double *f_even, double *f_odd, int Np)
{
int n;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s 10Np => odd part of dist)
fq = dist[nread]; // reading the f1 data into register fq
//fp = dist[10*Np+n];
rho += fq;
m1 -= 11.0*fq;
m2 -= 4.0*fq;
jx = fq;
m4 = -4.0*fq;
m9 = 2.0*fq;
m10 = -4.0*fq;
// f2 = dist[10*Np+n];
nread = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist)
fq = dist[nread]; // reading the f2 data into register fq
//fq = dist[Np+n];
rho += fq;
m1 -= 11.0*(fq);
m2 -= 4.0*(fq);
jx -= fq;
m4 += 4.0*(fq);
m9 += 2.0*(fq);
m10 -= 4.0*(fq);
// q=3
nread = neighborList[n+2*Np]; // neighbor 4
fq = dist[nread];
//fq = dist[11*Np+n];
rho += fq;
m1 -= 11.0*fq;
m2 -= 4.0*fq;
jy = fq;
m6 = -4.0*fq;
m9 -= fq;
m10 += 2.0*fq;
m11 = fq;
m12 = -2.0*fq;
// q = 4
nread = neighborList[n+3*Np]; // neighbor 3
fq = dist[nread];
//fq = dist[2*Np+n];
rho+= fq;
m1 -= 11.0*fq;
m2 -= 4.0*fq;
jy -= fq;
m6 += 4.0*fq;
m9 -= fq;
m10 += 2.0*fq;
m11 += fq;
m12 -= 2.0*fq;
// q=5
nread = neighborList[n+4*Np];
fq = dist[nread];
//fq = dist[12*Np+n];
rho += fq;
m1 -= 11.0*fq;
m2 -= 4.0*fq;
jz = fq;
m8 = -4.0*fq;
m9 -= fq;
m10 += 2.0*fq;
m11 -= fq;
m12 += 2.0*fq;
// q = 6
nread = neighborList[n+5*Np];
fq = dist[nread];
//fq = dist[3*Np+n];
rho+= fq;
m1 -= 11.0*fq;
m2 -= 4.0*fq;
jz -= fq;
m8 += 4.0*fq;
m9 -= fq;
m10 += 2.0*fq;
m11 -= fq;
m12 += 2.0*fq;
// q=7
nread = neighborList[n+6*Np];
fq = dist[nread];
//fq = dist[13*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx += fq;
m4 += fq;
jy += fq;
m6 += fq;
m9 += fq;
m10 += fq;
m11 += fq;
m12 += fq;
m13 = fq;
m16 = fq;
m17 = -fq;
// q = 8
nread = neighborList[n+7*Np];
fq = dist[nread];
//fq = dist[4*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx -= fq;
m4 -= fq;
jy -= fq;
m6 -= fq;
m9 += fq;
m10 += fq;
m11 += fq;
m12 += fq;
m13 += fq;
m16 -= fq;
m17 += fq;
// q=9
nread = neighborList[n+8*Np];
fq = dist[nread];
//fq = dist[14*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx += fq;
m4 += fq;
jy -= fq;
m6 -= fq;
m9 += fq;
m10 += fq;
m11 += fq;
m12 += fq;
m13 -= fq;
m16 += fq;
m17 += fq;
// q = 10
nread = neighborList[n+9*Np];
fq = dist[nread];
//fq = dist[5*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx -= fq;
m4 -= fq;
jy += fq;
m6 += fq;
m9 += fq;
m10 += fq;
m11 += fq;
m12 += fq;
m13 -= fq;
m16 -= fq;
m17 -= fq;
// q=11
nread = neighborList[n+10*Np];
fq = dist[nread];
//fq = dist[15*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx += fq;
m4 += fq;
jz += fq;
m8 += fq;
m9 += fq;
m10 += fq;
m11 -= fq;
m12 -= fq;
m15 = fq;
m16 -= fq;
m18 = fq;
// q=12
nread = neighborList[n+11*Np];
fq = dist[nread];
//fq = dist[6*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx -= fq;
m4 -= fq;
jz -= fq;
m8 -= fq;
m9 += fq;
m10 += fq;
m11 -= fq;
m12 -= fq;
m15 += fq;
m16 += fq;
m18 -= fq;
// q=13
nread = neighborList[n+12*Np];
fq = dist[nread];
//fq = dist[16*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx += fq;
m4 += fq;
jz -= fq;
m8 -= fq;
m9 += fq;
m10 += fq;
m11 -= fq;
m12 -= fq;
m15 -= fq;
m16 -= fq;
m18 -= fq;
// q=14
nread = neighborList[n+13*Np];
fq = dist[nread];
//fq = dist[7*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jx -= fq;
m4 -= fq;
jz += fq;
m8 += fq;
m9 += fq;
m10 += fq;
m11 -= fq;
m12 -= fq;
m15 -= fq;
m16 += fq;
m18 += fq;
// q=15
nread = neighborList[n+14*Np];
fq = dist[nread];
//fq = dist[17*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jy += fq;
m6 += fq;
jz += fq;
m8 += fq;
m9 -= 2.0*fq;
m10 -= 2.0*fq;
m14 = fq;
m17 += fq;
m18 -= fq;
// q=16
nread = neighborList[n+15*Np];
fq = dist[nread];
//fq = dist[8*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jy -= fq;
m6 -= fq;
jz -= fq;
m8 -= fq;
m9 -= 2.0*fq;
m10 -= 2.0*fq;
m14 += fq;
m17 -= fq;
m18 += fq;
// q=17
//fq = dist[18*Np+n];
nread = neighborList[n+16*Np];
fq = dist[nread];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jy += fq;
m6 += fq;
jz -= fq;
m8 -= fq;
m9 -= 2.0*fq;
m10 -= 2.0*fq;
m14 -= fq;
m17 += fq;
m18 += fq;
// q=18
nread = neighborList[n+17*Np];
fq = dist[nread];
//fq = dist[9*Np+n];
rho += fq;
m1 += 8.0*fq;
m2 += fq;
jy -= fq;
m6 -= fq;
jz += fq;
m8 += fq;
m9 -= 2.0*fq;
m10 -= 2.0*fq;
m14 -= fq;
m17 -= fq;
m18 -= fq;
//..............incorporate external force................................................
//..............carry out relaxation process...............................................
m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho - 11*rho) - m1);
m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho) - m2);
m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4);
m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6);
m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8);
m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho) - m9);
m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/rho) - m10);
m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho) - m11);
m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/rho) - m12);
m13 = m13 + rlx_setA*((jx*jy/rho) - m13);
m14 = m14 + rlx_setA*((jy*jz/rho) - m14);
m15 = m15 + rlx_setA*((jx*jz/rho) - m15);
m16 = m16 + rlx_setB*( - m16);
m17 = m17 + rlx_setB*( - m17);
m18 = m18 + rlx_setB*( - m18);
//.......................................................................................................
//.................inverse transformation......................................................
// q=0
fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2;
dist[n] = fq;
// q = 1
fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10)+0.16666666*Fx;
nread = neighborList[n+Np];
dist[nread] = fq;
// q=2
fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx;
nread = neighborList[n];
dist[nread] = fq;
// q = 3
fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy;
nread = neighborList[n+3*Np];
dist[nread] = fq;
// q = 4
fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy;
nread = neighborList[n+2*Np];
dist[nread] = fq;
// q = 5
fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz;
nread = neighborList[n+5*Np];
dist[nread] = fq;
// q = 6
fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz;
nread = neighborList[n+4*Np];
dist[nread] = fq;
// q = 7
fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+
mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy);
nread = neighborList[n+7*Np];
dist[nread] = fq;
// q = 8
fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11
+mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy);
nread = neighborList[n+6*Np];
dist[nread] = fq;
// q = 9
fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+
mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy);
nread = neighborList[n+9*Np];
dist[nread] = fq;
// q = 10
fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+
mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy);
nread = neighborList[n+8*Np];
dist[nread] = fq;
// q = 11
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)
+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11
-mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz);
nread = neighborList[n+11*Np];
dist[nread] = fq;
// q = 12
fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+
mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz);
nread = neighborList[n+10*Np];
dist[nread]= fq;
// q = 13
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)
+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11
-mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz);
nread = neighborList[n+13*Np];
dist[nread] = fq;
// q= 14
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)
+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11
-mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz);
nread = neighborList[n+12*Np];
dist[nread] = fq;
// q = 15
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)
-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz);
nread = neighborList[n+15*Np];
dist[nread] = fq;
// q = 16
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)
-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz);
nread = neighborList[n+14*Np];
dist[nread] = fq;
// q = 17
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)
-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz);
nread = neighborList[n+17*Np];
dist[nread] = fq;
// q = 18
fq = mrt_V1*rho+mrt_V9*m1
+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)
-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz);
nread = neighborList[n+16*Np];
dist[nread] = fq;
}
}
}
//__launch_bounds__(512,1)
__global__ void
__launch_bounds__(512,1)
dvc_ScaLBL_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz) {
int n;
double fq;
// conserved momemnts
double rho,jx,jy,jz;
// non-conserved moments
double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s 0){
//.......Back out the 3-D indices for node n..............
k = n/(Nx*Ny);
j = (n-Nx*Ny*k)/Nx;
i = n-Nx*Ny*k-Nx*j;
//........................................................................
// Retrieve even distributions from the local node (swap convention)
// f0 = disteven[n]; // Does not particupate in streaming
f1 = distodd[n];
f3 = distodd[N+n];
f5 = distodd[2*N+n];
f7 = distodd[3*N+n];
f9 = distodd[4*N+n];
f11 = distodd[5*N+n];
f13 = distodd[6*N+n];
f15 = distodd[7*N+n];
f17 = distodd[8*N+n];
//........................................................................
//........................................................................
// Retrieve odd distributions from neighboring nodes (swap convention)
//........................................................................
nn = n+1; // neighbor index (pull convention)
if (!(i+1 0.0){
distodd[n] = f2;
disteven[N+nn] = f1;
}
//}
//........................................................................
nn = n+Nx; // neighbor index (pull convention)
if (!(j+1 0.0){
distodd[N+n] = f4;
disteven[2*N+nn] = f3;
// }
}
//........................................................................
nn = n+Nx*Ny; // neighbor index (pull convention)
if (!(k+1 0.0){
distodd[2*N+n] = f6;
disteven[3*N+nn] = f5;
// }
}
//........................................................................
nn = n+Nx+1; // neighbor index (pull convention)
if (!(i+1 0.0){
distodd[3*N+n] = f8;
disteven[4*N+nn] = f7;
// }
}
//........................................................................
nn = n-Nx+1; // neighbor index (pull convention)
if (!(i+1 0.0){
distodd[4*N+n] = f10;
disteven[5*N+nn] = f9;
// }
}
//........................................................................
nn = n+Nx*Ny+1; // neighbor index (pull convention)
if (!(i+1 0.0){
distodd[5*N+n] = f12;
disteven[6*N+nn] = f11;
// }
}
//........................................................................
nn = n-Nx*Ny+1; // neighbor index (pull convention)
if (!(i+1 0.0){
distodd[6*N+n] = f14;
disteven[7*N+nn] = f13;
// }
}
//........................................................................
nn = n+Nx*Ny+Nx; // neighbor index (pull convention)
if (!(j+1 0.0){
distodd[7*N+n] = f16;
disteven[8*N+nn] = f15;
// }
}
//........................................................................
nn = n-Nx*Ny+Nx; // neighbor index (pull convention)
if (!(j+1 0.0){
distodd[8*N+n] = f18;
disteven[9*N+nn] = f17;
// }
}
//........................................................................
}
}
}
}
__global__ void dvc_ScaLBL_D3Q19_Momentum(double *dist, double *vel, int N)
{
int n;
// distributions
double f1,f2,f3,f4,f5,f6,f7,f8,f9;
double f10,f11,f12,f13,f14,f15,f16,f17,f18;
double vx,vy,vz;
int S = N/NBLOCKS/NTHREADS + 1;
for (int s=0; s 0 ){
f_even[n] = 0 + 0.01*0;
f_odd[n] = 0+ 0.01*1; //double(100*n)+1.f;
f_even[N+n] = 1+ 0.01*2; //double(100*n)+2.f;
f_odd[N+n] = 1+ 0.01*3; //double(100*n)+3.f;
f_even[2*N+n] = 2+ 0.01*4; //double(100*n)+4.f;
f_odd[2*N+n] = 2+ 0.01*5; //double(100*n)+5.f;
f_even[3*N+n] = 3+ 0.01*6; //double(100*n)+6.f;
f_odd[3*N+n] = 3+ 0.01*7; //double(100*n)+7.f;
f_even[4*N+n] = 4+ 0.01*8; //double(100*n)+8.f;
f_odd[4*N+n] = 4+ 0.01*9; //double(100*n)+9.f;
f_even[5*N+n] = 5+ 0.01*10; //double(100*n)+10.f;
f_odd[5*N+n] = 5+ 0.01*11; //double(100*n)+11.f;
f_even[6*N+n] = 6+ 0.01*12; //double(100*n)+12.f;
f_odd[6*N+n] = 6+ 0.01*13; //double(100*n)+13.f;
f_even[7*N+n] = 7+ 0.01*14; //double(100*n)+14.f;
f_odd[7*N+n] = 7+ 0.01*15; //double(100*n)+15.f;
f_even[8*N+n] = 8+ 0.01*16; //double(100*n)+16.f;
f_odd[8*N+n] = 8+ 0.01*17; //double(100*n)+17.f;
f_even[9*N+n] = 9+ 0.01*18; //double(100*n)+18.f;
}
else{
for(int q=0; q<9; q++){
f_even[q*N+n] = -1.0;
f_odd[q*N+n] = -1.0;
}
f_even[9*N+n] = -1.0;
}
}
}
}
//*************************************************************************
//extern "C" void ScaLBL_D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int *list, int start, int count,
// int *d3q19_recvlist, int Nx, int Ny, int Nz){
// int GRID = count / 512 + 1;
// dvc_ScaLBL_D3Q19_Unpack <<>>(q, Cqx, Cqy, Cqz, list, start, count, d3q19_recvlist, Nx, Ny, Nz);
//}
extern "C" void ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_Pack <<>>(q, list, start, count, sendbuf, dist, N);
}
extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_Unpack <<>>(q, list, start, count, recvbuf, dist, N);
}
//*************************************************************************
extern "C" void ScaLBL_D3Q19_AA_Init(double *f_even, double *f_odd, int Np){
dvc_ScaLBL_D3Q19_AA_Init<<>>(f_even, f_odd, Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AA_Init: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Init(double *dist, int Np){
dvc_ScaLBL_D3Q19_Init<<>>(dist, Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AA_Init: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz){
dvc_ScaLBL_D3Q19_Swap<<>>(ID, disteven, distodd, Nx, Ny, Nz);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Swap: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, double *distodd, int Np)
{
const int Q = 9;
// hipStream_t streams[Q];
// Launch the swap operation as different kernels
for (int q=0; q>>(neighborList, disteven, distodd, Np, q);
}
// cpu should wait for all kernels to finish (to avoid launch of dependent kernels)
//hipDeviceSynchronize();
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Swap: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_AAeven_Compact( double *d_dist, int Np) {
hipFuncSetCacheConfig( (void*) dvc_ScaLBL_AAeven_Compact, hipFuncCachePreferL1);
dvc_ScaLBL_AAeven_Compact<<>>( d_dist, Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_AAodd_Compact( int *d_neighborList, double *d_dist, int Np) {
hipFuncSetCacheConfig( (void*) dvc_ScaLBL_AAodd_Compact, hipFuncCachePreferL1);
dvc_ScaLBL_AAodd_Compact<<>>(d_neighborList, d_dist,Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Momentum(double *dist, double *vel, int Np){
dvc_ScaLBL_D3Q19_Momentum<<>>(dist, vel, Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Velocity: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Pressure(double *fq, double *Pressure, int Np){
dvc_ScaLBL_D3Q19_Pressure<<< NBLOCKS,NTHREADS >>>(fq, Pressure, Np);
}
extern "C" void ScaLBL_D3Q19_Velocity_BC_z(double *disteven, double *distodd, double uz,int Nx, int Ny, int Nz){
int GRID = Nx*Ny / 512 + 1;
dvc_D3Q19_Velocity_BC_z<<>>(disteven,distodd, uz, Nx, Ny, Nz);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Velocity_BC_z: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Velocity_BC_Z(double *disteven, double *distodd, double uz, int Nx, int Ny, int Nz, int outlet){
int GRID = Nx*Ny / 512 + 1;
dvc_D3Q19_Velocity_BC_Z<<>>(disteven, distodd, uz, Nx, Ny, Nz, outlet);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Velocity_BC_Z: %s \n",hipGetErrorString(err));
}
}
extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux,int Nx, int Ny, int Nz){
int GRID = Nx*Ny / 512 + 1;
// IMPORTANT -- this routine may fail if Nx*Ny > 512*512
if (Nx*Ny > 512*512){
printf("WARNING (ScaLBL_D3Q19_Flux_BC_z): CUDA reduction operation may fail if Nx*Ny > 512*512");
}
// Allocate memory to store the sums
double din;
double sum[1];
double *dvcsum;
int sharedBytes = NTHREADS*sizeof(double);
hipMalloc((void **)&dvcsum,sizeof(double)*Nx*Ny);
hipMemset(dvcsum,0,sizeof(double)*Nx*Ny);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Flux_BC_z (memory allocation): %s \n",hipGetErrorString(err));
}
// compute the local flux and store the result
dvc_D3Q19_Flux_BC_z<<>>(disteven, distodd, flux, dvcsum, Nx, Ny, Nz);
err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Flux_BC_z (flux calculation, step 1): %s \n",hipGetErrorString(err));
}
// Now read the total flux
hipMemcpy(&sum[0],dvcsum,sizeof(double),hipMemcpyDeviceToHost);
din=sum[0];
err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_Flux_BC_z (flux calculation, step 2): %s \n",hipGetErrorString(err));
}
// free the memory needed for reduction
hipFree(dvcsum);
return din;
}
extern "C" void ScaLBL_D3Q19_AAeven_Pressure_BC_z(int *list, double *dist, double din, int count, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_AAeven_Pressure_BC_z<<>>(list, dist, din, count, N);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_Pressure_BC_z (kernel): %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_AAeven_Pressure_BC_Z(int *list, double *dist, double dout, int count, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_AAeven_Pressure_BC_Z<<>>(list, dist, dout, count, N);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_Pressure_BC_Z (kernel): %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_z(int *neighborList, int *list, double *dist, double din, int count, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_AAodd_Pressure_BC_z<<>>(neighborList, list, dist, din, count, N);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAodd_Pressure_BC_z (kernel): %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_Z(int *neighborList, int *list, double *dist, double dout, int count, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_AAodd_Pressure_BC_Z<<>>(neighborList, list, dist, dout, count, N);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAodd_Pressure_BC_Z (kernel): %s \n",hipGetErrorString(err));
}
}
extern "C" double ScaLBL_D3Q19_AAeven_Flux_BC_z(int *list, double *dist, double flux, double area,
int count, int N){
int GRID = count / 512 + 1;
// IMPORTANT -- this routine may fail if Nx*Ny > 512*512
if (count > 512*512){
printf("WARNING (ScaLBL_D3Q19_Flux_BC_Z): CUDA reduction operation may fail if count > 512*512");
}
// Allocate memory to store the sums
double din;
double sum[1];
double *dvcsum;
hipMalloc((void **)&dvcsum,sizeof(double)*count);
hipMemset(dvcsum,0,sizeof(double)*count);
int sharedBytes = 512*sizeof(double);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_Flux_BC_z (memory allocation): %s \n",hipGetErrorString(err));
}
// compute the local flux and store the result
dvc_ScaLBL_D3Q19_AAeven_Flux_BC_z<<>>(list, dist, flux, area, dvcsum, count, N);
err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_Flux_BC_z (kernel): %s \n",hipGetErrorString(err));
}
// Now read the total flux
hipMemcpy(&sum[0],dvcsum,sizeof(double),hipMemcpyDeviceToHost);
din=sum[0];
err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_Flux_BC_z (reduction): %s \n",hipGetErrorString(err));
}
// free the memory needed for reduction
hipFree(dvcsum);
return din;
}
extern "C" double ScaLBL_D3Q19_AAodd_Flux_BC_z(int *neighborList, int *list, double *dist, double flux,
double area, int count, int N){
int GRID = count / 512 + 1;
// IMPORTANT -- this routine may fail if Nx*Ny > 512*512
if (count > 512*512){
printf("WARNING (ScaLBL_D3Q19_AAodd_Flux_BC_z): CUDA reduction operation may fail if count > 512*512");
}
// Allocate memory to store the sums
double din;
double sum[1];
double *dvcsum;
hipMalloc((void **)&dvcsum,sizeof(double)*count);
hipMemset(dvcsum,0,sizeof(double)*count);
int sharedBytes = 512*sizeof(double);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAodd_Flux_BC_z (memory allocation): %s \n",hipGetErrorString(err));
}
// compute the local flux and store the result
dvc_ScaLBL_D3Q19_AAodd_Flux_BC_z<<>>(neighborList, list, dist, flux, area, dvcsum, count, N);
err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAodd_Flux_BC_z (kernel): %s \n",hipGetErrorString(err));
}
// Now read the total flux
hipMemcpy(&sum[0],dvcsum,sizeof(double),hipMemcpyDeviceToHost);
din=sum[0];
err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAodd_Flux_BC_z (reduction): %s \n",hipGetErrorString(err));
}
// free the memory needed for reduction
hipFree(dvcsum);
return din;
}
extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, double flux, int Nx, int Ny, int Nz, int outlet){
int GRID = Nx*Ny / 512 + 1;
// IMPORTANT -- this routine may fail if Nx*Ny > 512*512
if (Nx*Ny > 512*512){
printf("WARNING (ScaLBL_D3Q19_Flux_BC_Z): CUDA reduction operation may fail if Nx*Ny > 512*512");
}
// Allocate memory to store the sums
double dout;
double sum[1];
double *dvcsum;
hipMalloc((void **)&dvcsum,sizeof(double)*Nx*Ny);
hipMemset(dvcsum,0,sizeof(double)*Nx*Ny);
// compute the local flux and store the result
dvc_D3Q19_Flux_BC_Z<<>>(disteven, distodd, flux, dvcsum, Nx, Ny, Nz, outlet);
// Now read the total flux
hipMemcpy(&sum[0],dvcsum,sizeof(double),hipMemcpyDeviceToHost);
// free the memory needed for reduction
dout = sum[0];
hipFree(dvcsum);
return dout;
}
extern "C" void ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, int Np){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_Reflection_BC_z<<>>(list, dist, count, Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("HIP error in ScaLBL_D3Q19_Reflection_BC_z (kernel): %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, int Np){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q19_Reflection_BC_Z<<>>(list, dist, count, Np);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("HIP error in ScaLBL_D3Q19_Reflection_BC_Z (kernel): %s \n",hipGetErrorString(err));
}
}
extern "C" double deviceReduce(double *in, double* out, int N) {
int threads = 512;
int blocks = min((N + threads - 1) / threads, 1024);
double sum = 0.f;
deviceReduceKernel<<>>(in, out, N);
deviceReduceKernel<<<1, 1024>>>(out, out, blocks);
return sum;
}
//
//extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(int *list, double *dist, double dout, int count, int Np){
// int GRID = count / 512 + 1;
// dvc_ScaLBL_D3Q19_Pressure_BC_Z<<>>(disteven, distodd, dout, Nx, Ny, Nz, outlet);
//}
extern "C" void ScaLBL_D3Q19_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx,
double Fy, double Fz){
dvc_ScaLBL_AAeven_MRT<<>>(dist,start,finish,Np,rlx_setA,rlx_setB,Fx,Fy,Fz);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_MRT: %s \n",hipGetErrorString(err));
}
}
extern "C" void ScaLBL_D3Q19_AAodd_MRT(int *neighborlist, double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx,
double Fy, double Fz){
dvc_ScaLBL_AAodd_MRT<<>>(neighborlist,dist,start,finish,Np,rlx_setA,rlx_setB,Fx,Fy,Fz);
hipError_t err = hipGetLastError();
if (hipSuccess != err){
printf("CUDA error in ScaLBL_D3Q19_AAeven_MRT: %s \n",hipGetErrorString(err));
}
}