diff --git a/cpu/Color.cpp b/cpu/Color.cpp index 1629b202..49d5fbc9 100644 --- a/cpu/Color.cpp +++ b/cpu/Color.cpp @@ -103,13 +103,10 @@ extern "C" void ScaLBL_Color_InitDistance(char *ID, double *Den, double *Phi, do //************************************************************************* -extern "C" void ScaLBL_Color_BC_z(double *Phi, double *Den, double *Velocity, double *A_even, double *A_odd, +extern "C" void ScaLBL_Color_BC_z(double *Phi, double *Den, double *A_even, double *A_odd, double *B_even, double *B_odd, int Nx, int Ny, int Nz) { int i,j,k,n,N; - double na,nb,ux,uy,uz; - double a1,b1,a2,b2; - N = Nx*Ny*Nz; // Fill the inlet with component a for (k=0; k<1; k++){ @@ -129,7 +126,6 @@ extern "C" void ScaLBL_Color_BC_z(double *Phi, double *Den, double *Velocity, do Den[n] = 1.0; Den[N+n] = 0.0; - /* A_even[n] = 0.3333333333333333; A_odd[n] = 0.1111111111111111; A_even[N+n] = 0.1111111111111111; @@ -139,175 +135,32 @@ extern "C" void ScaLBL_Color_BC_z(double *Phi, double *Den, double *Velocity, do A_even[3*N+n] = 0.1111111111111111; B_even[n] = 0.0; - B_odd[n] = 0.0; + B_odd[n] = 0.0; B_even[N+n] = 0.0; - B_odd[N+n] = 0.0; + B_odd[N+n] = 0.0; B_even[2*N+n] = 0.0; B_odd[2*N+n] = 0.0; B_even[3*N+n] = 0.0; - */ - - na = 1.0; - nb = 0.0; - //....Load the flow velocity........... - ux = -1.0*Velocity[n]; - uy = -1.0*Velocity[N+n]; - uz = -1.0*Velocity[2*N+n]; - - A_even[n] = 0.3333333333333333*na; - B_even[n] = 0.3333333333333333*nb; - // Non-Stationary equilibrium distributions - //feq[0] = 0.1111111111111111*(1+4.5*ux); - //feq[1] = 0.1111111111111111*(1-4.5*ux); - //feq[2] = 0.1111111111111111*(1+4.5*uy); - //feq[3] = 0.1111111111111111*(1-4.5*uy); - //feq[4] = 0.1111111111111111*(1+4.5*uz); - //feq[5] = 0.1111111111111111*(1-4.5*uz); - - //............................................... - // q = 0,2,4 - // Cq = {1,0,0}, {0,1,0}, {0,0,1} - a1 = na*(0.1111111111111111*(1+4.5*ux)); - b1 = nb*(0.1111111111111111*(1+4.5*ux)); - a2 = na*(0.1111111111111111*(1-4.5*ux)); - b2 = nb*(0.1111111111111111*(1-4.5*ux)); - - A_odd[n] = a1; - A_even[N+n] = a2; - B_odd[n] = b1; - B_even[N+n] = b2; - //............................................... - // q = 2 - // Cq = {0,1,0} - a1 = na*(0.1111111111111111*(1+4.5*uy)); - b1 = nb*(0.1111111111111111*(1+4.5*uy)); - a2 = na*(0.1111111111111111*(1-4.5*uy)); - b2 = nb*(0.1111111111111111*(1-4.5*uy)); - - A_odd[N+n] = a1; - A_even[2*N+n] = a2; - B_odd[N+n] = b1; - B_even[2*N+n] = b2; - //............................................... - // q = 4 - // Cq = {0,0,1} - a1 = na*(0.1111111111111111*(1+4.5*uz)); - b1 = nb*(0.1111111111111111*(1+4.5*uz)); - a2 = na*(0.1111111111111111*(1-4.5*uz)); - b2 = nb*(0.1111111111111111*(1-4.5*uz)); - - A_odd[2*N+n] = a1; - A_even[3*N+n] = a2; - B_odd[2*N+n] = b1; - B_even[3*N+n] = b2; - - } } } } //************************************************************************* -extern "C" void ScaLBL_Color_BC_Z(double *Phi, double *Den, double *Velocity, double *A_even, double *A_odd, - double *B_even, double *B_odd, int Nx, int Ny, int Nz) +extern "C" void ScaLBL_Color_BC(int *list, int *Map, double *Phi, double *Den, double vA, double vB, int count, int Np) { - int i,j,k,n,N; - double na,nb,ux,uy,uz; - double a1,b1,a2,b2; - - N = Nx*Ny*Nz; + int idx,n,nm; // Fill the outlet with component b - for (k=Nz-3; k 0){ - - //.......Back out the 3-D indices for node n.............. - k = n/(Nx*Ny); - j = (n-Nx*Ny*k)/Nx; - i = n-Nx*Ny*k-Nx*j; - //........................................................................ - //........Get 1-D index for this thread.................... - // n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x; - //........................................................................ - // COMPUTE THE COLOR GRADIENT - //........................................................................ - //.................Read Phase Indicator Values............................ - //........................................................................ - nn = n-1; // neighbor index (get convention) - if (i-1<0) nn += Nx; // periodic BC along the x-boundary - f1 = phi[nn]; // get neighbor for phi - 1 - //........................................................................ - nn = n+1; // neighbor index (get convention) - if (!(i+1>> (ID, Den, Phi, das, dbs, Nx, Ny, Nz, S); -} -// ************************************************************************* -extern "C" void ScaLBL_D3Q19_ColorGradient(int nBlocks, int nthreads, int S, - char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz) -{ - ScaLBL_D3Q19_ColorGradient<<>>(ID, Phi, ColorGrad, Nx, Ny, Nz, S); -} -// ************************************************************************* -extern "C" void ColorCollide(int nBlocks, int nthreads, int S, - char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity, - double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz, - int Nx, int Ny, int Nz, bool pBC) -{ - ColorCollide<<>>(ID, f_even, f_odd, ColorGrad, Velocity, Nx, Ny, Nz, S, - rlxA, rlxB, alpha, beta, Fx, Fy, Fz, pBC); -} -// ************************************************************************* -extern "C" void ScaLBL_D3Q19_ColorCollide(int nBlocks, int nthreads, int S, - char *ID, double *f_even, double *f_odd, double *Phi, double *ColorGrad, - double *Velocity, int Nx, int Ny, int Nz,double rlxA, double rlxB, - double alpha, double beta, double Fx, double Fy, double Fz) -{ - ScaLBL_D3Q19_ColorCollide<<>>(ID, f_even, f_odd, Phi, ColorGrad, Velocity, Nx, Ny, Nz, S, - rlxA, rlxB, alpha, beta, Fx, Fy, Fz); -// bool pBC = false; -// ColorCollide<<>>(ID, f_even, f_odd, ColorGrad, Velocity, Nx, Ny, Nz, S, -// rlxA, rlxB, alpha, beta, Fx, Fy, Fz, pBC); + +//extern "C" void ScaLBL_D3Q19_AAeven_Color(double *dist, double *Aq, double *Bq, double *Den, double *Velocity, +// double *ColorGrad, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, +// double Fx, double Fy, double Fz, int start, int finish, int Np){ +extern "C" void ScaLBL_D3Q19_AAeven_Color(int *Map, double *dist, double *Aq, double *Bq, double *Den, double *Phi, + double *Velocity, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ + + int ijk,nn,n; + double fq; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + double m3,m5,m7; + double nA,nB; // number density + double a1,b1,a2,b2,nAB,delta; + double C,nx,ny,nz; //color gradient magnitude and direction + double ux,uy,uz; + double phi,tau,rho0,rlx_setA,rlx_setB; + + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + + + for (int n=start; n0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta; + + Aq[1*Np+n] = a1; + Bq[1*Np+n] = b1; + Aq[2*Np+n] = a2; + Bq[2*Np+n] = b2; + + //............................................... + // q = 2 + // Cq = {0,1,0} + delta = beta*nA*nB*nAB*0.1111111111111111*ny; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta; + + Aq[3*Np+n] = a1; + Bq[3*Np+n] = b1; + Aq[4*Np+n] = a2; + Bq[4*Np+n] = b2; + //............................................... + // q = 4 + // Cq = {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nz; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta; + + Aq[5*Np+n] = a1; + Bq[5*Np+n] = b1; + Aq[6*Np+n] = a2; + Bq[6*Np+n] = b2; + //............................................... + + } + } -// ************************************************************************* -extern "C" void DensityStreamD3Q7(int nBlocks, int nthreads, int S, - char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity, - double beta, int Nx, int Ny, int Nz, bool pBC) -{ - DensityStreamD3Q7<<>>(ID,Den,Copy,Phi,ColorGrad,Velocity,beta,Nx,Ny,Nz,pBC,S); -} -// ************************************************************************* -extern "C" void ScaLBL_ComputePhaseField(int nBlocks, int nthreads, int S, - char *ID, double *Phi, double *Copy, double *Den, int N) -{ - ScaLBL_ComputePhaseField<<>>(ID,Phi,Copy,Den,N,S); -} -// ************************************************************************* -extern "C" void ComputePressure(int nBlocks, int nthreads, int S, - char *ID, double *disteven, double *distodd, - double *Pressure, int Nx, int Ny, int Nz) -{ +//extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, double *dist, double *Aq, double *Bq, double *Den, double *Velocity, +// double *ColorGrad, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, +// double Fx, double Fy, double Fz, int start, int finish, int Np){ +extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den, + double *Phi, double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ + + int n,nn,ijk,nread; + int nr1,nr2,nr3,nr4,nr5,nr6; + int nr7,nr8,nr9,nr10; + int nr11,nr12,nr13,nr14; + //int nr15,nr16,nr17,nr18; + double fq; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + double m3,m5,m7; + double nA,nB; // number density + double a1,b1,a2,b2,nAB,delta; + double C,nx,ny,nz; //color gradient magnitude and direction + double ux,uy,uz; + double phi,tau,rho0,rlx_setA,rlx_setB; - ScaLBL_D3Q19_Pressure<<>>(ID,disteven,distodd,Pressure,Nx,Ny,Nz,S); + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + for (int n=start; n even part of dist) + //fq = dist[nread]; // reading the f2 data into register fq + nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nr2]; // reading the f2 data into register fq + rho += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + //nread = neighborList[n+2*Np]; // neighbor 4 + //fq = dist[nread]; + nr3 = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nr3]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + //nread = neighborList[n+3*Np]; // neighbor 3 + //fq = dist[nread]; + nr4 = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nr4]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + //nread = neighborList[n+4*Np]; + //fq = dist[nread]; + nr5 = neighborList[n+4*Np]; + fq = dist[nr5]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + + // q = 6 + //nread = neighborList[n+5*Np]; + //fq = dist[nread]; + nr6 = neighborList[n+5*Np]; + fq = dist[nr6]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + //nread = neighborList[n+6*Np]; + //fq = dist[nread]; + nr7 = neighborList[n+6*Np]; + fq = dist[nr7]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + //nread = neighborList[n+7*Np]; + //fq = dist[nread]; + nr8 = neighborList[n+7*Np]; + fq = dist[nr8]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + //nread = neighborList[n+8*Np]; + //fq = dist[nread]; + nr9 = neighborList[n+8*Np]; + fq = dist[nr9]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + //nread = neighborList[n+9*Np]; + //fq = dist[nread]; + nr10 = neighborList[n+9*Np]; + fq = dist[nr10]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + //nread = neighborList[n+10*Np]; + //fq = dist[nread]; + nr11 = neighborList[n+10*Np]; + fq = dist[nr11]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + //nread = neighborList[n+11*Np]; + //fq = dist[nread]; + nr12 = neighborList[n+11*Np]; + fq = dist[nr12]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + //nread = neighborList[n+12*Np]; + //fq = dist[nread]; + nr13 = neighborList[n+12*Np]; + fq = dist[nr13]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + //nread = neighborList[n+13*Np]; + //fq = dist[nread]; + nr14 = neighborList[n+13*Np]; + fq = dist[nr14]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + + // q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + //fq = dist[17*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + //fq = dist[8*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + //fq = dist[18*Np+n]; + nread = neighborList[n+16*Np]; + fq = dist[nread]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + //fq = dist[9*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + + //........................................................................ + //..............carry out relaxation process.............................. + //..........Toelke, Fruediger et. al. 2006................................ + if (C == 0.0) nx = ny = nz = 0.0; + m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho0 - 11*rho) -alpha*C - m1); + m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho0)- m2); + m4 = m4 + rlx_setB*((-0.6666666666666666*jx)- m4); + m6 = m6 + rlx_setB*((-0.6666666666666666*jy)- m6); + m8 = m8 + rlx_setB*((-0.6666666666666666*jz)- m8); + m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho0) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9); + m10 = m10 + rlx_setA*( - m10); + m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho0) + 0.5*alpha*C*(ny*ny-nz*nz)- m11); + m12 = m12 + rlx_setA*( - m12); + m13 = m13 + rlx_setA*( (jx*jy/rho0) + 0.5*alpha*C*nx*ny - m13); + m14 = m14 + rlx_setA*( (jy*jz/rho0) + 0.5*alpha*C*ny*nz - m14); + m15 = m15 + rlx_setA*( (jx*jz/rho0) + 0.5*alpha*C*nx*nz - m15); + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //.................inverse transformation...................................................... + + // q=0 + fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; + dist[n] = fq; + + // q = 1 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10)+0.16666666*Fx; + //nread = neighborList[n+Np]; + dist[nr2] = fq; + + // q=2 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; + //nread = neighborList[n]; + dist[nr1] = fq; + + // q = 3 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; + //nread = neighborList[n+3*Np]; + dist[nr4] = fq; + + // q = 4 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; + //nread = neighborList[n+2*Np]; + dist[nr3] = fq; + + // q = 5 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; + //nread = neighborList[n+5*Np]; + dist[nr6] = fq; + + // q = 6 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; + //nread = neighborList[n+4*Np]; + dist[nr5] = fq; + + // q = 7 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); + //nread = neighborList[n+7*Np]; + dist[nr8] = fq; + + // q = 8 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); + //nread = neighborList[n+6*Np]; + dist[nr7] = fq; + + // q = 9 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); + //nread = neighborList[n+9*Np]; + dist[nr10] = fq; + + // q = 10 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); + //nread = neighborList[n+8*Np]; + dist[nr9] = fq; + + // q = 11 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); + //nread = neighborList[n+11*Np]; + dist[nr12] = fq; + + // q = 12 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+ + mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); + //nread = neighborList[n+10*Np]; + dist[nr11]= fq; + + // q = 13 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); + //nread = neighborList[n+13*Np]; + dist[nr14] = fq; + + // q= 14 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); + //nread = neighborList[n+12*Np]; + dist[nr13] = fq; + + + // q = 15 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + + // q = 17 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8) + -mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6) + -mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + + // write the velocity + ux = jx / rho0; + uy = jy / rho0; + uz = jz / rho0; + //Velocity[n] = ux; + //Velocity[Np+n] = uy; + //Velocity[2*Np+n] = uz; + + // Instantiate mass transport distributions + // Stationary value - distribution 0 + nAB = 1.0/(nA+nB); + Aq[n] = 0.3333333333333333*nA; + Bq[n] = 0.3333333333333333*nB; + + //............................................... + // q = 0,2,4 + // Cq = {1,0,0}, {0,1,0}, {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nx; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta; + + // q = 1 + //nread = neighborList[n+Np]; + Aq[nr2] = a1; + Bq[nr2] = b1; + // q=2 + //nread = neighborList[n]; + Aq[nr1] = a2; + Bq[nr1] = b2; + + //............................................... + // Cq = {0,1,0} + delta = beta*nA*nB*nAB*0.1111111111111111*ny; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta; + + // q = 3 + //nread = neighborList[n+3*Np]; + Aq[nr4] = a1; + Bq[nr4] = b1; + // q = 4 + //nread = neighborList[n+2*Np]; + Aq[nr3] = a2; + Bq[nr3] = b2; + + //............................................... + // q = 4 + // Cq = {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nz; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta; + + // q = 5 + //nread = neighborList[n+5*Np]; + Aq[nr6] = a1; + Bq[nr6] = b1; + // q = 6 + //nread = neighborList[n+4*Np]; + Aq[nr5] = a2; + Bq[nr5] = b2; + //............................................... + } } -*/ + +extern "C" void ScaLBL_D3Q7_AAodd_PhaseField(int *neighborList, int *Map, double *Aq, double *Bq, + double *Den, double *Phi, int start, int finish, int Np){ + + int idx,n,nread; + double fq,nA,nB; + + for (int n=start; n 0.f){ + nA = 1.0; nB = 0.f; + } + else{ + nB = 1.0; nA = 0.f; + } + Den[idx] = nA; + Den[Np+idx] = nB; + + Aq[idx]=0.3333333333333333*nA; + Aq[Np+idx]=0.1111111111111111*nA; + Aq[2*Np+idx]=0.1111111111111111*nA; + Aq[3*Np+idx]=0.1111111111111111*nA; + Aq[4*Np+idx]=0.1111111111111111*nA; + Aq[5*Np+idx]=0.1111111111111111*nA; + Aq[6*Np+idx]=0.1111111111111111*nA; + + Bq[idx]=0.3333333333333333*nB; + Bq[Np+idx]=0.1111111111111111*nB; + Bq[2*Np+idx]=0.1111111111111111*nB; + Bq[3*Np+idx]=0.1111111111111111*nB; + Bq[4*Np+idx]=0.1111111111111111*nB; + Bq[5*Np+idx]=0.1111111111111111*nB; + Bq[6*Np+idx]=0.1111111111111111*nB; + } +} + diff --git a/cpu/D3Q19.cpp b/cpu/D3Q19.cpp index 3d5522dc..2dc1e8fc 100644 --- a/cpu/D3Q19.cpp +++ b/cpu/D3Q19.cpp @@ -43,7 +43,7 @@ extern "C" void ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list, int start, int count, - double *recvbuf, double *dist, int N){ + double *recvbuf, double *dist, int N){ //.................................................................................... // Unack distribution from the recv buffer // Distribution q matche Cqx, Cqy, Cqz @@ -90,42 +90,57 @@ extern "C" void ScaLBL_D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int *list } } -*/ -extern "C" void ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz) + */ +extern "C" void ScaLBL_D3Q19_AA_Init(double *f_even, double *f_odd, int Np) { - int n,N; - N = Nx*Ny*Nz; + int n; + for (n=0; n 0){ - f_even[n] = 0.3333333333333333; - f_odd[n] = 0.055555555555555555; //double(100*n)+1.f; - f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f; - f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f; - f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f; - f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f; - f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f; - f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f; - f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f; - f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f; - f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f; - f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f; - f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f; - f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f; - f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f; - f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f; - f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f; - f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f; - f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f; - } - else{ - for(int q=0; q<9; q++){ - f_even[q*N+n] = -1.0; - f_odd[q*N+n] = -1.0; - } - f_even[9*N+n] = -1.0; - } +extern "C" void ScaLBL_D3Q19_Init(double *dist, int Np) +{ + int n; + for (n=0; n 0){ //........................................................................ // Retrieve even distributions from the local node (swap convention) @@ -159,7 +174,7 @@ extern "C" void ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, i f15 = distodd[7*N+n]; f17 = distodd[8*N+n]; //........................................................................ - + //........................................................................ // Retrieve odd distributions from neighboring nodes (swap convention) //........................................................................ @@ -259,7 +274,7 @@ extern "C" void ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, i // } } //........................................................................ - + } } } @@ -282,8 +297,8 @@ extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, d } -extern "C" double ScaLBL_D3Q19_Flux_BC_z(char *ID, double *disteven, double *distodd, double Q, double area, - int Nx, int Ny, int Nz){ +extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux, + int Nx, int Ny, int Nz){ // Note that this routine assumes the distributions are stored "opposite" // odd distributions in disteven and even distributions in distodd. int n,N; @@ -293,49 +308,138 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_z(char *ID, double *disteven, double *di double din = 0.f; N = Nx*Ny*Nz; + double A = 1.f*double(Nx*Ny); double sum = 0.f; - char id; for (n=Nx*Ny; n<2*Nx*Ny; n++){ - id = ID[n]; - if (id > 0){ - //........................................................................ - // Read distributions from "opposite" memory convention - //........................................................................ - //........................................................................ - f2 = distodd[n]; - f4 = distodd[N+n]; - f6 = distodd[2*N+n]; - f8 = distodd[3*N+n]; - f10 = distodd[4*N+n]; - f12 = distodd[5*N+n]; - //f14 = distodd[6*N+n]; - f16 = distodd[7*N+n]; - //f18 = distodd[8*N+n]; - //........................................................................ - f0 = disteven[n]; - f1 = disteven[N+n]; - f3 = disteven[2*N+n]; - //f5 = disteven[3*N+n]; - f7 = disteven[4*N+n]; - f9 = disteven[5*N+n]; - //f11 = disteven[6*N+n]; - f13 = disteven[7*N+n]; - //f15 = disteven[8*N+n]; - f17 = disteven[9*N+n]; - //................................................... - // Determine the outlet flow velocity - //sum += 1.0 - (f0+f4+f3+f2+f1+f8+f7+f9+ f10 + 2*(f5+ f15+f18+f11+f14))/din; - //sum += (f0+f4+f3+f2+f1+f8+f7+f9+ f10 + 2*(f5+f15+f18+f11+f14)); - sum += (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f6+f12+f13+f16+f17)); - } + //........................................................................ + // Read distributions from "opposite" memory convention + //........................................................................ + //........................................................................ + f1 = distodd[n]; + f3 = distodd[N+n]; + f5 = distodd[2*N+n]; + f7 = distodd[3*N+n]; + f9 = distodd[4*N+n]; + f11 = distodd[5*N+n]; + f13 = distodd[6*N+n]; + f15 = distodd[7*N+n]; + f17 = distodd[8*N+n]; + //........................................................................ + f0 = disteven[n]; + f2 = disteven[N+n]; + f4 = disteven[2*N+n]; + f6 = disteven[3*N+n]; + f8 = disteven[4*N+n]; + f10 = disteven[5*N+n]; + f12 = disteven[6*N+n]; + f14 = disteven[7*N+n]; + f16 = disteven[8*N+n]; + f18 = disteven[9*N+n]; + //................................................... + + // Determine the outlet flow velocity + //sum += 1.0 - (f0+f4+f3+f2+f1+f8+f7+f9+ f10 + 2*(f5+ f15+f18+f11+f14))/din; + //sum += (f0+f4+f3+f2+f1+f8+f7+f9+ f10 + 2*(f5+f15+f18+f11+f14)); + sum += (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f6+f12+f13+f16+f17)); } - din = sum/area; - + din = sum/(A*(1.0-flux)); return din; } -extern "C" double ScaLBL_D3Q19_Flux_BC_Z(char *ID, double *disteven, double *distodd, double Q, double area, +extern "C" double ScaLBL_D3Q19_AAodd_Flux_BC_z(int *d_neighborList, int *list, double *dist, double flux, + double area, int count, int Np){ + int idx, n; + int nread; + + // distributions + double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; + double f10,f11,f12,f13,f14,f15,f16,f17,f18; + double factor = 1.f/(area); + double sum = 0.f; + + for (idx=0; idx0){ - //........................................................................ - // Read distributions from "opposite" memory convention - //........................................................................ - f2 = distodd[n]; - f4 = distodd[N+n]; - //f6 = distodd[2*N+n]; - f8 = distodd[3*N+n]; - f10 = distodd[4*N+n]; - //f12 = distodd[5*N+n]; - f14 = distodd[6*N+n]; - //f16 = distodd[7*N+n]; - f18 = distodd[8*N+n]; - //........................................................................ - f0 = disteven[n]; - f1 = disteven[N+n]; - f3 = disteven[2*N+n]; - f5 = disteven[3*N+n]; - f7 = disteven[4*N+n]; - f9 = disteven[5*N+n]; - f11 = disteven[6*N+n]; - //f13 = disteven[7*N+n]; - f15 = disteven[8*N+n]; - //f17 = disteven[9*N+n]; - - sum += (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f5+f11+f14+f15+f18)); - } - } - dout = sum/area; - return dout; -} - -extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, double din, - int Nx, int Ny, int Nz) -{ - int n,N; - // distributions - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; - double ux,uy,uz; - ux = uy = 0.0; - - N = Nx*Ny*Nz; - - double Cxz,Cyz; - - for (n=Nx*Ny; n<2*Nx*Ny; n++){ //........................................................................ // Read distributions from "opposite" memory convention //........................................................................ - //........................................................................ - f2 = distodd[n]; - f4 = distodd[N+n]; - f6 = distodd[2*N+n]; - f8 = distodd[3*N+n]; - f10 = distodd[4*N+n]; - f12 = distodd[5*N+n]; - f14 = distodd[6*N+n]; - f16 = distodd[7*N+n]; - f18 = distodd[8*N+n]; + f1 = distodd[n]; + f3 = distodd[N+n]; + f5 = distodd[2*N+n]; + f7 = distodd[3*N+n]; + f9 = distodd[4*N+n]; + f11 = distodd[5*N+n]; + f13 = distodd[6*N+n]; + f15 = distodd[7*N+n]; + f17 = distodd[8*N+n]; //........................................................................ f0 = disteven[n]; - f1 = disteven[N+n]; - f3 = disteven[2*N+n]; - f5 = disteven[3*N+n]; - f7 = disteven[4*N+n]; - f9 = disteven[5*N+n]; - f11 = disteven[6*N+n]; - f13 = disteven[7*N+n]; - f15 = disteven[8*N+n]; - f17 = disteven[9*N+n]; + f2 = disteven[N+n]; + f4 = disteven[2*N+n]; + f6 = disteven[3*N+n]; + f8 = disteven[4*N+n]; + f10 = disteven[5*N+n]; + f12 = disteven[6*N+n]; + f14 = disteven[7*N+n]; + f16 = disteven[8*N+n]; + f18 = disteven[9*N+n]; + + sum += (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f5+f11+f14+f15+f18)); + + } + dout = sum/(A*(1.0+flux)); + return dout; +} + +extern "C" void ScaLBL_D3Q19_AAeven_Pressure_BC_z(int *list, double *dist, double din, int count, int Np) +{ + int idx, n; + // distributions + double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; + double f10,f11,f12,f13,f14,f15,f16,f17,f18; + double ux,uy,uz,Cyz,Cxz; + ux = uy = 0.0; + for (int idx=0; idx 0){ - //........................................................................ - // Registers to store the distributions - //........................................................................ - f2 = disteven[N+n]; - f4 = disteven[2*N+n]; - f6 = disteven[3*N+n]; - f8 = disteven[4*N+n]; - f10 = disteven[5*N+n]; - f12 = disteven[6*N+n]; - f14 = disteven[7*N+n]; - f16 = disteven[8*N+n]; - f18 = disteven[9*N+n]; - //........................................................................ - f1 = distodd[n]; - f3 = distodd[1*N+n]; - f5 = distodd[2*N+n]; - f7 = distodd[3*N+n]; - f9 = distodd[4*N+n]; - f11 = distodd[5*N+n]; - f13 = distodd[6*N+n]; - f15 = distodd[7*N+n]; - f17 = distodd[8*N+n]; - //.................Compute the velocity................................... - vx = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14; - vy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18; - vz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18; - //..................Write the velocity..................................... - vel[n] = vx; - vel[N+n] = vy; - vel[2*N+n] = vz; - //........................................................................ - - } - else{ - for(int q=0; q<9; q++){ - disteven[q*N+n] = -1.0; - distodd[q*N+n] = -1.0; - } - disteven[9*N+n] = -1.0; - - //For ID[n]<0 - solid nodes - vel[n] = 0.0; - vel[N+n] = 0.0; - vel[2*N+n] = 0.0; - } + //........................................................................ + // Registers to store the distributions + //........................................................................ + f2 = dist[2*N+n]; + f4 = dist[4*N+n]; + f6 = dist[6*N+n]; + f8 = dist[8*N+n]; + f10 = dist[10*N+n]; + f12 = dist[12*N+n]; + f14 = dist[14*N+n]; + f16 = dist[16*N+n]; + f18 = dist[18*N+n]; + //........................................................................ + f1 = dist[N+n]; + f3 = dist[3*N+n]; + f5 = dist[5*N+n]; + f7 = dist[7*N+n]; + f9 = dist[9*N+n]; + f11 = dist[11*N+n]; + f13 = dist[13*N+n]; + f15 = dist[15*N+n]; + f17 = dist[17*N+n]; + //.................Compute the velocity................................... + vx = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14; + vy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18; + vz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18; + //..................Write the velocity..................................... + vel[n] = vx; + vel[N+n] = vy; + vel[2*N+n] = vz; + //........................................................................ } } extern "C" void ScaLBL_D3Q19_Pressure(const char *ID, const double *disteven, const double *distodd, - double *Pressure, int Nx, int Ny, int Nz) + double *Pressure, int Nx, int Ny, int Nz) { int n,N; // distributions @@ -783,3 +1096,1096 @@ extern "C" void ScaLBL_D3Q19_Pressure(const char *ID, const double *disteven, co } } +extern "C" void ScaLBL_D3Q19_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, + double Fy, double Fz){ + int n; + double fq,fp; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + + + for (int n=start; n 10Np => odd part of dist) + fq = dist[nread]; // reading the f1 data into register fq + //fp = dist[10*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jx = fq; + m4 = -4.0*fq; + m9 = 2.0*fq; + m10 = -4.0*fq; + + // f2 = dist[10*Np+n]; + nread = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nread]; // reading the f2 data into register fq + //fq = dist[Np+n]; + rho += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + nread = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nread]; + //fq = dist[11*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + nread = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nread]; + //fq = dist[2*Np+n]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + nread = neighborList[n+4*Np]; + fq = dist[nread]; + //fq = dist[12*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + + // q = 6 + nread = neighborList[n+5*Np]; + fq = dist[nread]; + //fq = dist[3*Np+n]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + nread = neighborList[n+6*Np]; + fq = dist[nread]; + //fq = dist[13*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + nread = neighborList[n+7*Np]; + fq = dist[nread]; + //fq = dist[4*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + nread = neighborList[n+8*Np]; + fq = dist[nread]; + //fq = dist[14*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + nread = neighborList[n+9*Np]; + fq = dist[nread]; + //fq = dist[5*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + nread = neighborList[n+10*Np]; + fq = dist[nread]; + //fq = dist[15*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + nread = neighborList[n+11*Np]; + fq = dist[nread]; + //fq = dist[6*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + nread = neighborList[n+12*Np]; + fq = dist[nread]; + //fq = dist[16*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + nread = neighborList[n+13*Np]; + fq = dist[nread]; + //fq = dist[7*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + + // q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + //fq = dist[17*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + //fq = dist[8*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + //fq = dist[18*Np+n]; + nread = neighborList[n+16*Np]; + fq = dist[nread]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + //fq = dist[9*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + + //..............incorporate external force................................................ + //..............carry out relaxation process............................................... + m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho - 11*rho) - m1); + m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho) - m2); + m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4); + m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6); + m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8); + m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho) - m9); + m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/rho) - m10); + m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho) - m11); + m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/rho) - m12); + m13 = m13 + rlx_setA*((jx*jy/rho) - m13); + m14 = m14 + rlx_setA*((jy*jz/rho) - m14); + m15 = m15 + rlx_setA*((jx*jz/rho) - m15); + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //....................................................................................................... + //.................inverse transformation...................................................... + + // q=0 + fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; + dist[n] = fq; + + // q = 1 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10)+0.16666666*Fx; + nread = neighborList[n+Np]; + dist[nread] = fq; + + // q=2 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; + nread = neighborList[n]; + dist[nread] = fq; + + // q = 3 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; + nread = neighborList[n+3*Np]; + dist[nread] = fq; + + // q = 4 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; + nread = neighborList[n+2*Np]; + dist[nread] = fq; + + // q = 5 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; + nread = neighborList[n+5*Np]; + dist[nread] = fq; + + // q = 6 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; + nread = neighborList[n+4*Np]; + dist[nread] = fq; + + // q = 7 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6) + +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); + nread = neighborList[n+7*Np]; + dist[nread] = fq; + + // q = 8 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); + nread = neighborList[n+6*Np]; + dist[nread] = fq; + + // q = 9 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6) + +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); + nread = neighborList[n+9*Np]; + dist[nread] = fq; + + // q = 10 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4) + +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); + nread = neighborList[n+8*Np]; + dist[nread] = fq; + + // q = 11 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); + nread = neighborList[n+11*Np]; + dist[nread] = fq; + + // q = 12 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); + nread = neighborList[n+10*Np]; + dist[nread]= fq; + + // q = 13 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); + nread = neighborList[n+13*Np]; + dist[nread] = fq; + + // q= 14 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); + nread = neighborList[n+12*Np]; + dist[nread] = fq; + + + // q = 15 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + + // q = 17 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8) + -mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6) + -mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + + } +} + +extern "C" void ScaLBL_D3Q19_AAeven_Compact(char * ID, double *dist, int Np) { + + int n; + double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; + double f10,f11,f12,f13,f14,f15,f16,f17,f18; + + for (int n=0; n #include #include +#include extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){ //cudaMalloc(address,size); - (*address) = malloc(size); - memset(*address,0,size); + (*address) = _mm_malloc(size,64); + memset(*address,0,size); if (*address==NULL){ printf("Memory allocation failed! \n"); } } - -extern "C" void ScaLBL_FreeDeviceMemory(void* address){ - if ( address != NULL ) - free( address ); +extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){ + _mm_free(pointer); } - extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){ // cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice); memcpy(dest, source, size); diff --git a/gpu/Color.cu b/gpu/Color.cu index 7edb21b4..11c82f64 100644 --- a/gpu/Color.cu +++ b/gpu/Color.cu @@ -1,7 +1,9 @@ #include +#include +#include -#define NBLOCKS 32 -#define NTHREADS 128 +#define NBLOCKS 1024 +#define NTHREADS 256 __global__ void dvc_ScaLBL_Color_Init(char *ID, double *Den, double *Phi, double das, double dbs, int Nx, int Ny, int Nz) { @@ -121,129 +123,24 @@ __global__ void dvc_ScaLBL_Color_InitDistance(char *ID, double *Den, double *Ph } } } + //************************************************************************* -__global__ void dvc_ScaLBL_Color_BC_z(double *Phi, double *Den, double *Velocity, double *A_even, double *A_odd, - double *B_even, double *B_odd, int Nx, int Ny, int Nz) +__global__ void dvc_ScaLBL_Color_BC(int *list, int *Map, double *Phi, double *Den, double vA, double vB, int count, int Np) { - int ij,k,n,N; - N = Nx*Ny*Nz; - // Fill the inlet with component a - - ij = blockIdx.x*blockDim.x + threadIdx.x; - if (ij < Nx*Ny){ - for (k=0; k<1; k++){ - n = k*Nx*Ny+ij; - Phi[n] = 1.0; - } - - for (k=1; k<3; k++){ - n = k*Nx*Ny+ij; - Phi[n] = 1.0; - Den[n] = 1.0; - Den[N+n] = 0.0; - - na = 1.0; - nb = 0.0; - ux = -1.0*Velocity[n]; - uy = -1.0*Velocity[N+n]; - uz = -1.0*Velocity[2*N+n]; - A_even[n] = 0.3333333333333333*na; - B_even[n] = 0.3333333333333333*nb; - - a1 = na*(0.1111111111111111*(1+4.5*ux)); - b1 = nb*(0.1111111111111111*(1+4.5*ux)); - a2 = na*(0.1111111111111111*(1-4.5*ux)); - b2 = nb*(0.1111111111111111*(1-4.5*ux)); - - A_odd[n] = a1; - A_even[N+n] = a2; - B_odd[n] = b1; - B_even[N+n] = b2; - - a1 = na*(0.1111111111111111*(1+4.5*uy)); - b1 = nb*(0.1111111111111111*(1+4.5*uy)); - a2 = na*(0.1111111111111111*(1-4.5*uy)); - b2 = nb*(0.1111111111111111*(1-4.5*uy)); - - A_odd[N+n] = a1; - A_even[2*N+n] = a2; - B_odd[N+n] = b1; - B_even[2*N+n] = b2; - - a1 = na*(0.1111111111111111*(1+4.5*uz)); - b1 = nb*(0.1111111111111111*(1+4.5*uz)); - a2 = na*(0.1111111111111111*(1-4.5*uz)); - b2 = nb*(0.1111111111111111*(1-4.5*uz)); - - A_odd[2*N+n] = a1; - A_even[3*N+n] = a2; - B_odd[2*N+n] = b1; - B_even[3*N+n] = b2; - } - } -} -//************************************************************************* -__global__ void dvc_ScaLBL_Color_BC_Z(double *Phi, double *Den, double *Velocity, double *A_even, double *A_odd, - double *B_even, double *B_odd, int Nx, int Ny, int Nz) -{ - int ij,k,n,N; - N = Nx*Ny*Nz; + int idx,n,nm; // Fill the outlet with component b - ij = blockIdx.x*blockDim.x + threadIdx.x; - if (ij < Nx*Ny){ - for (k=Nz-3; k 0){ - - //.......Back out the 3-D indices for node n.............. - k = n/(Nx*Ny); - j = (n-Nx*Ny*k)/Nx; - i = n-Nx*Ny*k-Nx*j; - //........................................................................ - //........Get 1-D index for this thread.................... - // n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x; - //........................................................................ - // COMPUTE THE COLOR GRADIENT - //........................................................................ - //.................Read Phase Indicator Values............................ - //........................................................................ - nn = n-1; // neighbor index (get convention) - if (i-1<0) nn += Nx; // periodic BC along the x-boundary - f1 = phi[nn]; // get neighbor for phi - 1 - //........................................................................ - nn = n+1; // neighbor index (get convention) - if (!(i+10)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta; + + Aq[1*Np+n] = a1; + Bq[1*Np+n] = b1; + Aq[2*Np+n] = a2; + Bq[2*Np+n] = b2; + + //............................................... + // q = 2 + // Cq = {0,1,0} + delta = beta*nA*nB*nAB*0.1111111111111111*ny; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta; + + Aq[3*Np+n] = a1; + Bq[3*Np+n] = b1; + Aq[4*Np+n] = a2; + Bq[4*Np+n] = b2; + //............................................... + // q = 4 + // Cq = {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nz; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta; + + Aq[5*Np+n] = a1; + Bq[5*Np+n] = b1; + Aq[6*Np+n] = a2; + Bq[6*Np+n] = b2; + //............................................... + + } + } +} + + +__global__ void dvc_ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den, + double *Phi, double *Velocity, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ + + int n,nn,ijk,nread; + int nr1,nr2,nr3,nr4,nr5,nr6; + int nr7,nr8,nr9,nr10; + int nr11,nr12,nr13,nr14; + //int nr15,nr16,nr17,nr18; + double fq; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + double m3,m5,m7; + double nA,nB; // number density + double a1,b1,a2,b2,nAB,delta; + double C,nx,ny,nz; //color gradient magnitude and direction + double ux,uy,uz; + double phi,tau,rho0,rlx_setA,rlx_setB; + + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s even part of dist) + //fq = dist[nread]; // reading the f2 data into register fq + nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nr2]; // reading the f2 data into register fq + rho += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + //nread = neighborList[n+2*Np]; // neighbor 4 + //fq = dist[nread]; + nr3 = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nr3]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + //nread = neighborList[n+3*Np]; // neighbor 3 + //fq = dist[nread]; + nr4 = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nr4]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + //nread = neighborList[n+4*Np]; + //fq = dist[nread]; + nr5 = neighborList[n+4*Np]; + fq = dist[nr5]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + + // q = 6 + //nread = neighborList[n+5*Np]; + //fq = dist[nread]; + nr6 = neighborList[n+5*Np]; + fq = dist[nr6]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + //nread = neighborList[n+6*Np]; + //fq = dist[nread]; + nr7 = neighborList[n+6*Np]; + fq = dist[nr7]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + //nread = neighborList[n+7*Np]; + //fq = dist[nread]; + nr8 = neighborList[n+7*Np]; + fq = dist[nr8]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + //nread = neighborList[n+8*Np]; + //fq = dist[nread]; + nr9 = neighborList[n+8*Np]; + fq = dist[nr9]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + //nread = neighborList[n+9*Np]; + //fq = dist[nread]; + nr10 = neighborList[n+9*Np]; + fq = dist[nr10]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + //nread = neighborList[n+10*Np]; + //fq = dist[nread]; + nr11 = neighborList[n+10*Np]; + fq = dist[nr11]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + //nread = neighborList[n+11*Np]; + //fq = dist[nread]; + nr12 = neighborList[n+11*Np]; + fq = dist[nr12]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + //nread = neighborList[n+12*Np]; + //fq = dist[nread]; + nr13 = neighborList[n+12*Np]; + fq = dist[nr13]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + //nread = neighborList[n+13*Np]; + //fq = dist[nread]; + nr14 = neighborList[n+13*Np]; + fq = dist[nr14]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + + // q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + //fq = dist[17*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + //fq = dist[8*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + //fq = dist[18*Np+n]; + nread = neighborList[n+16*Np]; + fq = dist[nread]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + //fq = dist[9*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + + //........................................................................ + //..............carry out relaxation process.............................. + //..........Toelke, Fruediger et. al. 2006................................ + if (C == 0.0) nx = ny = nz = 0.0; + m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho0 - 11*rho) -alpha*C - m1); + m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho0)- m2); + m4 = m4 + rlx_setB*((-0.6666666666666666*jx)- m4); + m6 = m6 + rlx_setB*((-0.6666666666666666*jy)- m6); + m8 = m8 + rlx_setB*((-0.6666666666666666*jz)- m8); + m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho0) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9); + m10 = m10 + rlx_setA*( - m10); + m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho0) + 0.5*alpha*C*(ny*ny-nz*nz)- m11); + m12 = m12 + rlx_setA*( - m12); + m13 = m13 + rlx_setA*( (jx*jy/rho0) + 0.5*alpha*C*nx*ny - m13); + m14 = m14 + rlx_setA*( (jy*jz/rho0) + 0.5*alpha*C*ny*nz - m14); + m15 = m15 + rlx_setA*( (jx*jz/rho0) + 0.5*alpha*C*nx*nz - m15); + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //.................inverse transformation...................................................... + + // q=0 + fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; + dist[n] = fq; + + // q = 1 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10)+0.16666666*Fx; + //nread = neighborList[n+Np]; + dist[nr2] = fq; + + // q=2 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; + //nread = neighborList[n]; + dist[nr1] = fq; + + // q = 3 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; + //nread = neighborList[n+3*Np]; + dist[nr4] = fq; + + // q = 4 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; + //nread = neighborList[n+2*Np]; + dist[nr3] = fq; + + // q = 5 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; + //nread = neighborList[n+5*Np]; + dist[nr6] = fq; + + // q = 6 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; + //nread = neighborList[n+4*Np]; + dist[nr5] = fq; + + // q = 7 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); + //nread = neighborList[n+7*Np]; + dist[nr8] = fq; + + // q = 8 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); + //nread = neighborList[n+6*Np]; + dist[nr7] = fq; + + // q = 9 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); + //nread = neighborList[n+9*Np]; + dist[nr10] = fq; + + // q = 10 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); + //nread = neighborList[n+8*Np]; + dist[nr9] = fq; + + // q = 11 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); + //nread = neighborList[n+11*Np]; + dist[nr12] = fq; + + // q = 12 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+ + mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); + //nread = neighborList[n+10*Np]; + dist[nr11]= fq; + + // q = 13 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); + //nread = neighborList[n+13*Np]; + dist[nr14] = fq; + + // q= 14 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); + //nread = neighborList[n+12*Np]; + dist[nr13] = fq; + + + // q = 15 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + + // q = 17 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8) + -mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6) + -mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + + // write the velocity + ux = jx / rho0; + uy = jy / rho0; + uz = jz / rho0; + //Velocity[n] = ux; + //Velocity[Np+n] = uy; + //Velocity[2*Np+n] = uz; + + // Instantiate mass transport distributions + // Stationary value - distribution 0 + nAB = 1.0/(nA+nB); + Aq[n] = 0.3333333333333333*nA; + Bq[n] = 0.3333333333333333*nB; + + //............................................... + // q = 0,2,4 + // Cq = {1,0,0}, {0,1,0}, {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nx; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta; + + // q = 1 + //nread = neighborList[n+Np]; + Aq[nr2] = a1; + Bq[nr2] = b1; + // q=2 + //nread = neighborList[n]; + Aq[nr1] = a2; + Bq[nr1] = b2; + + //............................................... + // Cq = {0,1,0} + delta = beta*nA*nB*nAB*0.1111111111111111*ny; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta; + + // q = 3 + //nread = neighborList[n+3*Np]; + Aq[nr4] = a1; + Bq[nr4] = b1; + // q = 4 + //nread = neighborList[n+2*Np]; + Aq[nr3] = a2; + Bq[nr3] = b2; + + //............................................... + // q = 4 + // Cq = {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nz; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta; + + // q = 5 + //nread = neighborList[n+5*Np]; + Aq[nr6] = a1; + Bq[nr6] = b1; + // q = 6 + //nread = neighborList[n+4*Np]; + Aq[nr5] = a2; + Bq[nr5] = b2; + //............................................... + } + } +} + +__global__ void dvc_ScaLBL_D3Q19_AAodd_ColorMomentum(int *neighborList, double *dist, double *Den, + double *Velocity, double *ColorGrad, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int start, int finish, int Np){ + + int n,nread; + double fq; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + double nA,nB; // number density + double C,nx,ny,nz; //color gradient magnitude and direction + double ux,uy,uz; + double phi,tau,rho0,rlx_setA,rlx_setB; + + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s 10Np => odd part of dist) + fq = dist[nread]; // reading the f1 data into register fq + //fp = dist[10*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jx = fq; + m4 = -4.0*fq; + m9 = 2.0*fq; + m10 = -4.0*fq; + + // f2 = dist[10*Np+n]; + nread = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nread]; // reading the f2 data into register fq + //fq = dist[Np+n]; + rho += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + nread = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nread]; + //fq = dist[11*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + nread = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nread]; + //fq = dist[2*Np+n]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + nread = neighborList[n+4*Np]; + fq = dist[nread]; + //fq = dist[12*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + + // q = 6 + nread = neighborList[n+5*Np]; + fq = dist[nread]; + //fq = dist[3*Np+n]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + nread = neighborList[n+6*Np]; + fq = dist[nread]; + //fq = dist[13*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + nread = neighborList[n+7*Np]; + fq = dist[nread]; + //fq = dist[4*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + nread = neighborList[n+8*Np]; + fq = dist[nread]; + //fq = dist[14*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + nread = neighborList[n+9*Np]; + fq = dist[nread]; + //fq = dist[5*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + nread = neighborList[n+10*Np]; + fq = dist[nread]; + //fq = dist[15*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + nread = neighborList[n+11*Np]; + fq = dist[nread]; + //fq = dist[6*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + nread = neighborList[n+12*Np]; + fq = dist[nread]; + //fq = dist[16*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + nread = neighborList[n+13*Np]; + fq = dist[nread]; + //fq = dist[7*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + + // q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + //fq = dist[17*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + //fq = dist[8*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + //fq = dist[18*Np+n]; + nread = neighborList[n+16*Np]; + fq = dist[nread]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + //fq = dist[9*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + + //........................................................................ + //..............carry out relaxation process.............................. + //..........Toelke, Fruediger et. al. 2006................................ + if (C == 0.0) nx = ny = nz = 0.0; + m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho0 - 11*rho) -alpha*C - m1); + m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho0)- m2); + m4 = m4 + rlx_setB*((-0.6666666666666666*jx)- m4); + m6 = m6 + rlx_setB*((-0.6666666666666666*jy)- m6); + m8 = m8 + rlx_setB*((-0.6666666666666666*jz)- m8); + m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho0) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9); + m10 = m10 + rlx_setA*( - m10); + m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho0) + 0.5*alpha*C*(ny*ny-nz*nz)- m11); + m12 = m12 + rlx_setA*( - m12); + m13 = m13 + rlx_setA*( (jx*jy/rho0) + 0.5*alpha*C*nx*ny - m13); + m14 = m14 + rlx_setA*( (jy*jz/rho0) + 0.5*alpha*C*ny*nz - m14); + m15 = m15 + rlx_setA*( (jx*jz/rho0) + 0.5*alpha*C*nx*nz - m15); + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //.................inverse transformation...................................................... + + // q=0 + fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; + dist[n] = fq; + + // q = 1 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10)+0.16666666*Fx; + nread = neighborList[n+Np]; + dist[nread] = fq; + + // q=2 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; + nread = neighborList[n]; + dist[nread] = fq; + + // q = 3 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; + nread = neighborList[n+3*Np]; + dist[nread] = fq; + + // q = 4 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; + nread = neighborList[n+2*Np]; + dist[nread] = fq; + + // q = 5 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; + nread = neighborList[n+5*Np]; + dist[nread] = fq; + + // q = 6 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; + nread = neighborList[n+4*Np]; + dist[nread] = fq; + + // q = 7 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); + nread = neighborList[n+7*Np]; + dist[nread] = fq; + + // q = 8 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); + nread = neighborList[n+6*Np]; + dist[nread] = fq; + + // q = 9 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); + nread = neighborList[n+9*Np]; + dist[nread] = fq; + + // q = 10 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+ + mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); + nread = neighborList[n+8*Np]; + dist[nread] = fq; + + // q = 11 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); + nread = neighborList[n+11*Np]; + dist[nread] = fq; + + // q = 12 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+ + mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); + nread = neighborList[n+10*Np]; + dist[nread]= fq; + + // q = 13 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); + nread = neighborList[n+13*Np]; + dist[nread] = fq; + + // q= 14 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); + nread = neighborList[n+12*Np]; + dist[nread] = fq; + + + // q = 15 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + + // q = 17 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8) + -mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6) + -mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + + // write the velocity + ux = jx / rho0; + uy = jy / rho0; + uz = jz / rho0; + Velocity[n] = ux; + Velocity[Np+n] = uy; + Velocity[2*Np+n] = uz; + } + } +} + +__global__ void dvc_ScaLBL_D3Q19_AAeven_ColorMomentum(double *dist, double *Den, double *Velocity, + double *ColorGrad, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int start, int finish, int Np){ + int n; + double fq; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + double nA,nB; // number density + double C,nx,ny,nz; //color gradient magnitude and direction + double ux,uy,uz; + double phi,tau,rho0,rlx_setA,rlx_setB; + + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta; + + Aq[1*Np+n] = a1; + Bq[1*Np+n] = b1; + Aq[2*Np+n] = a2; + Bq[2*Np+n] = b2; + + //............................................... + // q = 2 + // Cq = {0,1,0} + delta = beta*nA*nB*nAB*0.1111111111111111*ny; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta; + + Aq[3*Np+n] = a1; + Bq[3*Np+n] = b1; + Aq[4*Np+n] = a2; + Bq[4*Np+n] = b2; + //............................................... + // q = 4 + // Cq = {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nz; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta; + + Aq[5*Np+n] = a1; + Bq[5*Np+n] = b1; + Aq[6*Np+n] = a2; + Bq[6*Np+n] = b2; + //............................................... + + } + } +} + +__global__ void dvc_ScaLBL_D3Q19_AAodd_ColorMass(int *neighborList, double *Aq, double *Bq, double *Den, + double *Velocity, double *ColorGrad, double beta, int start, int finish, int Np){ + + int n,nread; + double fq; + // non-conserved moments + double nA,nB; // number density + double a1,b1,a2,b2,nAB,delta; + double C,nx,ny,nz; //color gradient magnitude and direction + double ux,uy,uz; + double phi,tau,rho0,rlx_setA,rlx_setB; + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta; + + // q = 1 + nread = neighborList[n+Np]; + Aq[nread] = a1; + Bq[nread] = b1; + // q=2 + nread = neighborList[n]; + Aq[nread] = a2; + Bq[nread] = b2; + + //............................................... + // Cq = {0,1,0} + delta = beta*nA*nB*nAB*0.1111111111111111*ny; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta; + + // q = 3 + nread = neighborList[n+3*Np]; + Aq[nread] = a1; + Bq[nread] = b1; + // q = 4 + nread = neighborList[n+2*Np]; + Aq[nread] = a2; + Bq[nread] = b2; + + //............................................... + // q = 4 + // Cq = {0,0,1} + delta = beta*nA*nB*nAB*0.1111111111111111*nz; + if (!(nA*nB*nAB>0)) delta=0; + a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta; + b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta; + a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta; + b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta; + + // q = 5 + nread = neighborList[n+5*Np]; + Aq[nread] = a1; + Bq[nread] = b1; + // q = 6 + nread = neighborList[n+4*Np]; + Aq[nread] = a2; + Bq[nread] = b2; + //............................................... + } + } +} + +__global__ void dvc_ScaLBL_D3Q7_AAodd_PhaseField(int *neighborList, int *Map, double *Aq, double *Bq, + double *Den, double *Phi, int start, int finish, int Np){ + int idx,n,nread; + double fq,nA,nB; + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s 0.f){ + nA = 1.0; nB = 0.f; + } + else{ + nB = 1.0; nA = 0.f; + } + Den[idx] = nA; + Den[Np+idx] = nB; + + Aq[idx]=0.3333333333333333*nA; + Aq[Np+idx]=0.1111111111111111*nA; + Aq[2*Np+idx]=0.1111111111111111*nA; + Aq[3*Np+idx]=0.1111111111111111*nA; + Aq[4*Np+idx]=0.1111111111111111*nA; + Aq[5*Np+idx]=0.1111111111111111*nA; + Aq[6*Np+idx]=0.1111111111111111*nA; + + Bq[idx]=0.3333333333333333*nB; + Bq[Np+idx]=0.1111111111111111*nB; + Bq[2*Np+idx]=0.1111111111111111*nB; + Bq[3*Np+idx]=0.1111111111111111*nB; + Bq[4*Np+idx]=0.1111111111111111*nB; + Bq[5*Np+idx]=0.1111111111111111*nB; + Bq[6*Np+idx]=0.1111111111111111*nB; + } + } +} + extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice){ int GRID = Nx*Ny / 512 + 1; dvc_ScaLBL_SetSlice_z<<>>(Phi,value,Nx,Ny,Nz,Slice); @@ -1689,14 +3920,6 @@ extern "C" void ColorCollide( char *ID, double *disteven, double *distodd, doubl } -extern "C" void ScaLBL_D3Q19_ColorCollide_gen( char *ID, double *disteven, double *distodd, double *phi, double *ColorGrad, - double *Velocity, int Nx, int Ny, int Nz, double tau1, double tau2, - double rho1, double rho2, double alpha, double beta, double Fx, double Fy, double Fz){ - - dvc_ScaLBL_D3Q19_ColorCollide_gen<<>>(ID, disteven, distodd, phi, ColorGrad, Velocity, Nx, Ny, Nz, tau1, tau2, - alpha, beta, Fx, Fy, Fz); -} - extern "C" void ScaLBL_D3Q19_ColorCollide( char *ID, double *disteven, double *distodd, double *phi, double *ColorGrad, double *Velocity, int Nx, int Ny, int Nz,double rlx_setA, double rlx_setB, double alpha, double beta, double Fx, double Fy, double Fz){ @@ -1719,14 +3942,148 @@ extern "C" void ScaLBL_D3Q7_ColorCollideMass(char *ID, double *A_even, double *A dvc_ScaLBL_D3Q7_ColorCollideMass<<>>(ID, A_even, A_odd, B_even, B_odd, Den, Phi, ColorGrad, Velocity, beta, N, pBC); } // Pressure Boundary Conditions Functions -extern "C" void ScaLBL_Color_BC_z(double *Phi, double *Den, double *Vel, double *A_even, double *A_odd, - double *B_even, double *B_odd, int Nx, int Ny, int Nz){ - int GRID = Nx*Ny / 512 + 1; - dvc_ScaLBL_Color_BC_z<<< GRID,512 >>>(Phi, Den, Vel, A_even, A_odd, B_even, B_odd, Nx, Ny, Nz); -} -extern "C" void ScaLBL_Color_BC_Z(double *Phi, double *Den, double *Vel, double *A_even, double *A_odd, - double *B_even, double *B_odd, int Nx, int Ny, int Nz){ - int GRID = Nx*Ny / 512 + 1; - dvc_ScaLBL_Color_BC_Z<<< GRID,512 >>>(Phi, Den, Vel, A_even, A_odd, B_even, B_odd, Nx, Ny, Nz); +extern "C" void ScaLBL_D3Q19_AAeven_Color(int *Map, double *dist, double *Aq, double *Bq, double *Den, double *Phi, + double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ + + cudaProfilerStart(); + cudaFuncSetCacheConfig(dvc_ScaLBL_D3Q19_AAeven_Color, cudaFuncCachePreferL1); + + dvc_ScaLBL_D3Q19_AAeven_Color<<>>(Map, dist, Aq, Bq, Den, Phi, Vel, rhoA, rhoB, tauA, tauB, + alpha, beta, Fx, Fy, Fz, strideY, strideZ, start, finish, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_Color: %s \n",cudaGetErrorString(err)); + } + cudaProfilerStop(); + } + +extern "C" void ScaLBL_D3Q19_AAodd_Color(int *d_neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den, + double *Phi, double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ + + cudaProfilerStart(); + cudaFuncSetCacheConfig(dvc_ScaLBL_D3Q19_AAodd_Color, cudaFuncCachePreferL1); + + dvc_ScaLBL_D3Q19_AAodd_Color<<>>(d_neighborList, Map, dist, Aq, Bq, Den, Phi, Vel, + rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, strideY, strideZ, start, finish, Np); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAodd_Color: %s \n",cudaGetErrorString(err)); + } + cudaProfilerStop(); +} + +extern "C" void ScaLBL_D3Q7_AAodd_PhaseField(int *NeighborList, int *Map, double *Aq, double *Bq, + double *Den, double *Phi, int start, int finish, int Np){ + + cudaProfilerStart(); + dvc_ScaLBL_D3Q7_AAodd_PhaseField<<>>(NeighborList, Map, Aq, Bq, Den, Phi, start, finish, Np); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q7_AAodd_PhaseField: %s \n",cudaGetErrorString(err)); + } + cudaProfilerStop(); +} + +extern "C" void ScaLBL_D3Q7_AAeven_PhaseField(int *Map, double *Aq, double *Bq, double *Den, double *Phi, + int start, int finish, int Np){ + + cudaProfilerStart(); + dvc_ScaLBL_D3Q7_AAeven_PhaseField<<>>(Map, Aq, Bq, Den, Phi, start, finish, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q7_AAeven_PhaseField: %s \n",cudaGetErrorString(err)); + } + cudaProfilerStop(); + +} + +extern "C" void ScaLBL_D3Q19_Gradient(int *Map, double *Phi, double *ColorGrad, int start, int finish, int Np, + int Nx, int Ny, int Nz){ + + int strideY=Nx; + int strideZ=Nx*Ny; + dvc_ScaLBL_D3Q19_Gradient<<>>(Map, Phi, ColorGrad, start, finish, Np, strideY, strideZ); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_ColorGrad: %s \n",cudaGetErrorString(err)); + } + +} + +extern "C" void ScaLBL_PhaseField_Init(int *Map, double *Phi, double *Den, double *Aq, double *Bq, int Np){ + dvc_ScaLBL_PhaseField_Init<<>>(Map, Phi, Den, Aq, Bq, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_ColorGrad: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_AAeven_ColorMomentum(double *dist, double *Den, double *Vel, + double *ColorGrad, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int start, int finish, int Np){ + + cudaFuncSetCacheConfig(dvc_ScaLBL_D3Q19_AAeven_ColorMomentum, cudaFuncCachePreferL1); + + dvc_ScaLBL_D3Q19_AAeven_ColorMomentum<<>>(dist, Den, Vel, ColorGrad, rhoA, rhoB, tauA, tauB, + alpha, beta, Fx, Fy, Fz, start, finish, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_ColorMomentum: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_AAodd_ColorMomentum(int *d_neighborList, double *dist, double *Den, double *Vel, + double *ColorGrad, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, + double Fx, double Fy, double Fz, int start, int finish, int Np){ + + cudaFuncSetCacheConfig(dvc_ScaLBL_D3Q19_AAodd_ColorMomentum, cudaFuncCachePreferL1); + + dvc_ScaLBL_D3Q19_AAodd_ColorMomentum<<>>(d_neighborList, dist, Den, Vel, ColorGrad, + rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, start, finish, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAodd_ColorMomentum: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_AAeven_ColorMass(double *Aq, double *Bq, double *Den, double *Vel, + double *ColorGrad, double beta, int start, int finish, int Np){ + + cudaFuncSetCacheConfig(dvc_ScaLBL_D3Q19_AAeven_ColorMass, cudaFuncCachePreferL1); + + dvc_ScaLBL_D3Q19_AAeven_ColorMass<<>>(Aq, Bq, Den, Vel, ColorGrad, beta, start, finish, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_Color: %s \n",cudaGetErrorString(err)); + } + +} + +extern "C" void ScaLBL_D3Q19_AAodd_ColorMass(int *d_neighborList, double *Aq, double *Bq, double *Den, double *Vel, + double *ColorGrad, double beta, int start, int finish, int Np){ + + cudaFuncSetCacheConfig(dvc_ScaLBL_D3Q19_AAodd_ColorMass, cudaFuncCachePreferL1); + + dvc_ScaLBL_D3Q19_AAodd_ColorMass<<>>(d_neighborList, Aq, Bq, Den, Vel, ColorGrad, beta, start, finish, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAodd_Color: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_Color_BC(int *list, int *Map, double *Phi, double *Den, double vA, double vB, int count, int Np){ + int GRID = count / 512 + 1; + dvc_ScaLBL_Color_BC<<>>(list, Map, Phi, Den, vA, vB, count, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_Color_BC: %s \n",cudaGetErrorString(err)); + } +} + + diff --git a/gpu/CudaExtras.cu b/gpu/CudaExtras.cu index 4a31a466..86ec713d 100644 --- a/gpu/CudaExtras.cu +++ b/gpu/CudaExtras.cu @@ -1,20 +1,34 @@ // Basic cuda functions callable from C/C++ code #include -extern "C" void dvc_ScaLBL_ScaLBL_ScaLBL_AllocateDeviceMemory(void** address, size_t size){ +extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size){ cudaMalloc(address,size); cudaMemset(*address,0,size); } -extern "C" void dvc_ScaLBL_ScaLBL_CopyToDevice(void* dest, void* source, size_t size){ +extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size){ cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice); } -extern "C" void dvc_ScaLBL_CopyToHost(void* dest, void* source, size_t size){ +extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size){ cudaMemcpy(dest,source,size,cudaMemcpyDeviceToHost); } extern "C" void dvc_Barrier(){ cudaDeviceSynchronize(); } +/* +#if __CUDA_ARCH__ < 600 +__device__ double atomicAdd(double* address, double val) { +unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } + while (assumed != old); return __longlong_as_double(old); +} + +#endif +*/ \ No newline at end of file diff --git a/gpu/D3Q19.cu b/gpu/D3Q19.cu index 3e78b70d..9bef4971 100644 --- a/gpu/D3Q19.cu +++ b/gpu/D3Q19.cu @@ -1,64 +1,73 @@ #include #define NBLOCKS 1024 -#define NTHREADS 128 +#define NTHREADS 256 + +/* +1. constants that are known at compile time should be defined using preprocessor macros (e.g. #define) or via C/C++ const variables at global/file scope. +2. Usage of __constant__ memory may be beneficial for programs who use certain values that don't change for the duration of the kernel and for which certain access patterns are present (e.g. all threads access the same value at the same time). This is not better or faster than constants that satisfy the requirements of item 1 above. +3. If the number of choices to be made by a program are relatively small in number, and these choices affect kernel execution, one possible approach for additional compile-time optimization would be to use templated code/kernels + */ + +__constant__ __device__ double mrt_V1=0.05263157894736842; +__constant__ __device__ double mrt_V2=0.012531328320802; +__constant__ __device__ double mrt_V3=0.04761904761904762; +__constant__ __device__ double mrt_V4=0.004594820384294068; +__constant__ __device__ double mrt_V5=0.01587301587301587; +__constant__ __device__ double mrt_V6=0.0555555555555555555555555; +__constant__ __device__ double mrt_V7=0.02777777777777778; +__constant__ __device__ double mrt_V8=0.08333333333333333; +__constant__ __device__ double mrt_V9=0.003341687552213868; +__constant__ __device__ double mrt_V10=0.003968253968253968; +__constant__ __device__ double mrt_V11=0.01388888888888889; +__constant__ __device__ double mrt_V12=0.04166666666666666; // functionality for parallel reduction in Flux BC routines -- probably should be re-factored to another location // functions copied from https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ -__device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - - unsigned long long int old = *address_as_ull, assumed; - - do{ assumed = old; - old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val +__longlong_as_double(assumed))); - } while (assumed != old); - - return __longlong_as_double(old); -} +//__shared__ double Transform[722]= +// {}; __inline__ __device__ double warpReduceSum(double val) { - for (int offset = warpSize/2; offset > 0; offset /= 2) - val += __shfl_down(val, offset); - return val; + for (int offset = warpSize/2; offset > 0; offset /= 2) + val += __shfl_down(val, offset); + return val; } __inline__ __device__ double blockReduceSum(double val) { - static __shared__ double shared[32]; // Shared mem for 32 partial sums - int lane = threadIdx.x % warpSize; - int wid = threadIdx.x / warpSize; + static __shared__ double shared[32]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; - val = warpReduceSum(val); // Each warp performs partial reduction + val = warpReduceSum(val); // Each warp performs partial reduction - if (lane==0) shared[wid]=val; // Write reduced value to shared memory + if (lane==0) shared[wid]=val; // Write reduced value to shared memory - __syncthreads(); // Wait for all partial reductions + __syncthreads(); // Wait for all partial reductions - //read from shared memory only if that warp existed - val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; - if (wid==0) val = warpReduceSum(val); //Final reduce within first warp + if (wid==0) val = warpReduceSum(val); //Final reduce within first warp - return val; + return val; } __global__ void deviceReduceKernel(double *in, double* out, int N) { - double sum = 0; - //reduce multiple elements per thread - for (int i = blockIdx.x * blockDim.x + threadIdx.x; - i < N; - i += blockDim.x * gridDim.x) { - sum += in[i]; - } - sum = blockReduceSum(sum); - if (threadIdx.x==0) - out[blockIdx.x]=sum; + double sum = 0; + //reduce multiple elements per thread + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < N; + i += blockDim.x * gridDim.x) { + sum += in[i]; + } + sum = blockReduceSum(sum); + if (threadIdx.x==0) + out[blockIdx.x]=sum; } __global__ void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N){ @@ -71,13 +80,15 @@ __global__ void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, d if (idx 0 ){ - f_even[n] = 0.3333333333333333; - f_odd[n] = 0.055555555555555555; //double(100*n)+1.f; - f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f; - f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f; - f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f; - f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f; - f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f; - f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f; - f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f; - f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f; - f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f; - f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f; - f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f; - f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f; - f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f; - f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f; - f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f; - f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f; - f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f; - } - else{ - for(int q=0; q<9; q++){ - f_even[q*N+n] = -1.0; - f_odd[q*N+n] = -1.0; + id = ID[n]; + if (id > 0 ){ + f_even[n] = 0.3333333333333333; + f_odd[n] = 0.055555555555555555; //double(100*n)+1.f; + f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f; + f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f; + f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f; + f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f; + f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f; + f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f; + f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f; + f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f; + f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f; + f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f; + f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f; + f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f; + f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f; + f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f; + f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f; + f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f; + f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f; + } + else{ + for(int q=0; q<9; q++){ + f_even[q*N+n] = -1.0; + f_odd[q*N+n] = -1.0; + } + f_even[9*N+n] = -1.0; } - f_even[9*N+n] = -1.0; - } } } } +__global__ void dvc_ScaLBL_D3Q19_AA_Init(double *f_even, double *f_odd, int Np) +{ + int n; + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s 10Np => odd part of dist) + fq = dist[nread]; // reading the f1 data into register fq + //fp = dist[10*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jx = fq; + m4 = -4.0*fq; + m9 = 2.0*fq; + m10 = -4.0*fq; + + // f2 = dist[10*Np+n]; + nread = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nread]; // reading the f2 data into register fq + //fq = dist[Np+n]; + rho += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + nread = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nread]; + //fq = dist[11*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + nread = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nread]; + //fq = dist[2*Np+n]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + nread = neighborList[n+4*Np]; + fq = dist[nread]; + //fq = dist[12*Np+n]; + rho += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + + // q = 6 + nread = neighborList[n+5*Np]; + fq = dist[nread]; + //fq = dist[3*Np+n]; + rho+= fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + nread = neighborList[n+6*Np]; + fq = dist[nread]; + //fq = dist[13*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + nread = neighborList[n+7*Np]; + fq = dist[nread]; + //fq = dist[4*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + nread = neighborList[n+8*Np]; + fq = dist[nread]; + //fq = dist[14*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + nread = neighborList[n+9*Np]; + fq = dist[nread]; + //fq = dist[5*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + nread = neighborList[n+10*Np]; + fq = dist[nread]; + //fq = dist[15*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + nread = neighborList[n+11*Np]; + fq = dist[nread]; + //fq = dist[6*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + nread = neighborList[n+12*Np]; + fq = dist[nread]; + //fq = dist[16*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + nread = neighborList[n+13*Np]; + fq = dist[nread]; + //fq = dist[7*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + + // q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + //fq = dist[17*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + //fq = dist[8*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + //fq = dist[18*Np+n]; + nread = neighborList[n+16*Np]; + fq = dist[nread]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + //fq = dist[9*Np+n]; + rho += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + + //..............incorporate external force................................................ + //..............carry out relaxation process............................................... + m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho - 11*rho) - m1); + m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho) - m2); + m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4); + m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6); + m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8); + m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho) - m9); + m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/rho) - m10); + m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho) - m11); + m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/rho) - m12); + m13 = m13 + rlx_setA*((jx*jy/rho) - m13); + m14 = m14 + rlx_setA*((jy*jz/rho) - m14); + m15 = m15 + rlx_setA*((jx*jz/rho) - m15); + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //....................................................................................................... + //.................inverse transformation...................................................... + + // q=0 + fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; + dist[n] = fq; + + // q = 1 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10)+0.16666666*Fx; + nread = neighborList[n+Np]; + dist[nread] = fq; + + // q=2 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; + nread = neighborList[n]; + dist[nread] = fq; + + // q = 3 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; + nread = neighborList[n+3*Np]; + dist[nread] = fq; + + // q = 4 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; + nread = neighborList[n+2*Np]; + dist[nread] = fq; + + // q = 5 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; + nread = neighborList[n+5*Np]; + dist[nread] = fq; + + // q = 6 + fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; + nread = neighborList[n+4*Np]; + dist[nread] = fq; + + // q = 7 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+ + mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); + + nread = neighborList[n+7*Np]; + dist[nread] = fq; + + // q = 8 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11 + +mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); + nread = neighborList[n+6*Np]; + dist[nread] = fq; + + // q = 9 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+ + mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); + nread = neighborList[n+9*Np]; + dist[nread] = fq; + + // q = 10 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+ + mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); + nread = neighborList[n+8*Np]; + dist[nread] = fq; + + // q = 11 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); + nread = neighborList[n+11*Np]; + dist[nread] = fq; + + // q = 12 + fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+ + mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); + nread = neighborList[n+10*Np]; + dist[nread]= fq; + + // q = 13 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); + nread = neighborList[n+13*Np]; + dist[nread] = fq; + + // q= 14 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4) + +mrt_V7*m9+mrt_V11*m10-mrt_V8*m11 + -mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); + nread = neighborList[n+12*Np]; + dist[nread] = fq; + + + // q = 15 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8) + -mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + + // q = 17 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8) + -mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + fq = mrt_V1*rho+mrt_V9*m1 + +mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6) + -mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + + } + } +} + + +//__launch_bounds__(512,1) +__global__ void +dvc_ScaLBL_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz) { + + int n; + double fq; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s 0){ - //.......Back out the 3-D indices for node n.............. - k = n/(Nx*Ny); - j = (n-Nx*Ny*k)/Nx; - i = n-Nx*Ny*k-Nx*j; - //........................................................................ - // Retrieve even distributions from the local node (swap convention) - // f0 = disteven[n]; // Does not particupate in streaming - f1 = distodd[n]; - f3 = distodd[N+n]; - f5 = distodd[2*N+n]; - f7 = distodd[3*N+n]; - f9 = distodd[4*N+n]; - f11 = distodd[5*N+n]; - f13 = distodd[6*N+n]; - f15 = distodd[7*N+n]; - f17 = distodd[8*N+n]; - //........................................................................ - - //........................................................................ - // Retrieve odd distributions from neighboring nodes (swap convention) - //........................................................................ - nn = n+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[n] = f2; - disteven[N+nn] = f1; - } - //} - //........................................................................ - nn = n+Nx; // neighbor index (pull convention) - if (!(j+1 0.0){ - distodd[N+n] = f4; - disteven[2*N+nn] = f3; - // } - } - //........................................................................ - nn = n+Nx*Ny; // neighbor index (pull convention) - if (!(k+1 0.0){ - distodd[2*N+n] = f6; - disteven[3*N+nn] = f5; - // } - } - //........................................................................ - nn = n+Nx+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[3*N+n] = f8; - disteven[4*N+nn] = f7; - // } - } - //........................................................................ - nn = n-Nx+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[4*N+n] = f10; - disteven[5*N+nn] = f9; - // } - } - //........................................................................ - nn = n+Nx*Ny+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[5*N+n] = f12; - disteven[6*N+nn] = f11; - // } - } - //........................................................................ - nn = n-Nx*Ny+1; // neighbor index (pull convention) - if (!(i+1 0.0){ - distodd[6*N+n] = f14; - disteven[7*N+nn] = f13; - // } - } - //........................................................................ - nn = n+Nx*Ny+Nx; // neighbor index (pull convention) - if (!(j+1 0.0){ - distodd[7*N+n] = f16; - disteven[8*N+nn] = f15; - // } - } - //........................................................................ - nn = n-Nx*Ny+Nx; // neighbor index (pull convention) - if (!(j+1 0.0){ + id = ID[n]; + if (id > 0){ + //.......Back out the 3-D indices for node n.............. + k = n/(Nx*Ny); + j = (n-Nx*Ny*k)/Nx; + i = n-Nx*Ny*k-Nx*j; + //........................................................................ + // Retrieve even distributions from the local node (swap convention) + // f0 = disteven[n]; // Does not particupate in streaming + f1 = distodd[n]; + f3 = distodd[N+n]; + f5 = distodd[2*N+n]; + f7 = distodd[3*N+n]; + f9 = distodd[4*N+n]; + f11 = distodd[5*N+n]; + f13 = distodd[6*N+n]; + f15 = distodd[7*N+n]; + f17 = distodd[8*N+n]; + //........................................................................ + + //........................................................................ + // Retrieve odd distributions from neighboring nodes (swap convention) + //........................................................................ + nn = n+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[n] = f2; + disteven[N+nn] = f1; + } + //} + //........................................................................ + nn = n+Nx; // neighbor index (pull convention) + if (!(j+1 0.0){ + distodd[N+n] = f4; + disteven[2*N+nn] = f3; + // } + } + //........................................................................ + nn = n+Nx*Ny; // neighbor index (pull convention) + if (!(k+1 0.0){ + distodd[2*N+n] = f6; + disteven[3*N+nn] = f5; + // } + } + //........................................................................ + nn = n+Nx+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[3*N+n] = f8; + disteven[4*N+nn] = f7; + // } + } + //........................................................................ + nn = n-Nx+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[4*N+n] = f10; + disteven[5*N+nn] = f9; + // } + } + //........................................................................ + nn = n+Nx*Ny+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[5*N+n] = f12; + disteven[6*N+nn] = f11; + // } + } + //........................................................................ + nn = n-Nx*Ny+1; // neighbor index (pull convention) + if (!(i+1 0.0){ + distodd[6*N+n] = f14; + disteven[7*N+nn] = f13; + // } + } + //........................................................................ + nn = n+Nx*Ny+Nx; // neighbor index (pull convention) + if (!(j+1 0.0){ + distodd[7*N+n] = f16; + disteven[8*N+nn] = f15; + // } + } + //........................................................................ + nn = n-Nx*Ny+Nx; // neighbor index (pull convention) + if (!(j+1 0.0){ distodd[8*N+n] = f18; - disteven[9*N+nn] = f17; - // } + disteven[9*N+nn] = f17; + // } + } + //........................................................................ + } - //........................................................................ - - } } } } -__global__ void dvc_ScaLBL_D3Q19_Velocity(char *ID, double *disteven, double *distodd, double *vel, int Nx, int Ny, int Nz) +__global__ void dvc_ScaLBL_D3Q19_Momentum(double *dist, double *vel, int N) { - int n,N; + int n; // distributions double f1,f2,f3,f4,f5,f6,f7,f8,f9; double f10,f11,f12,f13,f14,f15,f16,f17,f18; double vx,vy,vz; char id; - N = Nx*Ny*Nz; int S = N/NBLOCKS/NTHREADS + 1; for (int s=0; s 0){ + //........................................................................ - f2 = distodd[n]; - f4 = distodd[N+n]; - f6 = distodd[2*N+n]; - f8 = distodd[3*N+n]; - f10 = distodd[4*N+n]; - f12 = distodd[5*N+n]; - //f14 = distodd[6*N+n]; - f16 = distodd[7*N+n]; - //f18 = distodd[8*N+n]; + f1 = distodd[n]; + f3 = distodd[N+n]; +// f5 = distodd[2*N+n]; + f7 = distodd[3*N+n]; + f9 = distodd[4*N+n]; +// f11 = distodd[5*N+n]; + f13 = distodd[6*N+n]; +// f15 = distodd[7*N+n]; + f17 = distodd[8*N+n]; //........................................................................ f0 = disteven[n]; - f1 = disteven[N+n]; - f3 = disteven[2*N+n]; - //f5 = disteven[3*N+n]; - f7 = disteven[4*N+n]; - f9 = disteven[5*N+n]; - //f11 = disteven[6*N+n]; - f13 = disteven[7*N+n]; - //f15 = disteven[8*N+n]; - f17 = disteven[9*N+n]; + f2 = disteven[N+n]; + f4 = disteven[2*N+n]; + f6 = disteven[3*N+n]; + f8 = disteven[4*N+n]; + f10 = disteven[5*N+n]; + f12 = disteven[6*N+n]; +// f14 = disteven[7*N+n]; + f16 = disteven[8*N+n]; +// f18 = disteven[9*N+n]; //................................................... // compute local sum to determine the density value to set pressure //sum = (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f6+f12+f13+f16+f17))/(A*(1.0-flux)); sum = factor*(f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f6+f12+f13+f16+f17)); //localsum[n]=sum; - } } - //atomicAdd(dvcsum, sum); - - //sum = warpReduceSum(sum); //if (threadIdx.x & (warpSize-1) == 0 ){ // atomicAdd(dvcsum,sum); @@ -609,8 +2161,8 @@ __global__ void dvc_D3Q19_Flux_BC_z(char *ID, double *disteven, double *distodd, atomicAdd(dvcsum, sum); } -__global__ void dvc_D3Q19_Flux_BC_Z(char *ID, double *disteven, double *distodd, double flux, double area, double *dvcsum, - int Nx, int Ny, int Nz, int outlet){ +__global__ void dvc_D3Q19_Flux_BC_Z(double *disteven, double *distodd, double flux, double *dvcsum, + int Nx, int Ny, int Nz, int outlet){ int n,N; // distributions double f0,f1,f2,f3,f4,f5,f7,f8,f9; @@ -619,207 +2171,91 @@ __global__ void dvc_D3Q19_Flux_BC_Z(char *ID, double *disteven, double *distodd, N = Nx*Ny*Nz; n = outlet + blockIdx.x*blockDim.x + threadIdx.x; - double factor = 1.f/area; - + double factor = 1.f/(double(Nx*Ny)*(1.0+flux)); double sum = 0.f; // Loop over the boundary - threadblocks delineated by start...finish if ( n0){ - //........................................................................ + //........................................................................ // Read distributions from "opposite" memory convention //........................................................................ - f2 = distodd[n]; - f4 = distodd[N+n]; - //f6 = distodd[2*N+n]; - f8 = distodd[3*N+n]; - f10 = distodd[4*N+n]; - //f12 = distodd[5*N+n]; - f14 = distodd[6*N+n]; - //f16 = distodd[7*N+n]; - f18 = distodd[8*N+n]; + f1 = distodd[n]; + f3 = distodd[N+n]; + f5 = distodd[2*N+n]; + f7 = distodd[3*N+n]; + f9 = distodd[4*N+n]; + f11 = distodd[5*N+n]; +// f13 = distodd[6*N+n]; + f15 = distodd[7*N+n]; +// f17 = distodd[8*N+n]; //........................................................................ f0 = disteven[n]; - f1 = disteven[N+n]; - f3 = disteven[2*N+n]; - f5 = disteven[3*N+n]; - f7 = disteven[4*N+n]; - f9 = disteven[5*N+n]; - f11 = disteven[6*N+n]; - //f13 = disteven[7*N+n]; - f15 = disteven[8*N+n]; - //f17 = disteven[9*N+n]; + f2 = disteven[N+n]; + f4 = disteven[2*N+n]; +// f6 = disteven[3*N+n]; + f8 = disteven[4*N+n]; + f10 = disteven[5*N+n]; +// f12 = disteven[6*N+n]; + f14 = disteven[7*N+n]; +// f16 = disteven[8*N+n]; + f18 = disteven[9*N+n]; // Local sum (based on the consistency condition) //sum = (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f5+f11+f14+f15+f18))/(A*(1.0+flux)); sum = factor*(f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f5+f11+f14+f15+f18)); //localsum[n]=sum; - } } - //sum = warpReduceSum(sum); -// if (threadIdx.x & (warpSize-1) == 0 ){ - // atomicAdd(dvcsum,sum); -// } - sum = blockReduceSum(sum); if (threadIdx.x==0) atomicAdd(dvcsum, sum); } - -__global__ void dvc_ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, double din, - int Nx, int Ny, int Nz) +__global__ void dvc_ScaLBL_D3Q19_Init_Simple(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz) { int n,N; - // distributions - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; - double ux,uy,uz,Cyz,Cxz; - ux=uy=0.0; - N = Nx*Ny*Nz; - n = Nx*Ny + blockIdx.x*blockDim.x + threadIdx.x; - - if (n < 2*Nx*Ny){ - - //........................................................................ - // Read distributions from "opposite" memory convention - //........................................................................ - //........................................................................ - f2 = distodd[n]; - f4 = distodd[N+n]; - f6 = distodd[2*N+n]; - f8 = distodd[3*N+n]; - f10 = distodd[4*N+n]; - f12 = distodd[5*N+n]; - f14 = distodd[6*N+n]; - f16 = distodd[7*N+n]; - f18 = distodd[8*N+n]; - //........................................................................ - f0 = disteven[n]; - f1 = disteven[N+n]; - f3 = disteven[2*N+n]; - f5 = disteven[3*N+n]; - f7 = disteven[4*N+n]; - f9 = disteven[5*N+n]; - f11 = disteven[6*N+n]; - f13 = disteven[7*N+n]; - f15 = disteven[8*N+n]; - f17 = disteven[9*N+n]; - //................................................... - //........Determine the inlet flow velocity......... - // uz = -1 + (f0+f3+f4+f1+f2+f7+f8+f10+f9 - // + 2*(f5+f15+f18+f11+f14))/din; - //........Set the unknown distributions.............. - // f6 = f5 - 0.3333333333333333*din*uz; - // f16 = f15 - 0.1666666666666667*din*uz; - // f17 = f16 - f3 + f4-f15+f18-f7+f8-f10+f9; - // f12= 0.5*(-din*uz+f5+f15+f18+f11+f14-f6-f16- - // f17+f1-f2-f14+f11+f7-f8-f10+f9); - // f13= -din*uz+f5+f15+f18+f11+f14-f6-f16-f17-f12; - // Determine the inlet flow velocity - //ux = (f1-f2+f7-f8+f9-f10+f11-f12+f13-f14); - //uy = (f3-f4+f7-f8-f9+f10+f15-f16+f17-f18); - uz = din - (f0+f1+f2+f3+f4+f7+f8+f9+f10 + 2*(f6+f12+f13+f16+f17)); - - Cxz = 0.5*(f1+f7+f9-f2-f10-f8) - 0.3333333333333333*ux; - Cyz = 0.5*(f3+f7+f10-f4-f9-f8) - 0.3333333333333333*uy; - - f5 = f6 + 0.33333333333333338*uz; - f11 = f12 + 0.16666666666666678*(uz+ux)-Cxz; - f14 = f13 + 0.16666666666666678*(uz-ux)+Cxz; - f15 = f16 + 0.16666666666666678*(uy+uz)-Cyz; - f18 = f17 + 0.16666666666666678*(uz-uy)+Cyz; - //........Store in "opposite" memory location.......... - disteven[3*N+n] = f5; - disteven[6*N+n] = f11; - distodd[6*N+n] = f14; - disteven[8*N+n] = f15; - distodd[8*N+n] = f18; + char id; + int S = N/NBLOCKS/NTHREADS + 1; + for (int s=0; s 0 ){ + f_even[n] = 0 + 0.01*0; + f_odd[n] = 0+ 0.01*1; //double(100*n)+1.f; + f_even[N+n] = 1+ 0.01*2; //double(100*n)+2.f; + f_odd[N+n] = 1+ 0.01*3; //double(100*n)+3.f; + f_even[2*N+n] = 2+ 0.01*4; //double(100*n)+4.f; + f_odd[2*N+n] = 2+ 0.01*5; //double(100*n)+5.f; + f_even[3*N+n] = 3+ 0.01*6; //double(100*n)+6.f; + f_odd[3*N+n] = 3+ 0.01*7; //double(100*n)+7.f; + f_even[4*N+n] = 4+ 0.01*8; //double(100*n)+8.f; + f_odd[4*N+n] = 4+ 0.01*9; //double(100*n)+9.f; + f_even[5*N+n] = 5+ 0.01*10; //double(100*n)+10.f; + f_odd[5*N+n] = 5+ 0.01*11; //double(100*n)+11.f; + f_even[6*N+n] = 6+ 0.01*12; //double(100*n)+12.f; + f_odd[6*N+n] = 6+ 0.01*13; //double(100*n)+13.f; + f_even[7*N+n] = 7+ 0.01*14; //double(100*n)+14.f; + f_odd[7*N+n] = 7+ 0.01*15; //double(100*n)+15.f; + f_even[8*N+n] = 8+ 0.01*16; //double(100*n)+16.f; + f_odd[8*N+n] = 8+ 0.01*17; //double(100*n)+17.f; + f_even[9*N+n] = 9+ 0.01*18; //double(100*n)+18.f; + } + else{ + for(int q=0; q<9; q++){ + f_even[q*N+n] = -1.0; + f_odd[q*N+n] = -1.0; + } + f_even[9*N+n] = -1.0; + } } } - -__global__ void dvc_ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, - int Nx, int Ny, int Nz, int outlet) -{ - int n,N; - // distributions - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; - double ux,uy,uz,Cyz,Cxz; - ux=uy=0.0; - - N = Nx*Ny*Nz; - n = outlet + blockIdx.x*blockDim.x + threadIdx.x; - - // Loop over the boundary - threadblocks delineated by start...finish - if ( n>>(q, list, start, count, sendbuf, dist, N); } + extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){ int GRID = count / 512 + 1; dvc_ScaLBL_D3Q19_Unpack <<>>(q, list, start, count, recvbuf, dist, N); } //************************************************************************* -extern "C" void ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz){ - dvc_ScaLBL_D3Q19_Init<<>>(ID, f_even, f_odd, Nx, Ny, Nz); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); - } +extern "C" void ScaLBL_D3Q19_AA_Init(double *f_even, double *f_odd, int Np){ + dvc_ScaLBL_D3Q19_AA_Init<<>>(f_even, f_odd, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AA_Init: %s \n",cudaGetErrorString(err)); + } } + +extern "C" void ScaLBL_D3Q19_Init(double *dist, int Np){ + dvc_ScaLBL_D3Q19_Init<<>>(dist, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AA_Init: %s \n",cudaGetErrorString(err)); + } +} + extern "C" void ScaLBL_D3Q19_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz){ dvc_ScaLBL_D3Q19_Swap<<>>(ID, disteven, distodd, Nx, Ny, Nz); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_Swap: %s \n",cudaGetErrorString(err)); - } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Swap: %s \n",cudaGetErrorString(err)); + } } extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, double *distodd, int Np) { const int Q = 9; -// cudaStream_t streams[Q]; + // cudaStream_t streams[Q]; // Launch the swap operation as different kernels for (int q=0; q>>(neighborList, disteven, distodd, Np, q); @@ -870,14 +2316,37 @@ extern "C" void ScaLBL_D3Q19_Swap_Compact(int *neighborList, double *disteven, d } } -extern "C" void ScaLBL_D3Q19_Velocity(char *ID, double *disteven, double *distodd, double *vel, int Nx, int \ -Ny, int Nz){ - - dvc_ScaLBL_D3Q19_Velocity<<>>(ID, disteven, distodd, vel, Nx, Ny, Nz); +extern "C" void ScaLBL_D3Q19_AAeven_Compact(char * ID, double *d_dist, int Np) { + cudaFuncSetCacheConfig(dvc_ScaLBL_AAeven_Compact, cudaFuncCachePreferL1); + dvc_ScaLBL_AAeven_Compact<<>>(ID, d_dist, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); + } } + +extern "C" void ScaLBL_D3Q19_AAodd_Compact(char * ID, int *d_neighborList, double *d_dist, int Np) { + cudaFuncSetCacheConfig(dvc_ScaLBL_AAodd_Compact, cudaFuncCachePreferL1); + dvc_ScaLBL_AAodd_Compact<<>>(ID,d_neighborList, d_dist,Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_Momentum(double *dist, double *vel, int Np){ + + dvc_ScaLBL_D3Q19_Momentum<<>>(dist, vel, Np); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Velocity: %s \n",cudaGetErrorString(err)); + } +} + extern "C" void ScaLBL_D3Q19_Pressure(char *ID, double *disteven, double *distodd, double *Pressure, - int Nx, int Ny, int Nz){ - dvc_ScaLBL_D3Q19_Pressure<<< NBLOCKS,NTHREADS >>>(ID, disteven, distodd, Pressure, Nx, Ny, Nz); + int Nx, int Ny, int Nz){ + dvc_ScaLBL_D3Q19_Pressure<<< NBLOCKS,NTHREADS >>>(disteven, distodd, Pressure, Nx, Ny, Nz); } extern "C" void ScaLBL_D3Q19_Velocity_BC_z(double *disteven, double *distodd, double uz,int Nx, int Ny, int Nz){ @@ -890,7 +2359,7 @@ extern "C" void ScaLBL_D3Q19_Velocity_BC_Z(double *disteven, double *distodd, do dvc_D3Q19_Velocity_BC_Z<<>>(disteven, distodd, uz, Nx, Ny, Nz, outlet); } -extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux, double area, int Nx, int Ny, int Nz){ +extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, double flux,int Nx, int Ny, int Nz){ int GRID = Nx*Ny / 512 + 1; @@ -907,7 +2376,7 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, doub cudaMemset(dvcsum,0,sizeof(double)*Nx*Ny); // compute the local flux and store the result - dvc_D3Q19_Flux_BC_z<<>>(disteven, distodd, flux, area, dvcsum, Nx, Ny, Nz); + dvc_D3Q19_Flux_BC_z<<>>(disteven, distodd, flux, dvcsum, Nx, Ny, Nz); // Now read the total flux cudaMemcpy(&sum[0],dvcsum,sizeof(double),cudaMemcpyDeviceToHost); @@ -919,7 +2388,89 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, doub return din; } -extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, double flux, double area, int Nx, int Ny, int Nz, int outlet){ + +extern "C" void ScaLBL_D3Q19_AAeven_Pressure_BC_z(int *list, double *dist, double din, int count, int N){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q19_AAeven_Pressure_BC_z<<>>(list, dist, din, count, N); +} + +extern "C" void ScaLBL_D3Q19_AAeven_Pressure_BC_Z(int *list, double *dist, double dout, int count, int N){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q19_AAeven_Pressure_BC_Z<<>>(list, dist, dout, count, N); +} + +extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_z(int *neighborList, int *list, double *dist, double din, int count, int N){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q19_AAodd_Pressure_BC_z<<>>(neighborList, list, dist, din, count, N); +} + +extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_Z(int *neighborList, int *list, double *dist, double dout, int count, int N){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q19_AAodd_Pressure_BC_Z<<>>(neighborList, list, dist, dout, count, N); +} + + +extern "C" double ScaLBL_D3Q19_AAeven_Flux_BC_z(int *list, double *dist, double flux, double area, + int count, int N){ + + int GRID = count / 512 + 1; + + // IMPORTANT -- this routine may fail if Nx*Ny > 512*512 + if (count > 512*512){ + printf("WARNING (ScaLBL_D3Q19_Flux_BC_Z): CUDA reduction operation may fail if count > 512*512"); + } + + // Allocate memory to store the sums + double din; + double sum[1]; + double *dvcsum; + cudaMalloc((void **)&dvcsum,sizeof(double)*count); + cudaMemset(dvcsum,0,sizeof(double)*count); + + // compute the local flux and store the result + dvc_ScaLBL_D3Q19_AAeven_Flux_BC_z<<>>(list, dist, flux, area, dvcsum, count, N); + + // Now read the total flux + cudaMemcpy(&sum[0],dvcsum,sizeof(double),cudaMemcpyDeviceToHost); + din=sum[0]; + + // free the memory needed for reduction + cudaFree(dvcsum); + + return din; +} + +extern "C" double ScaLBL_D3Q19_AAodd_Flux_BC_z(int *neighborList, int *list, double *dist, double flux, + double area, int count, int N){ + + int GRID = count / 512 + 1; + + // IMPORTANT -- this routine may fail if Nx*Ny > 512*512 + if (count > 512*512){ + printf("WARNING (ScaLBL_D3Q19_Flux_BC_Z): CUDA reduction operation may fail if count > 512*512"); + } + + // Allocate memory to store the sums + double din; + double sum[1]; + double *dvcsum; + cudaMalloc((void **)&dvcsum,sizeof(double)*count); + cudaMemset(dvcsum,0,sizeof(double)*count); + + // compute the local flux and store the result + dvc_ScaLBL_D3Q19_AAodd_Flux_BC_z<<>>(neighborList, list, dist, flux, area, dvcsum, count, N); + + // Now read the total flux + cudaMemcpy(&sum[0],dvcsum,sizeof(double),cudaMemcpyDeviceToHost); + din=sum[0]; + + // free the memory needed for reduction + cudaFree(dvcsum); + + return din; +} + +extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, double flux, int Nx, int Ny, int Nz, int outlet){ int GRID = Nx*Ny / 512 + 1; @@ -936,7 +2487,7 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, doub cudaMemset(dvcsum,0,sizeof(double)*Nx*Ny); // compute the local flux and store the result - dvc_D3Q19_Flux_BC_Z<<>>(disteven, distodd, flux, area, dvcsum, Nx, Ny, Nz, outlet); + dvc_D3Q19_Flux_BC_Z<<>>(disteven, distodd, flux, dvcsum, Nx, Ny, Nz, outlet); // Now read the total flux cudaMemcpy(&sum[0],dvcsum,sizeof(double),cudaMemcpyDeviceToHost); @@ -951,23 +2502,41 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, doub } - extern "C" double deviceReduce(double *in, double* out, int N) { - int threads = 512; - int blocks = min((N + threads - 1) / threads, 1024); + int threads = 512; + int blocks = min((N + threads - 1) / threads, 1024); - double sum = 0.f; - deviceReduceKernel<<>>(in, out, N); - deviceReduceKernel<<<1, 1024>>>(out, out, blocks); - return sum; + double sum = 0.f; + deviceReduceKernel<<>>(in, out, N); + deviceReduceKernel<<<1, 1024>>>(out, out, blocks); + return sum; } -extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, double din, int Nx, int Ny, int Nz){ - int GRID = Nx*Ny / 512 + 1; - dvc_ScaLBL_D3Q19_Pressure_BC_z<<>>(disteven, distodd, din, Nx, Ny, Nz); +// +//extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(int *list, double *dist, double dout, int count, int Np){ +// int GRID = count / 512 + 1; +// dvc_ScaLBL_D3Q19_Pressure_BC_Z<<>>(disteven, distodd, dout, Nx, Ny, Nz, outlet); +//} + +extern "C" void ScaLBL_D3Q19_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, + double Fy, double Fz){ + + dvc_ScaLBL_AAeven_MRT<<>>(dist,start,finish,Np,rlx_setA,rlx_setB,Fx,Fy,Fz); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_MRT: %s \n",cudaGetErrorString(err)); + } } -extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, - int Nx, int Ny, int Nz, int outlet){ - int GRID = Nx*Ny / 512 + 1; - dvc_ScaLBL_D3Q19_Pressure_BC_Z<<>>(disteven, distodd, dout, Nx, Ny, Nz, outlet); + +extern "C" void ScaLBL_D3Q19_AAodd_MRT(int *neighborlist, double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, + double Fy, double Fz){ + + dvc_ScaLBL_AAodd_MRT<<>>(neighborlist,dist,start,finish,Np,rlx_setA,rlx_setB,Fx,Fy,Fz); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_MRT: %s \n",cudaGetErrorString(err)); + } } + diff --git a/gpu/D3Q7.cu b/gpu/D3Q7.cu index d74d4c17..16863fec 100644 --- a/gpu/D3Q7.cu +++ b/gpu/D3Q7.cu @@ -1,6 +1,6 @@ // GPU Functions for D3Q7 Lattice Boltzmann Methods -#define NBLOCKS 32 +#define NBLOCKS 560 #define NTHREADS 128 __global__ void dvc_ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){ @@ -59,6 +59,26 @@ __global__ void dvc_ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, } } +__global__ void dvc_ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count, + double *recvbuf, double *dist, int N){ + //.................................................................................... + // Unpack distribution from the recv buffer + // Distribution q matche Cqx, Cqy, Cqz + // swap rule means that the distributions in recvbuf are OPPOSITE of q + // dist may be even or odd distributions stored by stream layout + //.................................................................................... + int n,idx; + idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx>>(q, list, start, count, recvbuf, dist, N); +} + extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){ int GRID = count / 512 + 1; dvc_ScaLBL_Scalar_Pack <<>>(list, count, sendbuf, Data, N); diff --git a/gpu/Extras.cu b/gpu/Extras.cu index 2d27a689..2803dc91 100644 --- a/gpu/Extras.cu +++ b/gpu/Extras.cu @@ -10,9 +10,8 @@ extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){ } } -extern "C" void ScaLBL_FreeDeviceMemory(void* address){ - if ( address != NULL ) - cudaFree( address ); +extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){ + cudaFree(pointer); } extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){ diff --git a/gpu/MRT.cu b/gpu/MRT.cu index 1cc556ef..e79a41d1 100644 --- a/gpu/MRT.cu +++ b/gpu/MRT.cu @@ -4,7 +4,7 @@ //************************************************************************* #include -#define NBLOCKS 32 +#define NBLOCKS 560 #define NTHREADS 128 __global__ void INITIALIZE(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz) @@ -106,7 +106,9 @@ __global__ void Compute_VELOCITY(char *ID, double *disteven, double *distodd, do } //************************************************************************* -__global__ void D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz, +__global__ void +__launch_bounds__(512,2) +D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz, double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz) {