Initial hip addition
This commit is contained in:
@@ -104,7 +104,7 @@ IF ( USE_DOXYGEN )
|
||||
ADD_DEPENDENCIES( doc latex_docs doxygen )
|
||||
ELSE()
|
||||
SET( USE_DOXYGEN 0 )
|
||||
ENDIF()
|
||||
ENDIF()lbpm-wia
|
||||
ENDIF()
|
||||
|
||||
|
||||
@@ -123,11 +123,26 @@ IF ( USE_CUDA )
|
||||
ADD_DEFINITIONS( -DUSE_CUDA )
|
||||
ENABLE_LANGUAGE( CUDA )
|
||||
ELSEIF ( USE_HIP )
|
||||
FIND_PACKAGE( HIP )
|
||||
MESSAGE( FATAL_ERROR "STOP" )
|
||||
IF ( NOT DEFINED HIP_PATH )
|
||||
IF ( NOT DEFINED ENV{HIP_PATH} )
|
||||
SET( HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed" )
|
||||
ELSE()
|
||||
SET( HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed" )
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
SET( CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH} )
|
||||
FIND_PACKAGE( HIP REQUIRED )
|
||||
FIND_PACKAGE( CUDA QUIET )
|
||||
MESSAGE( "HIP Found")
|
||||
MESSAGE( " HIP version: ${HIP_VERSION_STRING}")
|
||||
MESSAGE( " HIP platform: ${HIP_PLATFORM}")
|
||||
MESSAGE( " HIP Include Path: ${HIP_INCLUDE_DIRS}")
|
||||
MESSAGE( " HIP Libraries: ${HIP_LIBRARIES}")
|
||||
ADD_DEFINITIONS( -DUSE_HIP )
|
||||
ENDIF()
|
||||
|
||||
|
||||
|
||||
# Configure external packages
|
||||
IF ( NOT ONLY_BUILD_DOCS )
|
||||
CONFIGURE_MPI() # MPI must be before other libraries
|
||||
@@ -161,7 +176,10 @@ IF ( NOT ONLY_BUILD_DOCS )
|
||||
ADD_PACKAGE_SUBDIRECTORY( StackTrace )
|
||||
ADD_PACKAGE_SUBDIRECTORY( models )
|
||||
IF ( USE_CUDA )
|
||||
ADD_PACKAGE_SUBDIRECTORY( gpu )
|
||||
ADD_PACKAGE_SUBDIRECTORY( cuda )
|
||||
ELSEIF ( USE_HIP )
|
||||
ADD_SUBDIRECTORY( gpu )
|
||||
SET( LBPM_LIBRARIES lbpm-hip lbpm-wia )
|
||||
ELSE()
|
||||
ADD_PACKAGE_SUBDIRECTORY( cpu )
|
||||
ENDIF()
|
||||
|
||||
311
hip/BGK.hip
Normal file
311
hip/BGK.hip
Normal file
@@ -0,0 +1,311 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#define NBLOCKS 1024
|
||||
#define NTHREADS 256
|
||||
|
||||
__global__ void dvc_ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){
|
||||
int n;
|
||||
// conserved momemnts
|
||||
double rho,ux,uy,uz,uu;
|
||||
// non-conserved moments
|
||||
double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18;
|
||||
|
||||
int S = Np/NBLOCKS/NTHREADS + 1;
|
||||
for (int s=0; s<S; s++){
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x + start;
|
||||
|
||||
if ( n<finish ){
|
||||
// q=0
|
||||
f0 = dist[n];
|
||||
f1 = dist[2*Np+n];
|
||||
f2 = dist[1*Np+n];
|
||||
f3 = dist[4*Np+n];
|
||||
f4 = dist[3*Np+n];
|
||||
f5 = dist[6*Np+n];
|
||||
f6 = dist[5*Np+n];
|
||||
f7 = dist[8*Np+n];
|
||||
f8 = dist[7*Np+n];
|
||||
f9 = dist[10*Np+n];
|
||||
f10 = dist[9*Np+n];
|
||||
f11 = dist[12*Np+n];
|
||||
f12 = dist[11*Np+n];
|
||||
f13 = dist[14*Np+n];
|
||||
f14 = dist[13*Np+n];
|
||||
f15 = dist[16*Np+n];
|
||||
f16 = dist[15*Np+n];
|
||||
f17 = dist[18*Np+n];
|
||||
f18 = dist[17*Np+n];
|
||||
|
||||
rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
|
||||
ux = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
|
||||
uy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
|
||||
uz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
|
||||
uu = 1.5*(ux*ux+uy*uy+uz*uz);
|
||||
|
||||
// q=0
|
||||
dist[n] = f0*(1.0-rlx)+rlx*0.3333333333333333*(1.0-uu);
|
||||
|
||||
// q = 1
|
||||
dist[1*Np+n] = f1*(1.0-rlx) + rlx*0.05555555555555555*(rho + 3.0*ux + 4.5*ux*ux - uu) + 0.16666666*Fx;
|
||||
|
||||
// q=2
|
||||
dist[2*Np+n] = f2*(1.0-rlx) + rlx*0.05555555555555555*(rho - 3.0*ux + 4.5*ux*ux - uu)- 0.16666666*Fx;
|
||||
|
||||
// q = 3
|
||||
dist[3*Np+n] = f3*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho + 3.0*uy + 4.5*uy*uy - uu) + 0.16666666*Fy;
|
||||
|
||||
// q = 4
|
||||
dist[4*Np+n] = f4*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho - 3.0*uy + 4.5*uy*uy - uu)- 0.16666666*Fy;
|
||||
|
||||
// q = 5
|
||||
dist[5*Np+n] = f5*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho + 3.0*uz + 4.5*uz*uz - uu) + 0.16666666*Fz;
|
||||
|
||||
// q = 6
|
||||
dist[6*Np+n] = f6*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho - 3.0*uz + 4.5*uz*uz - uu) - 0.16666666*Fz;
|
||||
|
||||
// q = 7
|
||||
dist[7*Np+n] = f7*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) + 0.08333333333*(Fx+Fy);
|
||||
|
||||
// q = 8
|
||||
dist[8*Np+n] = f8*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) - 0.08333333333*(Fx+Fy);
|
||||
|
||||
// q = 9
|
||||
dist[9*Np+n] = f9*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) + 0.08333333333*(Fx-Fy);
|
||||
|
||||
// q = 10
|
||||
dist[10*Np+n] = f10*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) - 0.08333333333*(Fx-Fy);
|
||||
|
||||
// q = 11
|
||||
dist[11*Np+n] = f11*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) + 0.08333333333*(Fx+Fz);
|
||||
|
||||
// q = 12
|
||||
dist[12*Np+n] = f12*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) - 0.08333333333*(Fx+Fz);
|
||||
|
||||
// q = 13
|
||||
dist[13*Np+n] = f13*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu) + 0.08333333333*(Fx-Fz);
|
||||
|
||||
// q= 14
|
||||
dist[14*Np+n] = f14*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu)- 0.08333333333*(Fx-Fz);
|
||||
|
||||
// q = 15
|
||||
dist[15*Np+n] = f15*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) + 0.08333333333*(Fy+Fz);
|
||||
|
||||
// q = 16
|
||||
dist[16*Np+n] = f16*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) - 0.08333333333*(Fy+Fz);
|
||||
|
||||
// q = 17
|
||||
dist[17*Np+n] = f17*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) + 0.08333333333*(Fy-Fz);
|
||||
|
||||
// q = 18
|
||||
dist[18*Np+n] = f18*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) - 0.08333333333*(Fy-Fz);
|
||||
|
||||
//........................................................................
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void dvc_ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){
|
||||
int n;
|
||||
// conserved momemnts
|
||||
double rho,ux,uy,uz,uu;
|
||||
// non-conserved moments
|
||||
double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18;
|
||||
int nr1,nr2,nr3,nr4,nr5,nr6,nr7,nr8,nr9,nr10,nr11,nr12,nr13,nr14,nr15,nr16,nr17,nr18;
|
||||
|
||||
int S = Np/NBLOCKS/NTHREADS + 1;
|
||||
for (int s=0; s<S; s++){
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x + start;
|
||||
|
||||
if ( n<finish ){
|
||||
// q=0
|
||||
f0 = dist[n];
|
||||
// q=1
|
||||
nr1 = neighborList[n]; // neighbor 2 ( > 10Np => odd part of dist)
|
||||
f1 = dist[nr1]; // reading the f1 data into register fq
|
||||
|
||||
nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist)
|
||||
f2 = dist[nr2]; // reading the f2 data into register fq
|
||||
|
||||
// q=3
|
||||
nr3 = neighborList[n+2*Np]; // neighbor 4
|
||||
f3 = dist[nr3];
|
||||
|
||||
// q = 4
|
||||
nr4 = neighborList[n+3*Np]; // neighbor 3
|
||||
f4 = dist[nr4];
|
||||
|
||||
// q=5
|
||||
nr5 = neighborList[n+4*Np];
|
||||
f5 = dist[nr5];
|
||||
|
||||
// q = 6
|
||||
nr6 = neighborList[n+5*Np];
|
||||
f6 = dist[nr6];
|
||||
|
||||
// q=7
|
||||
nr7 = neighborList[n+6*Np];
|
||||
f7 = dist[nr7];
|
||||
|
||||
// q = 8
|
||||
nr8 = neighborList[n+7*Np];
|
||||
f8 = dist[nr8];
|
||||
|
||||
// q=9
|
||||
nr9 = neighborList[n+8*Np];
|
||||
f9 = dist[nr9];
|
||||
|
||||
// q = 10
|
||||
nr10 = neighborList[n+9*Np];
|
||||
f10 = dist[nr10];
|
||||
|
||||
// q=11
|
||||
nr11 = neighborList[n+10*Np];
|
||||
f11 = dist[nr11];
|
||||
|
||||
// q=12
|
||||
nr12 = neighborList[n+11*Np];
|
||||
f12 = dist[nr12];
|
||||
|
||||
// q=13
|
||||
nr13 = neighborList[n+12*Np];
|
||||
f13 = dist[nr13];
|
||||
|
||||
// q=14
|
||||
nr14 = neighborList[n+13*Np];
|
||||
f14 = dist[nr14];
|
||||
|
||||
// q=15
|
||||
nr15 = neighborList[n+14*Np];
|
||||
f15 = dist[nr15];
|
||||
|
||||
// q=16
|
||||
nr16 = neighborList[n+15*Np];
|
||||
f16 = dist[nr16];
|
||||
|
||||
// q=17
|
||||
//fq = dist[18*Np+n];
|
||||
nr17 = neighborList[n+16*Np];
|
||||
f17 = dist[nr17];
|
||||
|
||||
// q=18
|
||||
nr18 = neighborList[n+17*Np];
|
||||
f18 = dist[nr18];
|
||||
|
||||
rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
|
||||
ux = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
|
||||
uy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
|
||||
uz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
|
||||
uu = 1.5*(ux*ux+uy*uy+uz*uz);
|
||||
|
||||
// q=0
|
||||
dist[n] = f0*(1.0-rlx)+rlx*0.3333333333333333*(1.0-uu);
|
||||
|
||||
// q = 1
|
||||
dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*(rho + 3.0*ux + 4.5*ux*ux - uu) + 0.16666666*Fx;
|
||||
|
||||
// q=2
|
||||
dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*(rho - 3.0*ux + 4.5*ux*ux - uu)- 0.16666666*Fx;
|
||||
|
||||
// q = 3
|
||||
dist[nr4] = f3*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho + 3.0*uy + 4.5*uy*uy - uu) + 0.16666666*Fy;
|
||||
|
||||
// q = 4
|
||||
dist[nr3] = f4*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho - 3.0*uy + 4.5*uy*uy - uu)- 0.16666666*Fy;
|
||||
|
||||
// q = 5
|
||||
dist[nr6] = f5*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho + 3.0*uz + 4.5*uz*uz - uu) + 0.16666666*Fz;
|
||||
|
||||
// q = 6
|
||||
dist[nr5] = f6*(1.0-rlx) +
|
||||
rlx*0.05555555555555555*(rho - 3.0*uz + 4.5*uz*uz - uu) - 0.16666666*Fz;
|
||||
|
||||
// q = 7
|
||||
dist[nr8] = f7*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) + 0.08333333333*(Fx+Fy);
|
||||
|
||||
// q = 8
|
||||
dist[nr7] = f8*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) - 0.08333333333*(Fx+Fy);
|
||||
|
||||
// q = 9
|
||||
dist[nr10] = f9*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) + 0.08333333333*(Fx-Fy);
|
||||
|
||||
// q = 10
|
||||
dist[nr9] = f10*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) - 0.08333333333*(Fx-Fy);
|
||||
|
||||
// q = 11
|
||||
dist[nr12] = f11*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) + 0.08333333333*(Fx+Fz);
|
||||
|
||||
// q = 12
|
||||
dist[nr11] = f12*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) - 0.08333333333*(Fx+Fz);
|
||||
|
||||
// q = 13
|
||||
dist[nr14] = f13*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu) + 0.08333333333*(Fx-Fz);
|
||||
|
||||
// q= 14
|
||||
dist[nr13] = f14*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu)- 0.08333333333*(Fx-Fz);
|
||||
|
||||
// q = 15
|
||||
dist[nr16] = f15*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) + 0.08333333333*(Fy+Fz);
|
||||
|
||||
// q = 16
|
||||
dist[nr15] = f16*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) - 0.08333333333*(Fy+Fz);
|
||||
|
||||
// q = 17
|
||||
dist[nr18] = f17*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho + 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) + 0.08333333333*(Fy-Fz);
|
||||
|
||||
// q = 18
|
||||
dist[nr17] = f18*(1.0-rlx) +
|
||||
rlx*0.02777777777777778*(rho - 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) - 0.08333333333*(Fy-Fz);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){
|
||||
|
||||
dvc_ScaLBL_D3Q19_AAeven_BGK<<<NBLOCKS,NTHREADS >>>(dist,start,finish,Np,rlx,Fx,Fy,Fz);
|
||||
|
||||
hipError_t err = hipGetLastError();
|
||||
if (hipSuccess != err){
|
||||
printf("CUDA error in ScaLBL_D3Q19_AAeven_BGK: %s \n",hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){
|
||||
dvc_ScaLBL_D3Q19_AAodd_BGK<<<NBLOCKS,NTHREADS >>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz);
|
||||
|
||||
hipError_t err = hipGetLastError();
|
||||
if (hipSuccess != err){
|
||||
printf("CUDA error in ScaLBL_D3Q19_AAeven_BGK: %s \n",hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
9
hip/CMakeLists.txt
Normal file
9
hip/CMakeLists.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
SET( HIP_SEPERABLE_COMPILATION ON )
|
||||
SET_SOURCE_FILES_PROPERTIES( BGK.hip Color.hip CudaExtras.hip D3Q19.hip D3Q7.hip dfh.hip Extras.hip MRT.hip PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1 )
|
||||
HIP_ADD_LIBRARY( lbpm-hip BGK.hip Color.hip CudaExtras.hip D3Q19.hip D3Q7.hip dfh.hip Extras.hip MRT.hip SHARED HIPCC_OPTIONS ${HIP_HIPCC_OPTIONS} HCC_OPTIONS ${HIP_HCC_OPTIONS} NVCC_OPTIONS ${HIP_NVCC_OPTIONS} ${HIP_NVCC_FLAGS} )
|
||||
TARGET_LINK_LIBRARIES( lbpm-hip /opt/rocm-3.3.0/lib/libhip_hcc.so )
|
||||
TARGET_LINK_LIBRARIES( lbpm-wia lbpm-hip )
|
||||
ADD_DEPENDENCIES( lbpm-hip copy-include )
|
||||
|
||||
|
||||
|
||||
4131
hip/Color.hip
Normal file
4131
hip/Color.hip
Normal file
File diff suppressed because it is too large
Load Diff
34
hip/CudaExtras.hip
Normal file
34
hip/CudaExtras.hip
Normal file
@@ -0,0 +1,34 @@
|
||||
// Basic hip functions callable from C/C++ code
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size){
|
||||
hipMalloc(address,size);
|
||||
hipMemset(*address,0,size);
|
||||
}
|
||||
|
||||
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size){
|
||||
hipMemcpy(dest,source,size,hipMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
|
||||
extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size){
|
||||
hipMemcpy(dest,source,size,hipMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
extern "C" void dvc_Barrier(){
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
/*
|
||||
#if __CUDA_ARCH__ < 600
|
||||
__device__ double atomicAdd(double* address, double val) {
|
||||
unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed;
|
||||
do {
|
||||
assumed = old;
|
||||
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
|
||||
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
|
||||
}
|
||||
while (assumed != old); return __longlong_as_double(old);
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
||||
2645
hip/D3Q19.hip
Normal file
2645
hip/D3Q19.hip
Normal file
File diff suppressed because it is too large
Load Diff
246
hip/D3Q7.hip
Normal file
246
hip/D3Q7.hip
Normal file
@@ -0,0 +1,246 @@
|
||||
// GPU Functions for D3Q7 Lattice Boltzmann Methods
|
||||
|
||||
#define NBLOCKS 560
|
||||
#define NTHREADS 128
|
||||
|
||||
__global__ void dvc_ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
|
||||
//....................................................................................
|
||||
// Pack distribution q into the send buffer for the listed lattice sites
|
||||
// dist may be even or odd distributions stored by stream layout
|
||||
//....................................................................................
|
||||
int idx,n;
|
||||
idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx<count){
|
||||
n = list[idx];
|
||||
sendbuf[idx] = Data[n];
|
||||
}
|
||||
}
|
||||
__global__ void dvc_ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, double *Data, int N){
|
||||
//....................................................................................
|
||||
// Pack distribution q into the send buffer for the listed lattice sites
|
||||
// dist may be even or odd distributions stored by stream layout
|
||||
//....................................................................................
|
||||
int idx,n;
|
||||
idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx<count){
|
||||
n = list[idx];
|
||||
Data[n] = recvbuf[idx];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void dvc_ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N){
|
||||
//....................................................................................
|
||||
// Pack distribution into the send buffer for the listed lattice sites
|
||||
//....................................................................................
|
||||
int idx,n,component;
|
||||
idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx<count){
|
||||
for (component=0; component<number; component++){
|
||||
n = list[idx];
|
||||
sendbuf[idx*number+component] = Data[number*n+component];
|
||||
Data[number*n+component] = 0.0; // Set the data value to zero once it's in the buffer!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void dvc_ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int number, double *Data, int N){
|
||||
//....................................................................................
|
||||
// Unack distribution from the recv buffer
|
||||
// Sum to the existing density value
|
||||
//....................................................................................
|
||||
int idx,n,component;
|
||||
idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx<count){
|
||||
for (component=0; component<number; component++){
|
||||
n = list[idx];
|
||||
Data[number*n+component] += recvbuf[idx*number+component];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void dvc_ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
|
||||
double *recvbuf, double *dist, int N){
|
||||
//....................................................................................
|
||||
// Unpack distribution from the recv buffer
|
||||
// Distribution q matche Cqx, Cqy, Cqz
|
||||
// swap rule means that the distributions in recvbuf are OPPOSITE of q
|
||||
// dist may be even or odd distributions stored by stream layout
|
||||
//....................................................................................
|
||||
int n,idx;
|
||||
idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx<count){
|
||||
// Get the value from the list -- note that n is the index is from the send (non-local) process
|
||||
n = list[idx];
|
||||
// unpack the distribution to the proper location
|
||||
if (!(n<0)) { dist[q*N+n] = recvbuf[start+idx];
|
||||
//printf("%f \n",,dist[q*N+n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void dvc_ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz)
|
||||
{
|
||||
int n,N;
|
||||
N = Nx*Ny*Nz;
|
||||
double value;
|
||||
char id;
|
||||
int S = N/NBLOCKS/NTHREADS + 1;
|
||||
for (int s=0; s<S; s++){
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x;
|
||||
if (n<N){
|
||||
id = ID[n];
|
||||
if (id > 0){
|
||||
value = Den[n];
|
||||
f_even[n] = 0.3333333333333333*value;
|
||||
f_odd[n] = 0.1111111111111111*value; //double(100*n)+1.f;
|
||||
f_even[N+n] = 0.1111111111111111*value; //double(100*n)+2.f;
|
||||
f_odd[N+n] = 0.1111111111111111*value; //double(100*n)+3.f;
|
||||
f_even[2*N+n] = 0.1111111111111111*value; //double(100*n)+4.f;
|
||||
f_odd[2*N+n] = 0.1111111111111111*value; //double(100*n)+5.f;
|
||||
f_even[3*N+n] = 0.1111111111111111*value; //double(100*n)+6.f;
|
||||
}
|
||||
else{
|
||||
for(int q=0; q<3; q++){
|
||||
f_even[q*N+n] = -1.0;
|
||||
f_odd[q*N+n] = -1.0;
|
||||
}
|
||||
f_even[3*N+n] = -1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//*************************************************************************
|
||||
__global__ void dvc_ScaLBL_D3Q7_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz)
|
||||
{
|
||||
int i,j,k,n,nn,N;
|
||||
// distributions
|
||||
double f1,f2,f3,f4,f5,f6;
|
||||
char id;
|
||||
N = Nx*Ny*Nz;
|
||||
|
||||
int S = N/NBLOCKS/NTHREADS + 1;
|
||||
for (int s=0; s<S; s++){
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x;
|
||||
|
||||
if (n<N ){
|
||||
id = ID[n];
|
||||
if (id > 0){
|
||||
//.......Back out the 3-D indices for node n..............
|
||||
k = n/(Nx*Ny);
|
||||
j = (n-Nx*Ny*k)/Nx;
|
||||
i = n-Nx*Ny*k-Nx*j;
|
||||
//........................................................................
|
||||
// Retrieve even distributions from the local node (swap convention)
|
||||
// f0 = disteven[n]; // Does not particupate in streaming
|
||||
f1 = distodd[n];
|
||||
f3 = distodd[N+n];
|
||||
f5 = distodd[2*N+n];
|
||||
//........................................................................
|
||||
|
||||
//........................................................................
|
||||
// Retrieve odd distributions from neighboring nodes (swap convention)
|
||||
//........................................................................
|
||||
nn = n+1; // neighbor index (pull convention)
|
||||
if (!(i+1<Nx)) nn -= Nx; // periodic BC along the x-boundary
|
||||
//if (i+1<Nx){
|
||||
f2 = disteven[N+nn]; // pull neighbor for distribution 2
|
||||
if (!(f2 < 0.0)){
|
||||
distodd[n] = f2;
|
||||
disteven[N+nn] = f1;
|
||||
}
|
||||
//}
|
||||
//........................................................................
|
||||
nn = n+Nx; // neighbor index (pull convention)
|
||||
if (!(j+1<Ny)) nn -= Nx*Ny; // Perioidic BC along the y-boundary
|
||||
//if (j+1<Ny){
|
||||
f4 = disteven[2*N+nn]; // pull neighbor for distribution 4
|
||||
if (!(f4 < 0.0)){
|
||||
distodd[N+n] = f4;
|
||||
disteven[2*N+nn] = f3;
|
||||
}
|
||||
//........................................................................
|
||||
nn = n+Nx*Ny; // neighbor index (pull convention)
|
||||
if (!(k+1<Nz)) nn -= Nx*Ny*Nz; // Perioidic BC along the z-boundary
|
||||
//if (k+1<Nz){
|
||||
f6 = disteven[3*N+nn]; // pull neighbor for distribution 6
|
||||
if (!(f6 < 0.0)){
|
||||
distodd[2*N+n] = f6;
|
||||
disteven[3*N+nn] = f5;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//*************************************************************************
|
||||
__global__ void dvc_ScaLBL_D3Q7_Density(char *ID, double *disteven, double *distodd, double *Den,
|
||||
int Nx, int Ny, int Nz)
|
||||
{
|
||||
char id;
|
||||
int n;
|
||||
double f0,f1,f2,f3,f4,f5,f6;
|
||||
int N = Nx*Ny*Nz;
|
||||
|
||||
int S = N/NBLOCKS/NTHREADS + 1;
|
||||
for (int s=0; s<S; s++){
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x;
|
||||
if (n<N){
|
||||
id = ID[n];
|
||||
if (id > 0 ){
|
||||
// Read the distributions
|
||||
f0 = disteven[n];
|
||||
f2 = disteven[N+n];
|
||||
f4 = disteven[2*N+n];
|
||||
f6 = disteven[3*N+n];
|
||||
f1 = distodd[n];
|
||||
f3 = distodd[N+n];
|
||||
f5 = distodd[2*N+n];
|
||||
// Compute the density
|
||||
Den[n] = f0+f1+f2+f3+f4+f5+f6;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_D3Q7_Unpack <<<GRID,512 >>>(q, list, start, count, recvbuf, dist, N);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_Scalar_Pack <<<GRID,512 >>>(list, count, sendbuf, Data, N);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, double *Data, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_Scalar_Unpack <<<GRID,512 >>>(list, count, recvbuf, Data, N);
|
||||
}
|
||||
extern "C" void ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_PackDenD3Q7 <<<GRID,512 >>>(list, count, sendbuf, number, Data, N);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int number, double *Data, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_UnpackDenD3Q7 <<<GRID,512 >>>(list, count, recvbuf, number, Data, N);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz){
|
||||
dvc_ScaLBL_D3Q7_Init <<<NBLOCKS,NTHREADS >>>(ID, f_even, f_odd, Den, Nx, Ny, Nz);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz){
|
||||
dvc_ScaLBL_D3Q7_Swap <<<NBLOCKS,NTHREADS >>>(ID, disteven, distodd, Nx, Ny, Nz);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Density(char *ID, double *disteven, double *distodd, double *Den,
|
||||
int Nx, int Ny, int Nz){
|
||||
dvc_ScaLBL_D3Q7_Density <<<NBLOCKS,NTHREADS >>>(ID, disteven, distodd, Den, Nx, Ny, Nz);
|
||||
}
|
||||
|
||||
62
hip/Extras.hip
Normal file
62
hip/Extras.hip
Normal file
@@ -0,0 +1,62 @@
|
||||
// Basic hip functions callable from C/C++ code
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <stdio.h>
|
||||
|
||||
extern "C" int ScaLBL_SetDevice(int rank){
|
||||
int n_devices;
|
||||
//int local_rank = atoi(getenv("MV2_COMM_WORLD_LOCAL_RANK"));
|
||||
hipGetDeviceCount(&n_devices);
|
||||
//int device = local_rank % n_devices;
|
||||
int device = rank % n_devices;
|
||||
hipSetDevice(device);
|
||||
if (rank < n_devices) printf("MPI rank=%i will use GPU ID %i / %i \n",rank,device,n_devices);
|
||||
return device;
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
|
||||
hipMalloc(address,size);
|
||||
hipError_t err = hipGetLastError();
|
||||
if (hipSuccess != err){
|
||||
printf("Error in hipMalloc: %s \n",hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
|
||||
hipFree(pointer);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
|
||||
hipMemcpy(dest,source,size,hipMemcpyHostToDevice);
|
||||
hipError_t err = hipGetLastError();
|
||||
if (hipSuccess != err){
|
||||
printf("Error in hipMemcpy (host->device): %s \n",hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_AllocateZeroCopy(void** address, size_t size){
|
||||
//hipMallocHost(address,size);
|
||||
hipMalloc(address,size);
|
||||
hipError_t err = hipGetLastError();
|
||||
if (hipSuccess != err){
|
||||
printf("Error in hipMallocHost: %s \n",hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_CopyToZeroCopy(void* dest, const void* source, size_t size){
|
||||
hipMemcpy(dest,source,size,hipMemcpyHostToDevice);
|
||||
hipError_t err = hipGetLastError();
|
||||
//memcpy(dest, source, size);
|
||||
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_CopyToHost(void* dest, const void* source, size_t size){
|
||||
hipMemcpy(dest,source,size,hipMemcpyDeviceToHost);
|
||||
hipError_t err = hipGetLastError();
|
||||
if (hipSuccess != err){
|
||||
printf("Error in hipMemcpy (device->host): %s \n",hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_DeviceBarrier(){
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
310
hip/MRT.hip
Normal file
310
hip/MRT.hip
Normal file
@@ -0,0 +1,310 @@
|
||||
//*************************************************************************
|
||||
// CUDA kernels for single-phase ScaLBL_D3Q19_MRT code
|
||||
// James McClure
|
||||
//*************************************************************************
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#define NBLOCKS 560
|
||||
#define NTHREADS 128
|
||||
|
||||
__global__ void INITIALIZE(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz)
|
||||
{
|
||||
int n,N;
|
||||
N = Nx*Ny*Nz;
|
||||
int S = N/NBLOCKS/NTHREADS + 1;
|
||||
for (int s=0; s<S; s++){
|
||||
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x;
|
||||
|
||||
if (n<N){
|
||||
if (ID[n] > 0){
|
||||
f_even[n] = 0.3333333333333333;
|
||||
f_odd[n] = 0.055555555555555555; //double(100*n)+1.f;
|
||||
f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f;
|
||||
f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f;
|
||||
f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f;
|
||||
f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f;
|
||||
f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f;
|
||||
f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f;
|
||||
f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f;
|
||||
f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f;
|
||||
f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f;
|
||||
f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f;
|
||||
f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f;
|
||||
f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f;
|
||||
f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f;
|
||||
f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f;
|
||||
f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f;
|
||||
f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f;
|
||||
f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f;
|
||||
}
|
||||
else{
|
||||
for(int q=0; q<9; q++){
|
||||
f_even[q*N+n] = -1.0;
|
||||
f_odd[q*N+n] = -1.0;
|
||||
}
|
||||
f_even[9*N+n] = -1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Compute_VELOCITY(char *ID, double *disteven, double *distodd, double *vel, int Nx, int Ny, int Nz)
|
||||
{
|
||||
int n,N;
|
||||
// distributions
|
||||
double f1,f2,f3,f4,f5,f6,f7,f8,f9;
|
||||
double f10,f11,f12,f13,f14,f15,f16,f17,f18;
|
||||
double vx,vy,vz;
|
||||
|
||||
N = Nx*Ny*Nz;
|
||||
int S = N/NBLOCKS/NTHREADS + 1;
|
||||
// S - number of threadblocks per grid block
|
||||
for (int s=0; s<S; s++){
|
||||
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x;
|
||||
|
||||
if (n<N){
|
||||
if (ID[n] > 0){
|
||||
//........................................................................
|
||||
// Registers to store the distributions
|
||||
//........................................................................
|
||||
f2 = disteven[N+n];
|
||||
f4 = disteven[2*N+n];
|
||||
f6 = disteven[3*N+n];
|
||||
f8 = disteven[4*N+n];
|
||||
f10 = disteven[5*N+n];
|
||||
f12 = disteven[6*N+n];
|
||||
f14 = disteven[7*N+n];
|
||||
f16 = disteven[8*N+n];
|
||||
f18 = disteven[9*N+n];
|
||||
//........................................................................
|
||||
f1 = distodd[n];
|
||||
f3 = distodd[1*N+n];
|
||||
f5 = distodd[2*N+n];
|
||||
f7 = distodd[3*N+n];
|
||||
f9 = distodd[4*N+n];
|
||||
f11 = distodd[5*N+n];
|
||||
f13 = distodd[6*N+n];
|
||||
f15 = distodd[7*N+n];
|
||||
f17 = distodd[8*N+n];
|
||||
//.................Compute the velocity...................................
|
||||
vx = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
|
||||
vy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
|
||||
vz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
|
||||
//..................Write the velocity.....................................
|
||||
vel[n] = vx;
|
||||
vel[N+n] = vy;
|
||||
vel[2*N+n] = vz;
|
||||
//........................................................................
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//*************************************************************************
|
||||
__global__ void
|
||||
__launch_bounds__(512,2)
|
||||
D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
|
||||
double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz)
|
||||
{
|
||||
|
||||
int n,N;
|
||||
// distributions
|
||||
double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9;
|
||||
double f10,f11,f12,f13,f14,f15,f16,f17,f18;
|
||||
|
||||
// conserved momemnts
|
||||
double rho,jx,jy,jz;
|
||||
// non-conserved moments
|
||||
double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18;
|
||||
|
||||
N = Nx*Ny*Nz;
|
||||
|
||||
char id;
|
||||
int S = N/NBLOCKS/NTHREADS + 1;
|
||||
// S - number of threadblocks per grid block
|
||||
for (int s=0; s<S; s++){
|
||||
// for (int n=0; n<N; n++){
|
||||
//........Get 1-D index for this thread....................
|
||||
n = S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x;
|
||||
|
||||
id = ID[n];
|
||||
|
||||
if (n<N){
|
||||
if (id > 0){
|
||||
//........................................................................
|
||||
// Registers to store the distributions - read based on swap convention
|
||||
//........................................................................
|
||||
f2 = distodd[n];
|
||||
f4 = distodd[N+n];
|
||||
f6 = distodd[2*N+n];
|
||||
f8 = distodd[3*N+n];
|
||||
f10 = distodd[4*N+n];
|
||||
f12 = distodd[5*N+n];
|
||||
f14 = distodd[6*N+n];
|
||||
f16 = distodd[7*N+n];
|
||||
f18 = distodd[8*N+n];
|
||||
//........................................................................
|
||||
f0 = disteven[n];
|
||||
f1 = disteven[N+n];
|
||||
f3 = disteven[2*N+n];
|
||||
f5 = disteven[3*N+n];
|
||||
f7 = disteven[4*N+n];
|
||||
f9 = disteven[5*N+n];
|
||||
f11 = disteven[6*N+n];
|
||||
f13 = disteven[7*N+n];
|
||||
f15 = disteven[8*N+n];
|
||||
f17 = disteven[9*N+n];
|
||||
//........................................................................
|
||||
//....................compute the moments...............................................
|
||||
rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
|
||||
m1 = -30*f0-11*(f2+f1+f4+f3+f6+f5)+8*(f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18 +f17);
|
||||
m2 = 12*f0-4*(f2+f1 +f4+f3+f6 +f5)+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
|
||||
jx = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
|
||||
m4 = 4*(-f1+f2)+f7-f8+f9-f10+f11-f12+f13-f14;
|
||||
jy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
|
||||
m6 = -4*(f3-f4)+f7-f8-f9+f10+f15-f16+f17-f18;
|
||||
jz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
|
||||
m8 = -4*(f5-f6)+f11-f12-f13+f14+f15-f16-f17+f18;
|
||||
m9 = 2*(f1+f2)-f3-f4-f5-f6+f7+f8+f9+f10+f11+f12+f13+f14-2*(f15+f16+f17+f18);
|
||||
m10 = -4*(f1+f2)+2*(f4+f3+f6+f5)+f8+f7+f10+f9+f12+f11+f14+f13-2*(f16+f15+f18+f17);
|
||||
m11 = f4+f3-f6-f5+f8+f7+f10+f9-f12-f11-f14-f13;
|
||||
m12 = -2*(f4+f3-f6-f5)+f8+f7+f10+f9-f12-f11-f14-f13;
|
||||
m13 = f8+f7-f10-f9;
|
||||
m14 = f16+f15-f18-f17;
|
||||
m15 = f12+f11-f14-f13;
|
||||
m16 = f7-f8+f9-f10-f11+f12-f13+f14;
|
||||
m17 = -f7+f8+f9-f10+f15-f16+f17-f18;
|
||||
m18 = f11-f12-f13+f14-f15+f16+f17-f18;
|
||||
//..............incorporate external force................................................
|
||||
//jx += 0.5*Fx;
|
||||
//jy += 0.5*Fy;
|
||||
//jz += 0.5*Fz;
|
||||
//..............carry out relaxation process...............................................
|
||||
m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho - 11*rho) - m1);
|
||||
m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho) - m2);
|
||||
m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4);
|
||||
m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6);
|
||||
m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8);
|
||||
m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho) - m9);
|
||||
m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/rho) - m10);
|
||||
m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho) - m11);
|
||||
m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/rho) - m12);
|
||||
m13 = m13 + rlx_setA*((jx*jy/rho) - m13);
|
||||
m14 = m14 + rlx_setA*((jy*jz/rho) - m14);
|
||||
m15 = m15 + rlx_setA*((jx*jz/rho) - m15);
|
||||
m16 = m16 + rlx_setB*( - m16);
|
||||
m17 = m17 + rlx_setB*( - m17);
|
||||
m18 = m18 + rlx_setB*( - m18);
|
||||
//.................inverse transformation......................................................
|
||||
f0 = 0.05263157894736842*rho-0.012531328320802*m1+0.04761904761904762*m2;
|
||||
f1 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
|
||||
+0.1*(jx-m4)+0.05555555555555555*(m9-m10);
|
||||
f2 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
|
||||
+0.1*(m4-jx)+0.05555555555555555*(m9-m10);
|
||||
f3 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
|
||||
+0.1*(jy-m6)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m11-m12);
|
||||
f4 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
|
||||
+0.1*(m6-jy)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m11-m12);
|
||||
f5 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
|
||||
+0.1*(jz-m8)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m12-m11);
|
||||
f6 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
|
||||
+0.1*(m8-jz)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m12-m11);
|
||||
f7 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2+0.1*(jx+jy)+0.025*(m4+m6)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
|
||||
+0.04166666666666666*m12+0.25*m13+0.125*(m16-m17);
|
||||
f8 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2-0.1*(jx+jy)-0.025*(m4+m6)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
|
||||
+0.04166666666666666*m12+0.25*m13+0.125*(m17-m16);
|
||||
f9 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2+0.1*(jx-jy)+0.025*(m4-m6)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
|
||||
+0.04166666666666666*m12-0.25*m13+0.125*(m16+m17);
|
||||
f10 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2+0.1*(jy-jx)+0.025*(m6-m4)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
|
||||
+0.04166666666666666*m12-0.25*m13-0.125*(m16+m17);
|
||||
f11 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2+0.1*(jx+jz)+0.025*(m4+m8)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
|
||||
-0.04166666666666666*m12+0.25*m15+0.125*(m18-m16);
|
||||
f12 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2-0.1*(jx+jz)-0.025*(m4+m8)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
|
||||
-0.04166666666666666*m12+0.25*m15+0.125*(m16-m18);
|
||||
f13 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2+0.1*(jx-jz)+0.025*(m4-m8)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
|
||||
-0.04166666666666666*m12-0.25*m15-0.125*(m16+m18);
|
||||
f14 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2+0.1*(jz-jx)+0.025*(m8-m4)
|
||||
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
|
||||
-0.04166666666666666*m12-0.25*m15+0.125*(m16+m18);
|
||||
f15 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2+0.1*(jy+jz)+0.025*(m6+m8)
|
||||
-0.05555555555555555*m9-0.02777777777777778*m10+0.25*m14+0.125*(m17-m18);
|
||||
f16 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2-0.1*(jy+jz)-0.025*(m6+m8)
|
||||
-0.05555555555555555*m9-0.02777777777777778*m10+0.25*m14+0.125*(m18-m17);
|
||||
f17 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2+0.1*(jy-jz)+0.025*(m6-m8)
|
||||
-0.05555555555555555*m9-0.02777777777777778*m10-0.25*m14+0.125*(m17+m18);
|
||||
f18 = 0.05263157894736842*rho+0.003341687552213868*m1
|
||||
+0.003968253968253968*m2+0.1*(jz-jy)+0.025*(m8-m6)
|
||||
-0.05555555555555555*m9-0.02777777777777778*m10-0.25*m14-0.125*(m17+m18);
|
||||
//.......................................................................................................
|
||||
// incorporate external force
|
||||
f1 += 0.16666666*Fx;
|
||||
f2 -= 0.16666666*Fx;
|
||||
f3 += 0.16666666*Fy;
|
||||
f4 -= 0.16666666*Fy;
|
||||
f5 += 0.16666666*Fz;
|
||||
f6 -= 0.16666666*Fz;
|
||||
f7 += 0.08333333333*(Fx+Fy);
|
||||
f8 -= 0.08333333333*(Fx+Fy);
|
||||
f9 += 0.08333333333*(Fx-Fy);
|
||||
f10 -= 0.08333333333*(Fx-Fy);
|
||||
f11 += 0.08333333333*(Fx+Fz);
|
||||
f12 -= 0.08333333333*(Fx+Fz);
|
||||
f13 += 0.08333333333*(Fx-Fz);
|
||||
f14 -= 0.08333333333*(Fx-Fz);
|
||||
f15 += 0.08333333333*(Fy+Fz);
|
||||
f16 -= 0.08333333333*(Fy+Fz);
|
||||
f17 += 0.08333333333*(Fy-Fz);
|
||||
f18 -= 0.08333333333*(Fy-Fz);
|
||||
//.......................................................................................................
|
||||
// Write data based on un-swapped convention
|
||||
disteven[n] = f0;
|
||||
disteven[N+n] = f2;
|
||||
disteven[2*N+n] = f4;
|
||||
disteven[3*N+n] = f6;
|
||||
disteven[4*N+n] = f8;
|
||||
disteven[5*N+n] = f10;
|
||||
disteven[6*N+n] = f12;
|
||||
disteven[7*N+n] = f14;
|
||||
disteven[8*N+n] = f16;
|
||||
disteven[9*N+n] = f18;
|
||||
|
||||
distodd[n] = f1;
|
||||
distodd[N+n] = f3;
|
||||
distodd[2*N+n] = f5;
|
||||
distodd[3*N+n] = f7;
|
||||
distodd[4*N+n] = f9;
|
||||
distodd[5*N+n] = f11;
|
||||
distodd[6*N+n] = f13;
|
||||
distodd[7*N+n] = f15;
|
||||
distodd[8*N+n] = f17;
|
||||
//.......................................................................................................
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q19_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,int Nx, int Ny, int Nz)
|
||||
{
|
||||
D3Q19_MRT <<< NBLOCKS,NTHREADS>>> (ID, f_even, f_odd, Nx, Ny, Nz, rlxA, rlxB, Fx, Fy, Fz);
|
||||
}
|
||||
|
||||
|
||||
1508
hip/dfh.hip
Normal file
1508
hip/dfh.hip
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user