Compare commits

...

2 Commits
master ... SYCL

Author SHA1 Message Date
James McClure
e7b9407586 Merge branch 'master' into SYCL 2023-04-03 15:49:34 -04:00
amitkumarhyd
b09731a5ef Migrated SYCL port with Makefile config 2023-03-24 05:34:19 -07:00
19 changed files with 29485 additions and 2 deletions

View File

@ -119,6 +119,7 @@ ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu cuda hip example
# Check for CUDA
CHECK_ENABLE_FLAG( USE_CUDA 0 )
CHECK_ENABLE_FLAG( USE_HIP 0 )
CHECK_ENABLE_FLAG( USE_SYCL 0 )
NULL_USE( CMAKE_CUDA_FLAGS )
IF ( USE_CUDA )
ADD_DEFINITIONS( -DUSE_CUDA )
@ -126,6 +127,8 @@ IF ( USE_CUDA )
ELSEIF ( USE_HIP )
ENABLE_LANGUAGE( HIP )
ADD_DEFINITIONS( -DUSE_HIP )
ELSEIF ( USE_SYCL )
ADD_DEFINITIONS( -DUSE_SYCL )
ENDIF()
@ -167,6 +170,8 @@ IF ( NOT ONLY_BUILD_DOCS )
ADD_PACKAGE_SUBDIRECTORY( cuda )
ELSEIF ( USE_HIP )
ADD_PACKAGE_SUBDIRECTORY( hip )
ELSEIF ( USE_SYCL )
ADD_PACKAGE_SUBDIRECTORY( sycl )
ELSE()
ADD_PACKAGE_SUBDIRECTORY( cpu )
ENDIF()

1695
Makefile_sycl.dpct Normal file

File diff suppressed because it is too large Load Diff

View File

@ -196,6 +196,9 @@ MACRO( FIND_FILES )
# Find the HIP sources
SET( T_HIPSOURCES "" )
FILE( GLOB T_HIPSOURCES "*.hip" )
# Find the SYCL sources
SET( T_SYCLSOURCES "" )
FILE( GLOB T_SYCLSOURCES "*.dp.cpp" )
# Find the C sources
SET( T_CSOURCES "" )
FILE( GLOB T_CSOURCES "*.c" )
@ -216,10 +219,11 @@ MACRO( FIND_FILES )
SET( CXXSOURCES ${CXXSOURCES} ${T_CXXSOURCES} )
SET( CUDASOURCES ${CUDASOURCES} ${T_CUDASOURCES} )
SET( HIPSOURCES ${HIPSOURCES} ${T_HIPSOURCES} )
SET( SYCLSOURCES ${SYCLSOURCES} ${T_SYCLSOURCES} )
SET( CSOURCES ${CSOURCES} ${T_CSOURCES} )
SET( FSOURCES ${FSOURCES} ${T_FSOURCES} )
SET( M4FSOURCES ${M4FSOURCES} ${T_M4FSOURCES} )
SET( SOURCES ${SOURCES} ${T_CXXSOURCES} ${T_CSOURCES} ${T_FSOURCES} ${T_M4FSOURCES} ${CUDASOURCES} ${HIPSOURCES} )
SET( SOURCES ${SOURCES} ${T_CXXSOURCES} ${T_CSOURCES} ${T_FSOURCES} ${T_M4FSOURCES} ${CUDASOURCES} ${HIPSOURCES} ${SYCLSOURCES})
ENDMACRO()
@ -234,6 +238,9 @@ MACRO( FIND_FILES_PATH IN_PATH )
# Find the HIP sources
SET( T_HIPSOURCES "" )
FILE( GLOB T_HIPSOURCES "${IN_PATH}/*.hip" )
# Find the SYCL sources
SET( T_SYCLSOURCES "" )
FILE( GLOB T_SYCLSOURCES "${IN_PATH}/*.sycl" )
# Find the C sources
SET( T_CSOURCES "" )
FILE( GLOB T_CSOURCES "${IN_PATH}/*.c" )
@ -254,9 +261,10 @@ MACRO( FIND_FILES_PATH IN_PATH )
SET( CXXSOURCES ${CXXSOURCES} ${T_CXXSOURCES} )
SET( CUDASOURCES ${CUDASOURCES} ${T_CUDASOURCES} )
SET( HIPSOURCES ${HIPSOURCES} ${T_HIPSOURCES} )
SET( SYCLSOURCES ${SYCLSOURCES} ${T_SYCLSOURCES} )
SET( CSOURCES ${CSOURCES} ${T_CSOURCES} )
SET( FSOURCES ${FSOURCES} ${T_FSOURCES} )
SET( SOURCES ${SOURCES} ${T_CXXSOURCES} ${T_CSOURCES} ${T_FSOURCES} ${CUDASOURCES} ${HIPSOURCES} )
SET( SOURCES ${SOURCES} ${T_CXXSOURCES} ${T_CSOURCES} ${T_FSOURCES} ${CUDASOURCES} ${HIPSOURCES} ${SYCLSOURCES} )
ENDMACRO()

349
sycl/BGK.dp.cpp Normal file
View File

@ -0,0 +1,349 @@
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdio.h>
#define NBLOCKS 1024
#define NTHREADS 512
void dvc_ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,
sycl::nd_item<3> item_ct1){
int n;
// conserved momemnts
double rho,ux,uy,uz,uu;
// non-conserved moments
double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if ( n<finish ){
// q=0
f0 = dist[n];
f1 = dist[2*Np+n];
f2 = dist[1*Np+n];
f3 = dist[4*Np+n];
f4 = dist[3*Np+n];
f5 = dist[6*Np+n];
f6 = dist[5*Np+n];
f7 = dist[8*Np+n];
f8 = dist[7*Np+n];
f9 = dist[10*Np+n];
f10 = dist[9*Np+n];
f11 = dist[12*Np+n];
f12 = dist[11*Np+n];
f13 = dist[14*Np+n];
f14 = dist[13*Np+n];
f15 = dist[16*Np+n];
f16 = dist[15*Np+n];
f17 = dist[18*Np+n];
f18 = dist[17*Np+n];
rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
ux = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
uy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
uz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
uu = 1.5*(ux*ux+uy*uy+uz*uz);
// q=0
dist[n] = f0*(1.0-rlx)+rlx*0.3333333333333333*(1.0-uu);
// q = 1
dist[1*Np+n] = f1*(1.0-rlx) + rlx*0.05555555555555555*(rho + 3.0*ux + 4.5*ux*ux - uu) + 0.16666666*Fx;
// q=2
dist[2*Np+n] = f2*(1.0-rlx) + rlx*0.05555555555555555*(rho - 3.0*ux + 4.5*ux*ux - uu)- 0.16666666*Fx;
// q = 3
dist[3*Np+n] = f3*(1.0-rlx) +
rlx*0.05555555555555555*(rho + 3.0*uy + 4.5*uy*uy - uu) + 0.16666666*Fy;
// q = 4
dist[4*Np+n] = f4*(1.0-rlx) +
rlx*0.05555555555555555*(rho - 3.0*uy + 4.5*uy*uy - uu)- 0.16666666*Fy;
// q = 5
dist[5*Np+n] = f5*(1.0-rlx) +
rlx*0.05555555555555555*(rho + 3.0*uz + 4.5*uz*uz - uu) + 0.16666666*Fz;
// q = 6
dist[6*Np+n] = f6*(1.0-rlx) +
rlx*0.05555555555555555*(rho - 3.0*uz + 4.5*uz*uz - uu) - 0.16666666*Fz;
// q = 7
dist[7*Np+n] = f7*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) + 0.08333333333*(Fx+Fy);
// q = 8
dist[8*Np+n] = f8*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) - 0.08333333333*(Fx+Fy);
// q = 9
dist[9*Np+n] = f9*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) + 0.08333333333*(Fx-Fy);
// q = 10
dist[10*Np+n] = f10*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) - 0.08333333333*(Fx-Fy);
// q = 11
dist[11*Np+n] = f11*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) + 0.08333333333*(Fx+Fz);
// q = 12
dist[12*Np+n] = f12*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) - 0.08333333333*(Fx+Fz);
// q = 13
dist[13*Np+n] = f13*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu) + 0.08333333333*(Fx-Fz);
// q= 14
dist[14*Np+n] = f14*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu)- 0.08333333333*(Fx-Fz);
// q = 15
dist[15*Np+n] = f15*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) + 0.08333333333*(Fy+Fz);
// q = 16
dist[16*Np+n] = f16*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) - 0.08333333333*(Fy+Fz);
// q = 17
dist[17*Np+n] = f17*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) + 0.08333333333*(Fy-Fz);
// q = 18
dist[18*Np+n] = f18*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) - 0.08333333333*(Fy-Fz);
//........................................................................
}
}
}
void dvc_ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,
sycl::nd_item<3> item_ct1){
int n;
// conserved momemnts
double rho,ux,uy,uz,uu;
// non-conserved moments
double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18;
int nr1,nr2,nr3,nr4,nr5,nr6,nr7,nr8,nr9,nr10,nr11,nr12,nr13,nr14,nr15,nr16,nr17,nr18;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if ( n<finish ){
// q=0
f0 = dist[n];
// q=1
nr1 = neighborList[n]; // neighbor 2 ( > 10Np => odd part of dist)
f1 = dist[nr1]; // reading the f1 data into register fq
nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist)
f2 = dist[nr2]; // reading the f2 data into register fq
// q=3
nr3 = neighborList[n+2*Np]; // neighbor 4
f3 = dist[nr3];
// q = 4
nr4 = neighborList[n+3*Np]; // neighbor 3
f4 = dist[nr4];
// q=5
nr5 = neighborList[n+4*Np];
f5 = dist[nr5];
// q = 6
nr6 = neighborList[n+5*Np];
f6 = dist[nr6];
// q=7
nr7 = neighborList[n+6*Np];
f7 = dist[nr7];
// q = 8
nr8 = neighborList[n+7*Np];
f8 = dist[nr8];
// q=9
nr9 = neighborList[n+8*Np];
f9 = dist[nr9];
// q = 10
nr10 = neighborList[n+9*Np];
f10 = dist[nr10];
// q=11
nr11 = neighborList[n+10*Np];
f11 = dist[nr11];
// q=12
nr12 = neighborList[n+11*Np];
f12 = dist[nr12];
// q=13
nr13 = neighborList[n+12*Np];
f13 = dist[nr13];
// q=14
nr14 = neighborList[n+13*Np];
f14 = dist[nr14];
// q=15
nr15 = neighborList[n+14*Np];
f15 = dist[nr15];
// q=16
nr16 = neighborList[n+15*Np];
f16 = dist[nr16];
// q=17
//fq = dist[18*Np+n];
nr17 = neighborList[n+16*Np];
f17 = dist[nr17];
// q=18
nr18 = neighborList[n+17*Np];
f18 = dist[nr18];
rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
ux = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
uy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
uz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
uu = 1.5*(ux*ux+uy*uy+uz*uz);
// q=0
dist[n] = f0*(1.0-rlx)+rlx*0.3333333333333333*(1.0-uu);
// q = 1
dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*(rho + 3.0*ux + 4.5*ux*ux - uu) + 0.16666666*Fx;
// q=2
dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*(rho - 3.0*ux + 4.5*ux*ux - uu)- 0.16666666*Fx;
// q = 3
dist[nr4] = f3*(1.0-rlx) +
rlx*0.05555555555555555*(rho + 3.0*uy + 4.5*uy*uy - uu) + 0.16666666*Fy;
// q = 4
dist[nr3] = f4*(1.0-rlx) +
rlx*0.05555555555555555*(rho - 3.0*uy + 4.5*uy*uy - uu)- 0.16666666*Fy;
// q = 5
dist[nr6] = f5*(1.0-rlx) +
rlx*0.05555555555555555*(rho + 3.0*uz + 4.5*uz*uz - uu) + 0.16666666*Fz;
// q = 6
dist[nr5] = f6*(1.0-rlx) +
rlx*0.05555555555555555*(rho - 3.0*uz + 4.5*uz*uz - uu) - 0.16666666*Fz;
// q = 7
dist[nr8] = f7*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) + 0.08333333333*(Fx+Fy);
// q = 8
dist[nr7] = f8*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) - 0.08333333333*(Fx+Fy);
// q = 9
dist[nr10] = f9*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) + 0.08333333333*(Fx-Fy);
// q = 10
dist[nr9] = f10*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) - 0.08333333333*(Fx-Fy);
// q = 11
dist[nr12] = f11*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) + 0.08333333333*(Fx+Fz);
// q = 12
dist[nr11] = f12*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) - 0.08333333333*(Fx+Fz);
// q = 13
dist[nr14] = f13*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu) + 0.08333333333*(Fx-Fz);
// q= 14
dist[nr13] = f14*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu)- 0.08333333333*(Fx-Fz);
// q = 15
dist[nr16] = f15*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) + 0.08333333333*(Fy+Fz);
// q = 16
dist[nr15] = f16*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) - 0.08333333333*(Fy+Fz);
// q = 17
dist[nr18] = f17*(1.0-rlx) +
rlx*0.02777777777777778*(rho + 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) + 0.08333333333*(Fy-Fz);
// q = 18
dist[nr17] = f18*(1.0-rlx) +
rlx*0.02777777777777778*(rho - 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) - 0.08333333333*(Fy-Fz);
}
}
}
extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){
/*
DPCT1049:15: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q19_AAeven_BGK(dist, start, finish, Np, rlx,
Fx, Fy, Fz, item_ct1);
});
/*
DPCT1010:150: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){
/*
DPCT1049:16: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q19_AAodd_BGK(neighborList, dist, start,
finish, Np, rlx, Fx, Fy, Fz,
item_ct1);
});
/*
DPCT1010:152: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}

4472
sycl/Color.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

35
sycl/CudaExtras.dp.cpp Normal file
View File

@ -0,0 +1,35 @@
// Basic cuda functions callable from C/C++ code
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size){
*address = (void *)sycl::malloc_device(size, dpct::get_default_queue());
dpct::get_default_queue().memset(*address, 0, size).wait();
}
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size){
dpct::get_default_queue().memcpy(dest, source, size).wait();
}
extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size){
dpct::get_default_queue().memcpy(dest, source, size).wait();
}
extern "C" void dvc_Barrier(){
dpct::get_current_device().queues_wait_and_throw();
}
/*
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
}
while (assumed != old); return __longlong_as_double(old);
}
#endif
*/

3282
sycl/D3Q19.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

413
sycl/D3Q7.dp.cpp Normal file
View File

@ -0,0 +1,413 @@
// GPU Functions for D3Q7 Lattice Boltzmann Methods
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdio.h>
#define NBLOCKS 560
#define NTHREADS 128
void dvc_ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N,
sycl::nd_item<3> item_ct1){
//....................................................................................
// Pack distribution q into the send buffer for the listed lattice sites
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int idx,n;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx<count){
n = list[idx];
sendbuf[idx] = Data[n];
}
}
void dvc_ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, double *Data, int N,
sycl::nd_item<3> item_ct1){
//....................................................................................
// Pack distribution q into the send buffer for the listed lattice sites
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int idx,n;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx<count){
n = list[idx];
Data[n] = recvbuf[idx];
}
}
void dvc_ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N,
sycl::nd_item<3> item_ct1){
//....................................................................................
// Pack distribution into the send buffer for the listed lattice sites
//....................................................................................
int idx,n,component;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx<count){
for (component=0; component<number; component++){
n = list[idx];
sendbuf[idx*number+component] = Data[number*n+component];
Data[number*n+component] = 0.0; // Set the data value to zero once it's in the buffer!
}
}
}
void dvc_ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int number, double *Data, int N,
sycl::nd_item<3> item_ct1){
//....................................................................................
// Unack distribution from the recv buffer
// Sum to the existing density value
//....................................................................................
int idx,n,component;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx<count){
for (component=0; component<number; component++){
n = list[idx];
Data[number*n+component] += recvbuf[idx*number+component];
}
}
}
void dvc_ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
double *recvbuf, double *dist, int N, sycl::nd_item<3> item_ct1){
//....................................................................................
// Unpack distribution from the recv buffer
// Distribution q matche Cqx, Cqy, Cqz
// swap rule means that the distributions in recvbuf are OPPOSITE of q
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int n,idx;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx<count){
// Get the value from the list -- note that n is the index is from the send (non-local) process
n = list[idx];
// unpack the distribution to the proper location
if (!(n<0)) { dist[q*N+n] = recvbuf[start+idx];
//printf("%f \n",,dist[q*N+n]);
}
}
}
void dvc_ScaLBL_D3Q7_Reflection_BC_z(int *list, double *dist, int count, int Np,
sycl::nd_item<3> item_ct1){
int idx, n;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx < count){
n = list[idx];
double f5 = 0.222222222222222222222222 - dist[6*Np+n];
dist[6*Np+n] = f5;
}
}
void dvc_ScaLBL_D3Q7_Reflection_BC_Z(int *list, double *dist, int count, int Np,
sycl::nd_item<3> item_ct1){
int idx, n;
idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx < count){
n = list[idx];
double f6 = 0.222222222222222222222222 - dist[5*Np+n];
dist[5*Np+n] = f6;
}
}
void dvc_ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz,
sycl::nd_item<3> item_ct1)
{
int n,N;
N = Nx*Ny*Nz;
double value;
char id;
int S = N/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (n<N){
id = ID[n];
if (id > 0){
value = Den[n];
f_even[n] = 0.3333333333333333*value;
f_odd[n] = 0.1111111111111111*value; //double(100*n)+1.f;
f_even[N+n] = 0.1111111111111111*value; //double(100*n)+2.f;
f_odd[N+n] = 0.1111111111111111*value; //double(100*n)+3.f;
f_even[2*N+n] = 0.1111111111111111*value; //double(100*n)+4.f;
f_odd[2*N+n] = 0.1111111111111111*value; //double(100*n)+5.f;
f_even[3*N+n] = 0.1111111111111111*value; //double(100*n)+6.f;
}
else{
for(int q=0; q<3; q++){
f_even[q*N+n] = -1.0;
f_odd[q*N+n] = -1.0;
}
f_even[3*N+n] = -1.0;
}
}
}
}
//*************************************************************************
void dvc_ScaLBL_D3Q7_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
sycl::nd_item<3> item_ct1)
{
int i,j,k,n,nn,N;
// distributions
double f1,f2,f3,f4,f5,f6;
char id;
N = Nx*Ny*Nz;
int S = N/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (n<N ){
id = ID[n];
if (id > 0){
//.......Back out the 3-D indices for node n..............
k = n/(Nx*Ny);
j = (n-Nx*Ny*k)/Nx;
i = n-Nx*Ny*k-Nx*j;
//........................................................................
// Retrieve even distributions from the local node (swap convention)
// f0 = disteven[n]; // Does not particupate in streaming
f1 = distodd[n];
f3 = distodd[N+n];
f5 = distodd[2*N+n];
//........................................................................
//........................................................................
// Retrieve odd distributions from neighboring nodes (swap convention)
//........................................................................
nn = n+1; // neighbor index (pull convention)
if (!(i+1<Nx)) nn -= Nx; // periodic BC along the x-boundary
//if (i+1<Nx){
f2 = disteven[N+nn]; // pull neighbor for distribution 2
if (!(f2 < 0.0)){
distodd[n] = f2;
disteven[N+nn] = f1;
}
//}
//........................................................................
nn = n+Nx; // neighbor index (pull convention)
if (!(j+1<Ny)) nn -= Nx*Ny; // Perioidic BC along the y-boundary
//if (j+1<Ny){
f4 = disteven[2*N+nn]; // pull neighbor for distribution 4
if (!(f4 < 0.0)){
distodd[N+n] = f4;
disteven[2*N+nn] = f3;
}
//........................................................................
nn = n+Nx*Ny; // neighbor index (pull convention)
if (!(k+1<Nz)) nn -= Nx*Ny*Nz; // Perioidic BC along the z-boundary
//if (k+1<Nz){
f6 = disteven[3*N+nn]; // pull neighbor for distribution 6
if (!(f6 < 0.0)){
distodd[2*N+n] = f6;
disteven[3*N+nn] = f5;
}
}
}
}
}
//*************************************************************************
void dvc_ScaLBL_D3Q7_Density(char *ID, double *disteven, double *distodd, double *Den,
int Nx, int Ny, int Nz, sycl::nd_item<3> item_ct1)
{
char id;
int n;
double f0,f1,f2,f3,f4,f5,f6;
int N = Nx*Ny*Nz;
int S = N/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (n<N){
id = ID[n];
if (id > 0 ){
// Read the distributions
f0 = disteven[n];
f2 = disteven[N+n];
f4 = disteven[2*N+n];
f6 = disteven[3*N+n];
f1 = distodd[n];
f3 = distodd[N+n];
f5 = distodd[2*N+n];
// Compute the density
Den[n] = f0+f1+f2+f3+f4+f5+f6;
}
}
}
}
extern "C" void ScaLBL_D3Q7_Reflection_BC_z(int *list, double *dist, int count, int Np){
int GRID = count / 512 + 1;
/*
DPCT1049:75: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Reflection_BC_z(list, dist, count, Np,
item_ct1);
});
/*
DPCT1010:311: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_D3Q7_Reflection_BC_Z(int *list, double *dist, int count, int Np){
int GRID = count / 512 + 1;
/*
DPCT1049:76: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Reflection_BC_Z(list, dist, count, Np,
item_ct1);
});
/*
DPCT1010:313: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){
int GRID = count / 512 + 1;
/*
DPCT1049:77: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Unpack(q, list, start, count, recvbuf, dist,
N, item_ct1);
});
}
extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
int GRID = count / 512 + 1;
/*
DPCT1049:78: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_Scalar_Pack(list, count, sendbuf, Data, N,
item_ct1);
});
}
extern "C" void ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, double *Data, int N){
int GRID = count / 512 + 1;
/*
DPCT1049:79: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_Scalar_Unpack(list, count, recvbuf, Data, N,
item_ct1);
});
}
extern "C" void ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N){
int GRID = count / 512 + 1;
/*
DPCT1049:80: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_PackDenD3Q7(list, count, sendbuf, number, Data,
N, item_ct1);
});
}
extern "C" void ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int number, double *Data, int N){
int GRID = count / 512 + 1;
/*
DPCT1049:81: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, GRID) *
sycl::range<3>(1, 1, 512),
sycl::range<3>(1, 1, 512)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_UnpackDenD3Q7(list, count, recvbuf, number, Data,
N, item_ct1);
});
}
extern "C" void ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz){
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Init(ID, f_even, f_odd, Den, Nx, Ny, Nz,
item_ct1);
});
}
extern "C" void ScaLBL_D3Q7_Swap(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz){
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Swap(ID, disteven, distodd, Nx, Ny, Nz,
item_ct1);
});
}
extern "C" void ScaLBL_D3Q7_Density(char *ID, double *disteven, double *distodd, double *Den,
int Nx, int Ny, int Nz){
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Density(ID, disteven, distodd, Den, Nx, Ny,
Nz, item_ct1);
});
}

1301
sycl/D3Q7BC.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

102
sycl/Extras.dp.cpp Normal file
View File

@ -0,0 +1,102 @@
/*
Copyright 2013--2018 James E. McClure, Virginia Polytechnic & State University
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
// Basic cuda functions callable from C/C++ code
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdio.h>
extern "C" int ScaLBL_SetDevice(int rank){
int n_devices;
//int local_rank = atoi(getenv("MV2_COMM_WORLD_LOCAL_RANK"));
n_devices = dpct::dev_mgr::instance().device_count();
//int device = local_rank % n_devices;
int device = rank % n_devices;
/*
DPCT1093:208: The "device" may not be the best XPU device. Adjust the
selected device if needed.
*/
dpct::select_device(device);
if (rank < n_devices) printf("MPI rank=%i will use GPU ID %i / %i \n",rank,device,n_devices);
return device;
}
extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
*address = (void *)sycl::malloc_device(size, dpct::get_default_queue());
/*
DPCT1010:209: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
sycl::free(pointer, dpct::get_default_queue());
}
extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
dpct::get_default_queue().memcpy(dest, source, size).wait();
/*
DPCT1010:211: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_AllocateZeroCopy(void** address, size_t size){
//cudaMallocHost(address,size);
*address = (void *)sycl::malloc_device(size, dpct::get_default_queue());
/*
DPCT1010:213: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_CopyToZeroCopy(void* dest, const void* source, size_t size){
dpct::get_default_queue().memcpy(dest, source, size).wait();
/*
DPCT1010:215: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
//memcpy(dest, source, size);
}
extern "C" void ScaLBL_CopyToHost(void* dest, const void* source, size_t size){
int errorStatus=0;
// BUILD_FIX_AFTER_MIGRATION : Commenting below line , instead add try catch block. Fix TestBubbleDFH test case.
//dpct::get_default_queue().memcpy(dest, source, size).wait();
/*
DPCT1010:216: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
try {
errorStatus=(dpct::get_default_queue().memcpy(dest, source, size).wait(),0);
}
catch (sycl::exception const &exc) {
printf("Error in cudaMemcpy (device->host): %s \n",exc.what());
}
}
extern "C" void ScaLBL_DeviceBarrier(){
dpct::get_current_device().queues_wait_and_throw();
}

6622
sycl/FreeLee.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

2860
sycl/Greyscale.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

3165
sycl/GreyscaleColor.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

1207
sycl/Ion.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

329
sycl/MRT.dp.cpp Normal file
View File

@ -0,0 +1,329 @@
//*************************************************************************
// CUDA kernels for single-phase ScaLBL_D3Q19_MRT code
// James McClure
//*************************************************************************
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#define NBLOCKS 1024
#define NTHREADS 512
void INITIALIZE(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz,
sycl::nd_item<3> item_ct1)
{
int n,N;
N = Nx*Ny*Nz;
int S = N/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (n<N){
if (ID[n] > 0){
f_even[n] = 0.3333333333333333;
f_odd[n] = 0.055555555555555555; //double(100*n)+1.f;
f_even[N+n] = 0.055555555555555555; //double(100*n)+2.f;
f_odd[N+n] = 0.055555555555555555; //double(100*n)+3.f;
f_even[2*N+n] = 0.055555555555555555; //double(100*n)+4.f;
f_odd[2*N+n] = 0.055555555555555555; //double(100*n)+5.f;
f_even[3*N+n] = 0.055555555555555555; //double(100*n)+6.f;
f_odd[3*N+n] = 0.0277777777777778; //double(100*n)+7.f;
f_even[4*N+n] = 0.0277777777777778; //double(100*n)+8.f;
f_odd[4*N+n] = 0.0277777777777778; //double(100*n)+9.f;
f_even[5*N+n] = 0.0277777777777778; //double(100*n)+10.f;
f_odd[5*N+n] = 0.0277777777777778; //double(100*n)+11.f;
f_even[6*N+n] = 0.0277777777777778; //double(100*n)+12.f;
f_odd[6*N+n] = 0.0277777777777778; //double(100*n)+13.f;
f_even[7*N+n] = 0.0277777777777778; //double(100*n)+14.f;
f_odd[7*N+n] = 0.0277777777777778; //double(100*n)+15.f;
f_even[8*N+n] = 0.0277777777777778; //double(100*n)+16.f;
f_odd[8*N+n] = 0.0277777777777778; //double(100*n)+17.f;
f_even[9*N+n] = 0.0277777777777778; //double(100*n)+18.f;
}
else{
for(int q=0; q<9; q++){
f_even[q*N+n] = -1.0;
f_odd[q*N+n] = -1.0;
}
f_even[9*N+n] = -1.0;
}
}
}
}
void Compute_VELOCITY(char *ID, double *disteven, double *distodd, double *vel, int Nx, int Ny, int Nz,
sycl::nd_item<3> item_ct1)
{
int n,N;
// distributions
double f1,f2,f3,f4,f5,f6,f7,f8,f9;
double f10,f11,f12,f13,f14,f15,f16,f17,f18;
double vx,vy,vz;
N = Nx*Ny*Nz;
int S = N/NBLOCKS/NTHREADS + 1;
// S - number of threadblocks per grid block
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (n<N){
if (ID[n] > 0){
//........................................................................
// Registers to store the distributions
//........................................................................
f2 = disteven[N+n];
f4 = disteven[2*N+n];
f6 = disteven[3*N+n];
f8 = disteven[4*N+n];
f10 = disteven[5*N+n];
f12 = disteven[6*N+n];
f14 = disteven[7*N+n];
f16 = disteven[8*N+n];
f18 = disteven[9*N+n];
//........................................................................
f1 = distodd[n];
f3 = distodd[1*N+n];
f5 = distodd[2*N+n];
f7 = distodd[3*N+n];
f9 = distodd[4*N+n];
f11 = distodd[5*N+n];
f13 = distodd[6*N+n];
f15 = distodd[7*N+n];
f17 = distodd[8*N+n];
//.................Compute the velocity...................................
vx = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
vy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
vz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
//..................Write the velocity.....................................
vel[n] = vx;
vel[N+n] = vy;
vel[2*N+n] = vz;
//........................................................................
}
}
}
}
//*************************************************************************
void
D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz,
sycl::nd_item<3> item_ct1)
{
int n,N;
// distributions
double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9;
double f10,f11,f12,f13,f14,f15,f16,f17,f18;
// conserved momemnts
double rho,jx,jy,jz;
// non-conserved moments
double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18;
N = Nx*Ny*Nz;
char id;
int S = N/NBLOCKS/NTHREADS + 1;
// S - number of threadblocks per grid block
for (int s=0; s<S; s++){
// for (int n=0; n<N; n++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
id = ID[n];
if (n<N){
if (id > 0){
//........................................................................
// Registers to store the distributions - read based on swap convention
//........................................................................
f2 = distodd[n];
f4 = distodd[N+n];
f6 = distodd[2*N+n];
f8 = distodd[3*N+n];
f10 = distodd[4*N+n];
f12 = distodd[5*N+n];
f14 = distodd[6*N+n];
f16 = distodd[7*N+n];
f18 = distodd[8*N+n];
//........................................................................
f0 = disteven[n];
f1 = disteven[N+n];
f3 = disteven[2*N+n];
f5 = disteven[3*N+n];
f7 = disteven[4*N+n];
f9 = disteven[5*N+n];
f11 = disteven[6*N+n];
f13 = disteven[7*N+n];
f15 = disteven[8*N+n];
f17 = disteven[9*N+n];
//........................................................................
//....................compute the moments...............................................
rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
m1 = -30*f0-11*(f2+f1+f4+f3+f6+f5)+8*(f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18 +f17);
m2 = 12*f0-4*(f2+f1 +f4+f3+f6 +f5)+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17;
jx = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14;
m4 = 4*(-f1+f2)+f7-f8+f9-f10+f11-f12+f13-f14;
jy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18;
m6 = -4*(f3-f4)+f7-f8-f9+f10+f15-f16+f17-f18;
jz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18;
m8 = -4*(f5-f6)+f11-f12-f13+f14+f15-f16-f17+f18;
m9 = 2*(f1+f2)-f3-f4-f5-f6+f7+f8+f9+f10+f11+f12+f13+f14-2*(f15+f16+f17+f18);
m10 = -4*(f1+f2)+2*(f4+f3+f6+f5)+f8+f7+f10+f9+f12+f11+f14+f13-2*(f16+f15+f18+f17);
m11 = f4+f3-f6-f5+f8+f7+f10+f9-f12-f11-f14-f13;
m12 = -2*(f4+f3-f6-f5)+f8+f7+f10+f9-f12-f11-f14-f13;
m13 = f8+f7-f10-f9;
m14 = f16+f15-f18-f17;
m15 = f12+f11-f14-f13;
m16 = f7-f8+f9-f10-f11+f12-f13+f14;
m17 = -f7+f8+f9-f10+f15-f16+f17-f18;
m18 = f11-f12-f13+f14-f15+f16+f17-f18;
//..............incorporate external force................................................
//jx += 0.5*Fx;
//jy += 0.5*Fy;
//jz += 0.5*Fz;
//..............carry out relaxation process...............................................
m1 = m1 + rlx_setA*((19*(jx*jx+jy*jy+jz*jz)/rho - 11*rho) - m1);
m2 = m2 + rlx_setA*((3*rho - 5.5*(jx*jx+jy*jy+jz*jz)/rho) - m2);
m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4);
m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6);
m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8);
m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/rho) - m9);
m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/rho) - m10);
m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/rho) - m11);
m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/rho) - m12);
m13 = m13 + rlx_setA*((jx*jy/rho) - m13);
m14 = m14 + rlx_setA*((jy*jz/rho) - m14);
m15 = m15 + rlx_setA*((jx*jz/rho) - m15);
m16 = m16 + rlx_setB*( - m16);
m17 = m17 + rlx_setB*( - m17);
m18 = m18 + rlx_setB*( - m18);
//.................inverse transformation......................................................
f0 = 0.05263157894736842*rho-0.012531328320802*m1+0.04761904761904762*m2;
f1 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
+0.1*(jx-m4)+0.05555555555555555*(m9-m10);
f2 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
+0.1*(m4-jx)+0.05555555555555555*(m9-m10);
f3 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
+0.1*(jy-m6)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m11-m12);
f4 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
+0.1*(m6-jy)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m11-m12);
f5 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
+0.1*(jz-m8)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m12-m11);
f6 = 0.05263157894736842*rho-0.004594820384294068*m1-0.01587301587301587*m2
+0.1*(m8-jz)+0.02777777777777778*(m10-m9)+0.08333333333333333*(m12-m11);
f7 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2+0.1*(jx+jy)+0.025*(m4+m6)
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
+0.04166666666666666*m12+0.25*m13+0.125*(m16-m17);
f8 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2-0.1*(jx+jy)-0.025*(m4+m6)
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
+0.04166666666666666*m12+0.25*m13+0.125*(m17-m16);
f9 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2+0.1*(jx-jy)+0.025*(m4-m6)
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
+0.04166666666666666*m12-0.25*m13+0.125*(m16+m17);
f10 = 0.05263157894736842*rho+0.003341687552213868*m1+0.003968253968253968*m2+0.1*(jy-jx)+0.025*(m6-m4)
+0.02777777777777778*m9+0.01388888888888889*m10+0.08333333333333333*m11
+0.04166666666666666*m12-0.25*m13-0.125*(m16+m17);
f11 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2+0.1*(jx+jz)+0.025*(m4+m8)
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
-0.04166666666666666*m12+0.25*m15+0.125*(m18-m16);
f12 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2-0.1*(jx+jz)-0.025*(m4+m8)
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
-0.04166666666666666*m12+0.25*m15+0.125*(m16-m18);
f13 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2+0.1*(jx-jz)+0.025*(m4-m8)
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
-0.04166666666666666*m12-0.25*m15-0.125*(m16+m18);
f14 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2+0.1*(jz-jx)+0.025*(m8-m4)
+0.02777777777777778*m9+0.01388888888888889*m10-0.08333333333333333*m11
-0.04166666666666666*m12-0.25*m15+0.125*(m16+m18);
f15 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2+0.1*(jy+jz)+0.025*(m6+m8)
-0.05555555555555555*m9-0.02777777777777778*m10+0.25*m14+0.125*(m17-m18);
f16 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2-0.1*(jy+jz)-0.025*(m6+m8)
-0.05555555555555555*m9-0.02777777777777778*m10+0.25*m14+0.125*(m18-m17);
f17 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2+0.1*(jy-jz)+0.025*(m6-m8)
-0.05555555555555555*m9-0.02777777777777778*m10-0.25*m14+0.125*(m17+m18);
f18 = 0.05263157894736842*rho+0.003341687552213868*m1
+0.003968253968253968*m2+0.1*(jz-jy)+0.025*(m8-m6)
-0.05555555555555555*m9-0.02777777777777778*m10-0.25*m14-0.125*(m17+m18);
//.......................................................................................................
// incorporate external force
f1 += 0.16666666*Fx;
f2 -= 0.16666666*Fx;
f3 += 0.16666666*Fy;
f4 -= 0.16666666*Fy;
f5 += 0.16666666*Fz;
f6 -= 0.16666666*Fz;
f7 += 0.08333333333*(Fx+Fy);
f8 -= 0.08333333333*(Fx+Fy);
f9 += 0.08333333333*(Fx-Fy);
f10 -= 0.08333333333*(Fx-Fy);
f11 += 0.08333333333*(Fx+Fz);
f12 -= 0.08333333333*(Fx+Fz);
f13 += 0.08333333333*(Fx-Fz);
f14 -= 0.08333333333*(Fx-Fz);
f15 += 0.08333333333*(Fy+Fz);
f16 -= 0.08333333333*(Fy+Fz);
f17 += 0.08333333333*(Fy-Fz);
f18 -= 0.08333333333*(Fy-Fz);
//.......................................................................................................
// Write data based on un-swapped convention
disteven[n] = f0;
disteven[N+n] = f2;
disteven[2*N+n] = f4;
disteven[3*N+n] = f6;
disteven[4*N+n] = f8;
disteven[5*N+n] = f10;
disteven[6*N+n] = f12;
disteven[7*N+n] = f14;
disteven[8*N+n] = f16;
disteven[9*N+n] = f18;
distodd[n] = f1;
distodd[N+n] = f3;
distodd[2*N+n] = f5;
distodd[3*N+n] = f7;
distodd[4*N+n] = f9;
distodd[5*N+n] = f11;
distodd[6*N+n] = f13;
distodd[7*N+n] = f15;
distodd[8*N+n] = f17;
//.......................................................................................................
}
}
}
}
extern "C" void ScaLBL_D3Q19_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,int Nx, int Ny, int Nz)
{
/*
DPCT1049:13: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
D3Q19_MRT(ID, f_even, f_odd, Nx, Ny, Nz, rlxA, rlxB, Fx, Fy,
Fz, item_ct1);
});
}

100
sycl/MixedGradient.dp.cpp Normal file
View File

@ -0,0 +1,100 @@
/* Implement Mixed Gradient (Lee et al. JCP 2016)*/
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdio.h>
#define NBLOCKS 560
#define NTHREADS 128
void dvc_ScaLBL_D3Q19_MixedGradient(int *Map, double *Phi, double *Gradient, int start, int finish, int Np, int Nx, int Ny, int Nz,
sycl::nd_item<3> item_ct1)
{
// BUILD_FIX_AFTER_MIGRATION : Replaced static int with static const int
static const int D3Q19[18][3]={{1,0,0},{-1,0,0},{0,1,0},{0,-1,0},{0,0,1},{0,0,-1},
{1,1,0},{-1,-1,0},{1,-1,0},{-1,1,0},
{1,0,1},{-1,0,-1},{1,0,-1},{-1,0,1},
{0,1,1},{0,-1,-1},{0,1,-1},{0,-1,1}};
int i,j,k,n,N,idx;
int np,np2,nm; // neighbors
double v,vp,vp2,vm; // values at neighbors
double grad;
N = Nx*Ny*Nz;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
idx = start +
S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
if (idx<finish){
n = Map[idx]; // layout in regular array
//.......Back out the 3-D indices for node n..............
k = n/(Nx*Ny);
j = (n-Nx*Ny*k)/Nx;
i = n-Nx*Ny*k-Nx*j;
v = Phi[n];
grad = 0.0;
for (int q=0; q<6; q++){
int iqx = D3Q19[q][0];
int iqy = D3Q19[q][1];
int iqz = D3Q19[q][2];
np = (k+iqz)*Nx*Ny + (j+iqy)*Nx + i + iqx;
np2 = (k+2*iqz)*Nx*Ny + (j+2*iqy)*Nx + i + 2*iqx;
nm = (k-iqz)*Nx*Ny + (j-iqy)*Nx + i - iqx;
vp = Phi[np];
vp2 = Phi[np2];
vm = Phi[nm];
grad += 0.25*(5.0*vp-vp2-3.0*v-vm);
}
for (int q=6; q<18; q++){
int iqx = D3Q19[q][0];
int iqy = D3Q19[q][1];
int iqz = D3Q19[q][2];
np = (k+iqz)*Nx*Ny + (j+iqy)*Nx + i + iqx;
np2 = (k+2*iqz)*Nx*Ny + (j+2*iqy)*Nx + i + 2*iqx;
nm = (k-iqz)*Nx*Ny + (j-iqy)*Nx + i - iqx;
vp = Phi[np];
vp2 = Phi[np2];
vm = Phi[nm];
grad += 0.125*(5.0*vp-vp2-3.0*v-vm);
}
Gradient[n] = grad;
}
}
}
extern "C" void ScaLBL_D3Q19_MixedGradient(int *Map, double *Phi, double *Gradient, int start, int finish, int Np, int Nx, int Ny, int Nz)
{
/*
DPCT1007:82: Migration of cudaProfilerStart is not supported.
*/
// BUILD_FIX_AFTER_MIGRATION : Commented below line.
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q19_MixedGradient(Map, Phi, Gradient, start,
finish, Np, Nx, Ny, Nz,
item_ct1);
});
/*
DPCT1010:84: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
/*
DPCT1007:83: Migration of cudaProfilerStop is not supported.
*/
// BUILD_FIX_AFTER_MIGRATION : Commented below line.
//cudaProfilerStop();
}

845
sycl/Poisson.dp.cpp Normal file
View File

@ -0,0 +1,845 @@
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdio.h>
#include <math.h>
//#include <cuda_profiler_api.h>
#define NBLOCKS 1024
#define NTHREADS 256
void dvc_ScaLBL_D3Q7_AAodd_Poisson_ElectricPotential(int *neighborList,int *Map, double *dist, double *Psi, int start, int finish, int Np,
sycl::nd_item<3> item_ct1){
int n;
double psi;//electric potential
double fq;
int nread;
int idx;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
// q=0
fq = dist[n];
psi = fq;
// q=1
nread = neighborList[n];
fq = dist[nread];
psi += fq;
// q=2
nread = neighborList[n+Np];
fq = dist[nread];
psi += fq;
// q=3
nread = neighborList[n+2*Np];
fq = dist[nread];
psi += fq;
// q = 4
nread = neighborList[n+3*Np];
fq = dist[nread];
psi += fq;
// q=5
nread = neighborList[n+4*Np];
fq = dist[nread];
psi += fq;
// q = 6
nread = neighborList[n+5*Np];
fq = dist[nread];
psi += fq;
idx=Map[n];
Psi[idx] = psi;
}
}
}
void dvc_ScaLBL_D3Q7_AAeven_Poisson_ElectricPotential(int *Map, double *dist, double *Psi, int start, int finish, int Np,
sycl::nd_item<3> item_ct1){
int n;
double psi;//electric potential
double fq;
int idx;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
// q=0
fq = dist[n];
psi = fq;
// q=1
fq = dist[2*Np+n];
psi += fq;
// q=2
fq = dist[1*Np+n];
psi += fq;
// q=3
fq = dist[4*Np+n];
psi += fq;
// q=4
fq = dist[3*Np+n];
psi += fq;
// q=5
fq = dist[6*Np+n];
psi += fq;
// q=6
fq = dist[5*Np+n];
psi += fq;
idx=Map[n];
Psi[idx] = psi;
}
}
}
void dvc_ScaLBL_D3Q7_AAodd_Poisson(int *neighborList, int *Map, double *dist, double *Den_charge, double *Psi, double *ElectricField, double tau, double epsilon_LB,bool UseSlippingVelBC,int start, int finish, int Np,
sycl::nd_item<3> item_ct1){
int n;
double psi;//electric potential
double Ex,Ey,Ez;//electric field
double rho_e;//local charge density
double f0,f1,f2,f3,f4,f5,f6;
int nr1,nr2,nr3,nr4,nr5,nr6;
double rlx=1.0/tau;
int idx;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
//Load data
//When Helmholtz-Smoluchowski slipping velocity BC is used, the bulk fluid is considered as electroneutral
//and thus the net space charge density is zero.
rho_e = (UseSlippingVelBC==1) ? 0.0 : Den_charge[n] / epsilon_LB;
idx=Map[n];
psi = Psi[idx];
// q=0
f0 = dist[n];
// q=1
nr1 = neighborList[n]; // neighbor 2 ( > 10Np => odd part of dist)
f1 = dist[nr1]; // reading the f1 data into register fq
nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist)
f2 = dist[nr2]; // reading the f2 data into register fq
// q=3
nr3 = neighborList[n+2*Np]; // neighbor 4
f3 = dist[nr3];
// q = 4
nr4 = neighborList[n+3*Np]; // neighbor 3
f4 = dist[nr4];
// q=5
nr5 = neighborList[n+4*Np];
f5 = dist[nr5];
// q = 6
nr6 = neighborList[n+5*Np];
f6 = dist[nr6];
Ex = (f1-f2)*rlx*4.0;//NOTE the unit of electric field here is V/lu
Ey = (f3-f4)*rlx*4.0;//factor 4.0 is D3Q7 lattice speed of sound
Ez = (f5-f6)*rlx*4.0;
ElectricField[n+0*Np] = Ex;
ElectricField[n+1*Np] = Ey;
ElectricField[n+2*Np] = Ez;
// q = 0
dist[n] = f0*(1.0-rlx) + 0.25*(rlx*psi+rho_e);
// q = 1
dist[nr2] = f1*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 2
dist[nr1] = f2*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 3
dist[nr4] = f3*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 4
dist[nr3] = f4*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 5
dist[nr6] = f5*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 6
dist[nr5] = f6*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
//........................................................................
}
}
}
void dvc_ScaLBL_D3Q7_AAeven_Poisson(int *Map, double *dist, double *Den_charge, double *Psi, double *ElectricField, double tau, double epsilon_LB,bool UseSlippingVelBC,int start, int finish, int Np,
sycl::nd_item<3> item_ct1){
int n;
double psi;//electric potential
double Ex,Ey,Ez;//electric field
double rho_e;//local charge density
double f0,f1,f2,f3,f4,f5,f6;
double rlx=1.0/tau;
int idx;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
//Load data
//When Helmholtz-Smoluchowski slipping velocity BC is used, the bulk fluid is considered as electroneutral
//and thus the net space charge density is zero.
rho_e = (UseSlippingVelBC==1) ? 0.0 : Den_charge[n] / epsilon_LB;
idx=Map[n];
psi = Psi[idx];
f0 = dist[n];
f1 = dist[2*Np+n];
f2 = dist[1*Np+n];
f3 = dist[4*Np+n];
f4 = dist[3*Np+n];
f5 = dist[6*Np+n];
f6 = dist[5*Np+n];
Ex = (f1-f2)*rlx*4.0;//NOTE the unit of electric field here is V/lu
Ey = (f3-f4)*rlx*4.0;//factor 4.0 is D3Q7 lattice speed of sound
Ez = (f5-f6)*rlx*4.0;
ElectricField[n+0*Np] = Ex;
ElectricField[n+1*Np] = Ey;
ElectricField[n+2*Np] = Ez;
// q = 0
dist[n] = f0*(1.0-rlx) + 0.25*(rlx*psi+rho_e);
// q = 1
dist[1*Np+n] = f1*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 2
dist[2*Np+n] = f2*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 3
dist[3*Np+n] = f3*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 4
dist[4*Np+n] = f4*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 5
dist[5*Np+n] = f5*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
// q = 6
dist[6*Np+n] = f6*(1.0-rlx) + 0.125*(rlx*psi+rho_e);
//........................................................................
}
}
}
void dvc_ScaLBL_D3Q7_Poisson_Init(int *Map, double *dist, double *Psi, int start, int finish, int Np,
sycl::nd_item<3> item_ct1){
int n;
int ijk;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
ijk = Map[n];
dist[0*Np+n] = 0.25*Psi[ijk];
dist[1*Np+n] = 0.125*Psi[ijk];
dist[2*Np+n] = 0.125*Psi[ijk];
dist[3*Np+n] = 0.125*Psi[ijk];
dist[4*Np+n] = 0.125*Psi[ijk];
dist[5*Np+n] = 0.125*Psi[ijk];
dist[6*Np+n] = 0.125*Psi[ijk];
}
}
}
void dvc_ScaLBL_D3Q19_AAeven_Poisson_ElectricPotential(
int *Map, double *dist, double *Den_charge, double *Psi, double epsilon_LB, bool UseSlippingVelBC, int start, int finish, int Np) {
int n;
double psi,sum; //electric potential
double rho_e; //local charge density
double f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15,
f16, f17, f18;
double Gs;
int idx;
for (n = start; n < finish; n++) {
rho_e = (UseSlippingVelBC==1) ? 0.0 : Den_charge[n] / epsilon_LB;
//........................................................................
// q=0
f0 = dist[n];
f1 = dist[2 * Np + n];
f2 = dist[1 * Np + n];
f3 = dist[4 * Np + n];
f4 = dist[3 * Np + n];
f5 = dist[6 * Np + n];
f6 = dist[5 * Np + n];
f7 = dist[8 * Np + n];
f8 = dist[7 * Np + n];
f9 = dist[10 * Np + n];
f10 = dist[9 * Np + n];
f11 = dist[12 * Np + n];
f12 = dist[11 * Np + n];
f13 = dist[14 * Np + n];
f14 = dist[13 * Np + n];
f15 = dist[16 * Np + n];
f16 = dist[15 * Np + n];
f17 = dist[18 * Np + n];
f18 = dist[17 * Np + n];
psi = f0 + f2 + f1 + f4 + f3 + f6 + f5 + f8 + f7 + f10 + f9 + f12 +
f11 + f14 + f13 + f16 + f15 + f18 + f17;
idx = Map[n];
Psi[idx] = psi - 0.5*rho_e;
}
}
void dvc_ScaLBL_D3Q19_AAodd_Poisson(int *neighborList, int *Map,
double *dist, double *Den_charge,
double *Psi, double *ElectricField,
double tau, double epsilon_LB, bool UseSlippingVelBC,
int start, int finish, int Np, sycl::nd_item<3> item_ct1) {
int n;
double psi; //electric potential
double Ex, Ey, Ez; //electric field
double rho_e; //local charge density
double f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15,
f16, f17, f18;
int nr1, nr2, nr3, nr4, nr5, nr6, nr7, nr8, nr9, nr10, nr11, nr12, nr13,
nr14, nr15, nr16, nr17, nr18;
double error,sum_q;
double rlx = 1.0 / tau;
int idx;
double W0 = 0.5;
double W1 = 1.0/24.0;
double W2 = 1.0/48.0;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
//Load data
//When Helmholtz-Smoluchowski slipping velocity BC is used, the bulk fluid is considered as electroneutral
//and thus the net space charge density is zero.
rho_e = (UseSlippingVelBC==1) ? 0.0 : Den_charge[n] / epsilon_LB;
// q=0
f0 = dist[n];
// q=1
nr1 = neighborList[n]; // neighbor 2 ( > 10Np => odd part of dist)
f1 = dist[nr1]; // reading the f1 data into register fq
nr2 = neighborList[n + Np]; // neighbor 1 ( < 10Np => even part of dist)
f2 = dist[nr2]; // reading the f2 data into register fq
// q=3
nr3 = neighborList[n + 2 * Np]; // neighbor 4
f3 = dist[nr3];
// q = 4
nr4 = neighborList[n + 3 * Np]; // neighbor 3
f4 = dist[nr4];
// q=5
nr5 = neighborList[n + 4 * Np];
f5 = dist[nr5];
// q = 6
nr6 = neighborList[n + 5 * Np];
f6 = dist[nr6];
// q=7
nr7 = neighborList[n + 6 * Np];
f7 = dist[nr7];
// q = 8
nr8 = neighborList[n + 7 * Np];
f8 = dist[nr8];
// q=9
nr9 = neighborList[n + 8 * Np];
f9 = dist[nr9];
// q = 10
nr10 = neighborList[n + 9 * Np];
f10 = dist[nr10];
// q=11
nr11 = neighborList[n + 10 * Np];
f11 = dist[nr11];
// q=12
nr12 = neighborList[n + 11 * Np];
f12 = dist[nr12];
// q=13
nr13 = neighborList[n + 12 * Np];
f13 = dist[nr13];
// q=14
nr14 = neighborList[n + 13 * Np];
f14 = dist[nr14];
// q=15
nr15 = neighborList[n + 14 * Np];
f15 = dist[nr15];
// q=16
nr16 = neighborList[n + 15 * Np];
f16 = dist[nr16];
// q=17
//fq = dist[18*Np+n];
nr17 = neighborList[n + 16 * Np];
f17 = dist[nr17];
// q=18
nr18 = neighborList[n + 17 * Np];
f18 = dist[nr18];
sum_q = f1+f2+f3+f4+f5+f6+f7+f8+f9+f10+f11+f12+f13+f14+f15+f16+f17+f18;
error = 8.0*(sum_q - f0) + rho_e;
psi = 2.0*(f0*(1.0 - rlx) + rlx*(sum_q + 0.125*rho_e));
idx = Map[n];
Psi[idx] = psi;
Ex = (f1 - f2 + 0.5*(f7 - f8 + f9 - f10 + f11 - f12 + f13 - f14))*4.0; //NOTE the unit of electric field here is V/lu
Ey = (f3 - f4 + 0.5*(f7 - f8 - f9 + f10 + f15 - f16 + f17 - f18))*4.0;
Ez = (f5 - f6 + 0.5*(f11 - f12 - f13 + f14 + f15 - f16 - f17 + f18))*4.0;
ElectricField[n + 0 * Np] = Ex;
ElectricField[n + 1 * Np] = Ey;
ElectricField[n + 2 * Np] = Ez;
// q = 0
dist[n] = W0*psi; //f0 * (1.0 - rlx) - (1.0-0.5*rlx)*W0*rho_e;
// q = 1
dist[nr2] = W1*psi; //f1 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 2
dist[nr1] = W1*psi; //f2 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 3
dist[nr4] = W1*psi; //f3 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 4
dist[nr3] = W1*psi; //f4 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 5
dist[nr6] = W1*psi; //f5 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 6
dist[nr5] = W1*psi; //f6 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
//........................................................................
// q = 7
dist[nr8] = W2*psi; //f7 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 8
dist[nr7] = W2*psi; //f8 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 9
dist[nr10] = W2*psi; //f9 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 10
dist[nr9] = W2*psi; //f10 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 11
dist[nr12] = W2*psi; //f11 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 12
dist[nr11] = W2*psi; //f12 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 13
dist[nr14] = W2*psi; //f13 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q= 14
dist[nr13] = W2*psi; //f14 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 15
dist[nr16] = W2*psi; //f15 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 16
dist[nr15] = W2*psi; //f16 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 17
dist[nr18] = W2*psi; //f17 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
// q = 18
dist[nr17] = W2*psi; //f18 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
}
}
}
void dvc_ScaLBL_D3Q19_AAeven_Poisson(int *Map, double *dist,
double *Den_charge, double *Psi,
double *ElectricField, double *Error, double tau,
double epsilon_LB, bool UseSlippingVelBC,
int start, int finish, int Np, sycl::nd_item<3> item_ct1) {
int n;
double psi; //electric potential
double Ex, Ey, Ez; //electric field
double rho_e; //local charge density
double f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15,
f16, f17, f18;
double error,sum_q;
double rlx = 1.0 / tau;
int idx;
double W0 = 0.5;
double W1 = 1.0/24.0;
double W2 = 1.0/48.0;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
//Load data
//When Helmholtz-Smoluchowski slipping velocity BC is used, the bulk fluid is considered as electroneutral
//and thus the net space charge density is zero.
rho_e = (UseSlippingVelBC==1) ? 0.0 : Den_charge[n] / epsilon_LB;
f0 = dist[n];
f1 = dist[2 * Np + n];
f2 = dist[1 * Np + n];
f3 = dist[4 * Np + n];
f4 = dist[3 * Np + n];
f5 = dist[6 * Np + n];
f6 = dist[5 * Np + n];
f7 = dist[8 * Np + n];
f8 = dist[7 * Np + n];
f9 = dist[10 * Np + n];
f10 = dist[9 * Np + n];
f11 = dist[12 * Np + n];
f12 = dist[11 * Np + n];
f13 = dist[14 * Np + n];
f14 = dist[13 * Np + n];
f15 = dist[16 * Np + n];
f16 = dist[15 * Np + n];
f17 = dist[18 * Np + n];
f18 = dist[17 * Np + n];
Ex = (f1 - f2 + 0.5*(f7 - f8 + f9 - f10 + f11 - f12 + f13 - f14))*4.0; //NOTE the unit of electric field here is V/lu
Ey = (f3 - f4 + 0.5*(f7 - f8 - f9 + f10 + f15 - f16 + f17 - f18))*4.0;
Ez = (f5 - f6 + 0.5*(f11 - f12 - f13 + f14 + f15 - f16 - f17 + f18))*4.0;
ElectricField[n + 0 * Np] = Ex;
ElectricField[n + 1 * Np] = Ey;
ElectricField[n + 2 * Np] = Ez;
sum_q = f1+f2+f3+f4+f5+f6+f7+f8+f9+f10+f11+f12+f13+f14+f15+f16+f17+f18;
error = 8.0*(sum_q - f0) + rho_e;
psi = 2.0*(f0*(1.0 - rlx) + rlx*(sum_q + 0.125*rho_e));
idx = Map[n];
Psi[idx] = psi;
// q = 0
dist[n] = W0*psi;//
// q = 1
dist[1 * Np + n] = W1*psi;//f1 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 2
dist[2 * Np + n] = W1*psi;//f2 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 3
dist[3 * Np + n] = W1*psi;//f3 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 4
dist[4 * Np + n] = W1*psi;//f4 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 5
dist[5 * Np + n] = W1*psi;//f5 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
// q = 6
dist[6 * Np + n] = W1*psi;//f6 * (1.0 - rlx) +W1* (rlx * psi) - (1.0-0.5*rlx)*0.05555555555555555*rho_e;
dist[7 * Np + n] = W2*psi;//f7 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[8 * Np + n] = W2*psi;//f8* (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[9 * Np + n] = W2*psi;//f9 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[10 * Np + n] = W2*psi;//f10 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[11 * Np + n] = W2*psi;//f11 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[12 * Np + n] = W2*psi;//f12 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[13 * Np + n] = W2*psi;//f13 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[14 * Np + n] = W2*psi;//f14 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[15 * Np + n] = W2*psi;//f15 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[16 * Np + n] = W2*psi;//f16 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[17 * Np + n] = W2*psi;//f17 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
dist[18 * Np + n] = W2*psi;//f18 * (1.0 - rlx) +W2* (rlx * psi) - (1.0-0.5*rlx)*0.02777777777777778*rho_e;
//........................................................................
}
}
}
void dvc_ScaLBL_D3Q19_Poisson_Init(int *Map, double *dist, double *Psi,
int start, int finish, int Np, sycl::nd_item<3> item_ct1) {
int n;
int ijk;
double W0 = 0.5;
double W1 = 1.0/24.0;
double W2 = 1.0/48.0;
int S = Np/NBLOCKS/NTHREADS + 1;
for (int s=0; s<S; s++){
//........Get 1-D index for this thread....................
n = S * item_ct1.get_group(2) * item_ct1.get_local_range(2) +
s * item_ct1.get_local_range(2) + item_ct1.get_local_id(2) +
start;
if (n<finish) {
ijk = Map[n];
dist[0 * Np + n] = W0 * Psi[ijk];//3333333333333333* Psi[ijk];
dist[1 * Np + n] = W1 * Psi[ijk];
dist[2 * Np + n] = W1 * Psi[ijk];
dist[3 * Np + n] = W1 * Psi[ijk];
dist[4 * Np + n] = W1 * Psi[ijk];
dist[5 * Np + n] = W1 * Psi[ijk];
dist[6 * Np + n] = W1 * Psi[ijk];
dist[7 * Np + n] = W2* Psi[ijk];
dist[8 * Np + n] = W2* Psi[ijk];
dist[9 * Np + n] = W2* Psi[ijk];
dist[10 * Np + n] = W2* Psi[ijk];
dist[11 * Np + n] = W2* Psi[ijk];
dist[12 * Np + n] = W2* Psi[ijk];
dist[13 * Np + n] = W2* Psi[ijk];
dist[14 * Np + n] = W2* Psi[ijk];
dist[15 * Np + n] = W2* Psi[ijk];
dist[16 * Np + n] = W2* Psi[ijk];
dist[17 * Np + n] = W2* Psi[ijk];
dist[18 * Np + n] = W2* Psi[ijk];
}
}
}
extern "C" void ScaLBL_D3Q19_AAodd_Poisson(int *neighborList, int *Map,
double *dist, double *Den_charge,
double *Psi, double *ElectricField,
double tau, double epsilon_LB, bool UseSlippingVelBC,
int start, int finish, int Np) {
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q19_AAodd_Poisson(
neighborList, Map, dist, Den_charge, Psi, ElectricField,
tau, epsilon_LB, UseSlippingVelBC, start, finish, Np,
item_ct1);
});
/*
DPCT1010:315: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_D3Q19_AAeven_Poisson(int *Map, double *dist,
double *Den_charge, double *Psi,
double *ElectricField, double *Error, double tau,
double epsilon_LB, bool UseSlippingVelBC,
int start, int finish, int Np) {
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q19_AAeven_Poisson(
Map, dist, Den_charge, Psi, ElectricField, Error, tau,
epsilon_LB, UseSlippingVelBC, start, finish, Np,
item_ct1);
});
/*
DPCT1010:317: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_D3Q19_Poisson_Init(int *Map, double *dist, double *Psi,
int start, int finish, int Np){
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q19_Poisson_Init(Map, dist, Psi, start, finish,
Np, item_ct1);
});
/*
DPCT1010:319: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
}
extern "C" void ScaLBL_D3Q7_AAodd_Poisson_ElectricPotential(int *neighborList,int *Map, double *dist, double *Psi, int start, int finish, int Np){
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_AAodd_Poisson_ElectricPotential(
neighborList, Map, dist, Psi, start, finish, Np,
item_ct1);
});
/*
DPCT1010:321: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
//cudaProfilerStop();
}
extern "C" void ScaLBL_D3Q7_AAeven_Poisson_ElectricPotential(int *Map, double *dist, double *Psi, int start, int finish, int Np){
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_AAeven_Poisson_ElectricPotential(
Map, dist, Psi, start, finish, Np, item_ct1);
});
/*
DPCT1010:323: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
//cudaProfilerStop();
}
extern "C" void ScaLBL_D3Q7_AAodd_Poisson(int *neighborList, int *Map, double *dist, double *Den_charge, double *Psi, double *ElectricField, double tau, double epsilon_LB,bool UseSlippingVelBC,int start, int finish, int Np){
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_AAodd_Poisson(
neighborList, Map, dist, Den_charge, Psi, ElectricField,
tau, epsilon_LB, UseSlippingVelBC, start, finish, Np,
item_ct1);
});
/*
DPCT1010:325: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
//cudaProfilerStop();
}
extern "C" void ScaLBL_D3Q7_AAeven_Poisson(int *Map, double *dist, double *Den_charge, double *Psi, double *ElectricField, double tau, double epsilon_LB,bool UseSlippingVelBC,int start, int finish, int Np){
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_AAeven_Poisson(Map, dist, Den_charge, Psi,
ElectricField, tau,
epsilon_LB, UseSlippingVelBC,
start, finish, Np, item_ct1);
});
/*
DPCT1010:327: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
//cudaProfilerStop();
}
extern "C" void ScaLBL_D3Q7_Poisson_Init(int *Map, double *dist, double *Psi, int start, int finish, int Np){
//cudaProfilerStart();
dpct::get_default_queue().parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, NBLOCKS) *
sycl::range<3>(1, 1, NTHREADS),
sycl::range<3>(1, 1, NTHREADS)),
[=](sycl::nd_item<3> item_ct1) {
dvc_ScaLBL_D3Q7_Poisson_Init(Map, dist, Psi, start, finish,
Np, item_ct1);
});
/*
DPCT1010:329: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this
code.
*/
int err = 0;
//cudaProfilerStop();
}

1034
sycl/Stokes.dp.cpp Normal file

File diff suppressed because it is too large Load Diff

1659
sycl/dfh.dp.cpp Normal file

File diff suppressed because it is too large Load Diff