Rewrote gpu and cpu with contents from ScaLBL (AA refactor)
This commit is contained in:
parent
db8a1bfba9
commit
5ce47ab205
2132
cpu/Color.cpp
2132
cpu/Color.cpp
File diff suppressed because it is too large
Load Diff
2090
cpu/D3Q19.cpp
2090
cpu/D3Q19.cpp
File diff suppressed because it is too large
Load Diff
38
cpu/D3Q7.cpp
38
cpu/D3Q7.cpp
@ -1,23 +1,5 @@
|
||||
// CPU Functions for D3Q7 Lattice Boltzmann Methods
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
|
||||
double *recvbuf, double *dist, int N){
|
||||
//....................................................................................
|
||||
// Unack distribution from the recv buffer
|
||||
// Distribution q matche Cqx, Cqy, Cqz
|
||||
// swap rule means that the distributions in recvbuf are OPPOSITE of q
|
||||
// dist may be even or odd distributions stored by stream layout
|
||||
//....................................................................................
|
||||
int n,idx;
|
||||
for (idx=0; idx<count; idx++){
|
||||
// Get the value from the list -- note that n is the index is from the send (non-local) process
|
||||
n = list[idx];
|
||||
// unpack the distribution to the proper location
|
||||
if (!(n<0)) dist[q*N+n] = recvbuf[start+idx];
|
||||
//dist[q*N+n] = recvbuf[start+idx];
|
||||
|
||||
}
|
||||
}
|
||||
extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
|
||||
//....................................................................................
|
||||
// Pack distribution q into the send buffer for the listed lattice sites
|
||||
@ -41,6 +23,26 @@ extern "C" void ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, doub
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
|
||||
double *recvbuf, double *dist, int N){
|
||||
//....................................................................................
|
||||
// Unack distribution from the recv buffer
|
||||
// Distribution q matche Cqx, Cqy, Cqz
|
||||
// swap rule means that the distributions in recvbuf are OPPOSITE of q
|
||||
// dist may be even or odd distributions stored by stream layout
|
||||
//....................................................................................
|
||||
int n,idx;
|
||||
for (idx=0; idx<count; idx++){
|
||||
// Get the value from the list -- note that n is the index is from the send (non-local) process
|
||||
n = list[idx];
|
||||
// unpack the distribution to the proper location
|
||||
if (!(n<0)) dist[q*N+n] = recvbuf[start+idx];
|
||||
//dist[q*N+n] = recvbuf[start+idx];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" void ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N){
|
||||
//....................................................................................
|
||||
// Pack distribution into the send buffer for the listed lattice sites
|
||||
|
@ -2,24 +2,22 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <mm_malloc.h>
|
||||
|
||||
extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
|
||||
//cudaMalloc(address,size);
|
||||
(*address) = malloc(size);
|
||||
memset(*address,0,size);
|
||||
(*address) = _mm_malloc(size,64);
|
||||
memset(*address,0,size);
|
||||
|
||||
if (*address==NULL){
|
||||
printf("Memory allocation failed! \n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" void ScaLBL_FreeDeviceMemory(void* address){
|
||||
if ( address != NULL )
|
||||
free( address );
|
||||
extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
|
||||
_mm_free(pointer);
|
||||
}
|
||||
|
||||
|
||||
extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
|
||||
// cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
|
||||
memcpy(dest, source, size);
|
||||
|
3349
gpu/Color.cu
3349
gpu/Color.cu
File diff suppressed because it is too large
Load Diff
@ -1,20 +1,34 @@
|
||||
// Basic cuda functions callable from C/C++ code
|
||||
#include <cuda.h>
|
||||
|
||||
extern "C" void dvc_ScaLBL_ScaLBL_ScaLBL_AllocateDeviceMemory(void** address, size_t size){
|
||||
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size){
|
||||
cudaMalloc(address,size);
|
||||
cudaMemset(*address,0,size);
|
||||
}
|
||||
|
||||
extern "C" void dvc_ScaLBL_ScaLBL_CopyToDevice(void* dest, void* source, size_t size){
|
||||
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size){
|
||||
cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
|
||||
extern "C" void dvc_ScaLBL_CopyToHost(void* dest, void* source, size_t size){
|
||||
extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size){
|
||||
cudaMemcpy(dest,source,size,cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
extern "C" void dvc_Barrier(){
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
/*
|
||||
#if __CUDA_ARCH__ < 600
|
||||
__device__ double atomicAdd(double* address, double val) {
|
||||
unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed;
|
||||
do {
|
||||
assumed = old;
|
||||
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
|
||||
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
|
||||
}
|
||||
while (assumed != old); return __longlong_as_double(old);
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
2575
gpu/D3Q19.cu
2575
gpu/D3Q19.cu
File diff suppressed because it is too large
Load Diff
27
gpu/D3Q7.cu
27
gpu/D3Q7.cu
@ -1,6 +1,6 @@
|
||||
// GPU Functions for D3Q7 Lattice Boltzmann Methods
|
||||
|
||||
#define NBLOCKS 32
|
||||
#define NBLOCKS 560
|
||||
#define NTHREADS 128
|
||||
|
||||
__global__ void dvc_ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
|
||||
@ -59,6 +59,26 @@ __global__ void dvc_ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void dvc_ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
|
||||
double *recvbuf, double *dist, int N){
|
||||
//....................................................................................
|
||||
// Unpack distribution from the recv buffer
|
||||
// Distribution q matche Cqx, Cqy, Cqz
|
||||
// swap rule means that the distributions in recvbuf are OPPOSITE of q
|
||||
// dist may be even or odd distributions stored by stream layout
|
||||
//....................................................................................
|
||||
int n,idx;
|
||||
idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx<count){
|
||||
// Get the value from the list -- note that n is the index is from the send (non-local) process
|
||||
n = list[idx];
|
||||
// unpack the distribution to the proper location
|
||||
if (!(n<0)) { dist[q*N+n] = recvbuf[start+idx];
|
||||
//printf("%f \n",,dist[q*N+n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void dvc_ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz)
|
||||
{
|
||||
int n,N;
|
||||
@ -187,6 +207,11 @@ __global__ void dvc_ScaLBL_D3Q7_Density(char *ID, double *disteven, double *dis
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_D3Q7_Unpack <<<GRID,512 >>>(q, list, start, count, recvbuf, dist, N);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
|
||||
int GRID = count / 512 + 1;
|
||||
dvc_ScaLBL_Scalar_Pack <<<GRID,512 >>>(list, count, sendbuf, Data, N);
|
||||
|
@ -10,9 +10,8 @@ extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_FreeDeviceMemory(void* address){
|
||||
if ( address != NULL )
|
||||
cudaFree( address );
|
||||
extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
|
||||
cudaFree(pointer);
|
||||
}
|
||||
|
||||
extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
|
||||
|
@ -4,7 +4,7 @@
|
||||
//*************************************************************************
|
||||
#include <cuda.h>
|
||||
|
||||
#define NBLOCKS 32
|
||||
#define NBLOCKS 560
|
||||
#define NTHREADS 128
|
||||
|
||||
__global__ void INITIALIZE(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz)
|
||||
@ -106,7 +106,9 @@ __global__ void Compute_VELOCITY(char *ID, double *disteven, double *distodd, do
|
||||
}
|
||||
|
||||
//*************************************************************************
|
||||
__global__ void D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
|
||||
__global__ void
|
||||
__launch_bounds__(512,2)
|
||||
D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
|
||||
double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz)
|
||||
{
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user