Rewrote gpu and cpu with contents from ScaLBL (AA refactor)

This commit is contained in:
James E McClure 2018-01-24 10:08:43 -05:00
parent db8a1bfba9
commit 5ce47ab205
10 changed files with 8312 additions and 1942 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,23 +1,5 @@
// CPU Functions for D3Q7 Lattice Boltzmann Methods
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
double *recvbuf, double *dist, int N){
//....................................................................................
// Unack distribution from the recv buffer
// Distribution q matche Cqx, Cqy, Cqz
// swap rule means that the distributions in recvbuf are OPPOSITE of q
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int n,idx;
for (idx=0; idx<count; idx++){
// Get the value from the list -- note that n is the index is from the send (non-local) process
n = list[idx];
// unpack the distribution to the proper location
if (!(n<0)) dist[q*N+n] = recvbuf[start+idx];
//dist[q*N+n] = recvbuf[start+idx];
}
}
extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
//....................................................................................
// Pack distribution q into the send buffer for the listed lattice sites
@ -41,6 +23,26 @@ extern "C" void ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, doub
}
}
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
double *recvbuf, double *dist, int N){
//....................................................................................
// Unack distribution from the recv buffer
// Distribution q matche Cqx, Cqy, Cqz
// swap rule means that the distributions in recvbuf are OPPOSITE of q
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int n,idx;
for (idx=0; idx<count; idx++){
// Get the value from the list -- note that n is the index is from the send (non-local) process
n = list[idx];
// unpack the distribution to the proper location
if (!(n<0)) dist[q*N+n] = recvbuf[start+idx];
//dist[q*N+n] = recvbuf[start+idx];
}
}
extern "C" void ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N){
//....................................................................................
// Pack distribution into the send buffer for the listed lattice sites

View File

@ -2,24 +2,22 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <mm_malloc.h>
extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
//cudaMalloc(address,size);
(*address) = malloc(size);
memset(*address,0,size);
(*address) = _mm_malloc(size,64);
memset(*address,0,size);
if (*address==NULL){
printf("Memory allocation failed! \n");
}
}
extern "C" void ScaLBL_FreeDeviceMemory(void* address){
if ( address != NULL )
free( address );
extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
_mm_free(pointer);
}
extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
// cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
memcpy(dest, source, size);

File diff suppressed because it is too large Load Diff

View File

@ -1,20 +1,34 @@
// Basic cuda functions callable from C/C++ code
#include <cuda.h>
extern "C" void dvc_ScaLBL_ScaLBL_ScaLBL_AllocateDeviceMemory(void** address, size_t size){
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size){
cudaMalloc(address,size);
cudaMemset(*address,0,size);
}
extern "C" void dvc_ScaLBL_ScaLBL_CopyToDevice(void* dest, void* source, size_t size){
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size){
cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
}
extern "C" void dvc_ScaLBL_CopyToHost(void* dest, void* source, size_t size){
extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size){
cudaMemcpy(dest,source,size,cudaMemcpyDeviceToHost);
}
extern "C" void dvc_Barrier(){
cudaDeviceSynchronize();
}
/*
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
}
while (assumed != old); return __longlong_as_double(old);
}
#endif
*/

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
// GPU Functions for D3Q7 Lattice Boltzmann Methods
#define NBLOCKS 32
#define NBLOCKS 560
#define NTHREADS 128
__global__ void dvc_ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
@ -59,6 +59,26 @@ __global__ void dvc_ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf,
}
}
__global__ void dvc_ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count,
double *recvbuf, double *dist, int N){
//....................................................................................
// Unpack distribution from the recv buffer
// Distribution q matche Cqx, Cqy, Cqz
// swap rule means that the distributions in recvbuf are OPPOSITE of q
// dist may be even or odd distributions stored by stream layout
//....................................................................................
int n,idx;
idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<count){
// Get the value from the list -- note that n is the index is from the send (non-local) process
n = list[idx];
// unpack the distribution to the proper location
if (!(n<0)) { dist[q*N+n] = recvbuf[start+idx];
//printf("%f \n",,dist[q*N+n]);
}
}
}
__global__ void dvc_ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz)
{
int n,N;
@ -187,6 +207,11 @@ __global__ void dvc_ScaLBL_D3Q7_Density(char *ID, double *disteven, double *dis
}
}
extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_D3Q7_Unpack <<<GRID,512 >>>(q, list, start, count, recvbuf, dist, N);
}
extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
int GRID = count / 512 + 1;
dvc_ScaLBL_Scalar_Pack <<<GRID,512 >>>(list, count, sendbuf, Data, N);

View File

@ -10,9 +10,8 @@ extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
}
}
extern "C" void ScaLBL_FreeDeviceMemory(void* address){
if ( address != NULL )
cudaFree( address );
extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
cudaFree(pointer);
}
extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){

View File

@ -4,7 +4,7 @@
//*************************************************************************
#include <cuda.h>
#define NBLOCKS 32
#define NBLOCKS 560
#define NTHREADS 128
__global__ void INITIALIZE(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz)
@ -106,7 +106,9 @@ __global__ void Compute_VELOCITY(char *ID, double *disteven, double *distodd, do
}
//*************************************************************************
__global__ void D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
__global__ void
__launch_bounds__(512,2)
D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz)
{