Rewrote gpu and cpu with contents from ScaLBL (AA refactor)

2018-01-24 10:08:43 -05:00 · 2018-01-24 10:08:43 -05:00 · 5ce47ab205
commit 5ce47ab205
parent db8a1bfba9
10 changed files with 8312 additions and 1942 deletions
--- a/cpu/Color.cpp
+++ b/cpu/Color.cpp
--- a/cpu/D3Q19.cpp
+++ b/cpu/D3Q19.cpp
--- a/cpu/D3Q7.cpp
+++ b/cpu/D3Q7.cpp
@ -1,23 +1,5 @@
 // CPU Functions for D3Q7 Lattice Boltzmann Methods

-extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list,  int start, int count,
-					   double *recvbuf, double *dist, int N){
-	//....................................................................................
-	// Unack distribution from the recv buffer
-	// Distribution q matche Cqx, Cqy, Cqz
-	// swap rule means that the distributions in recvbuf are OPPOSITE of q
-	// dist may be even or odd distributions stored by stream layout
-	//....................................................................................
-	int n,idx;
-	for (idx=0; idx<count; idx++){
-		// Get the value from the list -- note that n is the index is from the send (non-local) process
-		n = list[idx];
-		// unpack the distribution to the proper location
-		if (!(n<0)) dist[q*N+n] = recvbuf[start+idx];
-		//dist[q*N+n] = recvbuf[start+idx];
-
-	}
-}
 extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
 	//....................................................................................
 	// Pack distribution q into the send buffer for the listed lattice sites
@ -41,6 +23,26 @@ extern "C" void ScaLBL_Scalar_Unpack(int *list, int count, double *recvbuf, doub
 	}
 }

+extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list,  int start, int count,
+		double *recvbuf, double *dist, int N){
+	//....................................................................................
+	// Unack distribution from the recv buffer
+	// Distribution q matche Cqx, Cqy, Cqz
+	// swap rule means that the distributions in recvbuf are OPPOSITE of q
+	// dist may be even or odd distributions stored by stream layout
+	//....................................................................................
+	int n,idx;
+	for (idx=0; idx<count; idx++){
+		// Get the value from the list -- note that n is the index is from the send (non-local) process
+		n = list[idx];
+		// unpack the distribution to the proper location
+		if (!(n<0)) dist[q*N+n] = recvbuf[start+idx];
+		//dist[q*N+n] = recvbuf[start+idx];
+
+	}
+}
+
+
 extern "C" void ScaLBL_PackDenD3Q7(int *list, int count, double *sendbuf, int number, double *Data, int N){
 	//....................................................................................
 	// Pack distribution into the send buffer for the listed lattice sites
--- a/cpu/Extras.cpp
+++ b/cpu/Extras.cpp
@ -2,24 +2,22 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <mm_malloc.h>

 extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
 	//cudaMalloc(address,size);
-	(*address) = malloc(size);
-    memset(*address,0,size);
+	(*address) = _mm_malloc(size,64);
+	memset(*address,0,size);
 	
 	if (*address==NULL){
 		printf("Memory allocation failed! \n");
 	}
 }

-
-extern "C" void ScaLBL_FreeDeviceMemory(void* address){
-	if ( address != NULL )
-        free( address );
+extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
+	_mm_free(pointer);
 }

-
 extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
 //	cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
 	memcpy(dest, source, size);
--- a/gpu/Color.cu
+++ b/gpu/Color.cu
--- a/gpu/CudaExtras.cu
+++ b/gpu/CudaExtras.cu
@ -1,20 +1,34 @@
 // Basic cuda functions callable from C/C++ code
 #include <cuda.h>

-extern "C" void dvc_ScaLBL_ScaLBL_ScaLBL_AllocateDeviceMemory(void** address, size_t size){
+extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size){
 	cudaMalloc(address,size);
    cudaMemset(*address,0,size);
 }

-extern "C" void dvc_ScaLBL_ScaLBL_CopyToDevice(void* dest, void* source, size_t size){
+extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size){
 	cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
 }


-extern "C" void dvc_ScaLBL_CopyToHost(void* dest, void* source, size_t size){
+extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size){
 	cudaMemcpy(dest,source,size,cudaMemcpyDeviceToHost);
 }

 extern "C" void dvc_Barrier(){
 	cudaDeviceSynchronize();
 }
+/*
+#if __CUDA_ARCH__ < 600 
+__device__ double atomicAdd(double* address, double val) { 
+unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed;
+ do { 
+      assumed = old; 
+      old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); 
+      // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) 
+    } 
+    while (assumed != old); return __longlong_as_double(old); 
+} 
+
+#endif
+*/
--- a/gpu/D3Q19.cu
+++ b/gpu/D3Q19.cu
--- a/gpu/D3Q7.cu
+++ b/gpu/D3Q7.cu
@ -1,6 +1,6 @@
 // GPU Functions for D3Q7 Lattice Boltzmann Methods

-#define NBLOCKS 32
+#define NBLOCKS 560
 #define NTHREADS 128

 __global__  void dvc_ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
@ -59,6 +59,26 @@ __global__ void dvc_ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf,
 	}
 }

+__global__ void dvc_ScaLBL_D3Q7_Unpack(int q,  int *list,  int start, int count,
+		double *recvbuf, double *dist, int N){
+	//....................................................................................
+	// Unpack distribution from the recv buffer
+	// Distribution q matche Cqx, Cqy, Cqz
+	// swap rule means that the distributions in recvbuf are OPPOSITE of q
+	// dist may be even or odd distributions stored by stream layout
+	//....................................................................................
+	int n,idx;
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx<count){
+		// Get the value from the list -- note that n is the index is from the send (non-local) process
+		n = list[idx];
+		// unpack the distribution to the proper location
+		if (!(n<0)) { dist[q*N+n] = recvbuf[start+idx];
+		//printf("%f \n",,dist[q*N+n]);
+		}
+	}
+}
+
 __global__ void dvc_ScaLBL_D3Q7_Init(char *ID, double *f_even, double *f_odd, double *Den, int Nx, int Ny, int Nz)
 {
 	int n,N;
@ -187,6 +207,11 @@ __global__  void dvc_ScaLBL_D3Q7_Density(char *ID, double *disteven, double *dis
 	}
 }

+extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list,  int start, int count, double *recvbuf, double *dist, int N){
+	int GRID = count / 512 + 1;
+	dvc_ScaLBL_D3Q7_Unpack <<<GRID,512 >>>(q, list, start, count, recvbuf, dist, N);
+}
+
 extern "C" void ScaLBL_Scalar_Pack(int *list, int count, double *sendbuf, double *Data, int N){
 	int GRID = count / 512 + 1;
 	dvc_ScaLBL_Scalar_Pack <<<GRID,512 >>>(list, count, sendbuf, Data, N);
--- a/gpu/Extras.cu
+++ b/gpu/Extras.cu
@ -10,9 +10,8 @@ extern "C" void ScaLBL_AllocateDeviceMemory(void** address, size_t size){
 	}	
 }

-extern "C" void ScaLBL_FreeDeviceMemory(void* address){
-	if ( address != NULL )
-        cudaFree( address );
+extern "C" void ScaLBL_FreeDeviceMemory(void* pointer){
+       cudaFree(pointer);
 }

 extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size){
--- a/gpu/MRT.cu
+++ b/gpu/MRT.cu
@ -4,7 +4,7 @@
 //*************************************************************************
 #include <cuda.h>

-#define NBLOCKS 32
+#define NBLOCKS 560
 #define NTHREADS 128

 __global__ void INITIALIZE(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz)
@ -106,7 +106,9 @@ __global__ void Compute_VELOCITY(char *ID, double *disteven, double *distodd, do
 }

 //*************************************************************************
-__global__ void D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
+__global__ void 
+__launch_bounds__(512,2)
+D3Q19_MRT(char *ID, double *disteven, double *distodd, int Nx, int Ny, int Nz,
 					double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz)
 {