diff --git a/gpu/D3Q19.cu b/gpu/D3Q19.cu
index c3698997..d1203ebe 100644
--- a/gpu/D3Q19.cu
+++ b/gpu/D3Q19.cu
@@ -7,6 +7,34 @@
 // functionality for parallel reduction in Flux BC routines -- probably should be re-factored to another location
 // functions copied from https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
 
+// Software fallback for double-precision atomicAdd, built from a 64-bit
+// atomicCAS retry loop. It is compiled only for device targets below
+// compute capability 6.0: from SM60 onward the toolkit provides a native
+// atomicAdd(double*, double), and an unconditional definition would clash
+// with that built-in overload.
+//   address : location of the double to update
+//   val     : amount to add
+//   returns : the value stored at address before this thread's addition
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+        (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        // Store assumed+val only if the word still holds the bits we read;
+        // atomicCAS returns whatever was actually stored there.
+        old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                            __longlong_as_double(assumed)));
+        // Compare the raw bit patterns, not doubles, so a NaN value
+        // (NaN != NaN) cannot keep the loop spinning forever.
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
 __inline__ __device__
 double warpReduceSum(double val) {
   for (int offset = warpSize/2; offset > 0; offset /= 2)