Added double-precision atomicAdd routine for pre-CUDA 8 on Titan
This commit is contained in:
parent 7ae276c10a
commit be59b70139

gpu/D3Q19.cu: 14 additions
@@ -7,6 +7,20 @@
 // functionality for parallel reduction in Flux BC routines -- probably should be re-factored to another location
 // functions copied from https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+        (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                __double_as_longlong(val +
+                __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
 
 __inline__ __device__
 double warpReduceSum(double val) {
     for (int offset = warpSize/2; offset > 0; offset /= 2)
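This compare-and-swap loop is the standard emulation for devices below compute capability 6.0, where no hardware atomicAdd overload for double exists (the original Titan is Kepler, sm_35, so the native double atomicAdd introduced in CUDA 8 is unavailable to it). A minimal sketch of how a kernel would exercise the routine -- the kernel name, sizes, and launch configuration below are illustrative, not part of the commit:

    // Illustrative only, not part of the commit. Assumes the CAS-based
    // atomicAdd above is compiled in (building for sm < 60, pre-CUDA 8).
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void sumKernel(const double *in, double *out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            atomicAdd(out, in[i]); // resolves to the CAS-loop routine above
    }

    int main()
    {
        const int n = 1024;
        double *in, *out;
        cudaMallocManaged(&in, n * sizeof(double));
        cudaMallocManaged(&out, sizeof(double));
        for (int i = 0; i < n; i++) in[i] = 1.0;
        *out = 0.0;
        sumKernel<<<(n + 255) / 256, 256>>>(in, out, n);
        cudaDeviceSynchronize();
        printf("sum = %f\n", *out); // expect 1024.000000
        cudaFree(in);
        cudaFree(out);
        return 0;
    }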
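The diff view cuts off mid-function here. The NVIDIA post cited in the comment completes the warp-level reduction with a shuffle-down loop; the sketch below follows that post and is an assumption about the committed body, not a copy of it. Note that __shfl_down is the pre-CUDA 9 spelling; CUDA 9+ replaces it with __shfl_down_sync, which takes a lane mask.

    // Sketch following the cited NVIDIA post; the committed body may differ.
    // Each iteration halves the number of live lanes: lane i adds the value
    // held by lane i+offset, so after log2(warpSize) steps lane 0 holds the
    // sum of all 32 lanes in the warp.
    __inline__ __device__
    double warpReduceSum(double val) {
        for (int offset = warpSize/2; offset > 0; offset /= 2)
            val += __shfl_down(val, offset); // __shfl_down_sync(0xffffffff, val, offset) on CUDA 9+
        return val;
    }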