Added atomicAdd double routine for pre-CUDA 8 on Titan

This commit is contained in:
James E McClure 2017-09-18 06:13:12 -04:00
parent 7ae276c10a
commit be59b70139

View File

@ -7,6 +7,20 @@
// functionality for parallel reduction in Flux BC routines -- probably should be re-factored to another location
// functions copied from https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
// Software fallback for atomicAdd on double, taken from the CUDA C
// Programming Guide. Hardware double-precision atomicAdd only exists on
// compute capability 6.0+ (CUDA 8); this emulation covers older parts
// such as Kepler-class Titan cards.
//
// Guarded so that builds targeting SM60+ (where nvcc provides a built-in
// atomicAdd(double*, double)) do not fail with a duplicate-definition /
// ambiguous-overload error. Host compilation (no __CUDA_ARCH__) also
// skips it, which is fine for a __device__-only function.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
	// atomicCAS has no double overload, so operate on the bit pattern
	// reinterpreted as a 64-bit unsigned integer.
	unsigned long long int* address_as_ull =
		(unsigned long long int*)address;
	unsigned long long int old = *address_as_ull, assumed;
	do {
		assumed = old;
		// Try to install (current + val); if another thread updated
		// *address between our read and the CAS, atomicCAS returns the
		// new value and we retry with it.
		old = atomicCAS(address_as_ull, assumed,
				__double_as_longlong(val +
					__longlong_as_double(assumed)));
	// Comparing the integer bit patterns (not the doubles) means the
	// loop also terminates correctly for NaN values.
	} while (assumed != old);
	// Return the value that was at *address before the add, matching the
	// contract of the built-in atomicAdd.
	return __longlong_as_double(old);
}
#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__inline__ __device__ __inline__ __device__
double warpReduceSum(double val) { double warpReduceSum(double val) {
for (int offset = warpSize/2; offset > 0; offset /= 2) for (int offset = warpSize/2; offset > 0; offset /= 2)