Merge pull request #3755 from blattms/fix-opencl-apply-stdwells-rebased

Fix opencl apply stdwells rebased (replaces #3746)
This commit is contained in:
Markus Blatt 2022-01-11 19:18:03 +01:00 committed by GitHub
commit 790fa8044e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 13 deletions

View File

@ -80,11 +80,16 @@ __global__ void apply_well_contributions(
}
// merge all blocks into 1 dim*dim_wells block
// since NORNE has only 2 parallel blocks, do not use a loop
// since 3*4 blocks has give 2 parallel blocks, do not use a loop
// 0x00ffffff contains 24 ones, representing the two blocks that are added
// block 1: block 2:
// 0 1 2 12 13 14
// 3 4 5 15 16 17
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// thread i will hold the sum of thread i and i + vals_per_block
temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
b = idx_t / vals_per_block + val_pointers[idx_b];
// merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
if (idx_t < vals_per_block) {
// should be a loop as well, now only works for dim == 3

View File

@ -37,15 +37,23 @@ __kernel void stdwell_apply(
b += numBlocksPerWarp;
}
// merge all blocks in this workgroup into 1 block
// if numBlocksPerWarp >= 3, should use loop
// block 1: block 2:
// 0 1 2 12 13 14
// 3 4 5 15 16 17
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
for (int i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
for(unsigned int i = dim - 1; i > 0; --i){
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
}

View File

@ -35,15 +35,23 @@ __kernel void stdwell_apply_no_reorder(
b += numBlocksPerWarp;
}
// merge all blocks in this workgroup into 1 block
// if numBlocksPerWarp >= 3, should use loop
// block 1: block 2:
// 0 1 2 12 13 14
// 3 4 5 15 16 17
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
for (int i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
for(unsigned int i = dim - 1; i > 0; --i){
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
}