Added comment to apply_stdwell kernels

2025-02-25 18:55:30 -06:00 · 2021-12-16 09:44:33 +01:00 · 2021-12-16 09:44:33 +01:00 · f48fe632cb
commit f48fe632cb
parent 3decb73561
3 changed files with 18 additions and 0 deletions
--- a/opm/simulators/linalg/bda/cuWellContributions.cu
+++ b/opm/simulators/linalg/bda/cuWellContributions.cu
@@ -82,6 +82,12 @@ __global__ void apply_well_contributions(
        // merge all blocks into 1 dim*dim_wells block
        // since 3*4 blocks give 2 parallel blocks, do not use a loop
        // 0x00ffffff contains 24 ones, representing the two blocks that are added
+        // block 1:     block 2:
+        //  0  1  2     12 13 14
+        //  3  4  5     15 16 17
+        //  6  7  8     18 19 20
+        //  9 10 11     21 22 23
+        // thread i will hold the sum of thread i and i + vals_per_block
        temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);

        // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
--- a/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl
@@ -39,6 +39,12 @@ __kernel void stdwell_apply(

        // merge all blocks in this workgroup into 1 block
        // if numBlocksPerWarp >= 3, should use loop
+        // block 1:     block 2:
+        //  0  1  2     12 13 14
+        //  3  4  5     15 16 17
+        //  6  7  8     18 19 20
+        //  9 10 11     21 22 23
+        // workitem i < valsPerBlock will hold the sum of localSum[i + k*valsPerBlock] over all blocks k (two blocks shown above)
        if(wiId < valsPerBlock){
            for (int i = 1; i < numBlocksPerWarp; ++i) {
                localSum[wiId] += localSum[wiId + i*valsPerBlock];
--- a/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl
@@ -37,6 +37,12 @@ __kernel void stdwell_apply_no_reorder(

        // merge all blocks in this workgroup into 1 block
        // if numBlocksPerWarp >= 3, should use loop
+        // block 1:     block 2:
+        //  0  1  2     12 13 14
+        //  3  4  5     15 16 17
+        //  6  7  8     18 19 20
+        //  9 10 11     21 22 23
+        // workitem i < valsPerBlock will hold the sum of localSum[i + k*valsPerBlock] over all blocks k (two blocks shown above)
        if(wiId < valsPerBlock){
            for (int i = 1; i < numBlocksPerWarp; ++i) {
                localSum[wiId] += localSum[wiId + i*valsPerBlock];