opm-simulators/opm/simulators/linalg/bda/WellContributions.cu

/*
  Copyright 2020 Equinor ASA

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
*/


#include <cstdlib>
#include <cstring>
#include <config.h> // CMake

#ifdef __NVCC__
#include "opm/simulators/linalg/bda/cuda_header.hpp"
#include <cuda_runtime.h>
#endif

#include "opm/simulators/linalg/bda/WellContributions.hpp"

namespace Opm
{

    // apply WellContributions using y -= C^T * (D^-1 * (B * x))
#if HAVE_CUDA
    __global__ void apply_well_contributions(
        const double * __restrict__ Cnnzs,
        const double * __restrict__ Dnnzs,
        const double * __restrict__ Bnnzs,
        const int * __restrict__ Ccols,
        const int * __restrict__ Bcols,
        const double * __restrict__ x,
        double * __restrict__ y,
        const int dim,
        const int dim_wells,
        const unsigned int * __restrict__ val_pointers
        )
    {
        const int idx_b = blockIdx.x;
        const int idx_t = threadIdx.x;
        int idx = idx_b * blockDim.x + idx_t;
        const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];

        const int vals_per_block = dim * dim_wells;        // 12
        const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
        const int num_blocks_per_warp = 32/vals_per_block; // 2
        const int lane = idx_t % 32;
        const int c = lane % dim;                           // col in block
        const int r = (lane / dim) % dim_wells;             // row in block
        const int NUM_THREADS = gridDim.x * blockDim.x;

        extern __shared__ double smem[];
        double * __restrict__ z1 = smem;
        double * __restrict__ z2 = z1 + dim_wells;

        if(idx_t < dim_wells){
            z1[idx_t] = 0.0;
        }

        __syncthreads();

        // z1 = B * x
        if(idx_t < num_active_threads){
            // multiply all blocks with x
            double temp = 0.0;
            int b = idx_t / vals_per_block + val_pointers[idx_b];       // block id, val_size indicates number of blocks
            while(b < val_size + val_pointers[idx_b]){
                int colIdx = Bcols[b];
                temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
                b += num_blocks_per_warp;
            }

            // merge all blocks into 1 dim*dim_wells block
            // since NORNE has only 2 parallel blocks, do not use a loop
            temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);

            b = idx_t / vals_per_block + val_pointers[idx_b];

            // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
            if(idx_t < vals_per_block){
                // should be a loop as well, now only works for dim == 3
                if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);}  // add col 2 to col 0
                if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);}  // add col 1 to col 0
            }

            // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
            if(c == 0 && idx_t < vals_per_block){
                z1[r] = temp;
            }
        }

        __syncthreads();

        // z2 = D^-1 * B * x = D^-1 * z1
        if(idx_t < dim_wells){
            double temp = 0.0;
            for(int c = 0; c < dim_wells; ++c){
                temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
            }
            z2[idx_t] = temp;
        }

        __syncthreads();

        // y -= C^T * D^-1 * B * x
        // use dim * val_size threads, each block is assigned 'dim' threads
        if(idx_t < dim * val_size){
            double temp = 0.0;
            int b = idx_t / dim + val_pointers[idx_b];
            int cc = idx_t % dim;
            int colIdx = Ccols[b];
            for(unsigned int c = 0; c < dim_wells; ++c){
                temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
            }
            y[colIdx * dim + cc] -= temp;
        }

    }
#endif


        void WellContributions::alloc(){
            cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
            cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells);
            cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
            cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
            cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
            val_pointers = new unsigned int[num_wells + 1];
            cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1));
            cudaCheckLastError("apply_gpu malloc failed");
            allocated = true;
        }

        WellContributions::~WellContributions()
        {
            cudaFree(d_Cnnzs);
            cudaFree(d_Dnnzs);
            cudaFree(d_Bnnzs);
            cudaFree(d_Ccols);
            cudaFree(d_Bcols);
            delete[] val_pointers;
            cudaFree(d_val_pointers);
        }


        // Apply the WellContributions, similar to StandardWell::apply()
        // y -= (C^T *(D^-1*(   B*x)))
        void WellContributions::apply(double *d_x, double *d_y)
        {
            int smem_size = 2 * sizeof(double) * dim_wells;
            apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
        }


        void WellContributions::addMatrix(int idx, int *colIndices, double *values, unsigned int val_size)
        {
            switch (idx) {
            case 0:
                cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
                cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
                break;
            case 1:
                cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
                break;
            case 2:
                cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
                cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
                val_pointers[num_wells_so_far] = num_blocks_so_far;
                if(num_wells_so_far == num_wells - 1){
                    val_pointers[num_wells] = num_blocks;
                }
                cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);
                break;
            default:
                OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
            }
            cudaCheckLastError("WellContributions::addMatrix() failed");
            if(idx == 2){
                num_blocks_so_far += val_size;
            }
            if(idx == 2){
                num_wells_so_far++;
            }
        }

        void WellContributions::setCudaStream(cudaStream_t stream_)
        {
            this->stream = stream_;
        }


        void WellContributions::addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq)
        {
            if(allocated){
                OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
            }
            num_blocks += nnz;
            dim = numEq;
            dim_wells = numWellEq;
            num_wells++;
        }

} //namespace Opm
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`/*`
Added header to CMake_files, updated copyright year 2020-03-16 06:57:35 -05:00			`Copyright 2020 Equinor ASA`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00
			`This file is part of the Open Porous Media project (OPM).`

			`OPM is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`OPM is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with OPM. If not, see <http://www.gnu.org/licenses/>.`
			`*/`


			`#include <cstdlib>`
			`#include <cstring>`
			`#include <config.h> // CMake`

			`#ifdef __NVCC__`
			`#include "opm/simulators/linalg/bda/cuda_header.hpp"`
			`#include <cuda_runtime.h>`
			`#endif`

			`#include "opm/simulators/linalg/bda/WellContributions.hpp"`

			`namespace Opm`
			`{`

			`// apply WellContributions using y -= C^T * (D^-1 * (B * x))`
			`#if HAVE_CUDA`
Removed unnecessary line continuation chars. Merged unsupported cases in switch 2020-03-18 09:14:38 -05:00			`__global__ void apply_well_contributions(`
			`const double * __restrict__ Cnnzs,`
			`const double * __restrict__ Dnnzs,`
			`const double * __restrict__ Bnnzs,`
			`const int * __restrict__ Ccols,`
			`const int * __restrict__ Bcols,`
			`const double * __restrict__ x,`
			`double * __restrict__ y,`
			`const int dim,`
			`const int dim_wells,`
			`const unsigned int * __restrict__ val_pointers`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`)`
			`{`
			`const int idx_b = blockIdx.x;`
			`const int idx_t = threadIdx.x;`
			`int idx = idx_b * blockDim.x + idx_t;`
			`const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];`

			`const int vals_per_block = dim * dim_wells; // 12`
			`const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24`
			`const int num_blocks_per_warp = 32/vals_per_block; // 2`
			`const int lane = idx_t % 32;`
			`const int c = lane % dim; // col in block`
			`const int r = (lane / dim) % dim_wells; // row in block`
			`const int NUM_THREADS = gridDim.x * blockDim.x;`

			`extern __shared__ double smem[];`
			`double * __restrict__ z1 = smem;`
			`double * __restrict__ z2 = z1 + dim_wells;`

			`if(idx_t < dim_wells){`
			`z1[idx_t] = 0.0;`
			`}`

			`__syncthreads();`

			`// z1 = B * x`
			`if(idx_t < num_active_threads){`
			`// multiply all blocks with x`
			`double temp = 0.0;`
			`int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks`
			`while(b < val_size + val_pointers[idx_b]){`
			`int colIdx = Bcols[b];`
			`temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];`
			`b += num_blocks_per_warp;`
			`}`

			`// merge all blocks into 1 dim*dim_wells block`
			`// since NORNE has only 2 parallel blocks, do not use a loop`
			`temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);`

			`b = idx_t / vals_per_block + val_pointers[idx_b];`

			`// merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD`
			`if(idx_t < vals_per_block){`
			`// should be a loop as well, now only works for dim == 3`
			`if(c == 0 \|\| c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0`
			`if(c == 0 \|\| c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0`
			`}`

			`// write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether`
			`if(c == 0 && idx_t < vals_per_block){`
			`z1[r] = temp;`
			`}`
			`}`

			`__syncthreads();`

			`// z2 = D^-1 * B * x = D^-1 * z1`
			`if(idx_t < dim_wells){`
			`double temp = 0.0;`
			`for(int c = 0; c < dim_wells; ++c){`
			`temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];`
			`}`
			`z2[idx_t] = temp;`
			`}`

			`__syncthreads();`

			`// y -= C^T * D^-1 * B * x`
			`// use dim * val_size threads, each block is assigned 'dim' threads`
			`if(idx_t < dim * val_size){`
			`double temp = 0.0;`
			`int b = idx_t / dim + val_pointers[idx_b];`
			`int cc = idx_t % dim;`
			`int colIdx = Ccols[b];`
			`for(unsigned int c = 0; c < dim_wells; ++c){`
			`temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];`
			`}`
			`y[colIdx * dim + cc] -= temp;`
			`}`

			`}`
			`#endif`


Removed CPU implementation of WellContributions 2020-03-18 09:08:48 -05:00			`void WellContributions::alloc(){`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`cudaMalloc((void*)&d_Cnnzs, sizeof(double) num_blocks * dim * dim_wells);`
			`cudaMalloc((void*)&d_Dnnzs, sizeof(double) num_wells * dim_wells * dim_wells);`
			`cudaMalloc((void*)&d_Bnnzs, sizeof(double) num_blocks * dim * dim_wells);`
			`cudaMalloc((void*)&d_Ccols, sizeof(int) num_blocks);`
			`cudaMalloc((void*)&d_Bcols, sizeof(int) num_blocks);`
			`val_pointers = new unsigned int[num_wells + 1];`
			`cudaMalloc((void*)&d_val_pointers, sizeof(int) (num_wells + 1));`
			`cudaCheckLastError("apply_gpu malloc failed");`
Removed CPU implementation of WellContributions 2020-03-18 09:08:48 -05:00			`allocated = true;`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`}`

			`WellContributions::~WellContributions()`
			`{`
			`cudaFree(d_Cnnzs);`
			`cudaFree(d_Dnnzs);`
			`cudaFree(d_Bnnzs);`
			`cudaFree(d_Ccols);`
			`cudaFree(d_Bcols);`
			`delete[] val_pointers;`
			`cudaFree(d_val_pointers);`
			`}`


			`// Apply the WellContributions, similar to StandardWell::apply()`
			`// y -= (C^T (D^-1( B*x)))`
Removed CPU implementation of WellContributions 2020-03-18 09:08:48 -05:00			`void WellContributions::apply(double d_x, double d_y)`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`{`
			`int smem_size = 2 * sizeof(double) * dim_wells;`
			`apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);`
			`}`


Removed CPU implementation of WellContributions 2020-03-18 09:08:48 -05:00			`void WellContributions::addMatrix(int idx, int colIndices, double values, unsigned int val_size)`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`{`
			`switch (idx) {`
			`case 0:`
			`cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);`
			`cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);`
			`break;`
			`case 1:`
			`cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);`
			`break;`
			`case 2:`
			`cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);`
			`cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);`
			`val_pointers[num_wells_so_far] = num_blocks_so_far;`
			`if(num_wells_so_far == num_wells - 1){`
			`val_pointers[num_wells] = num_blocks;`
			`}`
			`cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);`
			`break;`
			`default:`
Removed unnecessary line continuation chars. Merged unsupported cases in switch 2020-03-18 09:14:38 -05:00			`OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`}`
			`cudaCheckLastError("WellContributions::addMatrix() failed");`
Removed CPU implementation of WellContributions 2020-03-18 09:08:48 -05:00			`if(idx == 2){`
			`num_blocks_so_far += val_size;`
			`}`
			`if(idx == 2){`
			`num_wells_so_far++;`
			`}`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`}`

			`void WellContributions::setCudaStream(cudaStream_t stream_)`
			`{`
			`this->stream = stream_;`
			`}`


			`void WellContributions::addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq)`
			`{`
			`if(allocated){`
Removed unnecessary line continuation chars. Merged unsupported cases in switch 2020-03-18 09:14:38 -05:00			`OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");`
cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore 2020-03-13 08:21:59 -05:00			`}`
			`num_blocks += nnz;`
			`dim = numEq;`
			`dim_wells = numWellEq;`
			`num_wells++;`
			`}`

			`} //namespace Opm`