From ac26216869573e42a335b785fc6492eda744e375 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Thu, 1 Jun 2023 09:45:30 +0400 Subject: [PATCH] [GPU] Functional fixes for nvidia (#17735) --- .../kernel_selector/kernels/gemm/gemm_kernel_base.cpp | 7 ------- .../kernel_selector/kernels/gemm/gemm_kernel_base.h | 2 -- .../kernels/gemm/gemm_kernel_mmad_int8.cpp | 7 +++++++ .../kernels/gemm/gemm_kernel_mmad_int8.h | 1 + .../kernels/gemm/gemm_kernel_mmad_int8_slm.cpp | 7 +++++++ .../kernels/gemm/gemm_kernel_mmad_int8_slm.h | 1 + .../kernel_selector/kernels/gemm/gemm_kernel_ref.cpp | 4 ++++ .../src/kernel_selector/kernels/gemm/gemm_kernel_ref.h | 1 + .../kernels/gemm/gemm_kernel_tiled_opt.cpp | 7 +++++++ .../kernels/gemm/gemm_kernel_tiled_opt.h | 1 + .../kernels/softmax/softmax_kernel_bf.cpp | 9 +++++++++ .../kernels/softmax/softmax_kernel_bf.h | 1 + src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 10 ++++++++++ 13 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp index 2e72409ac86..205207ecb57 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp @@ -94,13 +94,6 @@ bool GemmKernelBase::Validate(const Params& p, const optional_params&) const { return true; } -DeviceFeaturesKey GemmKernelBase::get_required_device_features_key(const Params& params, const optional_params& options) const { - auto k = get_common_subgroups_device_features_key(params, options); - k.requires_subgroup_shuffle(); - - return k; -} - Datatype GemmKernelBase::GetActivationType(const gemm_params& params) const { if (params.quantization != QuantizationType::NONE) return Datatype::F32; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h index 60fb0fe1dda..eddcb427ad4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h @@ -53,8 +53,6 @@ protected: virtual JitConstants GetFusedPrimitivesJitConstants(const gemm_params& params, const DispatchData& dispatchData) const; Datatype GetActivationType(const gemm_params& params) const; // --Fused ops - - DeviceFeaturesKey get_required_device_features_key(const Params& params, const optional_params& /*options*/) const override; bool Validate(const Params& p, const optional_params&) const override; }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.cpp index 619e817f47c..1fab54da075 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.cpp @@ -32,6 +32,13 @@ ParamsKey GemmKernelMMADint8::GetSupportedKey() const { return k; } +DeviceFeaturesKey GemmKernelMMADint8::get_required_device_features_key(const Params& params, const optional_params& options) const { + auto k = get_common_subgroups_device_features_key(params, options); + k.requires_subgroup_shuffle(); + + return k; +} + JitConstants GemmKernelMMADint8::GetJitConstants(const gemm_params& params) const { JitConstants jit = Parent::GetJitConstants(params); GemmTuningData td = SetTuningParams(params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.h index 5f70ee141b8..0c1acaf6ca8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.h +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8.h @@ -41,5 +41,6 @@ protected: GemmTuningData SetTuningParams(const gemm_params& params) const; size_t GetMmadOperationsNumber(const GemmTuningData& tuning_data) const; bool HasLeftovers(const GemmTuningData& tuning_data, int tile_size) const; + DeviceFeaturesKey get_required_device_features_key(const Params& params, const optional_params& /*options*/) const override; }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.cpp index 7ab518a9c80..16119fd5b72 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.cpp @@ -32,6 +32,13 @@ ParamsKey GemmKernelMMADslmInt8::GetSupportedKey() const { return k; } +DeviceFeaturesKey GemmKernelMMADslmInt8::get_required_device_features_key(const Params& params, const optional_params& options) const { + auto k = get_common_subgroups_device_features_key(params, options); + k.requires_subgroup_shuffle(); + + return k; +} + JitConstants GemmKernelMMADslmInt8::GetJitConstants(const gemm_params& params) const { JitConstants jit = Parent::GetJitConstants(params); GemmTuningData td = SetTuningParams(params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.h index 945a39e092e..12ff9c4caee 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_mmad_int8_slm.h @@ -43,5 +43,6 @@ protected: GemmTuningData SetTuningParams(const gemm_params& params) const; size_t GetMmadOperationsNumber(const GemmTuningData& tuning_data) const; bool 
HasLeftovers(const GemmTuningData& tuning_data) const; + DeviceFeaturesKey get_required_device_features_key(const Params& params, const optional_params& /*options*/) const override; }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp index 971a726857b..36d58b7cebf 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp @@ -31,6 +31,10 @@ ParamsKey GemmKernelRef::GetSupportedKey() const { return k; } +DeviceFeaturesKey GemmKernelRef::get_required_device_features_key(const Params& params, const optional_params& options) const { + return DeviceFeaturesKey(); +} + JitConstants GemmKernelRef::GetJitConstants(const gemm_params& params) const { JitConstants jit = Parent::GetJitConstants(params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.h index e0a89f475c8..bdabba80d03 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.h @@ -25,5 +25,6 @@ protected: } bool Validate(const Params& params, const optional_params& options) const override; JitConstants GetJitConstants(const gemm_params& params) const override; + DeviceFeaturesKey get_required_device_features_key(const Params& params, const optional_params& /*options*/) const override; }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp index dfa8fadd551..44ada811c8d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp @@ -30,6 +30,13 @@ ParamsKey GemmKernelTiledOpt::GetSupportedKey() const { return k; } +DeviceFeaturesKey GemmKernelTiledOpt::get_required_device_features_key(const Params& params, const optional_params& options) const { + auto k = get_common_subgroups_device_features_key(params, options); + k.requires_subgroup_shuffle(); + + return k; +} + GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& params) const { const auto& output = params.outputs[0]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.h index 75f5347a387..71cea86222e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.h @@ -35,5 +35,6 @@ protected: DispatchData SetDefault(const gemm_params& params) const override; JitConstants GetJitConstants(const gemm_params& params) const override; GemmTuningData SetTuningParams(const gemm_params& params) const; + DeviceFeaturesKey get_required_device_features_key(const Params& params, const optional_params& /*options*/) const override; }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp index 50cb399c3b5..30171493dae 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp @@ -29,6 +29,15 @@ ParamsKey SoftmaxKernel_bf::GetSupportedKey() const { return k; } +DeviceFeaturesKey SoftmaxKernel_bf::get_required_device_features_key(const Params& params, const optional_params& options) const { + DeviceFeaturesKey k; + k.requires_subgroups(); + 
k.requires_subgroup_reduce(); + k.requires_reqd_subgroup_size(); + + return k; +} + SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softmax_params& params) const { auto dispatchData = Parent::SetDefault(params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.h index 2aaffedabf7..4f122267339 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.h @@ -20,6 +20,7 @@ public: protected: DispatchData SetDefault(const softmax_params& params) const override; JitConstants GetJitConstants(const softmax_params& params, DispatchData dispatchData) const override; + DeviceFeaturesKey get_required_device_features_key(const Params& params, const optional_params& /*options*/) const override; std::vector GetSupportedFusedOps() const override { return { FusedOpType::QUANTIZE }; } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index c96fcb8169e..24e9e5f48f5 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -166,6 +166,16 @@ device_info init_device_info(const cl::Device& device) { info.max_work_group_size = static_cast(device.getInfo()); + // For some reason the NVIDIA runtime throws an exception (CL_INVALID_KERNEL_ARGS) for a WG configured as follows: + // global: < 1 x 32 x 5184 > + // local: < 1 x 1 x 576 > + // While local < 1 x 1 x 36 > works fine. + // So below we limit the max WG size to 64, a value selected based on a few experiments.
+ constexpr int nvidia_vendor_id = 0x10DE; + if (info.vendor_id == nvidia_vendor_id) { + info.max_work_group_size = 64; + } + info.max_local_mem_size = static_cast(device.getInfo()); info.max_global_mem_size = static_cast(device.getInfo()); info.max_alloc_mem_size = static_cast(device.getInfo());