diff --git a/converter/main.py b/converter/main.py index 310a03ac555a0f05de679a5a4f3eed3eb9e77bf4..e3b11f9a3a5461a2a815a5447f355508752ced13 100644 --- a/converter/main.py +++ b/converter/main.py @@ -125,6 +125,8 @@ class OPTYPE(IntEnum): ScatterND = (22,) GridSample = (23,) Resize = (24,) + Compare = (25,) + Where = (26,) # "BatchMatMulV2" did not exist in Tensorflow 1.9. It exists in # Tensorflow 1.15. @@ -1109,6 +1111,93 @@ def parse_graph_node( myGraph[node.output[0]]["additional"]["mode"] = mode myGraph[node.output[0]]["additional"]["nearest_mode"] = nearest_mode map_onnx_to_myGraph[node.output[0]] = node.output[0] + + elif node.op_type == "Less": + additional = {} + additional["data"] = node + if is_constant(node.input[1], model_onnx.graph.initializer): + n2 = getNodesWithOutput(node.input[1], model_onnx) #constant + additional["dims"], additional["raw_data"], additional[ + "dtype" + ] = extract_additional_data( + node.input[1], + False, + model_onnx.graph, + verbose, + ) + myGraph[node.input[1]] = {} + myGraph[node.input[1]]["op_type"] = OPTYPE.Const + myGraph[node.input[1]]["inputs"] = [] + myGraph[node.input[1]]["additional"] = additional + map_onnx_to_myGraph[node.input[1]] = node.input[1] + + myGraph[node.output[0]] = {} + myGraph[node.output[0]]["op_type"] = OPTYPE.Compare + myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [map_onnx_to_myGraph[node.input[1]]] + myGraph[node.output[0]]["additional"] = {} + myGraph[node.output[0]]["additional"]["data"] = node + myGraph[node.output[0]]["additional"]["mode"] = 0 + map_onnx_to_myGraph[node.output[0]] = node.output[0] + + elif node.op_type == "Greater": + additional = {} + additional["data"] = node + if is_constant(node.input[1], model_onnx.graph.initializer): + n2 = getNodesWithOutput(node.input[1], model_onnx) #constant + additional["dims"], additional["raw_data"], additional[ + "dtype" + ] = extract_additional_data( + node.input[1], + False, + model_onnx.graph, + verbose, + ) + myGraph[node.input[1]] = {} + myGraph[node.input[1]]["op_type"] = OPTYPE.Const + myGraph[node.input[1]]["inputs"] = [] + myGraph[node.input[1]]["additional"] = additional + map_onnx_to_myGraph[node.input[1]] = node.input[1] + + myGraph[node.output[0]] = {} + myGraph[node.output[0]]["op_type"] = OPTYPE.Compare + myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [map_onnx_to_myGraph[node.input[1]]] + myGraph[node.output[0]]["additional"] = {} + myGraph[node.output[0]]["additional"]["data"] = node + myGraph[node.output[0]]["additional"]["mode"] = 1 + map_onnx_to_myGraph[node.output[0]] = node.output[0] + + elif node.op_type == "Where": + if is_constant(node.input[1], model_onnx.graph.initializer): + additional = {} + additional["data"] = node + n2 = getNodesWithOutput(node.input[1], model_onnx) + additional["dims"], additional["raw_data"], additional[ + "dtype" + ] = extract_additional_data(node.input[1], False, model_onnx.graph, verbose) + myGraph[node.input[1]] = {} + myGraph[node.input[1]]["op_type"] = OPTYPE.Const + myGraph[node.input[1]]["inputs"] = [] + myGraph[node.input[1]]["additional"] = additional + map_onnx_to_myGraph[node.input[1]] = node.input[1] + if is_constant(node.input[2], model_onnx.graph.initializer): + additional = {} + additional["data"] = node + n2 = getNodesWithOutput(node.input[2], model_onnx) + additional["dims"], additional["raw_data"], additional[ + "dtype" + ] = extract_additional_data(node.input[2], False, model_onnx.graph, verbose) + myGraph[node.input[2]] = {} + myGraph[node.input[2]]["op_type"] = OPTYPE.Const + myGraph[node.input[2]]["inputs"] = [] + myGraph[node.input[2]]["additional"] = additional + map_onnx_to_myGraph[node.input[2]] = node.input[2] + + myGraph[node.output[0]] = {} + myGraph[node.output[0]]["op_type"] = OPTYPE.Where + myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [map_onnx_to_myGraph[node.input[1]]]+[map_onnx_to_myGraph[node.input[2]]] + myGraph[node.output[0]]["additional"] = {} + myGraph[node.output[0]]["additional"]["data"] = node + map_onnx_to_myGraph[node.output[0]] = node.output[0] else: raise Exception("[ERROR] node not supported:\n{})".format(node)) @@ -1412,6 +1501,12 @@ def dump_onnx(graph, my_inputs, my_outputs, output_filename, verbose=False): print("#\t nearest_mode", node["additional"]["nearest_mode"]) f.write(struct.pack("i", int(node["additional"]["nearest_mode"]))) + elif node["op_type"] == OPTYPE.Compare: + if verbose: + print("#\t mode", node["additional"]["mode"]) + f.write(struct.pack("i", int(node["additional"]["mode"]))) + + if ( node["op_type"] == OPTYPE.Conv2D or node["op_type"] == OPTYPE.Conv2DTranspose diff --git a/sadl/layer.h b/sadl/layer.h index 9a5b492f0a453df3983224585c5c14b89ad87f2f..3b6737d1801f7cc47a14f35027e6d56594f0f526 100644 --- a/sadl/layer.h +++ b/sadl/layer.h @@ -72,7 +72,9 @@ struct OperationType ScatterND = 22, GridSample = 23, Resize = 24, - OperationTypeCount = 25 + Compare = 25, + Where = 26, + OperationTypeCount = 27 }; }; diff --git a/sadl/layer_compare.h b/sadl/layer_compare.h new file mode 100644 index 0000000000000000000000000000000000000000..fdd02652d003cbea82804e6a84268075b3bb9660 --- /dev/null +++ b/sadl/layer_compare.h @@ -0,0 +1,181 @@ +/* The copyright in this software is being made available under the BSD + * License, included below. This software may be subject to other third party + * and contributor rights, including patent rights, and no such rights are + * granted under this license. + * + * Copyright (c) 2010-2024, ITU/ISO/IEC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of the ITU/ISO/IEC nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ +#pragma once +#include "layer.h" + +namespace sadl +{ +namespace layers +{ +enum class Compare_mode +{ + LessThan, + GreaterThan +}; +template<typename T> class Compare : public Layer<T> +{ +public: + using Layer<T>::Layer; + using Layer<T>::m_out; // to avoid this-> + using Layer<T>::m_initDone; + + virtual bool apply(std::vector<Tensor<T> *> &in) override; + virtual bool init(const std::vector<Tensor<T> *> &in) override; + +protected: + virtual bool loadInternal(std::istream &file, Version) override; + bool apply_less(std::vector<Tensor<T> *> &in); + bool apply_greater(std::vector<Tensor<T> *> &in); + + Compare_mode m_mode; + DUMP_MODEL_EXT; +}; + +template<typename T> bool Compare<T>::apply(std::vector<Tensor<T> *> &in) +{ + assert(in.size() == 2); + assert(in[0]->dims() == m_out.dims() || (in[0]->dims().size() == 1 && in[0]->dims()[0] == 1)); + assert(in[1]->dims() == m_out.dims() || (in[1]->dims().size() == 1 && in[1]->dims()[0] == 1)); + if(m_mode == Compare_mode::LessThan) + return apply_less(in); + else if(m_mode == Compare_mode::GreaterThan) + return apply_greater(in); + else return false; +} + +template<typename T> bool Compare<T>::apply_less(std::vector<Tensor<T> *> &in) +{ + const Tensor<T> &A = *in[0]; + const Tensor<T> &B = *in[1]; + const int &A_q = A.quantizer; + const int &B_q = B.quantizer; + const int A_shift = std::max(0, B_q - A_q); + const int B_shift = std::max(0, A_q - B_q); + m_out.quantizer = 0;// bool tensor + if(B.dims().size() == 1) + { + for (int i = 0; i < m_out.size(); i++) + { + T A_i = A[i]; + T B_i = B[0]; + ComputationType<T>::shift_left(A_i, A_shift);//quantization + ComputationType<T>::shift_left(B_i, B_shift);//quantization + T z = A_i < B_i; + COUNTERS(z); + m_out[i] = z; + } + } + + else + { + for (int i = 0; i < m_out.size(); i++) + { + T A_i = A[i]; + T B_i = B[i]; + ComputationType<T>::shift_left(A_i, A_shift);//quantization + ComputationType<T>::shift_left(B_i, B_shift);//quantization + T z = A_i < B_i; + COUNTERS(z); + m_out[i] = z; + } + } + return true; +} + +template<typename T> bool Compare<T>::apply_greater(std::vector<Tensor<T> *> &in) +{ + const Tensor<T> &A = *in[0]; + const Tensor<T> &B = *in[1]; + const int &A_q = A.quantizer; + const int &B_q = B.quantizer; + const int A_shift = std::max(0, B_q - A_q); + const int B_shift = std::max(0, A_q - B_q); + m_out.quantizer = 0;// bool tensor + if(B.dims().size() == 1) + { + for (int i = 0; i < m_out.size(); i++) + { + T A_i = A[i]; + T B_i = B[0]; + ComputationType<T>::shift_left(A_i, A_shift);//quantization + ComputationType<T>::shift_left(B_i, B_shift);//quantization + T z = A_i > B_i; + COUNTERS(z); + m_out[i] = z; + } + } + + else + { + for (int i = 0; i < m_out.size(); i++) + { + T A_i = A[i]; + T B_i = B[i]; + ComputationType<T>::shift_left(A_i, A_shift);//quantization + ComputationType<T>::shift_left(B_i, B_shift);//quantization + T z = A_i > B_i; + COUNTERS(z); + m_out[i] = z; + } + } + return true; +} + +template<typename T> bool Compare<T>::init(const std::vector<Tensor<T> *> &in) +{ + if (in.size() != 2) + return false; + m_out.resize(in[0]->dims()); + m_initDone = true; + return true; +} + +template<typename T> bool Compare<T>::loadInternal(std::istream &file, Version) +{ + int32_t x = 0; + file.read((char *) &x, sizeof(x)); + if(x == (int32_t) Compare_mode::LessThan) + m_mode = Compare_mode::LessThan; + else if(x == (int32_t) Compare_mode::GreaterThan) + m_mode = Compare_mode::GreaterThan; + else + { + std::cerr << "[ERROR] invalid mode: " << x << std::endl; + return false; + } + SADL_DBG(std::cout << " - mode: " << x << std::endl); + return true; +} + +} // namespace layers +} // namespace sadl diff --git a/sadl/layer_conv2d.h b/sadl/layer_conv2d.h index 9a51098539f3f387e487c930b1bfc133af4819d4..be6be41b46e68ea40fe424502c773f0ae9d436c7 100644 --- a/sadl/layer_conv2d.h +++ b/sadl/layer_conv2d.h @@ -3,7 +3,7 @@ * and contributor rights, including patent rights, and no such rights are * granted under this license. * - * Copyright (c) 2010-2023, ITU/ISO/IEC + * Copyright (c) 2010-2024, ITU/ISO/IEC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -126,6 +126,9 @@ protected: } + template<int in_D, int ihalf_size, int jhalf_size> void simd32_conv2d_ixj_s11_g1_d_core(const Tensor<T> &A, const Tensor<T> &kernel) { + simd16_conv2d_ixj_s11_g1_d_core<in_D,ihalf_size,jhalf_size>(A,kernel); + } #endif DUMP_MODEL_EXT; }; diff --git a/sadl/layer_conv2d_ixj.h b/sadl/layer_conv2d_ixj.h index be79c29cc1edcab0ca3c588bd5e7b81b5b0d2beb..e8e4b796400560a48589081011706da74beb6a33 100644 --- a/sadl/layer_conv2d_ixj.h +++ b/sadl/layer_conv2d_ixj.h @@ -3,7 +3,7 @@ * and contributor rights, including patent rights, and no such rights are * granted under this license. * - * Copyright (c) 2010-2023, ITU/ISO/IEC + * Copyright (c) 2010-2024, ITU/ISO/IEC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -315,7 +315,7 @@ template<typename T> template<int in_D, int ihalf_size, int jhalf_size> void Con int start_h{ ihalf_size - top }; int start_w{ jhalf_size - left }; #if DEBUG_SIMD && __AVX2__ - std::cout << "\n[WARN] generic version conv (s=1, g=1, known kernel)" << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters + std::cout << "\n[WARN] generic version conv2d_ixj_s11_g1_d_core (s=1, g=1, known kernel)" << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters << " s=[" << s_w << ' ' << s_h << "] " << in_H << 'x' << in_W << " " << "?? kMAC" << std::endl; #endif @@ -406,7 +406,7 @@ template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<float> #endif #if __AVX512BW__ -template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_t>::conv2d_ixj_s11_g1_d_core(const Tensor<int16_t> &A, const Tensor<int16_t> &kernel) +template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_t>::simd32_conv2d_ixj_s11_g1_d_core(const Tensor<int16_t> &A, const Tensor<int16_t> &kernel) { static_assert(in_D % 32 == 0, "Should be used with mod32 filters."); using T=int16_t; @@ -460,6 +460,67 @@ template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_ } } #endif +#if __AVX2__ +template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_t>::simd16_conv2d_ixj_s11_g1_d_core(const Tensor<int16_t> &A, const Tensor<int16_t> &kernel) +{ + static_assert(in_D % 16 == 0, "Should be used with mod16 filters."); + using T=int16_t; + constexpr int im_nb = 0; + constexpr int s_h = 1; + constexpr int s_w = 1; + const int nb_filters{ kernel.dims()[2] }; + const int shift = kernel.quantizer + m_q; + const int top{ m_pads[0] }; + const int left{ m_pads[1] }; + int in_H{ A.dims()[1] }; + int in_W{ A.dims()[2] }; + int start_h{ ihalf_size - top }; + int start_w{ jhalf_size - left }; +#if DEBUG_SIMD && __AVX512BW__ + if (in_D % 32 == 0 || in_D>=32) + std::cout << "\n[WARN] avx2 (instead of avx512) version simd16_conv2d_ixj_s11_g1_d_core (s=1, g=1, known kernel)" << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters + << " s=[" << s_w << ' ' << s_h << "] " << in_H << 'x' << in_W << " " + << "?? kMAC" << std::endl; +#endif + + assert(start_h + s_h - ihalf_size >= 0); + assert(start_w + s_w - jhalf_size >= 0); + for (int im_i = start_h + s_h; im_i < in_H - ihalf_size; im_i += s_h) + { + for (int im_j = start_w + s_w; im_j < in_W - jhalf_size; im_j += s_w) + { + for (int filter = 0; filter < nb_filters; ++filter) + { + __m256i s = _mm256_setzero_si256(); + for (int filter_i = -ihalf_size; filter_i <= ihalf_size; ++filter_i) + { // fixed + for (int filter_j = -jhalf_size; filter_j <= jhalf_size; ++filter_j) + { // fixed + for (int filter_d = 0; filter_d < in_D; filter_d += 16) + { + const int ii = im_i + filter_i; + const int jj = im_j + filter_j; + const int ki = ihalf_size + filter_i; + const int kj = jhalf_size + filter_j; + const __m256i *kptr = (const __m256i *) kernel.addr(ki, kj, filter, filter_d); + const __m256i k0 = _mm256_load_si256(kptr); + const __m256i *aptr = (const __m256i *) A.addr(im_nb, ii, jj, filter_d); + const __m256i v0 = _mm256_load_si256(aptr); + + const __m256i mad0 = _mm256_madd_epi16(k0, v0); // res in si32 + s = _mm256_add_epi32(s, mad0); + } + } + } + typename ComputationType<T>::type z = (sum32_int16(s) >> shift); + COUNTERS(z); + SATURATE(z); + m_out(im_nb, im_i / s_h, im_j / s_w, filter) = z; + } + } + } +} +#endif template<typename T> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<T>::conv2d_ixj_s11_gD_d_core(const Tensor<T> &A, const Tensor<T> &kernel) { constexpr int nb_filters = in_D; @@ -474,7 +535,7 @@ template<typename T> template<int in_D, int ihalf_size, int jhalf_size> void Con int start_h{ ihalf_size - top }; int start_w{ jhalf_size - left }; #if DEBUG_SIMD && __AVX2__ - std::cout << "\n[WARN] partially generic version conv " << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D + std::cout << "\n[WARN] partially generic version conv2d_ixj_s11_gD_d_core " << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters << " s=[" << s_w << ' ' << s_h << "] " << in_H << 'x' << in_W << " " << "?? kMAC" << std::endl; #endif @@ -515,9 +576,11 @@ template<typename T> template<int s_h, int s_w> void Conv2D<T>::conv2d_ixj_s_cor const int nb_filters{ kernel.dims()[2] }; // grouped conv with stride 1 and inD==outD -#if __AVX512F__ || __AVX512BW__ +#if __AVX2__ +#define CONV_MOD32 simd32_conv2d_ixj_s11_g1_d_core #define CONV_MOD16 simd16_conv2d_ixj_s11_g1_d_core #else +#define CONV_MOD32 conv2d_ixj_s11_g1_d_core #define CONV_MOD16 conv2d_ixj_s11_g1_d_core #endif if (in_D == m_groups && in_D == nb_filters && s_h == 1 && s_w == 1) @@ -577,12 +640,16 @@ template<typename T> template<int s_h, int s_w> void Conv2D<T>::conv2d_ixj_s_cor constexpr int kj = 1; switch (in_D) { + case 16: + CONV_MOD16<16, ki, kj>(A, kernel); + return; + break; case 32: - CONV_MOD16<32, ki, kj>(A, kernel); + CONV_MOD32<32, ki, kj>(A, kernel); return; break; case 64: - CONV_MOD16<64, ki, kj>(A, kernel); + CONV_MOD32<64, ki, kj>(A, kernel); return; break; default: // do default @@ -595,12 +662,16 @@ template<typename T> template<int s_h, int s_w> void Conv2D<T>::conv2d_ixj_s_cor constexpr int kj = 0; switch (in_D) { + case 16: + CONV_MOD16<16, ki, kj>(A, kernel); + return; + break; case 32: - CONV_MOD16<32, ki, kj>(A, kernel); + CONV_MOD32<32, ki, kj>(A, kernel); return; break; case 64: - CONV_MOD16<64, ki, kj>(A, kernel); + CONV_MOD32<64, ki, kj>(A, kernel); return; break; default: // do default diff --git a/sadl/layer_gridsample.h b/sadl/layer_gridsample.h index a8ff2ae1a7c567138ef27a8da0955d5263826446..62a37f2f7e2053c3ebaaec10c0fd67a6ddbfeb4b 100644 --- a/sadl/layer_gridsample.h +++ b/sadl/layer_gridsample.h @@ -3,7 +3,7 @@ * and contributor rights, including patent rights, and no such rights are * granted under this license. * - * Copyright (c) 2010-2023, ITU/ISO/IEC + * Copyright (c) 2010-2024, ITU/ISO/IEC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -214,12 +214,12 @@ template<> inline void GridSample<float>::gs_denormalize(float &x, int length) { if (m_mode == gridsample_mode_nearest) { - x = (x + 1) * (length - 1) / 2.0; + x = (x + 1) * (length - 1) / 2.0f; x = round(x); } else if (m_mode == gridsample_mode_bilinear) { - x = (x + 1) * (length - 1) / 2.0; + x = (x + 1) * (length - 1) / 2.0f; } } @@ -244,8 +244,8 @@ template<> inline void GridSample<float>::get_bilinear_coeffs(float y, float x, float &coeff11 = coeffs[0], &coeff12 = coeffs[1], &coeff21 = coeffs[2], &coeff22 = coeffs[3]; int &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3]; - x_ori_left = floor(x); - y_ori_top = floor(y); + x_ori_left = (int)floor(x); + y_ori_top = (int)floor(y); x_ori_right = x_ori_left + 1; y_ori_bottom = y_ori_top + 1; float dy2 = y_ori_bottom - y; diff --git a/sadl/layer_leakyrelu.h b/sadl/layer_leakyrelu.h index e4a3e7bc25a77a6674bae76e7c96895413b55de0..11c24d8c0d3de51a5e7f5623a2e976124349d934 100644 --- a/sadl/layer_leakyrelu.h +++ b/sadl/layer_leakyrelu.h @@ -3,7 +3,7 @@ * and contributor rights, including patent rights, and no such rights are * granted under this license. * - * Copyright (c) 2010-2023, ITU/ISO/IEC + * Copyright (c) 2010-2024, ITU/ISO/IEC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,64 +32,12 @@ */ #pragma once #include "layer.h" +#include "layer_prelu.h" namespace sadl { namespace layers { -template<typename T> class LeakyRelu : public Layer<T> -{ -public: - using Layer<T>::Layer; - using Layer<T>::m_out; // to avoid this-> - using Layer<T>::m_initDone; - - virtual bool apply(std::vector<Tensor<T> *> &in) override; - virtual bool init(const std::vector<Tensor<T> *> &in) override; - virtual bool mutateInput() const override { return true; } - -protected: - virtual bool loadInternal(std::istream &file, Version) override; -}; - -template<typename T> bool LeakyRelu<T>::apply(std::vector<Tensor<T> *> &in) -{ - assert(in.size() == 2); - assert(in[0]->dims() == m_out.dims()); - const Tensor<T> &A = *in[1]; - swap(*in[0], m_out); - // keep same qunatiz as input - const typename ComputationType<T>::type alpha = A[0]; - - const int alpha_q = A.quantizer; - for (auto &x: m_out) - { - if (x < 0) - { - typename ComputationType<T>::type z = x * alpha; - ComputationType<T>::quantize(z, alpha_q); - COUNTERS(z); - COUNTERS_MAC(z); - SATURATE(z); - x = z; - } else { - COUNTERS_MAC_NOP(1); - } - } - - return true; -} - -template<typename T> bool LeakyRelu<T>::init(const std::vector<Tensor<T> *> &in) -{ - if (in.size() != 2) - return false; - m_out.resize(in[0]->dims()); - m_initDone = true; - return true; -} - -template<typename T> bool LeakyRelu<T>::loadInternal(std::istream &, Version) { return true; } - +template<typename T> using LeakyRelu=PReLU<T>; } // namespace layers } // namespace sadl diff --git a/sadl/layer_prelu.h b/sadl/layer_prelu.h index f943437804d7a08de1c9c42b8c9973b00acc4448..2d139579dfa0a0589d60065fd2242290d300381d 100644 --- a/sadl/layer_prelu.h +++ b/sadl/layer_prelu.h @@ -3,7 +3,7 @@ * and contributor rights, including patent rights, and no such rights are * granted under this license. * - * Copyright (c) 2010-2023, ITU/ISO/IEC + * Copyright (c) 2010-2024, ITU/ISO/IEC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -49,50 +49,344 @@ public: virtual bool mutateInput() const override { return true; } protected: - virtual bool loadInternal(std::istream &file, Version) override; + virtual bool loadInternal(std::istream &file, Version) override; + template<bool multialpha> bool apply_scalar(std::vector<Tensor<T> *> &in); +#if __AVX2__ + template<bool multialpha> bool apply_simd256(std::vector<Tensor<T> *> &in); +#endif +#if __AVX512F__ || __AVX512BW__ + template<bool multialpha> bool apply_simd512(std::vector<Tensor<T> *> &in); +#endif }; template<typename T> bool PReLU<T>::apply(std::vector<Tensor<T> *> &in) +{ + assert(in.size() == 2); + assert(in[0]->dims() == m_out.dims()); +#if __AVX512F__ + if (std::is_same<T, float>::value && in[0]->size() % 16 == 0) + { + if (in[1]->size() == 1) + { + return apply_simd512<false>(in); + } + else if (in[1]->size() % 16 == 0) + { + return apply_simd512<true>(in); + } + } +#endif +#if __AVX512BW__ + if (std::is_same<T, int16_t>::value && in[0]->size() % 32 == 0) + { + if (in[1]->size() == 1) + { + return apply_simd512<false>(in); + } + else if (in[1]->size() % 32 == 0) + { + return apply_simd512<true>(in); + } + } +#endif +#if __AVX2__ + if (std::is_same<T, float>::value && in[0]->size() % 8 == 0) + { + if (in[1]->size() == 1) + { + return apply_simd256<false>(in); + } + else if (in[1]->size() % 8 == 0) + { + return apply_simd256<true>(in); + } + } +#endif + +#if __AVX2__ + if (std::is_same<T, int16_t>::value && in[0]->size() % 16 == 0) + { + if (in[1]->size() == 1) + { + return apply_simd256<false>(in); + } + else if (in[1]->size() % 16 == 0) + { + return apply_simd256<true>(in); + } + } +#endif + if (in[1]->size() == 1) + { + return apply_scalar<false>(in); + } + else + { + return apply_scalar<true>(in); + } +} + +template<typename T> template<bool multialpha> bool PReLU<T>::apply_scalar(std::vector<Tensor<T> *> &in) // without simd { const int in_N{ in[0]->dims()[0] }; const int in_H{ in[0]->dims()[1] }; const int in_W{ in[0]->dims()[2] }; const int in_C{ in[0]->dims()[3] }; - assert(in.size() == 2); - assert(in[0]->dims() == m_out.dims()); const Tensor<T> &A = *in[1]; swap(*in[0], m_out); // keep same qunatiz as input const int alpha_q = A.quantizer; - for (int n_nb = 0; n_nb < in_N; n_nb++) + if (multialpha) { - for (int c_nb = 0; c_nb < in_C; c_nb++) + for (int n_nb = 0; n_nb < in_N; n_nb++) { - // A.dims()[0] == 1, means all channels share the same alpha parameter - const typename ComputationType<T>::type alpha = (A.dims()[0] == 1) ? A(0, 0, 0) : A(c_nb, 0, 0); - for (int h_nb = 0; h_nb < in_H; h_nb++) + for (int c_nb = 0; c_nb < in_C; c_nb++) { - for (int w_nb = 0; w_nb < in_W; w_nb++) + // A.dims()[0] == 1, means all channels share the same alpha parameter + const typename ComputationType<T>::type alpha = (A.dims()[0] == 1) ? A(0, 0, 0) : A(c_nb, 0, 0); + for (int h_nb = 0; h_nb < in_H; h_nb++) { - if (m_out(n_nb, h_nb, w_nb, c_nb) < 0) + for (int w_nb = 0; w_nb < in_W; w_nb++) { - typename ComputationType<T>::type z = m_out(n_nb, h_nb, w_nb, c_nb) * alpha; - ComputationType<T>::quantize(z, alpha_q); - COUNTERS(z); - COUNTERS_MAC(z); - SATURATE(z); - m_out(n_nb, h_nb, w_nb, c_nb) = z; - } else { - COUNTERS_MAC_NOP(1); - } + if (m_out(n_nb, h_nb, w_nb, c_nb) < 0) + { + typename ComputationType<T>::type z = m_out(n_nb, h_nb, w_nb, c_nb) * alpha; + ComputationType<T>::quantize(z, alpha_q); + COUNTERS(z); + COUNTERS_MAC(z); + SATURATE(z); + m_out(n_nb, h_nb, w_nb, c_nb) = z; + } + else + { + COUNTERS_MAC_NOP(1); + } + } } } } } + else + { + const typename ComputationType<T>::type alpha = A[0]; + for (auto &x: m_out) + { + if (x < 0) + { + typename ComputationType<T>::type z = x * alpha; + ComputationType<T>::quantize(z, alpha_q); + COUNTERS(z); + COUNTERS_MAC(z); + SATURATE(z); + x = z; + } + else + { + COUNTERS_MAC_NOP(1); + } + } + } return true; } +#if __AVX2__ +template<> template<bool multialpha> inline bool PReLU<float>::apply_simd256(std::vector<Tensor<float> *> &in) // simd256 float +{ + exit(-1); // to correct + Tensor<float> &A = *in[1]; + swap(*in[0], m_out); + float *const data_ptr = m_out.data(); + const float *const alpha_ptr = A.data(); + const __m256 m_zeros = _mm256_setzero_ps(); + __m256 alpha = _mm256_set1_ps(*A.data()); + for (int iter = 0; iter < m_out.size(); iter += 8) + { + if (multialpha) + alpha = _mm256_load_ps(alpha_ptr + iter % A.size()); + + float *const aptr = data_ptr + iter; + auto a = _mm256_load_ps(aptr); // load + auto min_a_zero = _mm256_min_ps(a, m_zeros); // min(a,0) + auto max_a_zero = _mm256_max_ps(a, m_zeros); // max(a,0) + auto b = _mm256_mul_ps(min_a_zero, alpha); // min(a,0)*alpha + const __m256 v = _mm256_add_ps(max_a_zero, b); // max(a,0)+min(a,0)*alpha + /*store*/ _mm256_store_ps(aptr, v); + } + + return true; +} + +template<> template<bool multialpha> inline bool PReLU<int16_t>::apply_simd256(std::vector<Tensor<int16_t> *> &in) +{ + Tensor<int16_t> &A = *in[1]; + swap(*in[0], m_out); + int16_t *const data_ptr = m_out.data(); + [[maybe_unused]] const int16_t *const alpha_ptr = A.data(); + const int alpha_q = A.quantizer; + + __m256i alpha = _mm256_set1_epi16(A[0]); + const __m256i mask = _mm256_set1_epi32(65535); + const __m256i max = _mm256_set1_epi32(32767); + const __m256i min = _mm256_set1_epi32(-32768); + const __m256i zeros = _mm256_setzero_si256(); + const int N = m_out.size(); + for (int iter = 0; iter < N; iter += 16) + { + int16_t *aptr = data_ptr + iter; + auto a = _mm256_load_si256((__m256i *) aptr); // load + if (multialpha) + { + alpha = _mm256_load_si256((__m256i *) (alpha_ptr + (iter % A.size()))); + } + + // prepare branches + auto max0 = _mm256_max_epi16(a, zeros); + auto min0 = _mm256_min_epi16(a, zeros); + // branch neg + // mul + auto lo = _mm256_mullo_epi16(min0, alpha); // min(a,0)*alpha lo part + auto hi = _mm256_mulhi_epi16(min0, alpha); // min(a,0)*alpha hi part + // repack32 + auto lo32 = _mm256_unpacklo_epi16(lo, hi); + auto hi32 = _mm256_unpackhi_epi16(lo, hi); + auto y0 = _mm256_permute2x128_si256(lo32, hi32, _MM_SHUFFLE(0, 2, 0, 0)); + auto y1 = _mm256_permute2x128_si256(lo32, hi32, _MM_SHUFFLE(0, 3, 0, 1)); + // shift + auto y0s = _mm256_srai_epi32(y0, alpha_q); + auto y1s = _mm256_srai_epi32(y1, alpha_q); +#if SATURATE_RESULT + // clip + auto y0c = _mm256_max_epi32(y0s, min); + auto y1c = _mm256_max_epi32(y1s, min); + auto y0c2 = _mm256_min_epi32(y0c, max); + auto y1c2 = _mm256_min_epi32(y1c, max); +#else + auto y0c2 = y0s; + auto y1c2 = y1s; +#endif + // mask 16bits + auto y0p = _mm256_and_si256(y0c2, mask); + auto y1p = _mm256_and_si256(y1c2, mask); + // repack + auto z = _mm256_packus_epi32(y0p, y1p); + auto z2 = _mm256_permute4x64_epi64(z, _MM_SHUFFLE(3, 1, 2, 0)); + // merge 2 branches + auto r = _mm256_add_epi16(max0, z2); + _mm256_store_si256((__m256i *) aptr, r); + } + return true; +} + +template<typename T> template<bool multialpha> bool PReLU<T>::apply_simd256(std::vector<Tensor<T> *> &in) // +{ + std::cerr << "[ERROR] simd type not supported: " << std::endl; + exit(-1); +} +#endif + +#if __AVX512F__ +template<> template<bool multialpha> inline bool PReLU<float>::apply_simd512(std::vector<Tensor<float> *> &in) // simd512 float +{ + Tensor<float> &A = *in[1]; + swap(*in[0], m_out); + float *const data_ptr = m_out.data(); + const float *const alpha_ptr = A.data(); + const __m512 m_zeros = _mm512_setzero_ps(); + __m512 alpha = _mm512_set1_ps(*A.data()); + for (int iter = 0; iter < m_out.size(); iter += 16) + { + if (multialpha) + alpha = _mm512_load_ps(alpha_ptr + iter % A.size()); + + float *const aptr = data_ptr + iter; // load + auto a = _mm512_load_ps(aptr); + auto min_a_zero = _mm512_min_ps(a, m_zeros); // min(a,0) + auto max_a_zero = _mm512_max_ps(a, m_zeros); // max(a,0) + auto b = _mm512_mul_ps(min_a_zero, alpha); // min(a,0)*alpha + auto v = _mm512_add_ps(max_a_zero, b); // max(a,0)+min(a,0)*alpha + /*store*/ _mm512_store_ps(aptr, v); + } + + return true; +} + +#endif + +#if __AVX512BW__ +template<> template<bool multialpha> inline bool PReLU<int16_t>::apply_simd512(std::vector<Tensor<int16_t> *> &in) // simd512 int16 quantize +{ + Tensor<int16_t> &A = *in[1]; + swap(*in[0], m_out); + int16_t *data_ptr = m_out.data(); + [[maybe_unused]] const int16_t *const alpha_ptr = A.data(); + const int alpha_q = A.quantizer; + auto alpha0 = _mm512_set1_epi32(A[0]); + auto alpha1 = alpha0; + const auto max = _mm512_set1_epi32(32767); + const auto min = _mm512_set1_epi32(-32768); + const auto zeros = _mm512_setzero_si512(); + static constexpr int16_t data[]={0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; + const auto shuffle= _mm512_loadu_si512((void *)data); + + const int N = m_out.size(); + + for (int iter = 0; iter < N; iter += 32) + { + int16_t *aptr = data_ptr + iter; + auto a = _mm512_loadu_si512((__m512i *) aptr); // load + if (multialpha) + { + auto a2 = _mm512_loadu_si512((__m512i *) (alpha_ptr + (iter % A.size()))); + auto a2lo = _mm512_castsi512_si256(a2); + alpha0 = _mm512_cvtepi16_epi32(a2lo); + auto a2hi = _mm512_extracti64x4_epi64(a2, 1); + alpha1 = _mm512_cvtepi16_epi32(a2hi); + } + // prepare branches + auto max0 = _mm512_max_epi16(a, zeros); + auto min0 = _mm512_min_epi16(a, zeros); + // branch neg + // extract + auto lo = _mm512_castsi512_si256(min0); + auto hi = _mm512_extracti64x4_epi64(min0, 1); + // unpack 16 to 32 + auto lo32 = _mm512_cvtepi16_epi32(lo); + auto hi32 = _mm512_cvtepi16_epi32(hi); + // mul + auto y0 = _mm512_mullo_epi32(lo32, alpha0); + auto y1 = _mm512_mullo_epi32(hi32, alpha1); + // shift + auto y0s = _mm512_srai_epi32(y0, alpha_q); + auto y1s = _mm512_srai_epi32(y1, alpha_q); +#if SATURATE_RESULT + // clip + auto y0c = _mm512_max_epi32(y0s, min); + auto y1c = _mm512_max_epi32(y1s, min); + auto y0c2 = _mm512_min_epi32(y0c, max); + auto y1c2 = _mm512_min_epi32(y1c, max); +#else + auto y0c2 = y0s; + auto y1c2 = y1s; +#endif + // pack + auto z2 = _mm512_permutex2var_epi16(y0c2, shuffle, y1c2); + // merge branches + auto r = _mm512_add_epi16(max0, z2); + _mm512_storeu_si512((__m512i *) aptr, r); + } + return true; +} +#endif + +#if __AVX512F__ || __AVX512BW__ +template<typename T> template<bool multialpha> bool PReLU<T>::apply_simd512(std::vector<Tensor<T> *> &in) // +{ + std::cerr << "[ERROR] simd type not supported: " << std::endl; + exit(-1); +} +#endif + template<typename T> bool PReLU<T>::init(const std::vector<Tensor<T> *> &in) { if (in.size() != 2) diff --git a/sadl/layer_resize.h b/sadl/layer_resize.h index c90c3449b5ef5bad90f7d8d1021ebf1e74513565..b7702adfcdd7a750eb271d57d7a4898915563eef 100644 --- a/sadl/layer_resize.h +++ b/sadl/layer_resize.h @@ -3,7 +3,7 @@ * and contributor rights, including patent rights, and no such rights are * granted under this license. * - * Copyright (c) 2010-2023, ITU/ISO/IEC + * Copyright (c) 2010-2024, ITU/ISO/IEC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -135,10 +135,10 @@ template<typename T> bool Resize<T>::init(const std::vector<Tensor<T> *> &in) float scale_N = 0, scale_C = 0, scale_H = 0, scale_W = 0; if (m_input_label == 1) // inputs are X and sizes { - scale_N = in[1]->data()[0] / N; - scale_C = in[1]->data()[1] / C; - scale_H = in[1]->data()[2] / H_in; - scale_W = in[1]->data()[3] / W_in; + scale_N = in[1]->data()[0] / (float)N; + scale_C = in[1]->data()[1] / (float)C; + scale_H = in[1]->data()[2] / (float)H_in; + scale_W = in[1]->data()[3] / (float)W_in; } else if (m_input_label == 2) // inputs are X and scales { @@ -153,17 +153,17 @@ template<typename T> bool Resize<T>::init(const std::vector<Tensor<T> *> &in) return false; } scale_factors.resize(in[1]->dims()); - scale_factors[0] = scale_N; - scale_factors[1] = scale_H; - scale_factors[2] = scale_W; - scale_factors[3] = scale_C; + scale_factors[0] = static_cast<T2>(scale_N); + scale_factors[1] = static_cast<T2>(scale_H); + scale_factors[2] = static_cast<T2>(scale_W); + scale_factors[3] = static_cast<T2>(scale_C); // resize m_out Dimensions dim; dim.resize(4); - dim[0] = N * scale_N; - dim[1] = H_in * scale_H; - dim[2] = W_in * scale_W; - dim[3] = C * scale_C; + dim[0] = (int)(N * scale_N); + dim[1] = (int)(H_in * scale_H); + dim[2] = (int)(W_in * scale_W); + dim[3] = (int)(C * scale_C); m_out.resize(dim); m_initDone = true; return true; @@ -261,8 +261,8 @@ template<> inline void Resize<float>::calc_positions(int y, int x, int H, int W, if (m_coordinate_transformation_mode == resize_coordinate_transformation_mode_half_pixel) { - x_ori = (x + 0.5) / scale_factors[2] - 0.5; - y_ori = (y + 0.5) / scale_factors[1] - 0.5; + x_ori = (x + 0.5f) / scale_factors[2] - 0.5f; + y_ori = (y + 0.5f) / scale_factors[1] - 0.5f; } else if (m_coordinate_transformation_mode == resize_coordinate_transformation_mode_asymmetric) { @@ -273,8 +273,8 @@ template<> inline void Resize<float>::calc_positions(int y, int x, int H, int W, y_ori_int = std::floor(y_ori); // acquire the positions of adjacent pixels, prioritizing the left and top pixels - x_ori_left = (x_ori == x_ori_int) ? x_ori_int - 1 : x_ori_int; - y_ori_top = (y_ori == y_ori_int) ? y_ori_int - 1 : y_ori_int; + x_ori_left = (int)((x_ori == x_ori_int) ? x_ori_int - 1 : x_ori_int); + y_ori_top = (int)((y_ori == y_ori_int) ? y_ori_int - 1 : y_ori_int); x_ori_right = x_ori_left + 1; y_ori_bottom = y_ori_top + 1; x_ori_left = std::max(0, x_ori_left); diff --git a/sadl/layer_where.h b/sadl/layer_where.h new file mode 100644 index 0000000000000000000000000000000000000000..900a1bef1e3e0b5bab740a2818a132d3f625232e --- /dev/null +++ b/sadl/layer_where.h @@ -0,0 +1,94 @@ +/* The copyright in this software is being made available under the BSD + * License, included below. This software may be subject to other third party + * and contributor rights, including patent rights, and no such rights are + * granted under this license. + * + * Copyright (c) 2010-2024, ITU/ISO/IEC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of the ITU/ISO/IEC nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ +#pragma once +#include "layer.h" + +namespace sadl +{ +namespace layers +{ +template<typename T> class Where : public Layer<T> +{ +public: + using Layer<T>::Layer; + using Layer<T>::m_out; // to avoid this-> + using Layer<T>::m_initDone; + + virtual bool apply(std::vector<Tensor<T> *> &in) override; + virtual bool init(const std::vector<Tensor<T> *> &in) override; + +protected: + virtual bool loadInternal(std::istream &file, Version) override; + DUMP_MODEL_EXT; +}; + +template<typename T> bool Where<T>::apply(std::vector<Tensor<T> *> &in) +{ + assert(in.size() == 3); + assert(in[0]->dims() == m_out.dims()); + assert(in[0]->dims() == in[1]->dims() || (in[1]->dims().size() == 1 && in[1]->dims()[0] == 1)); + assert(in[0]->dims() == in[2]->dims() || (in[2]->dims().size() == 1 && in[2]->dims()[0] == 1)); + const Tensor<T> &condition = *in[0]; + const Tensor<T> &A = *in[1]; + const Tensor<T> &B = *in[2]; + m_out.quantizer = A.quantizer > B.quantizer ? A.quantizer : B.quantizer; + for (int i = 0; i < m_out.size(); i++) + { + const T A_i = (A.dims().size() == 1) ? A[0] : A[i]; + const T B_i = (B.dims().size() == 1) ? B[0] : B[i]; + typename ComputationType<T>::type z = condition[i] ? A_i : B_i; + const int z_q = condition[i] ? A.quantizer : B.quantizer ; + ComputationType<T>::shift_left(z, m_out.quantizer - z_q); + COUNTERS(z); + m_out[i] = z; + } + return true; +} + + +template<typename T> bool Where<T>::init(const std::vector<Tensor<T> *> &in) +{ + if (in.size() != 3) + return false; + m_out.resize(in[0]->dims());//condition dims + m_initDone = true; + return true; +} + +template<typename T> bool Where<T>::loadInternal(std::istream &file, Version) +{ + return true; +} + +} // namespace layers +} // namespace sadl diff --git a/sadl/layers.h b/sadl/layers.h index 11996f978510a03f9c05587e2b459df989ed29e3..1009eab9ab5ac9af876353566afd4328b37f3ab0 100644 --- a/sadl/layers.h +++ b/sadl/layers.h @@ -57,6 +57,8 @@ #include "layer_scatternd.h" #include "layer_gridsample.h" #include "layer_resize.h" +#include "layer_compare.h" +#include "layer_where.h" namespace sadl { @@ -96,6 +98,7 @@ inline std::string opName(const OperationType::Type op) DIRTYCASEPRINT(ScatterND); DIRTYCASEPRINT(GridSample); DIRTYCASEPRINT(Resize); + DIRTYCASEPRINT(Compare); default: oss << "??"; break; diff --git a/sadl/model.h b/sadl/model.h index 822acb68a6baaf1730841f8175f888318c091068..8cdfdb4891efad416e978423ddecc4a27dc26966 100644 --- a/sadl/model.h +++ b/sadl/model.h @@ -179,6 +179,12 @@ template<typename T> std::unique_ptr<layers::Layer<T>> createLayer(int32_t id, l case layers::OperationType::Resize: return std::unique_ptr<layers::Layer<T>>(new layers::Resize<T>{ id, op }); break; + case layers::OperationType::Compare: + return std::unique_ptr<layers::Layer<T>>(new layers::Compare<T>{ id, op }); + break; + case layers::OperationType::Where: + return std::unique_ptr<layers::Layer<T>>(new layers::Where<T>{ id, op }); + break; case layers::OperationType::OperationTypeCount: break; // no default on purpose } @@ -442,7 +448,7 @@ template<typename T> bool Model<T>::init(std::vector<Tensor<T>> &in) op_type[inputs_cnt] = L.layer->op(); // always put data layers first when const layers - if (inputs_cnt > 0 && op_type[inputs_cnt - 1] == layers::OperationType::Const && op_type[inputs_cnt] != layers::OperationType::Const) + if ((inputs_cnt > 0 && op_type[inputs_cnt - 1] == layers::OperationType::Const && op_type[inputs_cnt] != layers::OperationType::Const) && m_data[layer_cnt].layer->op() != layers::OperationType::Where) { std::cerr << "[ERROR] data layers should be first" << std::endl; return false; @@ -708,7 +714,6 @@ template<typename T> void Model<T>::insertCopyLayers() } if (layer_with_current_as_mutable_input.size() > 1) { // need copy layer - assert(layer_with_current_as_mutable_input.size() < 3); // for now. can be removed ? // for current layer L, insert copy layers C just after: x x x L C C xxxx std::vector<typename layers::Layer<T>::Id> id_copy_layers; for (int n = 0; n < (int) layer_with_current_as_mutable_input.size() - 1; ++n) diff --git a/sadl/options.h b/sadl/options.h index f5937306ce8a53b70d57f5e232cd87da32cbe853..356bf556d2a7c3f6686b765f8f2b5273d5ca4548 100644 --- a/sadl/options.h +++ b/sadl/options.h @@ -48,7 +48,7 @@ static constexpr float kSparsifySizeThreshold = 1000.0f; // nothing/-msse42: no simd // -mavx2: avx2 // -mavx2 -mfma: avx2 + fuse multiply/add -// -mavx512bw -mavx512f: avx512 +// -mavx512bw -mavx512f: avx512 -mavx512dq // #define NDEBUG 1 // remove sanity tests // debug diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index fa74de7730c50e336efdcfef4fdbf7714b8deeac..297da10816a4dbdd53c5a6b6eb823c9ba4f0340b 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -35,8 +35,8 @@ add_executable(quantization_test quantization_test.cpp ${HEADER_FILES}) if( UNIX OR MINGW ) set(CMAKE_CXX_FLAGS "-ffast-math -Wall -fstrict-aliasing") set_target_properties(sample_simd256 PROPERTIES COMPILE_FLAGS "-mavx2" ) - set_target_properties(sample_simd512 PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw" ) - set_target_properties(debug_model PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw" ) # must build in SIMD mode to debug SIMD issue + set_target_properties(sample_simd512 PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq" ) + set_target_properties(debug_model PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq" ) # must build in SIMD mode to debug SIMD issue endif() diff --git a/sample/copy.h b/sample/copy.h index 17b1a8c011ccea99343222d94aa2ad6001e90cef..951e3a53011bd3c3c674352bef7901bf5dda3b16 100644 --- a/sample/copy.h +++ b/sample/copy.h @@ -94,6 +94,11 @@ template<typename T> bool copy(const sadl::layers::Layer<float> &layer, sadl::la dynamic_cast<sadl::layers::Resize<T> &>(layerQ).m_mode = dynamic_cast<const sadl::layers::Resize<float> &>(layer).m_mode; dynamic_cast<sadl::layers::Resize<T> &>(layerQ).m_nearest_mode = dynamic_cast<const sadl::layers::Resize<float> &>(layer).m_nearest_mode; break; + case sadl::layers::OperationType::Compare: + dynamic_cast<sadl::layers::Compare<T> &>(layerQ).m_mode = dynamic_cast<const sadl::layers::Compare<float> &>(layer).m_mode; + break; + case sadl::layers::OperationType::Where: + break; // no default to get warning } diff --git a/sample/dumper.h b/sample/dumper.h index 901630c29cb0f4211f05377948c7753018378982..da8475817bd41bca508f60bf095dc9489368db43 100644 --- a/sample/dumper.h +++ b/sample/dumper.h @@ -129,6 +129,17 @@ template<typename T> bool sadl::layers::Resize<T>::dump(std::ostream &file) return true; } +template<typename T> bool sadl::layers::Compare<T>::dump(std::ostream &file) +{ + file.write((const char *) &m_mode, sizeof(m_mode)); + return true; +} + +template<typename T> bool sadl::layers::Where<T>::dump(std::ostream &file) +{ + return true; +} + template<typename T> bool sadl::layers::Layer<T>::dump(std::ostream &file) { // std::cout<<"todo? "<<opName(op_)<<std::endl; diff --git a/sample/naive_quantization.cpp b/sample/naive_quantization.cpp index da505dcc4caa41f57f039bc73581da686ca7a221..911a21e5ac9632095156b5f7e54704c9ef4fe7ef 100644 --- a/sample/naive_quantization.cpp +++ b/sample/naive_quantization.cpp @@ -72,7 +72,8 @@ bool toQuantize(sadl::layers::OperationType::Type type) && type != sadl::layers::OperationType::Identity && type != sadl::layers::OperationType::LeakyRelu && type != sadl::layers::OperationType::MaxPool && type != sadl::layers::OperationType::Relu && type != sadl::layers::OperationType::Reshape && type != sadl::layers::OperationType::Shape && type != sadl::layers::OperationType::Slice && type != sadl::layers::OperationType::Transpose && type != sadl::layers::OperationType::PReLU - && type != sadl::layers::OperationType::ScatterND && type != sadl::layers::OperationType::GridSample && type != sadl::layers::OperationType::Resize; + && type != sadl::layers::OperationType::ScatterND && type != sadl::layers::OperationType::GridSample && type != sadl::layers::OperationType::Resize + && type != sadl::layers::OperationType::Compare && type != sadl::layers::OperationType::Where; } template<typename T> void quantizeTensor(const sadl::Tensor<float> &B, sadl::Tensor<T> &Bq) diff --git a/sample/pytorch.py b/sample/pytorch.py index 455a14d921b4c89da918170968c6463bc410b417..2e34848f340350c5aeb9424f7f8198dda8908529 100644 --- a/sample/pytorch.py +++ b/sample/pytorch.py @@ -74,4 +74,4 @@ inputs_torch = [torch.from_numpy(input0)] inputs_torch[0].requires_grad = True output = model(inputs_torch) print("Output", output) -torch.onnx.export(model, inputs_torch, "./pytorch.onnx") +torch.onnx.export(model, inputs_torch, "./pytorch.onnx", opset_version=10) diff --git a/sample/pytorch_matmult.py b/sample/pytorch_matmult.py index 8f5a5cc5fbde658b5f2c00f4a90929cb17e466d4..cd2c1d230fa7995256170c898ef3a5d61d466865 100644 --- a/sample/pytorch_matmult.py +++ b/sample/pytorch_matmult.py @@ -30,5 +30,5 @@ output = model(inputs_torch) print("Output", output) print(model) torch.onnx.export( - model, inputs_torch, "./pytorch_matmult.onnx", verbose=True, opset_version=14 + model, inputs_torch, "./pytorch_matmult.onnx", verbose=True, opset_version=10 ) diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index 8ada8e88e4a66910599c664ee9706db8014581ee..fa213ae4bed371b6d836a5be28f3a9950721d403 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -12,7 +12,7 @@ add_executable(test test.cpp ${HEADER_FILES}) if( UNIX OR MINGW ) set(CMAKE_CXX_FLAGS "-ffast-math -Wall -fstrict-aliasing") - set_target_properties(test PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512f -mavx512bw" ) + set_target_properties(test PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512f -mavx512bw -mavx512dq" ) endif() diff --git a/utests/check.sh b/utests/check.sh index a06d56130fe956d1ba66af9623b5a3965f9289ef..df65c29fe33d9feb1f3d5680d409fcc74196aad3 100755 --- a/utests/check.sh +++ b/utests/check.sh @@ -18,7 +18,7 @@ for F in $L; do ../utest.sh ../models/${F}.onnx --no_transpose; done -L="conv2d_4_8x8x4_k1x1s1,1_g1_p0,0 conv2d_4_8x8x4_k1x1s1,1_g4_p0,0 conv2d_4_8x8x4_k1x1s2,1_g1_p0,0 conv2d_4_8x8x4_k1x3s1,2_g1_p0,1 conv2d_4_8x8x4_k3x1s1,1_g1_p1,0 conv2d_4_8x8x4_k3x1s1,1_g4_p1,0 conv2d_4_8x8x4_k3x3s1,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,2_g1_p1,1 conv2d_4_8x8x4_k5x5s1,1_g1_p2,2 conv2d_4_8x8x4_k5x5s1,1_g4_p2,2 conv2d_4_8x8x4_k5x5s2,1_g1_p2,2 conv2d_4_8x9x4_k1x1s2,1_g1_p0,0 conv2d_4_8x9x4_k3x1s1,1_g4_p1,0 conv2d_4_8x9x4_k3x3s1,1_g4_p1,1 conv2d_4_8x9x4_k3x3s2,1_g1_p1,1 conv2d_4_8x9x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k1x1s1,1_g1_p0,0 conv2d_4_9x8x4_k1x1s2,1_g1_p0,0 conv2d_4_9x8x4_k1x3s1,2_g1_p0,1 conv2d_4_9x8x4_k3x1s1,1_g1_p1,0 conv2d_4_9x8x4_k3x3s1,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k5x5s1,1_g1_p2,2 conv2d_4_9x8x4_k5x5s2,1_g1_p2,2 conv2d_4_9x9x4_k1x3s1,2_g1_p0,1 repeated_conv slice_pytorch slice_inf_pytorch slice_chw_pytorch prelu_multiple_alpha prelu_single_alpha scatternd_c_pytorch scatternd_hwc_with_conv_pytorch gridsample_bilinear gridsample_nearest gridsample_bilinear_conv gridsample_nearest_conv conv2dt_32_8x8x32_k3,3_s2,2_p1,1_op1,1 conv2dt_32_8x8x32_k4,4_s2,2_p1,1_op0,0 conv2dt_32_8x8x32_k5,5_s2,2_p2,2_op1,1 resize_bilinear_up2_pytorch resize_nearest_up2_pytorch resize_bilinear_up2_16x16x64_pytorch"; +L="conv2d_4_8x8x4_k1x1s1,1_g1_p0,0 conv2d_4_8x8x4_k1x1s1,1_g4_p0,0 conv2d_4_8x8x4_k1x1s2,1_g1_p0,0 conv2d_4_8x8x4_k1x3s1,2_g1_p0,1 conv2d_4_8x8x4_k3x1s1,1_g1_p1,0 conv2d_4_8x8x4_k3x1s1,1_g4_p1,0 conv2d_4_8x8x4_k3x3s1,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,2_g1_p1,1 conv2d_4_8x8x4_k5x5s1,1_g1_p2,2 conv2d_4_8x8x4_k5x5s1,1_g4_p2,2 conv2d_4_8x8x4_k5x5s2,1_g1_p2,2 conv2d_4_8x9x4_k1x1s2,1_g1_p0,0 conv2d_4_8x9x4_k3x1s1,1_g4_p1,0 conv2d_4_8x9x4_k3x3s1,1_g4_p1,1 conv2d_4_8x9x4_k3x3s2,1_g1_p1,1 conv2d_4_8x9x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k1x1s1,1_g1_p0,0 conv2d_4_9x8x4_k1x1s2,1_g1_p0,0 conv2d_4_9x8x4_k1x3s1,2_g1_p0,1 conv2d_4_9x8x4_k3x1s1,1_g1_p1,0 conv2d_4_9x8x4_k3x3s1,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k5x5s1,1_g1_p2,2 conv2d_4_9x8x4_k5x5s2,1_g1_p2,2 conv2d_4_9x9x4_k1x3s1,2_g1_p0,1 repeated_conv slice_pytorch slice_inf_pytorch slice_chw_pytorch prelu_multiple_alpha prelu_single_alpha scatternd_c_pytorch scatternd_hwc_with_conv_pytorch gridsample_bilinear gridsample_nearest gridsample_bilinear_conv gridsample_nearest_conv conv2dt_32_8x8x32_k3,3_s2,2_p1,1_op1,1 conv2dt_32_8x8x32_k4,4_s2,2_p1,1_op0,0 conv2dt_32_8x8x32_k5,5_s2,2_p2,2_op1,1 resize_bilinear_up2_pytorch resize_nearest_up2_pytorch resize_bilinear_up2_16x16x64_pytorch prelu_single_alpha_c32 prelu_multiple_alpha_c32 compare_less compare_greater where_constA_less where_constA_greater where_constB_less where_constB_greater"; for F in $L; do ../utest.sh ../models/${F}.onnx; done diff --git a/utests/models/compare_greater.onnx b/utests/models/compare_greater.onnx new file mode 100644 index 0000000000000000000000000000000000000000..6eb7484a56bca3085d7bed78437e502d61adafd3 Binary files /dev/null and b/utests/models/compare_greater.onnx differ diff --git a/utests/models/compare_less.onnx b/utests/models/compare_less.onnx new file mode 100644 index 0000000000000000000000000000000000000000..131b2b003d56c9b5c82ef1a1f085db8a7ea6e910 Binary files /dev/null and b/utests/models/compare_less.onnx differ diff --git a/utests/models/prelu_multiple_alpha_c32.onnx b/utests/models/prelu_multiple_alpha_c32.onnx new file mode 100644 index 0000000000000000000000000000000000000000..ba6cc15fc039e425cb2ef347e2e421113463b7e3 Binary files /dev/null and b/utests/models/prelu_multiple_alpha_c32.onnx differ diff --git a/utests/models/prelu_single_alpha_c32.onnx b/utests/models/prelu_single_alpha_c32.onnx new file mode 100644 index 0000000000000000000000000000000000000000..63c7c301c8d48cddfed4c3d364dd87e6eb656620 Binary files /dev/null and b/utests/models/prelu_single_alpha_c32.onnx differ diff --git a/utests/models/slice_chw_pytorch.py b/utests/models/slice_chw_pytorch.py index c06e54292ede9cce0800874da52a978dd02febc5..714e0b30b48fb333e5a40309af8222dba2e23377 100644 --- a/utests/models/slice_chw_pytorch.py +++ b/utests/models/slice_chw_pytorch.py @@ -25,4 +25,4 @@ output = model(input) # Output onnx model path onnx_model_path = os.path.join(os.path.dirname(sys.argv[0]), "slice_chw_pytorch.onnx") # Convert to onnx -torch.onnx.export(model, input, onnx_model_path) +torch.onnx.export(model, input, onnx_model_path, opset_version=10) diff --git a/utests/models/where_constA_greater.onnx b/utests/models/where_constA_greater.onnx new file mode 100644 index 0000000000000000000000000000000000000000..a53e8dee567caae8086335b77839805ef6cfa1bf Binary files /dev/null and b/utests/models/where_constA_greater.onnx differ diff --git a/utests/models/where_constA_less.onnx b/utests/models/where_constA_less.onnx new file mode 100644 index 0000000000000000000000000000000000000000..ba66e26ebc624835d2679a360169677ef8da9706 Binary files /dev/null and b/utests/models/where_constA_less.onnx differ diff --git a/utests/models/where_constB_greater.onnx b/utests/models/where_constB_greater.onnx new file mode 100644 index 0000000000000000000000000000000000000000..70c0cde2b79221bdcc22b0a215756e1c7dfabdb5 Binary files /dev/null and b/utests/models/where_constB_greater.onnx differ diff --git a/utests/models/where_constB_less.onnx b/utests/models/where_constB_less.onnx new file mode 100644 index 0000000000000000000000000000000000000000..d653b44d934d77c88d5885f9d634a7b9cf2f7184 Binary files /dev/null and b/utests/models/where_constB_less.onnx differ diff --git a/utests/onnx_inference.py b/utests/onnx_inference.py index dfbaf292bed6445f609b08d573326dc7009790b2..66bd0c8a9b56bc8d1756bee631a48a42e9561678 100644 --- a/utests/onnx_inference.py +++ b/utests/onnx_inference.py @@ -66,6 +66,8 @@ if args.output is not None: f.write("{} ".format(i)) f.write("\n") for x in np.nditer(o): + if np.issubdtype(x.dtype, np.bool_): + x = x.astype(float) f.write("{} ".format(x)) f.write("\n") print("[INFO] results file in {}".format(args.output))