diff --git a/converter/main.py b/converter/main.py
index 310a03ac555a0f05de679a5a4f3eed3eb9e77bf4..e3b11f9a3a5461a2a815a5447f355508752ced13 100644
--- a/converter/main.py
+++ b/converter/main.py
@@ -125,6 +125,8 @@ class OPTYPE(IntEnum):
     ScatterND = (22,)
     GridSample = (23,)
     Resize = (24,)
+    Compare = (25,)
+    Where = (26,)
 
     # "BatchMatMulV2" did not exist in Tensorflow 1.9. It exists in
     # Tensorflow 1.15.
@@ -1109,6 +1111,98 @@ def parse_graph_node(
         myGraph[node.output[0]]["additional"]["mode"] = mode
         myGraph[node.output[0]]["additional"]["nearest_mode"] = nearest_mode
         map_onnx_to_myGraph[node.output[0]] = node.output[0]
+    
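+    # ONNX Less/Greater both lower to a single SADL Compare layer; the
+    # "mode" field selects the comparison (0 = less-than, 1 = greater-than,
+    # mirroring the Compare_mode enum in sadl/layer_compare.h).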
+    elif node.op_type == "Less":
+        additional = {}
+        additional["data"] = node
+        if is_constant(node.input[1], model_onnx.graph.initializer):
+            n2 = getNodesWithOutput(node.input[1], model_onnx)      #constant
+            additional["dims"], additional["raw_data"], additional[
+                "dtype"
+            ] = extract_additional_data(
+                node.input[1],
+                False, 
+                model_onnx.graph,
+                verbose,
+            )
+            myGraph[node.input[1]] = {}
+            myGraph[node.input[1]]["op_type"] = OPTYPE.Const
+            myGraph[node.input[1]]["inputs"] = []
+            myGraph[node.input[1]]["additional"] = additional
+            map_onnx_to_myGraph[node.input[1]] = node.input[1]
+        
+        myGraph[node.output[0]] = {}
+        myGraph[node.output[0]]["op_type"] = OPTYPE.Compare
+        myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [map_onnx_to_myGraph[node.input[1]]]
+        myGraph[node.output[0]]["additional"] = {}
+        myGraph[node.output[0]]["additional"]["data"] = node
+        myGraph[node.output[0]]["additional"]["mode"] = 0
+        map_onnx_to_myGraph[node.output[0]] = node.output[0]
+
+    elif node.op_type == "Greater":
+        additional = {}
+        additional["data"] = node
+        if is_constant(node.input[1], model_onnx.graph.initializer):
+            n2 = getNodesWithOutput(node.input[1], model_onnx)      #constant
+            additional["dims"], additional["raw_data"], additional[
+                "dtype"
+            ] = extract_additional_data(
+                node.input[1],
+                False, 
+                model_onnx.graph,
+                verbose,
+            )
+            myGraph[node.input[1]] = {}
+            myGraph[node.input[1]]["op_type"] = OPTYPE.Const
+            myGraph[node.input[1]]["inputs"] = []
+            myGraph[node.input[1]]["additional"] = additional
+            map_onnx_to_myGraph[node.input[1]] = node.input[1]
+        
+        myGraph[node.output[0]] = {}
+        myGraph[node.output[0]]["op_type"] = OPTYPE.Compare
+        myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [map_onnx_to_myGraph[node.input[1]]]
+        myGraph[node.output[0]]["additional"] = {}
+        myGraph[node.output[0]]["additional"]["data"] = node
+        myGraph[node.output[0]]["additional"]["mode"] = 1
+        map_onnx_to_myGraph[node.output[0]] = node.output[0]
+        
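+    # Where keeps the ONNX operand order (condition, X, Y); constant branches
+    # are materialized as Const layers so the runtime sees three inputs.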
+    elif node.op_type == "Where":
+        if is_constant(node.input[1], model_onnx.graph.initializer):
+            additional = {}
+            additional["data"] = node
+            n2 = getNodesWithOutput(node.input[1], model_onnx)
+            additional["dims"], additional["raw_data"], additional[
+                "dtype"
+            ] = extract_additional_data(node.input[1], False, model_onnx.graph, verbose)
+            myGraph[node.input[1]] = {}
+            myGraph[node.input[1]]["op_type"] = OPTYPE.Const
+            myGraph[node.input[1]]["inputs"] = []
+            myGraph[node.input[1]]["additional"] = additional
+            map_onnx_to_myGraph[node.input[1]] = node.input[1]
+        if is_constant(node.input[2], model_onnx.graph.initializer):
+            additional = {}
+            additional["data"] = node
+            n2 = getNodesWithOutput(node.input[2], model_onnx)
+            additional["dims"], additional["raw_data"], additional[
+                "dtype"
+            ] = extract_additional_data(node.input[2], False, model_onnx.graph, verbose)
+            myGraph[node.input[2]] = {}
+            myGraph[node.input[2]]["op_type"] = OPTYPE.Const
+            myGraph[node.input[2]]["inputs"] = []
+            myGraph[node.input[2]]["additional"] = additional
+            map_onnx_to_myGraph[node.input[2]] = node.input[2]
+        
+        myGraph[node.output[0]] = {}
+        myGraph[node.output[0]]["op_type"] = OPTYPE.Where
+        myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [map_onnx_to_myGraph[node.input[1]]]+[map_onnx_to_myGraph[node.input[2]]]
+        myGraph[node.output[0]]["additional"] = {}
+        myGraph[node.output[0]]["additional"]["data"] = node
+        map_onnx_to_myGraph[node.output[0]] = node.output[0]
 
     else:
         raise Exception("[ERROR] node not supported:\n{})".format(node))
@@ -1412,6 +1506,11 @@ def dump_onnx(graph, my_inputs, my_outputs, output_filename, verbose=False):
                     print("#\t nearest_mode", node["additional"]["nearest_mode"])
                 f.write(struct.pack("i", int(node["additional"]["nearest_mode"])))
 
+            elif node["op_type"] == OPTYPE.Compare:
+                if verbose:
+                    print("#\t mode", node["additional"]["mode"])
+                f.write(struct.pack("i", int(node["additional"]["mode"])))
+
+
             if (
                 node["op_type"] == OPTYPE.Conv2D
                 or node["op_type"] == OPTYPE.Conv2DTranspose
diff --git a/sadl/layer.h b/sadl/layer.h
index 9a5b492f0a453df3983224585c5c14b89ad87f2f..3b6737d1801f7cc47a14f35027e6d56594f0f526 100644
--- a/sadl/layer.h
+++ b/sadl/layer.h
@@ -72,7 +72,9 @@ struct OperationType
     ScatterND          = 22,
     GridSample         = 23,
     Resize             = 24,
-    OperationTypeCount = 25
+    Compare            = 25,
+    Where              = 26,
+    OperationTypeCount = 27
   };
 };
 
diff --git a/sadl/layer_compare.h b/sadl/layer_compare.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdd02652d003cbea82804e6a84268075b3bb9660
--- /dev/null
+++ b/sadl/layer_compare.h
@@ -0,0 +1,183 @@
+/* The copyright in this software is being made available under the BSD
+ * License, included below. This software may be subject to other third party
+ * and contributor rights, including patent rights, and no such rights are
+ * granted under this license.
+ *
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+#include "layer.h"
+
+namespace sadl
+{
+namespace layers
+{
+enum class Compare_mode  
+{
+    LessThan,
+    GreaterThan
+};
+template<typename T> class Compare : public Layer<T>
+{
+public:
+  using Layer<T>::Layer;
+  using Layer<T>::m_out;   // to avoid this->
+  using Layer<T>::m_initDone;
+
+  virtual bool apply(std::vector<Tensor<T> *> &in) override;
+  virtual bool init(const std::vector<Tensor<T> *> &in) override;
+
+protected:
+  virtual bool loadInternal(std::istream &file, Version) override;
+  bool apply_less(std::vector<Tensor<T> *> &in);
+  bool apply_greater(std::vector<Tensor<T> *> &in);
+
+  Compare_mode m_mode;             
+  DUMP_MODEL_EXT;
+};
+
+template<typename T> bool Compare<T>::apply(std::vector<Tensor<T> *> &in)
+{
+  assert(in.size() == 2);
+  assert(in[0]->dims() == m_out.dims() || (in[0]->dims().size() == 1 && in[0]->dims()[0] == 1));
+  assert(in[1]->dims() == m_out.dims() || (in[1]->dims().size() == 1 && in[1]->dims()[0] == 1));
+  if (m_mode == Compare_mode::LessThan)
+    return apply_less(in);
+  else if (m_mode == Compare_mode::GreaterThan)
+    return apply_greater(in);
+  else
+    return false;
+}
+
+template<typename T> bool Compare<T>::apply_less(std::vector<Tensor<T> *> &in)
+{
+  const Tensor<T> &A = *in[0];
+  const Tensor<T> &B = *in[1];
+  const int &A_q = A.quantizer;
+  const int &B_q = B.quantizer;
+  const int A_shift = std::max(0, B_q - A_q);
+  const int B_shift = std::max(0, A_q - B_q);
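+  // the operand with the smaller quantizer is left-shifted so both values are
+  // compared on the same fixed-point scale (both shifts are zero for float)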
+  m_out.quantizer = 0;   // boolean result tensor
+  if(B.dims().size() == 1)
+  {
+    for (int i = 0; i < m_out.size(); i++)
+    {
+      T A_i = A[i];
+      T B_i = B[0];
+      ComputationType<T>::shift_left(A_i, A_shift);//quantization
+      ComputationType<T>::shift_left(B_i, B_shift);//quantization
+      T z = A_i < B_i;
+      COUNTERS(z);
+      m_out[i] = z;
+    }
+  }
+  else
+  {
+    for (int i = 0; i < m_out.size(); i++)
+    {
+      T A_i = A[i];
+      T B_i = B[i];
+      ComputationType<T>::shift_left(A_i, A_shift);//quantization
+      ComputationType<T>::shift_left(B_i, B_shift);//quantization
+      T z = A_i < B_i;
+      COUNTERS(z);
+      m_out[i] = z;
+    }
+  }
+  return true;
+}
+
+template<typename T> bool Compare<T>::apply_greater(std::vector<Tensor<T> *> &in)
+{
+  const Tensor<T> &A = *in[0];
+  const Tensor<T> &B = *in[1];
+  const int &A_q = A.quantizer;
+  const int &B_q = B.quantizer;
+  const int A_shift = std::max(0, B_q - A_q);
+  const int B_shift = std::max(0, A_q - B_q);
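+  // same quantizer alignment as in apply_less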
+  m_out.quantizer = 0;   // boolean result tensor
+  if(B.dims().size() == 1)
+  {
+    for (int i = 0; i < m_out.size(); i++)
+    {
+      T A_i = A[i];
+      T B_i = B[0];
+      ComputationType<T>::shift_left(A_i, A_shift);//quantization
+      ComputationType<T>::shift_left(B_i, B_shift);//quantization
+      T z = A_i > B_i;
+      COUNTERS(z);
+      m_out[i] = z;
+    }
+  }
+  else
+  {
+    for (int i = 0; i < m_out.size(); i++)
+    {
+      T A_i = A[i];
+      T B_i = B[i];
+      ComputationType<T>::shift_left(A_i, A_shift);//quantization
+      ComputationType<T>::shift_left(B_i, B_shift);//quantization
+      T z = A_i > B_i;
+      COUNTERS(z);
+      m_out[i] = z;
+    }
+  }
+  return true;
+}
+
+template<typename T> bool Compare<T>::init(const std::vector<Tensor<T> *> &in)
+{
+  if (in.size() != 2)
+    return false;
+  m_out.resize(in[0]->dims());   // output shape follows the first (data) input
+  m_initDone = true;
+  return true;
+}
+
+template<typename T> bool Compare<T>::loadInternal(std::istream &file, Version)
+{
+  int32_t x = 0;
+  file.read((char *) &x, sizeof(x));
+  if (x == (int32_t) Compare_mode::LessThan)
+    m_mode = Compare_mode::LessThan;
+  else if (x == (int32_t) Compare_mode::GreaterThan)
+    m_mode = Compare_mode::GreaterThan;
+  else
+  {
+    std::cerr << "[ERROR] invalid mode: " << x << std::endl;
+    return false;
+  }
+  SADL_DBG(std::cout << "  - mode: " << x << std::endl);
+  return true;
+}
+
+}   // namespace layers
+}   // namespace sadl
diff --git a/sadl/layer_conv2d.h b/sadl/layer_conv2d.h
index 9a51098539f3f387e487c930b1bfc133af4819d4..be6be41b46e68ea40fe424502c773f0ae9d436c7 100644
--- a/sadl/layer_conv2d.h
+++ b/sadl/layer_conv2d.h
@@ -3,7 +3,7 @@
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
  *
- * Copyright (c) 2010-2023, ITU/ISO/IEC
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -126,6 +126,9 @@ protected:
 
   }
 
+  template<int in_D, int ihalf_size, int jhalf_size> void simd32_conv2d_ixj_s11_g1_d_core(const Tensor<T> &A, const Tensor<T> &kernel) {
+    simd16_conv2d_ixj_s11_g1_d_core<in_D, ihalf_size, jhalf_size>(A, kernel);   // generic fallback; an AVX512BW specialization overrides this for int16_t
+  }
 #endif
   DUMP_MODEL_EXT;
 };
diff --git a/sadl/layer_conv2d_ixj.h b/sadl/layer_conv2d_ixj.h
index be79c29cc1edcab0ca3c588bd5e7b81b5b0d2beb..e8e4b796400560a48589081011706da74beb6a33 100644
--- a/sadl/layer_conv2d_ixj.h
+++ b/sadl/layer_conv2d_ixj.h
@@ -3,7 +3,7 @@
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
  *
- * Copyright (c) 2010-2023, ITU/ISO/IEC
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -315,7 +315,7 @@ template<typename T> template<int in_D, int ihalf_size, int jhalf_size> void Con
   int           start_h{ ihalf_size - top };
   int           start_w{ jhalf_size - left };
 #if DEBUG_SIMD && __AVX2__
-  std::cout << "\n[WARN] generic version conv (s=1, g=1, known kernel)" << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters
+  std::cout << "\n[WARN] generic version conv2d_ixj_s11_g1_d_core (s=1, g=1, known kernel)" << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters
             << " s=[" << s_w << ' ' << s_h << "]  " << in_H << 'x' << in_W << " "
             << "?? kMAC" << std::endl;
 #endif
@@ -406,7 +406,7 @@ template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<float>
 #endif
 
 #if __AVX512BW__
-template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_t>::conv2d_ixj_s11_g1_d_core(const Tensor<int16_t> &A, const Tensor<int16_t> &kernel)
+template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_t>::simd32_conv2d_ixj_s11_g1_d_core(const Tensor<int16_t> &A, const Tensor<int16_t> &kernel)
 {
   static_assert(in_D % 32 == 0, "Should be used with mod32 filters.");
   using T=int16_t;
@@ -460,6 +460,70 @@ template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_
   }
 }
 #endif
+#if __AVX2__
+template<> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<int16_t>::simd16_conv2d_ixj_s11_g1_d_core(const Tensor<int16_t> &A, const Tensor<int16_t> &kernel)
+{
+  static_assert(in_D % 16 == 0, "Should be used with mod16 filters.");
+  using T=int16_t;
+  constexpr int im_nb = 0;
+  constexpr int s_h = 1;
+  constexpr int s_w = 1;
+  const int     nb_filters{ kernel.dims()[2] };
+  const int     shift     = kernel.quantizer + m_q;
+  const int     top{ m_pads[0] };
+  const int     left{ m_pads[1] };
+  int           in_H{ A.dims()[1] };
+  int           in_W{ A.dims()[2] };
+  int           start_h{ ihalf_size - top };
+  int           start_w{ jhalf_size - left };
+#if DEBUG_SIMD && __AVX512BW__
+  if (in_D % 32 == 0 || in_D >= 32)
+    std::cout << "\n[WARN] avx2 (instead of avx512) version simd16_conv2d_ixj_s11_g1_d_core (s=1, g=1, known kernel)" << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D << " outD=" << nb_filters
+              << " s=[" << s_w << ' ' << s_h << "]  " << in_H << 'x' << in_W << " "
+              << "?? kMAC" << std::endl;
+#endif
+
+  assert(start_h + s_h - ihalf_size >= 0);
+  assert(start_w + s_w - jhalf_size >= 0);
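+  // walk the output pixels; for each filter, _mm256_madd_epi16 multiplies 16
+  // int16 kernel/input pairs and pre-adds adjacent products into 8 int32
+  // lanes accumulated in s, which sum32_int16 then reduces to a scalar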
+  for (int im_i = start_h + s_h; im_i < in_H - ihalf_size; im_i += s_h)
+  {
+    for (int im_j = start_w + s_w; im_j < in_W - jhalf_size; im_j += s_w)
+    {
+      for (int filter = 0; filter < nb_filters; ++filter)
+      {
+        __m256i s = _mm256_setzero_si256();
+        for (int filter_i = -ihalf_size; filter_i <= ihalf_size; ++filter_i)
+        {   // fixed
+          for (int filter_j = -jhalf_size; filter_j <= jhalf_size; ++filter_j)
+          {   // fixed
+            for (int filter_d = 0; filter_d < in_D; filter_d += 16)
+            {
+              const int      ii   = im_i + filter_i;
+              const int      jj   = im_j + filter_j;
+              const int      ki   = ihalf_size + filter_i;
+              const int      kj   = jhalf_size + filter_j;
+              const __m256i *kptr = (const __m256i *) kernel.addr(ki, kj, filter, filter_d);
+              const __m256i  k0   = _mm256_load_si256(kptr);
+              const __m256i *aptr = (const __m256i *) A.addr(im_nb, ii, jj, filter_d);
+              const __m256i  v0   = _mm256_load_si256(aptr);
+
+              const __m256i mad0 = _mm256_madd_epi16(k0, v0);   // res in si32
+              s                  = _mm256_add_epi32(s, mad0);
+            }
+          }
+        }
+        typename ComputationType<T>::type z = (sum32_int16(s) >> shift);
+        COUNTERS(z);
+        SATURATE(z);
+        m_out(im_nb, im_i / s_h, im_j / s_w, filter) = z;
+      }
+    }
+  }
+}
+#endif
 template<typename T> template<int in_D, int ihalf_size, int jhalf_size> void Conv2D<T>::conv2d_ixj_s11_gD_d_core(const Tensor<T> &A, const Tensor<T> &kernel)
 {
   constexpr int nb_filters = in_D;
@@ -474,7 +538,7 @@ template<typename T> template<int in_D, int ihalf_size, int jhalf_size> void Con
   int           start_h{ ihalf_size - top };
   int           start_w{ jhalf_size - left };
 #if DEBUG_SIMD && __AVX2__
-  std::cout << "\n[WARN] partially generic version conv " << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D
+  std::cout << "\n[WARN] partially generic version conv2d_ixj_s11_gD_d_core " << kernel.dims()[0] << "x" << kernel.dims()[1] << "g" << m_groups << " inD=" << in_D
             << " outD=" << nb_filters << " s=[" << s_w << ' ' << s_h << "]  " << in_H << 'x' << in_W << " "
             << "?? kMAC" << std::endl;
 #endif
@@ -515,9 +579,11 @@ template<typename T> template<int s_h, int s_w> void Conv2D<T>::conv2d_ixj_s_cor
   const int nb_filters{ kernel.dims()[2] };
 
   // grouped conv with stride 1 and inD==outD
-#if __AVX512F__ || __AVX512BW__
+#if __AVX2__
+#define CONV_MOD32 simd32_conv2d_ixj_s11_g1_d_core
 #define CONV_MOD16 simd16_conv2d_ixj_s11_g1_d_core
 #else
+#define CONV_MOD32 conv2d_ixj_s11_g1_d_core
 #define CONV_MOD16 conv2d_ixj_s11_g1_d_core
 #endif
   if (in_D == m_groups && in_D == nb_filters && s_h == 1 && s_w == 1)
@@ -577,12 +643,16 @@ template<typename T> template<int s_h, int s_w> void Conv2D<T>::conv2d_ixj_s_cor
       constexpr int kj = 1;
       switch (in_D)
       {
+      case 16:
+        CONV_MOD16<16, ki, kj>(A, kernel);
+        return;
+        break;
       case 32:
-        CONV_MOD16<32, ki, kj>(A, kernel);
+        CONV_MOD32<32, ki, kj>(A, kernel);
         return;
         break;
       case 64:
-        CONV_MOD16<64, ki, kj>(A, kernel);
+        CONV_MOD32<64, ki, kj>(A, kernel);
         return;
         break;
       default:   // do default
@@ -595,12 +662,16 @@ template<typename T> template<int s_h, int s_w> void Conv2D<T>::conv2d_ixj_s_cor
       constexpr int kj = 0;
       switch (in_D)
       {
+      case 16:
+        CONV_MOD16<16, ki, kj>(A, kernel);
+        return;
+        break;
       case 32:
-        CONV_MOD16<32, ki, kj>(A, kernel);
+        CONV_MOD32<32, ki, kj>(A, kernel);
         return;
         break;
       case 64:
-        CONV_MOD16<64, ki, kj>(A, kernel);
+        CONV_MOD32<64, ki, kj>(A, kernel);
         return;
         break;
       default:   // do default
diff --git a/sadl/layer_gridsample.h b/sadl/layer_gridsample.h
index a8ff2ae1a7c567138ef27a8da0955d5263826446..62a37f2f7e2053c3ebaaec10c0fd67a6ddbfeb4b 100644
--- a/sadl/layer_gridsample.h
+++ b/sadl/layer_gridsample.h
@@ -3,7 +3,7 @@
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
  *
- * Copyright (c) 2010-2023, ITU/ISO/IEC
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -214,12 +214,12 @@ template<> inline void GridSample<float>::gs_denormalize(float &x, int length)
 {
   if (m_mode == gridsample_mode_nearest)
   {
-    x = (x + 1) * (length - 1) / 2.0;
+    x = (x + 1) * (length - 1) / 2.0f;
     x = round(x);
   }
   else if (m_mode == gridsample_mode_bilinear)
   {
-    x = (x + 1) * (length - 1) / 2.0;
+    x = (x + 1) * (length - 1) / 2.0f;
   }
 }
 
@@ -244,8 +244,8 @@ template<> inline void GridSample<float>::get_bilinear_coeffs(float y, float x,
   float &coeff11 = coeffs[0], &coeff12 = coeffs[1], &coeff21 = coeffs[2], &coeff22 = coeffs[3];
   int   &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3];
 
-  x_ori_left   = floor(x);
-  y_ori_top    = floor(y);
+  x_ori_left   = (int)floor(x);
+  y_ori_top    = (int)floor(y);
   x_ori_right  = x_ori_left + 1;
   y_ori_bottom = y_ori_top + 1;
   float dy2    = y_ori_bottom - y;
diff --git a/sadl/layer_leakyrelu.h b/sadl/layer_leakyrelu.h
index e4a3e7bc25a77a6674bae76e7c96895413b55de0..11c24d8c0d3de51a5e7f5623a2e976124349d934 100644
--- a/sadl/layer_leakyrelu.h
+++ b/sadl/layer_leakyrelu.h
@@ -3,7 +3,7 @@
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
  *
- * Copyright (c) 2010-2023, ITU/ISO/IEC
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,64 +32,12 @@
  */
 #pragma once
 #include "layer.h"
+#include "layer_prelu.h"
 
 namespace sadl
 {
 namespace layers
 {
-template<typename T> class LeakyRelu : public Layer<T>
-{
-public:
-  using Layer<T>::Layer;
-  using Layer<T>::m_out;   // to avoid this->
-  using Layer<T>::m_initDone;
-
-  virtual bool apply(std::vector<Tensor<T> *> &in) override;
-  virtual bool init(const std::vector<Tensor<T> *> &in) override;
-  virtual bool mutateInput() const override { return true; }
-
-protected:
-  virtual bool loadInternal(std::istream &file, Version) override;
-};
-
-template<typename T> bool LeakyRelu<T>::apply(std::vector<Tensor<T> *> &in)
-{
-  assert(in.size() == 2);
-  assert(in[0]->dims() == m_out.dims());
-  const Tensor<T> &A = *in[1];
-  swap(*in[0], m_out);
-  // keep same qunatiz as input
-  const typename ComputationType<T>::type alpha = A[0];
-
-  const int alpha_q = A.quantizer;
-  for (auto &x: m_out)
-  {
-    if (x < 0)
-    {
-      typename ComputationType<T>::type z = x * alpha;
-      ComputationType<T>::quantize(z, alpha_q);
-      COUNTERS(z);
-      COUNTERS_MAC(z);
-      SATURATE(z); 
-      x = z;
-    } else {
-      COUNTERS_MAC_NOP(1);
-    }
-  }
-
-  return true;
-}
-
-template<typename T> bool LeakyRelu<T>::init(const std::vector<Tensor<T> *> &in)
-{
-  if (in.size() != 2)
-    return false;
-  m_out.resize(in[0]->dims());
-  m_initDone = true;
-  return true;
-}
-
-template<typename T> bool LeakyRelu<T>::loadInternal(std::istream &, Version) { return true; }
-
+template<typename T>  using LeakyRelu=PReLU<T>;
 }   // namespace layers
 }   // namespace sadl
diff --git a/sadl/layer_prelu.h b/sadl/layer_prelu.h
index f943437804d7a08de1c9c42b8c9973b00acc4448..2d139579dfa0a0589d60065fd2242290d300381d 100644
--- a/sadl/layer_prelu.h
+++ b/sadl/layer_prelu.h
@@ -3,7 +3,7 @@
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
  *
- * Copyright (c) 2010-2023, ITU/ISO/IEC
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -49,50 +49,347 @@ public:
   virtual bool mutateInput() const override { return true; }
 
 protected:
-  virtual bool loadInternal(std::istream &file, Version) override;
+  virtual bool                   loadInternal(std::istream &file, Version) override;
+  template<bool multialpha> bool apply_scalar(std::vector<Tensor<T> *> &in);
+#if __AVX2__
+  template<bool multialpha> bool apply_simd256(std::vector<Tensor<T> *> &in);
+#endif
+#if __AVX512F__ || __AVX512BW__
+  template<bool multialpha> bool apply_simd512(std::vector<Tensor<T> *> &in);
+#endif
 };
 
 template<typename T> bool PReLU<T>::apply(std::vector<Tensor<T> *> &in)
+{
+  assert(in.size() == 2);
+  assert(in[0]->dims() == m_out.dims());
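+  // dispatch to the widest SIMD variant the element type and tensor sizes
+  // allow; <multialpha> selects per-channel alpha vs. a single scalar alpha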
+#if __AVX512F__
+  if (std::is_same<T, float>::value && in[0]->size() % 16 == 0)
+  {
+    if (in[1]->size() == 1)
+    {
+      return apply_simd512<false>(in);
+    }
+    else if (in[1]->size() % 16 == 0)
+    {
+      return apply_simd512<true>(in);
+    }
+  }
+#endif
+#if __AVX512BW__
+  if (std::is_same<T, int16_t>::value && in[0]->size() % 32 == 0)
+  {
+    if (in[1]->size() == 1)
+    {
+      return apply_simd512<false>(in);
+    }
+    else if (in[1]->size() % 32 == 0)
+    {
+      return apply_simd512<true>(in);
+    }
+  }
+#endif
+#if __AVX2__
+  if (std::is_same<T, float>::value && in[0]->size() % 8 == 0)
+  {
+    if (in[1]->size() == 1)
+    {
+      return apply_simd256<false>(in);
+    }
+    else if (in[1]->size() % 8 == 0)
+    {
+      return apply_simd256<true>(in);
+    }
+  }
+
+  if (std::is_same<T, int16_t>::value && in[0]->size() % 16 == 0)
+  {
+    if (in[1]->size() == 1)
+    {
+      return apply_simd256<false>(in);
+    }
+    else if (in[1]->size() % 16 == 0)
+    {
+      return apply_simd256<true>(in);
+    }
+  }
+#endif
+  if (in[1]->size() == 1)
+  {
+    return apply_scalar<false>(in);
+  }
+  else
+  {
+    return apply_scalar<true>(in);
+  }
+}
+
+template<typename T> template<bool multialpha> bool PReLU<T>::apply_scalar(std::vector<Tensor<T> *> &in)   // without simd
 {
   const int in_N{ in[0]->dims()[0] };
   const int in_H{ in[0]->dims()[1] };
   const int in_W{ in[0]->dims()[2] };
   const int in_C{ in[0]->dims()[3] };
 
-  assert(in.size() == 2);
-  assert(in[0]->dims() == m_out.dims());
   const Tensor<T> &A = *in[1];
   swap(*in[0], m_out);
   // keep same qunatiz as input
   const int alpha_q = A.quantizer;
-  for (int n_nb = 0; n_nb < in_N; n_nb++)
+  if (multialpha)
   {
-    for (int c_nb = 0; c_nb < in_C; c_nb++)
+    for (int n_nb = 0; n_nb < in_N; n_nb++)
     {
-      // A.dims()[0] == 1, means all channels share the same alpha parameter
-      const typename ComputationType<T>::type alpha = (A.dims()[0] == 1) ? A(0, 0, 0) : A(c_nb, 0, 0);
-      for (int h_nb = 0; h_nb < in_H; h_nb++)
+      for (int c_nb = 0; c_nb < in_C; c_nb++)
       {
-        for (int w_nb = 0; w_nb < in_W; w_nb++)
+        // A.dims()[0] == 1, means all channels share the same alpha parameter
+        const typename ComputationType<T>::type alpha = (A.dims()[0] == 1) ? A(0, 0, 0) : A(c_nb, 0, 0);
+        for (int h_nb = 0; h_nb < in_H; h_nb++)
         {
-          if (m_out(n_nb, h_nb, w_nb, c_nb) < 0)
+          for (int w_nb = 0; w_nb < in_W; w_nb++)
           {
-            typename ComputationType<T>::type z = m_out(n_nb, h_nb, w_nb, c_nb) * alpha;
-            ComputationType<T>::quantize(z, alpha_q);
-            COUNTERS(z);
-            COUNTERS_MAC(z);
-            SATURATE(z);
-            m_out(n_nb, h_nb, w_nb, c_nb) = z;
-          } else {
-            COUNTERS_MAC_NOP(1);
-          } 
+            if (m_out(n_nb, h_nb, w_nb, c_nb) < 0)
+            {
+              typename ComputationType<T>::type z = m_out(n_nb, h_nb, w_nb, c_nb) * alpha;
+              ComputationType<T>::quantize(z, alpha_q);
+              COUNTERS(z);
+              COUNTERS_MAC(z);
+              SATURATE(z);
+              m_out(n_nb, h_nb, w_nb, c_nb) = z;
+            }
+            else
+            {
+              COUNTERS_MAC_NOP(1);
+            }
+          }
         }
       }
     }
   }
+  else
+  {
+    const typename ComputationType<T>::type alpha = A[0];
+    for (auto &x: m_out)
+    {
+      if (x < 0)
+      {
+        typename ComputationType<T>::type z = x * alpha;
+        ComputationType<T>::quantize(z, alpha_q);
+        COUNTERS(z);
+        COUNTERS_MAC(z);
+        SATURATE(z);
+        x = z;
+      }
+      else
+      {
+        COUNTERS_MAC_NOP(1);
+      }
+    }
+  }
   return true;
 }
 
+#if __AVX2__
+template<> template<bool multialpha> inline bool PReLU<float>::apply_simd256(std::vector<Tensor<float> *> &in)   // simd256 float
+{
+  exit(-1);   // TODO: float AVX2 path still to be corrected; abort rather than return wrong results
+  Tensor<float> &A = *in[1];
+  swap(*in[0], m_out);
+  float *const       data_ptr  = m_out.data();
+  const float *const alpha_ptr = A.data();
+  const __m256       m_zeros   = _mm256_setzero_ps();
+  __m256             alpha     = _mm256_set1_ps(*A.data());
+  for (int iter = 0; iter < m_out.size(); iter += 8)
+  {
+    if (multialpha)
+      alpha = _mm256_load_ps(alpha_ptr + iter % A.size());
+
+    float *const aptr       = data_ptr + iter;
+    auto         a          = _mm256_load_ps(aptr);               // load
+    auto         min_a_zero = _mm256_min_ps(a, m_zeros);          // min(a,0)
+    auto         max_a_zero = _mm256_max_ps(a, m_zeros);          // max(a,0)
+    auto         b          = _mm256_mul_ps(min_a_zero, alpha);   // min(a,0)*alpha
+    const __m256 v          = _mm256_add_ps(max_a_zero, b);       // max(a,0)+min(a,0)*alpha
+    /*store*/ _mm256_store_ps(aptr, v);
+  }
+
+  return true;
+}
+
+template<> template<bool multialpha> inline bool PReLU<int16_t>::apply_simd256(std::vector<Tensor<int16_t> *> &in)
+{
+  Tensor<int16_t> &A = *in[1];
+  swap(*in[0], m_out);
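+  // computes out = max(a,0) + ((min(a,0) * alpha) >> alpha_q) on 16 int16 lanes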
+  int16_t *const                        data_ptr  = m_out.data();
+  [[maybe_unused]] const int16_t *const alpha_ptr = A.data();
+  const int                             alpha_q   = A.quantizer;
+
+  __m256i       alpha = _mm256_set1_epi16(A[0]);
+  const __m256i mask  = _mm256_set1_epi32(65535);
+  const __m256i max   = _mm256_set1_epi32(32767);
+  const __m256i min   = _mm256_set1_epi32(-32768);
+  const __m256i zeros = _mm256_setzero_si256();
+  const int     N     = m_out.size();
+  for (int iter = 0; iter < N; iter += 16)
+  {
+    int16_t *aptr = data_ptr + iter;
+    auto     a    = _mm256_load_si256((__m256i *) aptr);   // load
+    if (multialpha)
+    {
+      alpha = _mm256_load_si256((__m256i *) (alpha_ptr + (iter % A.size())));
+    }
+
+    // prepare branches
+    auto max0 = _mm256_max_epi16(a, zeros);
+    auto min0 = _mm256_min_epi16(a, zeros);
+    // branch neg
+    // mul
+    auto lo = _mm256_mullo_epi16(min0, alpha);   // min(a,0)*alpha lo part
+    auto hi = _mm256_mulhi_epi16(min0, alpha);   // min(a,0)*alpha hi part
+    // repack32
+    auto lo32 = _mm256_unpacklo_epi16(lo, hi);
+    auto hi32 = _mm256_unpackhi_epi16(lo, hi);
+    auto y0   = _mm256_permute2x128_si256(lo32, hi32, _MM_SHUFFLE(0, 2, 0, 0));
+    auto y1   = _mm256_permute2x128_si256(lo32, hi32, _MM_SHUFFLE(0, 3, 0, 1));
+    // shift
+    auto y0s = _mm256_srai_epi32(y0, alpha_q);
+    auto y1s = _mm256_srai_epi32(y1, alpha_q);
+#if SATURATE_RESULT
+    // clip
+    auto y0c  = _mm256_max_epi32(y0s, min);
+    auto y1c  = _mm256_max_epi32(y1s, min);
+    auto y0c2 = _mm256_min_epi32(y0c, max);
+    auto y1c2 = _mm256_min_epi32(y1c, max);
+#else
+    auto y0c2 = y0s;
+    auto y1c2 = y1s;
+#endif
+    // mask 16bits
+    auto y0p = _mm256_and_si256(y0c2, mask);
+    auto y1p = _mm256_and_si256(y1c2, mask);
+    // repack
+    auto z  = _mm256_packus_epi32(y0p, y1p);
+    auto z2 = _mm256_permute4x64_epi64(z, _MM_SHUFFLE(3, 1, 2, 0));
+    // merge 2 branches
+    auto r = _mm256_add_epi16(max0, z2);
+    _mm256_store_si256((__m256i *) aptr, r);
+  }
+  return true;
+}
+
+template<typename T> template<bool multialpha> bool PReLU<T>::apply_simd256(std::vector<Tensor<T> *> &in)   //
+{
+  std::cerr << "[ERROR] simd type not supported" << std::endl;
+  exit(-1);
+}
+#endif
+
+#if __AVX512F__
+template<> template<bool multialpha> inline bool PReLU<float>::apply_simd512(std::vector<Tensor<float> *> &in)   // simd512 float
+{
+  Tensor<float> &A = *in[1];
+  swap(*in[0], m_out);
+  float *const       data_ptr  = m_out.data();
+  const float *const alpha_ptr = A.data();
+  const __m512       m_zeros   = _mm512_setzero_ps();
+  __m512             alpha     = _mm512_set1_ps(*A.data());
+  for (int iter = 0; iter < m_out.size(); iter += 16)
+  {
+    if (multialpha)
+      alpha = _mm512_load_ps(alpha_ptr + iter % A.size());
+
+    float *const aptr       = data_ptr + iter;   // load
+    auto         a          = _mm512_load_ps(aptr);
+    auto         min_a_zero = _mm512_min_ps(a, m_zeros);          // min(a,0)
+    auto         max_a_zero = _mm512_max_ps(a, m_zeros);          // max(a,0)
+    auto         b          = _mm512_mul_ps(min_a_zero, alpha);   // min(a,0)*alpha
+    auto         v          = _mm512_add_ps(max_a_zero, b);       // max(a,0)+min(a,0)*alpha
+    /*store*/ _mm512_store_ps(aptr, v);
+  }
+
+  return true;
+}
+
+#endif
+
+#if __AVX512BW__  
+template<> template<bool multialpha> inline bool PReLU<int16_t>::apply_simd512(std::vector<Tensor<int16_t> *> &in)   // simd512 int16 quantize
+{
+  Tensor<int16_t> &A = *in[1];
+  swap(*in[0], m_out);
+  int16_t                              *data_ptr  = m_out.data();
+  [[maybe_unused]] const int16_t *const alpha_ptr = A.data();
+  const int                             alpha_q   = A.quantizer;
+  auto                                  alpha0    = _mm512_set1_epi32(A[0]);
+  auto                                  alpha1    = alpha0;
+  const auto                            max       = _mm512_set1_epi32(32767);
+  const auto                            min       = _mm512_set1_epi32(-32768);
+  const auto                            zeros     = _mm512_setzero_si512();
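+  // permutation indices: the even 16-bit lanes of the (y0c2, y1c2) pair, i.e. the
+  // low half of each int32; _mm512_permutex2var_epi16 uses them to repack to int16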
+  static constexpr int16_t data[] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
+  const auto shuffle = _mm512_loadu_si512((void *) data);
+
+  const int N = m_out.size();
+
+  for (int iter = 0; iter < N; iter += 32)
+  {
+    int16_t *aptr = data_ptr + iter;
+    auto     a    = _mm512_loadu_si512((__m512i *) aptr);   // load
+    if (multialpha)
+    {
+      auto a2   = _mm512_loadu_si512((__m512i *) (alpha_ptr + (iter % A.size())));
+      auto a2lo = _mm512_castsi512_si256(a2);
+      alpha0    = _mm512_cvtepi16_epi32(a2lo);
+      auto a2hi = _mm512_extracti64x4_epi64(a2, 1);
+      alpha1    = _mm512_cvtepi16_epi32(a2hi);
+    }
+    // prepare branches
+    auto max0 = _mm512_max_epi16(a, zeros);
+    auto min0 = _mm512_min_epi16(a, zeros);
+    // branch neg
+    // extract
+    auto lo = _mm512_castsi512_si256(min0);
+    auto hi = _mm512_extracti64x4_epi64(min0, 1);
+    // unpack 16 to 32
+    auto lo32 = _mm512_cvtepi16_epi32(lo);
+    auto hi32 = _mm512_cvtepi16_epi32(hi);
+    // mul
+    auto y0 = _mm512_mullo_epi32(lo32, alpha0);
+    auto y1 = _mm512_mullo_epi32(hi32, alpha1);
+    // shift
+    auto y0s = _mm512_srai_epi32(y0, alpha_q);
+    auto y1s = _mm512_srai_epi32(y1, alpha_q);
+#if SATURATE_RESULT
+    // clip
+    auto y0c  = _mm512_max_epi32(y0s, min);
+    auto y1c  = _mm512_max_epi32(y1s, min);
+    auto y0c2 = _mm512_min_epi32(y0c, max);
+    auto y1c2 = _mm512_min_epi32(y1c, max);
+#else
+    auto y0c2 = y0s;
+    auto y1c2 = y1s;
+#endif
+    // pack
+    auto z2 = _mm512_permutex2var_epi16(y0c2, shuffle, y1c2);
+    // merge branches
+    auto r = _mm512_add_epi16(max0, z2);
+    _mm512_storeu_si512((__m512i *) aptr, r);
+  }
+  return true;
+}
+#endif
+
+#if __AVX512F__ || __AVX512BW__
+template<typename T> template<bool multialpha> bool PReLU<T>::apply_simd512(std::vector<Tensor<T> *> &in)   //
+{
+  std::cerr << "[ERROR] simd type not supported" << std::endl;
+  exit(-1);
+}
+#endif
+
 template<typename T> bool PReLU<T>::init(const std::vector<Tensor<T> *> &in)
 {
   if (in.size() != 2)
diff --git a/sadl/layer_resize.h b/sadl/layer_resize.h
index c90c3449b5ef5bad90f7d8d1021ebf1e74513565..b7702adfcdd7a750eb271d57d7a4898915563eef 100644
--- a/sadl/layer_resize.h
+++ b/sadl/layer_resize.h
@@ -3,7 +3,7 @@
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
  *
- * Copyright (c) 2010-2023, ITU/ISO/IEC
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -135,10 +135,10 @@ template<typename T> bool Resize<T>::init(const std::vector<Tensor<T> *> &in)
   float scale_N = 0, scale_C = 0, scale_H = 0, scale_W = 0;
   if (m_input_label == 1)   // inputs are X and sizes
   {
-    scale_N = in[1]->data()[0] / N;
-    scale_C = in[1]->data()[1] / C;
-    scale_H = in[1]->data()[2] / H_in;
-    scale_W = in[1]->data()[3] / W_in;
+    scale_N = in[1]->data()[0] / (float)N;
+    scale_C = in[1]->data()[1] / (float)C;
+    scale_H = in[1]->data()[2] / (float)H_in;
+    scale_W = in[1]->data()[3] / (float)W_in;
   }
   else if (m_input_label == 2)   // inputs are X and scales
   {
@@ -153,17 +153,17 @@ template<typename T> bool Resize<T>::init(const std::vector<Tensor<T> *> &in)
     return false;
   }
   scale_factors.resize(in[1]->dims());
-  scale_factors[0] = scale_N;
-  scale_factors[1] = scale_H;
-  scale_factors[2] = scale_W;
-  scale_factors[3] = scale_C;
+  scale_factors[0] = static_cast<T2>(scale_N);
+  scale_factors[1] = static_cast<T2>(scale_H);
+  scale_factors[2] = static_cast<T2>(scale_W);
+  scale_factors[3] = static_cast<T2>(scale_C);
   // resize m_out
   Dimensions dim;
   dim.resize(4);
-  dim[0] = N * scale_N;
-  dim[1] = H_in * scale_H;
-  dim[2] = W_in * scale_W;
-  dim[3] = C * scale_C;
+  dim[0] = (int)(N * scale_N);
+  dim[1] = (int)(H_in * scale_H);
+  dim[2] = (int)(W_in * scale_W);
+  dim[3] = (int)(C * scale_C);
   m_out.resize(dim);
   m_initDone = true;
   return true;
@@ -261,8 +261,8 @@ template<> inline void Resize<float>::calc_positions(int y, int x, int H, int W,
 
   if (m_coordinate_transformation_mode == resize_coordinate_transformation_mode_half_pixel)
   {
-    x_ori = (x + 0.5) / scale_factors[2] - 0.5;
-    y_ori = (y + 0.5) / scale_factors[1] - 0.5;
+    x_ori = (x + 0.5f) / scale_factors[2] - 0.5f;
+    y_ori = (y + 0.5f) / scale_factors[1] - 0.5f;
   }
   else if (m_coordinate_transformation_mode == resize_coordinate_transformation_mode_asymmetric)
   {
@@ -273,8 +273,8 @@ template<> inline void Resize<float>::calc_positions(int y, int x, int H, int W,
   y_ori_int = std::floor(y_ori);
 
   // acquire the positions of adjacent pixels, prioritizing the left and top pixels
-  x_ori_left   = (x_ori == x_ori_int) ? x_ori_int - 1 : x_ori_int;
-  y_ori_top    = (y_ori == y_ori_int) ? y_ori_int - 1 : y_ori_int;
+  x_ori_left   = (int)((x_ori == x_ori_int) ? x_ori_int - 1 : x_ori_int);
+  y_ori_top    = (int)((y_ori == y_ori_int) ? y_ori_int - 1 : y_ori_int);
   x_ori_right  = x_ori_left + 1;
   y_ori_bottom = y_ori_top + 1;
   x_ori_left   = std::max(0, x_ori_left);
diff --git a/sadl/layer_where.h b/sadl/layer_where.h
new file mode 100644
index 0000000000000000000000000000000000000000..900a1bef1e3e0b5bab740a2818a132d3f625232e
--- /dev/null
+++ b/sadl/layer_where.h
@@ -0,0 +1,94 @@
+/* The copyright in this software is being made available under the BSD
+ * License, included below. This software may be subject to other third party
+ * and contributor rights, including patent rights, and no such rights are
+ * granted under this license.
+ *
+ * Copyright (c) 2010-2024, ITU/ISO/IEC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+#include "layer.h"
+
+namespace sadl
+{
+namespace layers
+{
+template<typename T> class Where : public Layer<T>
+{
+public:
+  using Layer<T>::Layer;
+  using Layer<T>::m_out;   // to avoid this->
+  using Layer<T>::m_initDone;
+
+  virtual bool apply(std::vector<Tensor<T> *> &in) override;
+  virtual bool init(const std::vector<Tensor<T> *> &in) override;
+
+protected:
+  virtual bool loadInternal(std::istream &file, Version) override;
+  DUMP_MODEL_EXT;
+};
+
+template<typename T> bool Where<T>::apply(std::vector<Tensor<T> *> &in)
+{
+  assert(in.size() == 3);
+  assert(in[0]->dims() == m_out.dims());
+  assert(in[0]->dims() == in[1]->dims() || (in[1]->dims().size() == 1 && in[1]->dims()[0] == 1));
+  assert(in[0]->dims() == in[2]->dims() || (in[2]->dims().size() == 1 && in[2]->dims()[0] == 1));
+  const Tensor<T> &condition = *in[0];
+  const Tensor<T> &A = *in[1];
+  const Tensor<T> &B = *in[2];
+  m_out.quantizer = A.quantizer > B.quantizer ? A.quantizer : B.quantizer;
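+  // each selected value is realigned below from its branch quantizer to the common output quantizer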
+  for (int i = 0; i < m_out.size(); i++)
+  {
+    const T A_i = (A.dims().size() == 1) ? A[0] : A[i];
+    const T B_i = (B.dims().size() == 1) ? B[0] : B[i];
+    typename ComputationType<T>::type z = condition[i] ? A_i : B_i;
+    const int z_q = condition[i] ? A.quantizer : B.quantizer;
+    ComputationType<T>::shift_left(z, m_out.quantizer - z_q);
+    COUNTERS(z);
+    m_out[i] = z;
+  }
+  return true;
+}
+
+template<typename T> bool Where<T>::init(const std::vector<Tensor<T> *> &in)
+{
+  if (in.size() != 3)
+    return false;
+  m_out.resize(in[0]->dims());//condition dims
+  m_initDone = true;
+  return true;
+}
+
+template<typename T> bool Where<T>::loadInternal(std::istream &, Version)
+{
+  return true;
+}
+
+}   // namespace layers
+}   // namespace sadl
diff --git a/sadl/layers.h b/sadl/layers.h
index 11996f978510a03f9c05587e2b459df989ed29e3..1009eab9ab5ac9af876353566afd4328b37f3ab0 100644
--- a/sadl/layers.h
+++ b/sadl/layers.h
@@ -57,6 +57,8 @@
 #include "layer_scatternd.h"
 #include "layer_gridsample.h"
 #include "layer_resize.h"
+#include "layer_compare.h"
+#include "layer_where.h"
 
 namespace sadl
 {
@@ -96,6 +98,8 @@ inline std::string opName(const OperationType::Type op)
     DIRTYCASEPRINT(ScatterND);
     DIRTYCASEPRINT(GridSample);
     DIRTYCASEPRINT(Resize);
+    DIRTYCASEPRINT(Compare);
+    DIRTYCASEPRINT(Where);
   default:
     oss << "??";
     break;
diff --git a/sadl/model.h b/sadl/model.h
index 822acb68a6baaf1730841f8175f888318c091068..8cdfdb4891efad416e978423ddecc4a27dc26966 100644
--- a/sadl/model.h
+++ b/sadl/model.h
@@ -179,6 +179,12 @@ template<typename T> std::unique_ptr<layers::Layer<T>> createLayer(int32_t id, l
   case layers::OperationType::Resize:
     return std::unique_ptr<layers::Layer<T>>(new layers::Resize<T>{ id, op });
     break;
+  case layers::OperationType::Compare:
+    return std::unique_ptr<layers::Layer<T>>(new layers::Compare<T>{ id, op });
+    break;
+  case layers::OperationType::Where:
+    return std::unique_ptr<layers::Layer<T>>(new layers::Where<T>{ id, op });
+    break;
   case layers::OperationType::OperationTypeCount:
     break;   // no default on purpose
   }
@@ -442,7 +448,8 @@ template<typename T> bool Model<T>::init(std::vector<Tensor<T>> &in)
       op_type[inputs_cnt]                 = L.layer->op();
 
       // always put data layers first when const layers
-      if (inputs_cnt > 0 && op_type[inputs_cnt - 1] == layers::OperationType::Const && op_type[inputs_cnt] != layers::OperationType::Const)
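+      // Where is exempt: one of its value branches may be a Const while the other is not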
+      if ((inputs_cnt > 0 && op_type[inputs_cnt - 1] == layers::OperationType::Const && op_type[inputs_cnt] != layers::OperationType::Const) && m_data[layer_cnt].layer->op() != layers::OperationType::Where)
       {
         std::cerr << "[ERROR] data layers should be first" << std::endl;
         return false;
@@ -708,7 +715,6 @@ template<typename T> void Model<T>::insertCopyLayers()
     }
     if (layer_with_current_as_mutable_input.size() > 1)
     {                                                           // need copy layer
-      assert(layer_with_current_as_mutable_input.size() < 3);   // for now. can be removed ?
       // for current layer L, insert copy layers C just after: x x x L C C xxxx
       std::vector<typename layers::Layer<T>::Id> id_copy_layers;
       for (int n = 0; n < (int) layer_with_current_as_mutable_input.size() - 1; ++n)
diff --git a/sadl/options.h b/sadl/options.h
index f5937306ce8a53b70d57f5e232cd87da32cbe853..356bf556d2a7c3f6686b765f8f2b5273d5ca4548 100644
--- a/sadl/options.h
+++ b/sadl/options.h
@@ -48,7 +48,7 @@ static constexpr float kSparsifySizeThreshold = 1000.0f;
 // nothing/-msse42: no simd
 // -mavx2:  avx2
 // -mavx2 -mfma: avx2 + fuse multiply/add
-// -mavx512bw -mavx512f: avx512
+// -mavx512bw -mavx512f -mavx512dq: avx512
 // #define NDEBUG        1 // remove sanity tests
 
 // debug
diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt
index fa74de7730c50e336efdcfef4fdbf7714b8deeac..297da10816a4dbdd53c5a6b6eb823c9ba4f0340b 100644
--- a/sample/CMakeLists.txt
+++ b/sample/CMakeLists.txt
@@ -35,8 +35,8 @@ add_executable(quantization_test quantization_test.cpp ${HEADER_FILES})
 if( UNIX OR MINGW )
  set(CMAKE_CXX_FLAGS "-ffast-math -Wall -fstrict-aliasing") 
  set_target_properties(sample_simd256  PROPERTIES COMPILE_FLAGS "-mavx2" )
- set_target_properties(sample_simd512  PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw" )
- set_target_properties(debug_model  PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw" ) # must build in SIMD mode to debug SIMD issue
+ set_target_properties(sample_simd512  PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq" )
+ set_target_properties(debug_model  PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq" ) # must build in SIMD mode to debug SIMD issue
 endif()
 
 
diff --git a/sample/copy.h b/sample/copy.h
index 17b1a8c011ccea99343222d94aa2ad6001e90cef..951e3a53011bd3c3c674352bef7901bf5dda3b16 100644
--- a/sample/copy.h
+++ b/sample/copy.h
@@ -94,6 +94,11 @@ template<typename T> bool copy(const sadl::layers::Layer<float> &layer, sadl::la
     dynamic_cast<sadl::layers::Resize<T> &>(layerQ).m_mode            = dynamic_cast<const sadl::layers::Resize<float> &>(layer).m_mode;
     dynamic_cast<sadl::layers::Resize<T> &>(layerQ).m_nearest_mode    = dynamic_cast<const sadl::layers::Resize<float> &>(layer).m_nearest_mode;
     break;
+  case sadl::layers::OperationType::Compare:
+    dynamic_cast<sadl::layers::Compare<T> &>(layerQ).m_mode            = dynamic_cast<const sadl::layers::Compare<float> &>(layer).m_mode;
+    break;
+  case sadl::layers::OperationType::Where:
+    break;
     // no default to get warning
   }
 
diff --git a/sample/dumper.h b/sample/dumper.h
index 901630c29cb0f4211f05377948c7753018378982..da8475817bd41bca508f60bf095dc9489368db43 100644
--- a/sample/dumper.h
+++ b/sample/dumper.h
@@ -129,6 +129,17 @@ template<typename T> bool sadl::layers::Resize<T>::dump(std::ostream &file)
   return true;
 }
 
+template<typename T> bool sadl::layers::Compare<T>::dump(std::ostream &file)
+{
+  file.write((const char *) &m_mode, sizeof(m_mode));
+  return true;
+}
+
+template<typename T> bool sadl::layers::Where<T>::dump(std::ostream &)
+{
+  return true;
+}
+
 template<typename T> bool sadl::layers::Layer<T>::dump(std::ostream &file)
 {
   // std::cout<<"todo? "<<opName(op_)<<std::endl;
diff --git a/sample/naive_quantization.cpp b/sample/naive_quantization.cpp
index da505dcc4caa41f57f039bc73581da686ca7a221..911a21e5ac9632095156b5f7e54704c9ef4fe7ef 100644
--- a/sample/naive_quantization.cpp
+++ b/sample/naive_quantization.cpp
@@ -72,7 +72,8 @@ bool toQuantize(sadl::layers::OperationType::Type type)
          && type != sadl::layers::OperationType::Identity && type != sadl::layers::OperationType::LeakyRelu && type != sadl::layers::OperationType::MaxPool
          && type != sadl::layers::OperationType::Relu && type != sadl::layers::OperationType::Reshape && type != sadl::layers::OperationType::Shape
          && type != sadl::layers::OperationType::Slice && type != sadl::layers::OperationType::Transpose && type != sadl::layers::OperationType::PReLU
-         && type != sadl::layers::OperationType::ScatterND && type != sadl::layers::OperationType::GridSample && type != sadl::layers::OperationType::Resize;
+         && type != sadl::layers::OperationType::ScatterND && type != sadl::layers::OperationType::GridSample && type != sadl::layers::OperationType::Resize
+         && type != sadl::layers::OperationType::Compare && type != sadl::layers::OperationType::Where;
 }
 
 template<typename T> void quantizeTensor(const sadl::Tensor<float> &B, sadl::Tensor<T> &Bq)
diff --git a/sample/pytorch.py b/sample/pytorch.py
index 455a14d921b4c89da918170968c6463bc410b417..2e34848f340350c5aeb9424f7f8198dda8908529 100644
--- a/sample/pytorch.py
+++ b/sample/pytorch.py
@@ -74,4 +74,4 @@ inputs_torch = [torch.from_numpy(input0)]
 inputs_torch[0].requires_grad = True
 output = model(inputs_torch)
 print("Output", output)
-torch.onnx.export(model, inputs_torch, "./pytorch.onnx")
+torch.onnx.export(model, inputs_torch, "./pytorch.onnx", opset_version=10)
diff --git a/sample/pytorch_matmult.py b/sample/pytorch_matmult.py
index 8f5a5cc5fbde658b5f2c00f4a90929cb17e466d4..cd2c1d230fa7995256170c898ef3a5d61d466865 100644
--- a/sample/pytorch_matmult.py
+++ b/sample/pytorch_matmult.py
@@ -30,5 +30,5 @@ output = model(inputs_torch)
 print("Output", output)
 print(model)
 torch.onnx.export(
-    model, inputs_torch, "./pytorch_matmult.onnx", verbose=True, opset_version=14
+    model, inputs_torch, "./pytorch_matmult.onnx", verbose=True, opset_version=10
 )
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 8ada8e88e4a66910599c664ee9706db8014581ee..fa213ae4bed371b6d836a5be28f3a9950721d403 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -12,7 +12,7 @@ add_executable(test   test.cpp ${HEADER_FILES})
 
 if( UNIX OR MINGW )
  set(CMAKE_CXX_FLAGS "-ffast-math -Wall -fstrict-aliasing") 
- set_target_properties(test  PROPERTIES COMPILE_FLAGS "-mavx2  -mavx512f -mavx512bw" )
+ set_target_properties(test  PROPERTIES COMPILE_FLAGS "-mavx2  -mavx512f -mavx512bw -mavx512dq" )
 endif()
 
 
diff --git a/utests/check.sh b/utests/check.sh
index a06d56130fe956d1ba66af9623b5a3965f9289ef..df65c29fe33d9feb1f3d5680d409fcc74196aad3 100755
--- a/utests/check.sh
+++ b/utests/check.sh
@@ -18,7 +18,7 @@ for F in $L; do
   ../utest.sh ../models/${F}.onnx --no_transpose;
 done
 
-L="conv2d_4_8x8x4_k1x1s1,1_g1_p0,0 conv2d_4_8x8x4_k1x1s1,1_g4_p0,0 conv2d_4_8x8x4_k1x1s2,1_g1_p0,0 conv2d_4_8x8x4_k1x3s1,2_g1_p0,1 conv2d_4_8x8x4_k3x1s1,1_g1_p1,0 conv2d_4_8x8x4_k3x1s1,1_g4_p1,0 conv2d_4_8x8x4_k3x3s1,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,2_g1_p1,1 conv2d_4_8x8x4_k5x5s1,1_g1_p2,2 conv2d_4_8x8x4_k5x5s1,1_g4_p2,2 conv2d_4_8x8x4_k5x5s2,1_g1_p2,2 conv2d_4_8x9x4_k1x1s2,1_g1_p0,0 conv2d_4_8x9x4_k3x1s1,1_g4_p1,0 conv2d_4_8x9x4_k3x3s1,1_g4_p1,1 conv2d_4_8x9x4_k3x3s2,1_g1_p1,1 conv2d_4_8x9x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k1x1s1,1_g1_p0,0 conv2d_4_9x8x4_k1x1s2,1_g1_p0,0 conv2d_4_9x8x4_k1x3s1,2_g1_p0,1 conv2d_4_9x8x4_k3x1s1,1_g1_p1,0 conv2d_4_9x8x4_k3x3s1,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k5x5s1,1_g1_p2,2 conv2d_4_9x8x4_k5x5s2,1_g1_p2,2 conv2d_4_9x9x4_k1x3s1,2_g1_p0,1 repeated_conv slice_pytorch slice_inf_pytorch slice_chw_pytorch prelu_multiple_alpha prelu_single_alpha scatternd_c_pytorch scatternd_hwc_with_conv_pytorch gridsample_bilinear gridsample_nearest gridsample_bilinear_conv gridsample_nearest_conv conv2dt_32_8x8x32_k3,3_s2,2_p1,1_op1,1 conv2dt_32_8x8x32_k4,4_s2,2_p1,1_op0,0 conv2dt_32_8x8x32_k5,5_s2,2_p2,2_op1,1 resize_bilinear_up2_pytorch resize_nearest_up2_pytorch resize_bilinear_up2_16x16x64_pytorch";
+L="conv2d_4_8x8x4_k1x1s1,1_g1_p0,0 conv2d_4_8x8x4_k1x1s1,1_g4_p0,0 conv2d_4_8x8x4_k1x1s2,1_g1_p0,0 conv2d_4_8x8x4_k1x3s1,2_g1_p0,1 conv2d_4_8x8x4_k3x1s1,1_g1_p1,0 conv2d_4_8x8x4_k3x1s1,1_g4_p1,0 conv2d_4_8x8x4_k3x3s1,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,1_g1_p1,1 conv2d_4_8x8x4_k3x3s2,2_g1_p1,1 conv2d_4_8x8x4_k5x5s1,1_g1_p2,2 conv2d_4_8x8x4_k5x5s1,1_g4_p2,2 conv2d_4_8x8x4_k5x5s2,1_g1_p2,2 conv2d_4_8x9x4_k1x1s2,1_g1_p0,0 conv2d_4_8x9x4_k3x1s1,1_g4_p1,0 conv2d_4_8x9x4_k3x3s1,1_g4_p1,1 conv2d_4_8x9x4_k3x3s2,1_g1_p1,1 conv2d_4_8x9x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k1x1s1,1_g1_p0,0 conv2d_4_9x8x4_k1x1s2,1_g1_p0,0 conv2d_4_9x8x4_k1x3s1,2_g1_p0,1 conv2d_4_9x8x4_k3x1s1,1_g1_p1,0 conv2d_4_9x8x4_k3x3s1,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,1_g1_p1,1 conv2d_4_9x8x4_k3x3s2,2_g1_p1,1 conv2d_4_9x8x4_k5x5s1,1_g1_p2,2 conv2d_4_9x8x4_k5x5s2,1_g1_p2,2 conv2d_4_9x9x4_k1x3s1,2_g1_p0,1 repeated_conv slice_pytorch slice_inf_pytorch slice_chw_pytorch prelu_multiple_alpha prelu_single_alpha scatternd_c_pytorch scatternd_hwc_with_conv_pytorch gridsample_bilinear gridsample_nearest gridsample_bilinear_conv gridsample_nearest_conv conv2dt_32_8x8x32_k3,3_s2,2_p1,1_op1,1 conv2dt_32_8x8x32_k4,4_s2,2_p1,1_op0,0 conv2dt_32_8x8x32_k5,5_s2,2_p2,2_op1,1 resize_bilinear_up2_pytorch resize_nearest_up2_pytorch resize_bilinear_up2_16x16x64_pytorch prelu_single_alpha_c32 prelu_multiple_alpha_c32 compare_less compare_greater where_constA_less where_constA_greater where_constB_less where_constB_greater";
 for F in $L; do
   ../utest.sh ../models/${F}.onnx;
 done
diff --git a/utests/models/compare_greater.onnx b/utests/models/compare_greater.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..6eb7484a56bca3085d7bed78437e502d61adafd3
Binary files /dev/null and b/utests/models/compare_greater.onnx differ
diff --git a/utests/models/compare_less.onnx b/utests/models/compare_less.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..131b2b003d56c9b5c82ef1a1f085db8a7ea6e910
Binary files /dev/null and b/utests/models/compare_less.onnx differ
diff --git a/utests/models/prelu_multiple_alpha_c32.onnx b/utests/models/prelu_multiple_alpha_c32.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..ba6cc15fc039e425cb2ef347e2e421113463b7e3
Binary files /dev/null and b/utests/models/prelu_multiple_alpha_c32.onnx differ
diff --git a/utests/models/prelu_single_alpha_c32.onnx b/utests/models/prelu_single_alpha_c32.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..63c7c301c8d48cddfed4c3d364dd87e6eb656620
Binary files /dev/null and b/utests/models/prelu_single_alpha_c32.onnx differ
diff --git a/utests/models/slice_chw_pytorch.py b/utests/models/slice_chw_pytorch.py
index c06e54292ede9cce0800874da52a978dd02febc5..714e0b30b48fb333e5a40309af8222dba2e23377 100644
--- a/utests/models/slice_chw_pytorch.py
+++ b/utests/models/slice_chw_pytorch.py
@@ -25,4 +25,4 @@ output = model(input)
 # Output onnx model path
 onnx_model_path = os.path.join(os.path.dirname(sys.argv[0]), "slice_chw_pytorch.onnx")
 # Convert to onnx
-torch.onnx.export(model, input, onnx_model_path)
+torch.onnx.export(model, input, onnx_model_path, opset_version=10)
diff --git a/utests/models/where_constA_greater.onnx b/utests/models/where_constA_greater.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..a53e8dee567caae8086335b77839805ef6cfa1bf
Binary files /dev/null and b/utests/models/where_constA_greater.onnx differ
diff --git a/utests/models/where_constA_less.onnx b/utests/models/where_constA_less.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..ba66e26ebc624835d2679a360169677ef8da9706
Binary files /dev/null and b/utests/models/where_constA_less.onnx differ
diff --git a/utests/models/where_constB_greater.onnx b/utests/models/where_constB_greater.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..70c0cde2b79221bdcc22b0a215756e1c7dfabdb5
Binary files /dev/null and b/utests/models/where_constB_greater.onnx differ
diff --git a/utests/models/where_constB_less.onnx b/utests/models/where_constB_less.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..d653b44d934d77c88d5885f9d634a7b9cf2f7184
Binary files /dev/null and b/utests/models/where_constB_less.onnx differ
diff --git a/utests/onnx_inference.py b/utests/onnx_inference.py
index dfbaf292bed6445f609b08d573326dc7009790b2..66bd0c8a9b56bc8d1756bee631a48a42e9561678 100644
--- a/utests/onnx_inference.py
+++ b/utests/onnx_inference.py
@@ -66,6 +66,8 @@ if args.output is not None:
                 f.write("{} ".format(i))
             f.write("\n")
             for x in np.nditer(o):
+                if np.issubdtype(x.dtype, np.bool_):
+                    x = x.astype(float)  # write booleans as 0.0/1.0 so the dump stays numeric
                 f.write("{} ".format(x))
             f.write("\n")
         print("[INFO] results file in {}".format(args.output))