Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • jvet-ahg-nnvc/sadl
  • msantamaria/sadl
  • Weijie/sadl-extension
  • chaoyi_lin/sadl
  • Yucong/sadl
  • qliu/sadl
  • duliu/sadl
  • jacob/sadl
  • Yun_li/sadl
  • XiangLi/sadl
  • ruiying/sadl
  • NianxiangFu/sadl
  • WenzhuoMa/sadl
13 results
Show changes
Commits on Source (39)
Showing with 528 additions and 94 deletions
......@@ -3,6 +3,23 @@ stages:
variables:
GIT_SUBMODULE_STRATEGY: none
.build_template_windows:
stage: build
script:
- |
echo "[INFO] BUILD $CI_COMMIT_SHORT_SHA";
mkdir -p build;
cd build;
& "C:\Program Files\CMake\bin\cmake.exe" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Release -DSPARSE_MATMULT_SUPPORT=1 ../sample;
& "C:\Program Files\CMake\bin\cmake.exe" --build ./;
only:
refs:
- master
- dev_for_transformers
- merge_requests
variables:
- $CI_PROJECT_URL == 'https://vcgit.hhi.fraunhofer.de/jvet-ahg-nnvc/sadl'
- $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'master'
.build_template_linux:
......@@ -19,6 +36,14 @@ variables:
cd utests/build;
cmake -DCMAKE_BUILD_TYPE=Release -DSPARSE_MATMULT_SUPPORT=1 ..;
make;
only:
refs:
- master
- dev_for_transformers
- merge_requests
variables:
- $CI_PROJECT_URL == 'https://vcgit.hhi.fraunhofer.de/jvet-ahg-nnvc/sadl'
- $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'master'
build_ubuntu2004:
......@@ -31,4 +56,8 @@ build_ubuntu2204:
tags:
- ubuntu2204
build_vc192x:
extends: .build_template_windows
tags:
- vc192x
......@@ -127,6 +127,7 @@ class OPTYPE(IntEnum):
Resize = (24,)
Compare = (25,)
Where = (26,)
Minimum = (27,)
# "BatchMatMulV2" did not exist in Tensorflow 1.9. It exists in
# Tensorflow 1.15.
......@@ -681,8 +682,8 @@ def parse_graph_node(
myGraph[node.output[0]]["additional"] = {}
myGraph[node.output[0]]["additional"]["data"] = node
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "Identity":
elif node.op_type == "Identity" or node.op_type == "Cast":
myGraph[node.output[0]] = {}
myGraph[node.output[0]]["op_type"] = OPTYPE.Identity
myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]]
......@@ -767,7 +768,7 @@ def parse_graph_node(
myGraph[node.output[0]]["op_type"] = OPTYPE.Expand
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "Reshape" or node.op_type == "MatMul":
elif node.op_type == "Reshape":
# Const
myGraph[node.input[1]] = {}
myGraph[node.input[1]]["op_type"] = OPTYPE.Const
......@@ -790,11 +791,65 @@ def parse_graph_node(
if node.op_type == "Reshape":
myGraph[node.output[0]]["op_type"] = OPTYPE.Reshape
elif node.op_type == "MatMul":
myGraph[node.output[0]]["op_type"] = OPTYPE.MatMul
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "MatMul":
# check the inputs
if is_constant(n0name, model_onnx.graph.initializer) and is_constant(
node.input[1], model_onnx.graph.initializer
):
quit("[ERROR] unsupported double constants MatMul", node)
swap_inputs = False
if is_constant(n0name, model_onnx.graph.initializer):
additional = {}
additional["data"] = node
n2 = getNodesWithOutput(n0name, model_onnx)
additional["dims"], additional["raw_data"], additional[
"dtype"
] = extract_additional_data(
n0name, node_annotation[n2.name].to_transpose, model_onnx.graph, verbose
)
map_onnx_to_myGraph[n0name] = n0name
myGraph[n0name] = {}
myGraph[n0name]["inputs"] = []
myGraph[n0name]["additional"] = additional
myGraph[n0name]["op_type"] = OPTYPE.Const
swap_inputs = True
if is_constant(node.input[1], model_onnx.graph.initializer):
additional = {}
additional["data"] = node
n2 = getNodesWithOutput(node.input[1], model_onnx)
additional["dims"], additional["raw_data"], additional[
"dtype"
] = extract_additional_data(
node.input[1],
node_annotation[n2.name].to_transpose,
model_onnx.graph,
verbose,
)
map_onnx_to_myGraph[node.input[1]] = node.input[1]
myGraph[node.input[1]] = {}
myGraph[node.input[1]]["inputs"] = []
myGraph[node.input[1]]["additional"] = additional
myGraph[node.input[1]]["op_type"] = OPTYPE.Const
myGraph[node.output[0]] = {}
myGraph[node.output[0]]["op_type"] = OPTYPE.MatMul
if swap_inputs:
myGraph[node.output[0]]["inputs"] = [
map_onnx_to_myGraph[node.input[1]],
map_onnx_to_myGraph[n0name],
]
else:
myGraph[node.output[0]]["inputs"] = [
map_onnx_to_myGraph[n0name],
map_onnx_to_myGraph[node.input[1]],
]
myGraph[node.output[0]]["additional"] = {}
myGraph[node.output[0]]["additional"]["data"] = node
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "Concat":
# Const
myGraph[node.output[0]] = {}
......@@ -831,6 +886,17 @@ def parse_graph_node(
myGraph[node.output[0]]["additional"]["data"] = node
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "Min":
myGraph[node.output[0]] = {}
myGraph[node.output[0]]["op_type"] = OPTYPE.Minimum
myGraph[node.output[0]]["inputs"] = [
map_onnx_to_myGraph[n0name],
map_onnx_to_myGraph[node.input[1]],
]
myGraph[node.output[0]]["additional"] = {}
myGraph[node.output[0]]["additional"]["data"] = node
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "Unsqueeze":
# No need to parse Unsqueeze as SADL can handle it.
map_onnx_to_myGraph[node.output[0]] = node.output[0]
......@@ -1199,6 +1265,37 @@ def parse_graph_node(
myGraph[node.output[0]]["additional"]["data"] = node
map_onnx_to_myGraph[node.output[0]] = node.output[0]
elif node.op_type == "Equal":
additional = {}
additional["data"] = node
if is_constant(node.input[1], model_onnx.graph.initializer):
n2 = getNodesWithOutput(node.input[1], model_onnx) # constant
(
additional["dims"],
additional["raw_data"],
additional["dtype"],
) = extract_additional_data(
node.input[1],
False,
model_onnx.graph,
verbose,
)
myGraph[node.input[1]] = {}
myGraph[node.input[1]]["op_type"] = OPTYPE.Const
myGraph[node.input[1]]["inputs"] = []
myGraph[node.input[1]]["additional"] = additional
map_onnx_to_myGraph[node.input[1]] = node.input[1]
myGraph[node.output[0]] = {}
myGraph[node.output[0]]["op_type"] = OPTYPE.Compare
myGraph[node.output[0]]["inputs"] = [map_onnx_to_myGraph[n0name]] + [
map_onnx_to_myGraph[node.input[1]]
]
myGraph[node.output[0]]["additional"] = {}
myGraph[node.output[0]]["additional"]["data"] = node
myGraph[node.output[0]]["additional"]["mode"] = 2
map_onnx_to_myGraph[node.output[0]] = node.output[0]
else:
raise Exception("[ERROR] node not supported:\n{})".format(node))
......@@ -1677,6 +1774,13 @@ def annotate_node(
node_annotation[n2.name].to_transpose = True
node_annotation[n2.name].layout_onnx = "nhwc"
elif node.op_type == "MatMul":
if global_data_layout == "nchw":
n2 = getNodesWithOutput(node.input[1], model_onnx)
node_annotation[n2.name].add_transpose_after = True
node_annotation[node.name].add_transpose_before = True
node_annotation[node.name].add_transpose_after = True
elif node.op_type == "Gemm":
n2 = getInitializer(node.input[1], model_onnx)
if global_data_layout == "nchw":
......@@ -1689,6 +1793,7 @@ def annotate_node(
node_annotation[n2.name].to_transpose = True
nexts = getNodesWithInput(node.output[0], model_onnx)
for n in nexts:
annotate_node(
n, model_onnx, node_annotation, global_data_layout, verbose
......
......@@ -107,7 +107,7 @@ void bilinear_in_channels_wo_simd(const Tensor<T> &data, const T2 coeffs[], cons
constexpr int im_nb = 0;
int in_D = data.dims()[3];
const int &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3];
const int pos_table[4][2] = { y_ori_top, x_ori_left, y_ori_top, x_ori_right, y_ori_bottom, x_ori_left, y_ori_bottom, x_ori_right };
const int pos_table[4][2] = { {y_ori_top, x_ori_left}, {y_ori_top, x_ori_right}, {y_ori_bottom, x_ori_left}, {y_ori_bottom, x_ori_right} };
static std::vector<T2> temp_buffer;
temp_buffer.resize(in_D);
......@@ -142,7 +142,7 @@ inline void bilinear_in_channels_simd256(const Tensor<float> &data, const float
int in_D = data.dims()[3];
assert(in_D % 8 == 0); // Should be used with mod8 data.
const int &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3];
const int pos_table[4][2] = { y_ori_top, x_ori_left, y_ori_top, x_ori_right, y_ori_bottom, x_ori_left, y_ori_bottom, x_ori_right };
const int pos_table[4][2] = { {y_ori_top, x_ori_left}, {y_ori_top, x_ori_right}, {y_ori_bottom, x_ori_left}, {y_ori_bottom, x_ori_right} };
static std::vector<float> temp_buffer;
temp_buffer.resize(in_D);
......@@ -182,7 +182,7 @@ inline void bilinear_in_channels_simd256(const Tensor<int16_t> &data, const int3
using T = int16_t;
#endif
const int &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3];
const int pos_table[4][2] = { y_ori_top, x_ori_left, y_ori_top, x_ori_right, y_ori_bottom, x_ori_left, y_ori_bottom, x_ori_right };
const int pos_table[4][2] = { {y_ori_top, x_ori_left}, {y_ori_top, x_ori_right}, {y_ori_bottom, x_ori_left}, {y_ori_bottom, x_ori_right} };
static std::vector<int32_t> temp_buffer;
temp_buffer.resize(in_D);
......@@ -227,7 +227,7 @@ inline void bilinear_in_channels_simd512(const Tensor<float> &data, const float
int in_D = data.dims()[3];
assert(in_D % 16 == 0); // Should be used with mod16 data.
const int &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3];
const int pos_table[4][2] = { y_ori_top, x_ori_left, y_ori_top, x_ori_right, y_ori_bottom, x_ori_left, y_ori_bottom, x_ori_right };
const int pos_table[4][2] = { {y_ori_top, x_ori_left}, {y_ori_top, x_ori_right}, {y_ori_bottom, x_ori_left}, {y_ori_bottom, x_ori_right} };
static std::vector<float> temp_buffer;
temp_buffer.resize(in_D);
......@@ -269,7 +269,7 @@ inline void bilinear_in_channels_simd512(const Tensor<int16_t> &data, const int3
using T = int16_t;
#endif
const int &x_ori_left = pos[0], &y_ori_top = pos[1], &x_ori_right = pos[2], &y_ori_bottom = pos[3];
const int pos_table[4][2] = { y_ori_top, x_ori_left, y_ori_top, x_ori_right, y_ori_bottom, x_ori_left, y_ori_bottom, x_ori_right };
const int pos_table[4][2] = { {y_ori_top, x_ori_left}, {y_ori_top, x_ori_right}, {y_ori_bottom, x_ori_left}, {y_ori_bottom, x_ori_right }};
static std::vector<int32_t> temp_buffer;
temp_buffer.resize(in_D);
......
......@@ -74,7 +74,8 @@ struct OperationType
Resize = 24,
Compare = 25,
Where = 26,
OperationTypeCount = 27
Minimum = 27,
OperationTypeCount = 28
};
};
......
......@@ -40,7 +40,8 @@ namespace layers
enum class Compare_mode
{
LessThan,
GreaterThan
GreaterThan,
EqualTo
};
template<typename T> class Compare : public Layer<T>
{
......@@ -56,6 +57,7 @@ protected:
virtual bool loadInternal(std::istream &file, Version) override;
bool apply_less(std::vector<Tensor<T> *> &in);
bool apply_greater(std::vector<Tensor<T> *> &in);
bool apply_equal_to(std::vector<Tensor<T> *> &in);
Compare_mode m_mode;
DUMP_MODEL_EXT;
......@@ -70,6 +72,8 @@ template<typename T> bool Compare<T>::apply(std::vector<Tensor<T> *> &in)
return apply_less(in);
else if(m_mode == Compare_mode::GreaterThan)
return apply_greater(in);
else if(m_mode == Compare_mode::EqualTo)
return apply_equal_to(in);
else return false;
}
......@@ -151,6 +155,55 @@ template<typename T> bool Compare<T>::apply_greater(std::vector<Tensor<T> *> &in
return true;
}
// Element-wise equality compare: m_out[i] = (in[0][i] == in[1][i]) as a 0/1 value.
// Both operands are first brought to a common fixed-point scale by left-shifting
// the operand with the smaller quantizer; the output is an unquantized boolean tensor.
template<typename T> bool Compare<T>::apply_equal_to(std::vector<Tensor<T> *> &in)
{
#if DEBUG_MODEL
// Exact equality on floats is fragile; warn once in debug builds.
if constexpr (std::is_same<T, float>::value)
{
static bool once = true;
if (once)
{
std::cout << "[WARNING] using equal layer with float: unexpected results can occur" << std::endl;
}
once = false;
}
#endif
const Tensor<T> &A = *in[0];
const Tensor<T> &B = *in[1];
const int &A_q = A.quantizer;
const int &B_q = B.quantizer;
// Shift whichever side has the smaller quantizer up to the larger one (never negative).
const int A_shift = std::max(0, B_q - A_q);
const int B_shift = std::max(0, A_q - B_q);
m_out.quantizer = 0;// bool tensor
if(B.dims().size() == 1)
{
// NOTE(review): this branch always compares against B[0], i.e. it treats a 1-D B as a
// scalar broadcast — confirm that init() only admits B.size() == 1 when B is 1-D.
for (int i = 0; i < m_out.size(); i++)
{
T A_i = A[i];
T B_i = B[0];
ComputationType<T>::shift_left(A_i, A_shift);//quantization
ComputationType<T>::shift_left(B_i, B_shift);//quantization
T z = A_i == B_i;
COUNTERS(z);
m_out[i] = z;
}
}
else
{
// Same-shape operands: straight element-wise comparison.
for (int i = 0; i < m_out.size(); i++)
{
T A_i = A[i];
T B_i = B[i];
ComputationType<T>::shift_left(A_i, A_shift);//quantization
ComputationType<T>::shift_left(B_i, B_shift);//quantization
T z = A_i == B_i;
COUNTERS(z);
m_out[i] = z;
}
}
return true;
}
template<typename T> bool Compare<T>::init(const std::vector<Tensor<T> *> &in)
{
if (in.size() != 2)
......@@ -168,6 +221,8 @@ template<typename T> bool Compare<T>::loadInternal(std::istream &file, Version)
m_mode = Compare_mode::LessThan;
else if(x == (int32_t) Compare_mode::GreaterThan)
m_mode = Compare_mode::GreaterThan;
else if(x == (int32_t) Compare_mode::EqualTo)
m_mode = Compare_mode::EqualTo;
else
{
std::cerr << "[ERROR] invalid mode: " << x << std::endl;
......
......@@ -73,8 +73,10 @@ template<typename T> bool Concat<T>::apply(std::vector<Tensor<T> *> &in)
}
m_out.quantizer = qmin; // adapt output width to last input
m_out.border_skip = in[0]->border_skip;
for (int i = 1; i < nb_in; ++i)
m_out.border_skip = std::max(m_out.border_skip, in[i]->border_skip);
for (int i = 1; i < nb_in; ++i) {
m_out.border_skip.first = std::max(m_out.border_skip.first, in[i]->border_skip.first);
m_out.border_skip.second = std::max(m_out.border_skip.second, in[i]->border_skip.second);
}
const Dimensions dim = in[0]->dims();
if (dim.size() == 2)
......
......@@ -158,11 +158,11 @@ template<typename T> bool Conv2D<T>::apply(std::vector<Tensor<T> *> &in)
{
return apply_s<1, 1>(A, kernel);
}
else if (m_strides[1] == 1 && m_strides[2] == 2 && m_groups == 1)
else if (m_strides[1] == 1 && m_strides[2] == 2)
{
return apply_s<1, 2>(A, kernel);
}
else if ((m_strides[1] == 2 && m_strides[2] == 1) && m_groups == 1)
else if (m_strides[1] == 2 && m_strides[2] == 1)
{
return apply_s<2, 1>(A, kernel);
}
......@@ -209,11 +209,12 @@ template<typename T> template<int s_h, int s_w> bool Conv2D<T>::apply_s(const Te
{ // skip border
if (s_h == 1 && s_w == 1)
{
start_h += m_out.border_skip;
start_w += m_out.border_skip;
in_H -= m_out.border_skip;
in_W -= m_out.border_skip;
m_out.border_skip++;
start_h += m_out.border_skip.first;
start_w += m_out.border_skip.second;
in_H -= m_out.border_skip.first;
in_W -= m_out.border_skip.second;
m_out.border_skip.first++;
m_out.border_skip.second++;
}
}
conv2d_3x3_s_core_dispatch<s_h, s_w>(A, kernel);
......
......@@ -113,7 +113,7 @@ template<typename T> bool Expand<T>::init(const std::vector<Tensor<T> *> &in)
std::cerr << "[ERROR] quantizer on reshape dimensions data layer" << std::endl;
return false;
}
copy(in[1]->begin(), in[1]->end(), dim.begin());
for(int64_t i=0;i<in[1]->size();i++) dim[i]=(int)((*in[1])[i]);
// current restriction: broadcast only scalar to shape or expand last channel =1 of a tensor of dim 4
bool ok = false;
if (in[0]->size() == 1)
......
......@@ -55,6 +55,8 @@ protected:
virtual bool loadInternal(std::istream &file, Version v) override;
template<int NN> bool apply_dim2(std::vector<Tensor<T> *> &in);
template<int NN> bool apply_dim3(std::vector<Tensor<T> *> &in);
template<int NN> bool apply_dim4(std::vector<Tensor<T> *> &in);
#if __AVX2__
bool apply_dim2_simd8(std::vector<Tensor<T> *> &in) { return apply_dim2<8>(in); }
bool apply_dim2_simd16(std::vector<Tensor<T> *> &in) { return apply_dim2_simd8(in); }
......@@ -127,6 +129,9 @@ template<typename T> bool MatMul<T>::apply(std::vector<Tensor<T> *> &in)
case 3:
return apply_dim3<1>(in);
break;
case 4:
return apply_dim4<1>(in);
break;
default:
std::cerr << "Logical error MatMul::apply(std::vector<Tensor<T> *> &in)" << A.dims() << ' ' << B.dims() << std::endl;
return false;
......@@ -348,6 +353,48 @@ template<typename T> template<int NN> bool MatMul<T>::apply_dim3(std::vector<Ten
}
return true;
}
// Generic (non-SIMD) batched matrix multiply for 4-D tensors:
// m_out(0,b,i,t) = sum_j A(0,b,i,j) * B(0,b,j,t), requantized by `shift`.
// Assumes the leading dimension of both tensors is 1 (A: [1,N,H,W], B: [1,N,W,R])
// — indexing below hard-codes index 0 for that axis; init() is expected to
// enforce this layout (TODO confirm against MatMul<T>::init).
// NN is the SIMD width granule; in this generic path it only rounds W down.
template<typename T> template<int NN> bool MatMul<T>::apply_dim4(std::vector<Tensor<T> *> &in)
{
const Tensor<T> &A{ *in[0] };
const Tensor<T> &B{ *in[1] };
// Requantization amount: weight quantizer plus the layer's internal shift m_q.
int shift{ in[1]->quantizer + m_q };
const int last = A.dims().size() - 1;
const int N{ A.dims()[last - 2] };   // batch count (axis 1 of a 4-D tensor)
const int H{ A.dims()[last - 1] };   // rows of each A matrix
const int R{ B.dims().back() };      // columns of each B matrix
const int W{ (A.dims()[last] / NN) * NN };   // contraction length, rounded down to a multiple of NN
(void) W;
#if __AVX2__ && DEBUG_SIMD
std::cout << "\n[WARN] generic version matmul dim4 " << A.dims() << ' ' << B.dims() << "(H=" << H << ") " << (N * R * H * W) / 1000 << " kMAC" << std::endl;
#endif // SIMD
constexpr int idx_start{ 0 };
const int idx_end{ W };
for (int b = 0; b < N; ++b)
{
for (int i = 0; i < H; ++i)
{
for (int t = 0; t < R; ++t)
{
// Accumulate in the wider ComputationType to limit overflow of T.
typename ComputationType<T>::type x = 0;
{
for (int j = idx_start; j < idx_end; ++j)
{
x += (typename ComputationType<T>::type) A(0, b, i, j) * B(0, b, j, t);
COUNTERS_MAC(B(0, b, j, t));
}
}
ComputationType<T>::quantize(x, shift);
COUNTERS(x);
SATURATE(x);
m_out(0, b, i, t) = (T) x;
}
}
}
return true;
}
#if SPARSE_SUPPORT
template<typename T> bool MatMul<T>::apply_sparse_matmul(std::vector<Tensor<T> *> &in)
......@@ -558,7 +605,7 @@ template<typename T> bool MatMul<T>::init(const std::vector<Tensor<T> *> &in)
// B: const (because assumed transposed)
// 1- A [x,y] B[y,z] || A [x,y,z] B[x,z,t] || A [1,x,y,z] B[1,x,z,t]
// 2- A [1,x,y] B[y,z] || A [1,x,y,z] B[x,z,t]
if (in[1]->dims().size() < 2 || in[1]->dims().size() > 3)
if (in[1]->dims().size() < 2 || in[1]->dims().size() > 4)
{
return false;
}
......
......@@ -72,35 +72,10 @@ template<typename T> bool MaxPool<T>::apply(std::vector<Tensor<T> *> &in)
const int offset_end = m_kernel[1] / 2;
const int offset_start = m_kernel[1] - 1 - offset_end;
const int step = m_strides[1];
const int in_H = in[0]->dims()[1];
// currently adhoc start
int start = 0;
if (step == 1)
{
start = 0;
}
else if (step == 2)
{
// if (in_H % 2 == 0)
// start = 1;
// else
start = 0;
}
else if (step == 3)
{
if (in_H % 2 == 0)
start = 0;
else
start = 1;
}
else
{
std::cerr << "[ERROR] to do" << std::endl;
assert(false);
exit(-1);
}
int start = offset_start;
m_out.quantizer = in[0]->quantizer; // adapt output width to bias
m_out.border_skip = in[0]->border_skip; // to check
......
/* The copyright in this software is being made available under the BSD
* License, included below. This software may be subject to other third party
* and contributor rights, including patent rights, and no such rights are
* granted under this license.
*
* Copyright (c) 2010-2024, ITU/ISO/IEC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "layer.h"
namespace sadl
{
namespace layers
{
// Element-wise minimum layer (ONNX "Min" with two inputs).
// Computes min(in[0], in[1]); in[1] may be broadcast when it is a singleton or a
// vector matching the last dimension of in[0] (see init()). The layer mutates its
// first input (mutateInput() returns true): apply() swaps in[0] into m_out.
template<typename T> class Minimum : public Layer<T>
{
public:
using Layer<T>::Layer;
using Layer<T>::m_out; // to avoid this->
using Layer<T>::m_initDone;
// Compute m_out = min(in[0], in[1]) with broadcasting of in[1].
virtual bool apply(std::vector<Tensor<T> *> &in) override;
// Validate input shapes and size m_out to in[0]'s dims.
virtual bool init(const std::vector<Tensor<T> *> &in) override;
// The first input is consumed (swapped into m_out) by apply().
virtual bool mutateInput() const override { return true; }
protected:
// Minimum carries no serialized parameters; reads nothing from the stream.
virtual bool loadInternal(std::istream &file, Version) override;
};
// Element-wise minimum: m_out = min(in[0], in[1]), with in[1] broadcast when it is
// a singleton or a per-last-dimension vector. Mutates in[0] (see mutateInput()):
// the first input tensor is swapped into m_out and updated in place.
template<typename T> bool Minimum<T>::apply(std::vector<Tensor<T> *> &in)
{
assert(in.size() == 2);
if (in[0] == in[1])
{
// Aliased inputs would be clobbered by the swap below.
std::cerr << " input aliasing" << std::endl;
return false;
}
// Align in[1]'s fixed-point scale to in[0]'s: shift = in[0]->quantizer - in[1]->quantizer.
// NOTE(review): shift can be negative; assumes ComputationType<T>::shift_left treats a
// negative amount as a right shift — confirm against its definition.
const int shift = -(in[1]->quantizer - in[0]->quantizer);
swap(*in[0], m_out);
/*
m_out was resized to in[0]'s dims in init(), so after the swap in[0]->dims()
still equals the original first-input dims. If the condition below is false,
init() guarantees in[1] is a singleton or a broadcastable vector (1-D, or
2-D with a leading dimension of 1).
*/
if (in[0]->dims() == in[1]->dims())
{
// Same shape: straight element-wise minimum.
for (auto it0 = m_out.begin(), it1 = in[1]->begin(); it0 != m_out.end(); ++it0, ++it1)
{
T z = *it1;
ComputationType<T>::shift_left(z, shift);
*it0 = std::min(*it0, z);
}
}
else
{
const Tensor<T> &B{ *in[1] };
if (B.size() == 1)
{
// Scalar broadcast: requantize once, compare against every element.
T value{ B[0] };
ComputationType<T>::shift_left(value, shift);
for (auto it0 = m_out.begin(); it0 != m_out.end(); ++it0)
{
*it0 = std::min(*it0, value);
}
}
else if (in[0]->dims().size() == 2)
{
// Broadcast B along the last dimension of a 2-D tensor.
const int N{ in[0]->dims()[0] };
const int H{ in[0]->dims()[1] };
for (int n = 0; n < N; ++n)
for (int i = 0; i < H; ++i)
{
T z = B[i];
ComputationType<T>::shift_left(z, shift);
m_out(n, i) = std::min(m_out(n, i), z);
}
}
else if (in[0]->dims().size() == 3)
{
// Broadcast B along the last dimension of a 3-D tensor.
const int N{ in[0]->dims()[0] };
const int H{ in[0]->dims()[1] };
const int W{ in[0]->dims()[2] };
for (int n = 0; n < N; ++n)
for (int i = 0; i < H; ++i)
for (int j = 0; j < W; ++j)
{
T z = B[j];
ComputationType<T>::shift_left(z, shift);
m_out(n, i, j) = std::min(m_out(n, i, j), z);
}
}
else if (in[0]->dims().size() == 4)
{
// Broadcast B along the last dimension of a 4-D tensor.
const int N{ in[0]->dims()[0] };
const int H{ in[0]->dims()[1] };
const int W{ in[0]->dims()[2] };
const int K{ in[0]->dims()[3] };
for (int n = 0; n < N; ++n)
for (int i = 0; i < H; ++i)
for (int j = 0; j < W; ++j)
for (int k = 0; k < K; ++k)
{
T z = B[k];
ComputationType<T>::shift_left(z, shift);
m_out(n, i, j, k) = std::min(m_out(n, i, j, k), z);
}
}
}
return true;
}
// Validate the two inputs and size the output tensor.
// Accepted shapes for in[1]: same dims as in[0], a singleton, or a vector
// (1-D, or 2-D with a leading 1) whose length equals in[0]'s last dimension.
template<typename T> bool Minimum<T>::init(const std::vector<Tensor<T> *> &in)
{
SADL_DBG(std::cout << " - " << in[0]->dims() << ' ' << in[1]->dims() << std::endl);
if (in.size() != 2)
{
return false;
}
const bool is_singleton = (in[1]->size() == 1);
const bool is_vector_like =
  in[1]->dims().size() == 1 || (in[1]->dims().size() == 2 && in[1]->dims()[0] == 1);
if (!is_singleton)
{
  if (is_vector_like)
  {
    // Broadcast over the last dimension: lengths must agree.
    if (in[1]->size() != in[0]->dims().back())
    {
      return false;
    }
  }
  else if (!(in[0]->dims() == in[1]->dims()))
  {
    // Otherwise the shapes must match exactly.
    return false;
  }
}
m_out.resize(in[0]->dims());
m_initDone = true;
return true;
}
// Minimum has no extra serialized parameters: nothing to read from the stream.
template<typename T> bool Minimum<T>::loadInternal(std::istream &, Version)
{
  return true;
}
} // namespace layers
} // namespace sadl
......@@ -66,7 +66,7 @@ template<typename T> bool Placeholder<T>::apply(std::vector<Tensor<T> *> &in)
{ // v2
m_out.quantizer = m_q;
}
m_out.border_skip = 0;
m_out.border_skip = {0,0};
return true;
}
......
......@@ -193,7 +193,6 @@ template<typename T> template<bool multialpha> bool PReLU<T>::apply_scalar(std::
#if __AVX2__
template<> template<bool multialpha> inline bool PReLU<float>::apply_simd256(std::vector<Tensor<float> *> &in) // simd256 float
{
exit(-1); // to correct
Tensor<float> &A = *in[1];
swap(*in[0], m_out);
float *const data_ptr = m_out.data();
......@@ -230,8 +229,8 @@ template<> template<bool multialpha> inline bool PReLU<int16_t>::apply_simd256(s
const __m256i max = _mm256_set1_epi32(32767);
const __m256i min = _mm256_set1_epi32(-32768);
const __m256i zeros = _mm256_setzero_si256();
const int N = m_out.size();
for (int iter = 0; iter < N; iter += 16)
const auto N = m_out.size();
for (int64_t iter = 0; iter < N; iter += 16)
{
int16_t *aptr = data_ptr + iter;
auto a = _mm256_load_si256((__m256i *) aptr); // load
......@@ -294,7 +293,7 @@ template<> template<bool multialpha> inline bool PReLU<float>::apply_simd512(std
const float *const alpha_ptr = A.data();
const __m512 m_zeros = _mm512_setzero_ps();
__m512 alpha = _mm512_set1_ps(*A.data());
for (int iter = 0; iter < m_out.size(); iter += 16)
for (int64_t iter = 0; iter < m_out.size(); iter += 16)
{
if (multialpha)
alpha = _mm512_load_ps(alpha_ptr + iter % A.size());
......@@ -329,9 +328,9 @@ template<> template<bool multialpha> inline bool PReLU<int16_t>::apply_simd512(s
static constexpr int16_t data[]={0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
const auto shuffle= _mm512_loadu_si512((void *)data);
const int N = m_out.size();
const auto N = m_out.size();
for (int iter = 0; iter < N; iter += 32)
for (int64_t iter = 0; iter < N; iter += 32)
{
int16_t *aptr = data_ptr + iter;
auto a = _mm512_loadu_si512((__m512i *) aptr); // load
......
......@@ -85,7 +85,7 @@ template<typename T> bool Resize<T>::loadInternal(std::istream &file, Version v)
int32_t x = 0;
file.read((char *) &x, sizeof(x));
m_input_label = x;
SADL_DBG(std::cout << " - input_lable: " << m_input_label << std::endl);
SADL_DBG(std::cout << " - input_label: " << m_input_label << std::endl);
file.read((char *) &x, sizeof(x));
m_coordinate_transformation_mode = x;
SADL_DBG(std::cout << " - coordinate_transformation_mode: " << m_coordinate_transformation_mode << std::endl);
......@@ -132,24 +132,28 @@ template<typename T> bool Resize<T>::init(const std::vector<Tensor<T> *> &in)
int W_in = in[0]->dims()[2];
int C = in[0]->dims()[3];
// scale factor
float scale_N = 0, scale_C = 0, scale_H = 0, scale_W = 0;
int scale_N = 0, scale_C = 0, scale_H = 0, scale_W = 0;
if (m_input_label == 1) // inputs are X and sizes
{
scale_N = in[1]->data()[0] / (float)N;
scale_C = in[1]->data()[1] / (float)C;
scale_H = in[1]->data()[2] / (float)H_in;
scale_W = in[1]->data()[3] / (float)W_in;
scale_N = (int)round(in[1]->data()[0] / (float)N);
scale_C = (int)round(in[1]->data()[1] / (float)C);
scale_H = (int)round(in[1]->data()[2] / (float)H_in);
scale_W = (int)round(in[1]->data()[3] / (float)W_in);
}
else if (m_input_label == 2) // inputs are X and scales
{
scale_N = in[1]->data()[0];
scale_C = in[1]->data()[1];
scale_H = in[1]->data()[2];
scale_W = in[1]->data()[3];
scale_N = (int)round(in[1]->data()[0]);
scale_C = (int)round(in[1]->data()[1]);
scale_H = (int)round(in[1]->data()[2]);
scale_W = (int)round(in[1]->data()[3]);
} else {
std::cerr << "[ERROR] invalid type " << m_input_label<< std::endl;
return false;
}
if (scale_N != 1 || scale_H != 2 || scale_W != 2 || scale_C != 1)
{
std::cerr << "[ERROR] invalid scale factor: (" << scale_N << ", " << scale_H << ", " << scale_W << ", " << scale_C << ")" << std::endl;
std::cerr << "[ERROR] invalid scale factor: input: "<<in[0]->dims()<<" scales: "<<*in[1]<<" result=(" << scale_N << ", " << scale_H << ", " << scale_W << ", " << scale_C << ")" << std::endl;
return false;
}
scale_factors.resize(in[1]->dims());
......
......@@ -72,9 +72,9 @@ template<typename T> bool ScatterND<T>::apply(std::vector<Tensor<T> *> &in)
{
for (int c = 0; c < dim_C; c++)
{
index_H = indices(h, w, c, 1);
index_W = indices(h, w, c, 2);
index_C = indices(h, w, c, 3);
index_H = (int)indices(h, w, c, 1);
index_W = (int)indices(h, w, c, 2);
index_C = (int)indices(h, w, c, 3);
m_out(0, index_H, index_W, index_C) = updates(0, h, w, c); // n==1
}
}
......
......@@ -67,7 +67,7 @@ template<typename T> bool Shape<T>::init(const std::vector<Tensor<T> *> &in)
d.resize(1);
d[0] = in[0]->dims().size();
m_out.resize(d);
copy(in[0]->dims().begin(), in[0]->dims().end(), m_out.begin());
for(int i=0;i<in[0]->dims().size();i++) m_out[i]=(T)(in[0]->dims()[i]);
m_initDone = true;
return true;
}
......
......@@ -46,6 +46,7 @@ public:
virtual bool apply(std::vector<Tensor<T> *> &in) override;
virtual bool init(const std::vector<Tensor<T> *> &in) override;
virtual bool mutateInput() const override { return true; }
protected:
virtual bool loadInternal(std::istream &file, Version) override;
......@@ -55,22 +56,44 @@ protected:
template<typename T> bool Where<T>::apply(std::vector<Tensor<T> *> &in)
{
assert(in.size() == 3);
assert(in[0]->dims() == m_out.dims());
assert(in[0]->dims() == in[1]->dims() || (in[1]->dims().size() == 1 && in[1]->dims()[0] == 1));
assert(in[0]->dims() == in[2]->dims() || (in[2]->dims().size() == 1 && in[2]->dims()[0] == 1));
if (in[0]->size() != 1)
{
assert(in[0]->dims() == m_out.dims());
assert(in[0]->dims() == in[1]->dims() || (in[1]->dims().size() == 1 && in[1]->dims()[0] == 1));
assert(in[0]->dims() == in[2]->dims() || (in[2]->dims().size() == 1 && in[2]->dims()[0] == 1));
}
else
{
assert(in[1]->dims() == m_out.dims());
assert(in[1]->dims() == in[2]->dims());
}
const Tensor<T> &condition = *in[0];
const Tensor<T> &A = *in[1];
const Tensor<T> &B = *in[2];
m_out.quantizer = A.quantizer > B.quantizer ? A.quantizer : B.quantizer;
for (int i = 0; i < m_out.size(); i++)
if (condition.size() == 1)
{
if (condition[0])
{
swap(*in[1], m_out);
}
else
{
swap(*in[2], m_out);
}
}
else
{
const T A_i = (A.dims().size() == 1) ? A[0] : A[i];
const T B_i = (B.dims().size() == 1) ? B[0] : B[i];
typename ComputationType<T>::type z = condition[i] ? A_i : B_i;
const int z_q = condition[i] ? A.quantizer : B.quantizer ;
ComputationType<T>::shift_left(z, m_out.quantizer - z_q);
COUNTERS(z);
m_out[i] = z;
const Tensor<T> &A = *in[1];
const Tensor<T> &B = *in[2];
m_out.quantizer = A.quantizer > B.quantizer ? A.quantizer : B.quantizer;
for (int i = 0; i < m_out.size(); i++)
{
const T A_i = (A.dims().size() == 1) ? A[0] : A[i];
const T B_i = (B.dims().size() == 1) ? B[0] : B[i];
typename ComputationType<T>::type z = condition[i] ? A_i : B_i;
const int z_q = condition[i] ? A.quantizer : B.quantizer ;
ComputationType<T>::shift_left(z, m_out.quantizer - z_q);
COUNTERS(z);
m_out[i] = z;
}
}
return true;
}
......@@ -80,7 +103,10 @@ template<typename T> bool Where<T>::init(const std::vector<Tensor<T> *> &in)
{
if (in.size() != 3)
return false;
m_out.resize(in[0]->dims());//condition dims
if (in[0]->size() == 1)//condition dims
m_out.resize(in[1]->dims());
else
m_out.resize(in[0]->dims());
m_initDone = true;
return true;
}
......
......@@ -59,6 +59,7 @@
#include "layer_resize.h"
#include "layer_compare.h"
#include "layer_where.h"
#include "layer_minimum.h"
namespace sadl
{
......@@ -99,6 +100,8 @@ inline std::string opName(const OperationType::Type op)
DIRTYCASEPRINT(GridSample);
DIRTYCASEPRINT(Resize);
DIRTYCASEPRINT(Compare);
DIRTYCASEPRINT(Minimum);
DIRTYCASEPRINT(Where);
default:
oss << "??";
break;
......
......@@ -57,7 +57,7 @@ private:
std::vector<LayerData> m_data;
int32_t m_nb_inputs = 0;
static constexpr int kMaxInputByLayer = 2;
static constexpr int kMaxLayers = 2048;
static constexpr int kMaxLayers = 8192;
std::vector<typename layers::Layer<T>::Id> getLayerIdsWithInput(typename layers::Layer<T>::Id id) const;
void insertCopyLayers();
void reshapeConv2DFilters();
......@@ -75,7 +75,7 @@ public:
// aditionnal info
std::vector<Tensor<T>> getInputsTemplate() const;
const std::vector<typename layers::Layer<T>::Id> &getIdsOutput() const { return m_ids_output; }
int nbOutputs() const { return m_ids_output.size(); }
size_t nbOutputs() const { return m_ids_output.size(); }
std::vector<typename layers::Layer<T>::Id> getLayersId() const;
const LayerData & getLayer(const typename layers::Layer<T>::Id &id) const;
LayerData & getLayer(const typename layers::Layer<T>::Id &id);
......@@ -185,6 +185,9 @@ template<typename T> std::unique_ptr<layers::Layer<T>> createLayer(int32_t id, l
case layers::OperationType::Where:
return std::unique_ptr<layers::Layer<T>>(new layers::Where<T>{ id, op });
break;
case layers::OperationType::Minimum:
return std::unique_ptr<layers::Layer<T>>(new layers::Minimum<T>{ id, op });
break;
case layers::OperationType::OperationTypeCount:
break; // no default on purpose
}
......@@ -246,7 +249,7 @@ template<typename T> bool Model<T>::load(std::istream &file)
if ((std::is_same<T, float>::value && x != layers::TensorInternalType::Float) || (std::is_same<T, int32_t>::value && x != layers::TensorInternalType::Int32)
|| (std::is_same<T, int16_t>::value && x != layers::TensorInternalType::Int16))
{
std::cerr << "[ERROR] wrong model type and Model<T>" << std::endl;
std::cerr << "[ERROR] wrong model type and Model<T> " << std::endl;
return false;
}
SADL_DBG(std::cout << "[INFO] Model type: " << (int) x << std::endl);
......@@ -622,10 +625,13 @@ template<typename T> typename Model<T>::Stat Model<T>::printOverflow(bool printi
<< "]: overflow: " << m_data[layer_cnt].layer->cpt_overflow << '/' << m_data[layer_cnt].layer->cpt_op << " ("
<< m_data[layer_cnt].layer->cpt_overflow * 100. / m_data[layer_cnt].layer->cpt_op << "%)" << std::endl;
}
else if (printinfo && m_data[layer_cnt].layer->cpt_op > 0)
{
std::cout << "[INFO] layer " << m_data[layer_cnt].layer->id() << ' ' << m_data[layer_cnt].layer->name() << " [" << opName(m_data[layer_cnt].layer->op())
<< "]: " << m_data[layer_cnt].layer->cpt_op << " op" << std::endl;
else {
if (printinfo && (m_data[layer_cnt].layer->cpt_op > 0 || m_data[layer_cnt].layer->cpt_mac > 0 ) )
{
std::cout << "[INFO] layer " << m_data[layer_cnt].layer->id() << ' ' << m_data[layer_cnt].layer->name() << " [" << opName(m_data[layer_cnt].layer->op()) << "]: "
<< m_data[layer_cnt].layer->cpt_mac << " mac, "
<< m_data[layer_cnt].layer->cpt_op << " op" << std::endl;
}
}
}
#if DEBUG_COUNTERS && __AVX2__
......
......@@ -58,6 +58,7 @@ static constexpr float kSparsifySizeThreshold = 1000.0f;
// #define DEBUG_PRINT 1 // print model info
// #define DEBUG_SIMD 1 // tell about non simd version
// #define DEBUG_KEEP_OUTPUT 1 // keep a copy of the output tensor
// #define DEBUG_OVERFLOW 1 // set all accumulator to int16 to detect overflow in accumulator as well
#if SATURATE_RESULT
#define SATURATE(X) \
if (!std::is_same<T, float>::value) \
......