source: trunk/firmware_v4/Drivers/CMSIS/NN/Include/arm_nnfunctions.h

Last change on this file was 42, checked in by f.jahn, 5 days ago
File size: 127.4 KB
Line 
1/*
2 * Copyright (C) 2010-2022 Arm Limited or its affiliates.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19/* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_nnfunctions.h
22 * Description: Public header file for CMSIS NN Library
23 *
24 * $Date: 19 April 2022
25 * $Revision: V.9.0.0
26 *
27 * Target Processor: Cortex-M CPUs
28 * -------------------------------------------------------------------- */
29
30/**
31 \mainpage CMSIS NN Software Library
32 *
33 * Introduction
34 * ------------
35 *
36 * This user manual describes the CMSIS NN software library,
37 * a collection of efficient neural network kernels developed to maximize the
38 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39 *
40 * The library is divided into a number of functions each covering a specific category:
41 * - Convolution Functions
42 * - Activation Functions
43 * - Fully-connected Layer Functions
44 * - SVDF Layer Functions
45 * - Pooling Functions
46 * - Softmax Functions
47 * - Basic math Functions
48 *
49 * The library has separate functions for operating on different weight and activation data
50 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
51 * kernels is included in the function description. The implementation details are also
52 * described in this paper [1].
53 *
54 * Supported Processors
55 * -------
56 * CMSIS-NN targets Cortex-M processors with typically three different implementations for each function. Each
57 * targets a different group of processors.
58 * - Processors without SIMD capability (e.g, Cortex-M0)
59 * - Processors with DSP extension (e.g. Cortex-M4)
60 * - Processors with MVE extension (e.g. Cortex-M55)
61 * The right implementation is picked through feature flags and the user usually does not have to set it explicitly.
62 *
63 * Function Classification
64 * --------
65 * The functions can be classified into two segments
66 * - Legacy functions supporting ARM's internal symmetric quantization(8 bits).
67 * - Functions that support TensorFlow Lite framework with symmetric quantization(8 bits).
68 *
69 * The legacy functions can be identified with their suffix of _q7 or _q15, and no new development is done there.
70 * The article in [2] describes in detail how to run a network using the legacy functions.
71 *
72 * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
73 * micro. The functions are bit exact to TensorFlow Lite. Refer to the TensorFlow's documentation in [3] on how to run
74 * a TensorFlow Lite model using optimized CMSIS-NN kernels.
75 *
76 * Block Diagram
77 * --------
78 * \image html CMSIS-NN-OVERVIEW.PNG
79 *
80 * Examples
81 * --------
82 *
83 * The library ships with a number of examples which demonstrate how to use the library functions.
84 *
85 * Pre-processor Macros
86 * ------------
87 *
88 * Each library project has different pre-processor macros.
89 *
90 * - ARM_MATH_DSP:
91 *
92 * Define macro ARM_MATH_DSP, If the silicon supports DSP instructions(DSP extension).
93 *
94 * - ARM_MATH_MVEI:
95 *
96 * Define macro ARM_MATH_MVEI, If the silicon supports M-Profile Vector Extension.
97
98 * - ARM_MATH_AUTOVECTORIZE
99 * Used in conjunction with ARM_MATH_MVEI to let the compiler auto vectorize the functions that use inline
100 * assembly. It does not affect functions that use C or intrinsics.
101 * - ARM_MATH_BIG_ENDIAN:
102 *
103 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the legacy
104 * functions i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the library builds for
105 * little endian targets.
106 *
107 * - ARM_NN_TRUNCATE:
108 *
109 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
110 *
111 *
112 * Copyright Notice
113 * ------------
114 *
115 * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
116 *
117 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
118 *
119 * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
120 *
121 https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
122 * [3] https://www.tensorflow.org/lite/microcontrollers/library
123 *
124 * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
125 */
126
127/**
128 * @defgroup groupNN Neural Network Functions
129 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
130 * TensorFlow Lite framework.
131 */
132
133#ifndef _ARM_NNFUNCTIONS_H
134#define _ARM_NNFUNCTIONS_H
135
136#include "arm_nn_math_types.h"
137#include "arm_nn_types.h"
138
139#define USE_INTRINSIC
140
141//#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */
142
143#ifdef __cplusplus
144extern "C" {
145#endif
146
147/**
148 * @brief Enum for specifying activation function types
149 *
150 */
151typedef enum
152{
153 ARM_SIGMOID = 0,
154 /**< Sigmoid activation function */
155 ARM_TANH = 1,
156 /**< Tanh activation function */
157} arm_nn_activation_type;
158
159/**
160 * @defgroup NNConv Convolution Functions
161 *
162 * Collection of convolution, depthwise convolution functions and their variants.
163 *
164 * The convolution is implemented in 2 steps: im2col and GEMM
165 *
166 * im2col is a process of converting each patch of image data into
167 * a column. After im2col, the convolution is computed as matrix-matrix
168 * multiplication.
169 *
170 * To reduce the memory footprint, the im2col is performed partially.
171 * Each iteration, only a few column (i.e., patches) are generated and
172 * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
173 *
174 */
175
176/**
177 * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
178 cmsis-nn
179 * to perform the convolution.
180 *
181 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
182 arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
183 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
184 * Range of conv_params->input_offset : [-127, 128]
185 * Range of conv_params->output_offset : [-128, 127]
186 * @param[in] quant_params Per-channel quantization info.
187 * It contains the multiplier and shift values to be applied to each output channel
188 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
189 * @param[in] input_data Input (activation) data pointer. Data type: int8
190 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
191 * spatial filter dimensions
192 * @param[in] filter_data Filter data pointer. Data type: int8
193 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
194 * @param[in] bias_data Bias data pointer. Data type: int32
195 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
196 * @param[out] output_data Output data pointer. Data type: int8
197 *
198 * @return The function returns either
199 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
200 * <code>ARM_MATH_SUCCESS</code> on successful completion.
201 *
202 */
203arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
204 const cmsis_nn_conv_params *conv_params,
205 const cmsis_nn_per_channel_quant_params *quant_params,
206 const cmsis_nn_dims *input_dims,
207 const q7_t *input_data,
208 const cmsis_nn_dims *filter_dims,
209 const q7_t *filter_data,
210 const cmsis_nn_dims *bias_dims,
211 const int32_t *bias_data,
212 const cmsis_nn_dims *output_dims,
213 q7_t *output_data);
214
215/**
216 * @brief Get the required buffer size for arm_convolve_wrapper_s8
217 *
218 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
219 * Range of conv_params->input_offset : [-127, 128]
220 * Range of conv_params->output_offset : [-128, 127]
221 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN]
222 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
223 * filter dimensions
224 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
225 *
226 * @return The function returns required buffer size(bytes)
227 *
228 */
229int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
230 const cmsis_nn_dims *input_dims,
231 const cmsis_nn_dims *filter_dims,
232 const cmsis_nn_dims *output_dims);
233
234/**
235 * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
236 cmsis-nn
237 * to perform the convolution.
238 *
239 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
240 arm_convolve_wrapper_s16_get_buffer_size will return the buffer_size if required
241 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
242 * conv_params->input_offset : Not used
243 * conv_params->output_offset : Not used
244 * @param[in] quant_params Per-channel quantization info.
245 * It contains the multiplier and shift values to be applied to each output channel
246 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
247 * @param[in] input_data Input (activation) data pointer. Data type: int16
248 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
249 * spatial filter dimensions
250 * @param[in] filter_data Filter data pointer. Data type: int8
251 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
252 * @param[in] bias_data Bias data pointer. Data type: int64
253 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
254 * @param[out] output_data Output data pointer. Data type: int16
255 *
256 * @return The function returns either
257 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
258 * <code>ARM_MATH_SUCCESS</code> on successful completion.
259 *
260 */
261arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
262 const cmsis_nn_conv_params *conv_params,
263 const cmsis_nn_per_channel_quant_params *quant_params,
264 const cmsis_nn_dims *input_dims,
265 const q15_t *input_data,
266 const cmsis_nn_dims *filter_dims,
267 const q7_t *filter_data,
268 const cmsis_nn_dims *bias_dims,
269 const int64_t *bias_data,
270 const cmsis_nn_dims *output_dims,
271 q15_t *output_data);
272
273/**
274 * @brief Get the required buffer size for arm_convolve_wrapper_s16
275 *
276 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
277 * conv_params->input_offset : Not used
278 * conv_params->output_offset : Not used
279 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN]
280 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
281 * filter dimensions
282 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
283 *
284 * @return The function returns required buffer size(bytes)
285 *
286 */
287int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
288 const cmsis_nn_dims *input_dims,
289 const cmsis_nn_dims *filter_dims,
290 const cmsis_nn_dims *output_dims);
291
292/**
293 * @brief Basic s8 convolution function
294 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
295 arm_convolve_s8_get_buffer_size will return the buffer_size if required
296 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
297 * Range of conv_params->input_offset : [-127, 128]
298 * Range of conv_params->output_offset : [-128, 127]
299 * @param[in] quant_params Per-channel quantization info.
300 * It contains the multiplier and shift values to be applied to each output channel
301 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
302 * @param[in] input_data Input (activation) data pointer. Data type: int8
303 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
304 * spatial filter dimensions
305 * @param[in] filter_data Filter data pointer. Data type: int8
306 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
307 * @param[in] bias_data Optional bias data pointer. Data type: int32
308 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
309 * @param[out] output_data Output data pointer. Data type: int8
310
311 * @return The function returns <code>ARM_MATH_SUCCESS</code>
312 *
313 * @details
314 * 1. Supported framework: TensorFlow Lite micro
315 * 2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
316 * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
317 *
318 */
319arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
320 const cmsis_nn_conv_params *conv_params,
321 const cmsis_nn_per_channel_quant_params *quant_params,
322 const cmsis_nn_dims *input_dims,
323 const q7_t *input_data,
324 const cmsis_nn_dims *filter_dims,
325 const q7_t *filter_data,
326 const cmsis_nn_dims *bias_dims,
327 const int32_t *bias_data,
328 const cmsis_nn_dims *output_dims,
329 q7_t *output_data);
330
331/**
332 * @brief Get the required buffer size for s8 convolution function
333 *
334 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
335 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
336 * are the spatial filter dimensions
337 * @return The function returns required buffer size(bytes)
338 *
339 */
340int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
341
342/**
343 * @brief Basic s16 convolution function
344 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
345 arm_convolve_s16_get_buffer_size will return the buffer_size if required
346 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
347 * conv_params->input_offset : Not used
348 * conv_params->output_offset : Not used
349 * @param[in] quant_params Per-channel quantization info.
350 * It contains the multiplier and shift values to be applied to each output channel
351 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
352 * @param[in] input_data Input (activation) data pointer. Data type: int16
353 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
354 * spatial filter dimensions
355 * @param[in] filter_data Filter data pointer. Data type: int8
356 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
357 * @param[in] bias_data Optional bias data pointer. Data type: int64
358 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
359 * @param[out] output_data Output data pointer. Data type: int16
360
361 * @return The function returns <code>ARM_MATH_SUCCESS</code>
362 *
363 * @details
364 * 1. Supported framework: TensorFlow Lite micro
365 * 2. q7/q15 is used as data type even though it is s8/s16 data. It is done so to be consistent with existing APIs.
366 * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
367 *
368 */
369arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
370 const cmsis_nn_conv_params *conv_params,
371 const cmsis_nn_per_channel_quant_params *quant_params,
372 const cmsis_nn_dims *input_dims,
373 const q15_t *input_data,
374 const cmsis_nn_dims *filter_dims,
375 const q7_t *filter_data,
376 const cmsis_nn_dims *bias_dims,
377 const int64_t *bias_data,
378 const cmsis_nn_dims *output_dims,
379 q15_t *output_data);
380/**
381 * @brief Optimized s16 convolution function
382 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
383 arm_convolve_fast_s16_get_buffer_size will return the buffer_size if required
384 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
385 * conv_params->input_offset : Not used
386 * conv_params->output_offset : Not used
387 * @param[in] quant_params Per-channel quantization info.
388 * It contains the multiplier and shift values to be applied to each output channel
389 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
390 * @param[in] input_data Input (activation) data pointer. Data type: int16
391 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
392 * spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must not
393 exceed 512
394 * @param[in] filter_data Filter data pointer. Data type: int8
395 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
396 * @param[in] bias_data Optional bias data pointer. Data type: int64
397 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
398 * @param[out] output_data Output data pointer. Data type: int16
399
400 * @return The function returns <code>ARM_MATH_SUCCESS</code>
401 *
402 * @details
403 * 1. Supported framework: TensorFlow Lite micro
404 * 2. q7/q15 is used as data type even though it is s8/s16 data. It is done so to be consistent with existing APIs.
405 * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
406 * 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
407 *
408 */
409
410arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
411 const cmsis_nn_conv_params *conv_params,
412 const cmsis_nn_per_channel_quant_params *quant_params,
413 const cmsis_nn_dims *input_dims,
414 const q15_t *input_data,
415 const cmsis_nn_dims *filter_dims,
416 const q7_t *filter_data,
417 const cmsis_nn_dims *bias_dims,
418 const int64_t *bias_data,
419 const cmsis_nn_dims *output_dims,
420 q15_t *output_data);
421
422/**
423 * @brief Get the required buffer size for s16 convolution function
424 *
425 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
426 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
427 * are the spatial filter dimensions
428 * @return The function returns required buffer size(bytes)
429 *
430 */
431int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
432
433/**
434 * @brief Get the required buffer size for fast s16 convolution function
435 *
436 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
437 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
438 * are the spatial filter dimensions
439 * @return The function returns required buffer size(bytes)
440 *
441 */
442int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
443
444/**
445 * @brief Basic Q7 convolution function
446 * @param[in] Im_in pointer to input tensor
447 * @param[in] dim_im_in input tensor dimension
448 * @param[in] ch_im_in number of input tensor channels
449 * @param[in] wt pointer to kernel weights
450 * @param[in] ch_im_out number of filters, i.e., output tensor channels
451 * @param[in] dim_kernel filter kernel size
452 * @param[in] padding padding sizes
453 * @param[in] stride convolution stride
454 * @param[in] bias pointer to bias
455 * @param[in] bias_shift amount of left-shift for bias
456 * @param[in] out_shift amount of right-shift for output
457 * @param[in,out] Im_out pointer to output tensor
458 * @param[in] dim_im_out output tensor dimension
459 * @param[in,out] bufferA pointer to buffer space for input
460 * @param[in,out] bufferB pointer to buffer space for output
461 * @return The function returns <code>ARM_MATH_SUCCESS</code>
462 *
463 */
464arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
465 const uint16_t dim_im_in,
466 const uint16_t ch_im_in,
467 const q7_t *wt,
468 const uint16_t ch_im_out,
469 const uint16_t dim_kernel,
470 const uint16_t padding,
471 const uint16_t stride,
472 const q7_t *bias,
473 const uint16_t bias_shift,
474 const uint16_t out_shift,
475 q7_t *Im_out,
476 const uint16_t dim_im_out,
477 q15_t *bufferA,
478 q7_t *bufferB);
479
480/**
481 * @brief Basic Q7 convolution function (non-square shape)
482 * @param[in] Im_in pointer to input tensor
483 * @param[in] dim_im_in_x input tensor dimension x
484 * @param[in] dim_im_in_y input tensor dimension y
485 * @param[in] ch_im_in number of input tensor channels
486 * @param[in] wt pointer to kernel weights
487 * @param[in] ch_im_out number of filters, i.e., output tensor channels
488 * @param[in] dim_kernel_x filter kernel size x
489 * @param[in] dim_kernel_y filter kernel size y
490 * @param[in] padding_x padding size x
491 * @param[in] padding_y padding size y
492 * @param[in] stride_x convolution stride x
493 * @param[in] stride_y convolution stride y
494 * @param[in] bias pointer to bias
495 * @param[in] bias_shift amount of left-shift for bias
496 * @param[in] out_shift amount of right-shift for output
497 * @param[in,out] Im_out pointer to output tensor
498 * @param[in] dim_im_out_x output tensor dimension x
499 * @param[in] dim_im_out_y output tensor dimension y
500 * @param[in,out] bufferA pointer to buffer space for input
501 * @param[in,out] bufferB pointer to buffer space for output
502 * @return The function returns <code>ARM_MATH_SUCCESS</code>
503 */
504arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
505 const uint16_t dim_im_in_x,
506 const uint16_t dim_im_in_y,
507 const uint16_t ch_im_in,
508 const q7_t *wt,
509 const uint16_t ch_im_out,
510 const uint16_t dim_kernel_x,
511 const uint16_t dim_kernel_y,
512 const uint16_t padding_x,
513 const uint16_t padding_y,
514 const uint16_t stride_x,
515 const uint16_t stride_y,
516 const q7_t *bias,
517 const uint16_t bias_shift,
518 const uint16_t out_shift,
519 q7_t *Im_out,
520 const uint16_t dim_im_out_x,
521 const uint16_t dim_im_out_y,
522 q15_t *bufferA,
523 q7_t *bufferB);
524
525/**
526 * @brief Basic Q15 convolution function
527 * @param[in] Im_in pointer to input tensor
528 * @param[in] dim_im_in input tensor dimension
529 * @param[in] ch_im_in number of input tensor channels
530 * @param[in] wt pointer to kernel weights
531 * @param[in] ch_im_out number of filters, i.e., output tensor channels
532 * @param[in] dim_kernel filter kernel size
533 * @param[in] padding padding sizes
534 * @param[in] stride convolution stride
535 * @param[in] bias pointer to bias
536 * @param[in] bias_shift amount of left-shift for bias
537 * @param[in] out_shift amount of right-shift for output
538 * @param[in,out] Im_out pointer to output tensor
539 * @param[in] dim_im_out output tensor dimension
540 * @param[in,out] bufferA pointer to buffer space for input
541 * @param[in,out] bufferB pointer to buffer space for output
542 * @return The function returns <code>ARM_MATH_SUCCESS</code>
543 *
544 */
545arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
546 const uint16_t dim_im_in,
547 const uint16_t ch_im_in,
548 const q15_t *wt,
549 const uint16_t ch_im_out,
550 const uint16_t dim_kernel,
551 const uint16_t padding,
552 const uint16_t stride,
553 const q15_t *bias,
554 const uint16_t bias_shift,
555 const uint16_t out_shift,
556 q15_t *Im_out,
557 const uint16_t dim_im_out,
558 q15_t *bufferA,
559 q7_t *bufferB);
560
561/**
562 * @brief Fast Q7 convolution function
563 * @param[in] Im_in pointer to input tensor
564 * @param[in] dim_im_in input tensor dimension
565 * @param[in] ch_im_in number of input tensor channels
566 * @param[in] wt pointer to kernel weights
567 * @param[in] ch_im_out number of filters, i.e., output tensor channels
568 * @param[in] dim_kernel filter kernel size
569 * @param[in] padding padding sizes
570 * @param[in] stride convolution stride
571 * @param[in] bias pointer to bias
572 * @param[in] bias_shift amount of left-shift for bias
573 * @param[in] out_shift amount of right-shift for output
574 * @param[in,out] Im_out pointer to output tensor
575 * @param[in] dim_im_out output tensor dimension
576 * @param[in,out] bufferA pointer to buffer space for input
577 * @param[in,out] bufferB pointer to buffer space for output
578 * @return The function returns either
579 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
580 *
581 * This function is the version with full list of optimization tricks, but with
582 * some constraints:
583 * ch_im_in is multiple of 4
584 * ch_im_out is multiple of 2
585 */
586arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
587 const uint16_t dim_im_in,
588 const uint16_t ch_im_in,
589 const q7_t *wt,
590 const uint16_t ch_im_out,
591 const uint16_t dim_kernel,
592 const uint16_t padding,
593 const uint16_t stride,
594 const q7_t *bias,
595 const uint16_t bias_shift,
596 const uint16_t out_shift,
597 q7_t *Im_out,
598 const uint16_t dim_im_out,
599 q15_t *bufferA,
600 q7_t *bufferB);
601
602/**
603 * @brief Fast Q7 convolution function (non-square shape)
604 * @param[in] Im_in pointer to input tensor
605 * @param[in] dim_im_in_x input tensor dimension x
606 * @param[in] dim_im_in_y input tensor dimension y
607 * @param[in] ch_im_in number of input tensor channels
608 * @param[in] wt pointer to kernel weights
609 * @param[in] ch_im_out number of filters, i.e., output tensor channels
610 * @param[in] dim_kernel_x filter kernel size x
611 * @param[in] dim_kernel_y filter kernel size y
612 * @param[in] padding_x padding size x
613 * @param[in] padding_y padding size y
614 * @param[in] stride_x convolution stride x
615 * @param[in] stride_y convolution stride y
616 * @param[in] bias pointer to bias
617 * @param[in] bias_shift amount of left-shift for bias
618 * @param[in] out_shift amount of right-shift for output
619 * @param[in,out] Im_out pointer to output tensor
620 * @param[in] dim_im_out_x output tensor dimension x
621 * @param[in] dim_im_out_y output tensor dimension y
622 * @param[in,out] bufferA pointer to buffer space for input
623 * @param[in,out] bufferB pointer to buffer space for output
624 * @return The function returns either
625 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
626 *
627 * This function is the version with full list of optimization tricks, but with
628 * some constraints:
629 * ch_im_in is multiple of 4
630 * ch_im_out is multiple of 2
631 */
632
633arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
634 const uint16_t dim_im_in_x,
635 const uint16_t dim_im_in_y,
636 const uint16_t ch_im_in,
637 const q7_t *wt,
638 const uint16_t ch_im_out,
639 const uint16_t dim_kernel_x,
640 const uint16_t dim_kernel_y,
641 const uint16_t padding_x,
642 const uint16_t padding_y,
643 const uint16_t stride_x,
644 const uint16_t stride_y,
645 const q7_t *bias,
646 const uint16_t bias_shift,
647 const uint16_t out_shift,
648 q7_t *Im_out,
649 const uint16_t dim_im_out_x,
650 const uint16_t dim_im_out_y,
651 q15_t *bufferA,
652 q7_t *bufferB);
653
654/**
655 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
656 * @param[in] Im_in pointer to input tensor
657 * @param[in] dim_im_in_x input tensor dimension x
658 * @param[in] dim_im_in_y input tensor dimension y
659 * @param[in] ch_im_in number of input tensor channels
660 * @param[in] wt pointer to kernel weights
661 * @param[in] ch_im_out number of filters, i.e., output tensor channels
662 * @param[in] dim_kernel_x filter kernel size x
663 * @param[in] dim_kernel_y filter kernel size y
664 * @param[in] padding_x padding size x
665 * @param[in] padding_y padding size y
666 * @param[in] stride_x convolution stride x
667 * @param[in] stride_y convolution stride y
668 * @param[in] bias pointer to bias
669 * @param[in] bias_shift amount of left-shift for bias
670 * @param[in] out_shift amount of right-shift for output
671 * @param[in,out] Im_out pointer to output tensor
672 * @param[in] dim_im_out_x output tensor dimension x
673 * @param[in] dim_im_out_y output tensor dimension y
674 * @param[in,out] bufferA pointer to buffer space for input
675 * @param[in,out] bufferB pointer to buffer space for output
676 * @return The function returns either
677 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
678 * <code>ARM_MATH_SUCCESS</code> on successful completion.
679 *
680 * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
681 * and dim_kernel_y=1). It can be used for
682 * second half of MobileNets after depthwise separable convolution.
683 *
684 * This function is the version with full list of optimization tricks, but with
685 * some constraints:
686 * ch_im_in is multiple of 4
687 * ch_im_out is multiple of 2
688 */
689arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
690 const uint16_t dim_im_in_x,
691 const uint16_t dim_im_in_y,
692 const uint16_t ch_im_in,
693 const q7_t *wt,
694 const uint16_t ch_im_out,
695 const uint16_t dim_kernel_x,
696 const uint16_t dim_kernel_y,
697 const uint16_t padding_x,
698 const uint16_t padding_y,
699 const uint16_t stride_x,
700 const uint16_t stride_y,
701 const q7_t *bias,
702 const uint16_t bias_shift,
703 const uint16_t out_shift,
704 q7_t *Im_out,
705 const uint16_t dim_im_out_x,
706 const uint16_t dim_im_out_y,
707 q15_t *bufferA,
708 q7_t *bufferB);
709
710/**
711 * @brief Fast s8 version for 1x1 convolution (non-square shape)
712 *
713 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
714 arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required
715 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
716 * Range of conv_params->input_offset : [-127, 128]
717 * Range of conv_params->output_offset : [-128, 127]
718 * @param[in] quant_params Per-channel quantization info.
719 * It contains the multiplier and shift values to be applied to each output channel
720 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
721 * @param[in] input_data Input (activation) data pointer. Data type: int8
722 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
723 * @param[in] filter_data Filter data pointer. Data type: int8
724 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
725 * @param[in] bias_data Optional bias data pointer. Data type: int32
726 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
727 * @param[out] output_data Output data pointer. Data type: int8
728 *
729 * @return The function returns either
730 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
731 * <code>ARM_MATH_SUCCESS</code> on successful completion.
732 *
733 * @details
734 * - Supported framework : TensorFlow Lite Micro
735 * - The following constrains on the arguments apply
736 * -# input_dims->c is a multiple of 4
737 * -# conv_params->padding.w = conv_params->padding.h = 0
738 * -# conv_params->stride.w = conv_params->stride.h = 1
739 *
740 */
741arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
742 const cmsis_nn_conv_params *conv_params,
743 const cmsis_nn_per_channel_quant_params *quant_params,
744 const cmsis_nn_dims *input_dims,
745 const q7_t *input_data,
746 const cmsis_nn_dims *filter_dims,
747 const q7_t *filter_data,
748 const cmsis_nn_dims *bias_dims,
749 const int32_t *bias_data,
750 const cmsis_nn_dims *output_dims,
751 q7_t *output_data);
752
753/**
754 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
755 *
756 * @param[in] input_dims Input (activation) dimensions
757 * @return The function returns the required buffer size in bytes
758 *
759 */
760int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
761
762/**
763 * @brief 1xn convolution
764 *
765 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
766 arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
767 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
768 * Range of conv_params->input_offset : [-127, 128]
769 * Range of conv_params->output_offset : [-128, 127]
770 * @param[in] quant_params Per-channel quantization info.
771 * It contains the multiplier and shift values to be applied to each output channel
772 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
773 * @param[in] input_data Input (activation) data pointer. Data type: int8
774 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
775 * spatial filter dimension
776 * @param[in] filter_data Filter data pointer. Data type: int8
777 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
778 * @param[in] bias_data Optional bias data pointer. Data type: int32
779 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
780 * @param[out] output_data Output data pointer. Data type: int8
781 *
782 * @return The function returns either
783 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
784 * <code>ARM_MATH_SUCCESS</code> on successful completion.
785 *
786 * @details
787 * - Supported framework : TensorFlow Lite Micro
788 * - The following constrains on the arguments apply
789 * -# input_dims->n equals 1
790 * -# ouput_dims->w is a multiple of 4
791 * -# Explicit constraints(since it is for 1xN convolution)
792 * -## input_dims->h equals 1
793 * -## output_dims->h equals 1
794 * -## filter_dims->h equals 1
795 *@todo Remove constraint on output_dims->w to make the function generic.
796 *
797 */
798arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
799 const cmsis_nn_conv_params *conv_params,
800 const cmsis_nn_per_channel_quant_params *quant_params,
801 const cmsis_nn_dims *input_dims,
802 const q7_t *input_data,
803 const cmsis_nn_dims *filter_dims,
804 const q7_t *filter_data,
805 const cmsis_nn_dims *bias_dims,
806 const int32_t *bias_data,
807 const cmsis_nn_dims *output_dims,
808 q7_t *output_data);
809
810/**
811 * @brief Get the required additional buffer size for 1xn convolution
812 *
813 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
814 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
815 * horizontal spatial filter dimension
816 * @return The function returns required buffer size(bytes)
817 *
818 */
819int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
820
821/**
822 * @brief Q7 version of convolution for RGB image
823 * @param[in] Im_in pointer to input tensor
824 * @param[in] dim_im_in input tensor dimension
825 * @param[in] ch_im_in number of input tensor channels
826 * @param[in] wt pointer to kernel weights
827 * @param[in] ch_im_out number of filters, i.e., output tensor channels
828 * @param[in] dim_kernel filter kernel size
829 * @param[in] padding padding sizes
830 * @param[in] stride convolution stride
831 * @param[in] bias pointer to bias
832 * @param[in] bias_shift amount of left-shift for bias
833 * @param[in] out_shift amount of right-shift for output
834 * @param[in,out] Im_out pointer to output tensor
835 * @param[in] dim_im_out output tensor dimension
836 * @param[in,out] bufferA pointer to buffer space for input
837 * @param[in,out] bufferB pointer to buffer space for output
838 * @return The function returns either
839 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
840 *
841 * This kernel is written exclusively for convolution with ch_im_in
842 * equals 3. This applies on the first layer of CNNs which has input
843 * image with RGB format.
844 */
845
846arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
847 const uint16_t dim_im_in,
848 const uint16_t ch_im_in,
849 const q7_t *wt,
850 const uint16_t ch_im_out,
851 const uint16_t dim_kernel,
852 const uint16_t padding,
853 const uint16_t stride,
854 const q7_t *bias,
855 const uint16_t bias_shift,
856 const uint16_t out_shift,
857 q7_t *Im_out,
858 const uint16_t dim_im_out,
859 q15_t *bufferA,
860 q7_t *bufferB);
861
862/**
863 * @brief Fast Q15 convolution function
864 * @param[in] Im_in pointer to input tensor
865 * @param[in] dim_im_in input tensor dimension
866 * @param[in] ch_im_in number of input tensor channels
867 * @param[in] wt pointer to kernel weights
868 * @param[in] ch_im_out number of filters, i.e., output tensor channels
869 * @param[in] dim_kernel filter kernel size
870 * @param[in] padding padding sizes
871 * @param[in] stride convolution stride
872 * @param[in] bias pointer to bias
873 * @param[in] bias_shift amount of left-shift for bias
874 * @param[in] out_shift amount of right-shift for output
875 * @param[in,out] Im_out pointer to output tensor
876 * @param[in] dim_im_out output tensor dimension
877 * @param[in,out] bufferA pointer to buffer space for input
878 * @param[in,out] bufferB pointer to buffer space for output
879 * @return The function returns either
880 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
881 *
882 * This function is the version with full list of optimization tricks, but with
883 * some contraints:
884 * ch_im_in is multiple of 2
885 * ch_im_out is multiple of 2
886 * dim_im_out is a multiple of 2
887 */
888
889arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
890 const uint16_t dim_im_in,
891 const uint16_t ch_im_in,
892 const q15_t *wt,
893 const uint16_t ch_im_out,
894 const uint16_t dim_kernel,
895 const uint16_t padding,
896 const uint16_t stride,
897 const q15_t *bias,
898 const uint16_t bias_shift,
899 const uint16_t out_shift,
900 q15_t *Im_out,
901 const uint16_t dim_im_out,
902 q15_t *bufferA,
903 q7_t *bufferB);
904
905/**
906 * @brief Fast Q15 convolution function (non-sqaure shape)
907 * @param[in] Im_in pointer to input tensor
908 * @param[in] dim_im_in_x input tensor dimension x
909 * @param[in] dim_im_in_y input tensor dimension y
910 * @param[in] ch_im_in number of input tensor channels
911 * @param[in] wt pointer to kernel weights
912 * @param[in] ch_im_out number of filters, i.e., output tensor channels
913 * @param[in] dim_kernel_x filter kernel size x
914 * @param[in] dim_kernel_y filter kernel size y
915 * @param[in] padding_x padding size x
916 * @param[in] padding_y padding size y
917 * @param[in] stride_x convolution stride x
918 * @param[in] stride_y convolution stride y
919 * @param[in] bias pointer to bias
920 * @param[in] bias_shift amount of left-shift for bias
921 * @param[in] out_shift amount of right-shift for output
922 * @param[in,out] Im_out pointer to output tensor
923 * @param[in] dim_im_out_x output tensor dimension x
924 * @param[in] dim_im_out_y output tensor dimension y
925 * @param[in,out] bufferA pointer to buffer space for input
926 * @param[in,out] bufferB pointer to buffer space for output
927 * @return The function returns either
928 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
929 *
930 * @details
931 *
932 * <b>Buffer size:</b>
933 *
934 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
935 *
936 * bufferB size: 0
937 *
938 * <b>Input dimension constraints:</b>
939 *
940 * ch_im_in is multiple of 2
941 *
942 * ch_im_out is multipe of 2
943 *
944 */
945
946arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
947 const uint16_t dim_im_in_x,
948 const uint16_t dim_im_in_y,
949 const uint16_t ch_im_in,
950 const q15_t *wt,
951 const uint16_t ch_im_out,
952 const uint16_t dim_kernel_x,
953 const uint16_t dim_kernel_y,
954 const uint16_t padding_x,
955 const uint16_t padding_y,
956 const uint16_t stride_x,
957 const uint16_t stride_y,
958 const q15_t *bias,
959 const uint16_t bias_shift,
960 const uint16_t out_shift,
961 q15_t *Im_out,
962 const uint16_t dim_im_out_x,
963 const uint16_t dim_im_out_y,
964 q15_t *bufferA,
965 q7_t *bufferB);
966
967/**
968 * @brief Q7 depthwise separable convolution function
969 * @param[in] Im_in pointer to input tensor
970 * @param[in] dim_im_in input tensor dimension
971 * @param[in] ch_im_in number of input tensor channels
972 * @param[in] wt pointer to kernel weights
973 * @param[in] ch_im_out number of filters, i.e., output tensor channels
974 * @param[in] dim_kernel filter kernel size
975 * @param[in] padding padding sizes
976 * @param[in] stride convolution stride
977 * @param[in] bias pointer to bias
978 * @param[in] bias_shift amount of left-shift for bias
979 * @param[in] out_shift amount of right-shift for output
980 * @param[in,out] Im_out pointer to output tensor
981 * @param[in] dim_im_out output tensor dimension
982 * @param[in,out] bufferA pointer to buffer space for input
983 * @param[in,out] bufferB pointer to buffer space for output
984 * @return The function returns either
985 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
986 *
987 * This function is the version with full list of optimization tricks, but with
988 * some contraints:
989 * ch_im_in is multiple of 2
990 * ch_im_out is multiple of 2
991 */
992
993arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
994 const uint16_t dim_im_in,
995 const uint16_t ch_im_in,
996 const q7_t *wt,
997 const uint16_t ch_im_out,
998 const uint16_t dim_kernel,
999 const uint16_t padding,
1000 const uint16_t stride,
1001 const q7_t *bias,
1002 const uint16_t bias_shift,
1003 const uint16_t out_shift,
1004 q7_t *Im_out,
1005 const uint16_t dim_im_out,
1006 q15_t *bufferA,
1007 q7_t *bufferB);
1008
1009/**
1010 * @brief Q7 depthwise separable convolution function (non-square shape)
1011 * @param[in] Im_in pointer to input tensor
1012 * @param[in] dim_im_in_x input tensor dimension x
1013 * @param[in] dim_im_in_y input tensor dimension y
1014 * @param[in] ch_im_in number of input tensor channels
1015 * @param[in] wt pointer to kernel weights
1016 * @param[in] ch_im_out number of filters, i.e., output tensor channels
1017 * @param[in] dim_kernel_x filter kernel size x
1018 * @param[in] dim_kernel_y filter kernel size y
1019 * @param[in] padding_x padding sizes x
1020 * @param[in] padding_y padding sizes y
1021 * @param[in] stride_x convolution stride x
1022 * @param[in] stride_y convolution stride y
1023 * @param[in] bias pointer to bias
1024 * @param[in] bias_shift amount of left-shift for bias
1025 * @param[in] out_shift amount of right-shift for output
1026 * @param[in,out] Im_out pointer to output tensor
1027 * @param[in] dim_im_out_x output tensor dimension x
1028 * @param[in] dim_im_out_y output tensor dimension y
1029 * @param[in,out] bufferA pointer to buffer space for input
1030 * @param[in,out] bufferB pointer to buffer space for output
1031 * @return The function returns either
1032 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
1033 *
1034 * This function is the version with full list of optimization tricks, but with
1035 * some contraints:
1036 * ch_im_in is multiple of 2
1037 * ch_im_out is multiple of 2
1038 */
1039arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
1040 const uint16_t dim_im_in_x,
1041 const uint16_t dim_im_in_y,
1042 const uint16_t ch_im_in,
1043 const q7_t *wt,
1044 const uint16_t ch_im_out,
1045 const uint16_t dim_kernel_x,
1046 const uint16_t dim_kernel_y,
1047 const uint16_t padding_x,
1048 const uint16_t padding_y,
1049 const uint16_t stride_x,
1050 const uint16_t stride_y,
1051 const q7_t *bias,
1052 const uint16_t bias_shift,
1053 const uint16_t out_shift,
1054 q7_t *Im_out,
1055 const uint16_t dim_im_out_x,
1056 const uint16_t dim_im_out_y,
1057 q15_t *bufferA,
1058 q7_t *bufferB);
1059
1060/**
1061 * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
1062 *
1063 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1064 * definition file to see if an additional buffer is required.
1065 * Optional function {API}_get_buffer_size() provides the buffer
1066 * size if required.
1067 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1068 * dw_conv_params->dilation is not used.
1069 * Range of dw_conv_params->input_offset : [-127, 128]
1070 * Range of dw_conv_params->output_offset : [-128, 127]
1071 * @param[in] quant_params Per-channel quantization info.
1072 * It contains the multiplier and shift values to be applied to each
1073 * output channel
1074 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1075 * Batch argument N is not used and assumed to be 1.
1076 * @param[in] input_data Input (activation) data pointer. Data type: int8
1077 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1078 * @param[in] filter_data Filter data pointer. Data type: int8
1079 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1080 * @param[in] bias_data Bias data pointer. Data type: int32
1081 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
1082 * @param[in, out] output_data Output data pointer. Data type: int8
1083 * @return The function returns
1084 * <code>ARM_MATH_SUCCESS</code> - Successful completion.
1085 *
1086 * @details
1087 * - Supported framework: TensorFlow Lite
1088 * - Picks one of the the following functions
1089 * -# arm_depthwise_conv_s8()
1090 * -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
1091 * -# arm_depthwise_conv_s8_opt()
1092 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1093 * - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
1094 * boundary.
1095 */
1096arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
1097 const cmsis_nn_dw_conv_params *dw_conv_params,
1098 const cmsis_nn_per_channel_quant_params *quant_params,
1099 const cmsis_nn_dims *input_dims,
1100 const q7_t *input_data,
1101 const cmsis_nn_dims *filter_dims,
1102 const q7_t *filter_data,
1103 const cmsis_nn_dims *bias_dims,
1104 const int32_t *bias_data,
1105 const cmsis_nn_dims *output_dims,
1106 q7_t *output_data);
1107
1108/**
1109 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
1110 *
1111 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1112 * dw_conv_params->dilation is not used.
1113 * Range of dw_conv_params->input_offset : [-127, 128]
1114 * Range of dw_conv_params->input_offset : [-128, 127]
1115 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1116 * Batch argument N is not used and assumed to be 1.
1117 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1118 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
1119 * @return Size of additional memory required for optimizations in bytes.
1120 *
1121 */
1122int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1123 const cmsis_nn_dims *input_dims,
1124 const cmsis_nn_dims *filter_dims,
1125 const cmsis_nn_dims *output_dims);
1126
1127/**
1128 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
1129 *
1130 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1131 * definition file to see if an additional buffer is required.
1132 * Optional function {API}_get_buffer_size() provides the buffer
1133 * size if an additional buffer is required.
1134 * exists if additional memory is.
1135 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1136 * dw_conv_params->dilation is not used.
1137 * Range of dw_conv_params->input_offset : [-127, 128]
1138 * Range of dw_conv_params->input_offset : [-128, 127]
1139 * @param[in] quant_params Per-channel quantization info.
1140 * It contains the multiplier and shift values to be applied to each
1141 * output channel
1142 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1143 * Batch argument N is not used.
1144 * @param[in] input_data Input (activation) data pointer. Data type: int8
1145 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1146 * @param[in] filter_data Filter data pointer. Data type: int8
1147 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1148 * @param[in] bias_data Bias data pointer. Data type: int32
1149 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
1150 * @param[in, out] output_data Output data pointer. Data type: int8
1151 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1152 *
1153 * @details
1154 * - Supported framework: TensorFlow Lite
1155 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1156 */
1157arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
1158 const cmsis_nn_dw_conv_params *dw_conv_params,
1159 const cmsis_nn_per_channel_quant_params *quant_params,
1160 const cmsis_nn_dims *input_dims,
1161 const q7_t *input_data,
1162 const cmsis_nn_dims *filter_dims,
1163 const q7_t *filter_data,
1164 const cmsis_nn_dims *bias_dims,
1165 const int32_t *bias_data,
1166 const cmsis_nn_dims *output_dims,
1167 q7_t *output_data);
1168
1169/**
1170 * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
1171 *
1172 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1173 * definition file to see if an additional buffer is required.
1174 * Optional function {API}_get_buffer_size() provides the buffer
1175 * size if an additional buffer is required.
1176 * exists if additional memory is.
1177 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1178 * conv_params->input_offset : Not used
1179 * conv_params->output_offset : Not used
1180 * @param[in] quant_params Per-channel quantization info.
1181 * It contains the multiplier and shift values to be applied to each
1182 * output channel
1183 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1184 * Batch argument N is not used.
1185 * @param[in] input_data Input (activation) data pointer. Data type: int8
1186 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1187 * @param[in] filter_data Filter data pointer. Data type: int8
1188 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1189 * @param[in] bias_data Bias data pointer. Data type: int64
1190 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
1191 * @param[in, out] output_data Output data pointer. Data type: int16
1192 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1193 *
1194 * @details
1195 * - Supported framework: TensorFlow Lite
1196 * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs.
1197 */
1198arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
1199 const cmsis_nn_dw_conv_params *dw_conv_params,
1200 const cmsis_nn_per_channel_quant_params *quant_params,
1201 const cmsis_nn_dims *input_dims,
1202 const q15_t *input_data,
1203 const cmsis_nn_dims *filter_dims,
1204 const q7_t *filter_data,
1205 const cmsis_nn_dims *bias_dims,
1206 const int64_t *bias_data,
1207 const cmsis_nn_dims *output_dims,
1208 q15_t *output_data);
1209
1210/**
1211 * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
1212 * the input arguments(documented below). Refer arm_depthwise_conv_s8() for function
1213 * argument details.
1214 *
1215 * @return The function returns one of the following
1216 * <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors
1217 * <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis
1218 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1219 *
1220 * @details
1221 * - Supported framework : TensorFlow Lite Micro
1222 * - The following constrains on the arguments apply
1223 * -# Number of input channel equals number of output channels
1224 * -# Filter height and width equals 3
1225 * -# Padding along x is either 0 or 1.
1226 *
1227 */
1228arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
1229 const cmsis_nn_dw_conv_params *dw_conv_params,
1230 const cmsis_nn_per_channel_quant_params *quant_params,
1231 const cmsis_nn_dims *input_dims,
1232 const q7_t *input_data,
1233 const cmsis_nn_dims *filter_dims,
1234 const q7_t *filter_data,
1235 const cmsis_nn_dims *bias_dims,
1236 const int32_t *bias_data,
1237 const cmsis_nn_dims *output_dims,
1238 q7_t *output_data);
1239
1240/**
1241 * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
1242 * Refer arm_depthwise_conv_s8() for function argument details.
1243 *
1244 * @return The function returns one of the following
1245 * <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or
1246 * ch_mult != 1
1247 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1248 *
1249 * @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
1250 * for the following if MVE optimizations(Arm Helium Technology) are used.
1251 * - Output shift
1252 * - Output multiplier
1253 * - Output bias
1254 * - kernel
1255 * @details
1256 * - Supported framework: TensorFlow Lite
1257 * - The following constrains on the arguments apply
1258 * -# Number of input channel equals number of output channels or ch_mult equals 1
1259 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1260 * - Reccomended when number of channels is 4 or greater.
1261 *
1262 */
1263arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
1264 const cmsis_nn_dw_conv_params *dw_conv_params,
1265 const cmsis_nn_per_channel_quant_params *quant_params,
1266 const cmsis_nn_dims *input_dims,
1267 const q7_t *input_data,
1268 const cmsis_nn_dims *filter_dims,
1269 const q7_t *filter_data,
1270 const cmsis_nn_dims *bias_dims,
1271 const int32_t *bias_data,
1272 const cmsis_nn_dims *output_dims,
1273 q7_t *output_data);
1274
1275/**
1276 * @brief Get the required buffer size for optimized s8 depthwise convolution
1277 * function with constraint that in_channel equals out_channel.
1278 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1279 * Batch argument N is not used.
1280 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1281 * @return The function returns required buffer size in bytes
1282 *
1283 */
1284int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1285
1286/**
1287 * @defgroup FC Fully-connected Layer Functions
1288 *
1289 * Collection of fully-connected and matrix multiplication functions.
1290 *
1291 * Fully-connected layer is basically a matrix-vector multiplication
1292 * with bias. The matrix is the weights and the input/output vectors
1293 * are the activation values. Supported {weight, activation} precisions
1294 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
1295 *
 * Here we have two types of kernel functions. The basic function
 * implements the function using a regular GEMV approach. The opt functions
 * operate with weights in interleaved formats.
1299 *
1300 */
1301
1302/**
1303 *@brief Q7 basic fully-connected layer function
1304 *@param[in] pV pointer to input vector
1305 *@param[in] pM pointer to matrix weights
1306 *@param[in] dim_vec length of the vector
1307 *@param[in] num_of_rows number of rows in weight matrix
1308 *@param[in] bias_shift amount of left-shift for bias
1309 *@param[in] out_shift amount of right-shift for output
1310 *@param[in] bias pointer to bias
1311 *@param[in,out] pOut pointer to output vector
1312 *@param[in,out] vec_buffer pointer to buffer space for input
1313 *@return The function returns <code>ARM_MATH_SUCCESS</code>
1314 *
1315 */
1316
1317arm_status arm_fully_connected_q7(const q7_t *pV,
1318 const q7_t *pM,
1319 const uint16_t dim_vec,
1320 const uint16_t num_of_rows,
1321 const uint16_t bias_shift,
1322 const uint16_t out_shift,
1323 const q7_t *bias,
1324 q7_t *pOut,
1325 q15_t *vec_buffer);
1326
1327/**
1328 * @brief Basic s8 Fully Connected function.
1329 *
1330 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1331 * definition file to see if an additional buffer is required.
1332 * Optional function {API}_get_buffer_size() provides the buffer
1333 * size if an additional buffer is required.
1334 * @param[in] fc_params Fully Connected layer parameters.
1335 * Range of fc_params->input_offset : [-127, 128]
1336 * fc_params->filter_offset : 0
1337 * Range of fc_params->output_offset : [-128, 127]
1338 * @param[in] quant_params Per-tensor quantization info.
1339 * It contains the multiplier and shift values to be applied to the output tensor.
1340 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1341 * Input dimension is taken as Nx(H * W * C_IN)
1342 * @param[in] input_data Input (activation) data pointer. Data type: int8
1343 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
1344 * N : accumulation depth and equals (H * W * C_IN) from input_dims
1345 * C : output depth and equals C_OUT in output_dims
1346 * H & W : Not used
1347 * @param[in] filter_data Filter data pointer. Data type: int8
1348 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1349 * N, H, W : Not used
1350 * @param[in] bias_data Bias data pointer. Data type: int32
1351 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
1352 * N : Batches
1353 * C_OUT : Output depth
1354 * H & W : Not used.
1355 * @param[in, out] output_data Output data pointer. Data type: int8
1356 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1357 *
1358 * @details
1359 * - Supported framework: TensorFlow Lite
1360 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1361 */
1362arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
1363 const cmsis_nn_fc_params *fc_params,
1364 const cmsis_nn_per_tensor_quant_params *quant_params,
1365 const cmsis_nn_dims *input_dims,
1366 const q7_t *input_data,
1367 const cmsis_nn_dims *filter_dims,
1368 const q7_t *filter_data,
1369 const cmsis_nn_dims *bias_dims,
1370 const int32_t *bias_data,
1371 const cmsis_nn_dims *output_dims,
1372 q7_t *output_data);
1373
1374/**
1375 * @brief Get the required buffer size for S8 basic fully-connected and
1376 * matrix multiplication layer function for TF Lite
1377 * @param[in] filter_dims dimension of filter
1378 * @return The function returns required buffer size in bytes
1379 *
1380 */
1381int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
1382
1383/**
1384 * @brief Basic s16 Fully Connected function.
1385 *
1386 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1387 * definition file to see if an additional buffer is required.
1388 * Optional function {API}_get_buffer_size() provides the buffer
1389 * size if an additional buffer is required.
1390 * @param[in] fc_params Fully Connected layer parameters.
1391 * fc_params->input_offset : 0
1392 * fc_params->filter_offset : 0
1393 * fc_params->output_offset : 0
1394 * @param[in] quant_params Per-tensor quantization info.
1395 * It contains the multiplier and shift values to be applied to the output tensor.
1396 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1397 * Input dimension is taken as Nx(H * W * C_IN)
1398 * @param[in] input_data Input (activation) data pointer. Data type: int16
1399 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
1400 * N : accumulation depth and equals (H * W * C_IN) from input_dims
1401 * C : output depth and equals C_OUT in output_dims
1402 * H & W : Not used
1403 * @param[in] filter_data Filter data pointer. Data type: int8
1404 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1405 * N, H, W : Not used
1406 * @param[in] bias_data Bias data pointer. Data type: int64
1407 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
1408 * N : Batches
1409 * C_OUT : Output depth
1410 * H & W : Not used.
1411 * @param[in, out] output_data Output data pointer. Data type: int16
1412 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1413 *
1414 * @details
1415 * - Supported framework: TensorFlow Lite
1416 * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs.
1417 */
1418arm_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
1419 const cmsis_nn_fc_params *fc_params,
1420 const cmsis_nn_per_tensor_quant_params *quant_params,
1421 const cmsis_nn_dims *input_dims,
1422 const q15_t *input_data,
1423 const cmsis_nn_dims *filter_dims,
1424 const q7_t *filter_data,
1425 const cmsis_nn_dims *bias_dims,
1426 const int64_t *bias_data,
1427 const cmsis_nn_dims *output_dims,
1428 q15_t *output_data);
1429
1430/**
1431 * @brief Get the required buffer size for S16 basic fully-connected and
1432 * matrix multiplication layer function for TF Lite
1433 * @param[in] filter_dims dimension of filter
1434 * @return The function returns required buffer size in bytes
1435 *
1436 */
1437int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims);
1438
1439/**
1440 * @brief Q7 opt fully-connected layer function
1441 * @param[in] pV pointer to input vector
1442 * @param[in] pM pointer to matrix weights
1443 * @param[in] dim_vec length of the vector
1444 * @param[in] num_of_rows number of rows in weight matrix
1445 * @param[in] bias_shift amount of left-shift for bias
1446 * @param[in] out_shift amount of right-shift for output
1447 * @param[in] bias pointer to bias
1448 * @param[in,out] pOut pointer to output vector
1449 * @param[in,out] vec_buffer pointer to buffer space for input
1450 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1451 *
1452 */
1453
1454arm_status arm_fully_connected_q7_opt(const q7_t *pV,
1455 const q7_t *pM,
1456 const uint16_t dim_vec,
1457 const uint16_t num_of_rows,
1458 const uint16_t bias_shift,
1459 const uint16_t out_shift,
1460 const q7_t *bias,
1461 q7_t *pOut,
1462 q15_t *vec_buffer);
1463
1464/**
1465 * @brief Q15 basic fully-connected layer function
1466 * @param[in] pV pointer to input vector
1467 * @param[in] pM pointer to matrix weights
1468 * @param[in] dim_vec length of the vector
1469 * @param[in] num_of_rows number of rows in weight matrix
1470 * @param[in] bias_shift amount of left-shift for bias
1471 * @param[in] out_shift amount of right-shift for output
1472 * @param[in] bias pointer to bias
1473 * @param[in,out] pOut pointer to output vector
1474 * @param[in,out] vec_buffer pointer to buffer space for input
1475 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1476 *
1477 */
1478
1479arm_status arm_fully_connected_q15(const q15_t *pV,
1480 const q15_t *pM,
1481 const uint16_t dim_vec,
1482 const uint16_t num_of_rows,
1483 const uint16_t bias_shift,
1484 const uint16_t out_shift,
1485 const q15_t *bias,
1486 q15_t *pOut,
1487 q15_t *vec_buffer);
1488
1489/**
1490 * @brief Q15 opt fully-connected layer function
1491 * @param[in] pV pointer to input vector
1492 * @param[in] pM pointer to matrix weights
1493 * @param[in] dim_vec length of the vector
1494 * @param[in] num_of_rows number of rows in weight matrix
1495 * @param[in] bias_shift amount of left-shift for bias
1496 * @param[in] out_shift amount of right-shift for output
1497 * @param[in] bias pointer to bias
1498 * @param[in,out] pOut pointer to output vector
1499 * @param[in,out] vec_buffer pointer to buffer space for input
1500 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1501 *
1502 */
1503
1504arm_status arm_fully_connected_q15_opt(const q15_t *pV,
1505 const q15_t *pM,
1506 const uint16_t dim_vec,
1507 const uint16_t num_of_rows,
1508 const uint16_t bias_shift,
1509 const uint16_t out_shift,
1510 const q15_t *bias,
1511 q15_t *pOut,
1512 q15_t *vec_buffer);
1513
1514/**
1515 * @brief Mixed Q15-Q7 fully-connected layer function
1516 * @param[in] pV pointer to input vector
1517 * @param[in] pM pointer to matrix weights
1518 * @param[in] dim_vec length of the vector
1519 * @param[in] num_of_rows number of rows in weight matrix
1520 * @param[in] bias_shift amount of left-shift for bias
1521 * @param[in] out_shift amount of right-shift for output
1522 * @param[in] bias pointer to bias
1523 * @param[in,out] pOut pointer to output vector
1524 * @param[in,out] vec_buffer pointer to buffer space for input
1525 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1526 *
1527 */
1528
1529arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
1530 const q7_t *pM,
1531 const uint16_t dim_vec,
1532 const uint16_t num_of_rows,
1533 const uint16_t bias_shift,
1534 const uint16_t out_shift,
1535 const q7_t *bias,
1536 q15_t *pOut,
1537 q15_t *vec_buffer);
1538
1539/**
1540 * @brief Mixed Q15-Q7 opt fully-connected layer function
1541 * @param[in] pV pointer to input vector
1542 * @param[in] pM pointer to matrix weights
1543 * @param[in] dim_vec length of the vector
1544 * @param[in] num_of_rows number of rows in weight matrix
1545 * @param[in] bias_shift amount of left-shift for bias
1546 * @param[in] out_shift amount of right-shift for output
1547 * @param[in] bias pointer to bias
1548 * @param[in,out] pOut pointer to output vector
1549 * @param[in,out] vec_buffer pointer to buffer space for input
1550 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1551 *
1552 */
1553
1554arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
1555 const q7_t *pM,
1556 const uint16_t dim_vec,
1557 const uint16_t num_of_rows,
1558 const uint16_t bias_shift,
1559 const uint16_t out_shift,
1560 const q7_t *bias,
1561 q15_t *pOut,
1562 q15_t *vec_buffer);
1563
1564/**
1565 * @brief Matrix-Multiplication Kernels for Convolution
1566 *
1567 * These functions are used within convolution layer functions for
1568 * matrix multiplication.
1569 *
1570 * The implementation is similar to CMSIS-DSP arm_mat_mult functions
1571 * with one Q7 and one Q15 operands. The Q15 operand is the im2col
1572 * output which is always with 2 columns.
1573 *
1574 */
1575
1576/**
1577 * @brief Matrix-multiplication function for convolution
1578 * @param[in] pA pointer to operand A
1579 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
1580 * @param[in] ch_im_out numRow of A
1581 * @param[in] numCol_A numCol of A
1582 * @param[in] bias_shift amount of left-shift for bias
1583 * @param[in] out_shift amount of right-shift for output
1584 * @param[in] bias the bias
1585 * @param[in,out] pOut pointer to output
1586 * @return The function returns the incremented output pointer
1587 */
1588
1589q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
1590 const q15_t *pInBuffer,
1591 const uint16_t ch_im_out,
1592 const uint16_t numCol_A,
1593 const uint16_t bias_shift,
1594 const uint16_t out_shift,
1595 const q7_t *bias,
1596 q7_t *pOut);
1597
1598#ifdef __cplusplus
1599}
1600#endif
1601
1602/*
1603 * Other functions
1604 * These layers are typically not timing critical
1605 * Basic implementation is supported here
1606 */
1607
1608#ifdef __cplusplus
1609extern "C" {
1610#endif
1611
1612/**
1613 * @defgroup BasicMath Basic math functions
1614 *
1615 * Elementwise add and multiplication functions.
1616 *
1617 */
1618
1619/**
1620 * @brief s8 elementwise add of two vectors
1621 * @param[in] input_1_vect pointer to input vector 1
1622 * @param[in] input_2_vect pointer to input vector 2
1623 * @param[in] input_1_offset offset for input 1. Range: -127 to 128
1624 * @param[in] input_1_mult multiplier for input 1
1625 * @param[in] input_1_shift shift for input 1
1626 * @param[in] input_2_offset offset for input 2. Range: -127 to 128
1627 * @param[in] input_2_mult multiplier for input 2
1628 * @param[in] input_2_shift shift for input 2
1629 * @param[in] left_shift input left shift
1630 * @param[in,out] output pointer to output vector
1631 * @param[in] out_offset output offset. Range: -128 to 127
1632 * @param[in] out_mult output multiplier
1633 * @param[in] out_shift output shift
1634 * @param[in] out_activation_min minimum value to clamp output to. Min: -128
1635 * @param[in] out_activation_max maximum value to clamp output to. Max: 127
1636 * @param[in] block_size number of samples
1637 * @return The function returns ARM_MATH_SUCCESS
1638 */
1639arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
1640 const int8_t *input_2_vect,
1641 const int32_t input_1_offset,
1642 const int32_t input_1_mult,
1643 const int32_t input_1_shift,
1644 const int32_t input_2_offset,
1645 const int32_t input_2_mult,
1646 const int32_t input_2_shift,
1647 const int32_t left_shift,
1648 int8_t *output,
1649 const int32_t out_offset,
1650 const int32_t out_mult,
1651 const int32_t out_shift,
1652 const int32_t out_activation_min,
1653 const int32_t out_activation_max,
1654 const int32_t block_size);
1655
1656/**
1657 * @brief s16 elementwise add of two vectors
1658 * @param[in] input_1_vect pointer to input vector 1
1659 * @param[in] input_2_vect pointer to input vector 2
1660 * @param[in] input_1_offset offset for input 1. Not used.
1661 * @param[in] input_1_mult multiplier for input 1
1662 * @param[in] input_1_shift shift for input 1
1663 * @param[in] input_2_offset offset for input 2. Not used.
1664 * @param[in] input_2_mult multiplier for input 2
1665 * @param[in] input_2_shift shift for input 2
1666 * @param[in] left_shift input left shift
1667 * @param[in,out] output pointer to output vector
1668 * @param[in] out_offset output offset. Not used.
1669 * @param[in] out_mult output multiplier
1670 * @param[in] out_shift output shift
1671 * @param[in] out_activation_min minimum value to clamp output to. Min: -32768
1672 * @param[in] out_activation_max maximum value to clamp output to. Max: 32767
1673 * @param[in] block_size number of samples
1674 * @return The function returns ARM_MATH_SUCCESS
1675 */
1676arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
1677 const int16_t *input_2_vect,
1678 const int32_t input_1_offset,
1679 const int32_t input_1_mult,
1680 const int32_t input_1_shift,
1681 const int32_t input_2_offset,
1682 const int32_t input_2_mult,
1683 const int32_t input_2_shift,
1684 const int32_t left_shift,
1685 int16_t *output,
1686 const int32_t out_offset,
1687 const int32_t out_mult,
1688 const int32_t out_shift,
1689 const int32_t out_activation_min,
1690 const int32_t out_activation_max,
1691 const int32_t block_size);
1692
1693/**
1694 * @brief s8 elementwise multiplication
1695 * @param[in] input_1_vect pointer to input vector 1
1696 * @param[in] input_2_vect pointer to input vector 2
1697 * @param[in] input_1_offset offset for input 1. Range: -127 to 128
1698 * @param[in] input_2_offset offset for input 2. Range: -127 to 128
1699 * @param[in,out] output pointer to output vector
1700 * @param[in] out_offset output offset. Range: -128 to 127
1701 * @param[in] out_mult output multiplier
1702 * @param[in] out_shift output shift
1703 * @param[in] out_activation_min minimum value to clamp output to. Min: -128
1704 * @param[in] out_activation_max maximum value to clamp output to. Max: 127
1705 * @param[in] block_size number of samples
1706 * @return The function returns ARM_MATH_SUCCESS
1707 *
1708 * @details Supported framework: TensorFlow Lite micro
1709 */
1710arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
1711 const int8_t *input_2_vect,
1712 const int32_t input_1_offset,
1713 const int32_t input_2_offset,
1714 int8_t *output,
1715 const int32_t out_offset,
1716 const int32_t out_mult,
1717 const int32_t out_shift,
1718 const int32_t out_activation_min,
1719 const int32_t out_activation_max,
1720 const int32_t block_size);
1721
1722/**
1723 * @brief s16 elementwise multiplication
1724 * @param[in] input_1_vect pointer to input vector 1
1725 * @param[in] input_2_vect pointer to input vector 2
1726 * @param[in] input_1_offset offset for input 1. Not used.
1727 * @param[in] input_2_offset offset for input 2. Not used.
1728 * @param[in,out] output pointer to output vector
1729 * @param[in] out_offset output offset. Not used.
1730 * @param[in] out_mult output multiplier
1731 * @param[in] out_shift output shift
1732 * @param[in] out_activation_min minimum value to clamp output to. Min: -32768
1733 * @param[in] out_activation_max maximum value to clamp output to. Max: 32767
1734 * @param[in] block_size number of samples
1735 * @return The function returns ARM_MATH_SUCCESS
1736 *
1737 * @details Supported framework: TensorFlow Lite micro
1738 */
1739arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
1740 const int16_t *input_2_vect,
1741 const int32_t input_1_offset,
1742 const int32_t input_2_offset,
1743 int16_t *output,
1744 const int32_t out_offset,
1745 const int32_t out_mult,
1746 const int32_t out_shift,
1747 const int32_t out_activation_min,
1748 const int32_t out_activation_max,
1749 const int32_t block_size);
1750
1751/**
1752 * @defgroup Acti Activation Functions
1753 *
1754 * Perform activation layers, including ReLU (Rectified Linear Unit),
1755 * sigmoid and tanh
1756 *
1757 */
1758
1759/**
1760 * @brief Q7 RELU function
1761 * @param[in,out] data pointer to input
1762 * @param[in] size number of elements
1763 * @return none.
1764 */
1765
1766void arm_relu_q7(q7_t *data, uint16_t size);
1767
1768/**
1769 * @brief s8 ReLU6 function
1770 * @param[in,out] data pointer to input
1771 * @param[in] size number of elements
1772 */
1773
1774void arm_relu6_s8(q7_t *data, uint16_t size);
1775
1776/**
1777 * @brief Q15 RELU function
1778 * @param[in,out] data pointer to input
1779 * @param[in] size number of elements
1780 * @return none.
1781 */
1782
1783void arm_relu_q15(q15_t *data, uint16_t size);
1784
1785/**
1786 * @brief Q7 neural network activation function using direct table look-up
1787 * @param[in,out] data pointer to input
1788 * @param[in] size number of elements
1789 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3
1790 * @param[in] type type of activation functions
1791 * @return none.
1792 */
1793
1794void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
1795
1796/**
1797 * @brief Q15 neural network activation function using direct table look-up
1798 * @param[in,out] data pointer to input
1799 * @param[in] size number of elements
1800 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3
1801 * @param[in] type type of activation functions
1802 * @return none.
1803 *
1804 * @details
1805 *
1806 * This is the direct table look-up approach.
1807 *
1808 * Assume here the integer part of the fixed-point is <= 3.
1809 * More than 3 just not making much sense, makes no difference with
1810 * saturation followed by any of these activation functions.
1811 */
1812
1813void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
1814
1815/**
1816 * @defgroup Pooling Pooling Functions
1817 *
1818 * Perform pooling functions, including max pooling and average pooling
1819 *
1820 */
1821
1822/**
1823 * @brief Q7 max pooling function
1824 * @param[in] Im_in pointer to input tensor
1825 * @param[in] dim_im_in input tensor dimension
1826 * @param[in] ch_im_in number of input tensor channels
1827 * @param[in] dim_kernel filter kernel size
1828 * @param[in] padding padding sizes
1829 * @param[in] stride convolution stride
1830 * @param[in] dim_im_out output tensor dimension
1831 * @param[in,out] bufferA pointer to buffer space for input
1832 * @param[in,out] Im_out pointer to output tensor
1833 * @return none.
1834 *
1835 */
1836
1837void arm_maxpool_q7_HWC(q7_t *Im_in,
1838 const uint16_t dim_im_in,
1839 const uint16_t ch_im_in,
1840 const uint16_t dim_kernel,
1841 const uint16_t padding,
1842 const uint16_t stride,
1843 const uint16_t dim_im_out,
1844 q7_t *bufferA,
1845 q7_t *Im_out);
1846
1847/**
1848 * @brief Q7 average pooling function
1849 * @param[in] Im_in pointer to input tensor
1850 * @param[in] dim_im_in input tensor dimension
1851 * @param[in] ch_im_in number of input tensor channels
1852 * @param[in] dim_kernel filter kernel size
1853 * @param[in] padding padding sizes
1854 * @param[in] stride convolution stride
1855 * @param[in] dim_im_out output tensor dimension
1856 * @param[in,out] bufferA pointer to buffer space for input
1857 * @param[in,out] Im_out pointer to output tensor
1858 * @return none.
1859 *
1860 */
1861
1862void arm_avepool_q7_HWC(q7_t *Im_in,
1863 const uint16_t dim_im_in,
1864 const uint16_t ch_im_in,
1865 const uint16_t dim_kernel,
1866 const uint16_t padding,
1867 const uint16_t stride,
1868 const uint16_t dim_im_out,
1869 q7_t *bufferA,
1870 q7_t *Im_out);
1871
1872/**
1873 * @brief s8 average pooling function.
1874 *
1875 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1876 * definition file to see if an additional buffer is required.
1877 * Optional function {API}_get_buffer_size() provides the buffer
1878 * size if an additional buffer is required.
1879 * @param[in] pool_params Pooling parameters
1880 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1881 * Argument 'N' is not used.
1882 * @param[in] input_data Input (activation) data pointer. Data type: int8
1883 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
1884 * Argument N and C are not used.
1885 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
1886 * Argument N is not used.
1887 * C_OUT equals C_IN.
1888 * @param[in, out] output_data Output data pointer. Data type: int8
1889 * @return The function returns
1890 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1891 *
1892 * @details
1893 * - Supported Framework: TensorFlow Lite
1894 *
1895 */
1896arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
1897 const cmsis_nn_pool_params *pool_params,
1898 const cmsis_nn_dims *input_dims,
1899 const q7_t *input_data,
1900 const cmsis_nn_dims *filter_dims,
1901 const cmsis_nn_dims *output_dims,
1902 q7_t *output_data);
1903
1904/**
1905 * @brief Get the required buffer size for S8 average pooling function
1906 * @param[in] dim_dst_width output tensor dimension
1907 * @param[in] ch_src number of input tensor channels
1908 * @return The function returns required buffer size in bytes
1909 *
1910 */
1911int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
1912
1913/**
1914 * @brief s16 average pooling function.
1915 *
1916 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1917 * definition file to see if an additional buffer is required.
1918 * Optional function {API}_get_buffer_size() provides the buffer
1919 * size if an additional buffer is required.
1920 * @param[in] pool_params Pooling parameters
1921 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1922 * Argument 'N' is not used.
1923 * @param[in] input_data Input (activation) data pointer. Data type: int16
1924 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
1925 * Argument N and C are not used.
1926 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
1927 * Argument N is not used.
1928 * C_OUT equals C_IN.
1929 * @param[in, out] output_data Output data pointer. Data type: int16
1930 * @return The function returns
1931 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1932 *
1933 * @details
1934 * - Supported Framework: TensorFlow Lite
1935 *
1936 */
1937arm_status arm_avgpool_s16(const cmsis_nn_context *ctx,
1938 const cmsis_nn_pool_params *pool_params,
1939 const cmsis_nn_dims *input_dims,
1940 const int16_t *input_data,
1941 const cmsis_nn_dims *filter_dims,
1942 const cmsis_nn_dims *output_dims,
1943 int16_t *output_data);
1944
1945/**
1946 * @brief Get the required buffer size for S16 average pooling function
1947 * @param[in] dim_dst_width output tensor dimension
1948 * @param[in] ch_src number of input tensor channels
1949 * @return The function returns required buffer size in bytes
1950 *
1951 */
1952int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src);
1953
1954/**
1955 * @brief s8 max pooling function.
1956 *
1957 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1958 * definition file to see if an additional buffer is required.
1959 * Optional function {API}_get_buffer_size() provides the buffer
1960 * size if an additional buffer is required.
1961 * @param[in] pool_params Pooling parameters
1962 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1963 * Argument 'N' is not used.
1964 * @param[in] input_data Input (activation) data pointer. The input tensor must not
1965 * overlap with the output tensor. Data type: int8
1966 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
1967 * Argument N and C are not used.
1968 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
1969 * Argument N is not used.
1970 * C_OUT equals C_IN.
1971 * @param[in, out] output_data Output data pointer. Data type: int8
1972 * @return The function returns
1973 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1974 *
1975 * @details
1976 * - Supported Framework: TensorFlow Lite
1977 *
1978 */
1979arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
1980 const cmsis_nn_pool_params *pool_params,
1981 const cmsis_nn_dims *input_dims,
1982 const q7_t *input_data,
1983 const cmsis_nn_dims *filter_dims,
1984 const cmsis_nn_dims *output_dims,
1985 q7_t *output_data);
1986
1987/**
1988 * @brief s16 max pooling function.
1989 *
1990 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1991 * definition file to see if an additional buffer is required.
1992 * Optional function {API}_get_buffer_size() provides the buffer
1993 * size if an additional buffer is required.
1994 * @param[in] pool_params Pooling parameters
1995 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1996 * Argument 'N' is not used.
1997 * @param[in] src Input (activation) data pointer. The input tensor must not
1998 * overlap with the output tensor. Data type: int16
1999 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
2000 * Argument N and C are not used.
2001 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
2002 * Argument N is not used.
2003 * C_OUT equals C_IN.
2004 * @param[in, out] dst Output data pointer. Data type: int16
2005 * @return The function returns
2006 * <code>ARM_MATH_SUCCESS</code> - Successful operation
2007 *
2008 * @details
2009 * - Supported Framework: TensorFlow Lite
2010 *
2011 */
2012arm_status arm_max_pool_s16(const cmsis_nn_context *ctx,
2013 const cmsis_nn_pool_params *pool_params,
2014 const cmsis_nn_dims *input_dims,
2015 const int16_t *src,
2016 const cmsis_nn_dims *filter_dims,
2017 const cmsis_nn_dims *output_dims,
2018 int16_t *dst);
2019
2020/**
2021 * @defgroup Softmax Softmax Functions
2022 *
2023 * EXP(2) based softmax functions.
2024 *
2025 */
2026
2027/**
2028 * @brief Q7 softmax function
2029 * @param[in] vec_in pointer to input vector
2030 * @param[in] dim_vec input vector dimension
2031 * @param[out] p_out pointer to output vector
2032 *
2033 * @note This function is an optimized version which is not bit-accurate with
2034 * TensorFlow Lite's kernel
2035 *
2036 */
2037
2038void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out);
2039
2040/**
2041 * @brief Q7 softmax function with batch parameter
2042 * @param[in] vec_in pointer to input vector
2043 * @param[in] nb_batches number of batches
2044 * @param[in] dim_vec input vector dimension
2045 * @param[out] p_out pointer to output vector
2046 * @return none.
2047 *
2048 * @note This function is an optimized version which is not bit-accurate with
2049 * TensorFlow Lite's kernel
2050 *
2051 */
2052
2053void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out);
2054/**
2055 * @brief Q15 softmax function
2056 * @param[in] vec_in pointer to input vector
2057 * @param[in] dim_vec input vector dimension
2058 * @param[out] p_out pointer to output vector
2059 * @return none.
2060 *
2061 * @note This function is an optimized version which is not bit-accurate with
2062 * TensorFlow Lite's kernel
2063 *
2064 */
2065
2066void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out);
2067
2068/**
2069 * @brief S8 softmax function
2070 * @param[in] input Pointer to the input tensor
2071 * @param[in] num_rows Number of rows in the input tensor
2072 * @param[in] row_size Number of elements in each input row
2073 * @param[in] mult Input quantization multiplier
2074 * @param[in] shift Input quantization shift within the range [0, 31]
2075 * @param[in] diff_min Minimum difference with max in row. Used to check if
2076 * the quantized exponential operation can be performed
2077 * @param[out] output Pointer to the output tensor
2078 *
2079 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2080 *
2081 */
2082void arm_softmax_s8(const int8_t *input,
2083 const int32_t num_rows,
2084 const int32_t row_size,
2085 const int32_t mult,
2086 const int32_t shift,
2087 const int32_t diff_min,
2088 int8_t *output);
2089
2090/**
2091 * @brief S8 to s16 softmax function
2092 * @param[in] input Pointer to the input tensor
2093 * @param[in] num_rows Number of rows in the input tensor
2094 * @param[in] row_size Number of elements in each input row
2095 * @param[in] mult Input quantization multiplier
2096 * @param[in] shift Input quantization shift within the range [0, 31]
2097 * @param[in] diff_min Minimum difference with max in row. Used to check if
2098 * the quantized exponential operation can be performed
2099 * @param[out] output Pointer to the output tensor
2100 *
2101 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2102 *
2103 */
2104void arm_softmax_s8_s16(const int8_t *input,
2105 const int32_t num_rows,
2106 const int32_t row_size,
2107 const int32_t mult,
2108 const int32_t shift,
2109 const int32_t diff_min,
2110 int16_t *output);
2111
2112/**
2113 * @brief S16 softmax function
2114 * @param[in] input Pointer to the input tensor
2115 * @param[in] num_rows Number of rows in the input tensor
2116 * @param[in] row_size Number of elements in each input row
2117 * @param[in] mult Input quantization multiplier
2118 * @param[in] shift Input quantization shift within the range [0, 31]
2119 * @param[in] softmax_params Softmax s16 layer parameters with two pointers to LUTs specified below.
2120 * For indexing the high 9 bits are used and 7 remaining for interpolation.
2121 * That means 512 entries for the 9-bit indexing and 1 extra for interpolation, i.e. 513
2122 * values for each LUT.
2123 * - Lookup table for exp(x), where x uniform distributed between [-10.0 , 0.0]
2124 * - Lookup table for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
2125 * @param[out] output Pointer to the output tensor
2126 * @return The function returns
2127 * <code>ARM_MATH_ARGUMENT_ERROR</code> if LUTs are NULL
2128 * <code>ARM_MATH_SUCCESS</code> - Successful operation
2129 *
2130 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2131 *
2132 */
2133arm_status arm_softmax_s16(const int16_t *input,
2134 const int32_t num_rows,
2135 const int32_t row_size,
2136 const int32_t mult,
2137 const int32_t shift,
2138 const cmsis_nn_softmax_lut_s16 *softmax_params,
2139 int16_t *output);
2140
2141/**
2142 * @brief U8 softmax function
2143 * @param[in] input Pointer to the input tensor
2144 * @param[in] num_rows Number of rows in the input tensor
2145 * @param[in] row_size Number of elements in each input row
2146 * @param[in] mult Input quantization multiplier
2147 * @param[in] shift Input quantization shift within the range [0, 31]
2148 * @param[in] diff_min Minimum difference with max in row. Used to check if
2149 * the quantized exponential operation can be performed
2150 * @param[out] output Pointer to the output tensor
2151 *
2152 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2153 *
2154 */
2155
2156void arm_softmax_u8(const uint8_t *input,
2157 const int32_t num_rows,
2158 const int32_t row_size,
2159 const int32_t mult,
2160 const int32_t shift,
2161 const int32_t diff_min,
2162 uint8_t *output);
2163
2164/**
2165 * @brief uint8 depthwise convolution function with asymmetric quantization
2166 * Unless specified otherwise, arguments are mandatory.
2167 *
2168 * @param[in] input Pointer to input tensor
2169 * @param[in] input_x Width of input tensor
2170 * @param[in] input_y Height of input tensor
2171 * @param[in] input_ch Channels in input tensor
2172 * @param[in] kernel Pointer to kernel weights
2173 * @param[in] kernel_x Width of kernel
2174 * @param[in] kernel_y Height of kernel
2175 * @param[in] ch_mult Number of channel multiplier
2176 * @param[in] pad_x Padding sizes x
2177 * @param[in] pad_y Padding sizes y
2178 * @param[in] stride_x stride along the width
2179 * @param[in] stride_y stride along the height
2180 * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
2181 * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
2182 * @param[in] bias Pointer to optional bias values. If no bias is
2183 * available, NULL is expected
2184 * @param[in] input_offset Input tensor zero offset
2185 * @param[in] filter_offset Kernel tensor zero offset
2186 * @param[in] output_offset Output tensor zero offset
2187 * @param[in,out] output Pointer to output tensor
2188 * @param[in] output_x Width of output tensor
2189 * @param[in] output_y Height of output tensor
2190 * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
2191 * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
2192 * @param[in] out_shift Amount of right-shift for output
2193 * @param[in] out_mult Output multiplier for requantization
2194 * @return The function returns the following
2195 * <code>ARM_MATH_SUCCESS</code> - Successful operation
2196 *
2197 */
2198arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
2199 const uint16_t input_x,
2200 const uint16_t input_y,
2201 const uint16_t input_ch,
2202 const uint8_t *kernel,
2203 const uint16_t kernel_x,
2204 const uint16_t kernel_y,
2205 const int16_t ch_mult,
2206 const int16_t pad_x,
2207 const int16_t pad_y,
2208 const int16_t stride_x,
2209 const int16_t stride_y,
2210 const int16_t dilation_x,
2211 const int16_t dilation_y,
2212 const int32_t *bias,
2213 const int32_t input_offset,
2214 const int32_t filter_offset,
2215 const int32_t output_offset,
2216 uint8_t *output,
2217 const uint16_t output_x,
2218 const uint16_t output_y,
2219 const int32_t output_activation_min,
2220 const int32_t output_activation_max,
2221 const int32_t out_shift,
2222 const int32_t out_mult);
2223
2224/**
2225 * @defgroup Reshape Reshape Functions
2226 *
2227 */
2228
2229/**
2230 * @brief Reshape a s8 vector into another with different shape
2231 * @param[in] input points to the s8 input vector
2232 * @param[out] output points to the s8 output vector
2233 * @param[in] total_size total size of the input and output vectors in bytes
2234 *
2235 * @note The output is expected to be in a memory area that does not overlap with the input's
2236 *
2237 */
2238void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
2239
2240/**
2241 * @defgroup Concatenation Concatenation Functions
2242 *
2243 */
2244
2245/**
2246 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
2247 * This function should be called for each input tensor to concatenate. The argument offset_x
2248 * will be used to store the input tensor in the correct position in the output tensor
2249 *
2250 * i.e. offset_x = 0
2251 * for(i = 0 i < num_input_tensors; ++i)
2252 * {
2253 * arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
2254 * offset_x += input_x[i]
2255 * }
2256 *
2257 * This function assumes that the output tensor has:
2258 * -# The same height of the input tensor
2259 * -# The same number of channels of the input tensor
2260 * -# The same batch size of the input tensor
2261 *
2262 * Unless specified otherwise, arguments are mandatory.
2263 *
2264 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
2265 * does not involve any arithmetic operation
2266 *
2267 * @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor.
2268 * @param[in] input_x Width of input tensor
2269 * @param[in] input_y Height of input tensor
2270 * @param[in] input_z Channels in input tensor
2271 * @param[in] input_w Batch size in input tensor
2272 * @param[out] output Pointer to output tensor. Expected to be at least
2273 * (input_x * input_y * input_z * input_w) + offset_x
2274 * bytes.
2275 * @param[in] output_x Width of output tensor
2276 * @param[in] offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
2277 * It is user responsibility to provide the correct value
2278 *
2279 * <b> Input constraints</b>
2280 * offset_x is less than output_x
2281 *
2282 */
2283void arm_concatenation_s8_x(const int8_t *input,
2284 const uint16_t input_x,
2285 const uint16_t input_y,
2286 const uint16_t input_z,
2287 const uint16_t input_w,
2288 int8_t *output,
2289 const uint16_t output_x,
2290 const uint32_t offset_x);
2291
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
 *        This function should be called for each input tensor to concatenate. The argument offset_y
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_y = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
 *            offset_y += input_y[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          (input_z * input_w * input_x * input_y) + offset_y
 *                      bytes.
 * @param[in]  output_y Height of output tensor
 * @param[in]  offset_y The offset on the Y axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_y is less than output_y
 *
 */
void arm_concatenation_s8_y(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_y,
                            const uint32_t offset_y);
2338
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
 *        This function should be called for each input tensor to concatenate. The argument offset_z
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_z = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
 *            offset_z += input_z[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with output tensor.
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          (input_x * input_y * input_z * input_w) + offset_z
 *                      bytes.
 * @param[in]  output_z Channels in output tensor
 * @param[in]  offset_z The offset on the Z axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_z is less than output_z
 *
 */
void arm_concatenation_s8_z(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_z,
                            const uint32_t offset_z);
2385
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
 *        This function should be called for each input tensor to concatenate. The argument offset_w
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_w = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
 *            offset_w += input_w[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          input_x * input_y * input_z * input_w
 *                      bytes.
 * @param[in]  offset_w The offset on the W axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 */
void arm_concatenation_s8_w(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint32_t offset_w);
2427/**
2428 * @defgroup SVDF SVDF Layer Functions
2429 *
2430 */
2431
/**
 * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
 *
 * @param[in]   input_ctx             Temporary scratch buffer
 * @param[in]   output_ctx            Temporary output scratch buffer
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset  : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims            State tensor dimensions
 * @param[in]   state_data            Pointer to state tensor
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *
 */
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                       const cmsis_nn_context *output_ctx,
                       const cmsis_nn_svdf_params *svdf_params,
                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
                       const cmsis_nn_dims *input_dims,
                       const q7_t *input_data,
                       const cmsis_nn_dims *state_dims,
                       q7_t *state_data,
                       const cmsis_nn_dims *weights_feature_dims,
                       const q7_t *weights_feature_data,
                       const cmsis_nn_dims *weights_time_dims,
                       const q7_t *weights_time_data,
                       const cmsis_nn_dims *bias_dims,
                       const q31_t *bias_data,
                       const cmsis_nn_dims *output_dims,
                       q7_t *output_data);
2479
/**
 * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
 *
 * @param[in]   input_ctx             Temporary scratch buffer
 * @param[in]   output_ctx            Temporary output scratch buffer
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset  : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims            State tensor dimensions
 * @param[in]   state_data            Pointer to state tensor
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *
 */
arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
                                 const cmsis_nn_context *output_ctx,
                                 const cmsis_nn_svdf_params *svdf_params,
                                 const cmsis_nn_per_tensor_quant_params *input_quant_params,
                                 const cmsis_nn_per_tensor_quant_params *output_quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input_data,
                                 const cmsis_nn_dims *state_dims,
                                 q15_t *state_data,
                                 const cmsis_nn_dims *weights_feature_dims,
                                 const q7_t *weights_feature_data,
                                 const cmsis_nn_dims *weights_time_dims,
                                 const q15_t *weights_time_data,
                                 const cmsis_nn_dims *bias_dims,
                                 const q31_t *bias_data,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output_data);
2527
2528#ifdef __cplusplus
2529}
2530#endif
2531
2532#endif
Note: See TracBrowser for help on using the repository browser.