source: trunk/firmware_v4/Drivers/CMSIS/NN/Include/arm_nnfunctions.h

Last change on this file was 42, checked in by f.jahn, 5 days ago
File size: 127.4 KB
Line 
1/*
2 * Copyright (C) 2010-2022 Arm Limited or its affiliates.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19/* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_nnfunctions.h
22 * Description: Public header file for CMSIS NN Library
23 *
24 * $Date: 19 April 2022
25 * $Revision: V.9.0.0
26 *
27 * Target Processor: Cortex-M CPUs
28 * -------------------------------------------------------------------- */
29
30/**
31 \mainpage CMSIS NN Software Library
32 *
33 * Introduction
34 * ------------
35 *
36 * This user manual describes the CMSIS NN software library,
37 * a collection of efficient neural network kernels developed to maximize the
38 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39 *
40 * The library is divided into a number of functions each covering a specific category:
41 * - Convolution Functions
42 * - Activation Functions
43 * - Fully-connected Layer Functions
44 * - SVDF Layer Functions
45 * - Pooling Functions
46 * - Softmax Functions
47 * - Basic math Functions
48 *
49 * The library has separate functions for operating on different weight and activation data
50 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
51 * kernels is included in the function description. The implementation details are also
52 * described in this paper [1].
53 *
54 * Supported Processors
55 * -------
56 * CMSIS-NN targets Cortex-M processors with typically three different implementations for each function. Each
57 * targets a different group of processors.
58 * - Processors without SIMD capability (e.g, Cortex-M0)
59 * - Processors with DSP extension (e.g. Cortex-M4)
60 * - Processors with MVE extension (e.g. Cortex-M55)
61 * The right implementation is picked through feature flags and the user usually does not have to set it explicitly.
62 *
63 * Function Classification
64 * --------
65 * The functions can be classified into two segments
66 * - Legacy functions supporting ARM's internal symmetric quantization(8 bits).
67 * - Functions that support TensorFlow Lite framework with symmetric quantization(8 bits).
68 *
69 * The legacy functions can be identified with their suffix of _q7 or _q15, and no new development is done there.
70 * The article in [2] describes in detail how to run a network using the legacy functions.
71 *
72 * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
73 * micro. The functions are bit exact to TensorFlow Lite. Refer to the TensorFlow's documentation in [3] on how to run
74 * a TensorFlow Lite model using optimized CMSIS-NN kernels.
75 *
76 * Block Diagram
77 * --------
78 * \image html CMSIS-NN-OVERVIEW.PNG
79 *
80 * Examples
81 * --------
82 *
83 * The library ships with a number of examples which demonstrate how to use the library functions.
84 *
85 * Pre-processor Macros
86 * ------------
87 *
88 * Each library project has different pre-processor macros.
89 *
90 * - ARM_MATH_DSP:
91 *
92 * Define macro ARM_MATH_DSP, If the silicon supports DSP instructions(DSP extension).
93 *
94 * - ARM_MATH_MVEI:
95 *
96 * Define macro ARM_MATH_MVEI, If the silicon supports M-Profile Vector Extension.
97
98 * - ARM_MATH_AUTOVECTORIZE
99 * Used in conjunction with ARM_MATH_MVEI to let the compiler auto vectorize the functions that use inline
100 * assembly. It does not affect functions that use C or intrinsics.
101 * - ARM_MATH_BIG_ENDIAN:
102 *
103 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the legacy
104 * functions i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the library builds for
105 * little endian targets.
106 *
107 * - ARM_NN_TRUNCATE:
108 *
109 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
110 *
111 *
112 * Copyright Notice
113 * ------------
114 *
115 * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
116 *
117 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
118 *
119 * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
120 *
121 https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
122 * [3] https://www.tensorflow.org/lite/microcontrollers/library
123 *
124 * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
125 */
126
127/**
128 * @defgroup groupNN Neural Network Functions
129 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
130 * TensorFlow Lite framework.
131 */
132
133#ifndef _ARM_NNFUNCTIONS_H
134#define _ARM_NNFUNCTIONS_H
135
136#include "arm_nn_math_types.h"
137#include "arm_nn_types.h"
138
139#define USE_INTRINSIC
140
141//#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */
142
143#ifdef __cplusplus
144extern "C" {
145#endif
146
147/**
148 * @brief Enum for specifying activation function types
149 *
150 */
151typedef enum
152{
153 ARM_SIGMOID = 0,
154 /**< Sigmoid activation function */
155 ARM_TANH = 1,
156 /**< Tanh activation function */
157} arm_nn_activation_type;
158
159/**
160 * @defgroup NNConv Convolution Functions
161 *
162 * Collection of convolution, depthwise convolution functions and their variants.
163 *
164 * The convolution is implemented in 2 steps: im2col and GEMM
165 *
166 * im2col is a process of converting each patch of image data into
167 * a column. After im2col, the convolution is computed as matrix-matrix
168 * multiplication.
169 *
170 * To reduce the memory footprint, the im2col is performed partially.
171 * Each iteration, only a few column (i.e., patches) are generated and
172 * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
173 *
174 */
175
176/**
177 * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
178 cmsis-nn
179 * to perform the convolution.
180 *
181 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
182 arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
183 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
184 * Range of conv_params->input_offset : [-127, 128]
185 * Range of conv_params->output_offset : [-128, 127]
186 * @param[in] quant_params Per-channel quantization info.
187 * It contains the multiplier and shift values to be applied to each output channel
188 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
189 * @param[in] input_data Input (activation) data pointer. Data type: int8
190 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
191 * spatial filter dimensions
192 * @param[in] filter_data Filter data pointer. Data type: int8
193 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
194 * @param[in] bias_data Bias data pointer. Data type: int32
195 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
196 * @param[out] output_data Output data pointer. Data type: int8
197 *
198 * @return The function returns either
199 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
200 * <code>ARM_MATH_SUCCESS</code> on successful completion.
201 *
202 */
203arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
204 const cmsis_nn_conv_params *conv_params,
205 const cmsis_nn_per_channel_quant_params *quant_params,
206 const cmsis_nn_dims *input_dims,
207 const q7_t *input_data,
208 const cmsis_nn_dims *filter_dims,
209 const q7_t *filter_data,
210 const cmsis_nn_dims *bias_dims,
211 const int32_t *bias_data,
212 const cmsis_nn_dims *output_dims,
213 q7_t *output_data);
214
215/**
216 * @brief Get the required buffer size for arm_convolve_wrapper_s8
217 *
218 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
219 * Range of conv_params->input_offset : [-127, 128]
220 * Range of conv_params->output_offset : [-128, 127]
221 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN]
222 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
223 * filter dimensions
224 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
225 *
226 * @return The function returns required buffer size(bytes)
227 *
228 */
229int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
230 const cmsis_nn_dims *input_dims,
231 const cmsis_nn_dims *filter_dims,
232 const cmsis_nn_dims *output_dims);
233
234/**
235 * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
236 cmsis-nn
237 * to perform the convolution.
238 *
239 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
240 arm_convolve_wrapper_s16_get_buffer_size will return the buffer_size if required
241 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
242 * conv_params->input_offset : Not used
243 * conv_params->output_offset : Not used
244 * @param[in] quant_params Per-channel quantization info.
245 * It contains the multiplier and shift values to be applied to each output channel
246 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
247 * @param[in] input_data Input (activation) data pointer. Data type: int16
248 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
249 * spatial filter dimensions
250 * @param[in] filter_data Filter data pointer. Data type: int8
251 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
252 * @param[in] bias_data Bias data pointer. Data type: int64
253 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
254 * @param[out] output_data Output data pointer. Data type: int16
255 *
256 * @return The function returns either
257 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
258 * <code>ARM_MATH_SUCCESS</code> on successful completion.
259 *
260 */
261arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
262 const cmsis_nn_conv_params *conv_params,
263 const cmsis_nn_per_channel_quant_params *quant_params,
264 const cmsis_nn_dims *input_dims,
265 const q15_t *input_data,
266 const cmsis_nn_dims *filter_dims,
267 const q7_t *filter_data,
268 const cmsis_nn_dims *bias_dims,
269 const int64_t *bias_data,
270 const cmsis_nn_dims *output_dims,
271 q15_t *output_data);
272
273/**
274 * @brief Get the required buffer size for arm_convolve_wrapper_s16
275 *
276 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
277 * conv_params->input_offset : Not used
278 * conv_params->output_offset : Not used
279 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN]
280 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
281 * filter dimensions
282 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
283 *
284 * @return The function returns required buffer size(bytes)
285 *
286 */
287int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
288 const cmsis_nn_dims *input_dims,
289 const cmsis_nn_dims *filter_dims,
290 const cmsis_nn_dims *output_dims);
291
292/**
293 * @brief Basic s8 convolution function
294 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
295 arm_convolve_s8_get_buffer_size will return the buffer_size if required
296 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
297 * Range of conv_params->input_offset : [-127, 128]
298 * Range of conv_params->output_offset : [-128, 127]
299 * @param[in] quant_params Per-channel quantization info.
300 * It contains the multiplier and shift values to be applied to each output channel
301 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
302 * @param[in] input_data Input (activation) data pointer. Data type: int8
303 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
304 * spatial filter dimensions
305 * @param[in] filter_data Filter data pointer. Data type: int8
306 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
307 * @param[in] bias_data Optional bias data pointer. Data type: int32
308 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
309 * @param[out] output_data Output data pointer. Data type: int8
310
311 * @return The function returns <code>ARM_MATH_SUCCESS</code>
312 *
313 * @details
314 * 1. Supported framework: TensorFlow Lite micro
315 * 2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
316 * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
317 *
318 */
319arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
320 const cmsis_nn_conv_params *conv_params,
321 const cmsis_nn_per_channel_quant_params *quant_params,
322 const cmsis_nn_dims *input_dims,
323 const q7_t *input_data,
324 const cmsis_nn_dims *filter_dims,
325 const q7_t *filter_data,
326 const cmsis_nn_dims *bias_dims,
327 const int32_t *bias_data,
328 const cmsis_nn_dims *output_dims,
329 q7_t *output_data);
330
331/**
332 * @brief Get the required buffer size for s8 convolution function
333 *
334 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
335 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
336 * are the spatial filter dimensions
337 * @return The function returns required buffer size(bytes)
338 *
339 */
340int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
341
342/**
343 * @brief Basic s16 convolution function
344 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
345 arm_convolve_s16_get_buffer_size will return the buffer_size if required
346 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
347 * conv_params->input_offset : Not used
348 * conv_params->output_offset : Not used
349 * @param[in] quant_params Per-channel quantization info.
350 * It contains the multiplier and shift values to be applied to each output channel
351 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
352 * @param[in] input_data Input (activation) data pointer. Data type: int16
353 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
354 * spatial filter dimensions
355 * @param[in] filter_data Filter data pointer. Data type: int8
356 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
357 * @param[in] bias_data Optional bias data pointer. Data type: int64
358 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
359 * @param[out] output_data Output data pointer. Data type: int16
360
361 * @return The function returns <code>ARM_MATH_SUCCESS</code>
362 *
363 * @details
364 * 1. Supported framework: TensorFlow Lite micro
365 * 2. q7/q15 is used as data type even though it is s8/s16 data. It is done so to be consistent with existing APIs.
366 * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
367 *
368 */
369arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
370 const cmsis_nn_conv_params *conv_params,
371 const cmsis_nn_per_channel_quant_params *quant_params,
372 const cmsis_nn_dims *input_dims,
373 const q15_t *input_data,
374 const cmsis_nn_dims *filter_dims,
375 const q7_t *filter_data,
376 const cmsis_nn_dims *bias_dims,
377 const int64_t *bias_data,
378 const cmsis_nn_dims *output_dims,
379 q15_t *output_data);
380/**
381 * @brief Optimized s16 convolution function
382 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
383 arm_convolve_fast_s16_get_buffer_size will return the buffer_size if required
384 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
385 * conv_params->input_offset : Not used
386 * conv_params->output_offset : Not used
387 * @param[in] quant_params Per-channel quantization info.
388 * It contains the multiplier and shift values to be applied to each output channel
389 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
390 * @param[in] input_data Input (activation) data pointer. Data type: int16
391 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
392 * spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must not
393 exceed 512
394 * @param[in] filter_data Filter data pointer. Data type: int8
395 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
396 * @param[in] bias_data Optional bias data pointer. Data type: int64
397 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
398 * @param[out] output_data Output data pointer. Data type: int16
399
400 * @return The function returns <code>ARM_MATH_SUCCESS</code>
401 *
402 * @details
403 * 1. Supported framework: TensorFlow Lite micro
404 * 2. q7/q15 is used as data type even though it is s8/s16 data. It is done so to be consistent with existing APIs.
405 * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
406 * 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
407 *
408 */
409
410arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
411 const cmsis_nn_conv_params *conv_params,
412 const cmsis_nn_per_channel_quant_params *quant_params,
413 const cmsis_nn_dims *input_dims,
414 const q15_t *input_data,
415 const cmsis_nn_dims *filter_dims,
416 const q7_t *filter_data,
417 const cmsis_nn_dims *bias_dims,
418 const int64_t *bias_data,
419 const cmsis_nn_dims *output_dims,
420 q15_t *output_data);
421
422/**
423 * @brief Get the required buffer size for s16 convolution function
424 *
425 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
426 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
427 * are the spatial filter dimensions
428 * @return The function returns required buffer size(bytes)
429 *
430 */
431int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
432
433/**
434 * @brief Get the required buffer size for fast s16 convolution function
435 *
436 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
437 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
438 * are the spatial filter dimensions
439 * @return The function returns required buffer size(bytes)
440 *
441 */
442int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
443
444/**
445 * @brief Basic Q7 convolution function
446 * @param[in] Im_in pointer to input tensor
447 * @param[in] dim_im_in input tensor dimension
448 * @param[in] ch_im_in number of input tensor channels
449 * @param[in] wt pointer to kernel weights
450 * @param[in] ch_im_out number of filters, i.e., output tensor channels
451 * @param[in] dim_kernel filter kernel size
452 * @param[in] padding padding sizes
453 * @param[in] stride convolution stride
454 * @param[in] bias pointer to bias
455 * @param[in] bias_shift amount of left-shift for bias
456 * @param[in] out_shift amount of right-shift for output
457 * @param[in,out] Im_out pointer to output tensor
458 * @param[in] dim_im_out output tensor dimension
459 * @param[in,out] bufferA pointer to buffer space for input
460 * @param[in,out] bufferB pointer to buffer space for output
461 * @return The function returns <code>ARM_MATH_SUCCESS</code>
462 *
463 */
464arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
465 const uint16_t dim_im_in,
466 const uint16_t ch_im_in,
467 const q7_t *wt,
468 const uint16_t ch_im_out,
469 const uint16_t dim_kernel,
470 const uint16_t padding,
471 const uint16_t stride,
472 const q7_t *bias,
473 const uint16_t bias_shift,
474 const uint16_t out_shift,
475 q7_t *Im_out,
476 const uint16_t dim_im_out,
477 q15_t *bufferA,
478 q7_t *bufferB);
479
480/**
481 * @brief Basic Q7 convolution function (non-square shape)
482 * @param[in] Im_in pointer to input tensor
483 * @param[in] dim_im_in_x input tensor dimension x
484 * @param[in] dim_im_in_y input tensor dimension y
485 * @param[in] ch_im_in number of input tensor channels
486 * @param[in] wt pointer to kernel weights
487 * @param[in] ch_im_out number of filters, i.e., output tensor channels
488 * @param[in] dim_kernel_x filter kernel size x
489 * @param[in] dim_kernel_y filter kernel size y
490 * @param[in] padding_x padding size x
491 * @param[in] padding_y padding size y
492 * @param[in] stride_x convolution stride x
493 * @param[in] stride_y convolution stride y
494 * @param[in] bias pointer to bias
495 * @param[in] bias_shift amount of left-shift for bias
496 * @param[in] out_shift amount of right-shift for output
497 * @param[in,out] Im_out pointer to output tensor
498 * @param[in] dim_im_out_x output tensor dimension x
499 * @param[in] dim_im_out_y output tensor dimension y
500 * @param[in,out] bufferA pointer to buffer space for input
501 * @param[in,out] bufferB pointer to buffer space for output
502 * @return The function returns <code>ARM_MATH_SUCCESS</code>
503 */
504arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
505 const uint16_t dim_im_in_x,
506 const uint16_t dim_im_in_y,
507 const uint16_t ch_im_in,
508 const q7_t *wt,
509 const uint16_t ch_im_out,
510 const uint16_t dim_kernel_x,
511 const uint16_t dim_kernel_y,
512 const uint16_t padding_x,
513 const uint16_t padding_y,
514 const uint16_t stride_x,
515 const uint16_t stride_y,
516 const q7_t *bias,
517 const uint16_t bias_shift,
518 const uint16_t out_shift,
519 q7_t *Im_out,
520 const uint16_t dim_im_out_x,
521 const uint16_t dim_im_out_y,
522 q15_t *bufferA,
523 q7_t *bufferB);
524
525/**
526 * @brief Basic Q15 convolution function
527 * @param[in] Im_in pointer to input tensor
528 * @param[in] dim_im_in input tensor dimension
529 * @param[in] ch_im_in number of input tensor channels
530 * @param[in] wt pointer to kernel weights
531 * @param[in] ch_im_out number of filters, i.e., output tensor channels
532 * @param[in] dim_kernel filter kernel size
533 * @param[in] padding padding sizes
534 * @param[in] stride convolution stride
535 * @param[in] bias pointer to bias
536 * @param[in] bias_shift amount of left-shift for bias
537 * @param[in] out_shift amount of right-shift for output
538 * @param[in,out] Im_out pointer to output tensor
539 * @param[in] dim_im_out output tensor dimension
540 * @param[in,out] bufferA pointer to buffer space for input
541 * @param[in,out] bufferB pointer to buffer space for output
542 * @return The function returns <code>ARM_MATH_SUCCESS</code>
543 *
544 */
545arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
546 const uint16_t dim_im_in,
547 const uint16_t ch_im_in,
548 const q15_t *wt,
549 const uint16_t ch_im_out,
550 const uint16_t dim_kernel,
551 const uint16_t padding,
552 const uint16_t stride,
553 const q15_t *bias,
554 const uint16_t bias_shift,
555 const uint16_t out_shift,
556 q15_t *Im_out,
557 const uint16_t dim_im_out,
558 q15_t *bufferA,
559 q7_t *bufferB);
560
561/**
562 * @brief Fast Q7 convolution function
563 * @param[in] Im_in pointer to input tensor
564 * @param[in] dim_im_in input tensor dimension
565 * @param[in] ch_im_in number of input tensor channels
566 * @param[in] wt pointer to kernel weights
567 * @param[in] ch_im_out number of filters, i.e., output tensor channels
568 * @param[in] dim_kernel filter kernel size
569 * @param[in] padding padding sizes
570 * @param[in] stride convolution stride
571 * @param[in] bias pointer to bias
572 * @param[in] bias_shift amount of left-shift for bias
573 * @param[in] out_shift amount of right-shift for output
574 * @param[in,out] Im_out pointer to output tensor
575 * @param[in] dim_im_out output tensor dimension
576 * @param[in,out] bufferA pointer to buffer space for input
577 * @param[in,out] bufferB pointer to buffer space for output
578 * @return The function returns either
579 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
580 *
581 * This function is the version with full list of optimization tricks, but with
582 * some constraints:
583 * ch_im_in is multiple of 4
584 * ch_im_out is multiple of 2
585 */
586arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
587 const uint16_t dim_im_in,
588 const uint16_t ch_im_in,
589 const q7_t *wt,
590 const uint16_t ch_im_out,
591 const uint16_t dim_kernel,
592 const uint16_t padding,
593 const uint16_t stride,
594 const q7_t *bias,
595 const uint16_t bias_shift,
596 const uint16_t out_shift,
597 q7_t *Im_out,
598 const uint16_t dim_im_out,
599 q15_t *bufferA,
600 q7_t *bufferB);
601
602/**
603 * @brief Fast Q7 convolution function (non-square shape)
604 * @param[in] Im_in pointer to input tensor
605 * @param[in] dim_im_in_x input tensor dimension x
606 * @param[in] dim_im_in_y input tensor dimension y
607 * @param[in] ch_im_in number of input tensor channels
608 * @param[in] wt pointer to kernel weights
609 * @param[in] ch_im_out number of filters, i.e., output tensor channels
610 * @param[in] dim_kernel_x filter kernel size x
611 * @param[in] dim_kernel_y filter kernel size y
612 * @param[in] padding_x padding size x
613 * @param[in] padding_y padding size y
614 * @param[in] stride_x convolution stride x
615 * @param[in] stride_y convolution stride y
616 * @param[in] bias pointer to bias
617 * @param[in] bias_shift amount of left-shift for bias
618 * @param[in] out_shift amount of right-shift for output
619 * @param[in,out] Im_out pointer to output tensor
620 * @param[in] dim_im_out_x output tensor dimension x
621 * @param[in] dim_im_out_y output tensor dimension y
622 * @param[in,out] bufferA pointer to buffer space for input
623 * @param[in,out] bufferB pointer to buffer space for output
624 * @return The function returns either
625 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
626 *
627 * This function is the version with full list of optimization tricks, but with
628 * some constraints:
629 * ch_im_in is multiple of 4
630 * ch_im_out is multiple of 2
631 */
632
633arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
634 const uint16_t dim_im_in_x,
635 const uint16_t dim_im_in_y,
636 const uint16_t ch_im_in,
637 const q7_t *wt,
638 const uint16_t ch_im_out,
639 const uint16_t dim_kernel_x,
640 const uint16_t dim_kernel_y,
641 const uint16_t padding_x,
642 const uint16_t padding_y,
643 const uint16_t stride_x,
644 const uint16_t stride_y,
645 const q7_t *bias,
646 const uint16_t bias_shift,
647 const uint16_t out_shift,
648 q7_t *Im_out,
649 const uint16_t dim_im_out_x,
650 const uint16_t dim_im_out_y,
651 q15_t *bufferA,
652 q7_t *bufferB);
653
654/**
655 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
656 * @param[in] Im_in pointer to input tensor
657 * @param[in] dim_im_in_x input tensor dimension x
658 * @param[in] dim_im_in_y input tensor dimension y
659 * @param[in] ch_im_in number of input tensor channels
660 * @param[in] wt pointer to kernel weights
661 * @param[in] ch_im_out number of filters, i.e., output tensor channels
662 * @param[in] dim_kernel_x filter kernel size x
663 * @param[in] dim_kernel_y filter kernel size y
664 * @param[in] padding_x padding size x
665 * @param[in] padding_y padding size y
666 * @param[in] stride_x convolution stride x
667 * @param[in] stride_y convolution stride y
668 * @param[in] bias pointer to bias
669 * @param[in] bias_shift amount of left-shift for bias
670 * @param[in] out_shift amount of right-shift for output
671 * @param[in,out] Im_out pointer to output tensor
672 * @param[in] dim_im_out_x output tensor dimension x
673 * @param[in] dim_im_out_y output tensor dimension y
674 * @param[in,out] bufferA pointer to buffer space for input
675 * @param[in,out] bufferB pointer to buffer space for output
676 * @return The function returns either
677 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
678 * <code>ARM_MATH_SUCCESS</code> on successful completion.
679 *
680 * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
681 * and dim_kernel_y=1). It can be used for
682 * second half of MobileNets after depthwise separable convolution.
683 *
684 * This function is the version with full list of optimization tricks, but with
685 * some constraints:
686 * ch_im_in is multiple of 4
687 * ch_im_out is multiple of 2
688 */
689arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
690 const uint16_t dim_im_in_x,
691 const uint16_t dim_im_in_y,
692 const uint16_t ch_im_in,
693 const q7_t *wt,
694 const uint16_t ch_im_out,
695 const uint16_t dim_kernel_x,
696 const uint16_t dim_kernel_y,
697 const uint16_t padding_x,
698 const uint16_t padding_y,
699 const uint16_t stride_x,
700 const uint16_t stride_y,
701 const q7_t *bias,
702 const uint16_t bias_shift,
703 const uint16_t out_shift,
704 q7_t *Im_out,
705 const uint16_t dim_im_out_x,
706 const uint16_t dim_im_out_y,
707 q15_t *bufferA,
708 q7_t *bufferB);
709
710/**
711 * @brief Fast s8 version for 1x1 convolution (non-square shape)
712 *
713 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
714 arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required
715 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
716 * Range of conv_params->input_offset : [-127, 128]
717 * Range of conv_params->output_offset : [-128, 127]
718 * @param[in] quant_params Per-channel quantization info.
719 * It contains the multiplier and shift values to be applied to each output channel
720 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
721 * @param[in] input_data Input (activation) data pointer. Data type: int8
722 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
723 * @param[in] filter_data Filter data pointer. Data type: int8
724 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
725 * @param[in] bias_data Optional bias data pointer. Data type: int32
726 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
727 * @param[out] output_data Output data pointer. Data type: int8
728 *
729 * @return The function returns either
730 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
731 * <code>ARM_MATH_SUCCESS</code> on successful completion.
732 *
733 * @details
734 * - Supported framework : TensorFlow Lite Micro
735 * - The following constrains on the arguments apply
736 * -# input_dims->c is a multiple of 4
737 * -# conv_params->padding.w = conv_params->padding.h = 0
738 * -# conv_params->stride.w = conv_params->stride.h = 1
739 *
740 */
741arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
742 const cmsis_nn_conv_params *conv_params,
743 const cmsis_nn_per_channel_quant_params *quant_params,
744 const cmsis_nn_dims *input_dims,
745 const q7_t *input_data,
746 const cmsis_nn_dims *filter_dims,
747 const q7_t *filter_data,
748 const cmsis_nn_dims *bias_dims,
749 const int32_t *bias_data,
750 const cmsis_nn_dims *output_dims,
751 q7_t *output_data);
752
753/**
754 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
755 *
756 * @param[in] input_dims Input (activation) dimensions
757 * @return The function returns the required buffer size in bytes
758 *
759 */
760int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
761
762/**
763 * @brief 1xn convolution
764 *
765 * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
766 arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
767 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
768 * Range of conv_params->input_offset : [-127, 128]
769 * Range of conv_params->output_offset : [-128, 127]
770 * @param[in] quant_params Per-channel quantization info.
771 * It contains the multiplier and shift values to be applied to each output channel
772 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
773 * @param[in] input_data Input (activation) data pointer. Data type: int8
774 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
775 * spatial filter dimension
776 * @param[in] filter_data Filter data pointer. Data type: int8
777 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
778 * @param[in] bias_data Optional bias data pointer. Data type: int32
779 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
780 * @param[out] output_data Output data pointer. Data type: int8
781 *
782 * @return The function returns either
783 * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
784 * <code>ARM_MATH_SUCCESS</code> on successful completion.
785 *
786 * @details
787 * - Supported framework : TensorFlow Lite Micro
788 * - The following constrains on the arguments apply
789 * -# input_dims->n equals 1
790 * -# ouput_dims->w is a multiple of 4
791 * -# Explicit constraints(since it is for 1xN convolution)
792 * -## input_dims->h equals 1
793 * -## output_dims->h equals 1
794 * -## filter_dims->h equals 1
795 *@todo Remove constraint on output_dims->w to make the function generic.
796 *
797 */
798arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
799 const cmsis_nn_conv_params *conv_params,
800 const cmsis_nn_per_channel_quant_params *quant_params,
801 const cmsis_nn_dims *input_dims,
802 const q7_t *input_data,
803 const cmsis_nn_dims *filter_dims,
804 const q7_t *filter_data,
805 const cmsis_nn_dims *bias_dims,
806 const int32_t *bias_data,
807 const cmsis_nn_dims *output_dims,
808 q7_t *output_data);
809
810/**
811 * @brief Get the required additional buffer size for 1xn convolution
812 *
813 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
814 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
815 * horizontal spatial filter dimension
816 * @return The function returns required buffer size(bytes)
817 *
818 */
819int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
820
821/**
822 * @brief Q7 version of convolution for RGB image
823 * @param[in] Im_in pointer to input tensor
824 * @param[in] dim_im_in input tensor dimension
825 * @param[in] ch_im_in number of input tensor channels
826 * @param[in] wt pointer to kernel weights
827 * @param[in] ch_im_out number of filters, i.e., output tensor channels
828 * @param[in] dim_kernel filter kernel size
829 * @param[in] padding padding sizes
830 * @param[in] stride convolution stride
831 * @param[in] bias pointer to bias
832 * @param[in] bias_shift amount of left-shift for bias
833 * @param[in] out_shift amount of right-shift for output
834 * @param[in,out] Im_out pointer to output tensor
835 * @param[in] dim_im_out output tensor dimension
836 * @param[in,out] bufferA pointer to buffer space for input
837 * @param[in,out] bufferB pointer to buffer space for output
838 * @return The function returns either
839 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
840 *
841 * This kernel is written exclusively for convolution with ch_im_in
842 * equals 3. This applies on the first layer of CNNs which has input
843 * image with RGB format.
844 */
845
846arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
847 const uint16_t dim_im_in,
848 const uint16_t ch_im_in,
849 const q7_t *wt,
850 const uint16_t ch_im_out,
851 const uint16_t dim_kernel,
852 const uint16_t padding,
853 const uint16_t stride,
854 const q7_t *bias,
855 const uint16_t bias_shift,
856 const uint16_t out_shift,
857 q7_t *Im_out,
858 const uint16_t dim_im_out,
859 q15_t *bufferA,
860 q7_t *bufferB);
861
862/**
863 * @brief Fast Q15 convolution function
864 * @param[in] Im_in pointer to input tensor
865 * @param[in] dim_im_in input tensor dimension
866 * @param[in] ch_im_in number of input tensor channels
867 * @param[in] wt pointer to kernel weights
868 * @param[in] ch_im_out number of filters, i.e., output tensor channels
869 * @param[in] dim_kernel filter kernel size
870 * @param[in] padding padding sizes
871 * @param[in] stride convolution stride
872 * @param[in] bias pointer to bias
873 * @param[in] bias_shift amount of left-shift for bias
874 * @param[in] out_shift amount of right-shift for output
875 * @param[in,out] Im_out pointer to output tensor
876 * @param[in] dim_im_out output tensor dimension
877 * @param[in,out] bufferA pointer to buffer space for input
878 * @param[in,out] bufferB pointer to buffer space for output
879 * @return The function returns either
880 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
881 *
882 * This function is the version with full list of optimization tricks, but with
883 * some contraints:
884 * ch_im_in is multiple of 2
885 * ch_im_out is multiple of 2
886 * dim_im_out is a multiple of 2
887 */
888
889arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
890 const uint16_t dim_im_in,
891 const uint16_t ch_im_in,
892 const q15_t *wt,
893 const uint16_t ch_im_out,
894 const uint16_t dim_kernel,
895 const uint16_t padding,
896 const uint16_t stride,
897 const q15_t *bias,
898 const uint16_t bias_shift,
899 const uint16_t out_shift,
900 q15_t *Im_out,
901 const uint16_t dim_im_out,
902 q15_t *bufferA,
903 q7_t *bufferB);
904
905/**
906 * @brief Fast Q15 convolution function (non-sqaure shape)
907 * @param[in] Im_in pointer to input tensor
908 * @param[in] dim_im_in_x input tensor dimension x
909 * @param[in] dim_im_in_y input tensor dimension y
910 * @param[in] ch_im_in number of input tensor channels
911 * @param[in] wt pointer to kernel weights
912 * @param[in] ch_im_out number of filters, i.e., output tensor channels
913 * @param[in] dim_kernel_x filter kernel size x
914 * @param[in] dim_kernel_y filter kernel size y
915 * @param[in] padding_x padding size x
916 * @param[in] padding_y padding size y
917 * @param[in] stride_x convolution stride x
918 * @param[in] stride_y convolution stride y
919 * @param[in] bias pointer to bias
920 * @param[in] bias_shift amount of left-shift for bias
921 * @param[in] out_shift amount of right-shift for output
922 * @param[in,out] Im_out pointer to output tensor
923 * @param[in] dim_im_out_x output tensor dimension x
924 * @param[in] dim_im_out_y output tensor dimension y
925 * @param[in,out] bufferA pointer to buffer space for input
926 * @param[in,out] bufferB pointer to buffer space for output
927 * @return The function returns either
928 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
929 *
930 * @details
931 *
932 * <b>Buffer size:</b>
933 *
934 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
935 *
936 * bufferB size: 0
937 *
938 * <b>Input dimension constraints:</b>
939 *
940 * ch_im_in is multiple of 2
941 *
942 * ch_im_out is multipe of 2
943 *
944 */
945
946arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
947 const uint16_t dim_im_in_x,
948 const uint16_t dim_im_in_y,
949 const uint16_t ch_im_in,
950 const q15_t *wt,
951 const uint16_t ch_im_out,
952 const uint16_t dim_kernel_x,
953 const uint16_t dim_kernel_y,
954 const uint16_t padding_x,
955 const uint16_t padding_y,
956 const uint16_t stride_x,
957 const uint16_t stride_y,
958 const q15_t *bias,
959 const uint16_t bias_shift,
960 const uint16_t out_shift,
961 q15_t *Im_out,
962 const uint16_t dim_im_out_x,
963 const uint16_t dim_im_out_y,
964 q15_t *bufferA,
965 q7_t *bufferB);
966
967/**
968 * @brief Q7 depthwise separable convolution function
969 * @param[in] Im_in pointer to input tensor
970 * @param[in] dim_im_in input tensor dimension
971 * @param[in] ch_im_in number of input tensor channels
972 * @param[in] wt pointer to kernel weights
973 * @param[in] ch_im_out number of filters, i.e., output tensor channels
974 * @param[in] dim_kernel filter kernel size
975 * @param[in] padding padding sizes
976 * @param[in] stride convolution stride
977 * @param[in] bias pointer to bias
978 * @param[in] bias_shift amount of left-shift for bias
979 * @param[in] out_shift amount of right-shift for output
980 * @param[in,out] Im_out pointer to output tensor
981 * @param[in] dim_im_out output tensor dimension
982 * @param[in,out] bufferA pointer to buffer space for input
983 * @param[in,out] bufferB pointer to buffer space for output
984 * @return The function returns either
985 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
986 *
987 * This function is the version with full list of optimization tricks, but with
988 * some contraints:
989 * ch_im_in is multiple of 2
990 * ch_im_out is multiple of 2
991 */
992
993arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
994 const uint16_t dim_im_in,
995 const uint16_t ch_im_in,
996 const q7_t *wt,
997 const uint16_t ch_im_out,
998 const uint16_t dim_kernel,
999 const uint16_t padding,
1000 const uint16_t stride,
1001 const q7_t *bias,
1002 const uint16_t bias_shift,
1003 const uint16_t out_shift,
1004 q7_t *Im_out,
1005 const uint16_t dim_im_out,
1006 q15_t *bufferA,
1007 q7_t *bufferB);
1008
1009/**
1010 * @brief Q7 depthwise separable convolution function (non-square shape)
1011 * @param[in] Im_in pointer to input tensor
1012 * @param[in] dim_im_in_x input tensor dimension x
1013 * @param[in] dim_im_in_y input tensor dimension y
1014 * @param[in] ch_im_in number of input tensor channels
1015 * @param[in] wt pointer to kernel weights
1016 * @param[in] ch_im_out number of filters, i.e., output tensor channels
1017 * @param[in] dim_kernel_x filter kernel size x
1018 * @param[in] dim_kernel_y filter kernel size y
1019 * @param[in] padding_x padding sizes x
1020 * @param[in] padding_y padding sizes y
1021 * @param[in] stride_x convolution stride x
1022 * @param[in] stride_y convolution stride y
1023 * @param[in] bias pointer to bias
1024 * @param[in] bias_shift amount of left-shift for bias
1025 * @param[in] out_shift amount of right-shift for output
1026 * @param[in,out] Im_out pointer to output tensor
1027 * @param[in] dim_im_out_x output tensor dimension x
1028 * @param[in] dim_im_out_y output tensor dimension y
1029 * @param[in,out] bufferA pointer to buffer space for input
1030 * @param[in,out] bufferB pointer to buffer space for output
1031 * @return The function returns either
1032 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
1033 *
1034 * This function is the version with full list of optimization tricks, but with
1035 * some contraints:
1036 * ch_im_in is multiple of 2
1037 * ch_im_out is multiple of 2
1038 */
1039arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
1040 const uint16_t dim_im_in_x,
1041 const uint16_t dim_im_in_y,
1042 const uint16_t ch_im_in,
1043 const q7_t *wt,
1044 const uint16_t ch_im_out,
1045 const uint16_t dim_kernel_x,
1046 const uint16_t dim_kernel_y,
1047 const uint16_t padding_x,
1048 const uint16_t padding_y,
1049 const uint16_t stride_x,
1050 const uint16_t stride_y,
1051 const q7_t *bias,
1052 const uint16_t bias_shift,
1053 const uint16_t out_shift,
1054 q7_t *Im_out,
1055 const uint16_t dim_im_out_x,
1056 const uint16_t dim_im_out_y,
1057 q15_t *bufferA,
1058 q7_t *bufferB);
1059
1060/**
1061 * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
1062 *
1063 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1064 * definition file to see if an additional buffer is required.
1065 * Optional function {API}_get_buffer_size() provides the buffer
1066 * size if required.
1067 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1068 * dw_conv_params->dilation is not used.
1069 * Range of dw_conv_params->input_offset : [-127, 128]
1070 * Range of dw_conv_params->output_offset : [-128, 127]
1071 * @param[in] quant_params Per-channel quantization info.
1072 * It contains the multiplier and shift values to be applied to each
1073 * output channel
1074 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1075 * Batch argument N is not used and assumed to be 1.
1076 * @param[in] input_data Input (activation) data pointer. Data type: int8
1077 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1078 * @param[in] filter_data Filter data pointer. Data type: int8
1079 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1080 * @param[in] bias_data Bias data pointer. Data type: int32
1081 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
1082 * @param[in, out] output_data Output data pointer. Data type: int8
1083 * @return The function returns
1084 * <code>ARM_MATH_SUCCESS</code> - Successful completion.
1085 *
1086 * @details
1087 * - Supported framework: TensorFlow Lite
1088 * - Picks one of the the following functions
1089 * -# arm_depthwise_conv_s8()
1090 * -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
1091 * -# arm_depthwise_conv_s8_opt()
1092 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1093 * - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
1094 * boundary.
1095 */
1096arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
1097 const cmsis_nn_dw_conv_params *dw_conv_params,
1098 const cmsis_nn_per_channel_quant_params *quant_params,
1099 const cmsis_nn_dims *input_dims,
1100 const q7_t *input_data,
1101 const cmsis_nn_dims *filter_dims,
1102 const q7_t *filter_data,
1103 const cmsis_nn_dims *bias_dims,
1104 const int32_t *bias_data,
1105 const cmsis_nn_dims *output_dims,
1106 q7_t *output_data);
1107
1108/**
1109 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
1110 *
1111 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1112 * dw_conv_params->dilation is not used.
1113 * Range of dw_conv_params->input_offset : [-127, 128]
1114 * Range of dw_conv_params->input_offset : [-128, 127]
1115 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1116 * Batch argument N is not used and assumed to be 1.
1117 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1118 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
1119 * @return Size of additional memory required for optimizations in bytes.
1120 *
1121 */
1122int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1123 const cmsis_nn_dims *input_dims,
1124 const cmsis_nn_dims *filter_dims,
1125 const cmsis_nn_dims *output_dims);
1126
1127/**
1128 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
1129 *
1130 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1131 * definition file to see if an additional buffer is required.
1132 * Optional function {API}_get_buffer_size() provides the buffer
1133 * size if an additional buffer is required.
1134 * exists if additional memory is.
1135 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1136 * dw_conv_params->dilation is not used.
1137 * Range of dw_conv_params->input_offset : [-127, 128]
1138 * Range of dw_conv_params->input_offset : [-128, 127]
1139 * @param[in] quant_params Per-channel quantization info.
1140 * It contains the multiplier and shift values to be applied to each
1141 * output channel
1142 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1143 * Batch argument N is not used.
1144 * @param[in] input_data Input (activation) data pointer. Data type: int8
1145 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1146 * @param[in] filter_data Filter data pointer. Data type: int8
1147 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1148 * @param[in] bias_data Bias data pointer. Data type: int32
1149 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
1150 * @param[in, out] output_data Output data pointer. Data type: int8
1151 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1152 *
1153 * @details
1154 * - Supported framework: TensorFlow Lite
1155 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1156 */
1157arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
1158 const cmsis_nn_dw_conv_params *dw_conv_params,
1159 const cmsis_nn_per_channel_quant_params *quant_params,
1160 const cmsis_nn_dims *input_dims,
1161 const q7_t *input_data,
1162 const cmsis_nn_dims *filter_dims,
1163 const q7_t *filter_data,
1164 const cmsis_nn_dims *bias_dims,
1165 const int32_t *bias_data,
1166 const cmsis_nn_dims *output_dims,
1167 q7_t *output_data);
1168
1169/**
1170 * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
1171 *
1172 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1173 * definition file to see if an additional buffer is required.
1174 * Optional function {API}_get_buffer_size() provides the buffer
1175 * size if an additional buffer is required.
1176 * exists if additional memory is.
1177 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1178 * conv_params->input_offset : Not used
1179 * conv_params->output_offset : Not used
1180 * @param[in] quant_params Per-channel quantization info.
1181 * It contains the multiplier and shift values to be applied to each
1182 * output channel
1183 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1184 * Batch argument N is not used.
1185 * @param[in] input_data Input (activation) data pointer. Data type: int8
1186 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1187 * @param[in] filter_data Filter data pointer. Data type: int8
1188 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1189 * @param[in] bias_data Bias data pointer. Data type: int64
1190 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
1191 * @param[in, out] output_data Output data pointer. Data type: int16
1192 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1193 *
1194 * @details
1195 * - Supported framework: TensorFlow Lite
1196 * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs.
1197 */
1198arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
1199 const cmsis_nn_dw_conv_params *dw_conv_params,
1200 const cmsis_nn_per_channel_quant_params *quant_params,
1201 const cmsis_nn_dims *input_dims,
1202 const q15_t *input_data,
1203 const cmsis_nn_dims *filter_dims,
1204 const q7_t *filter_data,
1205 const cmsis_nn_dims *bias_dims,
1206 const int64_t *bias_data,
1207 const cmsis_nn_dims *output_dims,
1208 q15_t *output_data);
1209
1210/**
1211 * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
1212 * the input arguments(documented below). Refer arm_depthwise_conv_s8() for function
1213 * argument details.
1214 *
1215 * @return The function returns one of the following
1216 * <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors
1217 * <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis
1218 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1219 *
1220 * @details
1221 * - Supported framework : TensorFlow Lite Micro
1222 * - The following constrains on the arguments apply
1223 * -# Number of input channel equals number of output channels
1224 * -# Filter height and width equals 3
1225 * -# Padding along x is either 0 or 1.
1226 *
1227 */
1228arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
1229 const cmsis_nn_dw_conv_params *dw_conv_params,
1230 const cmsis_nn_per_channel_quant_params *quant_params,
1231 const cmsis_nn_dims *input_dims,
1232 const q7_t *input_data,
1233 const cmsis_nn_dims *filter_dims,
1234 const q7_t *filter_data,
1235 const cmsis_nn_dims *bias_dims,
1236 const int32_t *bias_data,
1237 const cmsis_nn_dims *output_dims,
1238 q7_t *output_data);
1239
1240/**
1241 * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
1242 * Refer arm_depthwise_conv_s8() for function argument details.
1243 *
1244 * @return The function returns one of the following
1245 * <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or
1246 * ch_mult != 1
1247 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1248 *
1249 * @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
1250 * for the following if MVE optimizations(Arm Helium Technology) are used.
1251 * - Output shift
1252 * - Output multiplier
1253 * - Output bias
1254 * - kernel
1255 * @details
1256 * - Supported framework: TensorFlow Lite
1257 * - The following constrains on the arguments apply
1258 * -# Number of input channel equals number of output channels or ch_mult equals 1
1259 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1260 * - Reccomended when number of channels is 4 or greater.
1261 *
1262 */
1263arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
1264 const cmsis_nn_dw_conv_params *dw_conv_params,
1265 const cmsis_nn_per_channel_quant_params *quant_params,
1266 const cmsis_nn_dims *input_dims,
1267 const q7_t *input_data,
1268 const cmsis_nn_dims *filter_dims,
1269 const q7_t *filter_data,
1270 const cmsis_nn_dims *bias_dims,
1271 const int32_t *bias_data,
1272 const cmsis_nn_dims *output_dims,
1273 q7_t *output_data);
1274
1275/**
1276 * @brief Get the required buffer size for optimized s8 depthwise convolution
1277 * function with constraint that in_channel equals out_channel.
1278 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1279 * Batch argument N is not used.
1280 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
1281 * @return The function returns required buffer size in bytes
1282 *
1283 */
1284int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1285
1286/**
1287 * @defgroup FC Fully-connected Layer Functions
1288 *
1289 * Collection of fully-connected and matrix multiplication functions.
1290 *
1291 * Fully-connected layer is basically a matrix-vector multiplication
1292 * with bias. The matrix is the weights and the input/output vectors
1293 * are the activation values. Supported {weight, activation} precisions
1294 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
1295 *
 * Here we have two types of kernel functions. The basic function
 * implements the function using a regular GEMV approach. The opt functions
 * operate with weights in interleaved formats.
1299 *
1300 */
1301
1302/**
1303 *@brief Q7 basic fully-connected layer function
1304 *@param[in] pV pointer to input vector
1305 *@param[in] pM pointer to matrix weights
1306 *@param[in] dim_vec length of the vector
1307 *@param[in] num_of_rows number of rows in weight matrix
1308 *@param[in] bias_shift amount of left-shift for bias
1309 *@param[in] out_shift amount of right-shift for output
1310 *@param[in] bias pointer to bias
1311 *@param[in,out] pOut pointer to output vector
1312 *@param[in,out] vec_buffer pointer to buffer space for input
1313 *@return The function returns <code>ARM_MATH_SUCCESS</code>
1314 *
1315 */
1316
1317arm_status arm_fully_connected_q7(const q7_t *pV,
1318 const q7_t *pM,
1319 const uint16_t dim_vec,
1320 const uint16_t num_of_rows,
1321 const uint16_t bias_shift,
1322 const uint16_t out_shift,
1323 const q7_t *bias,
1324 q7_t *pOut,
1325 q15_t *vec_buffer);
1326
1327/**
1328 * @brief Basic s8 Fully Connected function.
1329 *
1330 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1331 * definition file to see if an additional buffer is required.
1332 * Optional function {API}_get_buffer_size() provides the buffer
1333 * size if an additional buffer is required.
1334 * @param[in] fc_params Fully Connected layer parameters.
1335 * Range of fc_params->input_offset : [-127, 128]
1336 * fc_params->filter_offset : 0
1337 * Range of fc_params->output_offset : [-128, 127]
1338 * @param[in] quant_params Per-tensor quantization info.
1339 * It contains the multiplier and shift values to be applied to the output tensor.
1340 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1341 * Input dimension is taken as Nx(H * W * C_IN)
1342 * @param[in] input_data Input (activation) data pointer. Data type: int8
1343 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
1344 * N : accumulation depth and equals (H * W * C_IN) from input_dims
1345 * C : output depth and equals C_OUT in output_dims
1346 * H & W : Not used
1347 * @param[in] filter_data Filter data pointer. Data type: int8
1348 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1349 * N, H, W : Not used
1350 * @param[in] bias_data Bias data pointer. Data type: int32
1351 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
1352 * N : Batches
1353 * C_OUT : Output depth
1354 * H & W : Not used.
1355 * @param[in, out] output_data Output data pointer. Data type: int8
1356 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1357 *
1358 * @details
1359 * - Supported framework: TensorFlow Lite
1360 * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
1361 */
1362arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
1363 const cmsis_nn_fc_params *fc_params,
1364 const cmsis_nn_per_tensor_quant_params *quant_params,
1365 const cmsis_nn_dims *input_dims,
1366 const q7_t *input_data,
1367 const cmsis_nn_dims *filter_dims,
1368 const q7_t *filter_data,
1369 const cmsis_nn_dims *bias_dims,
1370 const int32_t *bias_data,
1371 const cmsis_nn_dims *output_dims,
1372 q7_t *output_data);
1373
1374/**
1375 * @brief Get the required buffer size for S8 basic fully-connected and
1376 * matrix multiplication layer function for TF Lite
1377 * @param[in] filter_dims dimension of filter
1378 * @return The function returns required buffer size in bytes
1379 *
1380 */
1381int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
1382
1383/**
1384 * @brief Basic s16 Fully Connected function.
1385 *
1386 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1387 * definition file to see if an additional buffer is required.
1388 * Optional function {API}_get_buffer_size() provides the buffer
1389 * size if an additional buffer is required.
1390 * @param[in] fc_params Fully Connected layer parameters.
1391 * fc_params->input_offset : 0
1392 * fc_params->filter_offset : 0
1393 * fc_params->output_offset : 0
1394 * @param[in] quant_params Per-tensor quantization info.
1395 * It contains the multiplier and shift values to be applied to the output tensor.
1396 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1397 * Input dimension is taken as Nx(H * W * C_IN)
1398 * @param[in] input_data Input (activation) data pointer. Data type: int16
1399 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
1400 * N : accumulation depth and equals (H * W * C_IN) from input_dims
1401 * C : output depth and equals C_OUT in output_dims
1402 * H & W : Not used
1403 * @param[in] filter_data Filter data pointer. Data type: int8
1404 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
1405 * N, H, W : Not used
1406 * @param[in] bias_data Bias data pointer. Data type: int64
1407 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
1408 * N : Batches
1409 * C_OUT : Output depth
1410 * H & W : Not used.
1411 * @param[in, out] output_data Output data pointer. Data type: int16
1412 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1413 *
1414 * @details
1415 * - Supported framework: TensorFlow Lite
1416 * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs.
1417 */
1418arm_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
1419 const cmsis_nn_fc_params *fc_params,
1420 const cmsis_nn_per_tensor_quant_params *quant_params,
1421 const cmsis_nn_dims *input_dims,
1422 const q15_t *input_data,
1423 const cmsis_nn_dims *filter_dims,
1424 const q7_t *filter_data,
1425 const cmsis_nn_dims *bias_dims,
1426 const int64_t *bias_data,
1427 const cmsis_nn_dims *output_dims,
1428 q15_t *output_data);
1429
1430/**
1431 * @brief Get the required buffer size for S16 basic fully-connected and
1432 * matrix multiplication layer function for TF Lite
1433 * @param[in] filter_dims dimension of filter
1434 * @return The function returns required buffer size in bytes
1435 *
1436 */
1437int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims);
1438
1439/**
1440 * @brief Q7 opt fully-connected layer function
1441 * @param[in] pV pointer to input vector
1442 * @param[in] pM pointer to matrix weights
1443 * @param[in] dim_vec length of the vector
1444 * @param[in] num_of_rows number of rows in weight matrix
1445 * @param[in] bias_shift amount of left-shift for bias
1446 * @param[in] out_shift amount of right-shift for output
1447 * @param[in] bias pointer to bias
1448 * @param[in,out] pOut pointer to output vector
1449 * @param[in,out] vec_buffer pointer to buffer space for input
1450 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1451 *
1452 */
1453
1454arm_status arm_fully_connected_q7_opt(const q7_t *pV,
1455 const q7_t *pM,
1456 const uint16_t dim_vec,
1457 const uint16_t num_of_rows,
1458 const uint16_t bias_shift,
1459 const uint16_t out_shift,
1460 const q7_t *bias,
1461 q7_t *pOut,
1462 q15_t *vec_buffer);
1463
1464/**
1465 * @brief Q15 basic fully-connected layer function
1466 * @param[in] pV pointer to input vector
1467 * @param[in] pM pointer to matrix weights
1468 * @param[in] dim_vec length of the vector
1469 * @param[in] num_of_rows number of rows in weight matrix
1470 * @param[in] bias_shift amount of left-shift for bias
1471 * @param[in] out_shift amount of right-shift for output
1472 * @param[in] bias pointer to bias
1473 * @param[in,out] pOut pointer to output vector
1474 * @param[in,out] vec_buffer pointer to buffer space for input
1475 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1476 *
1477 */
1478
1479arm_status arm_fully_connected_q15(const q15_t *pV,
1480 const q15_t *pM,
1481 const uint16_t dim_vec,
1482 const uint16_t num_of_rows,
1483 const uint16_t bias_shift,
1484 const uint16_t out_shift,
1485 const q15_t *bias,
1486 q15_t *pOut,
1487 q15_t *vec_buffer);
1488
1489/**
1490 * @brief Q15 opt fully-connected layer function
1491 * @param[in] pV pointer to input vector
1492 * @param[in] pM pointer to matrix weights
1493 * @param[in] dim_vec length of the vector
1494 * @param[in] num_of_rows number of rows in weight matrix
1495 * @param[in] bias_shift amount of left-shift for bias
1496 * @param[in] out_shift amount of right-shift for output
1497 * @param[in] bias pointer to bias
1498 * @param[in,out] pOut pointer to output vector
1499 * @param[in,out] vec_buffer pointer to buffer space for input
1500 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1501 *
1502 */
1503
1504arm_status arm_fully_connected_q15_opt(const q15_t *pV,
1505 const q15_t *pM,
1506 const uint16_t dim_vec,
1507 const uint16_t num_of_rows,
1508 const uint16_t bias_shift,
1509 const uint16_t out_shift,
1510 const q15_t *bias,
1511 q15_t *pOut,
1512 q15_t *vec_buffer);
1513
1514/**
1515 * @brief Mixed Q15-Q7 fully-connected layer function
1516 * @param[in] pV pointer to input vector
1517 * @param[in] pM pointer to matrix weights
1518 * @param[in] dim_vec length of the vector
1519 * @param[in] num_of_rows number of rows in weight matrix
1520 * @param[in] bias_shift amount of left-shift for bias
1521 * @param[in] out_shift amount of right-shift for output
1522 * @param[in] bias pointer to bias
1523 * @param[in,out] pOut pointer to output vector
1524 * @param[in,out] vec_buffer pointer to buffer space for input
1525 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1526 *
1527 */
1528
1529arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
1530 const q7_t *pM,
1531 const uint16_t dim_vec,
1532 const uint16_t num_of_rows,
1533 const uint16_t bias_shift,
1534 const uint16_t out_shift,
1535 const q7_t *bias,
1536 q15_t *pOut,
1537 q15_t *vec_buffer);
1538
1539/**
1540 * @brief Mixed Q15-Q7 opt fully-connected layer function
1541 * @param[in] pV pointer to input vector
1542 * @param[in] pM pointer to matrix weights
1543 * @param[in] dim_vec length of the vector
1544 * @param[in] num_of_rows number of rows in weight matrix
1545 * @param[in] bias_shift amount of left-shift for bias
1546 * @param[in] out_shift amount of right-shift for output
1547 * @param[in] bias pointer to bias
1548 * @param[in,out] pOut pointer to output vector
1549 * @param[in,out] vec_buffer pointer to buffer space for input
1550 * @return The function returns <code>ARM_MATH_SUCCESS</code>
1551 *
1552 */
1553
1554arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
1555 const q7_t *pM,
1556 const uint16_t dim_vec,
1557 const uint16_t num_of_rows,
1558 const uint16_t bias_shift,
1559 const uint16_t out_shift,
1560 const q7_t *bias,
1561 q15_t *pOut,
1562 q15_t *vec_buffer);
1563
1564/**
1565 * @brief Matrix-Multiplication Kernels for Convolution
1566 *
1567 * These functions are used within convolution layer functions for
1568 * matrix multiplication.
1569 *
1570 * The implementation is similar to CMSIS-DSP arm_mat_mult functions
1571 * with one Q7 and one Q15 operands. The Q15 operand is the im2col
1572 * output which is always with 2 columns.
1573 *
1574 */
1575
1576/**
1577 * @brief Matrix-multiplication function for convolution
1578 * @param[in] pA pointer to operand A
1579 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
1580 * @param[in] ch_im_out numRow of A
1581 * @param[in] numCol_A numCol of A
1582 * @param[in] bias_shift amount of left-shift for bias
1583 * @param[in] out_shift amount of right-shift for output
1584 * @param[in] bias the bias
1585 * @param[in,out] pOut pointer to output
1586 * @return The function returns the incremented output pointer
1587 */
1588
1589q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
1590 const q15_t *pInBuffer,
1591 const uint16_t ch_im_out,
1592 const uint16_t numCol_A,
1593 const uint16_t bias_shift,
1594 const uint16_t out_shift,
1595 const q7_t *bias,
1596 q7_t *pOut);
1597
1598#ifdef __cplusplus
1599}
1600#endif
1601
1602/*
1603 * Other functions
1604 * These layers are typically not timing critical
1605 * Basic implementation is supported here
1606 */
1607
1608#ifdef __cplusplus
1609extern "C" {
1610#endif
1611
1612/**
1613 * @defgroup BasicMath Basic math functions
1614 *
1615 * Elementwise add and multiplication functions.
1616 *
1617 */
1618
1619/**
1620 * @brief s8 elementwise add of two vectors
1621 * @param[in] input_1_vect pointer to input vector 1
1622 * @param[in] input_2_vect pointer to input vector 2
1623 * @param[in] input_1_offset offset for input 1. Range: -127 to 128
1624 * @param[in] input_1_mult multiplier for input 1
1625 * @param[in] input_1_shift shift for input 1
1626 * @param[in] input_2_offset offset for input 2. Range: -127 to 128
1627 * @param[in] input_2_mult multiplier for input 2
1628 * @param[in] input_2_shift shift for input 2
1629 * @param[in] left_shift input left shift
1630 * @param[in,out] output pointer to output vector
1631 * @param[in] out_offset output offset. Range: -128 to 127
1632 * @param[in] out_mult output multiplier
1633 * @param[in] out_shift output shift
1634 * @param[in] out_activation_min minimum value to clamp output to. Min: -128
1635 * @param[in] out_activation_max maximum value to clamp output to. Max: 127
1636 * @param[in] block_size number of samples
1637 * @return The function returns ARM_MATH_SUCCESS
1638 */
1639arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
1640 const int8_t *input_2_vect,
1641 const int32_t input_1_offset,
1642 const int32_t input_1_mult,
1643 const int32_t input_1_shift,
1644 const int32_t input_2_offset,
1645 const int32_t input_2_mult,
1646 const int32_t input_2_shift,
1647 const int32_t left_shift,
1648 int8_t *output,
1649 const int32_t out_offset,
1650 const int32_t out_mult,
1651 const int32_t out_shift,
1652 const int32_t out_activation_min,
1653 const int32_t out_activation_max,
1654 const int32_t block_size);
1655
1656/**
1657 * @brief s16 elementwise add of two vectors
1658 * @param[in] input_1_vect pointer to input vector 1
1659 * @param[in] input_2_vect pointer to input vector 2
1660 * @param[in] input_1_offset offset for input 1. Not used.
1661 * @param[in] input_1_mult multiplier for input 1
1662 * @param[in] input_1_shift shift for input 1
1663 * @param[in] input_2_offset offset for input 2. Not used.
1664 * @param[in] input_2_mult multiplier for input 2
1665 * @param[in] input_2_shift shift for input 2
1666 * @param[in] left_shift input left shift
1667 * @param[in,out] output pointer to output vector
1668 * @param[in] out_offset output offset. Not used.
1669 * @param[in] out_mult output multiplier
1670 * @param[in] out_shift output shift
1671 * @param[in] out_activation_min minimum value to clamp output to. Min: -32768
1672 * @param[in] out_activation_max maximum value to clamp output to. Max: 32767
1673 * @param[in] block_size number of samples
1674 * @return The function returns ARM_MATH_SUCCESS
1675 */
1676arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
1677 const int16_t *input_2_vect,
1678 const int32_t input_1_offset,
1679 const int32_t input_1_mult,
1680 const int32_t input_1_shift,
1681 const int32_t input_2_offset,
1682 const int32_t input_2_mult,
1683 const int32_t input_2_shift,
1684 const int32_t left_shift,
1685 int16_t *output,
1686 const int32_t out_offset,
1687 const int32_t out_mult,
1688 const int32_t out_shift,
1689 const int32_t out_activation_min,
1690 const int32_t out_activation_max,
1691 const int32_t block_size);
1692
1693/**
1694 * @brief s8 elementwise multiplication
1695 * @param[in] input_1_vect pointer to input vector 1
1696 * @param[in] input_2_vect pointer to input vector 2
1697 * @param[in] input_1_offset offset for input 1. Range: -127 to 128
1698 * @param[in] input_2_offset offset for input 2. Range: -127 to 128
1699 * @param[in,out] output pointer to output vector
1700 * @param[in] out_offset output offset. Range: -128 to 127
1701 * @param[in] out_mult output multiplier
1702 * @param[in] out_shift output shift
1703 * @param[in] out_activation_min minimum value to clamp output to. Min: -128
1704 * @param[in] out_activation_max maximum value to clamp output to. Max: 127
1705 * @param[in] block_size number of samples
1706 * @return The function returns ARM_MATH_SUCCESS
1707 *
1708 * @details Supported framework: TensorFlow Lite micro
1709 */
1710arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
1711 const int8_t *input_2_vect,
1712 const int32_t input_1_offset,
1713 const int32_t input_2_offset,
1714 int8_t *output,
1715 const int32_t out_offset,
1716 const int32_t out_mult,
1717 const int32_t out_shift,
1718 const int32_t out_activation_min,
1719 const int32_t out_activation_max,
1720 const int32_t block_size);
1721
1722/**
1723 * @brief s16 elementwise multiplication
1724 * @param[in] input_1_vect pointer to input vector 1
1725 * @param[in] input_2_vect pointer to input vector 2
1726 * @param[in] input_1_offset offset for input 1. Not used.
1727 * @param[in] input_2_offset offset for input 2. Not used.
1728 * @param[in,out] output pointer to output vector
1729 * @param[in] out_offset output offset. Not used.
1730 * @param[in] out_mult output multiplier
1731 * @param[in] out_shift output shift
1732 * @param[in] out_activation_min minimum value to clamp output to. Min: -32768
1733 * @param[in] out_activation_max maximum value to clamp output to. Max: 32767
1734 * @param[in] block_size number of samples
1735 * @return The function returns ARM_MATH_SUCCESS
1736 *
1737 * @details Supported framework: TensorFlow Lite micro
1738 */
1739arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
1740 const int16_t *input_2_vect,
1741 const int32_t input_1_offset,
1742 const int32_t input_2_offset,
1743 int16_t *output,
1744 const int32_t out_offset,
1745 const int32_t out_mult,
1746 const int32_t out_shift,
1747 const int32_t out_activation_min,
1748 const int32_t out_activation_max,
1749 const int32_t block_size);
1750
1751/**
1752 * @defgroup Acti Activation Functions
1753 *
1754 * Perform activation layers, including ReLU (Rectified Linear Unit),
1755 * sigmoid and tanh
1756 *
1757 */
1758
1759/**
1760 * @brief Q7 RELU function
1761 * @param[in,out] data pointer to input
1762 * @param[in] size number of elements
1763 * @return none.
1764 */
1765
1766void arm_relu_q7(q7_t *data, uint16_t size);
1767
1768/**
1769 * @brief s8 ReLU6 function
1770 * @param[in,out] data pointer to input
1771 * @param[in] size number of elements
1772 */
1773
1774void arm_relu6_s8(q7_t *data, uint16_t size);
1775
1776/**
1777 * @brief Q15 RELU function
1778 * @param[in,out] data pointer to input
1779 * @param[in] size number of elements
1780 * @return none.
1781 */
1782
1783void arm_relu_q15(q15_t *data, uint16_t size);
1784
1785/**
1786 * @brief Q7 neural network activation function using direct table look-up
1787 * @param[in,out] data pointer to input
1788 * @param[in] size number of elements
1789 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3
1790 * @param[in] type type of activation functions
1791 * @return none.
1792 */
1793
1794void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
1795
1796/**
1797 * @brief Q15 neural network activation function using direct table look-up
1798 * @param[in,out] data pointer to input
1799 * @param[in] size number of elements
1800 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3
1801 * @param[in] type type of activation functions
1802 * @return none.
1803 *
1804 * @details
1805 *
1806 * This is the direct table look-up approach.
1807 *
1808 * Assume here the integer part of the fixed-point is <= 3.
1809 * More than 3 just not making much sense, makes no difference with
1810 * saturation followed by any of these activation functions.
1811 */
1812
1813void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
1814
1815/**
1816 * @defgroup Pooling Pooling Functions
1817 *
1818 * Perform pooling functions, including max pooling and average pooling
1819 *
1820 */
1821
1822/**
1823 * @brief Q7 max pooling function
1824 * @param[in] Im_in pointer to input tensor
1825 * @param[in] dim_im_in input tensor dimension
1826 * @param[in] ch_im_in number of input tensor channels
1827 * @param[in] dim_kernel filter kernel size
1828 * @param[in] padding padding sizes
1829 * @param[in] stride convolution stride
1830 * @param[in] dim_im_out output tensor dimension
1831 * @param[in,out] bufferA pointer to buffer space for input
1832 * @param[in,out] Im_out pointer to output tensor
1833 * @return none.
1834 *
1835 */
1836
1837void arm_maxpool_q7_HWC(q7_t *Im_in,
1838 const uint16_t dim_im_in,
1839 const uint16_t ch_im_in,
1840 const uint16_t dim_kernel,
1841 const uint16_t padding,
1842 const uint16_t stride,
1843 const uint16_t dim_im_out,
1844 q7_t *bufferA,
1845 q7_t *Im_out);
1846
1847/**
1848 * @brief Q7 average pooling function
1849 * @param[in] Im_in pointer to input tensor
1850 * @param[in] dim_im_in input tensor dimension
1851 * @param[in] ch_im_in number of input tensor channels
1852 * @param[in] dim_kernel filter kernel size
1853 * @param[in] padding padding sizes
1854 * @param[in] stride convolution stride
1855 * @param[in] dim_im_out output tensor dimension
1856 * @param[in,out] bufferA pointer to buffer space for input
1857 * @param[in,out] Im_out pointer to output tensor
1858 * @return none.
1859 *
1860 */
1861
1862void arm_avepool_q7_HWC(q7_t *Im_in,
1863 const uint16_t dim_im_in,
1864 const uint16_t ch_im_in,
1865 const uint16_t dim_kernel,
1866 const uint16_t padding,
1867 const uint16_t stride,
1868 const uint16_t dim_im_out,
1869 q7_t *bufferA,
1870 q7_t *Im_out);
1871
1872/**
1873 * @brief s8 average pooling function.
1874 *
1875 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1876 * definition file to see if an additional buffer is required.
1877 * Optional function {API}_get_buffer_size() provides the buffer
1878 * size if an additional buffer is required.
1879 * @param[in] pool_params Pooling parameters
1880 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1881 * Argument 'N' is not used.
1882 * @param[in] input_data Input (activation) data pointer. Data type: int8
1883 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
1884 * Argument N and C are not used.
1885 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
1886 * Argument N is not used.
1887 * C_OUT equals C_IN.
1888 * @param[in, out] output_data Output data pointer. Data type: int8
1889 * @return The function returns
1890 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1891 *
1892 * @details
1893 * - Supported Framework: TensorFlow Lite
1894 *
1895 */
1896arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
1897 const cmsis_nn_pool_params *pool_params,
1898 const cmsis_nn_dims *input_dims,
1899 const q7_t *input_data,
1900 const cmsis_nn_dims *filter_dims,
1901 const cmsis_nn_dims *output_dims,
1902 q7_t *output_data);
1903
1904/**
1905 * @brief Get the required buffer size for S8 average pooling function
1906 * @param[in] dim_dst_width output tensor dimension
1907 * @param[in] ch_src number of input tensor channels
1908 * @return The function returns required buffer size in bytes
1909 *
1910 */
1911int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
1912
1913/**
1914 * @brief s16 average pooling function.
1915 *
1916 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1917 * definition file to see if an additional buffer is required.
1918 * Optional function {API}_get_buffer_size() provides the buffer
1919 * size if an additional buffer is required.
1920 * @param[in] pool_params Pooling parameters
1921 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1922 * Argument 'N' is not used.
1923 * @param[in] input_data Input (activation) data pointer. Data type: int16
1924 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
1925 * Argument N and C are not used.
1926 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
1927 * Argument N is not used.
1928 * C_OUT equals C_IN.
1929 * @param[in, out] output_data Output data pointer. Data type: int16
1930 * @return The function returns
1931 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1932 *
1933 * @details
1934 * - Supported Framework: TensorFlow Lite
1935 *
1936 */
1937arm_status arm_avgpool_s16(const cmsis_nn_context *ctx,
1938 const cmsis_nn_pool_params *pool_params,
1939 const cmsis_nn_dims *input_dims,
1940 const int16_t *input_data,
1941 const cmsis_nn_dims *filter_dims,
1942 const cmsis_nn_dims *output_dims,
1943 int16_t *output_data);
1944
1945/**
1946 * @brief Get the required buffer size for S16 average pooling function
1947 * @param[in] dim_dst_width output tensor dimension
1948 * @param[in] ch_src number of input tensor channels
1949 * @return The function returns required buffer size in bytes
1950 *
1951 */
1952int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src);
1953
1954/**
1955 * @brief s8 max pooling function.
1956 *
1957 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1958 * definition file to see if an additional buffer is required.
1959 * Optional function {API}_get_buffer_size() provides the buffer
1960 * size if an additional buffer is required.
1961 * @param[in] pool_params Pooling parameters
1962 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1963 * Argument 'N' is not used.
1964 * @param[in] input_data Input (activation) data pointer. The input tensor must not
1965 * overlap with the output tensor. Data type: int8
1966 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
1967 * Argument N and C are not used.
1968 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
1969 * Argument N is not used.
1970 * C_OUT equals C_IN.
1971 * @param[in, out] output_data Output data pointer. Data type: int8
1972 * @return The function returns
1973 * <code>ARM_MATH_SUCCESS</code> - Successful operation
1974 *
1975 * @details
1976 * - Supported Framework: TensorFlow Lite
1977 *
1978 */
1979arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
1980 const cmsis_nn_pool_params *pool_params,
1981 const cmsis_nn_dims *input_dims,
1982 const q7_t *input_data,
1983 const cmsis_nn_dims *filter_dims,
1984 const cmsis_nn_dims *output_dims,
1985 q7_t *output_data);
1986
1987/**
1988 * @brief s16 max pooling function.
1989 *
1990 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
1991 * definition file to see if an additional buffer is required.
1992 * Optional function {API}_get_buffer_size() provides the buffer
1993 * size if an additional buffer is required.
1994 * @param[in] pool_params Pooling parameters
1995 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
1996 * Argument 'N' is not used.
1997 * @param[in] src Input (activation) data pointer. The input tensor must not
1998 * overlap with the output tensor. Data type: int16
1999 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
2000 * Argument N and C are not used.
2001 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
2002 * Argument N is not used.
2003 * C_OUT equals C_IN.
2004 * @param[in, out] dst Output data pointer. Data type: int16
2005 * @return The function returns
2006 * <code>ARM_MATH_SUCCESS</code> - Successful operation
2007 *
2008 * @details
2009 * - Supported Framework: TensorFlow Lite
2010 *
2011 */
2012arm_status arm_max_pool_s16(const cmsis_nn_context *ctx,
2013 const cmsis_nn_pool_params *pool_params,
2014 const cmsis_nn_dims *input_dims,
2015 const int16_t *src,
2016 const cmsis_nn_dims *filter_dims,
2017 const cmsis_nn_dims *output_dims,
2018 int16_t *dst);
2019
2020/**
2021 * @defgroup Softmax Softmax Functions
2022 *
2023 * EXP(2) based softmax functions.
2024 *
2025 */
2026
2027/**
2028 * @brief Q7 softmax function
2029 * @param[in] vec_in pointer to input vector
2030 * @param[in] dim_vec input vector dimension
2031 * @param[out] p_out pointer to output vector
2032 *
2033 * @note This function is an optimized version which is not bit-accurate with
2034 * TensorFlow Lite's kernel
2035 *
2036 */
2037
2038void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out);
2039
2040/**
2041 * @brief Q7 softmax function with batch parameter
2042 * @param[in] vec_in pointer to input vector
2043 * @param[in] nb_batches number of batches
2044 * @param[in] dim_vec input vector dimension
2045 * @param[out] p_out pointer to output vector
2046 * @return none.
2047 *
2048 * @note This function is an optimized version which is not bit-accurate with
2049 * TensorFlow Lite's kernel
2050 *
2051 */
2052
2053void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out);
2054/**
2055 * @brief Q15 softmax function
2056 * @param[in] vec_in pointer to input vector
2057 * @param[in] dim_vec input vector dimension
2058 * @param[out] p_out pointer to output vector
2059 * @return none.
2060 *
2061 * @note This function is an optimized version which is not bit-accurate with
2062 * TensorFlow Lite's kernel
2063 *
2064 */
2065
2066void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out);
2067
2068/**
2069 * @brief S8 softmax function
2070 * @param[in] input Pointer to the input tensor
2071 * @param[in] num_rows Number of rows in the input tensor
2072 * @param[in] row_size Number of elements in each input row
2073 * @param[in] mult Input quantization multiplier
2074 * @param[in] shift Input quantization shift within the range [0, 31]
2075 * @param[in] diff_min Minimum difference with max in row. Used to check if
2076 * the quantized exponential operation can be performed
2077 * @param[out] output Pointer to the output tensor
2078 *
2079 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2080 *
2081 */
2082void arm_softmax_s8(const int8_t *input,
2083 const int32_t num_rows,
2084 const int32_t row_size,
2085 const int32_t mult,
2086 const int32_t shift,
2087 const int32_t diff_min,
2088 int8_t *output);
2089
2090/**
2091 * @brief S8 to s16 softmax function
2092 * @param[in] input Pointer to the input tensor
2093 * @param[in] num_rows Number of rows in the input tensor
2094 * @param[in] row_size Number of elements in each input row
2095 * @param[in] mult Input quantization multiplier
2096 * @param[in] shift Input quantization shift within the range [0, 31]
2097 * @param[in] diff_min Minimum difference with max in row. Used to check if
2098 * the quantized exponential operation can be performed
2099 * @param[out] output Pointer to the output tensor
2100 *
2101 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2102 *
2103 */
2104void arm_softmax_s8_s16(const int8_t *input,
2105 const int32_t num_rows,
2106 const int32_t row_size,
2107 const int32_t mult,
2108 const int32_t shift,
2109 const int32_t diff_min,
2110 int16_t *output);
2111
2112/**
2113 * @brief S16 softmax function
2114 * @param[in] input Pointer to the input tensor
2115 * @param[in] num_rows Number of rows in the input tensor
2116 * @param[in] row_size Number of elements in each input row
2117 * @param[in] mult Input quantization multiplier
2118 * @param[in] shift Input quantization shift within the range [0, 31]
2119 * @param[in] softmax_params Softmax s16 layer parameters with two pointers to LUTs specified below.
2120 * For indexing the high 9 bits are used and 7 remaining for interpolation.
2121 * That means 512 entries for the 9-bit indexing and 1 extra for interpolation, i.e. 513
2122 * values for each LUT.
2123 * - Lookup table for exp(x), where x uniform distributed between [-10.0 , 0.0]
2124 * - Lookup table for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
2125 * @param[out] output Pointer to the output tensor
2126 * @return The function returns
2127 * <code>ARM_MATH_ARGUMENT_ERROR</code> if LUTs are NULL
2128 * <code>ARM_MATH_SUCCESS</code> - Successful operation
2129 *
2130 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2131 *
2132 */
2133arm_status arm_softmax_s16(const int16_t *input,
2134 const int32_t num_rows,
2135 const int32_t row_size,
2136 const int32_t mult,
2137 const int32_t shift,
2138 const cmsis_nn_softmax_lut_s16 *softmax_params,
2139 int16_t *output);
2140
2141/**
2142 * @brief U8 softmax function
2143 * @param[in] input Pointer to the input tensor
2144 * @param[in] num_rows Number of rows in the input tensor
2145 * @param[in] row_size Number of elements in each input row
2146 * @param[in] mult Input quantization multiplier
2147 * @param[in] shift Input quantization shift within the range [0, 31]
2148 * @param[in] diff_min Minimum difference with max in row. Used to check if
2149 * the quantized exponential operation can be performed
2150 * @param[out] output Pointer to the output tensor
2151 *
2152 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2153 *
2154 */
2155
2156void arm_softmax_u8(const uint8_t *input,
2157 const int32_t num_rows,
2158 const int32_t row_size,
2159 const int32_t mult,
2160 const int32_t shift,
2161 const int32_t diff_min,
2162 uint8_t *output);
2163
2164/**
2165 * @brief uint8 depthwise convolution function with asymmetric quantization
2166 * Unless specified otherwise, arguments are mandatory.
2167 *
2168 * @param[in] input Pointer to input tensor
2169 * @param[in] input_x Width of input tensor
2170 * @param[in] input_y Height of input tensor
2171 * @param[in] input_ch Channels in input tensor
2172 * @param[in] kernel Pointer to kernel weights
2173 * @param[in] kernel_x Width of kernel
2174 * @param[in] kernel_y Height of kernel
2175 * @param[in] ch_mult Number of channel multiplier
2176 * @param[in] pad_x Padding sizes x
2177 * @param[in] pad_y Padding sizes y
2178 * @param[in] stride_x stride along the width
2179 * @param[in] stride_y stride along the height
2180 * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
2181 * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
2182 * @param[in] bias Pointer to optional bias values. If no bias is
2183 * available, NULL is expected
2184 * @param[in] input_offset Input tensor zero offset
2185 * @param[in] filter_offset Kernel tensor zero offset
2186 * @param[in] output_offset Output tensor zero offset
2187 * @param[in,out] output Pointer to output tensor
2188 * @param[in] output_x Width of output tensor
2189 * @param[in] output_y Height of output tensor
2190 * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
2191 * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
2192 * @param[in] out_shift Amount of right-shift for output
2193 * @param[in] out_mult Output multiplier for requantization
2194 * @return The function returns the following
2195 * <code>ARM_MATH_SUCCESS</code> - Successful operation
2196 *
2197 */
2198arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
2199 const uint16_t input_x,
2200 const uint16_t input_y,
2201 const uint16_t input_ch,
2202 const uint8_t *kernel,
2203 const uint16_t kernel_x,
2204 const uint16_t kernel_y,
2205 const int16_t ch_mult,
2206 const int16_t pad_x,
2207 const int16_t pad_y,
2208 const int16_t stride_x,
2209 const int16_t stride_y,
2210 const int16_t dilation_x,
2211 const int16_t dilation_y,
2212 const int32_t *bias,
2213 const int32_t input_offset,
2214 const int32_t filter_offset,
2215 const int32_t output_offset,
2216 uint8_t *output,
2217 const uint16_t output_x,
2218 const uint16_t output_y,
2219 const int32_t output_activation_min,
2220 const int32_t output_activation_max,
2221 const int32_t out_shift,
2222 const int32_t out_mult);
2223
2224/**
2225 * @defgroup Reshape Reshape Functions
2226 *
2227 */
2228
2229/**
2230 * @brief Reshape a s8 vector into another with different shape
2231 * @param[in] input points to the s8 input vector
2232 * @param[out] output points to the s8 output vector
2233 * @param[in] total_size total size of the input and output vectors in bytes
2234 *
2235 * @note The output is expected to be in a memory area that does not overlap with the input's
2236 *
2237 */
2238void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
2239
2240/**
2241 * @defgroup Concatenation Concatenation Functions
2242 *
2243 */
2244
2245/**
2246 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
2247 * This function should be called for each input tensor to concatenate. The argument offset_x
2248 * will be used to store the input tensor in the correct position in the output tensor
2249 *
2250 * i.e. offset_x = 0
2251 * for(i = 0 i < num_input_tensors; ++i)
2252 * {
2253 * arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
2254 * offset_x += input_x[i]
2255 * }
2256 *
2257 * This function assumes that the output tensor has:
2258 * -# The same height of the input tensor
2259 * -# The same number of channels of the input tensor
2260 * -# The same batch size of the input tensor
2261 *
2262 * Unless specified otherwise, arguments are mandatory.
2263 *
2264 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
2265 * does not involve any arithmetic operation
2266 *
2267 * @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor.
2268 * @param[in] input_x Width of input tensor
2269 * @param[in] input_y Height of input tensor
2270 * @param[in] input_z Channels in input tensor
2271 * @param[in] input_w Batch size in input tensor
2272 * @param[out] output Pointer to output tensor. Expected to be at least
2273 * (input_x * input_y * input_z * input_w) + offset_x
2274 * bytes.
2275 * @param[in] output_x Width of output tensor
2276 * @param[in] offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
2277 * It is user responsibility to provide the correct value
2278 *
2279 * <b> Input constraints</b>
2280 * offset_x is less than output_x
2281 *
2282 */
2283void arm_concatenation_s8_x(const int8_t *input,
2284 const uint16_t input_x,
2285 const uint16_t input_y,
2286 const uint16_t input_z,
2287 const uint16_t input_w,
2288 int8_t *output,
2289 const uint16_t output_x,
2290 const uint32_t offset_x);
2291
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
 *        This function should be called for each input tensor to concatenate. The argument offset_y
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_y = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
 *            offset_y += input_y[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          (input_z * input_w * input_x * input_y) + offset_y
 *                      bytes.
 * @param[in]  output_y Height of output tensor
 * @param[in]  offset_y The offset on the Y axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_y is less than output_y
 *
 */
void arm_concatenation_s8_y(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_y,
                            const uint32_t offset_y);
2338
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
 *        This function should be called for each input tensor to concatenate. The argument offset_z
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_z = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
 *            offset_z += input_z[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with output tensor.
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          (input_x * input_y * input_z * input_w) + offset_z
 *                      bytes.
 * @param[in]  output_z Channels in output tensor
 * @param[in]  offset_z The offset on the Z axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_z is less than output_z
 *
 */
void arm_concatenation_s8_z(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_z,
                            const uint32_t offset_z);
2385
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
 *        This function should be called for each input tensor to concatenate. The argument offset_w
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_w = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
 *            offset_w += input_w[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          input_x * input_y * input_z * input_w
 *                      bytes.
 * @param[in]  offset_w The offset on the W axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 */
void arm_concatenation_s8_w(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint32_t offset_w);
2427/**
2428 * @defgroup SVDF SVDF Layer Functions
2429 *
2430 */
2431
/**
 * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
 *
 * @param[in]   input_ctx             Temporary scratch buffer
 * @param[in]   output_ctx            Temporary output scratch buffer
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset  : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims            State tensor dimensions
 * @param[in]   state_data            Pointer to state tensor
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *
 */
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                       const cmsis_nn_context *output_ctx,
                       const cmsis_nn_svdf_params *svdf_params,
                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
                       const cmsis_nn_dims *input_dims,
                       const q7_t *input_data,
                       const cmsis_nn_dims *state_dims,
                       q7_t *state_data,
                       const cmsis_nn_dims *weights_feature_dims,
                       const q7_t *weights_feature_data,
                       const cmsis_nn_dims *weights_time_dims,
                       const q7_t *weights_time_data,
                       const cmsis_nn_dims *bias_dims,
                       const q31_t *bias_data,
                       const cmsis_nn_dims *output_dims,
                       q7_t *output_data);
2479
/**
 * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
 *
 * @param[in]   input_ctx             Temporary scratch buffer
 * @param[in]   output_ctx            Temporary output scratch buffer
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset  : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims            State tensor dimensions
 * @param[in]   state_data            Pointer to state tensor
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *
 */
arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
                                 const cmsis_nn_context *output_ctx,
                                 const cmsis_nn_svdf_params *svdf_params,
                                 const cmsis_nn_per_tensor_quant_params *input_quant_params,
                                 const cmsis_nn_per_tensor_quant_params *output_quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input_data,
                                 const cmsis_nn_dims *state_dims,
                                 q15_t *state_data,
                                 const cmsis_nn_dims *weights_feature_dims,
                                 const q7_t *weights_feature_data,
                                 const cmsis_nn_dims *weights_time_dims,
                                 const q15_t *weights_time_data,
                                 const cmsis_nn_dims *bias_dims,
                                 const q31_t *bias_data,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output_data);
2527
2528#ifdef __cplusplus
2529}
2530#endif
2531
2532#endif
Note: See TracBrowser for help on using the repository browser.