Context Navigation

arm_nnsupportfunctions.h

Last change on this file was 42, checked in by f.jahn, 5 days ago

File size: 48.8 KB

Line
1	/*
2	* Copyright (C) 2010-2022 Arm Limited or its affiliates.
3	*
4	* SPDX-License-Identifier: Apache-2.0
5	*
6	* Licensed under the Apache License, Version 2.0 (the License); you may
7	* not use this file except in compliance with the License.
8	* You may obtain a copy of the License at
9	*
10	* www.apache.org/licenses/LICENSE-2.0
11	*
12	* Unless required by applicable law or agreed to in writing, software
13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	* See the License for the specific language governing permissions and
16	* limitations under the License.
17	*/
18
19	/* ----------------------------------------------------------------------
20	* Project: CMSIS NN Library
21	* Title: arm_nnsupportfunctions.h
22	* Description: Public header file of support functions for CMSIS NN Library
23	*
24	* $Date: 19. April 2022
25	* $Revision: V.7.0.1
26	*
27	* Target Processor: Cortex-M CPUs
28	* -------------------------------------------------------------------- */
29
30	#ifndef _ARM_NNSUPPORTFUNCTIONS_H_
31	#define _ARM_NNSUPPORTFUNCTIONS_H_
32
33	#include "arm_nn_math_types.h"
34	#include "arm_nn_types.h"
35
36	#include <stdbool.h>
37
38	#ifdef __cplusplus
39	extern "C" {
40	#endif
41
/*
 * Generic helper macros.
 *
 * Every argument and every expansion is fully parenthesized so the macros can
 * be used inside larger expressions. Arguments may be evaluated more than
 * once, so do not pass expressions with side effects.
 */

/* Split a signed shift amount into its non-negative left / right parts. */
#define LEFT_SHIFT(_shift) ((_shift) > 0 ? (_shift) : 0)
#define RIGHT_SHIFT(_shift) ((_shift) > 0 ? 0 : -(_shift))

/* All-ones (~0) or all-zeros mask depending on whether x is zero. */
#define MASK_IF_ZERO(x) ((x) == 0 ? ~0 : 0)
#define MASK_IF_NON_ZERO(x) ((x) != 0 ? ~0 : 0)

/* Bitwise select: bits of a where mask is set, bits of b where it is clear. */
#define SELECT_USING_MASK(mask, a, b) (((mask) & (a)) ^ (~(mask) & (b)))

#define MAX(A, B) ((A) > (B) ? (A) : (B))
#define MIN(A, B) ((A) < (B) ? (A) : (B))
/* Clamp x to the closed range [l, h]; note the argument order (x, high, low). */
#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))

/* Round a 32 bit multiplier to its upper 16 bits; values of 0x7FFF0000 and above yield 0x7FFF. */
#define REDUCE_MULTIPLIER(_mult) (((_mult) < 0x7FFF0000) ? (((_mult) + (1 << 15)) >> 16) : 0x7FFF)
52
/**
 * @brief Pack four 8 bit values into one 32 bit word.
 *
 * v0 lands in bits [7:0], v1 in bits [15:8], v2 in bits [23:16] and v3 in
 * bits [31:24]. Shifting and masking are done on uint32_t so that negative
 * inputs do not trigger a left shift of a negative signed value; the result
 * is converted back to int32_t.
 */
#define PACK_Q7x4_32x1(v0, v1, v2, v3)                                                                                \
    ((int32_t)((((uint32_t)(v0) << 0) & 0x000000FFu) | (((uint32_t)(v1) << 8) & 0x0000FF00u) |                        \
               (((uint32_t)(v2) << 16) & 0x00FF0000u) | (((uint32_t)(v3) << 24) & 0xFF000000u)))
59
60	/**
61	* @brief Union for SIMD access of q31/q15/q7 types
62	*/
63	union arm_nnword
64	{
65	q31_t word;
66	/*< q31 type /
67	q15_t half_words[2];
68	/*< q15 type /
69	q7_t bytes[4];
70	/*< q7 type /
71	};
72
/**
 * @brief Two 32 bit halves of a 64 bit value.
 *
 * Used as the word view of union arm_nn_long_long.
 * NOTE(review): which half aliases the low-order bits of the 64 bit value
 * depends on target byte order; the low-then-high member order presumably
 * targets little-endian layouts - confirm before use on big-endian targets.
 */
struct arm_nn_double
{
    uint32_t low;
    int32_t high;
};

/**
 * @brief Union for data type long long
 *
 * Overlays an int64_t with its two 32 bit halves.
 */
union arm_nn_long_long
{
    int64_t long_long;
    struct arm_nn_double word;
};
87
88	/**
89	* @defgroup nndata_convert Neural Network Data Conversion Functions
90	*
91	* Perform data type conversion in-between neural network operations
92	*
93	*/
94
95	/**
96	* @brief Converts the elements of the q7 vector to q15 vector without left-shift
97	* @param[in] *pSrc points to the q7 input vector
98	* @param[out] *pDst points to the q15 output vector
99	* @param[in] blockSize length of the input vector
100	*
101	*/
102	void arm_q7_to_q15_no_shift(const q7_t pSrc, q15_t pDst, uint32_t blockSize);
103
104	/**
105	* @brief Non-saturating addition of elements of a q7 vector
106	* @param[in] *input Pointer to the q7 input vector
107	* @param[out] *output Pointer to the q31 output variable.
108	* @param[in] block_size length of the input vector
109	* \par Description:
110	*
111	* 2^24 samples can be added without saturating the result.
112	*
113	* The equation used for the conversion process is:
114	*
115	* <pre>
116	* sum = input[0] + input[1] + .. + input[block_size -1]
117	* </pre>
118	*
119	* */
120	void arm_nn_add_q7(const q7_t input, q31_t output, uint32_t block_size);
121
122	/**
123	* @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift
124	* @param[in] *pSrc points to the q7 input vector
125	* @param[out] *pDst points to the q15 output vector
126	* @param[in] blockSize length of the input vector
127	* @return none.
128	*
129	*/
130	void arm_q7_to_q15_reordered_no_shift(const q7_t pSrc, q15_t pDst, uint32_t blockSize);
131
132	/**
133	* @brief Converts the elements from a q7 vector to a q15 vector with an added offset
134	* @param[in] src pointer to the q7 input vector
135	* @param[out] dst pointer to the q15 output vector
136	* @param[in] block_size length of the input vector
137	* @param[in] offset q7 offset to be added to each input vector element.
138	*
139	* \par Description:
140	*
141	* The equation used for the conversion process is:
142	*
143	* <pre>
144	* dst[n] = (q15_t) src[n] + offset; 0 <= n < block_size.
145	* </pre>
146	*
147	*/
148	void arm_q7_to_q15_with_offset(const q7_t src, q15_t dst, uint32_t block_size, q15_t offset);
149
150	/**
151	* @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset
152	* @param[in] src pointer to the q7 input vector
153	* @param[out] dst pointer to the q15 output vector
154	* @param[in] block_size length of the input vector
155	* @param[in] offset offset to be added to each input vector element.
156	* @return none.
157	*
158	* @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
159	* the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its
160	* original order.
161	*
162	*/
163	void arm_q7_to_q15_reordered_with_offset(const q7_t src, q15_t dst, uint32_t block_size, q15_t offset);
164
165	/**
166	* @brief Converts the elements from a q7 vector and accumulate to a q15 vector
167	* @param[in] *src points to the q7 input vector
168	* @param[out] *dst points to the q15 output vector
169	* @param[in] block_size length of the input vector
170	*
171	* \par Description:
172	*
173	* The equation used for the conversion process is:
174	*
175	* <pre>
176	* dst[n] += (q15_t) src[n] ; 0 <= n < block_size.
177	* </pre>
178	*
179	*/
180	void arm_nn_accumulate_q7_to_q15(q15_t dst, const q7_t src, uint32_t block_size);
181
182	/**
183	* @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
184	* @param[in] row pointer to row
185	* @param[in] col pointer to im2col buffer, always consists of 2 columns.
186	* @param[in] num_ch number of channels
187	* @param[in] out_shift pointer to per output channel requantization shift parameter.
188	* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
189	* @param[in] out_offset output tensor offset.
190	* @param[in] activation_min minimum value to clamp the output to. Range : int8
191	* @param[in] activation_max maximum value to clamp the output to. Range : int8
192	* @param[in] kernel_size number of elements in one column.
193	* @param[in] output_bias per output channel bias. Range : int32
194	* @param[out] out pointer to output
195	* @return The function returns one of the two
196	* 1. The incremented output pointer for a successful operation or
197	* 2. NULL if implementation is not available.
198	*
199	* @details Supported framework: TensorFlow Lite micro.
200	*/
201	q7_t arm_nn_depthwise_conv_s8_core(const q7_t row,
202	const q15_t *col,
203	const uint16_t num_ch,
204	const int32_t *out_shift,
205	const int32_t *out_mult,
206	const int32_t out_offset,
207	const int32_t activation_min,
208	const int32_t activation_max,
209	const uint16_t kernel_size,
210	const int32_t *const output_bias,
211	q7_t *out);
212
213	/**
214	* @brief General Matrix-multiplication function with per-channel requantization.
215	* @param[in] input_row pointer to row operand
216	* @param[in] input_col pointer to col operand
217	* @param[in] output_ch number of rows of input_row
218	* @param[in] col_batches number of column batches. Range: 1 to 4
219	* @param[in] output_shift pointer to per output channel requantization shift parameter.
220	* @param[in] output_mult pointer to per output channel requantization multiplier parameter.
221	* @param[in] out_offset output tensor offset.
222	* @param[in] col_offset input tensor(col) offset.
223	* @param[in] row_offset kernel offset(row). Not used.
224	* @param[in] out_activation_min minimum value to clamp the output to. Range : int8
225	* @param[in] out_activation_max maximum value to clamp the output to. Range : int8
226	* @param[in] row_len number of elements in each row
227	* @param[in] bias per output channel bias. Range : int32
228	* @param[in,out] out pointer to output
229	* @return The function returns one of the two
230	* 1. The incremented output pointer for a successful operation or
231	* 2. NULL if implementation is not available.
232	*
233	* @details Supported framework: TensorFlow Lite
234	*/
235	q7_t arm_nn_mat_mult_s8(const q7_t input_row,
236	const q7_t *input_col,
237	const uint16_t output_ch,
238	const uint16_t col_batches,
239	const int32_t *output_shift,
240	const int32_t *output_mult,
241	const int32_t out_offset,
242	const int32_t col_offset,
243	const int32_t row_offset,
244	const int16_t out_activation_min,
245	const int16_t out_activation_max,
246	const uint16_t row_len,
247	const int32_t *const bias,
248	q7_t *out);
249	/**
250	* @brief Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.
251	* @param[in] input_a pointer to operand A
252	* @param[in] input_b pointer to operand B, always consists of 2 vectors.
253	* @param[in] output_ch number of rows of A
254	* @param[in] out_shift pointer to per output channel requantization shift parameter.
255	* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
256	* @param[in] activation_min minimum value to clamp the output to. Range : int16
257	* @param[in] activation_max maximum value to clamp the output to. Range : int16
258	* @param[in] num_col_a number of columns of A
259	* @param[in] output_bias per output channel bias. Range : int64
260	* @param[in,out] out_0 pointer to output
261	* @return The function returns one of the two
262	* 1. The incremented output pointer for a successful operation or
263	* 2. NULL if implementation is not available.
264	*
265	* @details This function does the matrix multiplication of weight matrix for all output channels
266	* with 2 columns from im2col and produces two elements/output_channel. The outputs are
267	* clamped in the range provided by activation min and max.
268	* Supported framework: TensorFlow Lite micro.
269	*/
270	q15_t arm_nn_mat_mult_kernel_s16(const q7_t input_a,
271	const q15_t *input_b,
272	const int32_t output_ch,
273	const int32_t *out_shift,
274	const int32_t *out_mult,
275	const int16_t activation_min,
276	const int16_t activation_max,
277	const int32_t num_col_a,
278	const int64_t *const output_bias,
279	q15_t *out_0);
280	/**
281	* @brief General Matrix-multiplication without requantization for one row & one column
282	* @param[in] row_elements number of row elements
283	* @param[in] row_base pointer to row operand
284	* @param[in] col_base pointer to col operand
285	* @param[out] sum_col pointer to store sum of column elements
286	* @param[out] output pointer to store result of multiply-accumulate
287	* @return The function returns the multiply-accumulated result of the row by column.
288	*
289	* @details Pseudo-code
290	* *output = 0
291	* sum_col = 0
292	* for (i = 0; i < row_elements; i++)
293	* output += row_base[i] col_base[i]
294	* sum_col += col_base[i]
295	*
296	*/
297	arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
298	const int8_t *row_base,
299	const int8_t *col_base,
300	int32_t *const sum_col,
301	int32_t *const output);
302
303	/**
304	* @brief Matrix-multiplication with requantization & activation function for four rows and one column
305	* @param[in] row_elements number of row elements
306	* @param[in] offset offset between rows. Can be the same as row_elements.
307	* For e.g, in a 1x1 conv scenario with stride as 1.
308	* @param[in] row_base pointer to row operand
309	* @param[in] col_base pointer to col operand
310	* @param[in] out_ch Number of output channels
311	* @param[in] conv_params Pointer to convolution parameters like offsets and activation values
312	* @param[in] quant_params Pointer to per-channel quantization parameters
313	* @param[in] bias Pointer to per-channel bias
314	* @param[out] output Pointer to output where int8 results are stored.
315	*
316	* @return The function returns the updated output pointer or NULL if implementation is not available.
317	*
318	* @details Compliant to TFLM int8 specification. MVE implementation only
319	*/
320	int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
321	const int32_t offset,
322	const int8_t *row_base,
323	const int8_t *col_base,
324	const int32_t out_ch,
325	const cmsis_nn_conv_params *conv_params,
326	const cmsis_nn_per_channel_quant_params *quant_params,
327	const int32_t *bias,
328	int8_t *output);
329
330	/**
331	* @brief General Matrix-multiplication function with per-channel requantization.
332	* This function assumes:
333	* - LHS input matrix NOT transposed (nt)
334	* - RHS input matrix transposed (t)
335	*
336	* @note This operation also performs the broadcast bias addition before the requantization
337	*
338	* @param[in] lhs Pointer to the LHS input matrix
339	* @param[in] rhs Pointer to the RHS input matrix
340	* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
341	* output columns (or RHS input rows)
342	* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
343	* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
344	* The length of this vector is equal to the number of output columns (or RHS input
345	* rows)
346	* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
347	* of this vector is equal to the number of output columns (or RHS input rows)
348	* @param[in] lhs_rows Number of LHS input rows
349	* @param[in] rhs_rows Number of RHS input rows
350	* @param[in] rhs_cols Number of LHS/RHS input columns
351	* @param[in] lhs_offset Offset to be applied to the LHS input value
352	* @param[in] dst_offset Offset to be applied the output result
353	* @param[in] activation_min Minimum value to clamp down the output. Range : int8
354	* @param[in] activation_max Maximum value to clamp up the output. Range : int8
355	*
356	* @return The function returns <code>ARM_MATH_SUCCESS</code>
357	*
358	*/
359	arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
360	const q7_t *rhs,
361	const q31_t *bias,
362	q7_t *dst,
363	const int32_t *dst_multipliers,
364	const int32_t *dst_shifts,
365	const int32_t lhs_rows,
366	const int32_t rhs_rows,
367	const int32_t rhs_cols,
368	const int32_t lhs_offset,
369	const int32_t dst_offset,
370	const int32_t activation_min,
371	const int32_t activation_max);
372
373	/**
374	* @brief s8 Vector by Matrix (transposed) multiplication
375	*
376	* @param[in] lhs Input left-hand side vector
377	* @param[in] rhs Input right-hand side matrix (transposed)
378	* @param[in] bias Input bias
379	* @param[out] dst Output vector
380	* @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector.
381	* Range: -127 to 128
382	* @param[in] rhs_offset Not used
383	* @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128
384	* @param[in] dst_multiplier Output multiplier
385	* @param[in] dst_shift Output shift
386	* @param[in] rhs_cols Number of columns in the right-hand side input matrix
387	* @param[in] rhs_rows Number of rows in the right-hand side input matrix
388	* @param[in] activation_min Minimum value to clamp the output to. Range: int8
389	* @param[in] activation_max Maximum value to clamp the output to. Range: int8
390	* @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
391	* second at 'dst + address_offset' and so on. Default value is typically 1.
392	*
393	* @return The function returns <code>ARM_MATH_SUCCESS</code>
394	*
395	*/
396	arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
397	const q7_t *rhs,
398	const q31_t *bias,
399	q7_t *dst,
400	const int32_t lhs_offset,
401	const int32_t rhs_offset,
402	const int32_t dst_offset,
403	const int32_t dst_multiplier,
404	const int32_t dst_shift,
405	const int32_t rhs_cols,
406	const int32_t rhs_rows,
407	const int32_t activation_min,
408	const int32_t activation_max,
409	const int32_t address_offset);
410
411	/**
412	* @brief s16 Vector by Matrix (transposed) multiplication
413	*
414	* @param[in] lhs Input left-hand side vector
415	* @param[in] rhs Input right-hand side matrix (transposed)
416	* @param[in] bias Input bias
417	* @param[out] dst Output vector
418	* @param[in] dst_multiplier Output multiplier
419	* @param[in] dst_shift Output shift
420	* @param[in] rhs_cols Number of columns in the right-hand side input matrix
421	* @param[in] rhs_rows Number of rows in the right-hand side input matrix
422	* @param[in] activation_min Minimum value to clamp the output to. Range: int16
423	* @param[in] activation_max Maximum value to clamp the output to. Range: int16
424	*
425	* @return The function returns <code>ARM_MATH_SUCCESS</code>
426	*
427	*/
428	arm_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs,
429	const q7_t *rhs,
430	const q63_t *bias,
431	q15_t *dst,
432	const int32_t dst_multiplier,
433	const int32_t dst_shift,
434	const int32_t rhs_cols,
435	const int32_t rhs_rows,
436	const int32_t activation_min,
437	const int32_t activation_max);
438
439	/**
440	* @brief s8 Vector by Matrix (transposed) multiplication with s16 output
441	*
442	* @param[in] lhs Input left-hand side vector
443	* @param[in] rhs Input right-hand side matrix (transposed)
444	* @param[out] dst Output vector
445	* @param[in] lhs_offset Offset to be added to the input values of the left-hand side
446	* vector. Range: -127 to 128
447	* @param[in] rhs_offset Not used
448	* @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the
449	* second at 'dst + scatter_offset' and so on.
450	* @param[in] dst_multiplier Output multiplier
451	* @param[in] dst_shift Output shift
452	* @param[in] rhs_cols Number of columns in the right-hand side input matrix
453	* @param[in] rhs_rows Number of rows in the right-hand side input matrix
454	* @param[in] activation_min Minimum value to clamp the output to. Range: int16
455	* @param[in] activation_max Maximum value to clamp the output to. Range: int16
456	*
457	* @return The function returns <code>ARM_MATH_SUCCESS</code>
458	*
459	*/
460	arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs,
461	const q7_t *rhs,
462	q15_t *dst,
463	const int32_t lhs_offset,
464	const int32_t rhs_offset,
465	const int32_t scatter_offset,
466	const int32_t dst_multiplier,
467	const int32_t dst_shift,
468	const int32_t rhs_cols,
469	const int32_t rhs_rows,
470	const int32_t activation_min,
471	const int32_t activation_max);
472
473	/**
474	* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where
475	* the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.
476	*
477	* @param[in] lhs Input left-hand side matrix
478	* @param[in] rhs Input right-hand side matrix (transposed)
479	* @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
480	* @param[in] num_ch Number of channels in LHS/RHS
481	* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels
482	* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels
483	* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
484	* @param[in] activation_min Minimum value to clamp the output to. Range: int8
485	* @param[in] activation_max Maximum value to clamp the output to. Range: int8
486	* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
487	* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels
488	* @param[in] out Output pointer
489	*
490	* @return The function returns one of the two
491	* - Updated output pointer if an implementation is available
492	* - NULL if no implementation is available.
493	*
494	* @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read
495	* out for the following.
496	* - Output shift
497	* - Output multiplier
498	* - Output bias
499	* - rhs
500	*/
501	q7_t arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t lhs,
502	const q7_t *rhs,
503	const int32_t lhs_offset,
504	const uint16_t num_ch,
505	const int32_t *out_shift,
506	const int32_t *out_mult,
507	const int32_t out_offset,
508	const int32_t activation_min,
509	const int32_t activation_max,
510	const uint16_t row_x_col,
511	const int32_t *const output_bias,
512	q7_t *out);
513
514	/**
515	* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
516	* Dimensions are the same for lhs and rhs.
517	*
518	* @param[in] lhs Input left-hand side matrix
519	* @param[in] rhs Input right-hand side matrix (transposed)
520	* @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
521	* @param[in] num_ch Number of channels in LHS/RHS
522	* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels.
523	* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels.
524	* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
525	* @param[in] activation_min Minimum value to clamp the output to. Range: int8
526	* @param[in] activation_max Maximum value to clamp the output to. Range: int8
527	* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
528	* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels.
529	* @param[in] out Output pointer
530	*
531	* @return The function returns one of the two
532	* - Updated output pointer if an implementation is available
533	* - NULL if no implementation is available.
534	*
535	* @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read
536	* out for the following.
537	* - Output shift
538	* - Output multiplier
539	* - Output bias
540	* - rhs
541	*/
542	q7_t arm_nn_depthwise_conv_nt_t_s8(const q7_t lhs,
543	const q7_t *rhs,
544	const int32_t lhs_offset,
545	const uint16_t num_ch,
546	const int32_t *out_shift,
547	const int32_t *out_mult,
548	const int32_t out_offset,
549	const int32_t activation_min,
550	const int32_t activation_max,
551	const uint16_t row_x_col,
552	const int32_t *const output_bias,
553	q7_t *out);
554
555	/**
556	*@brief Matrix-multiplication function for convolution with reordered columns
557	*@param[in] pA pointer to operand A
558	*@param[in] pInBuffer pointer to operand B, always conssists of 2 vectors
559	*@param[in] ch_im_out numRow of A
560	*@param[in] numCol_A numCol of A
561	*@param[in] bias_shift amount of left-shift for bias
562	*@param[in] out_shift amount of right-shift for output
563	*@param[in] bias the bias
564	*@param[in,out] pOut pointer to output
565	*@return The function returns the incremented output pointer
566	*
567	*@details This function assumes that data in pInBuffer are reordered
568	*/
569	q7_t arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t pA,
570	const q15_t *pInBuffer,
571	const uint16_t ch_im_out,
572	const uint16_t numCol_A,
573	const uint16_t bias_shift,
574	const uint16_t out_shift,
575	const q7_t *bias,
576	q7_t *pOut);
577
578	/**
579	@brief Read 2 q15 elements and post increment pointer.
580	@param[in] in_q15 Pointer to pointer that holds address of input.
581	@return q31 value
582	*/
583	__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
584	{
585	q31_t val;
586
587	memcpy(&val, *in_q15, 4);
588	*in_q15 += 2;
589
590	return (val);
591	}
592
593	/**
594	@brief Read 4 q7 from q7 pointer and post increment pointer.
595	@param[in] in_q7 Pointer to pointer that holds address of input.
596	@return q31 value
597	*/
598	__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
599	{
600	q31_t val;
601	memcpy(&val, *in_q7, 4);
602	*in_q7 += 4;
603
604	return (val);
605	}
606
607	/**
608	@brief Read 2 q15 from q15 pointer.
609	@param[in] in_q15 pointer to address of input.
610	@return q31 value
611	*/
612	__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
613	{
614	q31_t val;
615	memcpy(&val, in_q15, 4);
616
617	return (val);
618	}
619
620	/**
621	@brief Read 4 q7 values.
622	@param[in] in_q7 pointer to address of input.
623	@return q31 value
624	*/
625	__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
626	{
627	q31_t val;
628	memcpy(&val, in_q7, 4);
629
630	return (val);
631	}
632
633	/**
634	@brief Write four q7 to q7 pointer and increment pointer afterwards.
635	@param[in] in Double pointer to input value
636	@param[in] value Four bytes to copy
637	*/
638	__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value)
639	{
640	memcpy(*in, &value, 4);
641	*in += 4;
642	}
643
644	/**
645	* @brief memset optimized for MVE
646	* @param[in, out] dst Destination pointer
647	* @param[in] val Value to set
648	* @param[in] block_size Number of bytes to copy.
649	*
650	*/
651	__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size)
652	{
653	#if defined(ARM_MATH_MVEI)
654	__asm volatile(" vdup.8 q0, %[set_val] \n"
655	" wlstp.8 lr, %[cnt], 1f \n"
656	"2: \n"
657	" vstrb.8 q0, [%[in]], #16 \n"
658	" letp lr, 2b \n"
659	"1: \n"
660	: [ in ] "+r"(dst)
661	: [ cnt ] "r"(block_size), [ set_val ] "r"(val)
662	: "q0", "memory", "r14");
663	#else
664	memset(dst, val, block_size);
665	#endif
666	}
667
668	#if defined(ARM_MATH_DSP)
669
670	/**
671	* @brief read and expand one q7 word into two q15 words
672	*/
673
674	__STATIC_FORCEINLINE const q7_t read_and_pad(const q7_t source, q31_t out1, q31_t out2)
675	{
676	q31_t inA = arm_nn_read_q7x4_ia(&source);
677	q31_t inAbuf1 = __SXTB16_RORn((uint32_t)inA, 8);
678	q31_t inAbuf2 = __SXTB16(inA);
679
680	#ifndef ARM_MATH_BIG_ENDIAN
681	*out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
682	*out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
683	#else
684	*out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
685	*out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
686	#endif
687
688	return source;
689	}
690
691	/**
692	* @brief read and expand one q7 word into two q15 words with reordering
693	*/
694
695	__STATIC_FORCEINLINE const q7_t read_and_pad_reordered(const q7_t source, q31_t out1, q31_t out2)
696	{
697	q31_t inA = arm_nn_read_q7x4_ia(&source);
698	#ifndef ARM_MATH_BIG_ENDIAN
699	*out2 = __SXTB16(__ROR((uint32_t)inA, 8));
700	*out1 = __SXTB16(inA);
701	#else
702	*out1 = __SXTB16(__ROR((uint32_t)inA, 8));
703	*out2 = __SXTB16(inA);
704	#endif
705
706	return source;
707	}
708
709	/**
710	* @brief read and expand one q7 word into two q15 words with reordering and add an offset
711	*/
712	__STATIC_FORCEINLINE const q7_t *
713	read_and_pad_reordered_with_offset(const q7_t source, q31_t out1, q31_t *out2, q31_t offset)
714	{
715	q31_t inA = arm_nn_read_q7x4_ia(&source);
716
717	#ifndef ARM_MATH_BIG_ENDIAN
718	*out2 = __SXTB16(__ROR((uint32_t)inA, 8));
719	*out1 = __SXTB16(inA);
720	#else
721	*out1 = __SXTB16(__ROR((uint32_t)inA, 8));
722	*out2 = __SXTB16(inA);
723	#endif
724	out1 = __QADD16(out1, offset);
725	out2 = __QADD16(out2, offset);
726
727	return source;
728	}
729
730	#endif
731
732	/**
733	* @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
734	*
735	* Basic Math Functions for Neural Network Computation
736	*
737	*/
738
739	/**
740	* @brief q7 vector multiplication with variable output shifts
741	* @param[in] *pSrcA pointer to the first input vector
742	* @param[in] *pSrcB pointer to the second input vector
743	* @param[out] *pDst pointer to the output vector
744	* @param[in] out_shift amount of right-shift for output
745	* @param[in] blockSize number of samples in each vector
746	* @return none.
747	*
748	* <b>Scaling and Overflow Behavior:</b>
749	* \par
750	* The function uses saturating arithmetic.
751	* Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
752	*/
753
754	void arm_nn_mult_q15(q15_t pSrcA, q15_t pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize);
755
756	/**
757	* @brief q7 vector multiplication with variable output shifts
758	* @param[in] *pSrcA pointer to the first input vector
759	* @param[in] *pSrcB pointer to the second input vector
760	* @param[out] *pDst pointer to the output vector
761	* @param[in] out_shift amount of right-shift for output
762	* @param[in] blockSize number of samples in each vector
763	* @return none.
764	*
765	* <b>Scaling and Overflow Behavior:</b>
766	* \par
767	* The function uses saturating arithmetic.
768	* Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
769	*/
770
771	void arm_nn_mult_q7(q7_t pSrcA, q7_t pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);
772
773	/**
774	* @brief Matrix-multiplication function for convolution with per-channel requantization.
775	* @param[in] input_a pointer to operand A
776	* @param[in] input_b pointer to operand B, always consists of 2 vectors.
777	* @param[in] output_ch number of rows of A
778	* @param[in] out_shift pointer to per output channel requantization shift parameter.
779	* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
780	* @param[in] out_offset output tensor offset.
781	* @param[in] activation_min minimum value to clamp the output to. Range : int8
782	* @param[in] activation_max maximum value to clamp the output to. Range : int8
783	* @param[in] num_col_a number of columns of A
784	* @param[in] output_bias per output channel bias. Range : int32
785	* @param[in,out] out_0 pointer to output
786	* @return The function returns one of the two
787	* 1. The incremented output pointer for a successful operation or
788	* 2. NULL if implementation is not available.
789	*
790	* @details This function does the matrix multiplication of weight matrix for all output channels
791	* with 2 columns from im2col and produces two elements/output_channel. The outputs are
792	* clamped in the range provided by activation min and max.
793	* Supported framework: TensorFlow Lite micro.
794	*/
795	q7_t arm_nn_mat_mult_kernel_s8_s16(const q7_t input_a,
796	const q15_t *input_b,
797	const uint16_t output_ch,
798	const int32_t *out_shift,
799	const int32_t *out_mult,
800	const int32_t out_offset,
801	const int16_t activation_min,
802	const int16_t activation_max,
803	const uint16_t num_col_a,
804	const int32_t *const output_bias,
805	q7_t *out_0);
806
/**
 * @brief Common softmax function for s8 input and s8 or s16 output
 * @param[in]  input          Pointer to the input tensor
 * @param[in]  num_rows       Number of rows in the input tensor
 * @param[in]  row_size       Number of elements in each input row
 * @param[in]  mult           Input quantization multiplier
 * @param[in]  shift          Input quantization shift within the range [0, 31]
 * @param[in]  diff_min       Minimum difference with max in row. Used to check if
 *                            the quantized exponential operation can be performed
 * @param[in]  int16_output   Indicating s8 output if 0 else s16 output
 * @param[out] output         Pointer to the output tensor
 *
 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
 *
 */
void arm_nn_softmax_common_s8(const int8_t *input,
                              const int32_t num_rows,
                              const int32_t row_size,
                              const int32_t mult,
                              const int32_t shift,
                              const int32_t diff_min,
                              const bool int16_output,
                              void *output);
830
/**
 * @brief macro for adding rounding offset
 *
 * Expands to half of (1 << out_shift), or to 0 when ARM_NN_TRUNCATE is
 * defined. The argument is parenthesized so that expressions containing
 * operators of lower precedence than << are shifted as a whole.
 */
#ifndef ARM_NN_TRUNCATE
#define NN_ROUND(out_shift) ((0x1 << (out_shift)) >> 1)
#else
#define NN_ROUND(out_shift) 0
#endif
839
/*
 * Short aliases for the fixed-point helpers. They exist only to keep the
 * quantization code readable and within the line-length limit.
 */

/* Saturating doubling high multiply (scalar and MVE vector variants) */
#define MUL_SAT(a, b) arm_nn_doubling_high_mult(a, b)
#define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4(a, b)

/* Multiply / rounding divide by a power of two */
#define MUL_POW2(a, b) arm_nn_mult_by_power_of_two(a, b)
#define DIV_POW2(a, b) arm_nn_divide_by_power_of_two(a, b)
#define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve(a, b)

/* Softmax helpers */
#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values(x)
#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1(x)
850
851	/**
852	* @brief Saturating doubling high multiply. Result matches
853	* NEON instruction VQRDMULH.
854	* @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX}
855	* @param[in] m2 Multiplier. Range: {NN_Q31_MIN, NN_Q31_MAX}
856	* @return Result of multiplication.
857	*
858	*/
859	__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2)
860	{
861	q31_t result = 0;
862	// Rounding offset to add for a right shift of 31
863	q63_t mult = 1 << 30;
864
865	if ((m1 < 0) ^ (m2 < 0))
866	{
867	mult = 1 - mult;
868	}
869	// Gets resolved as a SMLAL instruction
870	mult = mult + (q63_t)m1 * m2;
871
872	// Utilize all of the upper 32 bits. This is the doubling step
873	// as well.
874	result = (int32_t)(mult / (1ll << 31));
875
876	if ((m1 == m2) && (m1 == (int32_t)NN_Q31_MIN))
877	{
878	result = NN_Q31_MAX;
879	}
880	return result;
881	}
882
883	/**
884	* @brief Doubling high multiply without saturation. This is intended
885	* for requantization where the scale is a positive integer
886	*
887	* @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX}
888	* @param[in] m2 Multiplier Range: {NN_Q31_MIN, NN_Q31_MAX}
889	* @return Result of multiplication.
890	* @note The result of this matches that of neon instruction
891	* VQRDMULH for m1 in range {NN_Q31_MIN, NN_Q31_MAX} and m2 in
892	* range {NN_Q31_MIN + 1, NN_Q31_MAX}. Saturation occurs when
893	* m1 equals m2 equals NN_Q31_MIN and that is not handled by
894	* this function.
895	*
896	*/
897	__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2)
898	{
899	q31_t result = 0;
900	union arm_nn_long_long mult;
901
902	// Rounding offset to add for a right shift of 31
903	mult.word.low = 1 << 30;
904	mult.word.high = 0;
905
906	// Gets resolved as a SMLAL instruction
907	mult.long_long = mult.long_long + (q63_t)m1 * m2;
908
909	// Utilize all of the upper 32 bits. This is the doubling step
910	// as well.
911	result = (int32_t)(mult.long_long >> 31);
912
913	return result;
914	}
915
916	/**
917	* @brief Rounding divide by power of two.
918	* @param[in] dividend - Dividend
919	* @param[in] exponent - Divisor = power(2, exponent)
920	* Range: [0, 31]
921	* @return Rounded result of division. Midpoint is rounded away from zero.
922	*
923	*/
924	__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
925	{
926	q31_t result = 0;
927	const q31_t remainder_mask = (1 << exponent) - 1;
928	int32_t remainder = remainder_mask & dividend;
929
930	// Basic division
931	result = dividend >> exponent;
932
933	// Adjust 'result' for rounding (mid point away from zero)
934	q31_t threshold = remainder_mask >> 1;
935	if (result < 0)
936	{
937	threshold++;
938	}
939	if (remainder > threshold)
940	{
941	result++;
942	}
943
944	return result;
945	}
946
947	/**
948	* @brief Requantize a given value.
949	* @param[in] val Value to be requantized
950	* @param[in] multiplier multiplier. Range {NN_Q31_MIN + 1, Q32_MAX}
951	* @param[in] shift left or right shift for 'val * multiplier'
952	*
953	* @return Returns (val * multiplier)/(2 ^ shift)
954	*
955	*/
956	__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
957	{
958	#ifdef CMSIS_NN_USE_SINGLE_ROUNDING
959	const int64_t total_shift = 31 - shift;
960	const int64_t new_val = val * (int64_t)multiplier;
961
962	int32_t result = new_val >> (total_shift - 1);
963	result = (result + 1) >> 1;
964
965	return result;
966	#else
967	return arm_nn_divide_by_power_of_two(arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
968	RIGHT_SHIFT(shift));
969	#endif
970	}
971
972	/**
973	* @brief Requantize a given 64 bit value.
974	* @param[in] val Value to be requantized in the range {-(1<<47)} to {(1<<47) - 1}
975	* @param[in] reduced_multiplier Reduced multiplier in the range {NN_Q31_MIN + 1, Q32_MAX} to {Q16_MIN + 1,
976	* Q16_MAX}
977	* @param[in] shift Left or right shift for 'val * multiplier' in the range {-31} to {7}
978	*
979	* @return Returns (val * multiplier)/(2 ^ shift)
980	*
981	*/
982	__STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t reduced_multiplier, const q31_t shift)
983	{
984	const q63_t new_val = val * reduced_multiplier;
985
986	q31_t result = new_val >> (14 - shift); // 64->32 bit reduction
987	result = (result + 1) >> 1; // Last shift position and insert round
988
989	return result;
990	}
991
992	/**
993	* @brief memcpy optimized for MVE
994	* @param[in, out] dst Destination pointer
995	* @param[in] src Source pointer.
996	* @param[in] block_size Number of bytes to copy.
997	*
998	*/
999	__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t __RESTRICT dst, const q7_t __RESTRICT src, uint32_t block_size)
1000	{
1001	#if defined(ARM_MATH_MVEI)
1002	__asm volatile(" wlstp.8 lr, %[cnt], 1f \n"
1003	"2: \n"
1004	" vldrb.8 q0, [%[in]], #16 \n"
1005	" vstrb.8 q0, [%[out]], #16 \n"
1006	" letp lr, 2b \n"
1007	"1: \n"
1008	: [ in ] "+r"(src), [ out ] "+r"(dst)
1009	: [ cnt ] "r"(block_size)
1010	: "q0", "memory", "r14");
1011	#else
1012	memcpy(dst, src, block_size);
1013	#endif
1014	}
1015
1016	#if defined(ARM_MATH_MVEI)
1017	/**
1018	* @brief Vector saturating doubling high multiply returning high half.
1019	* @param[in] m1 Multiplicand
1020	* @param[in] m2 Multiplier
1021	* @return Result of multiplication.
1022	*
1023	*/
1024	__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2)
1025	{
1026	return vqrdmulhq_n_s32(m1, m2);
1027	}
1028
1029	/**
1030	* @brief Vector rounding divide by power of two.
1031	* @param[in] dividend - Dividend vector
1032	* @param[in] exponent - Divisor = power(2, exponent)
1033	* Range: [0, 31]
1034	* @return Rounded result of division. Midpoint is rounded away from zero.
1035	*
1036	*/
1037	__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent)
1038	{
1039	const int32x4_t shift = vdupq_n_s32(-exponent);
1040	const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
1041	const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
1042	return vrshlq_s32(fixed_up_dividend, shift);
1043	}
1044
1045	/**
1046	* @brief Requantize a given vector.
1047	* @param[in] val Vector to be requantized
1048	* @param[in] multiplier multiplier
1049	* @param[in] shift shift
1050	*
1051	* @return Returns (val * multiplier)/(2 ^ shift)
1052	*
1053	*/
1054	__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift)
1055	{
1056	#ifdef CMSIS_NN_USE_SINGLE_ROUNDING
1057	const int right_shift = MIN(-1, shift);
1058	const int left_shift = shift - right_shift;
1059
1060	const int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
1061	const int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
1062
1063	int32x4_t result = vqdmulhq_n_s32(vshlq_s32(val, left_shift_dup), multiplier);
1064	result = vrshlq_s32(result, right_shift_dup);
1065
1066	return result;
1067	#else
1068	return arm_divide_by_power_of_two_mve(
1069	arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift));
1070	#endif
1071	}
1072
1073	__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2)
1074	{
1075	return vqrdmulhq_s32(m1, m2);
1076	}
1077
1078	__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent)
1079	{
1080	const int32x4_t shift = -exponent;
1081	const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
1082	const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
1083	return vrshlq_s32(fixed_up_dividend, shift);
1084	}
1085
1086	__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val,
1087	const int32x4_t multiplier,
1088	const int32x4_t shift)
1089	{
1090	#ifdef CMSIS_NN_USE_SINGLE_ROUNDING
1091	const int32x4_t right_shift = vminq_s32(vdupq_n_s32(-1), shift);
1092	const int32x4_t left_shift = vqsubq_s32(shift, right_shift);
1093
1094	int32x4_t result = vqdmulhq_s32(vshlq_s32(val, left_shift), multiplier);
1095	result = vrshlq_s32(result, right_shift);
1096
1097	return result;
1098	#else
1099	const int32x4_t zz = vdupq_n_s32(0);
1100	const mve_pred16_t p = vcmpgtq_n_s32(shift, 0);
1101
1102	const int32x4_t left_shift = vpselq_s32(shift, zz, p);
1103	const int32x4_t right_shift = -vpselq_s32(zz, shift, p);
1104
1105	return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier),
1106	right_shift);
1107	#endif
1108	}
1109	#endif
1110
1111	// @note The following functions are used only for softmax layer, scaled bits = 5 assumed
1112
1113	__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val)
1114	{
1115	int32_t mask = 0;
1116	int32_t shift = 24;
1117
1118	const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift);
1119	const int32_t remainder = val_mod_minus_quarter - val;
1120	const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
1121	const int32_t x2 = MUL_SAT(x, x);
1122
1123	int32_t result = 1895147668 +
1124	MUL_SAT(1895147668, x + DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1));
1125
1126	#define SELECT_IF_NON_ZERO(x) \
1127	{ \
1128	mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
1129	result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \
1130	}
1131
1132	SELECT_IF_NON_ZERO(1672461947)
1133	SELECT_IF_NON_ZERO(1302514674)
1134	SELECT_IF_NON_ZERO(790015084)
1135	SELECT_IF_NON_ZERO(290630308)
1136	SELECT_IF_NON_ZERO(39332535)
1137	SELECT_IF_NON_ZERO(720401)
1138	SELECT_IF_NON_ZERO(242)
1139
1140	#undef SELECT_IF_NON_ZERO
1141
1142	mask = MASK_IF_ZERO(val);
1143	return SELECT_USING_MASK(mask, NN_Q31_MAX, result);
1144	}
1145
1146	__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp)
1147	{
1148	const int32_t thresh = ((1 << (31 - exp)) - 1);
1149	int32_t result = val << exp;
1150	result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), NN_Q31_MAX, result);
1151	result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), NN_Q31_MIN, result);
1152	return result;
1153	}
1154
1155	__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)
1156	{
1157	const int64_t sum = (int64_t)val + (int64_t)NN_Q31_MAX;
1158	const int32_t half_denominator = (int32_t)((sum + (sum >= 0 ? 1 : -1)) / 2L);
1159	int32_t x = 1515870810 + MUL_SAT(half_denominator, -1010580540);
1160
1161	const int32_t shift = (1 << 29);
1162	x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
1163	x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
1164	x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
1165
1166	return MUL_POW2(x, 1);
1167	}
1168
1169	/**
1170	@brief Write 2 q15 elements and post increment pointer.
1171	@param[in] dest_q15 Pointer to pointer that holds address of destination.
1172	@param[in] src_q31 Input value to be written.
1173	*/
1174	__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31)
1175	{
1176	q31_t val = src_q31;
1177
1178	memcpy(*dest_q15, &val, 4);
1179	*dest_q15 += 2;
1180	}
1181
1182	#ifdef __cplusplus
1183	}
1184	#endif
1185
1186	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/firmware_v4/Drivers/CMSIS/NN/Include/arm_nnsupportfunctions.h

Download in other formats: