| 1 | /*
|
|---|
| 2 | * Copyright (C) 2010-2022 Arm Limited or its affiliates.
|
|---|
| 3 | *
|
|---|
| 4 | * SPDX-License-Identifier: Apache-2.0
|
|---|
| 5 | *
|
|---|
| 6 | * Licensed under the Apache License, Version 2.0 (the License); you may
|
|---|
| 7 | * not use this file except in compliance with the License.
|
|---|
| 8 | * You may obtain a copy of the License at
|
|---|
| 9 | *
|
|---|
| 10 | * www.apache.org/licenses/LICENSE-2.0
|
|---|
| 11 | *
|
|---|
| 12 | * Unless required by applicable law or agreed to in writing, software
|
|---|
| 13 | * distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|---|
| 14 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 15 | * See the License for the specific language governing permissions and
|
|---|
| 16 | * limitations under the License.
|
|---|
| 17 | */
|
|---|
| 18 |
|
|---|
| 19 | /* ----------------------------------------------------------------------
|
|---|
| 20 | * Project: CMSIS NN Library
|
|---|
| 21 | * Title: arm_nnsupportfunctions.h
|
|---|
| 22 | * Description: Public header file of support functions for CMSIS NN Library
|
|---|
| 23 | *
|
|---|
| 24 | * $Date: 19. April 2022
|
|---|
| 25 | * $Revision: V.7.0.1
|
|---|
| 26 | *
|
|---|
| 27 | * Target Processor: Cortex-M CPUs
|
|---|
| 28 | * -------------------------------------------------------------------- */
|
|---|
| 29 |
|
|---|
| 30 | #ifndef _ARM_NNSUPPORTFUNCTIONS_H_
|
|---|
| 31 | #define _ARM_NNSUPPORTFUNCTIONS_H_
|
|---|
| 32 |
|
|---|
| 33 | #include "arm_nn_math_types.h"
|
|---|
| 34 | #include "arm_nn_types.h"
|
|---|
| 35 |
|
|---|
| 36 | #include <stdbool.h>
|
|---|
| 37 |
|
|---|
| 38 | #ifdef __cplusplus
|
|---|
| 39 | extern "C" {
|
|---|
| 40 | #endif
|
|---|
| 41 |
|
|---|
/*
 * Shift/mask helper macros.
 *
 * Every argument and every expansion is fully parenthesized so the macros can be used with
 * arbitrary expressions as arguments and inside larger expressions without precedence surprises.
 * Arguments may be evaluated more than once; do not pass expressions with side effects.
 */

/* Left-shift part of a signed shift amount: _shift when it is positive, otherwise 0. */
#define LEFT_SHIFT(_shift) ((_shift) > 0 ? (_shift) : 0)
/* Right-shift part of a signed shift amount: -_shift when _shift is zero or negative, otherwise 0. */
#define RIGHT_SHIFT(_shift) ((_shift) > 0 ? 0 : -(_shift))
/* ~0 (all bits set) when x equals zero, otherwise 0. */
#define MASK_IF_ZERO(x) ((x) == 0 ? ~0 : 0)
/* ~0 (all bits set) when x is non-zero, otherwise 0. */
#define MASK_IF_NON_ZERO(x) ((x) != 0 ? ~0 : 0)
/* Bitwise select: takes the bits of a where mask is set and the bits of b where mask is clear. */
#define SELECT_USING_MASK(mask, a, b) ((((mask) & (a)) ^ (~(mask) & (b))))
|
|---|
| 47 |
|
|---|
/*
 * Min/max/clamp helpers. Arguments may be evaluated more than once; do not pass expressions
 * with side effects.
 */
#define MAX(A, B) ((A) > (B) ? (A) : (B))
#define MIN(A, B) ((A) < (B) ? (A) : (B))
/* Limit x to the closed range [l, h]. Note the argument order: upper bound first, then lower bound. */
#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
/*
 * Reduce a multiplier to its upper 16 bits with rounding: adds 2^15 and shifts right by 16.
 * Values at or above 0x7FFF0000 yield 0x7FFF so the rounding addition cannot overflow.
 * The argument is parenthesized so that expressions such as (a | b) are reduced as a whole.
 */
#define REDUCE_MULTIPLIER(_mult) (((_mult) < 0x7FFF0000) ? (((_mult) + (1 << 15)) >> 16) : 0x7FFF)
|
|---|
| 52 |
|
|---|
/**
 * @brief Pack four 8-bit values into one 32-bit word.
 *
 * v0 occupies bits 0-7, v1 bits 8-15, v2 bits 16-23 and v3 bits 24-31. Only the low 8 bits of
 * each argument are used.
 *
 * The shifts and masks are carried out on uint32_t so that negative inputs are not left-shifted
 * as signed values (undefined behaviour in C). The result is converted back to int32_t, which is
 * the type the expression has always had.
 */
#define PACK_Q7x4_32x1(v0, v1, v2, v3)                                                                                 \
    ((int32_t)((((uint32_t)(v0) << 0) & 0x000000FFu) | (((uint32_t)(v1) << 8) & 0x0000FF00u) |                         \
               (((uint32_t)(v2) << 16) & 0x00FF0000u) | (((uint32_t)(v3) << 24) & 0xFF000000u)))
|
|---|
| 59 |
|
|---|
/**
 * @brief Union for SIMD access of q31/q15/q7 types
 *
 * Overlays a single q31 word with an array of two q15 elements and an array of four q7 elements,
 * so the same storage can be accessed at any of the three element widths.
 */
union arm_nnword
{
    q31_t word;
    /**< q31 type */
    q15_t half_words[2];
    /**< q15 type */
    q7_t bytes[4];
    /**< q7 type */
};
|
|---|
| 72 |
|
|---|
/**
 * @brief Two 32-bit halves of a 64-bit value: an unsigned low part and a signed high part.
 *
 * Used as the `word` member of union arm_nn_long_long.
 * NOTE(review): with `low` declared first, this layout presumably assumes a little-endian
 * target - confirm before using on another byte order.
 */
struct arm_nn_double
{
    uint32_t low;
    int32_t high;
};

/**
 * @brief Union for data type long long
 *
 * Overlays an int64_t with a struct arm_nn_double so the 64-bit value can also be accessed
 * as two 32-bit halves.
 */
union arm_nn_long_long
{
    int64_t long_long;
    struct arm_nn_double word;
};
|
|---|
| 87 |
|
|---|
| 88 | /**
|
|---|
| 89 | * @defgroup nndata_convert Neural Network Data Conversion Functions
|
|---|
| 90 | *
|
|---|
| 91 | * Perform data type conversion in-between neural network operations
|
|---|
| 92 | *
|
|---|
| 93 | */
|
|---|
| 94 |
|
|---|
| 95 | /**
|
|---|
| 96 | * @brief Converts the elements of the q7 vector to q15 vector without left-shift
|
|---|
| 97 | * @param[in] *pSrc points to the q7 input vector
|
|---|
| 98 | * @param[out] *pDst points to the q15 output vector
|
|---|
| 99 | * @param[in] blockSize length of the input vector
|
|---|
| 100 | *
|
|---|
| 101 | */
|
|---|
| 102 | void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
|
|---|
| 103 |
|
|---|
| 104 | /**
|
|---|
| 105 | * @brief Non-saturating addition of elements of a q7 vector
|
|---|
| 106 | * @param[in] *input Pointer to the q7 input vector
|
|---|
| 107 | * @param[out] *output Pointer to the q31 output variable.
|
|---|
| 108 | * @param[in] block_size length of the input vector
|
|---|
| 109 | * \par Description:
|
|---|
| 110 | *
|
|---|
| 111 | * 2^24 samples can be added without saturating the result.
|
|---|
| 112 | *
|
|---|
| 113 | * The equation used for the conversion process is:
|
|---|
| 114 | *
|
|---|
| 115 | * <pre>
|
|---|
| 116 | * sum = input[0] + input[1] + .. + input[block_size -1]
|
|---|
| 117 | * </pre>
|
|---|
| 118 | *
|
|---|
| 119 | * */
|
|---|
| 120 | void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);
|
|---|
| 121 |
|
|---|
| 122 | /**
|
|---|
| 123 | * @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift
|
|---|
| 124 | * @param[in] *pSrc points to the q7 input vector
|
|---|
| 125 | * @param[out] *pDst points to the q15 output vector
|
|---|
| 126 | * @param[in] blockSize length of the input vector
|
|---|
| 127 | * @return none.
|
|---|
| 128 | *
|
|---|
| 129 | */
|
|---|
| 130 | void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
|
|---|
| 131 |
|
|---|
| 132 | /**
|
|---|
| 133 | * @brief Converts the elements from a q7 vector to a q15 vector with an added offset
|
|---|
| 134 | * @param[in] src pointer to the q7 input vector
|
|---|
| 135 | * @param[out] dst pointer to the q15 output vector
|
|---|
| 136 | * @param[in] block_size length of the input vector
|
|---|
| 137 | * @param[in] offset q7 offset to be added to each input vector element.
|
|---|
| 138 | *
|
|---|
| 139 | * \par Description:
|
|---|
| 140 | *
|
|---|
| 141 | * The equation used for the conversion process is:
|
|---|
| 142 | *
|
|---|
| 143 | * <pre>
|
|---|
| 144 | * dst[n] = (q15_t) src[n] + offset; 0 <= n < block_size.
|
|---|
| 145 | * </pre>
|
|---|
| 146 | *
|
|---|
| 147 | */
|
|---|
| 148 | void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
|
|---|
| 149 |
|
|---|
| 150 | /**
|
|---|
| 151 | * @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset
|
|---|
| 152 | * @param[in] src pointer to the q7 input vector
|
|---|
| 153 | * @param[out] dst pointer to the q15 output vector
|
|---|
| 154 | * @param[in] block_size length of the input vector
|
|---|
| 155 | * @param[in] offset offset to be added to each input vector element.
|
|---|
| 156 | * @return none.
|
|---|
| 157 | *
|
|---|
| 158 | * @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
|
|---|
| 159 | * the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its
|
|---|
| 160 | * original order.
|
|---|
| 161 | *
|
|---|
| 162 | */
|
|---|
| 163 | void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
|
|---|
| 164 |
|
|---|
| 165 | /**
|
|---|
| 166 | * @brief Converts the elements of a q7 vector to q15 and accumulates them into a q15 vector
|
|---|
| 167 | * @param[in] *src points to the q7 input vector
|
|---|
| 168 | * @param[in,out] *dst points to the q15 vector that is accumulated into
|
|---|
| 169 | * @param[in] block_size length of the input vector
|
|---|
| 170 | *
|
|---|
| 171 | * \par Description:
|
|---|
| 172 | *
|
|---|
| 173 | * The equation used for the conversion process is:
|
|---|
| 174 | *
|
|---|
| 175 | * <pre>
|
|---|
| 176 | * dst[n] += (q15_t) src[n] ; 0 <= n < block_size.
|
|---|
| 177 | * </pre>
|
|---|
| 178 | *
|
|---|
| 179 | */
|
|---|
| 180 | void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
|
|---|
| 181 |
|
|---|
| 182 | /**
|
|---|
| 183 | * @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
|
|---|
| 184 | * @param[in] row pointer to row
|
|---|
| 185 | * @param[in] col pointer to im2col buffer, always consists of 2 columns.
|
|---|
| 186 | * @param[in] num_ch number of channels
|
|---|
| 187 | * @param[in] out_shift pointer to per output channel requantization shift parameter.
|
|---|
| 188 | * @param[in] out_mult pointer to per output channel requantization multiplier parameter.
|
|---|
| 189 | * @param[in] out_offset output tensor offset.
|
|---|
| 190 | * @param[in] activation_min minimum value to clamp the output to. Range : int8
|
|---|
| 191 | * @param[in] activation_max maximum value to clamp the output to. Range : int8
|
|---|
| 192 | * @param[in] kernel_size number of elements in one column.
|
|---|
| 193 | * @param[in] output_bias per output channel bias. Range : int32
|
|---|
| 194 | * @param[out] out pointer to output
|
|---|
| 195 | * @return The function returns one of the two
|
|---|
| 196 | * 1. The incremented output pointer for a successful operation or
|
|---|
| 197 | * 2. NULL if implementation is not available.
|
|---|
| 198 | *
|
|---|
| 199 | * @details Supported framework: TensorFlow Lite micro.
|
|---|
| 200 | */
|
|---|
| 201 | q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
|
|---|
| 202 | const q15_t *col,
|
|---|
| 203 | const uint16_t num_ch,
|
|---|
| 204 | const int32_t *out_shift,
|
|---|
| 205 | const int32_t *out_mult,
|
|---|
| 206 | const int32_t out_offset,
|
|---|
| 207 | const int32_t activation_min,
|
|---|
| 208 | const int32_t activation_max,
|
|---|
| 209 | const uint16_t kernel_size,
|
|---|
| 210 | const int32_t *const output_bias,
|
|---|
| 211 | q7_t *out);
|
|---|
| 212 |
|
|---|
| 213 | /**
|
|---|
| 214 | * @brief General Matrix-multiplication function with per-channel requantization.
|
|---|
| 215 | * @param[in] input_row pointer to row operand
|
|---|
| 216 | * @param[in] input_col pointer to col operand
|
|---|
| 217 | * @param[in] output_ch number of rows of input_row
|
|---|
| 218 | * @param[in] col_batches number of column batches. Range: 1 to 4
|
|---|
| 219 | * @param[in] output_shift pointer to per output channel requantization shift parameter.
|
|---|
| 220 | * @param[in] output_mult pointer to per output channel requantization multiplier parameter.
|
|---|
| 221 | * @param[in] out_offset output tensor offset.
|
|---|
| 222 | * @param[in] col_offset input tensor(col) offset.
|
|---|
| 223 | * @param[in] row_offset kernel offset(row). Not used.
|
|---|
| 224 | * @param[in] out_activation_min minimum value to clamp the output to. Range : int8
|
|---|
| 225 | * @param[in] out_activation_max maximum value to clamp the output to. Range : int8
|
|---|
| 226 | * @param[in] row_len number of elements in each row
|
|---|
| 227 | * @param[in] bias per output channel bias. Range : int32
|
|---|
| 228 | * @param[in,out] out pointer to output
|
|---|
| 229 | * @return The function returns one of the two
|
|---|
| 230 | * 1. The incremented output pointer for a successful operation or
|
|---|
| 231 | * 2. NULL if implementation is not available.
|
|---|
| 232 | *
|
|---|
| 233 | * @details Supported framework: TensorFlow Lite
|
|---|
| 234 | */
|
|---|
| 235 | q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
|
|---|
| 236 | const q7_t *input_col,
|
|---|
| 237 | const uint16_t output_ch,
|
|---|
| 238 | const uint16_t col_batches,
|
|---|
| 239 | const int32_t *output_shift,
|
|---|
| 240 | const int32_t *output_mult,
|
|---|
| 241 | const int32_t out_offset,
|
|---|
| 242 | const int32_t col_offset,
|
|---|
| 243 | const int32_t row_offset,
|
|---|
| 244 | const int16_t out_activation_min,
|
|---|
| 245 | const int16_t out_activation_max,
|
|---|
| 246 | const uint16_t row_len,
|
|---|
| 247 | const int32_t *const bias,
|
|---|
| 248 | q7_t *out);
|
|---|
| 249 | /**
|
|---|
| 250 | * @brief Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.
|
|---|
| 251 | * @param[in] input_a pointer to operand A
|
|---|
| 252 | * @param[in] input_b pointer to operand B, always consists of 2 vectors.
|
|---|
| 253 | * @param[in] output_ch number of rows of A
|
|---|
| 254 | * @param[in] out_shift pointer to per output channel requantization shift parameter.
|
|---|
| 255 | * @param[in] out_mult pointer to per output channel requantization multiplier parameter.
|
|---|
| 256 | * @param[in] activation_min minimum value to clamp the output to. Range : int16
|
|---|
| 257 | * @param[in] activation_max maximum value to clamp the output to. Range : int16
|
|---|
| 258 | * @param[in] num_col_a number of columns of A
|
|---|
| 259 | * @param[in] output_bias per output channel bias. Range : int64
|
|---|
| 260 | * @param[in,out] out_0 pointer to output
|
|---|
| 261 | * @return The function returns one of the two
|
|---|
| 262 | * 1. The incremented output pointer for a successful operation or
|
|---|
| 263 | * 2. NULL if implementation is not available.
|
|---|
| 264 | *
|
|---|
| 265 | * @details This function does the matrix multiplication of weight matrix for all output channels
|
|---|
| 266 | * with 2 columns from im2col and produces two elements/output_channel. The outputs are
|
|---|
| 267 | * clamped in the range provided by activation min and max.
|
|---|
| 268 | * Supported framework: TensorFlow Lite micro.
|
|---|
| 269 | */
|
|---|
| 270 | q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
|
|---|
| 271 | const q15_t *input_b,
|
|---|
| 272 | const int32_t output_ch,
|
|---|
| 273 | const int32_t *out_shift,
|
|---|
| 274 | const int32_t *out_mult,
|
|---|
| 275 | const int16_t activation_min,
|
|---|
| 276 | const int16_t activation_max,
|
|---|
| 277 | const int32_t num_col_a,
|
|---|
| 278 | const int64_t *const output_bias,
|
|---|
| 279 | q15_t *out_0);
|
|---|
| 280 | /**
|
|---|
| 281 | * @brief General Matrix-multiplication without requantization for one row & one column
|
|---|
| 282 | * @param[in] row_elements number of row elements
|
|---|
| 283 | * @param[in] row_base pointer to row operand
|
|---|
| 284 | * @param[in] col_base pointer to col operand
|
|---|
| 285 | * @param[out] sum_col pointer to store sum of column elements
|
|---|
| 286 | * @param[out] output pointer to store result of multiply-accumulate
|
|---|
| 287 | * @return The function returns an <code>arm_status</code> code; the multiply-accumulated result is stored in *output and the column sum in *sum_col.
|
|---|
| 288 | *
|
|---|
| 289 | * @details Pseudo-code
|
|---|
| 290 | * *output = 0
|
|---|
| 291 | * *sum_col = 0
|
|---|
| 292 | * for (i = 0; i < row_elements; i++)
|
|---|
| 293 | * *output += row_base[i] * col_base[i]
|
|---|
| 294 | * *sum_col += col_base[i]
|
|---|
| 295 | *
|
|---|
| 296 | */
|
|---|
| 297 | arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
|
|---|
| 298 | const int8_t *row_base,
|
|---|
| 299 | const int8_t *col_base,
|
|---|
| 300 | int32_t *const sum_col,
|
|---|
| 301 | int32_t *const output);
|
|---|
| 302 |
|
|---|
| 303 | /**
|
|---|
| 304 | * @brief Matrix-multiplication with requantization & activation function for four rows and one column
|
|---|
| 305 | * @param[in] row_elements number of row elements
|
|---|
| 306 | * @param[in] offset offset between rows. Can be the same as row_elements.
|
|---|
| 307 | * For e.g, in a 1x1 conv scenario with stride as 1.
|
|---|
| 308 | * @param[in] row_base pointer to row operand
|
|---|
| 309 | * @param[in] col_base pointer to col operand
|
|---|
| 310 | * @param[in] out_ch Number of output channels
|
|---|
| 311 | * @param[in] conv_params Pointer to convolution parameters like offsets and activation values
|
|---|
| 312 | * @param[in] quant_params Pointer to per-channel quantization parameters
|
|---|
| 313 | * @param[in] bias Pointer to per-channel bias
|
|---|
| 314 | * @param[out] output Pointer to output where int8 results are stored.
|
|---|
| 315 | *
|
|---|
| 316 | * @return The function returns the updated output pointer or NULL if implementation is not available.
|
|---|
| 317 | *
|
|---|
| 318 | * @details Compliant to TFLM int8 specification. MVE implementation only
|
|---|
| 319 | */
|
|---|
| 320 | int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
|
|---|
| 321 | const int32_t offset,
|
|---|
| 322 | const int8_t *row_base,
|
|---|
| 323 | const int8_t *col_base,
|
|---|
| 324 | const int32_t out_ch,
|
|---|
| 325 | const cmsis_nn_conv_params *conv_params,
|
|---|
| 326 | const cmsis_nn_per_channel_quant_params *quant_params,
|
|---|
| 327 | const int32_t *bias,
|
|---|
| 328 | int8_t *output);
|
|---|
| 329 |
|
|---|
| 330 | /**
|
|---|
| 331 | * @brief General Matrix-multiplication function with per-channel requantization.
|
|---|
| 332 | * This function assumes:
|
|---|
| 333 | * - LHS input matrix NOT transposed (nt)
|
|---|
| 334 | * - RHS input matrix transposed (t)
|
|---|
| 335 | *
|
|---|
| 336 | * @note This operation also performs the broadcast bias addition before the requantization
|
|---|
| 337 | *
|
|---|
| 338 | * @param[in] lhs Pointer to the LHS input matrix
|
|---|
| 339 | * @param[in] rhs Pointer to the RHS input matrix
|
|---|
| 340 | * @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
|
|---|
| 341 | * output columns (or RHS input rows)
|
|---|
| 342 | * @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
|
|---|
| 343 | * @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
|
|---|
| 344 | * The length of this vector is equal to the number of output columns (or RHS input
|
|---|
| 345 | * rows)
|
|---|
| 346 | * @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
|
|---|
| 347 | * of this vector is equal to the number of output columns (or RHS input rows)
|
|---|
| 348 | * @param[in] lhs_rows Number of LHS input rows
|
|---|
| 349 | * @param[in] rhs_rows Number of RHS input rows
|
|---|
| 350 | * @param[in] rhs_cols Number of LHS/RHS input columns
|
|---|
| 351 | * @param[in] lhs_offset Offset to be applied to the LHS input value
|
|---|
| 352 | * @param[in] dst_offset Offset to be applied to the output result
|
|---|
| 353 | * @param[in] activation_min Minimum value to clamp down the output. Range : int8
|
|---|
| 354 | * @param[in] activation_max Maximum value to clamp up the output. Range : int8
|
|---|
| 355 | *
|
|---|
| 356 | * @return The function returns <code>ARM_MATH_SUCCESS</code>
|
|---|
| 357 | *
|
|---|
| 358 | */
|
|---|
| 359 | arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
|
|---|
| 360 | const q7_t *rhs,
|
|---|
| 361 | const q31_t *bias,
|
|---|
| 362 | q7_t *dst,
|
|---|
| 363 | const int32_t *dst_multipliers,
|
|---|
| 364 | const int32_t *dst_shifts,
|
|---|
| 365 | const int32_t lhs_rows,
|
|---|
| 366 | const int32_t rhs_rows,
|
|---|
| 367 | const int32_t rhs_cols,
|
|---|
| 368 | const int32_t lhs_offset,
|
|---|
| 369 | const int32_t dst_offset,
|
|---|
| 370 | const int32_t activation_min,
|
|---|
| 371 | const int32_t activation_max);
|
|---|
| 372 |
|
|---|
| 373 | /**
|
|---|
| 374 | * @brief s8 Vector by Matrix (transposed) multiplication
|
|---|
| 375 | *
|
|---|
| 376 | * @param[in] lhs Input left-hand side vector
|
|---|
| 377 | * @param[in] rhs Input right-hand side matrix (transposed)
|
|---|
| 378 | * @param[in] bias Input bias
|
|---|
| 379 | * @param[out] dst Output vector
|
|---|
| 380 | * @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector.
|
|---|
| 381 | * Range: -127 to 128
|
|---|
| 382 | * @param[in] rhs_offset Not used
|
|---|
| 383 | * @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128
|
|---|
| 384 | * @param[in] dst_multiplier Output multiplier
|
|---|
| 385 | * @param[in] dst_shift Output shift
|
|---|
| 386 | * @param[in] rhs_cols Number of columns in the right-hand side input matrix
|
|---|
| 387 | * @param[in] rhs_rows Number of rows in the right-hand side input matrix
|
|---|
| 388 | * @param[in] activation_min Minimum value to clamp the output to. Range: int8
|
|---|
| 389 | * @param[in] activation_max Maximum value to clamp the output to. Range: int8
|
|---|
| 390 | * @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
|
|---|
| 391 | * second at 'dst + address_offset' and so on. Default value is typically 1.
|
|---|
| 392 | *
|
|---|
| 393 | * @return The function returns <code>ARM_MATH_SUCCESS</code>
|
|---|
| 394 | *
|
|---|
| 395 | */
|
|---|
| 396 | arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
|
|---|
| 397 | const q7_t *rhs,
|
|---|
| 398 | const q31_t *bias,
|
|---|
| 399 | q7_t *dst,
|
|---|
| 400 | const int32_t lhs_offset,
|
|---|
| 401 | const int32_t rhs_offset,
|
|---|
| 402 | const int32_t dst_offset,
|
|---|
| 403 | const int32_t dst_multiplier,
|
|---|
| 404 | const int32_t dst_shift,
|
|---|
| 405 | const int32_t rhs_cols,
|
|---|
| 406 | const int32_t rhs_rows,
|
|---|
| 407 | const int32_t activation_min,
|
|---|
| 408 | const int32_t activation_max,
|
|---|
| 409 | const int32_t address_offset);
|
|---|
| 410 |
|
|---|
| 411 | /**
|
|---|
| 412 | * @brief s16 Vector by Matrix (transposed) multiplication
|
|---|
| 413 | *
|
|---|
| 414 | * @param[in] lhs Input left-hand side vector
|
|---|
| 415 | * @param[in] rhs Input right-hand side matrix (transposed)
|
|---|
| 416 | * @param[in] bias Input bias
|
|---|
| 417 | * @param[out] dst Output vector
|
|---|
| 418 | * @param[in] dst_multiplier Output multiplier
|
|---|
| 419 | * @param[in] dst_shift Output shift
|
|---|
| 420 | * @param[in] rhs_cols Number of columns in the right-hand side input matrix
|
|---|
| 421 | * @param[in] rhs_rows Number of rows in the right-hand side input matrix
|
|---|
| 422 | * @param[in] activation_min Minimum value to clamp the output to. Range: int16
|
|---|
| 423 | * @param[in] activation_max Maximum value to clamp the output to. Range: int16
|
|---|
| 424 | *
|
|---|
| 425 | * @return The function returns <code>ARM_MATH_SUCCESS</code>
|
|---|
| 426 | *
|
|---|
| 427 | */
|
|---|
| 428 | arm_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs,
|
|---|
| 429 | const q7_t *rhs,
|
|---|
| 430 | const q63_t *bias,
|
|---|
| 431 | q15_t *dst,
|
|---|
| 432 | const int32_t dst_multiplier,
|
|---|
| 433 | const int32_t dst_shift,
|
|---|
| 434 | const int32_t rhs_cols,
|
|---|
| 435 | const int32_t rhs_rows,
|
|---|
| 436 | const int32_t activation_min,
|
|---|
| 437 | const int32_t activation_max);
|
|---|
| 438 |
|
|---|
| 439 | /**
|
|---|
| 440 | * @brief s8 Vector by Matrix (transposed) multiplication with s16 output
|
|---|
| 441 | *
|
|---|
| 442 | * @param[in] lhs Input left-hand side vector
|
|---|
| 443 | * @param[in] rhs Input right-hand side matrix (transposed)
|
|---|
| 444 | * @param[out] dst Output vector
|
|---|
| 445 | * @param[in] lhs_offset Offset to be added to the input values of the left-hand side
|
|---|
| 446 | * vector. Range: -127 to 128
|
|---|
| 447 | * @param[in] rhs_offset Not used
|
|---|
| 448 | * @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the
|
|---|
| 449 | * second at 'dst + scatter_offset' and so on.
|
|---|
| 450 | * @param[in] dst_multiplier Output multiplier
|
|---|
| 451 | * @param[in] dst_shift Output shift
|
|---|
| 452 | * @param[in] rhs_cols Number of columns in the right-hand side input matrix
|
|---|
| 453 | * @param[in] rhs_rows Number of rows in the right-hand side input matrix
|
|---|
| 454 | * @param[in] activation_min Minimum value to clamp the output to. Range: int16
|
|---|
| 455 | * @param[in] activation_max Maximum value to clamp the output to. Range: int16
|
|---|
| 456 | *
|
|---|
| 457 | * @return The function returns <code>ARM_MATH_SUCCESS</code>
|
|---|
| 458 | *
|
|---|
| 459 | */
|
|---|
| 460 | arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs,
|
|---|
| 461 | const q7_t *rhs,
|
|---|
| 462 | q15_t *dst,
|
|---|
| 463 | const int32_t lhs_offset,
|
|---|
| 464 | const int32_t rhs_offset,
|
|---|
| 465 | const int32_t scatter_offset,
|
|---|
| 466 | const int32_t dst_multiplier,
|
|---|
| 467 | const int32_t dst_shift,
|
|---|
| 468 | const int32_t rhs_cols,
|
|---|
| 469 | const int32_t rhs_rows,
|
|---|
| 470 | const int32_t activation_min,
|
|---|
| 471 | const int32_t activation_max);
|
|---|
| 472 |
|
|---|
| 473 | /**
|
|---|
| 474 | * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where
|
|---|
| 475 | * the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.
|
|---|
| 476 | *
|
|---|
| 477 | * @param[in] lhs Input left-hand side matrix
|
|---|
| 478 | * @param[in] rhs Input right-hand side matrix (transposed)
|
|---|
| 479 | * @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
|
|---|
| 480 | * @param[in] num_ch Number of channels in LHS/RHS
|
|---|
| 481 | * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels
|
|---|
| 482 | * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels
|
|---|
| 483 | * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
|
|---|
| 484 | * @param[in] activation_min Minimum value to clamp the output to. Range: int8
|
|---|
| 485 | * @param[in] activation_max Maximum value to clamp the output to. Range: int8
|
|---|
| 486 | * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
|
|---|
| 487 | * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels
|
|---|
| 488 | * @param[out] out Output pointer
|
|---|
| 489 | *
|
|---|
| 490 | * @return The function returns one of the two
|
|---|
| 491 | * - Updated output pointer if an implementation is available
|
|---|
| 492 | * - NULL if no implementation is available.
|
|---|
| 493 | *
|
|---|
| 494 | * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
|
|---|
| 495 | * out for the following.
|
|---|
| 496 | * - Output shift
|
|---|
| 497 | * - Output multiplier
|
|---|
| 498 | * - Output bias
|
|---|
| 499 | * - rhs
|
|---|
| 500 | */
|
|---|
| 501 | q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
|
|---|
| 502 | const q7_t *rhs,
|
|---|
| 503 | const int32_t lhs_offset,
|
|---|
| 504 | const uint16_t num_ch,
|
|---|
| 505 | const int32_t *out_shift,
|
|---|
| 506 | const int32_t *out_mult,
|
|---|
| 507 | const int32_t out_offset,
|
|---|
| 508 | const int32_t activation_min,
|
|---|
| 509 | const int32_t activation_max,
|
|---|
| 510 | const uint16_t row_x_col,
|
|---|
| 511 | const int32_t *const output_bias,
|
|---|
| 512 | q7_t *out);
|
|---|
| 513 |
|
|---|
| 514 | /**
|
|---|
| 515 | * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
|
|---|
| 516 | * Dimensions are the same for lhs and rhs.
|
|---|
| 517 | *
|
|---|
| 518 | * @param[in] lhs Input left-hand side matrix
|
|---|
| 519 | * @param[in] rhs Input right-hand side matrix (transposed)
|
|---|
| 520 | * @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
|
|---|
| 521 | * @param[in] num_ch Number of channels in LHS/RHS
|
|---|
| 522 | * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels.
|
|---|
| 523 | * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels.
|
|---|
| 524 | * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
|
|---|
| 525 | * @param[in] activation_min Minimum value to clamp the output to. Range: int8
|
|---|
| 526 | * @param[in] activation_max Maximum value to clamp the output to. Range: int8
|
|---|
| 527 | * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
|
|---|
| 528 | * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels.
|
|---|
| 529 | * @param[out] out Output pointer
|
|---|
| 530 | *
|
|---|
| 531 | * @return The function returns one of the two
|
|---|
| 532 | * - Updated output pointer if an implementation is available
|
|---|
| 533 | * - NULL if no implementation is available.
|
|---|
| 534 | *
|
|---|
| 535 | * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
|
|---|
| 536 | * out for the following.
|
|---|
| 537 | * - Output shift
|
|---|
| 538 | * - Output multiplier
|
|---|
| 539 | * - Output bias
|
|---|
| 540 | * - rhs
|
|---|
| 541 | */
|
|---|
| 542 | q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
|
|---|
| 543 | const q7_t *rhs,
|
|---|
| 544 | const int32_t lhs_offset,
|
|---|
| 545 | const uint16_t num_ch,
|
|---|
| 546 | const int32_t *out_shift,
|
|---|
| 547 | const int32_t *out_mult,
|
|---|
| 548 | const int32_t out_offset,
|
|---|
| 549 | const int32_t activation_min,
|
|---|
| 550 | const int32_t activation_max,
|
|---|
| 551 | const uint16_t row_x_col,
|
|---|
| 552 | const int32_t *const output_bias,
|
|---|
| 553 | q7_t *out);
|
|---|
| 554 |
|
|---|
| 555 | /**
|
|---|
| 556 | *@brief Matrix-multiplication function for convolution with reordered columns
|
|---|
| 557 | *@param[in] pA pointer to operand A
|
|---|
| 558 | *@param[in] pInBuffer pointer to operand B, always consists of 2 vectors
|
|---|
| 559 | *@param[in] ch_im_out numRow of A
|
|---|
| 560 | *@param[in] numCol_A numCol of A
|
|---|
| 561 | *@param[in] bias_shift amount of left-shift for bias
|
|---|
| 562 | *@param[in] out_shift amount of right-shift for output
|
|---|
| 563 | *@param[in] bias the bias
|
|---|
| 564 | *@param[in,out] pOut pointer to output
|
|---|
| 565 | *@return The function returns the incremented output pointer
|
|---|
| 566 | *
|
|---|
| 567 | *@details This function assumes that data in pInBuffer are reordered
|
|---|
| 568 | */
|
|---|
| 569 | q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
|
|---|
| 570 | const q15_t *pInBuffer,
|
|---|
| 571 | const uint16_t ch_im_out,
|
|---|
| 572 | const uint16_t numCol_A,
|
|---|
| 573 | const uint16_t bias_shift,
|
|---|
| 574 | const uint16_t out_shift,
|
|---|
| 575 | const q7_t *bias,
|
|---|
| 576 | q7_t *pOut);
|
|---|
| 577 |
|
|---|
| 578 | /**
|
|---|
| 579 | @brief Read 2 q15 elements and post increment pointer.
|
|---|
| 580 | @param[in] in_q15 Pointer to pointer that holds address of input.
|
|---|
| 581 | @return q31 value
|
|---|
| 582 | */
|
|---|
| 583 | __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
|
|---|
| 584 | {
|
|---|
| 585 | q31_t val;
|
|---|
| 586 |
|
|---|
| 587 | memcpy(&val, *in_q15, 4);
|
|---|
| 588 | *in_q15 += 2;
|
|---|
| 589 |
|
|---|
| 590 | return (val);
|
|---|
| 591 | }
|
|---|
| 592 |
|
|---|
| 593 | /**
|
|---|
| 594 | @brief Read 4 q7 from q7 pointer and post increment pointer.
|
|---|
| 595 | @param[in] in_q7 Pointer to pointer that holds address of input.
|
|---|
| 596 | @return q31 value
|
|---|
| 597 | */
|
|---|
| 598 | __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
|
|---|
| 599 | {
|
|---|
| 600 | q31_t val;
|
|---|
| 601 | memcpy(&val, *in_q7, 4);
|
|---|
| 602 | *in_q7 += 4;
|
|---|
| 603 |
|
|---|
| 604 | return (val);
|
|---|
| 605 | }
|
|---|
| 606 |
|
|---|
| 607 | /**
|
|---|
| 608 | @brief Read 2 q15 from q15 pointer.
|
|---|
| 609 | @param[in] in_q15 pointer to address of input.
|
|---|
| 610 | @return q31 value
|
|---|
| 611 | */
|
|---|
| 612 | __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
|
|---|
| 613 | {
|
|---|
| 614 | q31_t val;
|
|---|
| 615 | memcpy(&val, in_q15, 4);
|
|---|
| 616 |
|
|---|
| 617 | return (val);
|
|---|
| 618 | }
|
|---|
| 619 |
|
|---|
| 620 | /**
|
|---|
| 621 | @brief Read 4 q7 values.
|
|---|
| 622 | @param[in] in_q7 pointer to address of input.
|
|---|
| 623 | @return q31 value
|
|---|
| 624 | */
|
|---|
| 625 | __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
|
|---|
| 626 | {
|
|---|
| 627 | q31_t val;
|
|---|
| 628 | memcpy(&val, in_q7, 4);
|
|---|
| 629 |
|
|---|
| 630 | return (val);
|
|---|
| 631 | }
|
|---|
| 632 |
|
|---|
| 633 | /**
|
|---|
| 634 | @brief Write four q7 to q7 pointer and increment pointer afterwards.
|
|---|
| 635 | @param[in] in Double pointer to input value
|
|---|
| 636 | @param[in] value Four bytes to copy
|
|---|
| 637 | */
|
|---|
| 638 | __STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value)
|
|---|
| 639 | {
|
|---|
| 640 | memcpy(*in, &value, 4);
|
|---|
| 641 | *in += 4;
|
|---|
| 642 | }
|
|---|
| 643 |
|
|---|
| 644 | /**
|
|---|
| 645 | * @brief memset optimized for MVE
|
|---|
| 646 | * @param[in, out] dst Destination pointer
|
|---|
| 647 | * @param[in] val Value to set
|
|---|
| 648 | * @param[in] block_size Number of bytes to copy.
|
|---|
| 649 | *
|
|---|
| 650 | */
|
|---|
| 651 | __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size)
|
|---|
| 652 | {
|
|---|
| 653 | #if defined(ARM_MATH_MVEI)
|
|---|
| 654 | __asm volatile(" vdup.8 q0, %[set_val] \n"
|
|---|
| 655 | " wlstp.8 lr, %[cnt], 1f \n"
|
|---|
| 656 | "2: \n"
|
|---|
| 657 | " vstrb.8 q0, [%[in]], #16 \n"
|
|---|
| 658 | " letp lr, 2b \n"
|
|---|
| 659 | "1: \n"
|
|---|
| 660 | : [ in ] "+r"(dst)
|
|---|
| 661 | : [ cnt ] "r"(block_size), [ set_val ] "r"(val)
|
|---|
| 662 | : "q0", "memory", "r14");
|
|---|
| 663 | #else
|
|---|
| 664 | memset(dst, val, block_size);
|
|---|
| 665 | #endif
|
|---|
| 666 | }
|
|---|
| 667 |
|
|---|
| 668 | #if defined(ARM_MATH_DSP)
|
|---|
| 669 |
|
|---|
| 670 | /**
|
|---|
| 671 | * @brief read and expand one q7 word into two q15 words
|
|---|
| 672 | */
|
|---|
| 673 |
|
|---|
| 674 | __STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2)
|
|---|
| 675 | {
|
|---|
| 676 | q31_t inA = arm_nn_read_q7x4_ia(&source);
|
|---|
| 677 | q31_t inAbuf1 = __SXTB16_RORn((uint32_t)inA, 8);
|
|---|
| 678 | q31_t inAbuf2 = __SXTB16(inA);
|
|---|
| 679 |
|
|---|
| 680 | #ifndef ARM_MATH_BIG_ENDIAN
|
|---|
| 681 | *out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
|
|---|
| 682 | *out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
|
|---|
| 683 | #else
|
|---|
| 684 | *out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
|
|---|
| 685 | *out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
|
|---|
| 686 | #endif
|
|---|
| 687 |
|
|---|
| 688 | return source;
|
|---|
| 689 | }
|
|---|
| 690 |
|
|---|
| 691 | /**
|
|---|
| 692 | * @brief read and expand one q7 word into two q15 words with reordering
|
|---|
| 693 | */
|
|---|
| 694 |
|
|---|
| 695 | __STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2)
|
|---|
| 696 | {
|
|---|
| 697 | q31_t inA = arm_nn_read_q7x4_ia(&source);
|
|---|
| 698 | #ifndef ARM_MATH_BIG_ENDIAN
|
|---|
| 699 | *out2 = __SXTB16(__ROR((uint32_t)inA, 8));
|
|---|
| 700 | *out1 = __SXTB16(inA);
|
|---|
| 701 | #else
|
|---|
| 702 | *out1 = __SXTB16(__ROR((uint32_t)inA, 8));
|
|---|
| 703 | *out2 = __SXTB16(inA);
|
|---|
| 704 | #endif
|
|---|
| 705 |
|
|---|
| 706 | return source;
|
|---|
| 707 | }
|
|---|
| 708 |
|
|---|
| 709 | /**
|
|---|
| 710 | * @brief read and expand one q7 word into two q15 words with reordering and add an offset
|
|---|
| 711 | */
|
|---|
| 712 | __STATIC_FORCEINLINE const q7_t *
|
|---|
| 713 | read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset)
|
|---|
| 714 | {
|
|---|
| 715 | q31_t inA = arm_nn_read_q7x4_ia(&source);
|
|---|
| 716 |
|
|---|
| 717 | #ifndef ARM_MATH_BIG_ENDIAN
|
|---|
| 718 | *out2 = __SXTB16(__ROR((uint32_t)inA, 8));
|
|---|
| 719 | *out1 = __SXTB16(inA);
|
|---|
| 720 | #else
|
|---|
| 721 | *out1 = __SXTB16(__ROR((uint32_t)inA, 8));
|
|---|
| 722 | *out2 = __SXTB16(inA);
|
|---|
| 723 | #endif
|
|---|
| 724 | *out1 = __QADD16(*out1, offset);
|
|---|
| 725 | *out2 = __QADD16(*out2, offset);
|
|---|
| 726 |
|
|---|
| 727 | return source;
|
|---|
| 728 | }
|
|---|
| 729 |
|
|---|
| 730 | #endif
|
|---|
| 731 |
|
|---|
| 732 | /**
|
|---|
| 733 | * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
|
|---|
| 734 | *
|
|---|
| 735 | * Basic Math Functions for Neural Network Computation
|
|---|
| 736 | *
|
|---|
| 737 | */
|
|---|
| 738 |
|
|---|
| 739 | /**
|
|---|
| 740 | * @brief q7 vector multiplication with variable output shifts
|
|---|
| 741 | * @param[in] *pSrcA pointer to the first input vector
|
|---|
| 742 | * @param[in] *pSrcB pointer to the second input vector
|
|---|
| 743 | * @param[out] *pDst pointer to the output vector
|
|---|
| 744 | * @param[in] out_shift amount of right-shift for output
|
|---|
| 745 | * @param[in] blockSize number of samples in each vector
|
|---|
| 746 | * @return none.
|
|---|
| 747 | *
|
|---|
| 748 | * <b>Scaling and Overflow Behavior:</b>
|
|---|
| 749 | * \par
|
|---|
| 750 | * The function uses saturating arithmetic.
|
|---|
| 751 | * Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
|
|---|
| 752 | */
|
|---|
| 753 |
|
|---|
| 754 | void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize);
|
|---|
| 755 |
|
|---|
| 756 | /**
|
|---|
| 757 | * @brief q7 vector multiplication with variable output shifts
|
|---|
| 758 | * @param[in] *pSrcA pointer to the first input vector
|
|---|
| 759 | * @param[in] *pSrcB pointer to the second input vector
|
|---|
| 760 | * @param[out] *pDst pointer to the output vector
|
|---|
| 761 | * @param[in] out_shift amount of right-shift for output
|
|---|
| 762 | * @param[in] blockSize number of samples in each vector
|
|---|
| 763 | * @return none.
|
|---|
| 764 | *
|
|---|
| 765 | * <b>Scaling and Overflow Behavior:</b>
|
|---|
| 766 | * \par
|
|---|
| 767 | * The function uses saturating arithmetic.
|
|---|
| 768 | * Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
|
|---|
| 769 | */
|
|---|
| 770 |
|
|---|
| 771 | void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);
|
|---|
| 772 |
|
|---|
| 773 | /**
|
|---|
| 774 | * @brief Matrix-multiplication function for convolution with per-channel requantization.
|
|---|
| 775 | * @param[in] input_a pointer to operand A
|
|---|
| 776 | * @param[in] input_b pointer to operand B, always consists of 2 vectors.
|
|---|
| 777 | * @param[in] output_ch number of rows of A
|
|---|
| 778 | * @param[in] out_shift pointer to per output channel requantization shift parameter.
|
|---|
| 779 | * @param[in] out_mult pointer to per output channel requantization multiplier parameter.
|
|---|
| 780 | * @param[in] out_offset output tensor offset.
|
|---|
| 781 | * @param[in] activation_min minimum value to clamp the output to. Range : int8
|
|---|
| 782 | * @param[in] activation_max maximum value to clamp the output to. Range : int8
|
|---|
| 783 | * @param[in] num_col_a number of columns of A
|
|---|
| 784 | * @param[in] output_bias per output channel bias. Range : int32
|
|---|
| 785 | * @param[in,out] out_0 pointer to output
|
|---|
| 786 | * @return The function returns one of the two
|
|---|
| 787 | * 1. The incremented output pointer for a successful operation or
|
|---|
| 788 | * 2. NULL if implementation is not available.
|
|---|
| 789 | *
|
|---|
| 790 | * @details This function does the matrix multiplication of weight matrix for all output channels
|
|---|
| 791 | * with 2 columns from im2col and produces two elements/output_channel. The outputs are
|
|---|
| 792 | * clamped in the range provided by activation min and max.
|
|---|
| 793 | * Supported framework: TensorFlow Lite micro.
|
|---|
| 794 | */
|
|---|
| 795 | q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
|
|---|
| 796 | const q15_t *input_b,
|
|---|
| 797 | const uint16_t output_ch,
|
|---|
| 798 | const int32_t *out_shift,
|
|---|
| 799 | const int32_t *out_mult,
|
|---|
| 800 | const int32_t out_offset,
|
|---|
| 801 | const int16_t activation_min,
|
|---|
| 802 | const int16_t activation_max,
|
|---|
| 803 | const uint16_t num_col_a,
|
|---|
| 804 | const int32_t *const output_bias,
|
|---|
| 805 | q7_t *out_0);
|
|---|
| 806 |
|
|---|
/**
 * @brief Shared softmax implementation for s8 input producing either s8 or s16 output
 *
 * @param[in]  input         Pointer to the input tensor
 * @param[in]  num_rows      Number of rows in the input tensor
 * @param[in]  row_size      Number of elements in each input row
 * @param[in]  mult          Input quantization multiplier
 * @param[in]  shift         Input quantization shift within the range [0, 31]
 * @param[in]  diff_min      Minimum difference with max in row. Used to check if
 *                           the quantized exponential operation can be performed
 * @param[in]  int16_output  Selects the output type: s8 if 0, s16 otherwise
 * @param[out] output        Pointer to the output tensor
 *
 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
 */
void arm_nn_softmax_common_s8(const int8_t *input,
                              const int32_t num_rows,
                              const int32_t row_size,
                              const int32_t mult,
                              const int32_t shift,
                              const int32_t diff_min,
                              const bool int16_output,
                              void *output);
| 830 |
|
|---|
/**
 * @brief Rounding offset to add before a right shift by out_shift bits
 *
 * Expands to (1 << out_shift) >> 1, i.e. half of the divisor implied by the shift.
 * When ARM_NN_TRUNCATE is defined the macro expands to 0 and results are truncated instead.
 *
 * The argument is parenthesized so that an expression argument (e.g. one using |, & or ?:)
 * is shifted as a whole rather than being split by operator precedence.
 */
#ifndef ARM_NN_TRUNCATE
#define NN_ROUND(out_shift) ((0x1 << (out_shift)) >> 1)
#else
#define NN_ROUND(out_shift) 0
#endif
| 839 |
|
|---|
// Short aliases for the fixed-point helpers so that nested quantization expressions stay readable

// Doubling high multiply: scalar and MVE vector forms
#define MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b))
#define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b))

// Multiply by a power of two
#define MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b))

// Rounding divide by a power of two: scalar and MVE vector forms
#define DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b))
#define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b))

// Softmax helpers
#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x))
#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x))
| 850 |
|
|---|
| 851 | /**
|
|---|
| 852 | * @brief Saturating doubling high multiply. Result matches
|
|---|
| 853 | * NEON instruction VQRDMULH.
|
|---|
| 854 | * @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX}
|
|---|
| 855 | * @param[in] m2 Multiplier. Range: {NN_Q31_MIN, NN_Q31_MAX}
|
|---|
| 856 | * @return Result of multiplication.
|
|---|
| 857 | *
|
|---|
| 858 | */
|
|---|
| 859 | __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2)
|
|---|
| 860 | {
|
|---|
| 861 | q31_t result = 0;
|
|---|
| 862 | // Rounding offset to add for a right shift of 31
|
|---|
| 863 | q63_t mult = 1 << 30;
|
|---|
| 864 |
|
|---|
| 865 | if ((m1 < 0) ^ (m2 < 0))
|
|---|
| 866 | {
|
|---|
| 867 | mult = 1 - mult;
|
|---|
| 868 | }
|
|---|
| 869 | // Gets resolved as a SMLAL instruction
|
|---|
| 870 | mult = mult + (q63_t)m1 * m2;
|
|---|
| 871 |
|
|---|
| 872 | // Utilize all of the upper 32 bits. This is the doubling step
|
|---|
| 873 | // as well.
|
|---|
| 874 | result = (int32_t)(mult / (1ll << 31));
|
|---|
| 875 |
|
|---|
| 876 | if ((m1 == m2) && (m1 == (int32_t)NN_Q31_MIN))
|
|---|
| 877 | {
|
|---|
| 878 | result = NN_Q31_MAX;
|
|---|
| 879 | }
|
|---|
| 880 | return result;
|
|---|
| 881 | }
|
|---|
| 882 |
|
|---|
| 883 | /**
|
|---|
| 884 | * @brief Doubling high multiply without saturation. This is intended
|
|---|
| 885 | * for requantization where the scale is a positive integer
|
|---|
| 886 | *
|
|---|
| 887 | * @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX}
|
|---|
| 888 | * @param[in] m2 Multiplier Range: {NN_Q31_MIN, NN_Q31_MAX}
|
|---|
| 889 | * @return Result of multiplication.
|
|---|
| 890 | * @note The result of this matches that of neon instruction
|
|---|
| 891 | * VQRDMULH for m1 in range {NN_Q31_MIN, NN_Q31_MAX} and m2 in
|
|---|
| 892 | * range {NN_Q31_MIN + 1, NN_Q31_MAX}. Saturation occurs when
|
|---|
| 893 | * m1 equals m2 equals NN_Q31_MIN and that is not handled by
|
|---|
| 894 | * this function.
|
|---|
| 895 | *
|
|---|
| 896 | */
|
|---|
| 897 | __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2)
|
|---|
| 898 | {
|
|---|
| 899 | q31_t result = 0;
|
|---|
| 900 | union arm_nn_long_long mult;
|
|---|
| 901 |
|
|---|
| 902 | // Rounding offset to add for a right shift of 31
|
|---|
| 903 | mult.word.low = 1 << 30;
|
|---|
| 904 | mult.word.high = 0;
|
|---|
| 905 |
|
|---|
| 906 | // Gets resolved as a SMLAL instruction
|
|---|
| 907 | mult.long_long = mult.long_long + (q63_t)m1 * m2;
|
|---|
| 908 |
|
|---|
| 909 | // Utilize all of the upper 32 bits. This is the doubling step
|
|---|
| 910 | // as well.
|
|---|
| 911 | result = (int32_t)(mult.long_long >> 31);
|
|---|
| 912 |
|
|---|
| 913 | return result;
|
|---|
| 914 | }
|
|---|
| 915 |
|
|---|
| 916 | /**
|
|---|
| 917 | * @brief Rounding divide by power of two.
|
|---|
| 918 | * @param[in] dividend - Dividend
|
|---|
| 919 | * @param[in] exponent - Divisor = power(2, exponent)
|
|---|
| 920 | * Range: [0, 31]
|
|---|
| 921 | * @return Rounded result of division. Midpoint is rounded away from zero.
|
|---|
| 922 | *
|
|---|
| 923 | */
|
|---|
| 924 | __STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
|
|---|
| 925 | {
|
|---|
| 926 | q31_t result = 0;
|
|---|
| 927 | const q31_t remainder_mask = (1 << exponent) - 1;
|
|---|
| 928 | int32_t remainder = remainder_mask & dividend;
|
|---|
| 929 |
|
|---|
| 930 | // Basic division
|
|---|
| 931 | result = dividend >> exponent;
|
|---|
| 932 |
|
|---|
| 933 | // Adjust 'result' for rounding (mid point away from zero)
|
|---|
| 934 | q31_t threshold = remainder_mask >> 1;
|
|---|
| 935 | if (result < 0)
|
|---|
| 936 | {
|
|---|
| 937 | threshold++;
|
|---|
| 938 | }
|
|---|
| 939 | if (remainder > threshold)
|
|---|
| 940 | {
|
|---|
| 941 | result++;
|
|---|
| 942 | }
|
|---|
| 943 |
|
|---|
| 944 | return result;
|
|---|
| 945 | }
|
|---|
| 946 |
|
|---|
| 947 | /**
|
|---|
| 948 | * @brief Requantize a given value.
|
|---|
| 949 | * @param[in] val Value to be requantized
|
|---|
| 950 | * @param[in] multiplier multiplier. Range {NN_Q31_MIN + 1, Q32_MAX}
|
|---|
| 951 | * @param[in] shift left or right shift for 'val * multiplier'
|
|---|
| 952 | *
|
|---|
| 953 | * @return Returns (val * multiplier)/(2 ^ shift)
|
|---|
| 954 | *
|
|---|
| 955 | */
|
|---|
| 956 | __STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
|
|---|
| 957 | {
|
|---|
| 958 | #ifdef CMSIS_NN_USE_SINGLE_ROUNDING
|
|---|
| 959 | const int64_t total_shift = 31 - shift;
|
|---|
| 960 | const int64_t new_val = val * (int64_t)multiplier;
|
|---|
| 961 |
|
|---|
| 962 | int32_t result = new_val >> (total_shift - 1);
|
|---|
| 963 | result = (result + 1) >> 1;
|
|---|
| 964 |
|
|---|
| 965 | return result;
|
|---|
| 966 | #else
|
|---|
| 967 | return arm_nn_divide_by_power_of_two(arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
|
|---|
| 968 | RIGHT_SHIFT(shift));
|
|---|
| 969 | #endif
|
|---|
| 970 | }
|
|---|
| 971 |
|
|---|
| 972 | /**
|
|---|
| 973 | * @brief Requantize a given 64 bit value.
|
|---|
| 974 | * @param[in] val Value to be requantized in the range {-(1<<47)} to {(1<<47) - 1}
|
|---|
| 975 | * @param[in] reduced_multiplier Reduced multiplier in the range {NN_Q31_MIN + 1, Q32_MAX} to {Q16_MIN + 1,
|
|---|
| 976 | * Q16_MAX}
|
|---|
| 977 | * @param[in] shift Left or right shift for 'val * multiplier' in the range {-31} to {7}
|
|---|
| 978 | *
|
|---|
| 979 | * @return Returns (val * multiplier)/(2 ^ shift)
|
|---|
| 980 | *
|
|---|
| 981 | */
|
|---|
| 982 | __STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t reduced_multiplier, const q31_t shift)
|
|---|
| 983 | {
|
|---|
| 984 | const q63_t new_val = val * reduced_multiplier;
|
|---|
| 985 |
|
|---|
| 986 | q31_t result = new_val >> (14 - shift); // 64->32 bit reduction
|
|---|
| 987 | result = (result + 1) >> 1; // Last shift position and insert round
|
|---|
| 988 |
|
|---|
| 989 | return result;
|
|---|
| 990 | }
|
|---|
| 991 |
|
|---|
| 992 | /**
|
|---|
| 993 | * @brief memcpy optimized for MVE
|
|---|
| 994 | * @param[in, out] dst Destination pointer
|
|---|
| 995 | * @param[in] src Source pointer.
|
|---|
| 996 | * @param[in] block_size Number of bytes to copy.
|
|---|
| 997 | *
|
|---|
| 998 | */
|
|---|
| 999 | __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size)
|
|---|
| 1000 | {
|
|---|
| 1001 | #if defined(ARM_MATH_MVEI)
|
|---|
| 1002 | __asm volatile(" wlstp.8 lr, %[cnt], 1f \n"
|
|---|
| 1003 | "2: \n"
|
|---|
| 1004 | " vldrb.8 q0, [%[in]], #16 \n"
|
|---|
| 1005 | " vstrb.8 q0, [%[out]], #16 \n"
|
|---|
| 1006 | " letp lr, 2b \n"
|
|---|
| 1007 | "1: \n"
|
|---|
| 1008 | : [ in ] "+r"(src), [ out ] "+r"(dst)
|
|---|
| 1009 | : [ cnt ] "r"(block_size)
|
|---|
| 1010 | : "q0", "memory", "r14");
|
|---|
| 1011 | #else
|
|---|
| 1012 | memcpy(dst, src, block_size);
|
|---|
| 1013 | #endif
|
|---|
| 1014 | }
|
|---|
| 1015 |
|
|---|
| 1016 | #if defined(ARM_MATH_MVEI)
|
|---|
| 1017 | /**
|
|---|
| 1018 | * @brief Vector saturating doubling high multiply returning high half.
|
|---|
| 1019 | * @param[in] m1 Multiplicand
|
|---|
| 1020 | * @param[in] m2 Multiplier
|
|---|
| 1021 | * @return Result of multiplication.
|
|---|
| 1022 | *
|
|---|
| 1023 | */
|
|---|
| 1024 | __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2)
|
|---|
| 1025 | {
|
|---|
| 1026 | return vqrdmulhq_n_s32(m1, m2);
|
|---|
| 1027 | }
|
|---|
| 1028 |
|
|---|
| 1029 | /**
|
|---|
| 1030 | * @brief Vector rounding divide by power of two.
|
|---|
| 1031 | * @param[in] dividend - Dividend vector
|
|---|
| 1032 | * @param[in] exponent - Divisor = power(2, exponent)
|
|---|
| 1033 | * Range: [0, 31]
|
|---|
| 1034 | * @return Rounded result of division. Midpoint is rounded away from zero.
|
|---|
| 1035 | *
|
|---|
| 1036 | */
|
|---|
| 1037 | __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent)
|
|---|
| 1038 | {
|
|---|
| 1039 | const int32x4_t shift = vdupq_n_s32(-exponent);
|
|---|
| 1040 | const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
|
|---|
| 1041 | const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
|
|---|
| 1042 | return vrshlq_s32(fixed_up_dividend, shift);
|
|---|
| 1043 | }
|
|---|
| 1044 |
|
|---|
| 1045 | /**
|
|---|
| 1046 | * @brief Requantize a given vector.
|
|---|
| 1047 | * @param[in] val Vector to be requantized
|
|---|
| 1048 | * @param[in] multiplier multiplier
|
|---|
| 1049 | * @param[in] shift shift
|
|---|
| 1050 | *
|
|---|
| 1051 | * @return Returns (val * multiplier)/(2 ^ shift)
|
|---|
| 1052 | *
|
|---|
| 1053 | */
|
|---|
| 1054 | __STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift)
|
|---|
| 1055 | {
|
|---|
| 1056 | #ifdef CMSIS_NN_USE_SINGLE_ROUNDING
|
|---|
| 1057 | const int right_shift = MIN(-1, shift);
|
|---|
| 1058 | const int left_shift = shift - right_shift;
|
|---|
| 1059 |
|
|---|
| 1060 | const int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
|
|---|
| 1061 | const int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
|
|---|
| 1062 |
|
|---|
| 1063 | int32x4_t result = vqdmulhq_n_s32(vshlq_s32(val, left_shift_dup), multiplier);
|
|---|
| 1064 | result = vrshlq_s32(result, right_shift_dup);
|
|---|
| 1065 |
|
|---|
| 1066 | return result;
|
|---|
| 1067 | #else
|
|---|
| 1068 | return arm_divide_by_power_of_two_mve(
|
|---|
| 1069 | arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift));
|
|---|
| 1070 | #endif
|
|---|
| 1071 | }
|
|---|
| 1072 |
|
|---|
| 1073 | __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2)
|
|---|
| 1074 | {
|
|---|
| 1075 | return vqrdmulhq_s32(m1, m2);
|
|---|
| 1076 | }
|
|---|
| 1077 |
|
|---|
| 1078 | __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent)
|
|---|
| 1079 | {
|
|---|
| 1080 | const int32x4_t shift = -exponent;
|
|---|
| 1081 | const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
|
|---|
| 1082 | const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
|
|---|
| 1083 | return vrshlq_s32(fixed_up_dividend, shift);
|
|---|
| 1084 | }
|
|---|
| 1085 |
|
|---|
| 1086 | __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val,
|
|---|
| 1087 | const int32x4_t multiplier,
|
|---|
| 1088 | const int32x4_t shift)
|
|---|
| 1089 | {
|
|---|
| 1090 | #ifdef CMSIS_NN_USE_SINGLE_ROUNDING
|
|---|
| 1091 | const int32x4_t right_shift = vminq_s32(vdupq_n_s32(-1), shift);
|
|---|
| 1092 | const int32x4_t left_shift = vqsubq_s32(shift, right_shift);
|
|---|
| 1093 |
|
|---|
| 1094 | int32x4_t result = vqdmulhq_s32(vshlq_s32(val, left_shift), multiplier);
|
|---|
| 1095 | result = vrshlq_s32(result, right_shift);
|
|---|
| 1096 |
|
|---|
| 1097 | return result;
|
|---|
| 1098 | #else
|
|---|
| 1099 | const int32x4_t zz = vdupq_n_s32(0);
|
|---|
| 1100 | const mve_pred16_t p = vcmpgtq_n_s32(shift, 0);
|
|---|
| 1101 |
|
|---|
| 1102 | const int32x4_t left_shift = vpselq_s32(shift, zz, p);
|
|---|
| 1103 | const int32x4_t right_shift = -vpselq_s32(zz, shift, p);
|
|---|
| 1104 |
|
|---|
| 1105 | return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier),
|
|---|
| 1106 | right_shift);
|
|---|
| 1107 | #endif
|
|---|
| 1108 | }
|
|---|
| 1109 | #endif
|
|---|
| 1110 |
|
|---|
| 1111 | // @note The following functions are used only for softmax layer, scaled bits = 5 assumed
|
|---|
| 1112 |
|
|---|
| 1113 | __STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val)
|
|---|
| 1114 | {
|
|---|
| 1115 | int32_t mask = 0;
|
|---|
| 1116 | int32_t shift = 24;
|
|---|
| 1117 |
|
|---|
| 1118 | const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift);
|
|---|
| 1119 | const int32_t remainder = val_mod_minus_quarter - val;
|
|---|
| 1120 | const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
|
|---|
| 1121 | const int32_t x2 = MUL_SAT(x, x);
|
|---|
| 1122 |
|
|---|
| 1123 | int32_t result = 1895147668 +
|
|---|
| 1124 | MUL_SAT(1895147668, x + DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1));
|
|---|
| 1125 |
|
|---|
| 1126 | #define SELECT_IF_NON_ZERO(x) \
|
|---|
| 1127 | { \
|
|---|
| 1128 | mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
|
|---|
| 1129 | result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \
|
|---|
| 1130 | }
|
|---|
| 1131 |
|
|---|
| 1132 | SELECT_IF_NON_ZERO(1672461947)
|
|---|
| 1133 | SELECT_IF_NON_ZERO(1302514674)
|
|---|
| 1134 | SELECT_IF_NON_ZERO(790015084)
|
|---|
| 1135 | SELECT_IF_NON_ZERO(290630308)
|
|---|
| 1136 | SELECT_IF_NON_ZERO(39332535)
|
|---|
| 1137 | SELECT_IF_NON_ZERO(720401)
|
|---|
| 1138 | SELECT_IF_NON_ZERO(242)
|
|---|
| 1139 |
|
|---|
| 1140 | #undef SELECT_IF_NON_ZERO
|
|---|
| 1141 |
|
|---|
| 1142 | mask = MASK_IF_ZERO(val);
|
|---|
| 1143 | return SELECT_USING_MASK(mask, NN_Q31_MAX, result);
|
|---|
| 1144 | }
|
|---|
| 1145 |
|
|---|
| 1146 | __STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp)
|
|---|
| 1147 | {
|
|---|
| 1148 | const int32_t thresh = ((1 << (31 - exp)) - 1);
|
|---|
| 1149 | int32_t result = val << exp;
|
|---|
| 1150 | result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), NN_Q31_MAX, result);
|
|---|
| 1151 | result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), NN_Q31_MIN, result);
|
|---|
| 1152 | return result;
|
|---|
| 1153 | }
|
|---|
| 1154 |
|
|---|
| 1155 | __STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)
|
|---|
| 1156 | {
|
|---|
| 1157 | const int64_t sum = (int64_t)val + (int64_t)NN_Q31_MAX;
|
|---|
| 1158 | const int32_t half_denominator = (int32_t)((sum + (sum >= 0 ? 1 : -1)) / 2L);
|
|---|
| 1159 | int32_t x = 1515870810 + MUL_SAT(half_denominator, -1010580540);
|
|---|
| 1160 |
|
|---|
| 1161 | const int32_t shift = (1 << 29);
|
|---|
| 1162 | x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
|
|---|
| 1163 | x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
|
|---|
| 1164 | x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
|
|---|
| 1165 |
|
|---|
| 1166 | return MUL_POW2(x, 1);
|
|---|
| 1167 | }
|
|---|
| 1168 |
|
|---|
| 1169 | /**
|
|---|
| 1170 | @brief Write 2 q15 elements and post increment pointer.
|
|---|
| 1171 | @param[in] dest_q15 Pointer to pointer that holds address of destination.
|
|---|
| 1172 | @param[in] src_q31 Input value to be written.
|
|---|
| 1173 | */
|
|---|
| 1174 | __STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31)
|
|---|
| 1175 | {
|
|---|
| 1176 | q31_t val = src_q31;
|
|---|
| 1177 |
|
|---|
| 1178 | memcpy(*dest_q15, &val, 4);
|
|---|
| 1179 | *dest_q15 += 2;
|
|---|
| 1180 | }
|
|---|
| 1181 |
|
|---|
| 1182 | #ifdef __cplusplus
|
|---|
| 1183 | }
|
|---|
| 1184 | #endif
|
|---|
| 1185 |
|
|---|
| 1186 | #endif
|
|---|