| 1 | /******************************************************************************
|
|---|
| 2 | * @file arm_vec_math.h
|
|---|
| 3 | * @brief Public header file for CMSIS DSP Library
|
|---|
| 4 | * @version V1.10.0
|
|---|
| 5 | * @date 08 July 2021
|
|---|
| 6 | * Target Processor: Cortex-M and Cortex-A cores
|
|---|
| 7 | ******************************************************************************/
|
|---|
| 8 | /*
|
|---|
| 9 | * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
|
|---|
| 10 | *
|
|---|
| 11 | * SPDX-License-Identifier: Apache-2.0
|
|---|
| 12 | *
|
|---|
| 13 | * Licensed under the Apache License, Version 2.0 (the License); you may
|
|---|
| 14 | * not use this file except in compliance with the License.
|
|---|
| 15 | * You may obtain a copy of the License at
|
|---|
| 16 | *
|
|---|
| 17 | * www.apache.org/licenses/LICENSE-2.0
|
|---|
| 18 | *
|
|---|
| 19 | * Unless required by applicable law or agreed to in writing, software
|
|---|
| 20 | * distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|---|
| 21 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 22 | * See the License for the specific language governing permissions and
|
|---|
| 23 | * limitations under the License.
|
|---|
| 24 | */
|
|---|
| 25 |
|
|---|
| 26 | #ifndef _ARM_VEC_MATH_H
|
|---|
| 27 | #define _ARM_VEC_MATH_H
|
|---|
| 28 |
|
|---|
| 29 | #include "arm_math_types.h"
|
|---|
| 30 | #include "arm_common_tables.h"
|
|---|
| 31 | #include "arm_helium_utils.h"
|
|---|
| 32 |
|
|---|
| 33 | #ifdef __cplusplus
|
|---|
| 34 | extern "C"
|
|---|
| 35 | {
|
|---|
| 36 | #endif
|
|---|
| 37 |
|
|---|
| 38 | #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|---|
| 39 |
|
|---|
/* Magic bit-pattern seed for the fast-reciprocal trick used by vrecip_f32:
   subtracting the operand's bit pattern from this constant yields a rough
   initial estimate of 1/x, which is then polished by Newton refinement. */
#define INV_NEWTON_INIT_F32 0x7EF127EA

/* ln(2); scales the unbiased exponent when reconstructing log(x) in vlogq_f32. */
static const float32_t __logf_rng_f32=0.693147180f;
|
|---|
| 43 |
|
|---|
| 44 |
|
|---|
| 45 | /* fast inverse approximation (3x newton) */
|
|---|
/* fast inverse approximation (3x newton) */
/* Lanewise 1/x for a float32 quad vector.
   Scheme: renormalize |x| into [1,2) by exponent-field manipulation, take a
   linear seed 24/17 - (8/17)*x, undo the renormalization, then apply three
   Newton-Raphson steps  v <- v * (2 - a*v).  x == 0 lanes return +INFINITY;
   the input sign is restored last. */
__STATIC_INLINE f32x4_t vrecip_medprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);   /* operate on |x|; sign handled at the end */

    xinv.f = ax;
    /* exponent-field offset that maps |x| into [1,2) */
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    /* linear seed on [1,2): 24/17 - (8/17)*x */
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    /* re-apply the offset: xinv.f now roughly approximates 1/|x| */
    xinv.i = xinv.i + m;

    /* Newton-Raphson refinement #1 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* Newton-Raphson refinement #2 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* Newton-Raphson refinement #3 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 -> +INF */
    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
|
|---|
| 77 |
|
|---|
| 78 | /* fast inverse approximation (4x newton) */
|
|---|
/* fast inverse approximation (4x newton) */
/* Lanewise 1/x for a float32 quad vector — same scheme as
   vrecip_medprec_f32 (renormalize |x| to [1,2), linear seed
   24/17 - (8/17)*x, undo renormalization) but with FOUR Newton-Raphson
   steps for extra accuracy.  x == 0 lanes return +INFINITY; the input
   sign is restored last. */
__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);   /* operate on |x|; sign handled at the end */

    xinv.f = ax;

    /* exponent-field offset that maps |x| into [1,2) */
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    /* linear seed on [1,2): 24/17 - (8/17)*x */
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    /* re-apply the offset: xinv.f now roughly approximates 1/|x| */
    xinv.i = xinv.i + m;

    /* Newton-Raphson refinement #1 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* Newton-Raphson refinement #2 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* Newton-Raphson refinement #3 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* Newton-Raphson refinement #4 */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 -> +INF */
    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
|
|---|
| 114 |
|
|---|
| 115 | __STATIC_INLINE f32x4_t vdiv_f32(
|
|---|
| 116 | f32x4_t num, f32x4_t den)
|
|---|
| 117 | {
|
|---|
| 118 | return vmulq(num, vrecip_hiprec_f32(den));
|
|---|
| 119 | }
|
|---|
| 120 |
|
|---|
| 121 | /**
|
|---|
| 122 | @brief Single-precision taylor dev.
|
|---|
| 123 | @param[in] x f32 quad vector input
|
|---|
| 124 | @param[in] coeffs f32 quad vector coeffs
|
|---|
| 125 | @return destination f32 quad vector
|
|---|
| 126 | */
|
|---|
| 127 |
|
|---|
| 128 | __STATIC_INLINE f32x4_t vtaylor_polyq_f32(
|
|---|
| 129 | f32x4_t x,
|
|---|
| 130 | const float32_t * coeffs)
|
|---|
| 131 | {
|
|---|
| 132 | f32x4_t A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
|
|---|
| 133 | f32x4_t B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
|
|---|
| 134 | f32x4_t C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
|
|---|
| 135 | f32x4_t D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
|
|---|
| 136 | f32x4_t x2 = vmulq(x, x);
|
|---|
| 137 | f32x4_t x4 = vmulq(x2, x2);
|
|---|
| 138 | f32x4_t res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
|
|---|
| 139 |
|
|---|
| 140 | return res;
|
|---|
| 141 | }
|
|---|
| 142 |
|
|---|
| 143 | __STATIC_INLINE f32x4_t vmant_exp_f32(
|
|---|
| 144 | f32x4_t x,
|
|---|
| 145 | int32x4_t * e)
|
|---|
| 146 | {
|
|---|
| 147 | any32x4_t r;
|
|---|
| 148 | int32x4_t n;
|
|---|
| 149 |
|
|---|
| 150 | r.f = x;
|
|---|
| 151 | n = r.i >> 23;
|
|---|
| 152 | n = n - 127;
|
|---|
| 153 | r.i = r.i - (n << 23);
|
|---|
| 154 |
|
|---|
| 155 | *e = n;
|
|---|
| 156 | return r.f;
|
|---|
| 157 | }
|
|---|
| 158 |
|
|---|
| 159 |
|
|---|
/* Lanewise natural logarithm for a float32 quad vector.
   Decomposes x = m * 2^n via vmant_exp_f32, evaluates a degree-7
   polynomial in m using the __logf_lut_f32 coefficient table (Estrin
   scheme: four pairs merged with m^2 and m^4), then adds n * ln(2).
   Lanes with x == 0 are forced to -INFINITY.
   NOTE(review): negative inputs are not specially handled — presumably
   callers guarantee x >= 0; confirm against call sites. */
__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t vecExpUnBiased;
    f32x4_t vecTmpFlt0, vecTmpFlt1;
    f32x4_t vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    /* xx = mantissa squared, used to merge the coefficient pairs */
    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;   (mantissa^4)
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
    // set log0 down to -inf
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}
|
|---|
| 219 |
|
|---|
/* Lanewise e^x for a float32 quad vector.
   Range reduction: x = m*ln(2) + r with m = round-toward-zero of
   x/ln(2); e^r is approximated by the degree-7 polynomial in exp_tab,
   then scaled by 2^m by adding m directly into the exponent field.
   Lanes with m < -126 (2^m would be subnormal/underflow) flush to 0.
   NOTE(review): there is no clamp for large positive x — overflow
   behavior relies on the saturating shift/add of m; confirm intended
   results for x beyond roughly +/-88. */
__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    // Perform range reduction [-log(2),log(2)]
    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));   /* 1.4426950408 = 1/ln(2) */
    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));   /* r = x - m*ln(2) */

    // Polynomial Approximation
    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);

    // Reconstruct: multiply by 2^m via saturating add of (m << 23) into the exponent field
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));

    // Underflow: 2^m not representable as a normal float -> result is 0
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
    return poly;
}
|
|---|
| 236 |
|
|---|
| 237 | __STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
|
|---|
| 238 | {
|
|---|
| 239 | f32x4_t r = x;
|
|---|
| 240 | nb--;
|
|---|
| 241 | while (nb > 0) {
|
|---|
| 242 | r = vmulq(r, x);
|
|---|
| 243 | nb--;
|
|---|
| 244 | }
|
|---|
| 245 | return (r);
|
|---|
| 246 | }
|
|---|
| 247 |
|
|---|
| 248 | __STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
|
|---|
| 249 | {
|
|---|
| 250 | f32x4_t vecSx, vecW, vecTmp;
|
|---|
| 251 | any32x4_t v;
|
|---|
| 252 |
|
|---|
| 253 | vecSx = vabsq(vecIn);
|
|---|
| 254 |
|
|---|
| 255 | v.f = vecIn;
|
|---|
| 256 | v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);
|
|---|
| 257 |
|
|---|
| 258 | vecW = vmulq(vecSx, v.f);
|
|---|
| 259 |
|
|---|
| 260 | // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
|
|---|
| 261 | vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
|
|---|
| 262 | vecTmp = vfmasq(vecW, vecTmp, -28.0f);
|
|---|
| 263 | vecTmp = vfmasq(vecW, vecTmp, 56.0f);
|
|---|
| 264 | vecTmp = vfmasq(vecW, vecTmp, -70.0f);
|
|---|
| 265 | vecTmp = vfmasq(vecW, vecTmp, 56.0f);
|
|---|
| 266 | vecTmp = vfmasq(vecW, vecTmp, -28.0f);
|
|---|
| 267 | vecTmp = vfmasq(vecW, vecTmp, 8.0f);
|
|---|
| 268 | v.f = vmulq(v.f, vecTmp);
|
|---|
| 269 |
|
|---|
| 270 | v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
|
|---|
| 271 | /*
|
|---|
| 272 | * restore sign
|
|---|
| 273 | */
|
|---|
| 274 | v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
|
|---|
| 275 | return v.f;
|
|---|
| 276 | }
|
|---|
| 277 |
|
|---|
| 278 | __STATIC_INLINE f32x4_t vtanhq_f32(
|
|---|
| 279 | f32x4_t val)
|
|---|
| 280 | {
|
|---|
| 281 | f32x4_t x =
|
|---|
| 282 | vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
|
|---|
| 283 | f32x4_t exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
|
|---|
| 284 | f32x4_t num = vsubq_n_f32(exp2x, 1.f);
|
|---|
| 285 | f32x4_t den = vaddq_n_f32(exp2x, 1.f);
|
|---|
| 286 | f32x4_t tanh = vmulq_f32(num, vrecip_f32(den));
|
|---|
| 287 | return tanh;
|
|---|
| 288 | }
|
|---|
| 289 |
|
|---|
| 290 | __STATIC_INLINE f32x4_t vpowq_f32(
|
|---|
| 291 | f32x4_t val,
|
|---|
| 292 | f32x4_t n)
|
|---|
| 293 | {
|
|---|
| 294 | return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
|
|---|
| 295 | }
|
|---|
| 296 |
|
|---|
| 297 | #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
|
|---|
| 298 |
|
|---|
| 299 | #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|---|
| 300 | #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
|
|---|
| 301 |
|
|---|
| 302 | #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|---|
| 303 |
|
|---|
| 304 | #include "NEMath.h"
|
|---|
| 305 | /**
|
|---|
| 306 | * @brief Vectorized integer exponentiation
|
|---|
| 307 | * @param[in] x value
|
|---|
| 308 | * @param[in] nb integer exponent >= 1
|
|---|
| 309 | * @return x^nb
|
|---|
| 310 | *
|
|---|
| 311 | */
|
|---|
| 312 | __STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
|
|---|
| 313 | {
|
|---|
| 314 | float32x4_t r = x;
|
|---|
| 315 | nb --;
|
|---|
| 316 | while(nb > 0)
|
|---|
| 317 | {
|
|---|
| 318 | r = vmulq_f32(r , x);
|
|---|
| 319 | nb--;
|
|---|
| 320 | }
|
|---|
| 321 | return(r);
|
|---|
| 322 | }
|
|---|
| 323 |
|
|---|
| 324 |
|
|---|
/* Lanewise sqrt(x) via NEON reciprocal-square-root estimate plus two
   Newton-Raphson refinement steps; sqrt(x) is then formed as x * (1/sqrt(x)). */
__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
{
    /* clamp to FLT_MIN so vrsqrteq never sees 0 (its estimate would be INF);
       the final multiply uses the ORIGINAL x, so x == 0 still yields 0 */
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    /* each vrsqrtsq+mul pair is one Newton-Raphson step refining e -> 1/sqrt(x1) */
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return vmulq_f32(x, e);
}
|
|---|
| 333 |
|
|---|
| 334 | __STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
|
|---|
| 335 | {
|
|---|
| 336 | float32x4_t tempF;
|
|---|
| 337 | int32x4_t tempHI,tempLO;
|
|---|
| 338 |
|
|---|
| 339 | tempLO = vmovl_s16(vget_low_s16(vec));
|
|---|
| 340 | tempF = vcvtq_n_f32_s32(tempLO,15);
|
|---|
| 341 | tempF = __arm_vec_sqrt_f32_neon(tempF);
|
|---|
| 342 | tempLO = vcvtq_n_s32_f32(tempF,15);
|
|---|
| 343 |
|
|---|
| 344 | tempHI = vmovl_s16(vget_high_s16(vec));
|
|---|
| 345 | tempF = vcvtq_n_f32_s32(tempHI,15);
|
|---|
| 346 | tempF = __arm_vec_sqrt_f32_neon(tempF);
|
|---|
| 347 | tempHI = vcvtq_n_s32_f32(tempF,15);
|
|---|
| 348 |
|
|---|
| 349 | return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
|
|---|
| 350 | }
|
|---|
| 351 |
|
|---|
| 352 | __STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
|
|---|
| 353 | {
|
|---|
| 354 | float32x4_t temp;
|
|---|
| 355 |
|
|---|
| 356 | temp = vcvtq_n_f32_s32(vec,31);
|
|---|
| 357 | temp = __arm_vec_sqrt_f32_neon(temp);
|
|---|
| 358 | return(vcvtq_n_s32_f32(temp,31));
|
|---|
| 359 | }
|
|---|
| 360 |
|
|---|
| 361 | #endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
|---|
| 362 |
|
|---|
| 363 | #ifdef __cplusplus
|
|---|
| 364 | }
|
|---|
| 365 | #endif
|
|---|
| 366 |
|
|---|
| 367 |
|
|---|
| 368 | #endif /* _ARM_VEC_MATH_H */
|
|---|
| 369 |
|
|---|
| 370 | /**
|
|---|
| 371 | *
|
|---|
| 372 | * End of file.
|
|---|
| 373 | */
|
|---|