| 1 | /******************************************************************************
|
|---|
| 2 | * @file arm_vec_fft.h
|
|---|
| 3 | * @brief Private header file for CMSIS DSP Library
|
|---|
| 4 | * @version V1.7.0
|
|---|
| 5 | * @date 07. January 2020
|
|---|
| 6 | ******************************************************************************/
|
|---|
| 7 | /*
|
|---|
| 8 | * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
|
|---|
| 9 | *
|
|---|
| 10 | * SPDX-License-Identifier: Apache-2.0
|
|---|
| 11 | *
|
|---|
| 12 | * Licensed under the Apache License, Version 2.0 (the License); you may
|
|---|
| 13 | * not use this file except in compliance with the License.
|
|---|
| 14 | * You may obtain a copy of the License at
|
|---|
| 15 | *
|
|---|
| 16 | * www.apache.org/licenses/LICENSE-2.0
|
|---|
| 17 | *
|
|---|
| 18 | * Unless required by applicable law or agreed to in writing, software
|
|---|
| 19 | * distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|---|
| 20 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 21 | * See the License for the specific language governing permissions and
|
|---|
| 22 | * limitations under the License.
|
|---|
| 23 | */
|
|---|
| 24 |
|
|---|
| 25 | #ifndef _ARM_VEC_FFT_H_
|
|---|
| 26 | #define _ARM_VEC_FFT_H_
|
|---|
| 27 |
|
|---|
| 28 | #include "arm_math.h"
|
|---|
| 29 | #include "arm_helium_utils.h"
|
|---|
| 30 |
|
|---|
| 31 | #ifdef __cplusplus
|
|---|
| 32 | extern "C"
|
|---|
| 33 | {
|
|---|
| 34 | #endif
|
|---|
| 35 |
|
|---|
#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

/* Complex add/sub with the second operand multiplied by i:
   A + i*B via a +90 degree rotating add, A - i*B via a +270 degree one
   (floating-point variants). */
#define MVE_CMPLX_ADD_A_ixB(A, B) vcaddq_rot90(A,B)
#define MVE_CMPLX_SUB_A_ixB(A,B) vcaddq_rot270(A,B)

/* Full complex multiply built from the two-step vcmul / vcmla-rotate
   sequence: A*B and A*conj(B) (floating-point variants). */
#define MVE_CMPLX_MULT_FLT_AxB(A,B) vcmlaq_rot90(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B) vcmlaq_rot270(vcmulq(A, B), A, B)

/* Fixed-point complex multiply: A*B and A*conj(B) using the saturating
   doubling multiply-add/subtract-dual intrinsics. TyA is the vector type
   of A, used to cast the (don't-care) initial accumulator. */
#define MVE_CMPLX_MULT_FX_AxB(A,B,TyA) vqdmladhxq(vqdmlsdhq((TyA)vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxConjB(A,B,TyA) vqdmladhq(vqdmlsdhxq((TyA)vuninitializedq_s32(), A, B), A, B)

/* Halving (result divided by 2) fixed-point variants of A +/- i*B. */
#define MVE_CMPLX_ADD_FX_A_ixB(A, B) vhcaddq_rot90(A,B)
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)
|
|---|
| 48 |
|
|---|
| 49 |
|
|---|
/**
  @brief         In-place 32 bit reversal function for helium
  @param[in,out] pSrc        points to in-place buffer of unknown 32-bit data type
  @param[in]     bitRevLen   bit reversal table length
  @param[in]     pBitRevTab  points to bit reversal table
  @return        none

  The table is consumed as pairs of 16-bit byte offsets: the even-indexed
  and odd-indexed entries of each group of 4 address two 64-bit words in
  the buffer which are exchanged with gather loads / scatter stores
  (a 64-bit word holds one pair of adjacent 32-bit values).
 */

__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
        uint32_t *pSrc,
  const uint16_t bitRevLen,
  const uint16_t *pBitRevTab)

{
    /* View the buffer as 64-bit words so a single swap moves a 32-bit pair. */
    uint64_t       *src = (uint64_t *) pSrc;
    int32_t         blkCnt;     /* loop counters */
    uint32x4_t      bitRevTabOff;
    uint32x4_t      one = vdupq_n_u32(1);
    uint64x2_t      inLow, inHigh;
    uint64x2_t      bitRevOff1Low, bitRevOff0Low;
    uint64x2_t      bitRevOff1High, bitRevOff0High;

    /* load scheduling to increase gather load idx update / gather load distance */
    bitRevTabOff = vldrhq_u32(pBitRevTab);   /* 4 x u16 table entries, zero-extended to u32 */
    pBitRevTab += 4;

    /* Widen to 64-bit offsets: multiplying by 1 with vmullb/vmullt keeps the
       value while splitting even (bottom) and odd (top) lanes apart. */
    bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
    bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);


    blkCnt = bitRevLen / 8;   /* 8 table entries (two groups of 2 swaps) per iteration */
    while (blkCnt > 0) {
        /* Expand the next group of offsets before using the previous one
           (software pipelining, see note above). */
        bitRevTabOff = vldrhq_u32(pBitRevTab);
        pBitRevTab += 4;

        /* 64-bit index expansion */
        bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
        bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);

        /* Swap the two 64-bit words addressed by each even/odd offset pair. */
        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);

        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);


        /* unrolled */
        bitRevTabOff = vldrhq_u32(pBitRevTab);
        pBitRevTab += 4;

        bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);

        inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
        inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);

        vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
        vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);

        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
    }

    /* Tail: one remaining group, using the offsets already expanded above. */
    if (bitRevLen & 7) {
        /* FFT size = 16 */
        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);

        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
    }
}
|
|---|
| 124 |
|
|---|
| 125 |
|
|---|
| 126 |
|
|---|
/**
  @brief         In-place 16 bit reversal function for helium
  @param[in,out] pSrc        points to in-place buffer of unknown 16-bit data type
  @param[in]     bitRevLen   bit reversal table length
  @param[in]     pBitRevTab  points to bit reversal table
  @return        none

  The table is consumed as pairs of 16-bit entries: the even-indexed and
  odd-indexed entries of each group of 8 (shifted right by 3 to form word
  indices) address two 32-bit words in the buffer which are exchanged with
  gather loads / scatter stores (a 32-bit word holds one pair of adjacent
  16-bit values).
 */

__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
        uint16_t *pSrc,
  const uint16_t bitRevLen,
  const uint16_t *pBitRevTab)

{
    /* View the buffer as 32-bit words so a single swap moves a 16-bit pair. */
    uint32_t       *src = (uint32_t *) pSrc;
    int32_t         blkCnt;     /* loop counters */
    uint32x4_t      bitRevTabOff;
    uint16x8_t      one = vdupq_n_u16(1);
    uint32x4_t      bitRevOff1Low, bitRevOff0Low;
    uint32x4_t      bitRevOff1High, bitRevOff0High;
    uint32x4_t      inLow, inHigh;

    /* load scheduling to increase gather load idx update / gather load distance */
    bitRevTabOff = vldrhq_u16(pBitRevTab);   /* 8 x u16 table entries */
    pBitRevTab += 8;

    /* Widen even/odd 16-bit entries to 32-bit (multiply by 1 with
       vmullb/vmullt), then >> 3 to convert table entries into the word
       indices consumed by the shifted-offset gather/scatter below. */
    bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
    bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
    bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
    bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

    blkCnt = (bitRevLen / 16);   /* 16 table entries (two groups of 4 swaps) per iteration */
    while (blkCnt > 0) {
        /* Expand the next group of offsets before using the previous one
           (software pipelining, see note above). */
        bitRevTabOff = vldrhq_u16(pBitRevTab);
        pBitRevTab += 8;

        bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
        bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
        bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
        bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);

        /* Swap the two 32-bit words addressed by each even/odd index pair. */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);

        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

        /* loop unrolling */
        bitRevTabOff = vldrhq_u16(pBitRevTab);
        pBitRevTab += 8;

        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);

        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);

        blkCnt--;
    }

    /* tail handling */
    blkCnt = bitRevLen & 0xf;
    if (blkCnt == 8) {
        /* One full remaining group; offsets were already expanded above. */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);

        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
    } else if (blkCnt == 12) {
        /* FFT 16 special case */
        /* 12 entries left: one full group plus 4 entries handled under
           a tail predicate covering the low 4 halfword lanes. */
        mve_pred16_t p = vctp16q(4);

        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);

        /* Full group first (offsets already expanded). */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);

        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

        /* Predicated partial group for the final 4 entries. */
        inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
        inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);

        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
    }
}
|
|---|
| 224 |
|
|---|
/**
  @brief      Out-of-place 32 bit reversal function for helium
  @param[out] pDst    points to destination buffer of unknown 32-bit data type
  @param[in]  pSrc    points to input buffer of unknown 32-bit data type
  @param[in]  fftLen  FFT length
  @return     none

  Gathers 64-bit words (pairs of adjacent 32-bit values) from bit-reversed
  byte offsets in pSrc and stores them sequentially to pDst; the reversal
  indices are produced on the fly with VBRSR instead of a lookup table.
 */
__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
    uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
    uint32_t        bitRevPos, blkCnt;
    uint32_t       *pDst32 = (uint32_t *) pDst;

    /* fwd indexes */
    /* Only even 32-bit lanes carry a byte offset: the vector is
       reinterpreted as uint64x2_t for the 64-bit gathers below, so each
       64-bit lane holds one offset in its low half. */
    idxOffs0 = vdupq_n_u32(0);
    idxOffs1 = vdupq_n_u32(0);
    idxOffs0[0] = 0; idxOffs0[2] = 4;
    idxOffs1[0] = 8; idxOffs1[2] = 12;

    /* log2(fftLen) + 5 = number of low-order bits VBRSR reverses
       (+5 accounts for the 32-byte stride of the two index vectors
       advancing by 16 bytes each). */
    bitRevPos = (31 - __CLZ(fftLen)) + 5;
    blkCnt = fftLen >> 2;   /* 4 complex (64-bit) elements per iteration */

    /* issued earlier to increase gather load idx update / gather load distance */
    /* bit-reverse fwd indexes */
    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
    while (blkCnt > 0) {
        uint64x2_t      vecIn;

        /* Gather from bit-reversed offsets, store linearly; advance the
           forward indexes and pre-compute the next reversed offsets. */
        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
        idxOffs0 = idxOffs0 + 16;
        vst1q(pDst32, (uint32x4_t) vecIn);
        pDst32 += 4;
        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);

        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
        idxOffs1 = idxOffs1 + 16;
        vst1q(pDst32, (uint32x4_t) vecIn);
        pDst32 += 4;
        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);

        blkCnt--;
    }
}
|
|---|
| 269 |
|
|---|
| 270 |
|
|---|
/**
  @brief      Out-of-place 16 bit reversal function for helium
  @param[out] pDst    points to destination buffer of unknown 16-bit data type
  @param[in]  pSrc    points to input buffer of unknown 16-bit data type
  @param[in]  fftLen  FFT length
  @return     none

  Gathers 32-bit words (pairs of adjacent 16-bit values) from bit-reversed
  byte offsets in pSrc and stores them sequentially to pDst; the reversal
  indices are produced on the fly with VBRSR instead of a lookup table.
 */

__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
    uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
    uint32_t        bitRevPos, blkCnt;
    uint16_t       *pDst16 = (uint16_t *) pDst;
    uint32_t        incrIdx = 0;

    /* fwd indexes */
    idxOffs0 = vidupq_wb_u32(&incrIdx, 4);    // {0, 4, 8, 12}
    idxOffs1 = vidupq_wb_u32(&incrIdx, 4);    // {16, 20, 24, 28}

    /* log2(fftLen) + 4 = number of low-order bits VBRSR reverses
       (+4 accounts for the 32-byte stride of the two index vectors
       advancing together). */
    bitRevPos = (31 - __CLZ(fftLen)) + 4;
    blkCnt = fftLen >> 3;   /* 8 complex (32-bit) elements per iteration */

    /* issued earlier to increase gather load idx update / gather load distance */
    /* bit-reverse fwd indexes */
    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
    while (blkCnt > 0) {
        uint32x4_t      vecIn;

        /* Gather from bit-reversed offsets, store linearly; advance the
           forward indexes and pre-compute the next reversed offsets. */
        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
        idxOffs0 = idxOffs0 + 32;
        vst1q(pDst16, (uint16x8_t) vecIn);
        pDst16 += 8;
        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);

        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
        idxOffs1 = idxOffs1 + 32;
        vst1q(pDst16, (uint16x8_t) vecIn);
        pDst16 += 8;
        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);

        blkCnt--;
    }
}
|
|---|
| 315 |
|
|---|
| 316 |
|
|---|
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
|---|
| 318 |
|
|---|
| 319 |
|
|---|
| 320 | #ifdef __cplusplus
|
|---|
| 321 | }
|
|---|
| 322 | #endif
|
|---|
| 323 |
|
|---|
| 324 |
|
|---|
| 325 | #endif /* _ARM_VEC_FFT_H_ */
|
|---|