Cell SDK Code Sample: FFT
%% --------------------------------------------------------------
%% (C) Copyright 2001,2005,
%% International Business Machines Corporation,
%% Sony Computer Entertainment Incorporated,
%% Toshiba Corporation.
%%
%% All Rights Reserved.
%% --------------------------------------------------------------
%% PROLOG END TAG zYx
Target:
CBE-Linux (HW or simulator)
Description:
This directory contains a hand-tuned program which performs
a 4-way SIMD single-precision complex FFT on an array of
size 16,777,216 elements.
Notes:
The actual executable resides in the ppu subdirectory. It's
called 'fft'. It's a full be executable. It takes two runtime
parameters 'ncycles' and 'printflag'.
'ncycles' - is a count of how many times you wish to do a
full roundtrip of time-to-frequency-to-time calculations.
'printflag' - enables or disables print statements within the SPEs.
[note that the PPE print statements will always appear.]
On SystemSim, this program will take hours to do a full cycle. The
print statements in the SPE are intended to give the user evidence
that the program has not stalled.
Recommend that on SystemSim, you say 'fft 1 1'.
Recommend that on real hardware, you say 'fft 100 0'.
Here's what happens within the program.
* First, the PPE fills an array with 16,777,216 complex
numbers, using a function called trigfunc. This function
can be changed by the user.
* Then the PPE fires up all eight SPEs, and they perform a
time-to-frequency conversion on this data, and signal
the PPE.
* The PPE checks these results. If you change trigfunc, you
should comment out this part. It then signals the SPEs.
* The SPEs convert the data back into the time domain,
and signal the PPE.
* The PPE checks these results (using trigfunc), prints
a message "START TIMING" and signals the SPEs.
* Then, the SPEs perform 'ncycles' round trips of time to
frequency and back to time, and signal the PPE.
* The PPE prints "STOP TIMING" and exits.
When running on real hardware, you can use these PPE print
messages with a stopwatch to get a quick estimate of the time
required to perform this function.
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#ifndef __fft_h__
#define __fft_h__
#include <stdlib.h>
#include <stdio.h>
/*
#include <string.h>
*/
/* Transform length: 16,777,216 = 2^24 complex points. */
#define NP 16777216 /* number of points (16 meg) */
/* Supply M_PI only when no previously included header has defined it, so
 * that this header never redefines the math library's own constant. */
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
/* When defined, the PPE program runs its result-checking passes (the
 * sections of main guarded by #ifdef FULLRUN). */
#define FULLRUN
/* NOTE(review): MAMBO_RUN is not tested anywhere in the code visible here;
 * the PPE main checks SIM_RUN instead -- confirm which name is intended. */
//#define MAMBO_RUN
/* The three butterfly passes of the FFT.  With NP = 2^24 points, the
 * comments below split the work into three groups of eight sets each.
 * NOTE(review): the meaning of the two int parameters cannot be seen from
 * this header -- name and document them once confirmed against the
 * definitions. */
void stage1(int, int); /* first eight sets of butterflies */
void stage2(int, int); /* second eight sets of butterflies */
void stage3(int, int); /* third eight sets of butterflies */
/* A 64-bit quantity that can also be read or written as a pair of unsigned
 * ints sharing the same storage; linkage stuff used when calling the SPU
 * program. */
typedef union {
    unsigned long long ull;   /* the value as one unsigned long long */
    unsigned int       ui[2]; /* the same bytes viewed as two unsigned ints */
} addr64;
/*
 * Parameter block handed to each SPE thread: the PPE main fills one entry
 * per SPE and passes its address to spe_create_thread().
 * With 4-byte unsigned int and 8-byte unsigned long long the members total
 * 2*4 + 6*8 + 5*4 + 52 = 128 bytes, the "full cache line" the pad targets.
 */
typedef struct _control_block {
    /* Identity and workload. */
    unsigned int       spu_num;          /* number from 0 through 7 */
    unsigned int       ncycles;          /* number of round trips to take when estimating performance */

    /* Three barriers, each described by a count and an address. */
    unsigned long long barrier1_count;
    unsigned long long barrier1_address;
    unsigned long long barrier2_count;
    unsigned long long barrier2_address;
    unsigned long long barrier3_count;
    unsigned long long barrier3_address;

    /* Addresses of the big data arrays, stored as unsigned ints. */
    unsigned int       ar;               /* real component of A array */
    unsigned int       ai;               /* imaginary component of A array */
    unsigned int       br;               /* real component of B array */
    unsigned int       bi;               /* imaginary component of B array */

    unsigned int       printflag;        /* true if we want print statements from the SPU */
    unsigned char      pad[52];          /* pad to a full cache line */
} control_block;
#endif /* __fft_h__ */
Target: PPE [This code section runs on the PPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include "../fft.h"
#include <sys/mman.h>
#include <stdio.h>
#include <libspe.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <fenv.h>
#include <sys/types.h>
#include <errno.h>
#include <math.h>
/* Huge page support: path of the backing file and the address it is mapped at. */
char *mem_file = "/huge/fft_mem.bin";
char *mem_addr = NULL;
/* Storage for three separate barrier blocks, 32 unsigned ints apiece. */
static unsigned int b[3][32] __attribute__ ((aligned (4096)));
/* One control block per SPE thread. */
control_block cb[8] __attribute__ ((aligned (4096)));
/* SPE program handle passed to spe_create_thread(). */
extern spe_program_handle_t fft_spu;
/* Thread ids and group id returned by the libspe creation calls. */
speid_t speids[8];
spe_gid_t gid;
int status[8];
void *rc;
/* The big arrays: real/imaginary components of the A and B arrays. */
float *ar;
float *ai;
float *br;
float *bi;
/*
 * MALLOC_BIG_ARRAYS - allocate the four big arrays from the malloc heap and
 * point ar, br, ai and bi at suitably aligned addresses inside them:
 *   - bottom 11 bits of the ar and br addresses are 0x000
 *   - bottom 11 bits of the ai and bi addresses are 0x400
 * Each request is 0x4000800 bytes: 0x4000000 bytes of data plus 0x800 bytes
 * of slack, enough for the worst-case adjustment of 0x3ff + 0x400 bytes.
 * On allocation failure the macro prints an error, releases whatever was
 * obtained and executes return(-1), so it may only be expanded inside a
 * function that returns int.
 *
 * Addresses are manipulated as unsigned long rather than int so that they
 * are not truncated where pointers are wider than int, and every malloc
 * result is tested individually: multiplying the four addresses together
 * can overflow and wrap to zero even when all four pointers are valid.
 */
#define MALLOC_BIG_ARRAYS { \
  void *ar_mem = malloc(0x4000800); \
  void *br_mem = malloc(0x4000800); \
  void *ai_mem = malloc(0x4000800); \
  void *bi_mem = malloc(0x4000800); \
  unsigned long ar_raw = (unsigned long) ar_mem; \
  unsigned long br_raw = (unsigned long) br_mem; \
  unsigned long ai_raw = (unsigned long) ai_mem; \
  unsigned long bi_raw = (unsigned long) bi_mem; \
  if (ar_mem == NULL || br_mem == NULL || ai_mem == NULL || bi_mem == NULL) { \
    printf("ERROR: unable to malloc. Exiting...\n"); \
    free(ar_mem); \
    free(br_mem); \
    free(ai_mem); \
    free(bi_mem); \
    return(-1); \
  } \
  /* round every address up to the next multiple of 0x400 */ \
  ar_raw = (ar_raw + 0x3ffUL) & ~0x3ffUL; \
  br_raw = (br_raw + 0x3ffUL) & ~0x3ffUL; \
  ai_raw = (ai_raw + 0x3ffUL) & ~0x3ffUL; \
  bi_raw = (bi_raw + 0x3ffUL) & ~0x3ffUL; \
  /* ar, br: if bit 0x400 is set, step forward so the low 11 bits become 0x000 */ \
  ar_raw += (ar_raw & 0x400UL); \
  br_raw += (br_raw & 0x400UL); \
  /* ai, bi: if bit 0x400 is clear, step forward so the low 11 bits become 0x400 */ \
  ai_raw += 0x400UL - (ai_raw & 0x400UL); \
  bi_raw += 0x400UL - (bi_raw & 0x400UL); \
  ar = (float *) ar_raw; \
  br = (float *) br_raw; \
  ai = (float *) ai_raw; \
  bi = (float *) bi_raw; \
}
/* MBOX_SND(_val) - write the value _val to the inbound mailbox of each of
 * the eight SPE threads in speids[], in index order.  Expands to a brace
 * block (no trailing semicolon needed) and uses the caller's variable i as
 * the loop counter.  The result of spe_write_in_mbox() is not examined. */
#define MBOX_SND(_val) { \
  for (i = 0; i < 8; i++) { \
    int mbox_word = (_val); \
    spe_write_in_mbox(speids[i], mbox_word); \
  } \
}
/* MBOX_RCV - for each of the eight SPE threads in speids[], in index order,
 * loop while spe_stat_out_mbox() reports a value <= 0 and then read one word
 * from that thread's outbound mailbox.  The word read is discarded.  Expands
 * to a brace block and uses the caller's variable i as the loop counter. */
#define MBOX_RCV { \
  for (i = 0; i < 8; i++) { \
    unsigned int reply; \
    while (spe_stat_out_mbox(speids[i]) <= 0) { \
      /* nothing to do but poll again */ \
    } \
    reply = spe_read_out_mbox(speids[i]); \
  } \
}
/* PERFORM_TIME_TO_FREQUENCY_FFT - drive one time-to-frequency transform:
 * three times in a row, send the value 1 to every SPE (MBOX_SND) and then
 * collect one reply from every SPE (MBOX_RCV).
 * NOTE(review): the three rounds presumably correspond to stage1..stage3 of
 * the SPE program -- confirm against the SPE source. */
#define PERFORM_TIME_TO_FREQUENCY_FFT { \
  int fft_pass; \
  for (fft_pass = 0; fft_pass < 3; ++fft_pass) { \
    MBOX_SND(1) \
    MBOX_RCV \
  } \
}
/* PERFORM_FREQUENCY_TO_TIME_FFT - drive one frequency-to-time transform:
 * three times in a row, send the value -1 to every SPE (MBOX_SND) and then
 * collect one reply from every SPE (MBOX_RCV).
 * NOTE(review): the three rounds presumably correspond to stage1..stage3 of
 * the SPE program -- confirm against the SPE source. */
#define PERFORM_FREQUENCY_TO_TIME_FFT { \
  int fft_pass; \
  for (fft_pass = 0; fft_pass < 3; ++fft_pass) { \
    MBOX_SND(-1) \
    MBOX_RCV \
  } \
}
/*
 * trigfunc - reference signal used to fill the input array and, later, to
 * check the round-trip results.
 *   in      : sample index
 *   returns : 7 + sin(x) + cos(2x), where x = in * 6.2831853071796 / NP,
 *             computed in double precision and converted to float
 */
float trigfunc(int in) {
  const double angle = ((double) in) * 6.2831853071796 / ((double) NP);
  return (float) (7.0 + sin(angle) + cos(2*angle));
}
int main(int argc, char *argv[]) {
int i, j;
int fmem;
int ncycles;
unsigned int printflag;
float lo_real, hi_real, lo_imag, hi_imag;
if (argc != 3) {
fprintf(stderr, "usage: fft <ncycles> <printflag>\n");
return -1;
}
ncycles = atoi(argv[1]);
printflag = atoi(argv[2]);
/* Create a large contiguous memory buffer by allocating a large
* * page (or more). Large page memory will also reduce the TLB thrashing.
* */
if ((fmem = open (mem_file, O_CREAT | O_RDWR, 0755)) == -1) {
printf("WARNING: unable to open file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
MALLOC_BIG_ARRAYS
} else {
mem_addr = (char *) mmap (0, 0x11000000, PROT_READ | PROT_WRITE, MAP_SHARED, fmem, 0);
if (mem_addr == MAP_FAILED) {
printf("ERROR: unable to mmap file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
close (fmem);
MALLOC_BIG_ARRAYS
}
else {
ar = (float *) (mem_addr+0x0000000);
br = (float *) (mem_addr+0x4000000);
ai = (float *) (mem_addr+0x8000400);
bi = (float *) (mem_addr+0xc000400);
}
}
#ifndef SIM_RUN
printf("big array addrs: %x %x %x %x\n", (int) ar, (int) br, (int) ai, (int) bi);
printf("loading big array A\n"); fflush(stdout);
for (i=0; i<NP; ++i) {
if ((i&0xfffff) == 0) { printf("%x of %x done\n", i, NP); fflush(stdout); }
ai[i] = 0.0f;
ar[i] = trigfunc(i);
}
#endif
/* Create an SPE group. */
gid = spe_create_group ( SCHED_OTHER, 0, 1 );
if (gid == NULL) {
fprintf(stderr, "Failed spe_create_group(errno=%d)\n", errno);
return -1;
}
if (spe_group_max (gid) < 8) {
fprintf(stderr, "System doesn't have eight working SPEs. I can't continue...\n");
return -1;
}
/* Initialize barrier count to 0. */
for (i=0; i<32; ++i) b[0][i] = b[1][i] = b[2][i] = 0;
for (i = 0; i < 8; i++) {
cb[i].spu_num = i;
cb[i].ncycles = ncycles;
cb[i].barrier1_count = (unsigned long long) 8;
cb[i].barrier1_address = (unsigned long long) b[0];
cb[i].barrier2_count = (unsigned long long) 8;
cb[i].barrier2_address = (unsigned long long) b[1];
cb[i].barrier3_count = (unsigned long long) 8;
cb[i].barrier3_address = (unsigned long long) b[2];
cb[i].ar = (unsigned long) ar;
cb[i].ai = (unsigned long) ai;
cb[i].br = (unsigned long) br;
cb[i].bi = (unsigned long) bi;
cb[i].printflag = printflag;
}
/* allocate SPEs */
fprintf(stderr, "ready to call (create) SPE threads\n"); fflush(stderr);
for (i = 0; i < 8; i++)
{
speids[i] = spe_create_thread (gid, &fft_spu, (unsigned long long *) &cb[i], NULL, -1, 0);
if (speids[i] == NULL)
{
fprintf (stderr, "FAILED: spe_create_thread(num=%d, errno=%d)\n", i, errno);
exit (3+i);
}
}
/* monitor progress while SPEs process time-to-frequency computations */
PERFORM_TIME_TO_FREQUENCY_FFT
#ifndef SIM_RUN
#ifdef FULLRUN
printf("Checking frequency results...\n"); fflush(stdout);
for (i=0; i<NP; ++i) {
if (ar[i] < -0.002 || ar[i] > 0.002 || ai[i] < -0.002 || ai[i] > 0.002) printf("a[%d] = (%10.3f, %9.3f)\n", i, ar[i], ai[i]);
}
#endif
/* monitor progress while SPEs process frequency-to-time computations */
PERFORM_FREQUENCY_TO_TIME_FFT
#ifdef FULLRUN
printf("Now checking results...\n"); fflush(stdout);
hi_real = -100.0;
lo_real = 100.0;
hi_imag = -100.0;
lo_imag = 100.0;
for (i=0; i<NP; ++i) {
float x;
x = trigfunc(i);
if (ar[i] - x > hi_real) hi_real = ar[i] - x ;
if (ar[i] - x < lo_real) lo_real = ar[i] - x ;
if (ai[i] > hi_imag) hi_imag = ai[i];
if (ai[i] < lo_imag) lo_imag = ai[i];
}
fprintf(stderr, "real err*r range = %f %f\n", lo_real, hi_real);
fprintf(stderr, "imag err*r range = %f %f\n", lo_imag, hi_imag);
if (lo_real < -0.00003 || hi_real > 0.00003 || lo_imag < -0.00003 || hi_imag > 0.00003) {
fprintf(stderr, "ERROR: range of error values too large...\n");
fflush(stderr);
return -1;
}
#endif
printf("START TIMING!\n");
/* monitor timing runs */
for (j=0; j<ncycles; ++j) {
PERFORM_TIME_TO_FREQUENCY_FFT
PERFORM_FREQUENCY_TO_TIME_FFT
}
printf("STOP TIMING!\n");
MBOX_SND(0)
printf("Now checking results...\n"); fflush(stdout);
hi_real = -100.0;
lo_real = 100.0;
hi_imag = -100.0;
lo_imag = 100.0;
for (i=0; i<NP; ++i) {
float x;
x = trigfunc(i);
if (ar[i] - x > hi_real) hi_real = ar[i] - x ;
if (ar[i] - x < lo_real) lo_real = ar[i] - x ;
if (ai[i] > hi_imag) hi_imag = ai[i];
if (ai[i] < lo_imag) lo_imag = ai[i];
}
printf("real err*r range = %f %f\n", lo_real, hi_real);
printf("imag err*r range = %f %f\n", lo_imag, hi_imag);
#endif
return 0;
}
Target: SPE [This code section runs on the SPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include "../fft.h"
#include <cos.h>
#include <sin.h>
#include <cbe_mfc.h>
#include <spu_mfcio.h>
#include <transpose_matrix4x4.h>
#include <profile.h>
#include <stdio.h>
/*
 * sin5lsb - table of sines of small angles, 16 rows of 32 values (eight
 * 4-float vectors per row).  Lane l of sin5lsb[k][j] holds
 * sin(2*pi*m / 2^(k+9)) for m = 4*j + l, m = 0..31: row 0 advances in steps
 * of 2*pi/512 and every following row halves the step, down to 2*pi/2^24 in
 * row 15.
 * NOTE(review): the name suggests entries are selected by the 5 least
 * significant bits of an index -- confirm against the code that reads it.
 */
vector float sin5lsb[16][8] __attribute__ ((aligned (128))) = {
{ (vector float) (0.0000000e+00, 1.2271538e-02, 2.4541229e-02, 3.6807224e-02),
(vector float) (4.9067676e-02, 6.1320737e-02, 7.3564567e-02, 8.5797310e-02),
(vector float) (9.8017141e-02, 1.1022221e-01, 1.2241068e-01, 1.3458070e-01),
(vector float) (1.4673047e-01, 1.5885815e-01, 1.7096189e-01, 1.8303989e-01),
(vector float) (1.9509032e-01, 2.0711137e-01, 2.1910124e-01, 2.3105811e-01),
(vector float) (2.4298018e-01, 2.5486565e-01, 2.6671275e-01, 2.7851969e-01),
(vector float) (2.9028466e-01, 3.0200595e-01, 3.1368175e-01, 3.2531029e-01),
(vector float) (3.3688986e-01, 3.4841868e-01, 3.5989505e-01, 3.7131721e-01)},
{ (vector float) (0.0000000e+00, 6.1358847e-03, 1.2271538e-02, 1.8406730e-02),
(vector float) (2.4541229e-02, 3.0674804e-02, 3.6807224e-02, 4.2938258e-02),
(vector float) (4.9067676e-02, 5.5195246e-02, 6.1320737e-02, 6.7443922e-02),
(vector float) (7.3564567e-02, 7.9682440e-02, 8.5797310e-02, 9.1908954e-02),
(vector float) (9.8017141e-02, 1.0412163e-01, 1.1022221e-01, 1.1631863e-01),
(vector float) (1.2241068e-01, 1.2849811e-01, 1.3458070e-01, 1.4065824e-01),
(vector float) (1.4673047e-01, 1.5279719e-01, 1.5885815e-01, 1.6491312e-01),
(vector float) (1.7096189e-01, 1.7700422e-01, 1.8303989e-01, 1.8906866e-01)},
{ (vector float) (0.0000000e+00, 3.0679568e-03, 6.1358847e-03, 9.2037544e-03),
(vector float) (1.2271538e-02, 1.5339206e-02, 1.8406730e-02, 2.1474080e-02),
(vector float) (2.4541229e-02, 2.7608145e-02, 3.0674804e-02, 3.3741172e-02),
(vector float) (3.6807224e-02, 3.9872926e-02, 4.2938258e-02, 4.6003181e-02),
(vector float) (4.9067676e-02, 5.2131705e-02, 5.5195246e-02, 5.8258265e-02),
(vector float) (6.1320737e-02, 6.4382628e-02, 6.7443922e-02, 7.0504576e-02),
(vector float) (7.3564567e-02, 7.6623864e-02, 7.9682440e-02, 8.2740262e-02),
(vector float) (8.5797310e-02, 8.8853553e-02, 9.1908954e-02, 9.4963498e-02)},
{ (vector float) (0.0000000e+00, 1.5339801e-03, 3.0679568e-03, 4.6019261e-03),
(vector float) (6.1358847e-03, 7.6698288e-03, 9.2037544e-03, 1.0737659e-02),
(vector float) (1.2271538e-02, 1.3805388e-02, 1.5339206e-02, 1.6872987e-02),
(vector float) (1.8406730e-02, 1.9940428e-02, 2.1474080e-02, 2.3007682e-02),
(vector float) (2.4541229e-02, 2.6074719e-02, 2.7608145e-02, 2.9141508e-02),
(vector float) (3.0674804e-02, 3.2208025e-02, 3.3741172e-02, 3.5274237e-02),
(vector float) (3.6807224e-02, 3.8340122e-02, 3.9872926e-02, 4.1405641e-02),
(vector float) (4.2938258e-02, 4.4470772e-02, 4.6003181e-02, 4.7535483e-02)},
{ (vector float) (0.0000000e+00, 7.6699030e-04, 1.5339801e-03, 2.3009691e-03),
(vector float) (3.0679568e-03, 3.8349426e-03, 4.6019261e-03, 5.3689070e-03),
(vector float) (6.1358847e-03, 6.9028586e-03, 7.6698288e-03, 8.4367944e-03),
(vector float) (9.2037544e-03, 9.9707097e-03, 1.0737659e-02, 1.1504602e-02),
(vector float) (1.2271538e-02, 1.3038468e-02, 1.3805388e-02, 1.4572302e-02),
(vector float) (1.5339206e-02, 1.6106103e-02, 1.6872987e-02, 1.7639864e-02),
(vector float) (1.8406730e-02, 1.9173585e-02, 1.9940428e-02, 2.0707261e-02),
(vector float) (2.1474080e-02, 2.2240888e-02, 2.3007682e-02, 2.3774462e-02)},
{ (vector float) (0.0000000e+00, 3.8349518e-04, 7.6699030e-04, 1.1504854e-03),
(vector float) (1.5339801e-03, 1.9174748e-03, 2.3009691e-03, 2.6844630e-03),
(vector float) (3.0679568e-03, 3.4514500e-03, 3.8349426e-03, 4.2184344e-03),
(vector float) (4.6019261e-03, 4.9854168e-03, 5.3689070e-03, 5.7523963e-03),
(vector float) (6.1358847e-03, 6.5193721e-03, 6.9028586e-03, 7.2863442e-03),
(vector float) (7.6698288e-03, 8.0533121e-03, 8.4367944e-03, 8.8202748e-03),
(vector float) (9.2037544e-03, 9.5872330e-03, 9.9707097e-03, 1.0354185e-02),
(vector float) (1.0737659e-02, 1.1121131e-02, 1.1504602e-02, 1.1888071e-02)},
{ (vector float) (0.0000000e+00, 1.9174760e-04, 3.8349518e-04, 5.7524274e-04),
(vector float) (7.6699030e-04, 9.5873786e-04, 1.1504854e-03, 1.3422328e-03),
(vector float) (1.5339801e-03, 1.7257276e-03, 1.9174748e-03, 2.1092221e-03),
(vector float) (2.3009691e-03, 2.4927163e-03, 2.6844630e-03, 2.8762100e-03),
(vector float) (3.0679568e-03, 3.2597035e-03, 3.4514500e-03, 3.6431963e-03),
(vector float) (3.8349426e-03, 4.0266886e-03, 4.2184344e-03, 4.4101803e-03),
(vector float) (4.6019261e-03, 4.7936714e-03, 4.9854168e-03, 5.1771621e-03),
(vector float) (5.3689070e-03, 5.5606519e-03, 5.7523963e-03, 5.9441407e-03)},
{ (vector float) (0.0000000e+00, 9.5873802e-05, 1.9174760e-04, 2.8762140e-04),
(vector float) (3.8349518e-04, 4.7936899e-04, 5.7524274e-04, 6.7111652e-04),
(vector float) (7.6699030e-04, 8.6286408e-04, 9.5873786e-04, 1.0546116e-03),
(vector float) (1.1504854e-03, 1.2463591e-03, 1.3422328e-03, 1.4381065e-03),
(vector float) (1.5339801e-03, 1.6298539e-03, 1.7257276e-03, 1.8216012e-03),
(vector float) (1.9174748e-03, 2.0133485e-03, 2.1092221e-03, 2.2050955e-03),
(vector float) (2.3009691e-03, 2.3968427e-03, 2.4927163e-03, 2.5885897e-03),
(vector float) (2.6844630e-03, 2.7803367e-03, 2.8762100e-03, 2.9720834e-03)},
{ (vector float) (0.0000000e+00, 4.7936901e-05, 9.5873802e-05, 1.4381070e-04),
(vector float) (1.9174760e-04, 2.3968449e-04, 2.8762140e-04, 3.3555829e-04),
(vector float) (3.8349518e-04, 4.3143210e-04, 4.7936899e-04, 5.2730588e-04),
(vector float) (5.7524274e-04, 6.2317966e-04, 6.7111652e-04, 7.1905344e-04),
(vector float) (7.6699030e-04, 8.1492722e-04, 8.6286408e-04, 9.1080094e-04),
(vector float) (9.5873786e-04, 1.0066747e-03, 1.0546116e-03, 1.1025484e-03),
(vector float) (1.1504854e-03, 1.1984222e-03, 1.2463591e-03, 1.2942959e-03),
(vector float) (1.3422328e-03, 1.3901696e-03, 1.4381065e-03, 1.4860433e-03)},
{ (vector float) (0.0000000e+00, 2.3968450e-05, 4.7936901e-05, 7.1905350e-05),
(vector float) (9.5873802e-05, 1.1984225e-04, 1.4381070e-04, 1.6777914e-04),
(vector float) (1.9174760e-04, 2.1571605e-04, 2.3968449e-04, 2.6365294e-04),
(vector float) (2.8762140e-04, 3.1158983e-04, 3.3555829e-04, 3.5952675e-04),
(vector float) (3.8349518e-04, 4.0746364e-04, 4.3143210e-04, 4.5540053e-04),
(vector float) (4.7936899e-04, 5.0333742e-04, 5.2730588e-04, 5.5127434e-04),
(vector float) (5.7524274e-04, 5.9921120e-04, 6.2317966e-04, 6.4714812e-04),
(vector float) (6.7111652e-04, 6.9508498e-04, 7.1905344e-04, 7.4302190e-04)},
{ (vector float) (0.0000000e+00, 1.1984225e-05, 2.3968450e-05, 3.5952675e-05),
(vector float) (4.7936901e-05, 5.9921123e-05, 7.1905350e-05, 8.3889572e-05),
(vector float) (9.5873802e-05, 1.0785802e-04, 1.1984225e-04, 1.3182647e-04),
(vector float) (1.4381070e-04, 1.5579493e-04, 1.6777914e-04, 1.7976337e-04),
(vector float) (1.9174760e-04, 2.0373182e-04, 2.1571605e-04, 2.2770026e-04),
(vector float) (2.3968449e-04, 2.5166871e-04, 2.6365294e-04, 2.7563717e-04),
(vector float) (2.8762140e-04, 2.9960563e-04, 3.1158983e-04, 3.2357406e-04),
(vector float) (3.3555829e-04, 3.4754252e-04, 3.5952675e-04, 3.7151098e-04)},
{ (vector float) (0.0000000e+00, 5.9921126e-06, 1.1984225e-05, 1.7976337e-05),
(vector float) (2.3968450e-05, 2.9960562e-05, 3.5952675e-05, 4.1944786e-05),
(vector float) (4.7936901e-05, 5.3929012e-05, 5.9921123e-05, 6.5913235e-05),
(vector float) (7.1905350e-05, 7.7897465e-05, 8.3889572e-05, 8.9881687e-05),
(vector float) (9.5873802e-05, 1.0186591e-04, 1.0785802e-04, 1.1385014e-04),
(vector float) (1.1984225e-04, 1.2583435e-04, 1.3182647e-04, 1.3781858e-04),
(vector float) (1.4381070e-04, 1.4980281e-04, 1.5579493e-04, 1.6178703e-04),
(vector float) (1.6777914e-04, 1.7377126e-04, 1.7976337e-04, 1.8575549e-04)},
{ (vector float) (0.0000000e+00, 2.9960563e-06, 5.9921126e-06, 8.9881687e-06),
(vector float) (1.1984225e-05, 1.4980281e-05, 1.7976337e-05, 2.0972393e-05),
(vector float) (2.3968450e-05, 2.6964506e-05, 2.9960562e-05, 3.2956617e-05),
(vector float) (3.5952675e-05, 3.8948732e-05, 4.1944786e-05, 4.4940844e-05),
(vector float) (4.7936901e-05, 5.0932955e-05, 5.3929012e-05, 5.6925070e-05),
(vector float) (5.9921123e-05, 6.2917177e-05, 6.5913235e-05, 6.8909292e-05),
(vector float) (7.1905350e-05, 7.4901407e-05, 7.7897465e-05, 8.0893515e-05),
(vector float) (8.3889572e-05, 8.6885630e-05, 8.9881687e-05, 9.2877744e-05)},
{ (vector float) (0.0000000e+00, 1.4980282e-06, 2.9960563e-06, 4.4940844e-06),
(vector float) (5.9921126e-06, 7.4901404e-06, 8.9881687e-06, 1.0486197e-05),
(vector float) (1.1984225e-05, 1.3482253e-05, 1.4980281e-05, 1.6478309e-05),
(vector float) (1.7976337e-05, 1.9474366e-05, 2.0972393e-05, 2.2470422e-05),
(vector float) (2.3968450e-05, 2.5466477e-05, 2.6964506e-05, 2.8462535e-05),
(vector float) (2.9960562e-05, 3.1458589e-05, 3.2956617e-05, 3.4454646e-05),
(vector float) (3.5952675e-05, 3.7450704e-05, 3.8948732e-05, 4.0446757e-05),
(vector float) (4.1944786e-05, 4.3442815e-05, 4.4940844e-05, 4.6438872e-05)},
{ (vector float) (0.0000000e+00, 7.4901408e-07, 1.4980282e-06, 2.2470422e-06),
(vector float) (2.9960563e-06, 3.7450702e-06, 4.4940844e-06, 5.2430983e-06),
(vector float) (5.9921126e-06, 6.7411265e-06, 7.4901404e-06, 8.2391543e-06),
(vector float) (8.9881687e-06, 9.7371831e-06, 1.0486197e-05, 1.1235211e-05),
(vector float) (1.1984225e-05, 1.2733239e-05, 1.3482253e-05, 1.4231267e-05),
(vector float) (1.4980281e-05, 1.5729294e-05, 1.6478309e-05, 1.7227323e-05),
(vector float) (1.7976337e-05, 1.8725352e-05, 1.9474366e-05, 2.0223379e-05),
(vector float) (2.0972393e-05, 2.1721407e-05, 2.2470422e-05, 2.3219436e-05)},
{ (vector float) (0.0000000e+00, 3.7450704e-07, 7.4901408e-07, 1.1235211e-06),
(vector float) (1.4980282e-06, 1.8725351e-06, 2.2470422e-06, 2.6215491e-06),
(vector float) (2.9960563e-06, 3.3705633e-06, 3.7450702e-06, 4.1195772e-06),
(vector float) (4.4940844e-06, 4.8685915e-06, 5.2430983e-06, 5.6176054e-06),
(vector float) (5.9921126e-06, 6.3666193e-06, 6.7411265e-06, 7.1156337e-06),
(vector float) (7.4901404e-06, 7.8646472e-06, 8.2391543e-06, 8.6136615e-06),
(vector float) (8.9881687e-06, 9.3626759e-06, 9.7371831e-06, 1.0111689e-05),
(vector float) (1.0486197e-05, 1.0860704e-05, 1.1235211e-05, 1.1609718e-05)}};
/*
 * cos5lsb - table of cosines of small angles, 16 rows of 32 values (eight
 * 4-float vectors per row).  Lane l of cos5lsb[k][j] holds
 * cos(2*pi*m / 2^(k+9)) for m = 4*j + l, m = 0..31: row 0 advances in steps
 * of 2*pi/512 and every following row halves the step.  From row 11 onward
 * every entry is 1.0 as listed here.
 * NOTE(review): the name suggests entries are selected by the 5 least
 * significant bits of an index -- confirm against the code that reads it.
 */
vector float cos5lsb[16][8] __attribute__ ((aligned (128))) = {
{ (vector float) (1.0000000e+00, 9.9992472e-01, 9.9969882e-01, 9.9932235e-01),
(vector float) (9.9879545e-01, 9.9811810e-01, 9.9729043e-01, 9.9631262e-01),
(vector float) (9.9518472e-01, 9.9390697e-01, 9.9247956e-01, 9.9090266e-01),
(vector float) (9.8917651e-01, 9.8730141e-01, 9.8527765e-01, 9.8310548e-01),
(vector float) (9.8078525e-01, 9.7831738e-01, 9.7570211e-01, 9.7293997e-01),
(vector float) (9.7003126e-01, 9.6697646e-01, 9.6377605e-01, 9.6043050e-01),
(vector float) (9.5694035e-01, 9.5330602e-01, 9.4952816e-01, 9.4560730e-01),
(vector float) (9.4154406e-01, 9.3733901e-01, 9.3299282e-01, 9.2850608e-01)},
{ (vector float) (1.0000000e+00, 9.9998116e-01, 9.9992472e-01, 9.9983060e-01),
(vector float) (9.9969882e-01, 9.9952942e-01, 9.9932235e-01, 9.9907774e-01),
(vector float) (9.9879545e-01, 9.9847555e-01, 9.9811810e-01, 9.9772304e-01),
(vector float) (9.9729043e-01, 9.9682027e-01, 9.9631262e-01, 9.9576741e-01),
(vector float) (9.9518472e-01, 9.9456459e-01, 9.9390697e-01, 9.9321193e-01),
(vector float) (9.9247956e-01, 9.9170977e-01, 9.9090266e-01, 9.9005818e-01),
(vector float) (9.8917651e-01, 9.8825759e-01, 9.8730141e-01, 9.8630810e-01),
(vector float) (9.8527765e-01, 9.8421007e-01, 9.8310548e-01, 9.8196387e-01)},
{ (vector float) (1.0000000e+00, 9.9999529e-01, 9.9998116e-01, 9.9995762e-01),
(vector float) (9.9992472e-01, 9.9988234e-01, 9.9983060e-01, 9.9976939e-01),
(vector float) (9.9969882e-01, 9.9961883e-01, 9.9952942e-01, 9.9943060e-01),
(vector float) (9.9932235e-01, 9.9920475e-01, 9.9907774e-01, 9.9894130e-01),
(vector float) (9.9879545e-01, 9.9864024e-01, 9.9847555e-01, 9.9830157e-01),
(vector float) (9.9811810e-01, 9.9792528e-01, 9.9772304e-01, 9.9751145e-01),
(vector float) (9.9729043e-01, 9.9706006e-01, 9.9682027e-01, 9.9657112e-01),
(vector float) (9.9631262e-01, 9.9604470e-01, 9.9576741e-01, 9.9548078e-01)},
{ (vector float) (1.0000000e+00, 9.9999881e-01, 9.9999529e-01, 9.9998939e-01),
(vector float) (9.9998116e-01, 9.9997061e-01, 9.9995762e-01, 9.9994236e-01),
(vector float) (9.9992472e-01, 9.9990469e-01, 9.9988234e-01, 9.9985766e-01),
(vector float) (9.9983060e-01, 9.9980116e-01, 9.9976939e-01, 9.9973530e-01),
(vector float) (9.9969882e-01, 9.9966002e-01, 9.9961883e-01, 9.9957532e-01),
(vector float) (9.9952942e-01, 9.9948120e-01, 9.9943060e-01, 9.9937767e-01),
(vector float) (9.9932235e-01, 9.9926478e-01, 9.9920475e-01, 9.9914241e-01),
(vector float) (9.9907774e-01, 9.9901068e-01, 9.9894130e-01, 9.9886954e-01)},
{ (vector float) (1.0000000e+00, 9.9999970e-01, 9.9999881e-01, 9.9999738e-01),
(vector float) (9.9999529e-01, 9.9999267e-01, 9.9998939e-01, 9.9998558e-01),
(vector float) (9.9998116e-01, 9.9997616e-01, 9.9997061e-01, 9.9996442e-01),
(vector float) (9.9995762e-01, 9.9995029e-01, 9.9994236e-01, 9.9993384e-01),
(vector float) (9.9992472e-01, 9.9991500e-01, 9.9990469e-01, 9.9989384e-01),
(vector float) (9.9988234e-01, 9.9987030e-01, 9.9985766e-01, 9.9984443e-01),
(vector float) (9.9983060e-01, 9.9981618e-01, 9.9980116e-01, 9.9978560e-01),
(vector float) (9.9976939e-01, 9.9975264e-01, 9.9973530e-01, 9.9971735e-01)},
{ (vector float) (1.0000000e+00, 9.9999994e-01, 9.9999970e-01, 9.9999934e-01),
(vector float) (9.9999881e-01, 9.9999815e-01, 9.9999738e-01, 9.9999642e-01),
(vector float) (9.9999529e-01, 9.9999404e-01, 9.9999267e-01, 9.9999112e-01),
(vector float) (9.9998939e-01, 9.9998760e-01, 9.9998558e-01, 9.9998343e-01),
(vector float) (9.9998116e-01, 9.9997872e-01, 9.9997616e-01, 9.9997348e-01),
(vector float) (9.9997061e-01, 9.9996758e-01, 9.9996442e-01, 9.9996108e-01),
(vector float) (9.9995762e-01, 9.9995404e-01, 9.9995029e-01, 9.9994642e-01),
(vector float) (9.9994236e-01, 9.9993813e-01, 9.9993384e-01, 9.9992931e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 9.9999994e-01, 9.9999982e-01),
(vector float) (9.9999970e-01, 9.9999952e-01, 9.9999934e-01, 9.9999911e-01),
(vector float) (9.9999881e-01, 9.9999851e-01, 9.9999815e-01, 9.9999779e-01),
(vector float) (9.9999738e-01, 9.9999690e-01, 9.9999642e-01, 9.9999589e-01),
(vector float) (9.9999529e-01, 9.9999470e-01, 9.9999404e-01, 9.9999338e-01),
(vector float) (9.9999267e-01, 9.9999189e-01, 9.9999112e-01, 9.9999028e-01),
(vector float) (9.9998939e-01, 9.9998850e-01, 9.9998760e-01, 9.9998659e-01),
(vector float) (9.9998558e-01, 9.9998456e-01, 9.9998343e-01, 9.9998236e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999988e-01, 9.9999982e-01, 9.9999976e-01),
(vector float) (9.9999970e-01, 9.9999964e-01, 9.9999952e-01, 9.9999946e-01),
(vector float) (9.9999934e-01, 9.9999923e-01, 9.9999911e-01, 9.9999899e-01),
(vector float) (9.9999881e-01, 9.9999869e-01, 9.9999851e-01, 9.9999833e-01),
(vector float) (9.9999815e-01, 9.9999797e-01, 9.9999779e-01, 9.9999756e-01),
(vector float) (9.9999738e-01, 9.9999714e-01, 9.9999690e-01, 9.9999666e-01),
(vector float) (9.9999642e-01, 9.9999613e-01, 9.9999589e-01, 9.9999559e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999988e-01, 9.9999988e-01, 9.9999988e-01),
(vector float) (9.9999982e-01, 9.9999982e-01, 9.9999976e-01, 9.9999976e-01),
(vector float) (9.9999970e-01, 9.9999964e-01, 9.9999964e-01, 9.9999958e-01),
(vector float) (9.9999952e-01, 9.9999946e-01, 9.9999946e-01, 9.9999940e-01),
(vector float) (9.9999934e-01, 9.9999928e-01, 9.9999923e-01, 9.9999917e-01),
(vector float) (9.9999911e-01, 9.9999905e-01, 9.9999899e-01, 9.9999887e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999988e-01, 9.9999988e-01),
(vector float) (9.9999988e-01, 9.9999988e-01, 9.9999988e-01, 9.9999982e-01),
(vector float) (9.9999982e-01, 9.9999982e-01, 9.9999982e-01, 9.9999976e-01),
(vector float) (9.9999976e-01, 9.9999976e-01, 9.9999976e-01, 9.9999970e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)}};
/*
 * sin256[k] = sin(pi * k / 256) for k = 0..255, in single precision.
 * The table is symmetric about k = 128 (sin256[k] == sin256[256 - k]).
 * Eight entries per row; the trailing comment gives the index of the
 * first entry in the row.
 */
float sin256[256] __attribute__ ((aligned (128))) = {
  0.0000000e+00, 1.2271538e-02, 2.4541229e-02, 3.6807224e-02, 4.9067676e-02, 6.1320737e-02, 7.3564567e-02, 8.5797310e-02, /*   0 */
  9.8017141e-02, 1.1022221e-01, 1.2241068e-01, 1.3458070e-01, 1.4673047e-01, 1.5885815e-01, 1.7096189e-01, 1.8303989e-01, /*   8 */
  1.9509032e-01, 2.0711137e-01, 2.1910124e-01, 2.3105811e-01, 2.4298018e-01, 2.5486565e-01, 2.6671275e-01, 2.7851969e-01, /*  16 */
  2.9028466e-01, 3.0200595e-01, 3.1368175e-01, 3.2531029e-01, 3.3688986e-01, 3.4841868e-01, 3.5989505e-01, 3.7131721e-01, /*  24 */
  3.8268343e-01, 3.9399204e-01, 4.0524131e-01, 4.1642955e-01, 4.2755508e-01, 4.3861625e-01, 4.4961134e-01, 4.6053872e-01, /*  32 */
  4.7139674e-01, 4.8218378e-01, 4.9289820e-01, 5.0353837e-01, 5.1410276e-01, 5.2458966e-01, 5.3499764e-01, 5.4532498e-01, /*  40 */
  5.5557024e-01, 5.6573182e-01, 5.7580817e-01, 5.8579785e-01, 5.9569931e-01, 6.0551107e-01, 6.1523157e-01, 6.2485951e-01, /*  48 */
  6.3439327e-01, 6.4383155e-01, 6.5317285e-01, 6.6241580e-01, 6.7155898e-01, 6.8060100e-01, 6.8954057e-01, 6.9837624e-01, /*  56 */
  7.0710677e-01, 7.1573085e-01, 7.2424710e-01, 7.3265427e-01, 7.4095112e-01, 7.4913639e-01, 7.5720882e-01, 7.6516724e-01, /*  64 */
  7.7301043e-01, 7.8073722e-01, 7.8834641e-01, 7.9583693e-01, 8.0320752e-01, 8.1045717e-01, 8.1758481e-01, 8.2458931e-01, /*  72 */
  8.3146960e-01, 8.3822471e-01, 8.4485358e-01, 8.5135520e-01, 8.5772860e-01, 8.6397284e-01, 8.7008697e-01, 8.7607008e-01, /*  80 */
  8.8192129e-01, 8.8763964e-01, 8.9322430e-01, 8.9867449e-01, 9.0398932e-01, 9.0916800e-01, 9.1420978e-01, 9.1911387e-01, /*  88 */
  9.2387950e-01, 9.2850608e-01, 9.3299282e-01, 9.3733901e-01, 9.4154406e-01, 9.4560730e-01, 9.4952816e-01, 9.5330602e-01, /*  96 */
  9.5694035e-01, 9.6043050e-01, 9.6377605e-01, 9.6697646e-01, 9.7003126e-01, 9.7293997e-01, 9.7570211e-01, 9.7831738e-01, /* 104 */
  9.8078525e-01, 9.8310548e-01, 9.8527765e-01, 9.8730141e-01, 9.8917651e-01, 9.9090266e-01, 9.9247956e-01, 9.9390697e-01, /* 112 */
  9.9518472e-01, 9.9631262e-01, 9.9729043e-01, 9.9811810e-01, 9.9879545e-01, 9.9932235e-01, 9.9969882e-01, 9.9992472e-01, /* 120 */
  1.0000000e+00, 9.9992472e-01, 9.9969882e-01, 9.9932235e-01, 9.9879545e-01, 9.9811810e-01, 9.9729043e-01, 9.9631262e-01, /* 128 */
  9.9518472e-01, 9.9390697e-01, 9.9247956e-01, 9.9090266e-01, 9.8917651e-01, 9.8730141e-01, 9.8527765e-01, 9.8310548e-01, /* 136 */
  9.8078525e-01, 9.7831738e-01, 9.7570211e-01, 9.7293997e-01, 9.7003126e-01, 9.6697646e-01, 9.6377605e-01, 9.6043050e-01, /* 144 */
  9.5694035e-01, 9.5330602e-01, 9.4952816e-01, 9.4560730e-01, 9.4154406e-01, 9.3733901e-01, 9.3299282e-01, 9.2850608e-01, /* 152 */
  9.2387950e-01, 9.1911387e-01, 9.1420978e-01, 9.0916800e-01, 9.0398932e-01, 8.9867449e-01, 8.9322430e-01, 8.8763964e-01, /* 160 */
  8.8192129e-01, 8.7607008e-01, 8.7008697e-01, 8.6397284e-01, 8.5772860e-01, 8.5135520e-01, 8.4485358e-01, 8.3822471e-01, /* 168 */
  8.3146960e-01, 8.2458931e-01, 8.1758481e-01, 8.1045717e-01, 8.0320752e-01, 7.9583693e-01, 7.8834641e-01, 7.8073722e-01, /* 176 */
  7.7301043e-01, 7.6516724e-01, 7.5720882e-01, 7.4913639e-01, 7.4095112e-01, 7.3265427e-01, 7.2424710e-01, 7.1573085e-01, /* 184 */
  7.0710677e-01, 6.9837624e-01, 6.8954057e-01, 6.8060100e-01, 6.7155898e-01, 6.6241580e-01, 6.5317285e-01, 6.4383155e-01, /* 192 */
  6.3439327e-01, 6.2485951e-01, 6.1523157e-01, 6.0551107e-01, 5.9569931e-01, 5.8579785e-01, 5.7580817e-01, 5.6573182e-01, /* 200 */
  5.5557024e-01, 5.4532498e-01, 5.3499764e-01, 5.2458966e-01, 5.1410276e-01, 5.0353837e-01, 4.9289820e-01, 4.8218378e-01, /* 208 */
  4.7139674e-01, 4.6053872e-01, 4.4961134e-01, 4.3861625e-01, 4.2755508e-01, 4.1642955e-01, 4.0524131e-01, 3.9399204e-01, /* 216 */
  3.8268343e-01, 3.7131721e-01, 3.5989505e-01, 3.4841868e-01, 3.3688986e-01, 3.2531029e-01, 3.1368175e-01, 3.0200595e-01, /* 224 */
  2.9028466e-01, 2.7851969e-01, 2.6671275e-01, 2.5486565e-01, 2.4298018e-01, 2.3105811e-01, 2.1910124e-01, 2.0711137e-01, /* 232 */
  1.9509032e-01, 1.8303989e-01, 1.7096189e-01, 1.5885815e-01, 1.4673047e-01, 1.3458070e-01, 1.2241068e-01, 1.1022221e-01, /* 240 */
  9.8017141e-02, 8.5797310e-02, 7.3564567e-02, 6.1320737e-02, 4.9067676e-02, 3.6807224e-02, 2.4541229e-02, 1.2271538e-02  /* 248 */
};
/*
 * cos256[k] = cos(pi * k / 256) for k = 0..255, in single precision.
 * The table is antisymmetric about k = 128 (cos256[k] == -cos256[256 - k]).
 *
 * Entry 128 is cos(pi/2) and is stored as an exact zero.  The generated
 * table carried 6.1230318e-17 there, which is only the rounding residue of
 * evaluating cos() at a double-precision approximation of pi/2; a tiny
 * non-zero real twiddle part can only perturb the butterfly result.
 */
float cos256[256] __attribute__ ((aligned (128))) = {
1.0000000e+00, 9.9992472e-01, 9.9969882e-01, 9.9932235e-01,
9.9879545e-01, 9.9811810e-01, 9.9729043e-01, 9.9631262e-01,
9.9518472e-01, 9.9390697e-01, 9.9247956e-01, 9.9090266e-01,
9.8917651e-01, 9.8730141e-01, 9.8527765e-01, 9.8310548e-01,
9.8078525e-01, 9.7831738e-01, 9.7570211e-01, 9.7293997e-01,
9.7003126e-01, 9.6697646e-01, 9.6377605e-01, 9.6043050e-01,
9.5694035e-01, 9.5330602e-01, 9.4952816e-01, 9.4560730e-01,
9.4154406e-01, 9.3733901e-01, 9.3299282e-01, 9.2850608e-01,
9.2387950e-01, 9.1911387e-01, 9.1420978e-01, 9.0916800e-01,
9.0398932e-01, 8.9867449e-01, 8.9322430e-01, 8.8763964e-01,
8.8192129e-01, 8.7607008e-01, 8.7008697e-01, 8.6397284e-01,
8.5772860e-01, 8.5135520e-01, 8.4485358e-01, 8.3822471e-01,
8.3146960e-01, 8.2458931e-01, 8.1758481e-01, 8.1045717e-01,
8.0320752e-01, 7.9583693e-01, 7.8834641e-01, 7.8073722e-01,
7.7301043e-01, 7.6516724e-01, 7.5720882e-01, 7.4913639e-01,
7.4095112e-01, 7.3265427e-01, 7.2424710e-01, 7.1573085e-01,
7.0710677e-01, 6.9837624e-01, 6.8954057e-01, 6.8060100e-01,
6.7155898e-01, 6.6241580e-01, 6.5317285e-01, 6.4383155e-01,
6.3439327e-01, 6.2485951e-01, 6.1523157e-01, 6.0551107e-01,
5.9569931e-01, 5.8579785e-01, 5.7580817e-01, 5.6573182e-01,
5.5557024e-01, 5.4532498e-01, 5.3499764e-01, 5.2458966e-01,
5.1410276e-01, 5.0353837e-01, 4.9289820e-01, 4.8218378e-01,
4.7139674e-01, 4.6053872e-01, 4.4961134e-01, 4.3861625e-01,
4.2755508e-01, 4.1642955e-01, 4.0524131e-01, 3.9399204e-01,
3.8268343e-01, 3.7131721e-01, 3.5989505e-01, 3.4841868e-01,
3.3688986e-01, 3.2531029e-01, 3.1368175e-01, 3.0200595e-01,
2.9028466e-01, 2.7851969e-01, 2.6671275e-01, 2.5486565e-01,
2.4298018e-01, 2.3105811e-01, 2.1910124e-01, 2.0711137e-01,
1.9509032e-01, 1.8303989e-01, 1.7096189e-01, 1.5885815e-01,
1.4673047e-01, 1.3458070e-01, 1.2241068e-01, 1.1022221e-01,
9.8017141e-02, 8.5797310e-02, 7.3564567e-02, 6.1320737e-02,
4.9067676e-02, 3.6807224e-02, 2.4541229e-02, 1.2271538e-02,
/* index 128: cos(pi/2), exact zero (see header comment) */
0.0000000e+00, -1.2271538e-02, -2.4541229e-02, -3.6807224e-02,
-4.9067676e-02, -6.1320737e-02, -7.3564567e-02, -8.5797310e-02,
-9.8017141e-02, -1.1022221e-01, -1.2241068e-01, -1.3458070e-01,
-1.4673047e-01, -1.5885815e-01, -1.7096189e-01, -1.8303989e-01,
-1.9509032e-01, -2.0711137e-01, -2.1910124e-01, -2.3105811e-01,
-2.4298018e-01, -2.5486565e-01, -2.6671275e-01, -2.7851969e-01,
-2.9028466e-01, -3.0200595e-01, -3.1368175e-01, -3.2531029e-01,
-3.3688986e-01, -3.4841868e-01, -3.5989505e-01, -3.7131721e-01,
-3.8268343e-01, -3.9399204e-01, -4.0524131e-01, -4.1642955e-01,
-4.2755508e-01, -4.3861625e-01, -4.4961134e-01, -4.6053872e-01,
-4.7139674e-01, -4.8218378e-01, -4.9289820e-01, -5.0353837e-01,
-5.1410276e-01, -5.2458966e-01, -5.3499764e-01, -5.4532498e-01,
-5.5557024e-01, -5.6573182e-01, -5.7580817e-01, -5.8579785e-01,
-5.9569931e-01, -6.0551107e-01, -6.1523157e-01, -6.2485951e-01,
-6.3439327e-01, -6.4383155e-01, -6.5317285e-01, -6.6241580e-01,
-6.7155898e-01, -6.8060100e-01, -6.8954057e-01, -6.9837624e-01,
-7.0710677e-01, -7.1573085e-01, -7.2424710e-01, -7.3265427e-01,
-7.4095112e-01, -7.4913639e-01, -7.5720882e-01, -7.6516724e-01,
-7.7301043e-01, -7.8073722e-01, -7.8834641e-01, -7.9583693e-01,
-8.0320752e-01, -8.1045717e-01, -8.1758481e-01, -8.2458931e-01,
-8.3146960e-01, -8.3822471e-01, -8.4485358e-01, -8.5135520e-01,
-8.5772860e-01, -8.6397284e-01, -8.7008697e-01, -8.7607008e-01,
-8.8192129e-01, -8.8763964e-01, -8.9322430e-01, -8.9867449e-01,
-9.0398932e-01, -9.0916800e-01, -9.1420978e-01, -9.1911387e-01,
-9.2387950e-01, -9.2850608e-01, -9.3299282e-01, -9.3733901e-01,
-9.4154406e-01, -9.4560730e-01, -9.4952816e-01, -9.5330602e-01,
-9.5694035e-01, -9.6043050e-01, -9.6377605e-01, -9.6697646e-01,
-9.7003126e-01, -9.7293997e-01, -9.7570211e-01, -9.7831738e-01,
-9.8078525e-01, -9.8310548e-01, -9.8527765e-01, -9.8730141e-01,
-9.8917651e-01, -9.9090266e-01, -9.9247956e-01, -9.9390697e-01,
-9.9518472e-01, -9.9631262e-01, -9.9729043e-01, -9.9811810e-01,
-9.9879545e-01, -9.9932235e-01, -9.9969882e-01, -9.9992472e-01};
/*
 * q_mirror_array[k] = (0, rev6(k), 0, rev6(k)) for k = 0..63, where rev6()
 * reverses the low six bits of k.  Words 1 and 3 line up with the two
 * address words of the DMA list vectors built by fill_mirror_write_list,
 * which ORs in 0x80/0x40 and rotates the result before adding it to the
 * base addresses.
 *
 * The attribute is spelled __attribute__ (the documented GCC keyword), in
 * line with every other aligned object in this file.
 */
vector unsigned int q_mirror_array[64] __attribute__ ((aligned (128))) =
{ (vector unsigned int) (0x0, 0x00, 0x0, 0x00),
(vector unsigned int) (0x0, 0x20, 0x0, 0x20),
(vector unsigned int) (0x0, 0x10, 0x0, 0x10),
(vector unsigned int) (0x0, 0x30, 0x0, 0x30),
(vector unsigned int) (0x0, 0x08, 0x0, 0x08),
(vector unsigned int) (0x0, 0x28, 0x0, 0x28),
(vector unsigned int) (0x0, 0x18, 0x0, 0x18),
(vector unsigned int) (0x0, 0x38, 0x0, 0x38),
(vector unsigned int) (0x0, 0x04, 0x0, 0x04),
(vector unsigned int) (0x0, 0x24, 0x0, 0x24),
(vector unsigned int) (0x0, 0x14, 0x0, 0x14),
(vector unsigned int) (0x0, 0x34, 0x0, 0x34),
(vector unsigned int) (0x0, 0x0c, 0x0, 0x0c),
(vector unsigned int) (0x0, 0x2c, 0x0, 0x2c),
(vector unsigned int) (0x0, 0x1c, 0x0, 0x1c),
(vector unsigned int) (0x0, 0x3c, 0x0, 0x3c),
(vector unsigned int) (0x0, 0x02, 0x0, 0x02),
(vector unsigned int) (0x0, 0x22, 0x0, 0x22),
(vector unsigned int) (0x0, 0x12, 0x0, 0x12),
(vector unsigned int) (0x0, 0x32, 0x0, 0x32),
(vector unsigned int) (0x0, 0x0a, 0x0, 0x0a),
(vector unsigned int) (0x0, 0x2a, 0x0, 0x2a),
(vector unsigned int) (0x0, 0x1a, 0x0, 0x1a),
(vector unsigned int) (0x0, 0x3a, 0x0, 0x3a),
(vector unsigned int) (0x0, 0x06, 0x0, 0x06),
(vector unsigned int) (0x0, 0x26, 0x0, 0x26),
(vector unsigned int) (0x0, 0x16, 0x0, 0x16),
(vector unsigned int) (0x0, 0x36, 0x0, 0x36),
(vector unsigned int) (0x0, 0x0e, 0x0, 0x0e),
(vector unsigned int) (0x0, 0x2e, 0x0, 0x2e),
(vector unsigned int) (0x0, 0x1e, 0x0, 0x1e),
(vector unsigned int) (0x0, 0x3e, 0x0, 0x3e),
(vector unsigned int) (0x0, 0x01, 0x0, 0x01),
(vector unsigned int) (0x0, 0x21, 0x0, 0x21),
(vector unsigned int) (0x0, 0x11, 0x0, 0x11),
(vector unsigned int) (0x0, 0x31, 0x0, 0x31),
(vector unsigned int) (0x0, 0x09, 0x0, 0x09),
(vector unsigned int) (0x0, 0x29, 0x0, 0x29),
(vector unsigned int) (0x0, 0x19, 0x0, 0x19),
(vector unsigned int) (0x0, 0x39, 0x0, 0x39),
(vector unsigned int) (0x0, 0x05, 0x0, 0x05),
(vector unsigned int) (0x0, 0x25, 0x0, 0x25),
(vector unsigned int) (0x0, 0x15, 0x0, 0x15),
(vector unsigned int) (0x0, 0x35, 0x0, 0x35),
(vector unsigned int) (0x0, 0x0d, 0x0, 0x0d),
(vector unsigned int) (0x0, 0x2d, 0x0, 0x2d),
(vector unsigned int) (0x0, 0x1d, 0x0, 0x1d),
(vector unsigned int) (0x0, 0x3d, 0x0, 0x3d),
(vector unsigned int) (0x0, 0x03, 0x0, 0x03),
(vector unsigned int) (0x0, 0x23, 0x0, 0x23),
(vector unsigned int) (0x0, 0x13, 0x0, 0x13),
(vector unsigned int) (0x0, 0x33, 0x0, 0x33),
(vector unsigned int) (0x0, 0x0b, 0x0, 0x0b),
(vector unsigned int) (0x0, 0x2b, 0x0, 0x2b),
(vector unsigned int) (0x0, 0x1b, 0x0, 0x1b),
(vector unsigned int) (0x0, 0x3b, 0x0, 0x3b),
(vector unsigned int) (0x0, 0x07, 0x0, 0x07),
(vector unsigned int) (0x0, 0x27, 0x0, 0x27),
(vector unsigned int) (0x0, 0x17, 0x0, 0x17),
(vector unsigned int) (0x0, 0x37, 0x0, 0x37),
(vector unsigned int) (0x0, 0x0f, 0x0, 0x0f),
(vector unsigned int) (0x0, 0x2f, 0x0, 0x2f),
(vector unsigned int) (0x0, 0x1f, 0x0, 0x1f),
(vector unsigned int) (0x0, 0x3f, 0x0, 0x3f)};
/*
 * mirror_array[i] is i with its eight bits reversed (bit 0 <-> bit 7,
 * bit 1 <-> bit 6, ...).  Eight entries per row; the trailing comment is
 * the index of the first entry in the row.
 */
unsigned int mirror_array[256] __attribute__ ((aligned (128))) = {
  0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, /* 0x00 */
  0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, /* 0x08 */
  0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, /* 0x10 */
  0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, /* 0x18 */
  0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, /* 0x20 */
  0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, /* 0x28 */
  0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, /* 0x30 */
  0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, /* 0x38 */
  0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, /* 0x40 */
  0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, /* 0x48 */
  0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, /* 0x50 */
  0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, /* 0x58 */
  0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, /* 0x60 */
  0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, /* 0x68 */
  0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, /* 0x70 */
  0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, /* 0x78 */
  0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, /* 0x80 */
  0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, /* 0x88 */
  0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, /* 0x90 */
  0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, /* 0x98 */
  0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, /* 0xa0 */
  0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, /* 0xa8 */
  0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, /* 0xb0 */
  0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, /* 0xb8 */
  0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, /* 0xc0 */
  0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, /* 0xc8 */
  0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, /* 0xd0 */
  0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, /* 0xd8 */
  0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, /* 0xe0 */
  0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, /* 0xe8 */
  0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, /* 0xf0 */
  0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff  /* 0xf8 */
};
/* Backing store for the working buffers; main() splits it into three
 * 16384-float regions reachable through sa[0..2]. */
volatile float bufs[49152] __attribute__ ((aligned (128)));
/* Start of each working buffer inside bufs (set up in main()). */
volatile float *sa[3];
/* Backing store for the DMA lists; main() splits it into four
 * 1024-word lists reachable through dma_list[0..3]. */
volatile unsigned int dma_list_bufs[4096] __attribute__ ((aligned (128)));
/* Start of each DMA list inside dma_list_bufs (set up in main()). */
volatile unsigned int *dma_list[4];
/* Main-memory addresses of the data arrays, copied from the control block
 * in main(): ar/ai and br/bi are used as real/imaginary address pairs by
 * the list-building macros. */
unsigned int ar, ai, br, bi;
/* control structure */
control_block cb __attribute__ ((aligned (128)));
/*
 * fill_dma_read_list(_r, _i, _list_index, _base)
 *
 * Fills dma_list[_list_index] with 256 vectors (4096 bytes).  Each vector
 * is (128, real_addr, 128, imag_addr): two (size, address) pairs, one for
 * the real array and one for the imaginary array.  The first pair of
 * addresses is _r + 4*_base and _i + 4*_base; every following vector
 * advances both addresses by 262144 bytes.
 *
 *   _r, _i       base addresses of the real / imaginary arrays
 *   _list_index  which dma_list[] entry to fill
 *   _base        element offset (multiplied by 4 to give a byte offset)
 *
 * The macro is a bare brace block, so it is invoked without a trailing
 * semicolon.  It uses an `int k` from the calling scope as loop counter.
 * Macro parameters are parenthesized so that an argument containing a
 * low-precedence operator cannot regroup with the "+4*b" arithmetic.
 */
#define fill_dma_read_list(_r, _i, _list_index, _base) { \
  int j, b; \
  vector unsigned int length = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int addend = (vector unsigned int) (0, 262144, 0, 262144); \
  vector unsigned int *p, w0, w1, w2, w3; \
  j = (_list_index); \
  b = (_base); \
  p = (vector unsigned int *) dma_list[j]; \
  /* first entry: sizes from `length`, addresses in words 1 and 3 */ \
  w0 = length; \
  w0 = spu_insert((_r)+4*b, w0, 1); \
  w0 = spu_insert((_i)+4*b, w0, 3); \
  /* next three entries, one stride apart */ \
  w1 = spu_add(w0, addend); \
  w2 = spu_add(w1, addend); \
  w3 = spu_add(w2, addend); \
  /* four entries are emitted per iteration, so step by 4 strides */ \
  addend = spu_add(addend, addend); \
  addend = spu_add(addend, addend); \
  for (k=0; k<64; ++k) { \
    p[0] = w0; \
    p[1] = w1; \
    p[2] = w2; \
    p[3] = w3; \
    w0 = spu_add(w0, addend); \
    w1 = spu_add(w1, addend); \
    p+=4; \
    w2 = spu_add(w2, addend); \
    w3 = spu_add(w3, addend); \
  } \
}
/*
 * fill_mirror_write_list(_list_index, _base, _shift)
 *
 * Fills dma_list[_list_index] with 256 vectors (4096 bytes) of the form
 * (128, real_addr, 128, imag_addr).  The addresses start from
 * ar + 4*_base and ai + 4*_base; entry n (n = 4*k + 0..3) is offset by a
 * bit-mirrored index: q_mirror_array[k], with 0x80 and/or 0x40 ORed in for
 * the three extra entries of the group, rotated left by _shift bits.
 *
 *   _list_index  which dma_list[] entry to fill
 *   _base        element offset (multiplied by 4 to give a byte offset)
 *   _shift       left-rotate count applied to the mirrored index
 *
 * The macro is a bare brace block, so it is invoked without a trailing
 * semicolon.  It uses an `int k` from the calling scope as loop counter.
 *
 * The list pointer is advanced after each group is stored; it is never
 * moved in front of the list, so no pointer before the start of
 * dma_list_bufs is formed when list 0 is filled.
 */
#define fill_mirror_write_list(_list_index, _base, _shift) { \
  int j, b; \
  vector unsigned int length = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int *p, w, m0, m1, m2, m3; \
  vector unsigned int b0 = (vector unsigned int) {0x0, 0x80, 0x0, 0x80}; \
  vector unsigned int b1 = (vector unsigned int) {0x0, 0x40, 0x0, 0x40}; \
  j = (_list_index); \
  b = (_base); \
  p = (vector unsigned int *) dma_list[j]; \
  /* template entry: sizes from `length`, base addresses in words 1, 3 */ \
  w = length; \
  w = spu_insert(ar+4*b, w, 1); \
  w = spu_insert(ai+4*b, w, 3); \
  for (k=0; k<64; ++k) { \
    /* mirrored indices of the four entries in this group */ \
    m0 = q_mirror_array[k]; \
    m1 = spu_or(m0, b0); \
    m2 = spu_or(m0, b1); \
    m3 = spu_or(m1, b1); \
    /* scale the indices to byte offsets */ \
    m0 = spu_rl(m0, (_shift)); \
    m1 = spu_rl(m1, (_shift)); \
    m2 = spu_rl(m2, (_shift)); \
    m3 = spu_rl(m3, (_shift)); \
    p[0] = spu_add(w, m0); \
    p[1] = spu_add(w, m1); \
    p[2] = spu_add(w, m2); \
    p[3] = spu_add(w, m3); \
    p+=4; \
  } \
}
/*
 * fill_dma_write_list(_list_index, _base)
 *
 * Fills dma_list[_list_index] with 256 vectors (4096 bytes) of the form
 * (128, real_addr, 128, imag_addr).  Addresses start at br + 4*_base and
 * bi + 4*_base and advance by 128 bytes from one vector to the next.
 *
 * The macro is a bare brace block, so it is invoked without a trailing
 * semicolon.  It uses an `int k` from the calling scope as loop counter.
 */
#define fill_dma_write_list(_list_index, _base) { \
  const int list_no = _list_index; \
  const int first = _base; \
  const vector unsigned int next = (vector unsigned int) (0, 128, 0, 128); \
  const vector unsigned int group = (vector unsigned int) (0, 512, 0, 512); \
  vector unsigned int *slot = (vector unsigned int *) dma_list[list_no]; \
  vector unsigned int e0 = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int e1, e2, e3; \
  /* first entry, then three more at consecutive 128-byte steps */ \
  e0 = spu_insert(br+4*first, e0, 1); \
  e0 = spu_insert(bi+4*first, e0, 3); \
  e1 = spu_add(e0, next); \
  e2 = spu_add(e1, next); \
  e3 = spu_add(e2, next); \
  /* emit four entries per pass, then move all four on by 4 steps */ \
  for (k = 0; k < 64; ++k, slot += 4) { \
    slot[0] = e0; \
    slot[1] = e1; \
    slot[2] = e2; \
    slot[3] = e3; \
    e0 = spu_add(e0, group); \
    e1 = spu_add(e1, group); \
    e2 = spu_add(e2, group); \
    e3 = spu_add(e3, group); \
  } \
}
/*
 * SHUFFLE
 *
 * Copies the 4096 vectors at qs to qd in a permuted order.  For every
 * fourth k1, four source groups are selected through mirror_array (at
 * vector offsets +0, +0x800, +0x400 and +0xc00); for each of the eight
 * vectors k2 in a group, the four real vectors and the four imaginary
 * vectors (8 vectors further on) are transposed as 4x4 matrices and
 * stored 128 vectors apart in qd.
 *
 * Uses qs, qd, k1 and k2 from the calling scope.
 */
#define SHUFFLE { \
  for (k1 = 0; k1 < 256; k1 += 4) { \
    vector float inr[4], outr[4], ini[4], outi[4]; \
    const int src0 = mirror_array[k1] << 4; \
    const int src1 = src0 + 0x800; \
    const int src2 = src0 + 0x400; \
    const int src3 = src0 + 0xc00; \
    const int dst_low = (k1 >> 2) & 0x7; \
    for (k2 = 0; k2 < 8; ++k2) { \
      const int dst = (((k2 << 9) + (k1 >> 1)) & 0xffff0) | dst_low; \
      /* gather: reals first, imaginaries sit 8 vectors later */ \
      inr[0] = qs[src0 + k2]; \
      inr[1] = qs[src1 + k2]; \
      inr[2] = qs[src2 + k2]; \
      inr[3] = qs[src3 + k2]; \
      ini[0] = qs[src0 + k2 + 8]; \
      ini[1] = qs[src1 + k2 + 8]; \
      ini[2] = qs[src2 + k2 + 8]; \
      ini[3] = qs[src3 + k2 + 8]; \
      _transpose_matrix4x4(outr, inr); \
      _transpose_matrix4x4(outi, ini); \
      /* scatter: rows of the transposed blocks land 128 vectors apart */ \
      qd[dst] = outr[0]; \
      qd[dst + 128] = outr[1]; \
      qd[dst + 256] = outr[2]; \
      qd[dst + 384] = outr[3]; \
      qd[dst + 8] = outi[0]; \
      qd[dst + 128 + 8] = outi[1]; \
      qd[dst + 256 + 8] = outi[2]; \
      qd[dst + 384 + 8] = outi[3]; \
    } \
  } \
}
/*
 * SCALE
 *
 * Multiplies each of the 4096 vectors at qs by 0.000244140625 (= 1/4096),
 * in place, eight vectors per pass.  Uses qs from the calling scope.
 */
#define SCALE { \
  const vector float scale_by = spu_splats(0.000244140625F); \
  int base; \
  for (base = 0; base < 4096; base += 8) { \
    vector float *blk = &qs[base]; \
    /* load and scale all eight before storing any */ \
    vector float v0 = spu_mul(blk[0], scale_by); \
    vector float v1 = spu_mul(blk[1], scale_by); \
    vector float v2 = spu_mul(blk[2], scale_by); \
    vector float v3 = spu_mul(blk[3], scale_by); \
    vector float v4 = spu_mul(blk[4], scale_by); \
    vector float v5 = spu_mul(blk[5], scale_by); \
    vector float v6 = spu_mul(blk[6], scale_by); \
    vector float v7 = spu_mul(blk[7], scale_by); \
    blk[0] = v0; \
    blk[1] = v1; \
    blk[2] = v2; \
    blk[3] = v3; \
    blk[4] = v4; \
    blk[5] = v5; \
    blk[6] = v6; \
    blk[7] = v7; \
  } \
}
/*
 * COMMON_CORE
 *
 * One 8-lane butterfly step shared by the process8192_* kernels.  With
 * jj = 2*j and kk = 2*k, where k = j + ((1<<lstart)>>2), it loads eight
 * real vectors y[jj+0..7] / y[kk+0..7] and eight imaginary vectors
 * y[jj+8..15] / y[kk+8..15], forms per lane n
 *
 *     t  = (wr_n + i*wi_n) * (yr_k_n + i*yi_k_n)
 *     yk = yj - t
 *     yj = yj + t
 *
 * and stores both results back to the same locations.
 *
 * The calling scope must provide y, j, k, lstart and, for n = 0..7, the
 * variables wr<n>, wi<n>, tr<n>, ti<n>, yr_j<n>, yi_j<n>, yr_k<n>, yi_k<n>.
 *
 * Each CC_* helper is one statement of the step for lane n; CC_EACH_LANE
 * repeats a helper for lanes 0..7, so the expansion performs the phases in
 * the order they are listed in COMMON_CORE.
 */
#define CC_EACH_LANE(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
#define CC_LD_RJ(n)  yr_j##n = y[jj+n];
#define CC_LD_IJ(n)  yi_j##n = y[jj+8+n];
#define CC_LD_RK(n)  yr_k##n = y[kk+n];
#define CC_LD_IK(n)  yi_k##n = y[kk+8+n];
#define CC_MUL_TR(n) tr##n = spu_mul(wi##n, yi_k##n);
#define CC_MUL_TI(n) ti##n = spu_mul(wi##n, yr_k##n);
#define CC_FMS_TR(n) tr##n = spu_msub(wr##n, yr_k##n, tr##n);
#define CC_FMA_TI(n) ti##n = spu_madd(wr##n, yi_k##n, ti##n);
#define CC_SUB_RK(n) yr_k##n = spu_sub(yr_j##n, tr##n);
#define CC_SUB_IK(n) yi_k##n = spu_sub(yi_j##n, ti##n);
#define CC_ADD_RJ(n) yr_j##n = spu_add(yr_j##n, tr##n);
#define CC_ADD_IJ(n) yi_j##n = spu_add(yi_j##n, ti##n);
#define CC_ST_RJ(n)  y[jj+n] = yr_j##n;
#define CC_ST_IJ(n)  y[jj+8+n] = yi_j##n;
#define CC_ST_RK(n)  y[kk+n] = yr_k##n;
#define CC_ST_IK(n)  y[kk+8+n] = yi_k##n;
#define COMMON_CORE { \
  int jj, kk; \
  jj = j << 1; \
  k = j + ((1<<lstart)>>2); \
  kk = k << 1; \
  /* load both halves of the butterfly */ \
  CC_EACH_LANE(CC_LD_RJ) \
  CC_EACH_LANE(CC_LD_IJ) \
  CC_EACH_LANE(CC_LD_RK) \
  CC_EACH_LANE(CC_LD_IK) \
  /* t = w * yk (complex multiply) */ \
  CC_EACH_LANE(CC_MUL_TR) \
  CC_EACH_LANE(CC_MUL_TI) \
  CC_EACH_LANE(CC_FMS_TR) \
  CC_EACH_LANE(CC_FMA_TI) \
  /* yk = yj - t, then yj = yj + t */ \
  CC_EACH_LANE(CC_SUB_RK) \
  CC_EACH_LANE(CC_SUB_IK) \
  CC_EACH_LANE(CC_ADD_RJ) \
  CC_EACH_LANE(CC_ADD_IJ) \
  /* store both halves back */ \
  CC_EACH_LANE(CC_ST_RJ) \
  CC_EACH_LANE(CC_ST_IJ) \
  CC_EACH_LANE(CC_ST_RK) \
  CC_EACH_LANE(CC_ST_IK) \
}
/*
 * process8192_0 - eight butterfly passes over one local buffer.
 *
 *   y     buffer of 4096 vectors laid out as alternating groups of eight
 *         real vectors and eight imaginary vectors (see COMMON_CORE)
 *   flag  the imaginary part of every twiddle factor is multiplied by
 *         -flag (presumably +1/-1 to select the transform direction;
 *         the value comes from the PPE mailbox - confirm with the caller)
 *
 * Pass LLs (0..7) processes 2^LLs groups.  Group i uses the single twiddle
 * factor (cos256[mi], sin256[mi]) with mi = mirror_array[i] restricted to
 * its top LLs bits, applied to all eight lanes.  The butterfly distance
 * starts at (1<<12)>>2 and halves every pass.
 *
 * The bit mask is built as an unsigned value per pass.  It takes the same
 * values 0xffffff00, 0xffffff80, ... 0xfffffffe as a sign-extending right
 * shift of a negative int would, without depending on that
 * implementation-defined behaviour.
 */
void process8192_0(vector float *y, int flag)
{
  int i, Ls, LLs, r, lstart;
  vector float vflag;
  vector float sn, cs;

  vflag = spu_splats((float) (-flag));
  lstart = 12;
  r = 4096;
  for (LLs = 0; LLs < 8; ++LLs) {
    const unsigned int LLs_mask = ~(0xffu >> LLs);

    Ls = (1 << LLs);
    for (i = 0; i < Ls; ++i) {
      int j, mi, irs;
      /* twiddle factors depend only on (LLs, i): set them up once */
      vector float wr0, wr1, wr2, wr3, wr4, wr5, wr6, wr7;
      vector float wi0, wi1, wi2, wi3, wi4, wi5, wi6, wi7;

      mi = mirror_array[i] & LLs_mask;
      cs = spu_splats(cos256[mi]);
      sn = spu_splats(sin256[mi]);
      wr0 = cs;
      wr1 = cs;
      wr2 = cs;
      wr3 = cs;
      wr4 = cs;
      wr5 = cs;
      wr6 = cs;
      wr7 = cs;
      wi0 = spu_mul(vflag, sn);
      wi1 = spu_mul(vflag, sn);
      wi2 = spu_mul(vflag, sn);
      wi3 = spu_mul(vflag, sn);
      wi4 = spu_mul(vflag, sn);
      wi5 = spu_mul(vflag, sn);
      wi6 = spu_mul(vflag, sn);
      wi7 = spu_mul(vflag, sn);

      irs = i*(r+r);
      for (j = irs>>2; j < (irs+r)>>2; j += 8) {
        int k;
        register vector float tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;
        register vector float ti0, ti1, ti2, ti3, ti4, ti5, ti6, ti7;
        register vector float yr_j0, yr_j1, yr_j2, yr_j3, yr_j4, yr_j5, yr_j6, yr_j7;
        register vector float yi_j0, yi_j1, yi_j2, yi_j3, yi_j4, yi_j5, yi_j6, yi_j7;
        register vector float yr_k0, yr_k1, yr_k2, yr_k3, yr_k4, yr_k5, yr_k6, yr_k7;
        register vector float yi_k0, yi_k1, yi_k2, yi_k3, yi_k4, yi_k5, yi_k6, yi_k7;
        COMMON_CORE
      }
    }
    --lstart;
    r >>= 1;
  }
}
/*
 * process8192_816 - eight butterfly passes with an extra leading rotation.
 *
 *   y         buffer of 4096 vectors laid out as alternating groups of
 *             eight real vectors and eight imaginary vectors
 *   flag      the imaginary part of every twiddle factor is multiplied by
 *             -flag (presumably +1/-1 for the transform direction -
 *             confirm with the caller)
 *   leadbits  the starting angle is pi * leadval / 2^leadbits; it also
 *             selects the rows of cos5lsb/sin5lsb (row LLs + leadbits - 8)
 *   leadval   numerator of the starting angle
 *
 * For pass LLs and group i the twiddle factor is the product of three
 * rotations, combined with the angle-sum formulas:
 *   - the leading angle (halved after every pass),
 *   - the table angle cos256/sin256[mirror_array[i] & mask],
 *   - the per-lane angles c5ptr[n]/s5ptr[n], n = 0..7.
 *
 * The bit mask is built as an unsigned value per pass (same values as a
 * sign-extending shift of 0xffffff00, without relying on
 * implementation-defined behaviour).  The scale factor angmulfac is only
 * needed for the initial angle, so it is no longer updated in the loop.
 */
#define P8192_TWIDDLE(n) \
  wr##n = spu_msub(cs, c5ptr[n], spu_mul(sn, s5ptr[n])); \
  wi##n = spu_mul(vflag, spu_madd(sn, c5ptr[n], spu_mul(cs, s5ptr[n])));
void process8192_816(vector float *y, int flag, int leadbits, int leadval)
{
  int i, Ls, LLs, r, lstart;
  vector float vflag;
  float snlv, cslv;
  vector float sn, cs;
  double angmulfac;
  double angle;

  vflag = spu_splats((float) (-flag));
  angmulfac = 1.0 / ((double) (((float) (1 << leadbits))));
  lstart = 12;
  r = 4096;
  angle = M_PI * leadval * angmulfac;
  for (LLs = 0; LLs < 8; ++LLs) {
    const unsigned int LLs_mask = ~(0xffu >> LLs);
    int idx;
    vector float *c5ptr, *s5ptr;

    snlv = (float) _sin(angle);
    cslv = (float) _cos(angle);
    Ls = (1 << LLs);
    idx = LLs + leadbits - 8;
    c5ptr = cos5lsb[idx];
    s5ptr = sin5lsb[idx];
    for (i = 0; i < Ls; ++i) {
      int j, mi, irs;
      /* twiddle factors depend only on (LLs, i): compute them once */
      vector float wr0, wr1, wr2, wr3, wr4, wr5, wr6, wr7;
      vector float wi0, wi1, wi2, wi3, wi4, wi5, wi6, wi7;

      mi = mirror_array[i] & LLs_mask;
      /* leading angle + table angle */
      cs = spu_splats(cslv * cos256[mi] - snlv * sin256[mi]);
      sn = spu_splats(snlv * cos256[mi] + cslv * sin256[mi]);
      /* ... + per-lane angle; imaginary part scaled by -flag */
      P8192_TWIDDLE(0)
      P8192_TWIDDLE(1)
      P8192_TWIDDLE(2)
      P8192_TWIDDLE(3)
      P8192_TWIDDLE(4)
      P8192_TWIDDLE(5)
      P8192_TWIDDLE(6)
      P8192_TWIDDLE(7)

      irs = i*(r+r);
      for (j = irs>>2; j < (irs+r)>>2; j += 8) {
        int k;
        register vector float tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;
        register vector float ti0, ti1, ti2, ti3, ti4, ti5, ti6, ti7;
        register vector float yr_j0, yr_j1, yr_j2, yr_j3, yr_j4, yr_j5, yr_j6, yr_j7;
        register vector float yi_j0, yi_j1, yi_j2, yi_j3, yi_j4, yi_j5, yi_j6, yi_j7;
        register vector float yr_k0, yr_k1, yr_k2, yr_k3, yr_k4, yr_k5, yr_k6, yr_k7;
        register vector float yi_k0, yi_k1, yi_k2, yi_k3, yi_k4, yi_k5, yi_k6, yi_k7;
        COMMON_CORE
      }
    }
    --lstart;
    r >>= 1;
    angle *= 0.5;
  }
}
#undef P8192_TWIDDLE
/*
 * stage1 - first pass over the data: gather from ar/ai, transform, scale,
 * shuffle, and write to br/bi.
 *
 *   spu   this SPE's number; selects its slice of every 256-block
 *   flag  passed through to process8192_0
 *
 * Processes 256 blocks.  Input blocks alternate between sa[0] and sa[1]
 * (DMA tags 20 and 21); the shuffled output always goes through sa[2]
 * (tag 22).  While block i-1 is being processed, block i is already being
 * fetched.  The last block (i == 256) has nothing left to prefetch, so the
 * prefetch is skipped and the rest of the loop body handles it; the output
 * buffer is waited on once before SHUFFLE overwrites it and once more
 * after the final put.
 *
 * The compute steps are only compiled in when FULLRUN is defined.
 */
void stage1(int spu, int flag)
{
  int i, k, k1, k2;
  vector float *qs, *qd;

  /* prime the pipeline with block 0 */
  fill_dma_read_list(ar, ai, 3, (0<<8)+(spu<<5))
  mfc_getl((void *) sa[0], 0, dma_list[3], 4096, 20, 0, 0);

  for (i = 1; i <= 256; ++i) {
    const int nxt = i & 1;       /* buffer being filled with block i     */
    const int cur = 1 - nxt;     /* buffer holding block i-1, to process */

    if (i < 256) {
      fill_dma_read_list(ar, ai, (3+2*i)&3, (i<<8)+(spu<<5))
      mfc_getlb((void *) sa[nxt], 0, dma_list[(3+2*i)&3], 4096, 20+nxt, 0, 0);
    }

    mfc_write_tag_mask((1<<(21-nxt)));
    mfc_read_tag_status_all(); /* Wait for block i-1 to arrive */
#ifdef FULLRUN
    process8192_0((vector float *) sa[cur], flag);
    qs = (vector float *) sa[cur];
    qd = (vector float *) sa[2];
    SCALE
#endif
    mfc_write_tag_mask((1<<22));
    mfc_read_tag_status_all(); /* Wait for the previous put from sa[2] */
#ifdef FULLRUN
    SHUFFLE
#endif
    fill_dma_write_list((2*i)&3, ((i-1)<<16)+(spu<<13))
    mfc_putl((void *) sa[2], 0, dma_list[(2*i)&3], 4096, 22, 0, 0);
  }

  mfc_write_tag_mask((1<<22));
  mfc_read_tag_status_all(); /* Wait for the final put to complete */
}
/*
 * stage2 - second pass: gather from br/bi, transform with
 * process8192_816(..., 8, spu<<5), and scatter to ar/ai through a
 * bit-mirrored write list (rotate count 10).
 *
 *   spu   this SPE's number
 *   flag  passed through to process8192_816
 *
 * Processes 256 blocks, alternating between sa[0] and sa[1] (DMA tags 20
 * and 21).  Block i is fetched while block i-1 is processed and written
 * back from its own buffer with the same tag.  The last pass (i == 256)
 * only skips the prefetch.
 */
void stage2(int spu, int flag)
{
  int i, k;

  /* prime the pipeline with block 0 */
  fill_dma_read_list(br, bi, 3, (0<<8)+(spu<<5))
  mfc_getl((void *) sa[0], 0, dma_list[3], 4096, 20, 0, 0);

  for (i = 1; i <= 256; ++i) {
    const int nxt = i & 1;       /* buffer being filled with block i     */
    const int cur = 1 - nxt;     /* buffer holding block i-1, to process */

    if (i < 256) {
      fill_dma_read_list(br, bi, (3+2*i)&3, (i<<8)+(spu<<5))
      mfc_getlb((void *) sa[nxt], 0, dma_list[(3+2*i)&3], 4096, 20+nxt, 0, 0);
    }

    mfc_write_tag_mask((1<<(21-nxt)));
    mfc_read_tag_status_all(); /* Wait for DMA to complete */
#ifdef FULLRUN
    process8192_816((vector float *) sa[cur], flag, 8, (spu<<5));
#endif
    fill_mirror_write_list((2*i)&3, ((i-1)<<16)+(spu<<5), 10)
    mfc_putl((void *) sa[cur], 0, dma_list[(2*i)&3], 4096, 21-nxt, 0, 0);
  }

  mfc_write_tag_mask((1<<21));
  mfc_read_tag_status_all(); /* Wait for DMA to complete */
}
/*
 * stage3 - third pass: gather from ar/ai, transform with
 * process8192_816(..., 16, ((i-1)<<8)|(spu<<5)), and scatter back to
 * ar/ai through a bit-mirrored write list (rotate count 18).
 *
 *   spu   this SPE's number
 *   flag  passed through to process8192_816
 *
 * Same double-buffered pipeline as stage2: sa[0]/sa[1] with DMA tags
 * 20/21, block i fetched while block i-1 is processed and written back.
 * The last pass (i == 256) only skips the prefetch.
 */
void stage3(int spu, int flag)
{
  int i, k;

  /* prime the pipeline with block 0 */
  fill_dma_read_list(ar, ai, 3, (0<<8)+(spu<<5))
  mfc_getl((void *) sa[0], 0, dma_list[3], 4096, 20, 0, 0);

  for (i = 1; i <= 256; ++i) {
    const int nxt = i & 1;       /* buffer being filled with block i     */
    const int cur = 1 - nxt;     /* buffer holding block i-1, to process */

    if (i < 256) {
      fill_dma_read_list(ar, ai, (3+2*i)&3, (i<<8)+(spu<<5))
      mfc_getlb((void *) sa[nxt], 0, dma_list[(3+2*i)&3], 4096, 20+nxt, 0, 0);
    }

    mfc_write_tag_mask((1<<(21-nxt)));
    mfc_read_tag_status_all(); /* Wait for DMA to complete */
#ifdef FULLRUN
    process8192_816((vector float *) sa[cur], flag, 16, ((i-1)<<8)|(spu<<5));
#endif
    fill_mirror_write_list((2*i)&3, ((i-1)<<8)+(spu<<5), 18)
    mfc_putl((void *) sa[cur], 0, dma_list[(2*i)&3], 4096, 21-nxt, 0, 0);
  }

  mfc_write_tag_mask((1<<21));
  mfc_read_tag_status_all(); /* Wait for DMA to complete */
}
/*
 * SPE entry point.
 *
 *   spuid  not used here
 *   argp   argp.ui[1] is the main-memory address of the control block
 *   envp   not used here
 *
 * Sets up the local buffers and DMA lists, fetches the control block, and
 * then serves requests from the inbound mailbox: each non-zero word starts
 * the next stage (stage1, stage2, stage3, in that order) with that word as
 * its flag, and 7 is written to the outbound mailbox when the stage is
 * done.  A zero word read at the top of the cycle ends the program.
 *
 * Returns 0.
 */
int main(int spuid, addr64 argp, addr64 envp) {
  volatile int foo;
  int n;

  /* assign three sets of local buffers, each having 8192 reals and 8192 imaginaries */
  for (n = 0; n < 3; ++n)
    sa[n] = &bufs[n * 16384];

  /* assign four sets of DMA buffers to load and store data */
  for (n = 0; n < 4; ++n)
    dma_list[n] = &dma_list_bufs[n * 1024];

  /* DMA control block information from system memory. */
  mfc_get(&cb, argp.ui[1], sizeof(cb), 31, 0, 0);
  /* unsigned shift: 1<<31 would overflow a signed int */
  mfc_write_tag_mask((1u<<31));
  mfc_read_tag_status_all(); /* Wait for DMA to complete */

  /* assign local values to the array addresses in main memory */
  ar = cb.ar;
  ai = cb.ai;
  br = cb.br;
  bi = cb.bi;

  /* wait for the first request from the PPE */
  while (spu_stat_in_mbox () < 1)
    ;
  foo = spu_read_in_mbox();

  while (foo) {
    stage1(cb.spu_num, foo);
    spu_write_out_mbox(7);
    foo = spu_read_in_mbox();

    stage2(cb.spu_num, foo);
    spu_write_out_mbox(7);
    foo = spu_read_in_mbox();

    stage3(cb.spu_num, foo);
    spu_write_out_mbox(7);
    foo = spu_read_in_mbox();
  }
  return 0;
}