Cell SDK Code Sample: FFT
%% --------------------------------------------------------------
%% (C) Copyright 2001,2005,
%% International Business Machines Corporation,
%% Sony Computer Entertainment Incorporated,
%% Toshiba Corporation.
%%
%% All Rights Reserved.
%% --------------------------------------------------------------
%% PROLOG END TAG zYx
Target:
CBE-Linux (HW or simulator)
Description:
This directory contains a hand-tuned program which performs
a 4-way SIMD single-precision complex FFT on an array of
size 16,777,216 elements.
Notes:
The actual executable resides in the ppu subdirectory. It's
called 'fft'. It's a full be executable. It takes two runtime
parameters 'ncycles' and 'printflag'.
'ncycles' - is a count of how many times you wish to do a
full roundtrip of time-to-frequency-to-time calculations.
'printflag' - enables or disables print statements within the SPEs.
[note that the PPE print statements will always appear.]
On SystemSim, this program will take hours to do a full cycle. The
print statements in the SPE are intended to give the user evidence
that the program has not stalled.
Recommend that on SystemSim, you say 'fft 1 1'.
Recommend that on real hardware, you say 'fft 100 0'.
Here's what happens within the program.
* First, the PPE fills an array with 16,277,216 complex
numbers, using a function called trigfunc. This function
can be changed by the user.
* Then the PPE fires up all eight SPEs, and they perform a
time-to-frequency conversion on this data, and signal
the PPE.
* The PPE checks these results. If you change trigfunc, you
should comment out this part. It then signals the SPEs.
* The SPEs convert the data back into the time domain,
and signal the PPE.
* The PPE checks these results (using trigfunc), prints
a message "START TIMING" and signals the SPEs.
* Then, the SPEs perform 'ncycles' round trips of time to
frequency and back to time, and signals the PPE.
* The PPE prints STOP TIMING" and exits.
When running on real hardware, you can use these PPE print
messages with a stopwatch to get a quick estimate of the time
required to perform this function.
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#ifndef __fft_h__
#define __fft_h__
#include <stdlib.h>
#include <stdio.h>
/*
#include <string.h>
*/
#define NP 16777216 /* number of points (16 meg) */
#define M_PI 3.14159265358979323846
#define FULLRUN
//#define MAMBO_RUN
void stage1(int, int); /* first eight sets of butterflies */
void stage2(int, int); /* second eight sets of butterflies */
void stage3(int, int); /* third eight sets of butterflies */
typedef union
{
unsigned long long ull;
unsigned int ui[2];
}
addr64; /* linkage stuff used when calling the SPU program */
typedef struct _control_block {
unsigned int spu_num; /* number from 0 through 7 */
unsigned int ncycles; /* number of round trips to take when estimating performance */
unsigned long long barrier1_count;
unsigned long long barrier1_address;
unsigned long long barrier2_count;
unsigned long long barrier2_address;
unsigned long long barrier3_count;
unsigned long long barrier3_address;
unsigned int ar; /* real component of A array */
unsigned int ai; /* imaginary component of A array */
unsigned int br; /* real component of B array */
unsigned int bi; /* imaginary component of B array */
unsigned int printflag; /* true if we want print statements from the SPU */
unsigned char pad[52]; /* pad to a full cache line */
} control_block;
#endif /* __fft_h__ */
Target: PPE [This code section runs on the PPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include "../fft.h"
#include <sys/mman.h>
#include <stdio.h>
#include <libspe.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <fenv.h>
#include <sys/types.h>
#include <errno.h>
#include <math.h>
/* next two lines used for huge page size stuff */
char *mem_file = "/huge/fft_mem.bin";
char *mem_addr = NULL;
/* Allocate space for three separate barrier blocks */
static unsigned int b[3][32] __attribute__ ((aligned (4096)));
control_block cb[8] __attribute__ ((aligned (4096)));
extern spe_program_handle_t fft_spu;
speid_t speids[8];
spe_gid_t gid;
int status[8];
void *rc;
float *ar, *ai, *br, *bi;
#define MALLOC_BIG_ARRAYS { \
/* bottom 11 bits of ar and br addrs should be 0x000 */ \
/* bottom 11 bits of ai and bi addrs shuold be 0x400 */ \
int ar_raw, ai_raw, br_raw, bi_raw; \
ar_raw = (int) malloc(0x4000800); \
br_raw = (int) malloc(0x4000800); \
ai_raw = (int) malloc(0x4000800); \
bi_raw = (int) malloc(0x4000800); \
if ((ar_raw * br_raw * ai_raw * bi_raw) == 0) { \
printf("ERROR: unable to malloc. Exiting...\n"); \
return(-1); \
} \
ar_raw += 0x3ff; \
br_raw += 0x3ff; \
ai_raw += 0x3ff; \
bi_raw += 0x3ff; \
ar_raw &= ~0x3ff; \
br_raw &= ~0x3ff; \
ai_raw &= ~0x3ff; \
bi_raw &= ~0x3ff; \
ar_raw += (ar_raw & 0x400); \
br_raw += (br_raw & 0x400); \
ai_raw += 0x400 - (ai_raw & 0x400); \
bi_raw += 0x400 - (bi_raw & 0x400); \
ar = (float *) ar_raw; \
br = (float *) br_raw; \
ai = (float *) ai_raw; \
bi = (float *) bi_raw; \
}
#define MBOX_SND(_val) { \
for (i=0; i<8; ++i) { \
int foo = _val; \
spe_write_in_mbox(speids[i], foo); \
} \
}
#define MBOX_RCV { \
for (i=0; i<8; ++i) { \
unsigned int foo; \
while (spe_stat_out_mbox(speids[i]) <= 0); \
foo = spe_read_out_mbox(speids[i]); \
} \
}
#define PERFORM_TIME_TO_FREQUENCY_FFT { \
MBOX_SND(1) \
MBOX_RCV \
MBOX_SND(1) \
MBOX_RCV \
MBOX_SND(1) \
MBOX_RCV \
}
#define PERFORM_FREQUENCY_TO_TIME_FFT { \
MBOX_SND(-1) \
MBOX_RCV \
MBOX_SND(-1) \
MBOX_RCV \
MBOX_SND(-1) \
MBOX_RCV \
}
float trigfunc(int in) {
double x;
x = ((double) in) * 6.2831853071796 / ((double) NP);
return ((float) (7.0 + sin(x) + cos(2*x)));
}
int main(int argc, char *argv[]) {
int i, j;
int fmem;
int ncycles;
unsigned int printflag;
float lo_real, hi_real, lo_imag, hi_imag;
if (argc != 3) {
fprintf(stderr, "usage: fft <ncycles> <printflag>\n");
return -1;
}
ncycles = atoi(argv[1]);
printflag = atoi(argv[2]);
/* Create a large contiguous memory buffer by allocating a large
* * page (or more). Large page memory will also reduce the TLB thrashing.
* */
if ((fmem = open (mem_file, O_CREAT | O_RDWR, 0755)) == -1) {
printf("WARNING: unable to open file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
MALLOC_BIG_ARRAYS
} else {
mem_addr = (char *) mmap (0, 0x11000000, PROT_READ | PROT_WRITE, MAP_SHARED, fmem, 0);
if (mem_addr == MAP_FAILED) {
printf("ERROR: unable to mmap file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
close (fmem);
MALLOC_BIG_ARRAYS
}
else {
ar = (float *) (mem_addr+0x0000000);
br = (float *) (mem_addr+0x4000000);
ai = (float *) (mem_addr+0x8000400);
bi = (float *) (mem_addr+0xc000400);
}
}
#ifndef SIM_RUN
printf("big array addrs: %x %x %x %x\n", (int) ar, (int) br, (int) ai, (int) bi);
printf("loading big array A\n"); fflush(stdout);
for (i=0; i<NP; ++i) {
if ((i&0xfffff) == 0) { printf("%x of %x done\n", i, NP); fflush(stdout); }
ai[i] = 0.0f;
ar[i] = trigfunc(i);
}
#endif
/* Create an SPE group. */
gid = spe_create_group ( SCHED_OTHER, 0, 1 );
if (gid == NULL) {
fprintf(stderr, "Failed spe_create_group(errno=%d)\n", errno);
return -1;
}
if (spe_group_max (gid) < 8) {
fprintf(stderr, "System doesn't have eight working SPEs. I can't continue...\n");
return -1;
}
/* Initialize barrier count to 0. */
for (i=0; i<32; ++i) b[0][i] = b[1][i] = b[2][i] = 0;
for (i = 0; i < 8; i++) {
cb[i].spu_num = i;
cb[i].ncycles = ncycles;
cb[i].barrier1_count = (unsigned long long) 8;
cb[i].barrier1_address = (unsigned long long) b[0];
cb[i].barrier2_count = (unsigned long long) 8;
cb[i].barrier2_address = (unsigned long long) b[1];
cb[i].barrier3_count = (unsigned long long) 8;
cb[i].barrier3_address = (unsigned long long) b[2];
cb[i].ar = (unsigned long) ar;
cb[i].ai = (unsigned long) ai;
cb[i].br = (unsigned long) br;
cb[i].bi = (unsigned long) bi;
cb[i].printflag = printflag;
}
/* allocate SPEs */
fprintf(stderr, "ready to call (create) SPE threads\n"); fflush(stderr);
for (i = 0; i < 8; i++)
{
speids[i] = spe_create_thread (gid, &fft_spu, (unsigned long long *) &cb[i], NULL, -1, 0);
if (speids[i] == NULL)
{
fprintf (stderr, "FAILED: spe_create_thread(num=%d, errno=%d)\n", i, errno);
exit (3+i);
}
}
/* monitor progress while SPEs process time-to-frequency computations */
PERFORM_TIME_TO_FREQUENCY_FFT
#ifndef SIM_RUN
#ifdef FULLRUN
printf("Checking frequency results...\n"); fflush(stdout);
for (i=0; i<NP; ++i) {
if (ar[i] < -0.002 || ar[i] > 0.002 || ai[i] < -0.002 || ai[i] > 0.002) printf("a[%d] = (%10.3f, %9.3f)\n", i, ar[i], ai[i]);
}
#endif
/* monitor progress while SPEs process frequency-to-time computations */
PERFORM_FREQUENCY_TO_TIME_FFT
#ifdef FULLRUN
printf("Now checking results...\n"); fflush(stdout);
hi_real = -100.0;
lo_real = 100.0;
hi_imag = -100.0;
lo_imag = 100.0;
for (i=0; i<NP; ++i) {
float x;
x = trigfunc(i);
if (ar[i] - x > hi_real) hi_real = ar[i] - x ;
if (ar[i] - x < lo_real) lo_real = ar[i] - x ;
if (ai[i] > hi_imag) hi_imag = ai[i];
if (ai[i] < lo_imag) lo_imag = ai[i];
}
fprintf(stderr, "real err*r range = %f %f\n", lo_real, hi_real);
fprintf(stderr, "imag err*r range = %f %f\n", lo_imag, hi_imag);
if (lo_real < -0.00003 || hi_real > 0.00003 || lo_imag < -0.00003 || hi_imag > 0.00003) {
fprintf(stderr, "ERROR: range of error values too large...\n");
fflush(stderr);
return -1;
}
#endif
printf("START TIMING!\n");
/* monitor timing runs */
for (j=0; j<ncycles; ++j) {
PERFORM_TIME_TO_FREQUENCY_FFT
PERFORM_FREQUENCY_TO_TIME_FFT
}
printf("STOP TIMING!\n");
MBOX_SND(0)
printf("Now checking results...\n"); fflush(stdout);
hi_real = -100.0;
lo_real = 100.0;
hi_imag = -100.0;
lo_imag = 100.0;
for (i=0; i<NP; ++i) {
float x;
x = trigfunc(i);
if (ar[i] - x > hi_real) hi_real = ar[i] - x ;
if (ar[i] - x < lo_real) lo_real = ar[i] - x ;
if (ai[i] > hi_imag) hi_imag = ai[i];
if (ai[i] < lo_imag) lo_imag = ai[i];
}
printf("real err*r range = %f %f\n", lo_real, hi_real);
printf("imag err*r range = %f %f\n", lo_imag, hi_imag);
#endif
return 0;
}
Target: SPE [This code section runs on the SPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include "../fft.h"
#include <cos.h>
#include <sin.h>
#include <cbe_mfc.h>
#include <spu_mfcio.h>
#include <transpose_matrix4x4.h>
#include <profile.h>
#include <stdio.h>
vector float sin5lsb[16][8] __attribute__ ((aligned (128))) = {
{ (vector float) (0.0000000e+00, 1.2271538e-02, 2.4541229e-02, 3.6807224e-02),
(vector float) (4.9067676e-02, 6.1320737e-02, 7.3564567e-02, 8.5797310e-02),
(vector float) (9.8017141e-02, 1.1022221e-01, 1.2241068e-01, 1.3458070e-01),
(vector float) (1.4673047e-01, 1.5885815e-01, 1.7096189e-01, 1.8303989e-01),
(vector float) (1.9509032e-01, 2.0711137e-01, 2.1910124e-01, 2.3105811e-01),
(vector float) (2.4298018e-01, 2.5486565e-01, 2.6671275e-01, 2.7851969e-01),
(vector float) (2.9028466e-01, 3.0200595e-01, 3.1368175e-01, 3.2531029e-01),
(vector float) (3.3688986e-01, 3.4841868e-01, 3.5989505e-01, 3.7131721e-01)},
{ (vector float) (0.0000000e+00, 6.1358847e-03, 1.2271538e-02, 1.8406730e-02),
(vector float) (2.4541229e-02, 3.0674804e-02, 3.6807224e-02, 4.2938258e-02),
(vector float) (4.9067676e-02, 5.5195246e-02, 6.1320737e-02, 6.7443922e-02),
(vector float) (7.3564567e-02, 7.9682440e-02, 8.5797310e-02, 9.1908954e-02),
(vector float) (9.8017141e-02, 1.0412163e-01, 1.1022221e-01, 1.1631863e-01),
(vector float) (1.2241068e-01, 1.2849811e-01, 1.3458070e-01, 1.4065824e-01),
(vector float) (1.4673047e-01, 1.5279719e-01, 1.5885815e-01, 1.6491312e-01),
(vector float) (1.7096189e-01, 1.7700422e-01, 1.8303989e-01, 1.8906866e-01)},
{ (vector float) (0.0000000e+00, 3.0679568e-03, 6.1358847e-03, 9.2037544e-03),
(vector float) (1.2271538e-02, 1.5339206e-02, 1.8406730e-02, 2.1474080e-02),
(vector float) (2.4541229e-02, 2.7608145e-02, 3.0674804e-02, 3.3741172e-02),
(vector float) (3.6807224e-02, 3.9872926e-02, 4.2938258e-02, 4.6003181e-02),
(vector float) (4.9067676e-02, 5.2131705e-02, 5.5195246e-02, 5.8258265e-02),
(vector float) (6.1320737e-02, 6.4382628e-02, 6.7443922e-02, 7.0504576e-02),
(vector float) (7.3564567e-02, 7.6623864e-02, 7.9682440e-02, 8.2740262e-02),
(vector float) (8.5797310e-02, 8.8853553e-02, 9.1908954e-02, 9.4963498e-02)},
{ (vector float) (0.0000000e+00, 1.5339801e-03, 3.0679568e-03, 4.6019261e-03),
(vector float) (6.1358847e-03, 7.6698288e-03, 9.2037544e-03, 1.0737659e-02),
(vector float) (1.2271538e-02, 1.3805388e-02, 1.5339206e-02, 1.6872987e-02),
(vector float) (1.8406730e-02, 1.9940428e-02, 2.1474080e-02, 2.3007682e-02),
(vector float) (2.4541229e-02, 2.6074719e-02, 2.7608145e-02, 2.9141508e-02),
(vector float) (3.0674804e-02, 3.2208025e-02, 3.3741172e-02, 3.5274237e-02),
(vector float) (3.6807224e-02, 3.8340122e-02, 3.9872926e-02, 4.1405641e-02),
(vector float) (4.2938258e-02, 4.4470772e-02, 4.6003181e-02, 4.7535483e-02)},
{ (vector float) (0.0000000e+00, 7.6699030e-04, 1.5339801e-03, 2.3009691e-03),
(vector float) (3.0679568e-03, 3.8349426e-03, 4.6019261e-03, 5.3689070e-03),
(vector float) (6.1358847e-03, 6.9028586e-03, 7.6698288e-03, 8.4367944e-03),
(vector float) (9.2037544e-03, 9.9707097e-03, 1.0737659e-02, 1.1504602e-02),
(vector float) (1.2271538e-02, 1.3038468e-02, 1.3805388e-02, 1.4572302e-02),
(vector float) (1.5339206e-02, 1.6106103e-02, 1.6872987e-02, 1.7639864e-02),
(vector float) (1.8406730e-02, 1.9173585e-02, 1.9940428e-02, 2.0707261e-02),
(vector float) (2.1474080e-02, 2.2240888e-02, 2.3007682e-02, 2.3774462e-02)},
{ (vector float) (0.0000000e+00, 3.8349518e-04, 7.6699030e-04, 1.1504854e-03),
(vector float) (1.5339801e-03, 1.9174748e-03, 2.3009691e-03, 2.6844630e-03),
(vector float) (3.0679568e-03, 3.4514500e-03, 3.8349426e-03, 4.2184344e-03),
(vector float) (4.6019261e-03, 4.9854168e-03, 5.3689070e-03, 5.7523963e-03),
(vector float) (6.1358847e-03, 6.5193721e-03, 6.9028586e-03, 7.2863442e-03),
(vector float) (7.6698288e-03, 8.0533121e-03, 8.4367944e-03, 8.8202748e-03),
(vector float) (9.2037544e-03, 9.5872330e-03, 9.9707097e-03, 1.0354185e-02),
(vector float) (1.0737659e-02, 1.1121131e-02, 1.1504602e-02, 1.1888071e-02)},
{ (vector float) (0.0000000e+00, 1.9174760e-04, 3.8349518e-04, 5.7524274e-04),
(vector float) (7.6699030e-04, 9.5873786e-04, 1.1504854e-03, 1.3422328e-03),
(vector float) (1.5339801e-03, 1.7257276e-03, 1.9174748e-03, 2.1092221e-03),
(vector float) (2.3009691e-03, 2.4927163e-03, 2.6844630e-03, 2.8762100e-03),
(vector float) (3.0679568e-03, 3.2597035e-03, 3.4514500e-03, 3.6431963e-03),
(vector float) (3.8349426e-03, 4.0266886e-03, 4.2184344e-03, 4.4101803e-03),
(vector float) (4.6019261e-03, 4.7936714e-03, 4.9854168e-03, 5.1771621e-03),
(vector float) (5.3689070e-03, 5.5606519e-03, 5.7523963e-03, 5.9441407e-03)},
{ (vector float) (0.0000000e+00, 9.5873802e-05, 1.9174760e-04, 2.8762140e-04),
(vector float) (3.8349518e-04, 4.7936899e-04, 5.7524274e-04, 6.7111652e-04),
(vector float) (7.6699030e-04, 8.6286408e-04, 9.5873786e-04, 1.0546116e-03),
(vector float) (1.1504854e-03, 1.2463591e-03, 1.3422328e-03, 1.4381065e-03),
(vector float) (1.5339801e-03, 1.6298539e-03, 1.7257276e-03, 1.8216012e-03),
(vector float) (1.9174748e-03, 2.0133485e-03, 2.1092221e-03, 2.2050955e-03),
(vector float) (2.3009691e-03, 2.3968427e-03, 2.4927163e-03, 2.5885897e-03),
(vector float) (2.6844630e-03, 2.7803367e-03, 2.8762100e-03, 2.9720834e-03)},
{ (vector float) (0.0000000e+00, 4.7936901e-05, 9.5873802e-05, 1.4381070e-04),
(vector float) (1.9174760e-04, 2.3968449e-04, 2.8762140e-04, 3.3555829e-04),
(vector float) (3.8349518e-04, 4.3143210e-04, 4.7936899e-04, 5.2730588e-04),
(vector float) (5.7524274e-04, 6.2317966e-04, 6.7111652e-04, 7.1905344e-04),
(vector float) (7.6699030e-04, 8.1492722e-04, 8.6286408e-04, 9.1080094e-04),
(vector float) (9.5873786e-04, 1.0066747e-03, 1.0546116e-03, 1.1025484e-03),
(vector float) (1.1504854e-03, 1.1984222e-03, 1.2463591e-03, 1.2942959e-03),
(vector float) (1.3422328e-03, 1.3901696e-03, 1.4381065e-03, 1.4860433e-03)},
{ (vector float) (0.0000000e+00, 2.3968450e-05, 4.7936901e-05, 7.1905350e-05),
(vector float) (9.5873802e-05, 1.1984225e-04, 1.4381070e-04, 1.6777914e-04),
(vector float) (1.9174760e-04, 2.1571605e-04, 2.3968449e-04, 2.6365294e-04),
(vector float) (2.8762140e-04, 3.1158983e-04, 3.3555829e-04, 3.5952675e-04),
(vector float) (3.8349518e-04, 4.0746364e-04, 4.3143210e-04, 4.5540053e-04),
(vector float) (4.7936899e-04, 5.0333742e-04, 5.2730588e-04, 5.5127434e-04),
(vector float) (5.7524274e-04, 5.9921120e-04, 6.2317966e-04, 6.4714812e-04),
(vector float) (6.7111652e-04, 6.9508498e-04, 7.1905344e-04, 7.4302190e-04)},
{ (vector float) (0.0000000e+00, 1.1984225e-05, 2.3968450e-05, 3.5952675e-05),
(vector float) (4.7936901e-05, 5.9921123e-05, 7.1905350e-05, 8.3889572e-05),
(vector float) (9.5873802e-05, 1.0785802e-04, 1.1984225e-04, 1.3182647e-04),
(vector float) (1.4381070e-04, 1.5579493e-04, 1.6777914e-04, 1.7976337e-04),
(vector float) (1.9174760e-04, 2.0373182e-04, 2.1571605e-04, 2.2770026e-04),
(vector float) (2.3968449e-04, 2.5166871e-04, 2.6365294e-04, 2.7563717e-04),
(vector float) (2.8762140e-04, 2.9960563e-04, 3.1158983e-04, 3.2357406e-04),
(vector float) (3.3555829e-04, 3.4754252e-04, 3.5952675e-04, 3.7151098e-04)},
{ (vector float) (0.0000000e+00, 5.9921126e-06, 1.1984225e-05, 1.7976337e-05),
(vector float) (2.3968450e-05, 2.9960562e-05, 3.5952675e-05, 4.1944786e-05),
(vector float) (4.7936901e-05, 5.3929012e-05, 5.9921123e-05, 6.5913235e-05),
(vector float) (7.1905350e-05, 7.7897465e-05, 8.3889572e-05, 8.9881687e-05),
(vector float) (9.5873802e-05, 1.0186591e-04, 1.0785802e-04, 1.1385014e-04),
(vector float) (1.1984225e-04, 1.2583435e-04, 1.3182647e-04, 1.3781858e-04),
(vector float) (1.4381070e-04, 1.4980281e-04, 1.5579493e-04, 1.6178703e-04),
(vector float) (1.6777914e-04, 1.7377126e-04, 1.7976337e-04, 1.8575549e-04)},
{ (vector float) (0.0000000e+00, 2.9960563e-06, 5.9921126e-06, 8.9881687e-06),
(vector float) (1.1984225e-05, 1.4980281e-05, 1.7976337e-05, 2.0972393e-05),
(vector float) (2.3968450e-05, 2.6964506e-05, 2.9960562e-05, 3.2956617e-05),
(vector float) (3.5952675e-05, 3.8948732e-05, 4.1944786e-05, 4.4940844e-05),
(vector float) (4.7936901e-05, 5.0932955e-05, 5.3929012e-05, 5.6925070e-05),
(vector float) (5.9921123e-05, 6.2917177e-05, 6.5913235e-05, 6.8909292e-05),
(vector float) (7.1905350e-05, 7.4901407e-05, 7.7897465e-05, 8.0893515e-05),
(vector float) (8.3889572e-05, 8.6885630e-05, 8.9881687e-05, 9.2877744e-05)},
{ (vector float) (0.0000000e+00, 1.4980282e-06, 2.9960563e-06, 4.4940844e-06),
(vector float) (5.9921126e-06, 7.4901404e-06, 8.9881687e-06, 1.0486197e-05),
(vector float) (1.1984225e-05, 1.3482253e-05, 1.4980281e-05, 1.6478309e-05),
(vector float) (1.7976337e-05, 1.9474366e-05, 2.0972393e-05, 2.2470422e-05),
(vector float) (2.3968450e-05, 2.5466477e-05, 2.6964506e-05, 2.8462535e-05),
(vector float) (2.9960562e-05, 3.1458589e-05, 3.2956617e-05, 3.4454646e-05),
(vector float) (3.5952675e-05, 3.7450704e-05, 3.8948732e-05, 4.0446757e-05),
(vector float) (4.1944786e-05, 4.3442815e-05, 4.4940844e-05, 4.6438872e-05)},
{ (vector float) (0.0000000e+00, 7.4901408e-07, 1.4980282e-06, 2.2470422e-06),
(vector float) (2.9960563e-06, 3.7450702e-06, 4.4940844e-06, 5.2430983e-06),
(vector float) (5.9921126e-06, 6.7411265e-06, 7.4901404e-06, 8.2391543e-06),
(vector float) (8.9881687e-06, 9.7371831e-06, 1.0486197e-05, 1.1235211e-05),
(vector float) (1.1984225e-05, 1.2733239e-05, 1.3482253e-05, 1.4231267e-05),
(vector float) (1.4980281e-05, 1.5729294e-05, 1.6478309e-05, 1.7227323e-05),
(vector float) (1.7976337e-05, 1.8725352e-05, 1.9474366e-05, 2.0223379e-05),
(vector float) (2.0972393e-05, 2.1721407e-05, 2.2470422e-05, 2.3219436e-05)},
{ (vector float) (0.0000000e+00, 3.7450704e-07, 7.4901408e-07, 1.1235211e-06),
(vector float) (1.4980282e-06, 1.8725351e-06, 2.2470422e-06, 2.6215491e-06),
(vector float) (2.9960563e-06, 3.3705633e-06, 3.7450702e-06, 4.1195772e-06),
(vector float) (4.4940844e-06, 4.8685915e-06, 5.2430983e-06, 5.6176054e-06),
(vector float) (5.9921126e-06, 6.3666193e-06, 6.7411265e-06, 7.1156337e-06),
(vector float) (7.4901404e-06, 7.8646472e-06, 8.2391543e-06, 8.6136615e-06),
(vector float) (8.9881687e-06, 9.3626759e-06, 9.7371831e-06, 1.0111689e-05),
(vector float) (1.0486197e-05, 1.0860704e-05, 1.1235211e-05, 1.1609718e-05)}};
vector float cos5lsb[16][8] __attribute__ ((aligned (128))) = {
{ (vector float) (1.0000000e+00, 9.9992472e-01, 9.9969882e-01, 9.9932235e-01),
(vector float) (9.9879545e-01, 9.9811810e-01, 9.9729043e-01, 9.9631262e-01),
(vector float) (9.9518472e-01, 9.9390697e-01, 9.9247956e-01, 9.9090266e-01),
(vector float) (9.8917651e-01, 9.8730141e-01, 9.8527765e-01, 9.8310548e-01),
(vector float) (9.8078525e-01, 9.7831738e-01, 9.7570211e-01, 9.7293997e-01),
(vector float) (9.7003126e-01, 9.6697646e-01, 9.6377605e-01, 9.6043050e-01),
(vector float) (9.5694035e-01, 9.5330602e-01, 9.4952816e-01, 9.4560730e-01),
(vector float) (9.4154406e-01, 9.3733901e-01, 9.3299282e-01, 9.2850608e-01)},
{ (vector float) (1.0000000e+00, 9.9998116e-01, 9.9992472e-01, 9.9983060e-01),
(vector float) (9.9969882e-01, 9.9952942e-01, 9.9932235e-01, 9.9907774e-01),
(vector float) (9.9879545e-01, 9.9847555e-01, 9.9811810e-01, 9.9772304e-01),
(vector float) (9.9729043e-01, 9.9682027e-01, 9.9631262e-01, 9.9576741e-01),
(vector float) (9.9518472e-01, 9.9456459e-01, 9.9390697e-01, 9.9321193e-01),
(vector float) (9.9247956e-01, 9.9170977e-01, 9.9090266e-01, 9.9005818e-01),
(vector float) (9.8917651e-01, 9.8825759e-01, 9.8730141e-01, 9.8630810e-01),
(vector float) (9.8527765e-01, 9.8421007e-01, 9.8310548e-01, 9.8196387e-01)},
{ (vector float) (1.0000000e+00, 9.9999529e-01, 9.9998116e-01, 9.9995762e-01),
(vector float) (9.9992472e-01, 9.9988234e-01, 9.9983060e-01, 9.9976939e-01),
(vector float) (9.9969882e-01, 9.9961883e-01, 9.9952942e-01, 9.9943060e-01),
(vector float) (9.9932235e-01, 9.9920475e-01, 9.9907774e-01, 9.9894130e-01),
(vector float) (9.9879545e-01, 9.9864024e-01, 9.9847555e-01, 9.9830157e-01),
(vector float) (9.9811810e-01, 9.9792528e-01, 9.9772304e-01, 9.9751145e-01),
(vector float) (9.9729043e-01, 9.9706006e-01, 9.9682027e-01, 9.9657112e-01),
(vector float) (9.9631262e-01, 9.9604470e-01, 9.9576741e-01, 9.9548078e-01)},
{ (vector float) (1.0000000e+00, 9.9999881e-01, 9.9999529e-01, 9.9998939e-01),
(vector float) (9.9998116e-01, 9.9997061e-01, 9.9995762e-01, 9.9994236e-01),
(vector float) (9.9992472e-01, 9.9990469e-01, 9.9988234e-01, 9.9985766e-01),
(vector float) (9.9983060e-01, 9.9980116e-01, 9.9976939e-01, 9.9973530e-01),
(vector float) (9.9969882e-01, 9.9966002e-01, 9.9961883e-01, 9.9957532e-01),
(vector float) (9.9952942e-01, 9.9948120e-01, 9.9943060e-01, 9.9937767e-01),
(vector float) (9.9932235e-01, 9.9926478e-01, 9.9920475e-01, 9.9914241e-01),
(vector float) (9.9907774e-01, 9.9901068e-01, 9.9894130e-01, 9.9886954e-01)},
{ (vector float) (1.0000000e+00, 9.9999970e-01, 9.9999881e-01, 9.9999738e-01),
(vector float) (9.9999529e-01, 9.9999267e-01, 9.9998939e-01, 9.9998558e-01),
(vector float) (9.9998116e-01, 9.9997616e-01, 9.9997061e-01, 9.9996442e-01),
(vector float) (9.9995762e-01, 9.9995029e-01, 9.9994236e-01, 9.9993384e-01),
(vector float) (9.9992472e-01, 9.9991500e-01, 9.9990469e-01, 9.9989384e-01),
(vector float) (9.9988234e-01, 9.9987030e-01, 9.9985766e-01, 9.9984443e-01),
(vector float) (9.9983060e-01, 9.9981618e-01, 9.9980116e-01, 9.9978560e-01),
(vector float) (9.9976939e-01, 9.9975264e-01, 9.9973530e-01, 9.9971735e-01)},
{ (vector float) (1.0000000e+00, 9.9999994e-01, 9.9999970e-01, 9.9999934e-01),
(vector float) (9.9999881e-01, 9.9999815e-01, 9.9999738e-01, 9.9999642e-01),
(vector float) (9.9999529e-01, 9.9999404e-01, 9.9999267e-01, 9.9999112e-01),
(vector float) (9.9998939e-01, 9.9998760e-01, 9.9998558e-01, 9.9998343e-01),
(vector float) (9.9998116e-01, 9.9997872e-01, 9.9997616e-01, 9.9997348e-01),
(vector float) (9.9997061e-01, 9.9996758e-01, 9.9996442e-01, 9.9996108e-01),
(vector float) (9.9995762e-01, 9.9995404e-01, 9.9995029e-01, 9.9994642e-01),
(vector float) (9.9994236e-01, 9.9993813e-01, 9.9993384e-01, 9.9992931e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 9.9999994e-01, 9.9999982e-01),
(vector float) (9.9999970e-01, 9.9999952e-01, 9.9999934e-01, 9.9999911e-01),
(vector float) (9.9999881e-01, 9.9999851e-01, 9.9999815e-01, 9.9999779e-01),
(vector float) (9.9999738e-01, 9.9999690e-01, 9.9999642e-01, 9.9999589e-01),
(vector float) (9.9999529e-01, 9.9999470e-01, 9.9999404e-01, 9.9999338e-01),
(vector float) (9.9999267e-01, 9.9999189e-01, 9.9999112e-01, 9.9999028e-01),
(vector float) (9.9998939e-01, 9.9998850e-01, 9.9998760e-01, 9.9998659e-01),
(vector float) (9.9998558e-01, 9.9998456e-01, 9.9998343e-01, 9.9998236e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999988e-01, 9.9999982e-01, 9.9999976e-01),
(vector float) (9.9999970e-01, 9.9999964e-01, 9.9999952e-01, 9.9999946e-01),
(vector float) (9.9999934e-01, 9.9999923e-01, 9.9999911e-01, 9.9999899e-01),
(vector float) (9.9999881e-01, 9.9999869e-01, 9.9999851e-01, 9.9999833e-01),
(vector float) (9.9999815e-01, 9.9999797e-01, 9.9999779e-01, 9.9999756e-01),
(vector float) (9.9999738e-01, 9.9999714e-01, 9.9999690e-01, 9.9999666e-01),
(vector float) (9.9999642e-01, 9.9999613e-01, 9.9999589e-01, 9.9999559e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999988e-01, 9.9999988e-01, 9.9999988e-01),
(vector float) (9.9999982e-01, 9.9999982e-01, 9.9999976e-01, 9.9999976e-01),
(vector float) (9.9999970e-01, 9.9999964e-01, 9.9999964e-01, 9.9999958e-01),
(vector float) (9.9999952e-01, 9.9999946e-01, 9.9999946e-01, 9.9999940e-01),
(vector float) (9.9999934e-01, 9.9999928e-01, 9.9999923e-01, 9.9999917e-01),
(vector float) (9.9999911e-01, 9.9999905e-01, 9.9999899e-01, 9.9999887e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999988e-01, 9.9999988e-01),
(vector float) (9.9999988e-01, 9.9999988e-01, 9.9999988e-01, 9.9999982e-01),
(vector float) (9.9999982e-01, 9.9999982e-01, 9.9999982e-01, 9.9999976e-01),
(vector float) (9.9999976e-01, 9.9999976e-01, 9.9999976e-01, 9.9999970e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01),
(vector float) (9.9999994e-01, 9.9999994e-01, 9.9999994e-01, 9.9999994e-01)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)},
{ (vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00),
(vector float) (1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00)}};
float sin256[256] __attribute__ ((aligned (128))) = {
0.0000000e+00, 1.2271538e-02, 2.4541229e-02, 3.6807224e-02,
4.9067676e-02, 6.1320737e-02, 7.3564567e-02, 8.5797310e-02,
9.8017141e-02, 1.1022221e-01, 1.2241068e-01, 1.3458070e-01,
1.4673047e-01, 1.5885815e-01, 1.7096189e-01, 1.8303989e-01,
1.9509032e-01, 2.0711137e-01, 2.1910124e-01, 2.3105811e-01,
2.4298018e-01, 2.5486565e-01, 2.6671275e-01, 2.7851969e-01,
2.9028466e-01, 3.0200595e-01, 3.1368175e-01, 3.2531029e-01,
3.3688986e-01, 3.4841868e-01, 3.5989505e-01, 3.7131721e-01,
3.8268343e-01, 3.9399204e-01, 4.0524131e-01, 4.1642955e-01,
4.2755508e-01, 4.3861625e-01, 4.4961134e-01, 4.6053872e-01,
4.7139674e-01, 4.8218378e-01, 4.9289820e-01, 5.0353837e-01,
5.1410276e-01, 5.2458966e-01, 5.3499764e-01, 5.4532498e-01,
5.5557024e-01, 5.6573182e-01, 5.7580817e-01, 5.8579785e-01,
5.9569931e-01, 6.0551107e-01, 6.1523157e-01, 6.2485951e-01,
6.3439327e-01, 6.4383155e-01, 6.5317285e-01, 6.6241580e-01,
6.7155898e-01, 6.8060100e-01, 6.8954057e-01, 6.9837624e-01,
7.0710677e-01, 7.1573085e-01, 7.2424710e-01, 7.3265427e-01,
7.4095112e-01, 7.4913639e-01, 7.5720882e-01, 7.6516724e-01,
7.7301043e-01, 7.8073722e-01, 7.8834641e-01, 7.9583693e-01,
8.0320752e-01, 8.1045717e-01, 8.1758481e-01, 8.2458931e-01,
8.3146960e-01, 8.3822471e-01, 8.4485358e-01, 8.5135520e-01,
8.5772860e-01, 8.6397284e-01, 8.7008697e-01, 8.7607008e-01,
8.8192129e-01, 8.8763964e-01, 8.9322430e-01, 8.9867449e-01,
9.0398932e-01, 9.0916800e-01, 9.1420978e-01, 9.1911387e-01,
9.2387950e-01, 9.2850608e-01, 9.3299282e-01, 9.3733901e-01,
9.4154406e-01, 9.4560730e-01, 9.4952816e-01, 9.5330602e-01,
9.5694035e-01, 9.6043050e-01, 9.6377605e-01, 9.6697646e-01,
9.7003126e-01, 9.7293997e-01, 9.7570211e-01, 9.7831738e-01,
9.8078525e-01, 9.8310548e-01, 9.8527765e-01, 9.8730141e-01,
9.8917651e-01, 9.9090266e-01, 9.9247956e-01, 9.9390697e-01,
9.9518472e-01, 9.9631262e-01, 9.9729043e-01, 9.9811810e-01,
9.9879545e-01, 9.9932235e-01, 9.9969882e-01, 9.9992472e-01,
1.0000000e+00, 9.9992472e-01, 9.9969882e-01, 9.9932235e-01,
9.9879545e-01, 9.9811810e-01, 9.9729043e-01, 9.9631262e-01,
9.9518472e-01, 9.9390697e-01, 9.9247956e-01, 9.9090266e-01,
9.8917651e-01, 9.8730141e-01, 9.8527765e-01, 9.8310548e-01,
9.8078525e-01, 9.7831738e-01, 9.7570211e-01, 9.7293997e-01,
9.7003126e-01, 9.6697646e-01, 9.6377605e-01, 9.6043050e-01,
9.5694035e-01, 9.5330602e-01, 9.4952816e-01, 9.4560730e-01,
9.4154406e-01, 9.3733901e-01, 9.3299282e-01, 9.2850608e-01,
9.2387950e-01, 9.1911387e-01, 9.1420978e-01, 9.0916800e-01,
9.0398932e-01, 8.9867449e-01, 8.9322430e-01, 8.8763964e-01,
8.8192129e-01, 8.7607008e-01, 8.7008697e-01, 8.6397284e-01,
8.5772860e-01, 8.5135520e-01, 8.4485358e-01, 8.3822471e-01,
8.3146960e-01, 8.2458931e-01, 8.1758481e-01, 8.1045717e-01,
8.0320752e-01, 7.9583693e-01, 7.8834641e-01, 7.8073722e-01,
7.7301043e-01, 7.6516724e-01, 7.5720882e-01, 7.4913639e-01,
7.4095112e-01, 7.3265427e-01, 7.2424710e-01, 7.1573085e-01,
7.0710677e-01, 6.9837624e-01, 6.8954057e-01, 6.8060100e-01,
6.7155898e-01, 6.6241580