Cell SDK Code Sample: FFT



%% --------------------------------------------------------------

%% (C) Copyright 2001,2005,                                      

%% International Business Machines Corporation,                  

%% Sony Computer Entertainment Incorporated,                      

%% Toshiba Corporation.                                          

%%                                                               

%% All Rights Reserved.                                          

%% --------------------------------------------------------------

%% PROLOG END TAG zYx                                             

 

Target:

            CBE-Linux (HW or simulator)

 

Description:

            This directory contains a hand-tuned program which performs

        a 4-way SIMD single-precision complex FFT on an array of

        size 16,777,216 elements.

 

Notes:

            The actual executable resides in the ppu subdirectory. It's

            called 'fft'. It's a full BE executable. It takes two runtime

            parameters 'ncycles' and 'printflag'.

           

            'ncycles' - is a count of how many times you wish to do a

                    full roundtrip of time-to-frequency-to-time calculations.

 

        'printflag' - enables or disables print statements within the SPEs.

                      [note that the PPE print statements will always appear.]

 

        On SystemSim, this program will take hours to do a full cycle. The

            print statements in the SPE are intended to give the user evidence

            that the program has not stalled.

 

        Recommend that on SystemSim, you say 'fft 1 1'.

        Recommend that on real hardware, you say 'fft 100 0'.

       

        Here's what happens within the program.

 

         *  First, the PPE fills an array with 16,777,216 complex

            numbers, using a function called trigfunc.  This function

             can be changed by the user.

 

         *  Then the PPE fires up all eight SPEs, and they perform a

            time-to-frequency conversion on this data, and signal

            the PPE.

 

         *  The PPE checks these results.  If you change trigfunc, you

            should comment out this part.  It then signals the SPEs.

 

         *  The SPEs convert the data back into the time domain,

            and signal the PPE.

 

         *  The PPE checks these results (using trigfunc), prints

            a message "START TIMING" and signals the SPEs.

 

         *  Then, the SPEs perform 'ncycles' round trips of time to

            frequency and back to time, and signal the PPE.

 

         *  The PPE prints "STOP TIMING" and exits.

 

        When running on real hardware, you can use these PPE print

        messages with a stopwatch to get a quick estimate of the time

        required to perform this function.

 


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                             */

/* International Business Machines Corporation,            */

/* Sony Computer Entertainment Incorporated,            */

/* Toshiba Corporation.                                                  */

/*                                                                                      */

/* All Rights Reserved.                                                   */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                          */

#ifndef __fft_h__

#define __fft_h__

 

#include <stdlib.h>

#include <stdio.h>

/*

#include <string.h>

*/

 

/* Transform length: 16,777,216 = 2^24 complex points (16 meg). */
#define NP 16777216 /* number of points (16 meg) */

/*
 * This header is included next to math headers, so only supply M_PI when
 * no earlier header has already defined it.  This avoids a macro
 * redefinition if the library spells the constant differently.
 */
#ifndef M_PI
#define M_PI           3.14159265358979323846
#endif

/* FULLRUN enables the PPE-side result checks that follow each FFT pass. */
#define FULLRUN

/*
 * NOTE(review): MAMBO_RUN is not tested anywhere in the visible sources;
 * the PPE code keys its simulator-only paths off SIM_RUN instead.
 */
//#define MAMBO_RUN

 

/*
 * The butterfly work is split into three stages of eight sets each, i.e.
 * 3 x 8 = 24 sets in total, which equals log2(NP) for NP = 2^24.
 * NOTE(review): the definitions are not visible here, so the meaning of
 * the two int parameters is unknown -- confirm against the definitions.
 */
void stage1(int, int); /* first eight sets of butterflies */
void stage2(int, int); /* second eight sets of butterflies */
void stage3(int, int); /* third eight sets of butterflies */

 

/*
 * 64-bit value that can also be accessed as two 32-bit words.
 * Linkage helper used when calling the SPU program.  Which element of
 * ui[] holds the high word depends on the byte order of the machine.
 */
typedef union {
  unsigned long long ull;   /* the full 64-bit value */
  unsigned int ui[2];       /* the same storage viewed as two 32-bit words */
} addr64;

 

/*
 * Parameter block for one SPE.  The PPE fills in one of these per SPE and
 * passes its address when the SPE thread is created.
 *
 * Layout with natural alignment: 8 bytes of identification, 48 bytes of
 * barrier data, 20 bytes of array addresses and flags, then 52 bytes of
 * padding, for 128 bytes in total.
 */
typedef struct _control_block {

  /* --- identification and run length --- */
  unsigned int spu_num;                 /* number from 0 through 7 */
  unsigned int ncycles;                 /* round trips to take when estimating performance */

  /* --- three barriers, each described by a count and an address --- */
  unsigned long long barrier1_count;
  unsigned long long barrier1_address;
  unsigned long long barrier2_count;
  unsigned long long barrier2_address;
  unsigned long long barrier3_count;
  unsigned long long barrier3_address;

  /* --- data arrays; the PPE stores each array pointer here as an integer --- */
  unsigned int ar;                      /* real component of A array */
  unsigned int ai;                      /* imaginary component of A array */
  unsigned int br;                      /* real component of B array */
  unsigned int bi;                      /* imaginary component of B array */

  unsigned int printflag;               /* true if we want print statements from the SPU */

  unsigned char pad[52];                /* pad to a full cache line */

} control_block;

 

#endif /* __fft_h__ */


Target: PPE [This code section runs on the PPE side]


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                       */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                           */

/*                                                                */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                              */

 

#include "../fft.h"

#include <sys/mman.h>

 

#include <stdio.h>

#include <libspe.h>

#include <pthread.h>

 

#include <stdlib.h>

#include <unistd.h>

#include <fcntl.h>

#include <fenv.h>

#include <sys/types.h>

#include <errno.h>

 

#include <math.h>

 

/* next two lines used for huge page size stuff */

/*
 * Huge-page backing store: main() tries to open and mmap mem_file and
 * keeps the mapping base in mem_addr.  mem_addr stays NULL when the file
 * cannot be opened and the malloc fallback is used instead.
 */
char *mem_file = "/huge/fft_mem.bin";
char *mem_addr = NULL;

/*
 * Storage for three separate barrier blocks, one row per barrier.  Each
 * row is 32 unsigned ints (128 bytes); the array is 4096-byte aligned.
 * The row addresses are handed to the SPEs through the control blocks.
 */
static unsigned int b[3][32] __attribute__ ((aligned (4096)));

/* One control block per SPE (eight in total), 4096-byte aligned. */
control_block cb[8] __attribute__ ((aligned (4096)));

/* Handle of the SPE program; it is defined outside this file. */
extern spe_program_handle_t fft_spu;

/* Thread ids returned by spe_create_thread(), indexed by SPE number. */
speid_t speids[8];

/* SPE group that all eight threads are created in. */
spe_gid_t gid;

/* NOTE(review): status and rc are not referenced in the visible part of this file. */
int status[8];
void *rc;

/*
 * Base pointers of the four big float arrays: real/imaginary parts of
 * array A (ar, ai) and of array B (br, bi).  Set in main() from either
 * the huge-page mapping or the malloc fallback.
 */
float *ar, *ai, *br, *bi;

 

/*
 * MALLOC_BIG_ARRAYS -- heap fallback for the four big arrays.
 *
 * Allocates 0x4000800 bytes (64 MB plus 2 KB of alignment slack) for each
 * of ar, br, ai and bi, then aligns the addresses so that the low 11 bits
 * of ar/br are 0x000 and the low 11 bits of ai/bi are 0x400.  The largest
 * possible adjustment is 0x3ff + 0x400 = 0x7ff bytes, which the 0x800
 * bytes of slack cover.
 *
 * Must be expanded inside a function returning int with the globals ar,
 * ai, br and bi in scope.  On allocation failure it prints an error and
 * makes the enclosing function return -1.
 *
 * Each allocation is tested against NULL individually.  Multiplying the
 * four addresses and comparing the product with zero can overflow and
 * report a failure that did not happen.  Address arithmetic is done in
 * unsigned long so that pointers are not truncated through int.
 */
#define MALLOC_BIG_ARRAYS {                                              \
  /* bottom 11 bits of ar and br addrs should be 0x000 */                \
  /* bottom 11 bits of ai and bi addrs should be 0x400 */                \
  void *ar_mem = malloc(0x4000800);                                      \
  void *br_mem = malloc(0x4000800);                                      \
  void *ai_mem = malloc(0x4000800);                                      \
  void *bi_mem = malloc(0x4000800);                                      \
  unsigned long ar_raw, ai_raw, br_raw, bi_raw;                          \
  if (ar_mem == NULL || br_mem == NULL ||                                \
      ai_mem == NULL || bi_mem == NULL) {                                \
    printf("ERROR: unable to malloc.  Exiting...\n");                    \
    free(ar_mem);                                                        \
    free(br_mem);                                                        \
    free(ai_mem);                                                        \
    free(bi_mem);                                                        \
    return(-1);                                                          \
  }                                                                      \
  /* round every address up to the next 1 KB boundary */                 \
  ar_raw = ((unsigned long) ar_mem + 0x3ff) & ~0x3ffUL;                  \
  br_raw = ((unsigned long) br_mem + 0x3ff) & ~0x3ffUL;                  \
  ai_raw = ((unsigned long) ai_mem + 0x3ff) & ~0x3ffUL;                  \
  bi_raw = ((unsigned long) bi_mem + 0x3ff) & ~0x3ffUL;                  \
  /* real arrays: if bit 10 is set, step up to the next 2 KB boundary */ \
  ar_raw += (ar_raw & 0x400);                                            \
  br_raw += (br_raw & 0x400);                                            \
  /* imaginary arrays: if bit 10 is clear, add 1 KB so that it is set */ \
  ai_raw += 0x400 - (ai_raw & 0x400);                                    \
  bi_raw += 0x400 - (bi_raw & 0x400);                                    \
  ar = (float *) ar_raw;                                                 \
  br = (float *) br_raw;                                                 \
  ai = (float *) ai_raw;                                                 \
  bi = (float *) bi_raw;                                                 \
}

 

/*
 * MBOX_SND(_val) -- call spe_write_in_mbox() once for each of the eight
 * SPE threads in speids[], passing _val as the data word.
 * Uses the caller's loop variable `i`, leaving it at 8.  _val is
 * evaluated once per SPE.  Expand it as a statement without a trailing
 * semicolon.
 */
#define MBOX_SND(_val) {                          \
  for (i = 0; i < 8; i++) {                       \
    int mbox_word = _val;                         \
    spe_write_in_mbox(speids[i], mbox_word);      \
  }                                               \
}

 

/*
 * MBOX_RCV -- for each of the eight SPE threads in turn, spin until
 * spe_stat_out_mbox() returns a positive value, then call
 * spe_read_out_mbox() once and discard what it returns.
 * Uses the caller's loop variable `i`, leaving it at 8.  Expand it as a
 * statement without a trailing semicolon.
 */
#define MBOX_RCV {                                        \
  for (i = 0; i < 8; i++) {                               \
    while (spe_stat_out_mbox(speids[i]) <= 0) {           \
      /* busy-wait */                                     \
    }                                                     \
    (void) spe_read_out_mbox(speids[i]);                  \
  }                                                       \
}

 

/*
 * PERFORM_TIME_TO_FREQUENCY_FFT -- run one time-to-frequency pass on the
 * SPEs as three mailbox handshakes.  Each handshake sends the value 1 to
 * every SPE (MBOX_SND) and then waits for a reply from every SPE
 * (MBOX_RCV).  Expand it as a statement without a trailing semicolon.
 */
#define PERFORM_TIME_TO_FREQUENCY_FFT {                   \
  int handshake;                                          \
  for (handshake = 0; handshake < 3; ++handshake) {       \
    MBOX_SND(1)                                           \
    MBOX_RCV                                              \
  }                                                       \
}

 

/*
 * PERFORM_FREQUENCY_TO_TIME_FFT -- run one frequency-to-time pass on the
 * SPEs as three mailbox handshakes.  Each handshake sends the value -1 to
 * every SPE (MBOX_SND) and then waits for a reply from every SPE
 * (MBOX_RCV).  Expand it as a statement without a trailing semicolon.
 */
#define PERFORM_FREQUENCY_TO_TIME_FFT {                   \
  int handshake;                                          \
  for (handshake = 0; handshake < 3; ++handshake) {       \
    MBOX_SND(-1)                                          \
    MBOX_RCV                                              \
  }                                                       \
}

 

/*
 * Test signal used to fill the input array and to check the round trip.
 *
 * Maps sample index `in` to the angle in * 6.2831853071796 / NP and
 * returns 7 + sin(angle) + cos(2 * angle), computed in double precision
 * and narrowed to float.
 */
float trigfunc(int in) {
   double angle = ((double) in) * 6.2831853071796 / ((double) NP);

   return ((float) (7.0 + sin(angle) + cos(2*angle)));
}

 

/*
 * PPE driver for the FFT sample.
 *
 * Usage: fft <ncycles> <printflag>
 *   ncycles   - number of timed time->frequency->time round trips
 *   printflag - copied into every control block for the SPE program
 *
 * Steps: obtain the four big arrays (huge-page file mapping, or malloc as
 * a fallback), fill array A with trigfunc(), start eight SPE threads, and
 * drive them through a forward FFT, an inverse FFT and then `ncycles`
 * timed round trips using mailbox handshakes, checking results on the way.
 *
 * Returns 0 on success and -1 on a usage error, allocation failure,
 * SPE-group failure or an out-of-tolerance round-trip check.  Exits with
 * status 3+i if creating SPE thread i fails.
 */
int main(int argc, char *argv[]) {
  int i, j;
  int fmem;      /* file descriptor of the huge-page backing file */
  int ncycles;
  unsigned int printflag;
  float lo_real, hi_real, lo_imag, hi_imag;   /* min/max error trackers */

  /* Exactly two command-line arguments are required. */
  if (argc != 3) {
    fprintf(stderr, "usage: fft <ncycles> <printflag>\n");
    return -1;
  }
  ncycles = atoi(argv[1]);
  printflag = atoi(argv[2]);

  /*
   * Create a large contiguous memory buffer by allocating a large
   * page (or more). Large page memory will also reduce the TLB thrashing.
   * If the file cannot be opened or mapped, fall back to the malloc heap.
   */
  if ((fmem = open (mem_file, O_CREAT | O_RDWR, 0755)) == -1) {
    printf("WARNING: unable to open file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
    MALLOC_BIG_ARRAYS
  } else {
    mem_addr = (char *) mmap (0, 0x11000000, PROT_READ | PROT_WRITE, MAP_SHARED, fmem, 0);
    if (mem_addr == MAP_FAILED) {
      printf("ERROR: unable to mmap file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
      close (fmem);
      MALLOC_BIG_ARRAYS
    }
    else {
      /*
       * Carve the mapping into four 64 MB regions.  The imaginary arrays
       * start an extra 0x400 bytes in, matching the alignment that
       * MALLOC_BIG_ARRAYS produces.
       */
      ar = (float *) (mem_addr+0x0000000);
      br = (float *) (mem_addr+0x4000000);
      ai = (float *) (mem_addr+0x8000400);
       bi = (float *) (mem_addr+0xc000400);
    }
  }

#ifndef SIM_RUN
  printf("big array addrs: %x %x %x %x\n", (int) ar, (int) br, (int) ai, (int) bi);
  printf("loading big array A\n"); fflush(stdout);

  /* Fill A: real part from trigfunc(), imaginary part zero. */
  for (i=0; i<NP; ++i) {
    /* progress message every 0x100000 elements */
    if ((i&0xfffff) == 0) { printf("%x of %x done\n", i, NP); fflush(stdout); }
    ai[i] = 0.0f;
    ar[i] = trigfunc(i);
  }
#endif

  /* Create an SPE group. */
  gid = spe_create_group ( SCHED_OTHER, 0, 1 );
  if (gid == NULL) {
    fprintf(stderr, "Failed spe_create_group(errno=%d)\n", errno);
    return -1;
  }

  /* The program is written for exactly eight SPEs. */
  if (spe_group_max (gid) < 8) {
    fprintf(stderr, "System doesn't have eight working SPEs.  I can't continue...\n");
    return -1;
  }

  /* Initialize barrier count to 0. */
  for (i=0; i<32; ++i) b[0][i] = b[1][i] = b[2][i] = 0;

  /*
   * Fill one control block per SPE.  Every SPE gets the same barrier
   * addresses and array addresses; only spu_num differs.  Each barrier
   * count is 8.
   */
  for (i = 0; i < 8; i++) {
    cb[i].spu_num = i;
    cb[i].ncycles = ncycles;
    cb[i].barrier1_count = (unsigned long long) 8;
    cb[i].barrier1_address = (unsigned long long) b[0];
    cb[i].barrier2_count = (unsigned long long) 8;
     cb[i].barrier2_address = (unsigned long long) b[1];
    cb[i].barrier3_count = (unsigned long long) 8;
    cb[i].barrier3_address = (unsigned long long) b[2];
    /* array pointers are stored as integers in unsigned int fields */
    cb[i].ar = (unsigned long) ar;
    cb[i].ai = (unsigned long) ai;
    cb[i].br = (unsigned long) br;
    cb[i].bi = (unsigned long) bi;
    cb[i].printflag = printflag;
  }

  /* allocate SPEs */

  fprintf(stderr, "ready to call (create) SPE threads\n"); fflush(stderr);

  /* Start one thread per SPE, passing the address of its control block. */
  for (i = 0; i < 8; i++)
    {
      speids[i] = spe_create_thread (gid, &fft_spu, (unsigned long long *) &cb[i], NULL, -1, 0);
      if (speids[i] == NULL)
        {
          fprintf (stderr, "FAILED: spe_create_thread(num=%d, errno=%d)\n", i, errno);
          exit (3+i);
        }
    }

  /* monitor progress while SPEs process time-to-frequency computations */

  PERFORM_TIME_TO_FREQUENCY_FFT

  /*
   * Everything from here to the matching #endif just before `return 0`
   * is compiled out when SIM_RUN is defined.
   */
#ifndef SIM_RUN

#ifdef FULLRUN
  printf("Checking frequency results...\n"); fflush(stdout);

  /* Print every element whose real or imaginary part lies outside +/-0.002. */
  for (i=0; i<NP; ++i) {
    if (ar[i] < -0.002 || ar[i] > 0.002 || ai[i] < -0.002 || ai[i] > 0.002) printf("a[%d] = (%10.3f, %9.3f)\n", i, ar[i], ai[i]);
  }
#endif

  /* monitor progress while SPEs process frequency-to-time computations */

  PERFORM_FREQUENCY_TO_TIME_FFT

#ifdef FULLRUN
  printf("Now checking results...\n"); fflush(stdout);

  /*
   * Round-trip check: track the smallest and largest value of
   * ar[i] - trigfunc(i) and of ai[i] (the input imaginary part was zero).
   */
  hi_real = -100.0;
  lo_real =  100.0;
  hi_imag = -100.0;
  lo_imag =  100.0;
  for (i=0; i<NP; ++i) {
    float x;
    x = trigfunc(i);
    if (ar[i] - x > hi_real) hi_real = ar[i] - x ;
    if (ar[i] - x < lo_real) lo_real = ar[i] - x ;
    if (ai[i]     > hi_imag) hi_imag = ai[i];
    if (ai[i]     < lo_imag) lo_imag = ai[i];
  }
  fprintf(stderr, "real err*r range = %f %f\n", lo_real, hi_real);
  fprintf(stderr, "imag err*r range = %f %f\n", lo_imag, hi_imag);
  /* Abort if any error falls outside +/-0.00003. */
  if (lo_real < -0.00003 || hi_real > 0.00003 || lo_imag < -0.00003 || hi_imag > 0.00003) {
    fprintf(stderr, "ERROR: range of error values too large...\n");
    fflush(stderr);
    return -1;
  }
#endif

  printf("START TIMING!\n");

  /* monitor timing runs */

  /* Each iteration is one full time->frequency->time round trip. */
  for (j=0; j<ncycles; ++j) {
    PERFORM_TIME_TO_FREQUENCY_FFT
    PERFORM_FREQUENCY_TO_TIME_FFT
  }

  printf("STOP TIMING!\n");

  /*
   * Send 0 to every SPE.
   * NOTE(review): presumably this tells the SPE program to finish --
   * confirm against the SPE code.
   */
  MBOX_SND(0)

  printf("Now checking results...\n"); fflush(stdout);

  /*
   * Same round-trip error scan as above, run after the timed cycles.
   * The ranges are only printed here; they are not compared against a
   * tolerance.
   */
  hi_real = -100.0;
  lo_real =  100.0;
  hi_imag = -100.0;
  lo_imag =  100.0;
  for (i=0; i<NP; ++i) {
    float x;
    x = trigfunc(i);
    if (ar[i] - x > hi_real) hi_real = ar[i] - x ;
    if (ar[i] - x < lo_real) lo_real = ar[i] - x ;
    if (ai[i]     > hi_imag) hi_imag = ai[i];
     if (ai[i]     < lo_imag) lo_imag = ai[i];
  }
  printf("real err*r range = %f %f\n", lo_real, hi_real);
  printf("imag err*r range = %f %f\n", lo_imag, hi_imag);
#endif
  return 0;
}


Target: SPE [This code section runs on the SPE side]


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                       */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                            */

/*                                                                */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                               */

#include "../fft.h"

#include <cos.h>

#include <sin.h>

#include <cbe_mfc.h>

#include <spu_mfcio.h>

#include <transpose_matrix4x4.h>

#include <profile.h>

#include <stdio.h>

 

/*
 * Sine lookup table: 16 rows, each holding 32 floats packed as eight
 * four-wide vectors, aligned to 128 bytes.
 *
 * Reading a row as a flat array of 32 values, entry k of row r equals
 * sin(2*pi*k / 2^(9+r)) for k = 0..31.  Row 0 therefore steps by
 * 2*pi/512 and row 15 steps by 2*pi/16777216, i.e. 2*pi/NP.
 *
 * NOTE(review): "5lsb" presumably refers to the 5-bit index k; confirm
 * against the code that reads this table.
 */
vector float sin5lsb[16][8] __attribute__ ((aligned (128))) = {
{ (vector float) (0.0000000e+00,  1.2271538e-02,  2.4541229e-02,  3.6807224e-02),
  (vector float) (4.9067676e-02,  6.1320737e-02,  7.3564567e-02,  8.5797310e-02),
  (vector float) (9.8017141e-02,  1.1022221e-01,  1.2241068e-01,  1.3458070e-01),
  (vector float) (1.4673047e-01,  1.5885815e-01,  1.7096189e-01,  1.8303989e-01),
  (vector float) (1.9509032e-01,  2.0711137e-01,  2.1910124e-01,  2.3105811e-01),
  (vector float) (2.4298018e-01,  2.5486565e-01,  2.6671275e-01,  2.7851969e-01),
  (vector float) (2.9028466e-01,  3.0200595e-01,  3.1368175e-01,  3.2531029e-01),
  (vector float) (3.3688986e-01,  3.4841868e-01,  3.5989505e-01,  3.7131721e-01)},
{ (vector float) (0.0000000e+00,  6.1358847e-03,  1.2271538e-02,  1.8406730e-02),
  (vector float) (2.4541229e-02,  3.0674804e-02,  3.6807224e-02,  4.2938258e-02),
  (vector float) (4.9067676e-02,  5.5195246e-02,  6.1320737e-02,  6.7443922e-02),
  (vector float) (7.3564567e-02,  7.9682440e-02,  8.5797310e-02,  9.1908954e-02),
  (vector float) (9.8017141e-02,  1.0412163e-01,  1.1022221e-01,  1.1631863e-01),
  (vector float) (1.2241068e-01,  1.2849811e-01,  1.3458070e-01,  1.4065824e-01),
  (vector float) (1.4673047e-01,  1.5279719e-01,  1.5885815e-01,  1.6491312e-01),
  (vector float) (1.7096189e-01,  1.7700422e-01,  1.8303989e-01,  1.8906866e-01)},
{ (vector float) (0.0000000e+00,  3.0679568e-03,  6.1358847e-03,  9.2037544e-03),
  (vector float) (1.2271538e-02,  1.5339206e-02,  1.8406730e-02,  2.1474080e-02),
  (vector float) (2.4541229e-02,  2.7608145e-02,  3.0674804e-02,  3.3741172e-02),
  (vector float) (3.6807224e-02,  3.9872926e-02,  4.2938258e-02,  4.6003181e-02),
  (vector float) (4.9067676e-02,  5.2131705e-02,  5.5195246e-02,  5.8258265e-02),
  (vector float) (6.1320737e-02,  6.4382628e-02,  6.7443922e-02,  7.0504576e-02),
  (vector float) (7.3564567e-02,  7.6623864e-02,  7.9682440e-02,  8.2740262e-02),
  (vector float) (8.5797310e-02,  8.8853553e-02,  9.1908954e-02,  9.4963498e-02)},
{ (vector float) (0.0000000e+00,  1.5339801e-03,  3.0679568e-03,  4.6019261e-03),
  (vector float) (6.1358847e-03,  7.6698288e-03,  9.2037544e-03,  1.0737659e-02),
  (vector float) (1.2271538e-02,  1.3805388e-02,  1.5339206e-02,  1.6872987e-02),
  (vector float) (1.8406730e-02,  1.9940428e-02,  2.1474080e-02,  2.3007682e-02),
  (vector float) (2.4541229e-02,  2.6074719e-02,  2.7608145e-02,  2.9141508e-02),
  (vector float) (3.0674804e-02,  3.2208025e-02,  3.3741172e-02,  3.5274237e-02),
  (vector float) (3.6807224e-02,  3.8340122e-02,  3.9872926e-02,  4.1405641e-02),
  (vector float) (4.2938258e-02,  4.4470772e-02,  4.6003181e-02,  4.7535483e-02)},
{ (vector float) (0.0000000e+00,  7.6699030e-04,  1.5339801e-03,  2.3009691e-03),
  (vector float) (3.0679568e-03,  3.8349426e-03,  4.6019261e-03,  5.3689070e-03),
  (vector float) (6.1358847e-03,  6.9028586e-03,  7.6698288e-03,  8.4367944e-03),
  (vector float) (9.2037544e-03,  9.9707097e-03,  1.0737659e-02,  1.1504602e-02),
  (vector float) (1.2271538e-02,  1.3038468e-02,  1.3805388e-02,  1.4572302e-02),
  (vector float) (1.5339206e-02,  1.6106103e-02,  1.6872987e-02,  1.7639864e-02),
  (vector float) (1.8406730e-02,  1.9173585e-02,  1.9940428e-02,  2.0707261e-02),
  (vector float) (2.1474080e-02,  2.2240888e-02,  2.3007682e-02,  2.3774462e-02)},
{ (vector float) (0.0000000e+00,  3.8349518e-04,  7.6699030e-04,  1.1504854e-03),
  (vector float) (1.5339801e-03,  1.9174748e-03,  2.3009691e-03,  2.6844630e-03),
  (vector float) (3.0679568e-03,  3.4514500e-03,  3.8349426e-03,  4.2184344e-03),
  (vector float) (4.6019261e-03,   4.9854168e-03,  5.3689070e-03,  5.7523963e-03),
  (vector float) (6.1358847e-03,  6.5193721e-03,  6.9028586e-03,  7.2863442e-03),
  (vector float) (7.6698288e-03,  8.0533121e-03,  8.4367944e-03,  8.8202748e-03),
  (vector float) (9.2037544e-03,  9.5872330e-03,  9.9707097e-03,  1.0354185e-02),
  (vector float) (1.0737659e-02,  1.1121131e-02,  1.1504602e-02,  1.1888071e-02)},
{ (vector float) (0.0000000e+00,  1.9174760e-04,  3.8349518e-04,  5.7524274e-04),
  (vector float) (7.6699030e-04,  9.5873786e-04,  1.1504854e-03,  1.3422328e-03),
  (vector float) (1.5339801e-03,  1.7257276e-03,  1.9174748e-03,  2.1092221e-03),
  (vector float) (2.3009691e-03,  2.4927163e-03,  2.6844630e-03,  2.8762100e-03),
  (vector float) (3.0679568e-03,  3.2597035e-03,  3.4514500e-03,  3.6431963e-03),
  (vector float) (3.8349426e-03,  4.0266886e-03,  4.2184344e-03,  4.4101803e-03),
  (vector float) (4.6019261e-03,  4.7936714e-03,  4.9854168e-03,  5.1771621e-03),
  (vector float) (5.3689070e-03,  5.5606519e-03,  5.7523963e-03,  5.9441407e-03)},
{ (vector float) (0.0000000e+00,  9.5873802e-05,  1.9174760e-04,  2.8762140e-04),
  (vector float) (3.8349518e-04,  4.7936899e-04,  5.7524274e-04,  6.7111652e-04),
  (vector float) (7.6699030e-04,  8.6286408e-04,  9.5873786e-04,  1.0546116e-03),
  (vector float) (1.1504854e-03,  1.2463591e-03,  1.3422328e-03,  1.4381065e-03),
  (vector float) (1.5339801e-03,  1.6298539e-03,  1.7257276e-03,  1.8216012e-03),
  (vector float) (1.9174748e-03,  2.0133485e-03,  2.1092221e-03,  2.2050955e-03),
  (vector float) (2.3009691e-03,  2.3968427e-03,  2.4927163e-03,  2.5885897e-03),
  (vector float) (2.6844630e-03,  2.7803367e-03,  2.8762100e-03,  2.9720834e-03)},
{ (vector float) (0.0000000e+00,  4.7936901e-05,  9.5873802e-05,  1.4381070e-04),
  (vector float) (1.9174760e-04,  2.3968449e-04,  2.8762140e-04,  3.3555829e-04),
  (vector float) (3.8349518e-04,  4.3143210e-04,  4.7936899e-04,  5.2730588e-04),
  (vector float) (5.7524274e-04,  6.2317966e-04,  6.7111652e-04,  7.1905344e-04),
  (vector float) (7.6699030e-04,  8.1492722e-04,  8.6286408e-04,  9.1080094e-04),
  (vector float) (9.5873786e-04,  1.0066747e-03,  1.0546116e-03,  1.1025484e-03),
  (vector float) (1.1504854e-03,  1.1984222e-03,  1.2463591e-03,  1.2942959e-03),
  (vector float) (1.3422328e-03,  1.3901696e-03,  1.4381065e-03,  1.4860433e-03)},
{ (vector float) (0.0000000e+00,  2.3968450e-05,  4.7936901e-05,  7.1905350e-05),
  (vector float) (9.5873802e-05,  1.1984225e-04,  1.4381070e-04,  1.6777914e-04),
  (vector float) (1.9174760e-04,  2.1571605e-04,  2.3968449e-04,  2.6365294e-04),
  (vector float) (2.8762140e-04,  3.1158983e-04,  3.3555829e-04,  3.5952675e-04),
  (vector float) (3.8349518e-04,  4.0746364e-04,  4.3143210e-04,  4.5540053e-04),
  (vector float) (4.7936899e-04,  5.0333742e-04,  5.2730588e-04,  5.5127434e-04),
  (vector float) (5.7524274e-04,  5.9921120e-04,  6.2317966e-04,  6.4714812e-04),
  (vector float) (6.7111652e-04,  6.9508498e-04,  7.1905344e-04,  7.4302190e-04)},
{ (vector float) (0.0000000e+00,  1.1984225e-05,  2.3968450e-05,  3.5952675e-05),
  (vector float) (4.7936901e-05,  5.9921123e-05,  7.1905350e-05,  8.3889572e-05),
  (vector float) (9.5873802e-05,  1.0785802e-04,  1.1984225e-04,  1.3182647e-04),
  (vector float) (1.4381070e-04,  1.5579493e-04,  1.6777914e-04,  1.7976337e-04),
  (vector float) (1.9174760e-04,  2.0373182e-04,  2.1571605e-04,  2.2770026e-04),
  (vector float) (2.3968449e-04,  2.5166871e-04,  2.6365294e-04,  2.7563717e-04),
  (vector float) (2.8762140e-04,  2.9960563e-04,  3.1158983e-04,  3.2357406e-04),
  (vector float) (3.3555829e-04,  3.4754252e-04,  3.5952675e-04,  3.7151098e-04)},
{ (vector float) (0.0000000e+00,  5.9921126e-06,  1.1984225e-05,  1.7976337e-05),
  (vector float) (2.3968450e-05,  2.9960562e-05,  3.5952675e-05,  4.1944786e-05),
  (vector float) (4.7936901e-05,  5.3929012e-05,  5.9921123e-05,  6.5913235e-05),
  (vector float) (7.1905350e-05,  7.7897465e-05,  8.3889572e-05,  8.9881687e-05),
  (vector float) (9.5873802e-05,  1.0186591e-04,  1.0785802e-04,  1.1385014e-04),
  (vector float) (1.1984225e-04,  1.2583435e-04,  1.3182647e-04,  1.3781858e-04),
  (vector float) (1.4381070e-04,  1.4980281e-04,  1.5579493e-04,  1.6178703e-04),
  (vector float) (1.6777914e-04,  1.7377126e-04,  1.7976337e-04,  1.8575549e-04)},
{ (vector float) (0.0000000e+00,  2.9960563e-06,  5.9921126e-06,  8.9881687e-06),
  (vector float) (1.1984225e-05,  1.4980281e-05,  1.7976337e-05,  2.0972393e-05),
  (vector float) (2.3968450e-05,  2.6964506e-05,  2.9960562e-05,  3.2956617e-05),
  (vector float) (3.5952675e-05,  3.8948732e-05,  4.1944786e-05,  4.4940844e-05),
  (vector float) (4.7936901e-05,  5.0932955e-05,  5.3929012e-05,  5.6925070e-05),
  (vector float) (5.9921123e-05,  6.2917177e-05,  6.5913235e-05,  6.8909292e-05),
  (vector float) (7.1905350e-05,  7.4901407e-05,  7.7897465e-05,  8.0893515e-05),
  (vector float) (8.3889572e-05,  8.6885630e-05,  8.9881687e-05,  9.2877744e-05)},
{ (vector float) (0.0000000e+00,  1.4980282e-06,  2.9960563e-06,  4.4940844e-06),
  (vector float) (5.9921126e-06,  7.4901404e-06,  8.9881687e-06,  1.0486197e-05),
  (vector float) (1.1984225e-05,  1.3482253e-05,  1.4980281e-05,  1.6478309e-05),
  (vector float) (1.7976337e-05,  1.9474366e-05,  2.0972393e-05,  2.2470422e-05),
  (vector float) (2.3968450e-05,  2.5466477e-05,  2.6964506e-05,  2.8462535e-05),
  (vector float) (2.9960562e-05,  3.1458589e-05,  3.2956617e-05,  3.4454646e-05),
  (vector float) (3.5952675e-05,  3.7450704e-05,  3.8948732e-05,  4.0446757e-05),
  (vector float) (4.1944786e-05,  4.3442815e-05,  4.4940844e-05,  4.6438872e-05)},
{ (vector float) (0.0000000e+00,  7.4901408e-07,  1.4980282e-06,  2.2470422e-06),
  (vector float) (2.9960563e-06,  3.7450702e-06,  4.4940844e-06,  5.2430983e-06),
  (vector float) (5.9921126e-06,  6.7411265e-06,  7.4901404e-06,  8.2391543e-06),
  (vector float) (8.9881687e-06,  9.7371831e-06,  1.0486197e-05,  1.1235211e-05),
  (vector float) (1.1984225e-05,  1.2733239e-05,  1.3482253e-05,  1.4231267e-05),
  (vector float) (1.4980281e-05,  1.5729294e-05,  1.6478309e-05,  1.7227323e-05),
  (vector float) (1.7976337e-05,  1.8725352e-05,  1.9474366e-05,  2.0223379e-05),
  (vector float) (2.0972393e-05,  2.1721407e-05,  2.2470422e-05,  2.3219436e-05)},
{ (vector float) (0.0000000e+00,  3.7450704e-07,  7.4901408e-07,  1.1235211e-06),
  (vector float) (1.4980282e-06,  1.8725351e-06,  2.2470422e-06,  2.6215491e-06),
  (vector float) (2.9960563e-06,  3.3705633e-06,  3.7450702e-06,  4.1195772e-06),
  (vector float) (4.4940844e-06,  4.8685915e-06,  5.2430983e-06,  5.6176054e-06),
  (vector float) (5.9921126e-06,  6.3666193e-06,  6.7411265e-06,  7.1156337e-06),
  (vector float) (7.4901404e-06,  7.8646472e-06,  8.2391543e-06,  8.6136615e-06),
  (vector float) (8.9881687e-06,  9.3626759e-06,  9.7371831e-06,  1.0111689e-05),
  (vector float) (1.0486197e-05,  1.0860704e-05,  1.1235211e-05,  1.1609718e-05)}};

 

vector float cos5lsb[16][8] __attribute__ ((aligned (128))) = {

{ (vector float) (1.0000000e+00,  9.9992472e-01,  9.9969882e-01,  9.9932235e-01),

  (vector float) (9.9879545e-01,  9.9811810e-01,  9.9729043e-01,  9.9631262e-01),

  (vector float) (9.9518472e-01,  9.9390697e-01,  9.9247956e-01,  9.9090266e-01),

  (vector float) (9.8917651e-01,  9.8730141e-01,  9.8527765e-01,  9.8310548e-01),

  (vector float) (9.8078525e-01,  9.7831738e-01,  9.7570211e-01,  9.7293997e-01),

  (vector float) (9.7003126e-01,  9.6697646e-01,  9.6377605e-01,  9.6043050e-01),

  (vector float) (9.5694035e-01,  9.5330602e-01,  9.4952816e-01,  9.4560730e-01),

  (vector float) (9.4154406e-01,  9.3733901e-01,  9.3299282e-01,  9.2850608e-01)},

{ (vector float) (1.0000000e+00,  9.9998116e-01,  9.9992472e-01,  9.9983060e-01),

  (vector float) (9.9969882e-01,  9.9952942e-01,  9.9932235e-01,  9.9907774e-01),

  (vector float) (9.9879545e-01,  9.9847555e-01,  9.9811810e-01,  9.9772304e-01),

  (vector float) (9.9729043e-01,  9.9682027e-01,  9.9631262e-01,  9.9576741e-01),

  (vector float) (9.9518472e-01,  9.9456459e-01,  9.9390697e-01,  9.9321193e-01),

  (vector float) (9.9247956e-01,  9.9170977e-01,  9.9090266e-01,  9.9005818e-01),

  (vector float) (9.8917651e-01,  9.8825759e-01,  9.8730141e-01,  9.8630810e-01),

  (vector float) (9.8527765e-01,  9.8421007e-01,  9.8310548e-01,  9.8196387e-01)},

{ (vector float) (1.0000000e+00,  9.9999529e-01,  9.9998116e-01,  9.9995762e-01),

  (vector float) (9.9992472e-01,  9.9988234e-01,  9.9983060e-01,  9.9976939e-01),

  (vector float) (9.9969882e-01,  9.9961883e-01,  9.9952942e-01,  9.9943060e-01),

  (vector float) (9.9932235e-01,  9.9920475e-01,  9.9907774e-01,  9.9894130e-01),

  (vector float) (9.9879545e-01,  9.9864024e-01,  9.9847555e-01,  9.9830157e-01),

  (vector float) (9.9811810e-01,  9.9792528e-01,  9.9772304e-01,  9.9751145e-01),

  (vector float) (9.9729043e-01,  9.9706006e-01,  9.9682027e-01,  9.9657112e-01),

  (vector float) (9.9631262e-01,  9.9604470e-01,  9.9576741e-01,  9.9548078e-01)},

{ (vector float) (1.0000000e+00,  9.9999881e-01,  9.9999529e-01,  9.9998939e-01),

  (vector float) (9.9998116e-01,  9.9997061e-01,  9.9995762e-01,  9.9994236e-01),

  (vector float) (9.9992472e-01,  9.9990469e-01,  9.9988234e-01,  9.9985766e-01),

  (vector float) (9.9983060e-01,  9.9980116e-01,  9.9976939e-01,  9.9973530e-01),

  (vector float) (9.9969882e-01,  9.9966002e-01,  9.9961883e-01,  9.9957532e-01),

  (vector float) (9.9952942e-01,  9.9948120e-01,  9.9943060e-01,  9.9937767e-01),

  (vector float) (9.9932235e-01,  9.9926478e-01,  9.9920475e-01,  9.9914241e-01),

  (vector float) (9.9907774e-01,  9.9901068e-01,  9.9894130e-01,  9.9886954e-01)},

{ (vector float) (1.0000000e+00,  9.9999970e-01,  9.9999881e-01,  9.9999738e-01),

  (vector float) (9.9999529e-01,  9.9999267e-01,  9.9998939e-01,  9.9998558e-01),

  (vector float) (9.9998116e-01,  9.9997616e-01,  9.9997061e-01,  9.9996442e-01),

  (vector float) (9.9995762e-01,  9.9995029e-01,  9.9994236e-01,  9.9993384e-01),

  (vector float) (9.9992472e-01,  9.9991500e-01,  9.9990469e-01,  9.9989384e-01),

  (vector float) (9.9988234e-01,  9.9987030e-01,  9.9985766e-01,  9.9984443e-01),

  (vector float) (9.9983060e-01,  9.9981618e-01,  9.9980116e-01,  9.9978560e-01),

  (vector float) (9.9976939e-01,  9.9975264e-01,  9.9973530e-01,  9.9971735e-01)},

{ (vector float) (1.0000000e+00,  9.9999994e-01,  9.9999970e-01,  9.9999934e-01),

  (vector float) (9.9999881e-01,  9.9999815e-01,  9.9999738e-01,  9.9999642e-01),

  (vector float) (9.9999529e-01,   9.9999404e-01,  9.9999267e-01,  9.9999112e-01),

  (vector float) (9.9998939e-01,  9.9998760e-01,  9.9998558e-01,  9.9998343e-01),

  (vector float) (9.9998116e-01,  9.9997872e-01,  9.9997616e-01,  9.9997348e-01),

  (vector float) (9.9997061e-01,  9.9996758e-01,  9.9996442e-01,  9.9996108e-01),

  (vector float) (9.9995762e-01,  9.9995404e-01,  9.9995029e-01,  9.9994642e-01),

  (vector float) (9.9994236e-01,  9.9993813e-01,  9.9993384e-01,  9.9992931e-01)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  9.9999994e-01,  9.9999982e-01),

  (vector float) (9.9999970e-01,  9.9999952e-01,  9.9999934e-01,  9.9999911e-01),

  (vector float) (9.9999881e-01,  9.9999851e-01,  9.9999815e-01,  9.9999779e-01),

  (vector float) (9.9999738e-01,  9.9999690e-01,  9.9999642e-01,  9.9999589e-01),

  (vector float) (9.9999529e-01,  9.9999470e-01,  9.9999404e-01,  9.9999338e-01),

  (vector float) (9.9999267e-01,  9.9999189e-01,  9.9999112e-01,  9.9999028e-01),

  (vector float) (9.9998939e-01,  9.9998850e-01,  9.9998760e-01,  9.9998659e-01),

  (vector float) (9.9998558e-01,  9.9998456e-01,  9.9998343e-01,  9.9998236e-01)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  9.9999994e-01),

  (vector float) (9.9999994e-01,  9.9999988e-01,  9.9999982e-01,  9.9999976e-01),

  (vector float) (9.9999970e-01,  9.9999964e-01,  9.9999952e-01,  9.9999946e-01),

  (vector float) (9.9999934e-01,  9.9999923e-01,  9.9999911e-01,  9.9999899e-01),

  (vector float) (9.9999881e-01,  9.9999869e-01,  9.9999851e-01,  9.9999833e-01),

  (vector float) (9.9999815e-01,  9.9999797e-01,  9.9999779e-01,  9.9999756e-01),

  (vector float) (9.9999738e-01,  9.9999714e-01,  9.9999690e-01,  9.9999666e-01),

  (vector float) (9.9999642e-01,  9.9999613e-01,  9.9999589e-01,  9.9999559e-01)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  9.9999994e-01,  9.9999994e-01),

  (vector float) (9.9999994e-01,  9.9999988e-01,  9.9999988e-01,  9.9999988e-01),

  (vector float) (9.9999982e-01,  9.9999982e-01,  9.9999976e-01,  9.9999976e-01),

  (vector float) (9.9999970e-01,  9.9999964e-01,  9.9999964e-01,  9.9999958e-01),

  (vector float) (9.9999952e-01,  9.9999946e-01,  9.9999946e-01,  9.9999940e-01),

  (vector float) (9.9999934e-01,  9.9999928e-01,  9.9999923e-01,  9.9999917e-01),

  (vector float) (9.9999911e-01,  9.9999905e-01,  9.9999899e-01,  9.9999887e-01)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  9.9999994e-01),

  (vector float) (9.9999994e-01,  9.9999994e-01,  9.9999994e-01,  9.9999994e-01),

  (vector float) (9.9999994e-01,  9.9999994e-01,  9.9999988e-01,  9.9999988e-01),

  (vector float) (9.9999988e-01,  9.9999988e-01,  9.9999988e-01,  9.9999982e-01),

  (vector float) (9.9999982e-01,  9.9999982e-01,  9.9999982e-01,  9.9999976e-01),

  (vector float) (9.9999976e-01,  9.9999976e-01,  9.9999976e-01,  9.9999970e-01)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  9.9999994e-01,  9.9999994e-01,  9.9999994e-01),

  (vector float) (9.9999994e-01,  9.9999994e-01,  9.9999994e-01,  9.9999994e-01),

  (vector float) (9.9999994e-01,  9.9999994e-01,  9.9999994e-01,  9.9999994e-01)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,   1.0000000e+00,  1.0000000e+00,  1.0000000e+00)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00)},

{ (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00),

  (vector float) (1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00)}};

 

float sin256[256] __attribute__ ((aligned (128))) = {

  0.0000000e+00,  1.2271538e-02,  2.4541229e-02,  3.6807224e-02,

  4.9067676e-02,   6.1320737e-02,  7.3564567e-02,  8.5797310e-02,

  9.8017141e-02,  1.1022221e-01,  1.2241068e-01,  1.3458070e-01,

  1.4673047e-01,  1.5885815e-01,  1.7096189e-01,  1.8303989e-01,

  1.9509032e-01,  2.0711137e-01,  2.1910124e-01,  2.3105811e-01,

  2.4298018e-01,  2.5486565e-01,  2.6671275e-01,  2.7851969e-01,

  2.9028466e-01,  3.0200595e-01,  3.1368175e-01,  3.2531029e-01,

  3.3688986e-01,  3.4841868e-01,  3.5989505e-01,  3.7131721e-01,

  3.8268343e-01,  3.9399204e-01,  4.0524131e-01,  4.1642955e-01,

  4.2755508e-01,  4.3861625e-01,  4.4961134e-01,  4.6053872e-01,

  4.7139674e-01,  4.8218378e-01,  4.9289820e-01,  5.0353837e-01,

  5.1410276e-01,  5.2458966e-01,  5.3499764e-01,  5.4532498e-01,

  5.5557024e-01,  5.6573182e-01,  5.7580817e-01,  5.8579785e-01,

  5.9569931e-01,  6.0551107e-01,  6.1523157e-01,  6.2485951e-01,

  6.3439327e-01,  6.4383155e-01,  6.5317285e-01,  6.6241580e-01,

  6.7155898e-01,  6.8060100e-01,  6.8954057e-01,  6.9837624e-01,

  7.0710677e-01,  7.1573085e-01,  7.2424710e-01,  7.3265427e-01,

    7.4095112e-01,  7.4913639e-01,  7.5720882e-01,  7.6516724e-01,

  7.7301043e-01,  7.8073722e-01,  7.8834641e-01,  7.9583693e-01,

  8.0320752e-01,  8.1045717e-01,  8.1758481e-01,  8.2458931e-01,

  8.3146960e-01,  8.3822471e-01,  8.4485358e-01,  8.5135520e-01,

  8.5772860e-01,  8.6397284e-01,  8.7008697e-01,  8.7607008e-01,

  8.8192129e-01,  8.8763964e-01,  8.9322430e-01,  8.9867449e-01,

  9.0398932e-01,  9.0916800e-01,  9.1420978e-01,  9.1911387e-01,

  9.2387950e-01,  9.2850608e-01,  9.3299282e-01,  9.3733901e-01,

  9.4154406e-01,  9.4560730e-01,  9.4952816e-01,  9.5330602e-01,

  9.5694035e-01,  9.6043050e-01,  9.6377605e-01,  9.6697646e-01,

  9.7003126e-01,  9.7293997e-01,  9.7570211e-01,  9.7831738e-01,

  9.8078525e-01,  9.8310548e-01,  9.8527765e-01,  9.8730141e-01,

  9.8917651e-01,  9.9090266e-01,  9.9247956e-01,  9.9390697e-01,

  9.9518472e-01,  9.9631262e-01,  9.9729043e-01,  9.9811810e-01,

  9.9879545e-01,  9.9932235e-01,  9.9969882e-01,  9.9992472e-01,

  1.0000000e+00,  9.9992472e-01,  9.9969882e-01,  9.9932235e-01,

  9.9879545e-01,  9.9811810e-01,  9.9729043e-01,  9.9631262e-01,

  9.9518472e-01,  9.9390697e-01,  9.9247956e-01,  9.9090266e-01,

  9.8917651e-01,  9.8730141e-01,  9.8527765e-01,  9.8310548e-01,

  9.8078525e-01,  9.7831738e-01,  9.7570211e-01,  9.7293997e-01,

  9.7003126e-01,  9.6697646e-01,  9.6377605e-01,  9.6043050e-01,

  9.5694035e-01,  9.5330602e-01,  9.4952816e-01,  9.4560730e-01,

  9.4154406e-01,  9.3733901e-01,  9.3299282e-01,  9.2850608e-01,

  9.2387950e-01,  9.1911387e-01,  9.1420978e-01,  9.0916800e-01,

  9.0398932e-01,  8.9867449e-01,  8.9322430e-01,  8.8763964e-01,

  8.8192129e-01,  8.7607008e-01,  8.7008697e-01,  8.6397284e-01,

  8.5772860e-01,  8.5135520e-01,  8.4485358e-01,  8.3822471e-01,

  8.3146960e-01,  8.2458931e-01,  8.1758481e-01,  8.1045717e-01,

  8.0320752e-01,  7.9583693e-01,  7.8834641e-01,  7.8073722e-01,

  7.7301043e-01,  7.6516724e-01,  7.5720882e-01,  7.4913639e-01,

  7.4095112e-01,  7.3265427e-01,  7.2424710e-01,  7.1573085e-01,

  7.0710677e-01,  6.9837624e-01,  6.8954057e-01,  6.8060100e-01,

  6.7155898e-01,  6.6241580