Cell SDK Code Sample: Parallel Matrix Multiplication Workload



%% --------------------------------------------------------------

%% (C) Copyright 2001,2005,                                      

%% International Business Machines Corporation,                  

%% Sony Computer Entertainment Incorporated,                      

%% Toshiba Corporation.                                          

%%                                                               

%% All Rights Reserved.                                          

%% --------------------------------------------------------------

%% PROLOG END TAG zYx                                             

Summary: Parallel Matrix Multiplication workload for CBE

 

Target: CBE/Linux

 

Description:

 

            This workload calculates C = A * B which A, B and C are N x N squared

            matrices comprised of single float values. It uses block partitioning

            algorithm for reducing required bandwidth. The block size(M) is fixed

            at 64.

 

            This workload is highly optimized and demonstrates near theoretical

            peak system performance as long as huge pages are used to store

            the matrices. The program needs one 16MB pages.

 

            The SPE arbitrate for work using an atomic counter where the counter

            specifies the output block for the SPE to compute.

 

            The workload supports many command line options. The command line

            options include:

 

            *  The ability to specify the size of the matrix. 64x64, 128x128,

               256x256, 512x512, or 1024x1024.

            *  The ability to specify the number of times the matrices are

               multiplied.

            *  The ability to specify the number of SPE's used to perform the

               calculations.

            *  The ability to enabled validation of the result.

 

How to run:

 

            To obtain usage information:

               matrix_mul -h

            To run 3 matrix multiplies of 128x128 on 5 SPE and verify the

            results:

               matrix_mul -i 3 -m 128 -s 5 -v

   

Target: PPE [This code section runs on the PPE side]


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                       */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                           */

/*                                                                */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                              */

#include <stdlib.h>

#include <unistd.h>

#include <stdio.h>

#include <fcntl.h>

#include <fenv.h>

#include <assert.h>

#include <sys/types.h>

#include <sys/mman.h>

#include <libspe.h>

#include <libmisc.h>

#include <math.h>

#include <string.h>

#include <errno.h>

#include <sched.h>

 

 

#define M                     64        /* basic block size */

#define MAX_BLOCKS         256

#define MAX_SPUS    8

#define ERR_THRESHOLD    0.01

 

extern spe_program_handle_t block;

extern int daxpy(double *, double *, double *, int, int, double, double, double);

 

float *tmp = NULL;

 

char *mem_file = "/huge/daxpy_mem.bin";

char *mem_addr = NULL;

 

void print_usage(char * name)

{

  printf("USAGE: %s [options]\n", name);

  printf("       Valid Options include:\n");

  printf("         -i #   : perform specified number of iterations. Default is 1\n");

  printf("         -h     : output this usage help screen\n");

  printf("         -m #   : perform maxtrix multiply of matrices of size #. Valid sizes\n");

  printf("                  are 64, 128, 256, 512, and 1024. Default is 64.\n");

  printf("         -s #   : run with # number of SPUs. The dafault is 1 SPU.\n");

  printf("         -v     : verify the final matrix multiply result.\n");

  printf("         -V     : verbose output.\n");

  exit(1);

}

 

void send_mail(speid_t id, unsigned int data)

{

  while (spe_write_in_mbox(id, data));

}

 

 

void block_swizzle(float *mat, int size)

{

  int i, j, k, l;

  float *ptr, *src, *dst;

 

  ptr = tmp;

  for(i=0; i<(size/M); i++) {

    for(j=0; j<(size/M); j++){

      for(k=0; k<M; k++) {

            for(l=0; l<M; l++) {

              *ptr++ = mat[i*(M*size)+j*M+k*size+l];

            }

      }

    }

  }

 

  src = tmp;

  dst = mat;

  for(i=0; i<size*size; i++) *dst++ = *src++;

}

 

 

int main(int argc, char *argv[])

{

  int i, j, k;

  int verify = 0;

  int verbose = 0;

  int fail = 0;

  int msize = M, blocks;

  int huge_size;

  int spus = 1;

  int iterations = 1;

  spe_gid_t spe_gid;

  speid_t spe_id[MAX_SPUS];

  float *a, *b, *c, *exp;

  double *x, *y, *z;

  int *cntr;

  unsigned int offset;

  int fmem;

  int status;

 

  for (i=1; i<argc; i++) {

    if (*argv[i] == '-') {

      switch (*(argv[i]+1)) {

      case 'i':

            i++;

            if (i < argc) {

              iterations = atoi(argv[i]);

              if (iterations < 0) iterations = 0;

            } else {

              printf("ERROR: Number of iterations is not specified.\n");

              print_usage(argv[0]);

            }

            break;

      case 'm':                 /* Run matrix of specified size */

            i++;

            if (i < argc) {

              msize = atoi(argv[i]);

              if ((msize < 64) || (msize > 1024) || (msize & (msize - 1))) {

                printf("ERROR: Invalid matrix size of %d specified\n", msize);

                print_usage(argv[0]);

              }

            } else {

              printf("ERROR: Matrix size is not specifed\n");

              print_usage(argv[0]);

            }

            break;

      case 's':                   /* Run on specified number of SPUs */

            i++;

            if (i < argc) {

              spus = atoi(argv[i]);

              if (spus > MAX_SPUS) spus = MAX_SPUS;

            } else {

              printf("ERROR: Number of SPUs to use is not specified.\n");

              print_usage(argv[0]);

            }

            break;

      case 'v':                  /* Let the PPU verify the final product result. */

            verify = 1;

            break;

      case 'V':                 /* Verbose output */

            verbose = 1;

            break;

      case 'h':

      default:

            print_usage(argv[0]);

            break;

      }

    } else {

      print_usage(argv[0]);

    }

  }

 

  blocks = (msize/M) * (msize/M);

 

  spe_gid = spe_create_group(SCHED_OTHER, 0, 1);

 

  /* Allocate memory buffers.

   */

#define BASE              0x8000

#define ARRAY_SIZE 0x2A80

#define LOOP_COUNT          ((ARRAY_SIZE / 64) - 1)

#define HUGE_PAGE_SIZE    16*1024*1024

 

  offset = 0;

 

  a = (float *)offset;

  offset += sizeof(float) * msize * msize;

  b = (float *)(offset);

  offset += sizeof(float) * msize * msize;

  c = (float *)(offset);

  offset += sizeof(float) * msize * msize;

  cntr = (int *)(offset);

  offset += 128;

 

  /* Create a large contiguous memory buffer by allocating a large

   * page (or more). Large page memory will also reduce the TLB thrashing.

   */

  if ((fmem = open (mem_file, O_CREAT | O_RDWR, 0755)) == -1) {

    printf("WARNING: unable to open file %s (errno=%d). Using malloc heap.\n", mem_file, errno);

 

    mem_addr = malloc_align(offset, 7);

    if (mem_addr == NULL) {

      printf("ERROR: unable to malloc %d byte buffer.\n", offset);

      exit(1);

    }

  } else {

 

    huge_size = (offset + HUGE_PAGE_SIZE-1) & ~(HUGE_PAGE_SIZE-1);

 

    assert(HUGE_PAGE_SIZE > 32*1024);

    mem_addr = (char *) mmap (0, huge_size, PROT_READ | PROT_WRITE, MAP_SHARED, fmem, 0);

    if (mem_addr == MAP_FAILED) {

      printf("ERROR: unable to mmap file %s (errno=%d).\n", mem_file, errno);

      close (fmem);

      exit (1);

    }

  }

  /* Correct the buffer pointers

   */

  x = (double *)(mem_addr + (unsigned int)x);

  y = (double *)(mem_addr + (unsigned int)y);

  z = (double *)(mem_addr + (unsigned int)z);

  a = (float  *)(mem_addr + (unsigned int)a);

  b = (float  *)(mem_addr + (unsigned int)b);

  c = (float  *)(mem_addr + (unsigned int)c);

  cntr = (int *)(mem_addr + (unsigned int)cntr);

 

  /* Construct matrices and expected results

   */

  exp = (float *)malloc(sizeof(float) * msize * msize);

  tmp = (float *)malloc(sizeof(float) * msize * msize);

  if ((exp == NULL) || (tmp == NULL)) {

    printf("ERROR: failed to allocated verification buffers exp and tmp\n");

    exit(1);

  }

 

  /* Initialize the matrix data and counter.

   */

  printf("Initializing Arrays ... "); fflush(stdout);

  for(i=0; i<msize; i++) {

    for(j=0; j<msize; j++) {

      a[i*msize+j] = rand_0_to_1();

      b[i*msize+j] = rand_0_to_1();

      c[i*msize+j] = 0.0f;

    }

  }

  printf("done\n"); fflush(stdout);

  *cntr = 0;

 

  if (verify) {

    /* Compute expected result.

     */

    printf("Computing expected results ... "); fflush(stdout);

    for (i=0; i<msize; i++) {

      for (j=0; j<msize; j++) {

            exp[i*msize+j] = 0.0f;

            for (k=0; k<msize; k++) {

              exp[i*msize+j] += a[i*msize+k] * b[k*msize+j];

            }

      }

    }

    /* Swizzle inputs matrices and expected results to correspond to tiling.

     */

    block_swizzle(a, msize);

    block_swizzle(b, msize);

    block_swizzle(exp, msize);

 

    printf("done\n"); fflush(stdout);

  }

 

  printf("Running test ... "); fflush(stdout);

  /* Start all the SPUs computing the matrix product.

   */

  /* Create each of the SPU threads

   */

  for (i=0; i<spus; i++) {

    spe_id[i] = spe_create_thread(spe_gid, &block, 0, 0, -1, 0);

    if (spe_id[i] == NULL) {

      printf("INTERNAL ERROR: failed to create spu thread %d. Error = %s\n", i, strerror(errno));

      exit(1);

    }

  }

 

  *cntr = 0;

 

  /* Send each of the SPUs the input parameters

   */

  for (i=0; i<spus; i++) {

    send_mail(spe_id[i], (unsigned int)msize);

    send_mail(spe_id[i], (unsigned int)iterations);

    send_mail(spe_id[i], (unsigned int)cntr);

    send_mail(spe_id[i], (unsigned int)a);

    send_mail(spe_id[i], (unsigned int)b);

    send_mail(spe_id[i], (unsigned int)c);

  }

 

 

  /* Wait for the SPUs to complete

   */

  for (i=0; i<spus; i++) {

    (void)spe_wait(spe_id[i], &status, 0);

  }

 

  printf("done\n");

 

  /* Output SPU, matrix multiply results

   */

  if (verify) {

    float delta;

 

    printf("Verifying the results ... ");

 

    /* Verify the resulting matrix output.

     */

    for (i=0; i<msize; i++) {

      for (j=0; j<msize; j++) {

            delta = exp[i*msize+j] - c[i*msize+j];

            if (delta < 0.0f) delta = -delta;

            if (delta > ERR_THRESHOLD) {

              if (verbose) printf("  %d %d exp=%f got=%f\n", i, j, exp[i*msize+j], c[i*msize+j]);

              fail++;

            }

      }

    }

     

    if (fail) {

      printf("FAILED (%d)\n", fail);

      fail = 1;

    } else {

      printf("PASSED\n");

    }

    }

 

  return (fail);

}


 Target: SPE [This code section runs on the SPE side]


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                       */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                           */

/*                                                                */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                              */

 

/*

  * Matrix Multiply --- EUC

  *

  * block.c - Matrix Multiply with block partitioning

  */

 

#include <spu_intrinsics.h>

#include <stdlib.h>

#include <vec_literal.h>

#include <cbe_mfc.h>

#include <spu_mfcio.h>

#include "atomic_inc_return.h"

 

#ifndef M

#define M                     64                    /* Size of the matrix block - M x M */

#endif

 

#define MAX_N                      1024

#define MAX_TILES   (MAX_N / M)

 

static unsigned int N;

static unsigned int ITER;

static unsigned int A_AREA, B_AREA, C_AREA;

static atomic_ea_t  FINC_AREA2;

 

#define DIN_TAG   0

#define DOUT_TAG  2

 

#define DMA_Wait(TAG_ID)                          \

{                                                                      \

  mfc_write_tag_mask((1<<(TAG_ID)));                      \

  mfc_read_tag_status_all();                              \

}

 

#define SwapInBuf()                                         \

{                                                                      \

  OpA = InTag ? blkA1 : blkA0;                                   \

  OpB = InTag ? blkB1 : blkB0;                                    \

  InTag ^= 1;                                                    \

  InA = InTag ? blkA1 : blkA0;                         \

  InB = InTag ? blkB1 : blkB0;                          \

}

 

#define SwapOutBuf()                                      \

{                                                                      \

  OpC = (OutTag==DOUT_TAG) ? blkC1 : blkC0;     \

  OutC = (OutTag==DOUT_TAG) ? blkC1 : blkC0;    \

  OutTag ^= 1;                                                 \

}

 

#ifdef _XLC

 

#define ALIGN8B

#define SPU_FMA(RT,RA,RB,RC)     RT = spu_madd(RA, RB, RC)

#define SPU_FM(RT,RA,RB) RT = spu_mul(RA, RB)

#define SPU_LNOP

 

#else

 

#define ALIGN8B        asm volatile(".align 3")

 

#define SPU_FMA(RT,RA,RB,RC)                                                                 \

  asm volatile(".align 3");                                                                        \

  asm volatile("fma %0,%1,%2,%3":"=r"(RT):"r"(RA),"r"(RB),"r"(RC))

 

#define SPU_FM(RT,RA,RB)                                      \

  asm volatile(".align 3");                                                \

  asm volatile("fm %0,%1,%2":"=r"(RT):"r"(RA),"r"(RB))

 

#define SPU_LNOP    asm volatile("lnop")

 

 

#endif

 

 

 

#define StageCBAclr(OFFSET)                                                                                                           \

{                                                                                                                                              \

  ALIGN8B;                                                                                                                             \

  SPU_FM(c0_0B,a00,b0_0B);                                                                                                           \

  SPU_FM(c1_0B,a10,b0_0B);                                                                                                           \

  SPU_FM(c2_0B,a20,b0_0B);                                                                                                           \

  SPU_FM(c3_0B,a30,b0_0B);                                                                                                           \

  SPU_FM(c0_1B,a00,b0_1B);                                                                                                           \

  SPU_FM(c1_1B,a10,b0_1B);                                                                                                           \

  SPU_FM(c2_1B,a20,b0_1B);                                                                                                           \

  SPU_FM(c3_1B,a30,b0_1B);                                                                                                           \

  SPU_FMA(c0_0B,a01,b1_0B,c0_0B);                                                                                             \

  SPU_FMA(c1_0B,a11,b1_0B,c1_0B);                                                                                             \

  SPU_FMA(c2_0B,a21,b1_0B,c2_0B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+16));                  \

  SPU_FMA(c3_0B,a31,b1_0B,c3_0B); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1B,a01,b1_1B,c0_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+16));            \

  SPU_FMA(c1_1B,a11,b1_1B,c1_1B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+16));        \

  SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+16));        \

  SPU_FMA(c3_1B,a31,b1_1B,c3_1B); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+20));                  \

  SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+20));            \

  SPU_FMA(c2_0B,a22,b2_0B,c2_0B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+20));        \

  SPU_FMA(c3_0B,a32,b2_0B,c3_0B); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+20));        \

  SPU_FMA(c1_1B,a12,b2_1B,c1_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A;                        \

  SPU_FMA(c2_1B,a22,b2_1B,c2_1B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A;                  \

  SPU_FMA(c3_1B,a32,b2_1B,c3_1B); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A;  \

  SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A;  \

  SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A;                    \

  SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A;  \

  SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A;          \

  SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A;          \

  SPU_FMA(c3_1B,a33,b3_1B,c3_1B); SPU_LNOP;                                                                                   \

}

 

#define StageACBclr(OFFSET)                                                                                                           \

{                                                                                                                                              \

  SPU_FM(c0_0C,a00,b0_0C);                                                                                                           \

  SPU_FM(c1_0C,a10,b0_0C);                                                                                                           \

  SPU_FM(c2_0C,a20,b0_0C);                                                                                                           \

  SPU_FM(c3_0C,a30,b0_0C);                                                                                                           \

  SPU_FM(c0_1C,a00,b0_1C);                                                                                                           \

  SPU_FM(c1_1C,a10,b0_1C);                                                                                                           \

  SPU_FM(c2_1C,a20,b0_1C);                                                                                                           \

  SPU_FM(c3_1C,a30,b0_1C);                                                                                                           \

  SPU_FMA(c0_0C,a01,b1_0C,c0_0C);                                                                                            \

  SPU_FMA(c1_0C,a11,b1_0C,c1_0C);                                                                                            \

  SPU_FMA(c2_0C,a21,b1_0C,c2_0C); b0_0A = *((volatile vector float *)(ptrB+OFFSET+16));                 \

  SPU_FMA(c3_0C,a31,b1_0C,c3_0C); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1C,a01,b1_1C,c0_1C); b1_0A = *((volatile vector float *)(ptrB+M+OFFSET+16));           \

  SPU_FMA(c1_1C,a11,b1_1C,c1_1C); b2_0A = *((volatile vector float *)(ptrB+2*M+OFFSET+16));       \

  SPU_FMA(c2_1C,a21,b1_1C,c2_1C); b3_0A = *((volatile vector float *)(ptrB+3*M+OFFSET+16));       \

  SPU_FMA(c3_1C,a31,b1_1C,c3_1C); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_0C,a02,b2_0C,c0_0C); b0_1A = *((volatile vector float *)(ptrB+OFFSET+20));                 \

  SPU_FMA(c1_0C,a12,b2_0C,c1_0C); b1_1A = *((volatile vector float *)(ptrB+M+OFFSET+20));           \

  SPU_FMA(c2_0C,a22,b2_0C,c2_0C); b2_1A = *((volatile vector float *)(ptrB+2*M+OFFSET+20));       \

  SPU_FMA(c3_0C,a32,b2_0C,c3_0C); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1C,a02,b2_1C,c0_1C); b3_1A = *((volatile vector float *)(ptrB+3*M+OFFSET+20));       \

  SPU_FMA(c1_1C,a12,b2_1C,c1_1C); *((volatile vector float *)(ptrC+OFFSET)) = c0_0B;                       \

  SPU_FMA(c2_1C,a22,b2_1C,c2_1C); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0B;                  \

  SPU_FMA(c3_1C,a32,b2_1C,c3_1C); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_0C,a03,b3_0C,c0_0C); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0B;  \

  SPU_FMA(c1_0C,a13,b3_0C,c1_0C); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0B;  \

    SPU_FMA(c2_0C,a23,b3_0C,c2_0C); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1B;                   \

  SPU_FMA(c3_0C,a33,b3_0C,c3_0C); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1C,a03,b3_1C,c0_1C); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1B; \

  SPU_FMA(c1_1C,a13,b3_1C,c1_1C); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1B;         \

  SPU_FMA(c2_1C,a23,b3_1C,c2_1C); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1B;         \

  SPU_FMA(c3_1C,a33,b3_1C,c3_1C); SPU_LNOP;                                                                                   \

}

 

#define StageBACclr(OFFSET)                                                                                                           \

{                                                                                                                                              \

  SPU_FM(c0_0A,a00,b0_0A);                                                                                                           \

  SPU_FM(c1_0A,a10,b0_0A);                                                                                                           \

  SPU_FM(c2_0A,a20,b0_0A);                                                                                                           \

  SPU_FM(c3_0A,a30,b0_0A);                                                                                                           \

  SPU_FM(c0_1A,a00,b0_1A);                                                                                                           \

  SPU_FM(c1_1A,a10,b0_1A);                                                                                                           \

  SPU_FM(c2_1A,a20,b0_1A);                                                                                                           \

  SPU_FM(c3_1A,a30,b0_1A);                                                                                                           \

  SPU_FMA(c0_0A,a01,b1_0A,c0_0A);                                                                                            \

  SPU_FMA(c1_0A,a11,b1_0A,c1_0A);                                                                                            \

  SPU_FMA(c2_0A,a21,b1_0A,c2_0A); b0_0B = *((volatile vector float *)(ptrB+OFFSET+16));                 \

  SPU_FMA(c3_0A,a31,b1_0A,c3_0A); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1A,a01,b1_1A,c0_1A); b1_0B = *((volatile vector float *)(ptrB+M+OFFSET+16));           \

  SPU_FMA(c1_1A,a11,b1_1A,c1_1A); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFSET+16));       \

  SPU_FMA(c2_1A,a21,b1_1A,c2_1A); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFSET+16));       \

  SPU_FMA(c3_1A,a31,b1_1A,c3_1A); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_0A,a02,b2_0A,c0_0A); b0_1B = *((volatile vector float *)(ptrB+OFFSET+20));                 \

  SPU_FMA(c1_0A,a12,b2_0A,c1_0A); b1_1B = *((volatile vector float *)(ptrB+M+OFFSET+20));           \

  SPU_FMA(c2_0A,a22,b2_0A,c2_0A); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFSET+20));       \

  SPU_FMA(c3_0A,a32,b2_0A,c3_0A); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFSET+20));       \

  SPU_FMA(c1_1A,a12,b2_1A,c1_1A); *((volatile vector float *)(ptrC+OFFSET)) = c0_0C;                       \

  SPU_FMA(c2_1A,a22,b2_1A,c2_1A); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0C;                 \

  SPU_FMA(c3_1A,a32,b2_1A,c3_1A); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_0A,a03,b3_0A,c0_0A); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0C; \

  SPU_FMA(c1_0A,a13,b3_0A,c1_0A); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0C; \

  SPU_FMA(c2_0A,a23,b3_0A,c2_0A); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1C;                   \

  SPU_FMA(c3_0A,a33,b3_0A,c3_0A); SPU_LNOP;                                                                                   \

  SPU_FMA(c0_1A,a03,b3_1A,c0_1A); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1C; \

  SPU_FMA(c1_1A,a13,b3_1A,c1_1A); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1C;         \

  SPU_FMA(c2_1A,a23,b3_1A,c2_1A); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1C;         \

  SPU_FMA(c3_1A,a33,b3_1A,c3_1A); SPU_LNOP;                                                                                   \

}

 

#define StageCBA(OFFSET,OFFB)                                                                                                                \

{                                                                                                                                                          \

  ALIGN8B;                                                                                                                                         \

  SPU_FMA(c0_0B,a00,b0_0B,c0_0B); c0_0C = *((volatile vector float *)(ptrC+OFFSET+16));                              \

  SPU_FMA(c1_0B,a10,b0_0B,c1_0B); c1_0C = *((volatile vector float *)(ptrC+M+OFFSET+16));                        \

  SPU_FMA(c2_0B,a20,b0_0B,c2_0B); c2_0C = *((volatile vector float *)(ptrC+2*M+OFFSET+16));                    \

  SPU_FMA(c3_0B,a30,b0_0B,c3_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_1B,a00,b0_1B,c0_1B); c3_0C = *((volatile vector float *)(ptrC+3*M+OFFSET+16));                    \

  SPU_FMA(c1_1B,a10,b0_1B,c1_1B); c0_1C = *((volatile vector float *)(ptrC+OFFSET+20));                              \

  SPU_FMA(c2_1B,a20,b0_1B,c2_1B); c1_1C = *((volatile vector float *)(ptrC+M+OFFSET+20));                        \

  SPU_FMA(c3_1B,a30,b0_1B,c3_1B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_0B,a01,b1_0B,c0_0B); c2_1C = *((volatile vector float *)(ptrC+2*M+OFFSET+20));                    \

  SPU_FMA(c1_0B,a11,b1_0B,c1_0B); c3_1C = *((volatile vector float *)(ptrC+3*M+OFFSET+20));                    \

  SPU_FMA(c2_0B,a21,b1_0B,c2_0B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16));                        \

  SPU_FMA(c3_0B,a31,b1_0B,c3_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_1B,a01,b1_1B,c0_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16));      \

  SPU_FMA(c1_1B,a11,b1_1B,c1_1B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16));  \

  SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16));  \

  SPU_FMA(c3_1B,a31,b1_1B,c3_1B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20));                        \

  SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20));      \

  SPU_FMA(c2_0B,a22,b2_0B,c2_0B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20));  \

  SPU_FMA(c3_0B,a32,b2_0B,c3_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+20));  \

  SPU_FMA(c1_1B,a12,b2_1B,c1_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A;                                    \

  SPU_FMA(c2_1B,a22,b2_1B,c2_1B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A;                              \

  SPU_FMA(c3_1B,a32,b2_1B,c3_1B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A;              \

  SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A;              \

  SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A;                                \

  SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A;              \

  SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A;                      \

  SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A;                      \

  SPU_FMA(c3_1B,a33,b3_1B,c3_1B); SPU_LNOP;                                                                                               \

}

 

#define StageCBAmod(OFFSET,OFFB)                                                                                                         \

{                                                                                                                                                          \

  ALIGN8B;                                                                                                                                         \

  SPU_FMA(c0_0B,a00,b0_0B,c0_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c1_0B,a10,b0_0B,c1_0B); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFB*M+8));                     \

  SPU_FMA(c2_0B,a20,b0_0B,c2_0B); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFB*M+12));                   \

  SPU_FMA(c3_0B,a30,b0_0B,c3_0B); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFB*M+8));                     \

  SPU_FMA(c0_1B,a00,b0_1B,c0_1B); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFB*M+12));                   \

  SPU_FMA(c1_1B,a10,b0_1B,c1_1B); c0_0C = *((volatile vector float *)(ptrC+OFFSET+16));                              \

  SPU_FMA(c2_1B,a20,b0_1B,c2_1B); c1_0C = *((volatile vector float *)(ptrC+M+OFFSET+16));                        \

  SPU_FMA(c3_1B,a30,b0_1B,c3_1B); c2_0C = *((volatile vector float *)(ptrC+2*M+OFFSET+16));                    \

  SPU_FMA(c0_0B,a01,b1_0B,c0_0B); c3_0C = *((volatile vector float *)(ptrC+3*M+OFFSET+16));                    \

  SPU_FMA(c1_0B,a11,b1_0B,c1_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c2_0B,a21,b1_0B,c2_0B); c0_1C = *((volatile vector float *)(ptrC+OFFSET+20));                              \

  SPU_FMA(c3_0B,a31,b1_0B,c3_0B); c1_1C = *((volatile vector float *)(ptrC+M+OFFSET+20));                        \

  SPU_FMA(c0_1B,a01,b1_1B,c0_1B); c2_1C = *((volatile vector float *)(ptrC+2*M+OFFSET+20));                    \

  SPU_FMA(c1_1B,a11,b1_1B,c1_1B); c3_1C = *((volatile vector float *)(ptrC+3*M+OFFSET+20));                    \

  SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16));                        \

  SPU_FMA(c3_1B,a31,b1_1B,c3_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16));      \

  SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16));  \

  SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16));  \

  SPU_FMA(c2_0B,a22,b2_0B,c2_0B); SPU_LNOP;                                                                                               \

  SPU_FMA(c3_0B,a32,b2_0B,c3_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20));                        \

  SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20));      \

  SPU_FMA(c1_1B,a12,b2_1B,c1_1B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20));