Cell SDK Code Sample: Parallel Matrix Multiplication Workload
%% --------------------------------------------------------------
%% (C) Copyright 2001,2005,
%% International Business Machines Corporation,
%% Sony Computer Entertainment Incorporated,
%% Toshiba Corporation.
%%
%% All Rights Reserved.
%% --------------------------------------------------------------
%% PROLOG END TAG zYx
Summary: Parallel Matrix Multiplication workload for CBE
Target: CBE/Linux
Description:
This workload calculates C = A * B which A, B and C are N x N squared
matrices comprised of single float values. It uses block partitioning
algorithm for reducing required bandwidth. The block size(M) is fixed
at 64.
This workload is highly optimized and demonstrates near theoretical
peak system performance as long as huge pages are used to store
the matrices. The program needs one 16MB pages.
The SPE arbitrate for work using an atomic counter where the counter
specifies the output block for the SPE to compute.
The workload supports many command line options. The command line
options include:
* The ability to specify the size of the matrix. 64x64, 128x128,
256x256, 512x512, or 1024x1024.
* The ability to specify the number of times the matrices are
multiplied.
* The ability to specify the number of SPE's used to perform the
calculations.
* The ability to enabled validation of the result.
How to run:
To obtain usage information:
matrix_mul -h
To run 3 matrix multiplies of 128x128 on 5 SPE and verify the
results:
matrix_mul -i 3 -m 128 -s 5 -v
Target: PPE [This code section runs on the PPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <fenv.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <libspe.h>
#include <libmisc.h>
#include <math.h>
#include <string.h>
#include <errno.h>
#include <sched.h>
#define M 64 /* basic block size */
#define MAX_BLOCKS 256
#define MAX_SPUS 8
#define ERR_THRESHOLD 0.01
extern spe_program_handle_t block;
extern int daxpy(double *, double *, double *, int, int, double, double, double);
float *tmp = NULL;
char *mem_file = "/huge/daxpy_mem.bin";
char *mem_addr = NULL;
void print_usage(char * name)
{
printf("USAGE: %s [options]\n", name);
printf(" Valid Options include:\n");
printf(" -i # : perform specified number of iterations. Default is 1\n");
printf(" -h : output this usage help screen\n");
printf(" -m # : perform maxtrix multiply of matrices of size #. Valid sizes\n");
printf(" are 64, 128, 256, 512, and 1024. Default is 64.\n");
printf(" -s # : run with # number of SPUs. The dafault is 1 SPU.\n");
printf(" -v : verify the final matrix multiply result.\n");
printf(" -V : verbose output.\n");
exit(1);
}
void send_mail(speid_t id, unsigned int data)
{
while (spe_write_in_mbox(id, data));
}
void block_swizzle(float *mat, int size)
{
int i, j, k, l;
float *ptr, *src, *dst;
ptr = tmp;
for(i=0; i<(size/M); i++) {
for(j=0; j<(size/M); j++){
for(k=0; k<M; k++) {
for(l=0; l<M; l++) {
*ptr++ = mat[i*(M*size)+j*M+k*size+l];
}
}
}
}
src = tmp;
dst = mat;
for(i=0; i<size*size; i++) *dst++ = *src++;
}
int main(int argc, char *argv[])
{
int i, j, k;
int verify = 0;
int verbose = 0;
int fail = 0;
int msize = M, blocks;
int huge_size;
int spus = 1;
int iterations = 1;
spe_gid_t spe_gid;
speid_t spe_id[MAX_SPUS];
float *a, *b, *c, *exp;
double *x, *y, *z;
int *cntr;
unsigned int offset;
int fmem;
int status;
for (i=1; i<argc; i++) {
if (*argv[i] == '-') {
switch (*(argv[i]+1)) {
case 'i':
i++;
if (i < argc) {
iterations = atoi(argv[i]);
if (iterations < 0) iterations = 0;
} else {
printf("ERROR: Number of iterations is not specified.\n");
print_usage(argv[0]);
}
break;
case 'm': /* Run matrix of specified size */
i++;
if (i < argc) {
msize = atoi(argv[i]);
if ((msize < 64) || (msize > 1024) || (msize & (msize - 1))) {
printf("ERROR: Invalid matrix size of %d specified\n", msize);
print_usage(argv[0]);
}
} else {
printf("ERROR: Matrix size is not specifed\n");
print_usage(argv[0]);
}
break;
case 's': /* Run on specified number of SPUs */
i++;
if (i < argc) {
spus = atoi(argv[i]);
if (spus > MAX_SPUS) spus = MAX_SPUS;
} else {
printf("ERROR: Number of SPUs to use is not specified.\n");
print_usage(argv[0]);
}
break;
case 'v': /* Let the PPU verify the final product result. */
verify = 1;
break;
case 'V': /* Verbose output */
verbose = 1;
break;
case 'h':
default:
print_usage(argv[0]);
break;
}
} else {
print_usage(argv[0]);
}
}
blocks = (msize/M) * (msize/M);
spe_gid = spe_create_group(SCHED_OTHER, 0, 1);
/* Allocate memory buffers.
*/
#define BASE 0x8000
#define ARRAY_SIZE 0x2A80
#define LOOP_COUNT ((ARRAY_SIZE / 64) - 1)
#define HUGE_PAGE_SIZE 16*1024*1024
offset = 0;
a = (float *)offset;
offset += sizeof(float) * msize * msize;
b = (float *)(offset);
offset += sizeof(float) * msize * msize;
c = (float *)(offset);
offset += sizeof(float) * msize * msize;
cntr = (int *)(offset);
offset += 128;
/* Create a large contiguous memory buffer by allocating a large
* page (or more). Large page memory will also reduce the TLB thrashing.
*/
if ((fmem = open (mem_file, O_CREAT | O_RDWR, 0755)) == -1) {
printf("WARNING: unable to open file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
mem_addr = malloc_align(offset, 7);
if (mem_addr == NULL) {
printf("ERROR: unable to malloc %d byte buffer.\n", offset);
exit(1);
}
} else {
huge_size = (offset + HUGE_PAGE_SIZE-1) & ~(HUGE_PAGE_SIZE-1);
assert(HUGE_PAGE_SIZE > 32*1024);
mem_addr = (char *) mmap (0, huge_size, PROT_READ | PROT_WRITE, MAP_SHARED, fmem, 0);
if (mem_addr == MAP_FAILED) {
printf("ERROR: unable to mmap file %s (errno=%d).\n", mem_file, errno);
close (fmem);
exit (1);
}
}
/* Correct the buffer pointers
*/
x = (double *)(mem_addr + (unsigned int)x);
y = (double *)(mem_addr + (unsigned int)y);
z = (double *)(mem_addr + (unsigned int)z);
a = (float *)(mem_addr + (unsigned int)a);
b = (float *)(mem_addr + (unsigned int)b);
c = (float *)(mem_addr + (unsigned int)c);
cntr = (int *)(mem_addr + (unsigned int)cntr);
/* Construct matrices and expected results
*/
exp = (float *)malloc(sizeof(float) * msize * msize);
tmp = (float *)malloc(sizeof(float) * msize * msize);
if ((exp == NULL) || (tmp == NULL)) {
printf("ERROR: failed to allocated verification buffers exp and tmp\n");
exit(1);
}
/* Initialize the matrix data and counter.
*/
printf("Initializing Arrays ... "); fflush(stdout);
for(i=0; i<msize; i++) {
for(j=0; j<msize; j++) {
a[i*msize+j] = rand_0_to_1();
b[i*msize+j] = rand_0_to_1();
c[i*msize+j] = 0.0f;
}
}
printf("done\n"); fflush(stdout);
*cntr = 0;
if (verify) {
/* Compute expected result.
*/
printf("Computing expected results ... "); fflush(stdout);
for (i=0; i<msize; i++) {
for (j=0; j<msize; j++) {
exp[i*msize+j] = 0.0f;
for (k=0; k<msize; k++) {
exp[i*msize+j] += a[i*msize+k] * b[k*msize+j];
}
}
}
/* Swizzle inputs matrices and expected results to correspond to tiling.
*/
block_swizzle(a, msize);
block_swizzle(b, msize);
block_swizzle(exp, msize);
printf("done\n"); fflush(stdout);
}
printf("Running test ... "); fflush(stdout);
/* Start all the SPUs computing the matrix product.
*/
/* Create each of the SPU threads
*/
for (i=0; i<spus; i++) {
spe_id[i] = spe_create_thread(spe_gid, &block, 0, 0, -1, 0);
if (spe_id[i] == NULL) {
printf("INTERNAL ERROR: failed to create spu thread %d. Error = %s\n", i, strerror(errno));
exit(1);
}
}
*cntr = 0;
/* Send each of the SPUs the input parameters
*/
for (i=0; i<spus; i++) {
send_mail(spe_id[i], (unsigned int)msize);
send_mail(spe_id[i], (unsigned int)iterations);
send_mail(spe_id[i], (unsigned int)cntr);
send_mail(spe_id[i], (unsigned int)a);
send_mail(spe_id[i], (unsigned int)b);
send_mail(spe_id[i], (unsigned int)c);
}
/* Wait for the SPUs to complete
*/
for (i=0; i<spus; i++) {
(void)spe_wait(spe_id[i], &status, 0);
}
printf("done\n");
/* Output SPU, matrix multiply results
*/
if (verify) {
float delta;
printf("Verifying the results ... ");
/* Verify the resulting matrix output.
*/
for (i=0; i<msize; i++) {
for (j=0; j<msize; j++) {
delta = exp[i*msize+j] - c[i*msize+j];
if (delta < 0.0f) delta = -delta;
if (delta > ERR_THRESHOLD) {
if (verbose) printf(" %d %d exp=%f got=%f\n", i, j, exp[i*msize+j], c[i*msize+j]);
fail++;
}
}
}
if (fail) {
printf("FAILED (%d)\n", fail);
fail = 1;
} else {
printf("PASSED\n");
}
}
return (fail);
}
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
/*
* Matrix Multiply --- EUC
*
* block.c - Matrix Multiply with block partitioning
*/
#include <spu_intrinsics.h>
#include <stdlib.h>
#include <vec_literal.h>
#include <cbe_mfc.h>
#include <spu_mfcio.h>
#include "atomic_inc_return.h"
#ifndef M
#define M 64 /* Size of the matrix block - M x M */
#endif
#define MAX_N 1024
#define MAX_TILES (MAX_N / M)
static unsigned int N;
static unsigned int ITER;
static unsigned int A_AREA, B_AREA, C_AREA;
static atomic_ea_t FINC_AREA2;
#define DIN_TAG 0
#define DOUT_TAG 2
#define DMA_Wait(TAG_ID) \
{ \
mfc_write_tag_mask((1<<(TAG_ID))); \
mfc_read_tag_status_all(); \
}
#define SwapInBuf() \
{ \
OpA = InTag ? blkA1 : blkA0; \
OpB = InTag ? blkB1 : blkB0; \
InTag ^= 1; \
InA = InTag ? blkA1 : blkA0; \
InB = InTag ? blkB1 : blkB0; \
}
#define SwapOutBuf() \
{ \
OpC = (OutTag==DOUT_TAG) ? blkC1 : blkC0; \
OutC = (OutTag==DOUT_TAG) ? blkC1 : blkC0; \
OutTag ^= 1; \
}
#ifdef _XLC
#define ALIGN8B
#define SPU_FMA(RT,RA,RB,RC) RT = spu_madd(RA, RB, RC)
#define SPU_FM(RT,RA,RB) RT = spu_mul(RA, RB)
#define SPU_LNOP
#else
#define ALIGN8B asm volatile(".align 3")
#define SPU_FMA(RT,RA,RB,RC) \
asm volatile(".align 3"); \
asm volatile("fma %0,%1,%2,%3":"=r"(RT):"r"(RA),"r"(RB),"r"(RC))
#define SPU_FM(RT,RA,RB) \
asm volatile(".align 3"); \
asm volatile("fm %0,%1,%2":"=r"(RT):"r"(RA),"r"(RB))
#define SPU_LNOP asm volatile("lnop")
#endif
#define StageCBAclr(OFFSET) \
{ \
ALIGN8B; \
SPU_FM(c0_0B,a00,b0_0B); \
SPU_FM(c1_0B,a10,b0_0B); \
SPU_FM(c2_0B,a20,b0_0B); \
SPU_FM(c3_0B,a30,b0_0B); \
SPU_FM(c0_1B,a00,b0_1B); \
SPU_FM(c1_1B,a10,b0_1B); \
SPU_FM(c2_1B,a20,b0_1B); \
SPU_FM(c3_1B,a30,b0_1B); \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+16)); \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+16)); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+16)); \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+16)); \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+20)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+20)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+20)); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+20)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A; \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A; \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A; \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A; \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A; \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A; \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A; \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A; \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); SPU_LNOP; \
}
#define StageACBclr(OFFSET) \
{ \
SPU_FM(c0_0C,a00,b0_0C); \
SPU_FM(c1_0C,a10,b0_0C); \
SPU_FM(c2_0C,a20,b0_0C); \
SPU_FM(c3_0C,a30,b0_0C); \
SPU_FM(c0_1C,a00,b0_1C); \
SPU_FM(c1_1C,a10,b0_1C); \
SPU_FM(c2_1C,a20,b0_1C); \
SPU_FM(c3_1C,a30,b0_1C); \
SPU_FMA(c0_0C,a01,b1_0C,c0_0C); \
SPU_FMA(c1_0C,a11,b1_0C,c1_0C); \
SPU_FMA(c2_0C,a21,b1_0C,c2_0C); b0_0A = *((volatile vector float *)(ptrB+OFFSET+16)); \
SPU_FMA(c3_0C,a31,b1_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a01,b1_1C,c0_1C); b1_0A = *((volatile vector float *)(ptrB+M+OFFSET+16)); \
SPU_FMA(c1_1C,a11,b1_1C,c1_1C); b2_0A = *((volatile vector float *)(ptrB+2*M+OFFSET+16)); \
SPU_FMA(c2_1C,a21,b1_1C,c2_1C); b3_0A = *((volatile vector float *)(ptrB+3*M+OFFSET+16)); \
SPU_FMA(c3_1C,a31,b1_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a02,b2_0C,c0_0C); b0_1A = *((volatile vector float *)(ptrB+OFFSET+20)); \
SPU_FMA(c1_0C,a12,b2_0C,c1_0C); b1_1A = *((volatile vector float *)(ptrB+M+OFFSET+20)); \
SPU_FMA(c2_0C,a22,b2_0C,c2_0C); b2_1A = *((volatile vector float *)(ptrB+2*M+OFFSET+20)); \
SPU_FMA(c3_0C,a32,b2_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a02,b2_1C,c0_1C); b3_1A = *((volatile vector float *)(ptrB+3*M+OFFSET+20)); \
SPU_FMA(c1_1C,a12,b2_1C,c1_1C); *((volatile vector float *)(ptrC+OFFSET)) = c0_0B; \
SPU_FMA(c2_1C,a22,b2_1C,c2_1C); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0B; \
SPU_FMA(c3_1C,a32,b2_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a03,b3_0C,c0_0C); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0B; \
SPU_FMA(c1_0C,a13,b3_0C,c1_0C); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0B; \
SPU_FMA(c2_0C,a23,b3_0C,c2_0C); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1B; \
SPU_FMA(c3_0C,a33,b3_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a03,b3_1C,c0_1C); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1B; \
SPU_FMA(c1_1C,a13,b3_1C,c1_1C); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1B; \
SPU_FMA(c2_1C,a23,b3_1C,c2_1C); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1B; \
SPU_FMA(c3_1C,a33,b3_1C,c3_1C); SPU_LNOP; \
}
#define StageBACclr(OFFSET) \
{ \
SPU_FM(c0_0A,a00,b0_0A); \
SPU_FM(c1_0A,a10,b0_0A); \
SPU_FM(c2_0A,a20,b0_0A); \
SPU_FM(c3_0A,a30,b0_0A); \
SPU_FM(c0_1A,a00,b0_1A); \
SPU_FM(c1_1A,a10,b0_1A); \
SPU_FM(c2_1A,a20,b0_1A); \
SPU_FM(c3_1A,a30,b0_1A); \
SPU_FMA(c0_0A,a01,b1_0A,c0_0A); \
SPU_FMA(c1_0A,a11,b1_0A,c1_0A); \
SPU_FMA(c2_0A,a21,b1_0A,c2_0A); b0_0B = *((volatile vector float *)(ptrB+OFFSET+16)); \
SPU_FMA(c3_0A,a31,b1_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a01,b1_1A,c0_1A); b1_0B = *((volatile vector float *)(ptrB+M+OFFSET+16)); \
SPU_FMA(c1_1A,a11,b1_1A,c1_1A); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFSET+16)); \
SPU_FMA(c2_1A,a21,b1_1A,c2_1A); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFSET+16)); \
SPU_FMA(c3_1A,a31,b1_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a02,b2_0A,c0_0A); b0_1B = *((volatile vector float *)(ptrB+OFFSET+20)); \
SPU_FMA(c1_0A,a12,b2_0A,c1_0A); b1_1B = *((volatile vector float *)(ptrB+M+OFFSET+20)); \
SPU_FMA(c2_0A,a22,b2_0A,c2_0A); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFSET+20)); \
SPU_FMA(c3_0A,a32,b2_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFSET+20)); \
SPU_FMA(c1_1A,a12,b2_1A,c1_1A); *((volatile vector float *)(ptrC+OFFSET)) = c0_0C; \
SPU_FMA(c2_1A,a22,b2_1A,c2_1A); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0C; \
SPU_FMA(c3_1A,a32,b2_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a03,b3_0A,c0_0A); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0C; \
SPU_FMA(c1_0A,a13,b3_0A,c1_0A); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0C; \
SPU_FMA(c2_0A,a23,b3_0A,c2_0A); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1C; \
SPU_FMA(c3_0A,a33,b3_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a03,b3_1A,c0_1A); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1C; \
SPU_FMA(c1_1A,a13,b3_1A,c1_1A); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1C; \
SPU_FMA(c2_1A,a23,b3_1A,c2_1A); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1C; \
SPU_FMA(c3_1A,a33,b3_1A,c3_1A); SPU_LNOP; \
}
#define StageCBA(OFFSET,OFFB) \
{ \
ALIGN8B; \
SPU_FMA(c0_0B,a00,b0_0B,c0_0B); c0_0C = *((volatile vector float *)(ptrC+OFFSET+16)); \
SPU_FMA(c1_0B,a10,b0_0B,c1_0B); c1_0C = *((volatile vector float *)(ptrC+M+OFFSET+16)); \
SPU_FMA(c2_0B,a20,b0_0B,c2_0B); c2_0C = *((volatile vector float *)(ptrC+2*M+OFFSET+16)); \
SPU_FMA(c3_0B,a30,b0_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a00,b0_1B,c0_1B); c3_0C = *((volatile vector float *)(ptrC+3*M+OFFSET+16)); \
SPU_FMA(c1_1B,a10,b0_1B,c1_1B); c0_1C = *((volatile vector float *)(ptrC+OFFSET+20)); \
SPU_FMA(c2_1B,a20,b0_1B,c2_1B); c1_1C = *((volatile vector float *)(ptrC+M+OFFSET+20)); \
SPU_FMA(c3_1B,a30,b0_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); c2_1C = *((volatile vector float *)(ptrC+2*M+OFFSET+20)); \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); c3_1C = *((volatile vector float *)(ptrC+3*M+OFFSET+20)); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16)); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A; \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A; \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A; \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A; \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A; \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A; \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A; \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A; \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); SPU_LNOP; \
}
#define StageCBAmod(OFFSET,OFFB) \
{ \
ALIGN8B; \
SPU_FMA(c0_0B,a00,b0_0B,c0_0B); SPU_LNOP; \
SPU_FMA(c1_0B,a10,b0_0B,c1_0B); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFB*M+8)); \
SPU_FMA(c2_0B,a20,b0_0B,c2_0B); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFB*M+12)); \
SPU_FMA(c3_0B,a30,b0_0B,c3_0B); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFB*M+8)); \
SPU_FMA(c0_1B,a00,b0_1B,c0_1B); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFB*M+12)); \
SPU_FMA(c1_1B,a10,b0_1B,c1_1B); c0_0C = *((volatile vector float *)(ptrC+OFFSET+16)); \
SPU_FMA(c2_1B,a20,b0_1B,c2_1B); c1_0C = *((volatile vector float *)(ptrC+M+OFFSET+16)); \
SPU_FMA(c3_1B,a30,b0_1B,c3_1B); c2_0C = *((volatile vector float *)(ptrC+2*M+OFFSET+16)); \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); c3_0C = *((volatile vector float *)(ptrC+3*M+OFFSET+16)); \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); SPU_LNOP; \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); c0_1C = *((volatile vector float *)(ptrC+OFFSET+20)); \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); c1_1C = *((volatile vector float *)(ptrC+M+OFFSET+20)); \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); c2_1C = *((volatile vector float *)(ptrC+2*M+OFFSET+20)); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); c3_1C = *((volatile vector float *)(ptrC+3*M+OFFSET+20)); \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16)); \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); SPU_LNOP; \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20)); \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20));