Cell SDK Code Sample: Parallel Matrix Multiplication Workload
%% --------------------------------------------------------------
%% (C) Copyright 2001,2005,
%% International Business Machines Corporation,
%% Sony Computer Entertainment Incorporated,
%% Toshiba Corporation.
%%
%% All Rights Reserved.
%% --------------------------------------------------------------
%% PROLOG END TAG zYx
Summary: Parallel Matrix Multiplication workload for CBE
Target: CBE/Linux
Description:
This workload calculates C = A * B which A, B and C are N x N squared
matrices comprised of single float values. It uses block partitioning
algorithm for reducing required bandwidth. The block size(M) is fixed
at 64.
This workload is highly optimized and demonstrates near theoretical
peak system performance as long as huge pages are used to store
the matrices. The program needs one 16MB pages.
The SPE arbitrate for work using an atomic counter where the counter
specifies the output block for the SPE to compute.
The workload supports many command line options. The command line
options include:
* The ability to specify the size of the matrix. 64x64, 128x128,
256x256, 512x512, or 1024x1024.
* The ability to specify the number of times the matrices are
multiplied.
* The ability to specify the number of SPE's used to perform the
calculations.
* The ability to enabled validation of the result.
How to run:
To obtain usage information:
matrix_mul -h
To run 3 matrix multiplies of 128x128 on 5 SPE and verify the
results:
matrix_mul -i 3 -m 128 -s 5 -v
Target: PPE [This code section runs on the PPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <fenv.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <libspe.h>
#include <libmisc.h>
#include <math.h>
#include <string.h>
#include <errno.h>
#include <sched.h>
#define M 64 /* basic block size */
#define MAX_BLOCKS 256
#define MAX_SPUS 8
#define ERR_THRESHOLD 0.01
extern spe_program_handle_t block;
extern int daxpy(double *, double *, double *, int, int, double, double, double);
float *tmp = NULL;
char *mem_file = "/huge/daxpy_mem.bin";
char *mem_addr = NULL;
void print_usage(char * name)
{
printf("USAGE: %s [options]\n", name);
printf(" Valid Options include:\n");
printf(" -i # : perform specified number of iterations. Default is 1\n");
printf(" -h : output this usage help screen\n");
printf(" -m # : perform maxtrix multiply of matrices of size #. Valid sizes\n");
printf(" are 64, 128, 256, 512, and 1024. Default is 64.\n");
printf(" -s # : run with # number of SPUs. The dafault is 1 SPU.\n");
printf(" -v : verify the final matrix multiply result.\n");
printf(" -V : verbose output.\n");
exit(1);
}
void send_mail(speid_t id, unsigned int data)
{
while (spe_write_in_mbox(id, data));
}
void block_swizzle(float *mat, int size)
{
int i, j, k, l;
float *ptr, *src, *dst;
ptr = tmp;
for(i=0; i<(size/M); i++) {
for(j=0; j<(size/M); j++){
for(k=0; k<M; k++) {
for(l=0; l<M; l++) {
*ptr++ = mat[i*(M*size)+j*M+k*size+l];
}
}
}
}
src = tmp;
dst = mat;
for(i=0; i<size*size; i++) *dst++ = *src++;
}
int main(int argc, char *argv[])
{
int i, j, k;
int verify = 0;
int verbose = 0;
int fail = 0;
int msize = M, blocks;
int huge_size;
int spus = 1;
int iterations = 1;
spe_gid_t spe_gid;
speid_t spe_id[MAX_SPUS];
float *a, *b, *c, *exp;
double *x, *y, *z;
int *cntr;
unsigned int offset;
int fmem;
int status;
for (i=1; i<argc; i++) {
if (*argv[i] == '-') {
switch (*(argv[i]+1)) {
case 'i':
i++;
if (i < argc) {
iterations = atoi(argv[i]);
if (iterations < 0) iterations = 0;
} else {
printf("ERROR: Number of iterations is not specified.\n");
print_usage(argv[0]);
}
break;
case 'm': /* Run matrix of specified size */
i++;
if (i < argc) {
msize = atoi(argv[i]);
if ((msize < 64) || (msize > 1024) || (msize & (msize - 1))) {
printf("ERROR: Invalid matrix size of %d specified\n", msize);
print_usage(argv[0]);
}
} else {
printf("ERROR: Matrix size is not specifed\n");
print_usage(argv[0]);
}
break;
case 's': /* Run on specified number of SPUs */
i++;
if (i < argc) {
spus = atoi(argv[i]);
if (spus > MAX_SPUS) spus = MAX_SPUS;
} else {
printf("ERROR: Number of SPUs to use is not specified.\n");
print_usage(argv[0]);
}
break;
case 'v': /* Let the PPU verify the final product result. */
verify = 1;
break;
case 'V': /* Verbose output */
verbose = 1;
break;
case 'h':
default:
print_usage(argv[0]);
break;
}
} else {
print_usage(argv[0]);
}
}
blocks = (msize/M) * (msize/M);
spe_gid = spe_create_group(SCHED_OTHER, 0, 1);
/* Allocate memory buffers.
*/
#define BASE 0x8000
#define ARRAY_SIZE 0x2A80
#define LOOP_COUNT ((ARRAY_SIZE / 64) - 1)
#define HUGE_PAGE_SIZE 16*1024*1024
offset = 0;
a = (float *)offset;
offset += sizeof(float) * msize * msize;
b = (float *)(offset);
offset += sizeof(float) * msize * msize;
c = (float *)(offset);
offset += sizeof(float) * msize * msize;
cntr = (int *)(offset);
offset += 128;
/* Create a large contiguous memory buffer by allocating a large
* page (or more). Large page memory will also reduce the TLB thrashing.
*/
if ((fmem = open (mem_file, O_CREAT | O_RDWR, 0755)) == -1) {
printf("WARNING: unable to open file %s (errno=%d). Using malloc heap.\n", mem_file, errno);
mem_addr = malloc_align(offset, 7);
if (mem_addr == NULL) {
printf("ERROR: unable to malloc %d byte buffer.\n", offset);
exit(1);
}
} else {
huge_size = (offset + HUGE_PAGE_SIZE-1) & ~(HUGE_PAGE_SIZE-1);
assert(HUGE_PAGE_SIZE > 32*1024);
mem_addr = (char *) mmap (0, huge_size, PROT_READ | PROT_WRITE, MAP_SHARED, fmem, 0);
if (mem_addr == MAP_FAILED) {
printf("ERROR: unable to mmap file %s (errno=%d).\n", mem_file, errno);
close (fmem);
exit (1);
}
}
/* Correct the buffer pointers
*/
x = (double *)(mem_addr + (unsigned int)x);
y = (double *)(mem_addr + (unsigned int)y);
z = (double *)(mem_addr + (unsigned int)z);
a = (float *)(mem_addr + (unsigned int)a);
b = (float *)(mem_addr + (unsigned int)b);
c = (float *)(mem_addr + (unsigned int)c);
cntr = (int *)(mem_addr + (unsigned int)cntr);
/* Construct matrices and expected results
*/
exp = (float *)malloc(sizeof(float) * msize * msize);
tmp = (float *)malloc(sizeof(float) * msize * msize);
if ((exp == NULL) || (tmp == NULL)) {
printf("ERROR: failed to allocated verification buffers exp and tmp\n");
exit(1);
}
/* Initialize the matrix data and counter.
*/
printf("Initializing Arrays ... "); fflush(stdout);
for(i=0; i<msize; i++) {
for(j=0; j<msize; j++) {
a[i*msize+j] = rand_0_to_1();
b[i*msize+j] = rand_0_to_1();
c[i*msize+j] = 0.0f;
}
}
printf("done\n"); fflush(stdout);
*cntr = 0;
if (verify) {
/* Compute expected result.
*/
printf("Computing expected results ... "); fflush(stdout);
for (i=0; i<msize; i++) {
for (j=0; j<msize; j++) {
exp[i*msize+j] = 0.0f;
for (k=0; k<msize; k++) {
exp[i*msize+j] += a[i*msize+k] * b[k*msize+j];
}
}
}
/* Swizzle inputs matrices and expected results to correspond to tiling.
*/
block_swizzle(a, msize);
block_swizzle(b, msize);
block_swizzle(exp, msize);
printf("done\n"); fflush(stdout);
}
printf("Running test ... "); fflush(stdout);
/* Start all the SPUs computing the matrix product.
*/
/* Create each of the SPU threads
*/
for (i=0; i<spus; i++) {
spe_id[i] = spe_create_thread(spe_gid, &block, 0, 0, -1, 0);
if (spe_id[i] == NULL) {
printf("INTERNAL ERROR: failed to create spu thread %d. Error = %s\n", i, strerror(errno));
exit(1);
}
}
*cntr = 0;
/* Send each of the SPUs the input parameters
*/
for (i=0; i<spus; i++) {
send_mail(spe_id[i], (unsigned int)msize);
send_mail(spe_id[i], (unsigned int)iterations);
send_mail(spe_id[i], (unsigned int)cntr);
send_mail(spe_id[i], (unsigned int)a);
send_mail(spe_id[i], (unsigned int)b);
send_mail(spe_id[i], (unsigned int)c);
}
/* Wait for the SPUs to complete
*/
for (i=0; i<spus; i++) {
(void)spe_wait(spe_id[i], &status, 0);
}
printf("done\n");
/* Output SPU, matrix multiply results
*/
if (verify) {
float delta;
printf("Verifying the results ... ");
/* Verify the resulting matrix output.
*/
for (i=0; i<msize; i++) {
for (j=0; j<msize; j++) {
delta = exp[i*msize+j] - c[i*msize+j];
if (delta < 0.0f) delta = -delta;
if (delta > ERR_THRESHOLD) {
if (verbose) printf(" %d %d exp=%f got=%f\n", i, j, exp[i*msize+j], c[i*msize+j]);
fail++;
}
}
}
if (fail) {
printf("FAILED (%d)\n", fail);
fail = 1;
} else {
printf("PASSED\n");
}
}
return (fail);
}
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
/*
* Matrix Multiply --- EUC
*
* block.c - Matrix Multiply with block partitioning
*/
#include <spu_intrinsics.h>
#include <stdlib.h>
#include <vec_literal.h>
#include <cbe_mfc.h>
#include <spu_mfcio.h>
#include "atomic_inc_return.h"
#ifndef M
#define M 64 /* Size of the matrix block - M x M */
#endif
#define MAX_N 1024
#define MAX_TILES (MAX_N / M)
static unsigned int N;
static unsigned int ITER;
static unsigned int A_AREA, B_AREA, C_AREA;
static atomic_ea_t FINC_AREA2;
#define DIN_TAG 0
#define DOUT_TAG 2
#define DMA_Wait(TAG_ID) \
{ \
mfc_write_tag_mask((1<<(TAG_ID))); \
mfc_read_tag_status_all(); \
}
#define SwapInBuf() \
{ \
OpA = InTag ? blkA1 : blkA0; \
OpB = InTag ? blkB1 : blkB0; \
InTag ^= 1; \
InA = InTag ? blkA1 : blkA0; \
InB = InTag ? blkB1 : blkB0; \
}
#define SwapOutBuf() \
{ \
OpC = (OutTag==DOUT_TAG) ? blkC1 : blkC0; \
OutC = (OutTag==DOUT_TAG) ? blkC1 : blkC0; \
OutTag ^= 1; \
}
#ifdef _XLC
#define ALIGN8B
#define SPU_FMA(RT,RA,RB,RC) RT = spu_madd(RA, RB, RC)
#define SPU_FM(RT,RA,RB) RT = spu_mul(RA, RB)
#define SPU_LNOP
#else
#define ALIGN8B asm volatile(".align 3")
#define SPU_FMA(RT,RA,RB,RC) \
asm volatile(".align 3"); \
asm volatile("fma %0,%1,%2,%3":"=r"(RT):"r"(RA),"r"(RB),"r"(RC))
#define SPU_FM(RT,RA,RB) \
asm volatile(".align 3"); \
asm volatile("fm %0,%1,%2":"=r"(RT):"r"(RA),"r"(RB))
#define SPU_LNOP asm volatile("lnop")
#endif
#define StageCBAclr(OFFSET) \
{ \
ALIGN8B; \
SPU_FM(c0_0B,a00,b0_0B); \
SPU_FM(c1_0B,a10,b0_0B); \
SPU_FM(c2_0B,a20,b0_0B); \
SPU_FM(c3_0B,a30,b0_0B); \
SPU_FM(c0_1B,a00,b0_1B); \
SPU_FM(c1_1B,a10,b0_1B); \
SPU_FM(c2_1B,a20,b0_1B); \
SPU_FM(c3_1B,a30,b0_1B); \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+16)); \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+16)); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+16)); \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+16)); \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+20)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+20)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+20)); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+20)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A; \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A; \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A; \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A; \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A; \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A; \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A; \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A; \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); SPU_LNOP; \
}
#define StageACBclr(OFFSET) \
{ \
SPU_FM(c0_0C,a00,b0_0C); \
SPU_FM(c1_0C,a10,b0_0C); \
SPU_FM(c2_0C,a20,b0_0C); \
SPU_FM(c3_0C,a30,b0_0C); \
SPU_FM(c0_1C,a00,b0_1C); \
SPU_FM(c1_1C,a10,b0_1C); \
SPU_FM(c2_1C,a20,b0_1C); \
SPU_FM(c3_1C,a30,b0_1C); \
SPU_FMA(c0_0C,a01,b1_0C,c0_0C); \
SPU_FMA(c1_0C,a11,b1_0C,c1_0C); \
SPU_FMA(c2_0C,a21,b1_0C,c2_0C); b0_0A = *((volatile vector float *)(ptrB+OFFSET+16)); \
SPU_FMA(c3_0C,a31,b1_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a01,b1_1C,c0_1C); b1_0A = *((volatile vector float *)(ptrB+M+OFFSET+16)); \
SPU_FMA(c1_1C,a11,b1_1C,c1_1C); b2_0A = *((volatile vector float *)(ptrB+2*M+OFFSET+16)); \
SPU_FMA(c2_1C,a21,b1_1C,c2_1C); b3_0A = *((volatile vector float *)(ptrB+3*M+OFFSET+16)); \
SPU_FMA(c3_1C,a31,b1_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a02,b2_0C,c0_0C); b0_1A = *((volatile vector float *)(ptrB+OFFSET+20)); \
SPU_FMA(c1_0C,a12,b2_0C,c1_0C); b1_1A = *((volatile vector float *)(ptrB+M+OFFSET+20)); \
SPU_FMA(c2_0C,a22,b2_0C,c2_0C); b2_1A = *((volatile vector float *)(ptrB+2*M+OFFSET+20)); \
SPU_FMA(c3_0C,a32,b2_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a02,b2_1C,c0_1C); b3_1A = *((volatile vector float *)(ptrB+3*M+OFFSET+20)); \
SPU_FMA(c1_1C,a12,b2_1C,c1_1C); *((volatile vector float *)(ptrC+OFFSET)) = c0_0B; \
SPU_FMA(c2_1C,a22,b2_1C,c2_1C); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0B; \
SPU_FMA(c3_1C,a32,b2_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a03,b3_0C,c0_0C); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0B; \
SPU_FMA(c1_0C,a13,b3_0C,c1_0C); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0B; \
SPU_FMA(c2_0C,a23,b3_0C,c2_0C); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1B; \
SPU_FMA(c3_0C,a33,b3_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a03,b3_1C,c0_1C); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1B; \
SPU_FMA(c1_1C,a13,b3_1C,c1_1C); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1B; \
SPU_FMA(c2_1C,a23,b3_1C,c2_1C); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1B; \
SPU_FMA(c3_1C,a33,b3_1C,c3_1C); SPU_LNOP; \
}
#define StageBACclr(OFFSET) \
{ \
SPU_FM(c0_0A,a00,b0_0A); \
SPU_FM(c1_0A,a10,b0_0A); \
SPU_FM(c2_0A,a20,b0_0A); \
SPU_FM(c3_0A,a30,b0_0A); \
SPU_FM(c0_1A,a00,b0_1A); \
SPU_FM(c1_1A,a10,b0_1A); \
SPU_FM(c2_1A,a20,b0_1A); \
SPU_FM(c3_1A,a30,b0_1A); \
SPU_FMA(c0_0A,a01,b1_0A,c0_0A); \
SPU_FMA(c1_0A,a11,b1_0A,c1_0A); \
SPU_FMA(c2_0A,a21,b1_0A,c2_0A); b0_0B = *((volatile vector float *)(ptrB+OFFSET+16)); \
SPU_FMA(c3_0A,a31,b1_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a01,b1_1A,c0_1A); b1_0B = *((volatile vector float *)(ptrB+M+OFFSET+16)); \
SPU_FMA(c1_1A,a11,b1_1A,c1_1A); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFSET+16)); \
SPU_FMA(c2_1A,a21,b1_1A,c2_1A); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFSET+16)); \
SPU_FMA(c3_1A,a31,b1_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a02,b2_0A,c0_0A); b0_1B = *((volatile vector float *)(ptrB+OFFSET+20)); \
SPU_FMA(c1_0A,a12,b2_0A,c1_0A); b1_1B = *((volatile vector float *)(ptrB+M+OFFSET+20)); \
SPU_FMA(c2_0A,a22,b2_0A,c2_0A); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFSET+20)); \
SPU_FMA(c3_0A,a32,b2_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFSET+20)); \
SPU_FMA(c1_1A,a12,b2_1A,c1_1A); *((volatile vector float *)(ptrC+OFFSET)) = c0_0C; \
SPU_FMA(c2_1A,a22,b2_1A,c2_1A); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0C; \
SPU_FMA(c3_1A,a32,b2_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a03,b3_0A,c0_0A); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0C; \
SPU_FMA(c1_0A,a13,b3_0A,c1_0A); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0C; \
SPU_FMA(c2_0A,a23,b3_0A,c2_0A); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1C; \
SPU_FMA(c3_0A,a33,b3_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a03,b3_1A,c0_1A); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1C; \
SPU_FMA(c1_1A,a13,b3_1A,c1_1A); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1C; \
SPU_FMA(c2_1A,a23,b3_1A,c2_1A); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1C; \
SPU_FMA(c3_1A,a33,b3_1A,c3_1A); SPU_LNOP; \
}
#define StageCBA(OFFSET,OFFB) \
{ \
ALIGN8B; \
SPU_FMA(c0_0B,a00,b0_0B,c0_0B); c0_0C = *((volatile vector float *)(ptrC+OFFSET+16)); \
SPU_FMA(c1_0B,a10,b0_0B,c1_0B); c1_0C = *((volatile vector float *)(ptrC+M+OFFSET+16)); \
SPU_FMA(c2_0B,a20,b0_0B,c2_0B); c2_0C = *((volatile vector float *)(ptrC+2*M+OFFSET+16)); \
SPU_FMA(c3_0B,a30,b0_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a00,b0_1B,c0_1B); c3_0C = *((volatile vector float *)(ptrC+3*M+OFFSET+16)); \
SPU_FMA(c1_1B,a10,b0_1B,c1_1B); c0_1C = *((volatile vector float *)(ptrC+OFFSET+20)); \
SPU_FMA(c2_1B,a20,b0_1B,c2_1B); c1_1C = *((volatile vector float *)(ptrC+M+OFFSET+20)); \
SPU_FMA(c3_1B,a30,b0_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); c2_1C = *((volatile vector float *)(ptrC+2*M+OFFSET+20)); \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); c3_1C = *((volatile vector float *)(ptrC+3*M+OFFSET+20)); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16)); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A; \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A; \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); SPU_LNOP; \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A; \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A; \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A; \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A; \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A; \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A; \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); SPU_LNOP; \
}
#define StageCBAmod(OFFSET,OFFB) \
{ \
ALIGN8B; \
SPU_FMA(c0_0B,a00,b0_0B,c0_0B); SPU_LNOP; \
SPU_FMA(c1_0B,a10,b0_0B,c1_0B); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFB*M+8)); \
SPU_FMA(c2_0B,a20,b0_0B,c2_0B); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFB*M+12)); \
SPU_FMA(c3_0B,a30,b0_0B,c3_0B); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFB*M+8)); \
SPU_FMA(c0_1B,a00,b0_1B,c0_1B); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFB*M+12)); \
SPU_FMA(c1_1B,a10,b0_1B,c1_1B); c0_0C = *((volatile vector float *)(ptrC+OFFSET+16)); \
SPU_FMA(c2_1B,a20,b0_1B,c2_1B); c1_0C = *((volatile vector float *)(ptrC+M+OFFSET+16)); \
SPU_FMA(c3_1B,a30,b0_1B,c3_1B); c2_0C = *((volatile vector float *)(ptrC+2*M+OFFSET+16)); \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); c3_0C = *((volatile vector float *)(ptrC+3*M+OFFSET+16)); \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); SPU_LNOP; \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); c0_1C = *((volatile vector float *)(ptrC+OFFSET+20)); \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); c1_1C = *((volatile vector float *)(ptrC+M+OFFSET+20)); \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); c2_1C = *((volatile vector float *)(ptrC+2*M+OFFSET+20)); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); c3_1C = *((volatile vector float *)(ptrC+3*M+OFFSET+20)); \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); b0_0C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); b1_0C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16)); \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); b2_0C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); b3_0C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); SPU_LNOP; \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); b0_1C = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20)); \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); b1_1C = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); b2_1C = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); b3_1C = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); *((volatile vector float *)(ptrC+OFFSET)) = c0_0A; \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A; \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A; \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A; \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); SPU_LNOP; \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A; \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A; \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A; \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A; \
}
#define StageACB(OFFSET,OFFB) \
{ \
SPU_FMA(c0_0C,a00,b0_0C,c0_0C); c0_0A = *((volatile vector float *)(ptrC+OFFSET+16)); \
SPU_FMA(c1_0C,a10,b0_0C,c1_0C); c1_0A = *((volatile vector float *)(ptrC+M+OFFSET+16)); \
SPU_FMA(c2_0C,a20,b0_0C,c2_0C); c2_0A = *((volatile vector float *)(ptrC+2*M+OFFSET+16)); \
SPU_FMA(c3_0C,a30,b0_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a00,b0_1C,c0_1C); c3_0A = *((volatile vector float *)(ptrC+3*M+OFFSET+16)); \
SPU_FMA(c1_1C,a10,b0_1C,c1_1C); c0_1A = *((volatile vector float *)(ptrC+OFFSET+20)); \
SPU_FMA(c2_1C,a20,b0_1C,c2_1C); c1_1A = *((volatile vector float *)(ptrC+M+OFFSET+20)); \
SPU_FMA(c3_1C,a30,b0_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a01,b1_0C,c0_0C); c2_1A = *((volatile vector float *)(ptrC+2*M+OFFSET+20)); \
SPU_FMA(c1_0C,a11,b1_0C,c1_0C); c3_1A = *((volatile vector float *)(ptrC+3*M+OFFSET+20)); \
SPU_FMA(c2_0C,a21,b1_0C,c2_0C); b0_0A = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_0C,a31,b1_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a01,b1_1C,c0_1C); b1_0A = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16)); \
SPU_FMA(c1_1C,a11,b1_1C,c1_1C); b2_0A = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c2_1C,a21,b1_1C,c2_1C); b3_0A = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_1C,a31,b1_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a02,b2_0C,c0_0C); b0_1A = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_0C,a12,b2_0C,c1_0C); b1_1A = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20)); \
SPU_FMA(c2_0C,a22,b2_0C,c2_0C); b2_1A = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c3_0C,a32,b2_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a02,b2_1C,c0_1C); b3_1A = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_1C,a12,b2_1C,c1_1C); *((volatile vector float *)(ptrC+OFFSET)) = c0_0B; \
SPU_FMA(c2_1C,a22,b2_1C,c2_1C); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0B; \
SPU_FMA(c3_1C,a32,b2_1C,c3_1C); SPU_LNOP; \
SPU_FMA(c0_0C,a03,b3_0C,c0_0C); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0B; \
SPU_FMA(c1_0C,a13,b3_0C,c1_0C); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0B; \
SPU_FMA(c2_0C,a23,b3_0C,c2_0C); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1B; \
SPU_FMA(c3_0C,a33,b3_0C,c3_0C); SPU_LNOP; \
SPU_FMA(c0_1C,a03,b3_1C,c0_1C); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1B; \
SPU_FMA(c1_1C,a13,b3_1C,c1_1C); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1B; \
SPU_FMA(c2_1C,a23,b3_1C,c2_1C); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1B; \
SPU_FMA(c3_1C,a33,b3_1C,c3_1C); SPU_LNOP; \
}
#define StageBAC(OFFSET,OFFB) \
{ \
SPU_FMA(c0_0A,a00,b0_0A,c0_0A); c0_0B = *((volatile vector float *)(ptrC+OFFSET+16)); \
SPU_FMA(c1_0A,a10,b0_0A,c1_0A); c1_0B = *((volatile vector float *)(ptrC+M+OFFSET+16)); \
SPU_FMA(c2_0A,a20,b0_0A,c2_0A); c2_0B = *((volatile vector float *)(ptrC+2*M+OFFSET+16)); \
SPU_FMA(c3_0A,a30,b0_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a00,b0_1A,c0_1A); c3_0B = *((volatile vector float *)(ptrC+3*M+OFFSET+16)); \
SPU_FMA(c1_1A,a10,b0_1A,c1_1A); c0_1B = *((volatile vector float *)(ptrC+OFFSET+20)); \
SPU_FMA(c2_1A,a20,b0_1A,c2_1A); c1_1B = *((volatile vector float *)(ptrC+M+OFFSET+20)); \
SPU_FMA(c3_1A,a30,b0_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a01,b1_0A,c0_0A); c2_1B = *((volatile vector float *)(ptrC+2*M+OFFSET+20)); \
SPU_FMA(c1_0A,a11,b1_0A,c1_0A); c3_1B = *((volatile vector float *)(ptrC+3*M+OFFSET+20)); \
SPU_FMA(c2_0A,a21,b1_0A,c2_0A); b0_0B = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_0A,a31,b1_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a01,b1_1A,c0_1A); b1_0B = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+16)); \
SPU_FMA(c1_1A,a11,b1_1A,c1_1A); b2_0B = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c2_1A,a21,b1_1A,c2_1A); b3_0B = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+16)); \
SPU_FMA(c3_1A,a31,b1_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a02,b2_0A,c0_0A); b0_1B = *((volatile vector float *)(ptrB+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_0A,a12,b2_0A,c1_0A); b1_1B = *((volatile vector float *)(ptrB+M+OFFSET+OFFB*M+20)); \
SPU_FMA(c2_0A,a22,b2_0A,c2_0A); b2_1B = *((volatile vector float *)(ptrB+2*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c3_0A,a32,b2_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b3_1B = *((volatile vector float *)(ptrB+3*M+OFFSET+OFFB*M+20)); \
SPU_FMA(c1_1A,a12,b2_1A,c1_1A); *((volatile vector float *)(ptrC+OFFSET)) = c0_0C; \
SPU_FMA(c2_1A,a22,b2_1A,c2_1A); *((volatile vector float *)(ptrC+M+OFFSET)) = c1_0C; \
SPU_FMA(c3_1A,a32,b2_1A,c3_1A); SPU_LNOP; \
SPU_FMA(c0_0A,a03,b3_0A,c0_0A); *((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0C; \
SPU_FMA(c1_0A,a13,b3_0A,c1_0A); *((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0C; \
SPU_FMA(c2_0A,a23,b3_0A,c2_0A); *((volatile vector float *)(ptrC+OFFSET+4)) = c0_1C; \
SPU_FMA(c3_0A,a33,b3_0A,c3_0A); SPU_LNOP; \
SPU_FMA(c0_1A,a03,b3_1A,c0_1A); *((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1C; \
SPU_FMA(c1_1A,a13,b3_1A,c1_1A); *((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1C; \
SPU_FMA(c2_1A,a23,b3_1A,c2_1A); *((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1C; \
SPU_FMA(c3_1A,a33,b3_1A,c3_1A); SPU_LNOP; \
}
#define StageMISC(OFFA,OFFB) \
{ \
SPU_FMA(c0_0B,a00,b0_0B,c0_0B); a0 = *((volatile vector float *)(ptrA+OFFA+4)); \
SPU_FMA(c1_0B,a10,b0_0B,c1_0B); a1 = *((volatile vector float *)(ptrA+M+OFFA+4)); \
SPU_FMA(c2_0B,a20,b0_0B,c2_0B); a2 = *((volatile vector float *)(ptrA+2*M+OFFA+4)); \
SPU_FMA(c3_0B,a30,b0_0B,c3_0B); a3 = *((volatile vector float *)(ptrA+3*M+OFFA+4)); \
SPU_FMA(c0_1B,a00,b0_1B,c0_1B); *((volatile vector float *)(ptrC+48)) = c0_0A; \
SPU_FMA(c1_1B,a10,b0_1B,c1_1B); *((volatile vector float *)(ptrC+M+48)) = c1_0A; \
SPU_FMA(c2_1B,a20,b0_1B,c2_1B); a00 = spu_shuffle(a0, a0, pat0); \
SPU_FMA(c3_1B,a30,b0_1B,c3_1B); *((volatile vector float *)(ptrC+2*M+48)) = c2_0A; \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); *((volatile vector float *)(ptrC+3*M+48)) = c3_0A; \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); a10 = spu_shuffle(a1, a1, pat0); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); *((volatile vector float *)(ptrC+52)) = c0_1A; \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); *((volatile vector float *)(ptrC+M+52)) = c1_1A; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); a20 = spu_shuffle(a2, a2, pat0); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+52)) = c2_1A; \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+52)) = c3_1A; \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); a30 = spu_shuffle(a3, a3, pat0); \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); c0_0A = *((volatile vector float *)(ptrC)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); c1_0A = *((volatile vector float *)(ptrC+M)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); a01 = spu_shuffle(a0, a0, pat1); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); c2_0A = *((volatile vector float *)(ptrC+2*M)); \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); c3_0A = *((volatile vector float *)(ptrC+3*M)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); a11 = spu_shuffle(a1, a1, pat1); \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); b0_0A = *((volatile vector float *)(ptrB+4*M+OFFB*M)); \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); b0_1A = *((volatile vector float *)(ptrB+4*M+OFFB*M+4)); \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); a21 = spu_shuffle(a2, a2, pat1); \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); b1_0A = *((volatile vector float *)(ptrB+5*M+OFFB*M)); \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); b1_1A = *((volatile vector float *)(ptrB+5*M+OFFB*M+4)); \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); a31 = spu_shuffle(a3, a3, pat1); \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); c0_1A = *((volatile vector float *)(ptrC+4)); \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); c1_1A = *((volatile vector float *)(ptrC+M+4)); \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); a02 = spu_shuffle(a0, a0, pat2); \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); c2_1A = *((volatile vector float *)(ptrC+2*M+4)); \
SPU_FMA(c0_0A,a00,b0_0A,c0_0A); c3_1A = *((volatile vector float *)(ptrC+3*M+4)); \
SPU_FMA(c1_0A,a10,b0_0A,c1_0A); a12 = spu_shuffle(a1, a1, pat2); \
SPU_FMA(c2_0A,a20,b0_0A,c2_0A); b2_0A = *((volatile vector float *)(ptrB+6*M+OFFB*M)); \
SPU_FMA(c3_0A,a30,b0_0A,c3_0A); b2_1A = *((volatile vector float *)(ptrB+6*M+OFFB*M+4)); \
SPU_FMA(c0_1A,a00,b0_1A,c0_1A); a22 = spu_shuffle(a2, a2, pat2); \
SPU_FMA(c1_1A,a10,b0_1A,c1_1A); b3_0A = *((volatile vector float *)(ptrB+7*M+OFFB*M)); \
SPU_FMA(c2_1A,a20,b0_1A,c2_1A); b3_1A = *((volatile vector float *)(ptrB+7*M+OFFB*M+4)); \
SPU_FMA(c3_1A,a30,b0_1A,c3_1A); a32 = spu_shuffle(a3, a3, pat2); \
SPU_FMA(c0_0A,a01,b1_0A,c0_0A); *((volatile vector float *)(ptrC+56)) = c0_0B; \
SPU_FMA(c1_0A,a11,b1_0A,c1_0A); *((volatile vector float *)(ptrC+M+56)) = c1_0B; \
SPU_FMA(c2_0A,a21,b1_0A,c2_0A); a03 = spu_shuffle(a0, a0, pat3); \
SPU_FMA(c3_0A,a31,b1_0A,c3_0A); *((volatile vector float *)(ptrC+2*M+56)) = c2_0B; \
SPU_FMA(c0_1A,a01,b1_1A,c0_1A); *((volatile vector float *)(ptrC+3*M+56)) = c3_0B; \
SPU_FMA(c1_1A,a11,b1_1A,c1_1A); a13 = spu_shuffle(a1, a1, pat3); \
SPU_FMA(c2_1A,a21,b1_1A,c2_1A); *((volatile vector float *)(ptrC+60)) = c0_1B; \
SPU_FMA(c3_1A,a31,b1_1A,c3_1A); *((volatile vector float *)(ptrC+M+60)) = c1_1B; \
SPU_FMA(c0_0A,a02,b2_0A,c0_0A); a23 = spu_shuffle(a2, a2, pat3); \
SPU_FMA(c1_0A,a12,b2_0A,c1_0A); *((volatile vector float *)(ptrC+2*M+60)) = c2_1B; \
SPU_FMA(c2_0A,a22,b2_0A,c2_0A); *((volatile vector float *)(ptrC+3*M+60)) = c3_1B; \
SPU_FMA(c3_0A,a32,b2_0A,c3_0A); a33 = spu_shuffle(a3, a3, pat3); \
SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b0_0B = *((volatile vector float *)(ptrB+4*M+OFFB*M+8)); \
SPU_FMA(c1_1A,a12,b2_1A,c1_1A); b0_1B = *((volatile vector float *)(ptrB+4*M+OFFB*M+12)); \
SPU_FMA(c2_1A,a22,b2_1A,c2_1A); b1_0B = *((volatile vector float *)(ptrB+5*M+OFFB*M+8)); \
SPU_FMA(c3_1A,a32,b2_1A,c3_1A); b1_1B = *((volatile vector float *)(ptrB+5*M+OFFB*M+12)); \
SPU_FMA(c0_0A,a03,b3_0A,c0_0A); c0_0B = *((volatile vector float *)(ptrC+8)); \
SPU_FMA(c1_0A,a13,b3_0A,c1_0A); c1_0B = *((volatile vector float *)(ptrC+M+8)); \
SPU_FMA(c2_0A,a23,b3_0A,c2_0A); c2_0B = *((volatile vector float *)(ptrC+2*M+8)); \
SPU_FMA(c3_0A,a33,b3_0A,c3_0A); c3_0B = *((volatile vector float *)(ptrC+3*M+8)); \
SPU_FMA(c0_1A,a03,b3_1A,c0_1A); c0_1B = *((volatile vector float *)(ptrC+12)); \
SPU_FMA(c1_1A,a13,b3_1A,c1_1A); c1_1B = *((volatile vector float *)(ptrC+M+12)); \
SPU_FMA(c2_1A,a23,b3_1A,c2_1A); c2_1B = *((volatile vector float *)(ptrC+2*M+12)); \
SPU_FMA(c3_1A,a33,b3_1A,c3_1A); c3_1B = *((volatile vector float *)(ptrC+3*M+12)); \
}
#define StageMISCmod(OFFA,OFFB) \
{ \
SPU_FMA(c0_0B,a00,b0_0B,c0_0B); a0 = *((volatile vector float *)(ptrA+OFFA+4)); \
SPU_FMA(c1_0B,a10,b0_0B,c1_0B); a1 = *((volatile vector float *)(ptrA+M+OFFA+4)); \
SPU_FMA(c2_0B,a20,b0_0B,c2_0B); a2 = *((volatile vector float *)(ptrA+2*M+OFFA+4)); \
SPU_FMA(c3_0B,a30,b0_0B,c3_0B); a3 = *((volatile vector float *)(ptrA+3*M+OFFA+4)); \
SPU_FMA(c0_1B,a00,b0_1B,c0_1B); *((volatile vector float *)(ptrC+48)) = c0_0A; \
SPU_FMA(c1_1B,a10,b0_1B,c1_1B); *((volatile vector float *)(ptrC+M+48)) = c1_0A; \
SPU_FMA(c2_1B,a20,b0_1B,c2_1B); a00 = spu_shuffle(a0, a0, pat0); \
SPU_FMA(c3_1B,a30,b0_1B,c3_1B); *((volatile vector float *)(ptrC+2*M+48)) = c2_0A; \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); *((volatile vector float *)(ptrC+3*M+48)) = c3_0A; \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); a10 = spu_shuffle(a1, a1, pat0); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); *((volatile vector float *)(ptrC+52)) = c0_1A; \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); *((volatile vector float *)(ptrC+M+52)) = c1_1A; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); a20 = spu_shuffle(a2, a2, pat0); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+52)) = c2_1A; \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+52)) = c3_1A; \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); a30 = spu_shuffle(a3, a3, pat0); \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); c0_0A = *((volatile vector float *)(ptrC)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); c1_0A = *((volatile vector float *)(ptrC+M)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); a01 = spu_shuffle(a0, a0, pat1); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); c2_0A = *((volatile vector float *)(ptrC+2*M)); \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); c3_0A = *((volatile vector float *)(ptrC+3*M)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); a11 = spu_shuffle(a1, a1, pat1); \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); b0_0A = *((volatile vector float *)(ptrB+4*M+OFFB*M)); \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); b0_1A = *((volatile vector float *)(ptrB+4*M+OFFB*M+4)); \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); a21 = spu_shuffle(a2, a2, pat1); \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); b1_0A = *((volatile vector float *)(ptrB+5*M+OFFB*M)); \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); b1_1A = *((volatile vector float *)(ptrB+5*M+OFFB*M+4)); \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); a31 = spu_shuffle(a3, a3, pat1); \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); c0_1A = *((volatile vector float *)(ptrC+4)); \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); c1_1A = *((volatile vector float *)(ptrC+M+4)); \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); a02 = spu_shuffle(a0, a0, pat2); \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); c2_1A = *((volatile vector float *)(ptrC+2*M+4)); \
SPU_FMA(c0_0A,a00,b0_0A,c0_0A); c3_1A = *((volatile vector float *)(ptrC+3*M+4)); \
SPU_FMA(c1_0A,a10,b0_0A,c1_0A); a12 = spu_shuffle(a1, a1, pat2); \
SPU_FMA(c2_0A,a20,b0_0A,c2_0A); b2_0A = *((volatile vector float *)(ptrB+6*M+OFFB*M)); \
SPU_FMA(c3_0A,a30,b0_0A,c3_0A); b2_1A = *((volatile vector float *)(ptrB+6*M+OFFB*M+4)); \
SPU_FMA(c0_1A,a00,b0_1A,c0_1A); a22 = spu_shuffle(a2, a2, pat2); \
SPU_FMA(c1_1A,a10,b0_1A,c1_1A); b3_0A = *((volatile vector float *)(ptrB+7*M+OFFB*M)); \
SPU_FMA(c2_1A,a20,b0_1A,c2_1A); b3_1A = *((volatile vector float *)(ptrB+7*M+OFFB*M+4)); \
SPU_FMA(c3_1A,a30,b0_1A,c3_1A); a32 = spu_shuffle(a3, a3, pat2); \
SPU_FMA(c0_0A,a01,b1_0A,c0_0A); *((volatile vector float *)(ptrC+56)) = c0_0B; \
SPU_FMA(c1_0A,a11,b1_0A,c1_0A); *((volatile vector float *)(ptrC+M+56)) = c1_0B; \
SPU_FMA(c2_0A,a21,b1_0A,c2_0A); a03 = spu_shuffle(a0, a0, pat3); \
SPU_FMA(c3_0A,a31,b1_0A,c3_0A); *((volatile vector float *)(ptrC+2*M+56)) = c2_0B; \
SPU_FMA(c0_1A,a01,b1_1A,c0_1A); *((volatile vector float *)(ptrC+3*M+56)) = c3_0B; \
SPU_FMA(c1_1A,a11,b1_1A,c1_1A); a13 = spu_shuffle(a1, a1, pat3); \
SPU_FMA(c2_1A,a21,b1_1A,c2_1A); *((volatile vector float *)(ptrC+60)) = c0_1B; \
SPU_FMA(c3_1A,a31,b1_1A,c3_1A); *((volatile vector float *)(ptrC+M+60)) = c1_1B; \
SPU_FMA(c0_0A,a02,b2_0A,c0_0A); a23 = spu_shuffle(a2, a2, pat3); \
SPU_FMA(c1_0A,a12,b2_0A,c1_0A); *((volatile vector float *)(ptrC+2*M+60)) = c2_1B; \
SPU_FMA(c2_0A,a22,b2_0A,c2_0A); *((volatile vector float *)(ptrC+3*M+60)) = c3_1B; \
SPU_FMA(c3_0A,a32,b2_0A,c3_0A); a33 = spu_shuffle(a3, a3, pat3); \
SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b0_0B = *((volatile vector float *)(ptrB+4*M+OFFB*M+8)); \
SPU_FMA(c1_1A,a12,b2_1A,c1_1A); b0_1B = *((volatile vector float *)(ptrB+4*M+OFFB*M+12)); \
SPU_FMA(c2_1A,a22,b2_1A,c2_1A); b1_0B = *((volatile vector float *)(ptrB+5*M+OFFB*M+8)); \
SPU_FMA(c3_1A,a32,b2_1A,c3_1A); b1_1B = *((volatile vector float *)(ptrB+5*M+OFFB*M+12)); \
SPU_FMA(c0_0A,a03,b3_0A,c0_0A); c0_0B = *((volatile vector float *)(ptrC+8)); \
SPU_FMA(c1_0A,a13,b3_0A,c1_0A); c1_0B = *((volatile vector float *)(ptrC+M+8)); \
SPU_FMA(c2_0A,a23,b3_0A,c2_0A); c2_0B = *((volatile vector float *)(ptrC+2*M+8)); \
SPU_FMA(c3_0A,a33,b3_0A,c3_0A); c3_0B = *((volatile vector float *)(ptrC+3*M+8)); \
SPU_FMA(c0_1A,a03,b3_1A,c0_1A); b2_0B = *((volatile vector float *)(ptrB+6*M+OFFB*M+8)); \
SPU_FMA(c1_1A,a13,b3_1A,c1_1A); b2_1B = *((volatile vector float *)(ptrB+6*M+OFFB*M+12)); \
SPU_FMA(c2_1A,a23,b3_1A,c2_1A); b3_0B = *((volatile vector float *)(ptrB+7*M+OFFB*M+8)); \
SPU_FMA(c3_1A,a33,b3_1A,c3_1A); b3_1B = *((volatile vector float *)(ptrB+7*M+OFFB*M+12)); \
ALIGN8B; \
c0_1B = *((volatile vector float *)(ptrC+12)); \
c1_1B = *((volatile vector float *)(ptrC+M+12)); \
c2_1B = *((volatile vector float *)(ptrC+2*M+12)); \
c3_1B = *((volatile vector float *)(ptrC+3*M+12)); \
ptrB += OFFB*M; \
ALIGN8B; \
}
#define StageMISCclr() \
{ \
SPU_FM(c0_0B,a00,b0_0B); a0 = *((volatile vector float *)(ptrA+4)); \
SPU_FM(c1_0B,a10,b0_0B); a1 = *((volatile vector float *)(ptrA+M+4)); \
SPU_FM(c2_0B,a20,b0_0B); a2 = *((volatile vector float *)(ptrA+2*M+4)); \
SPU_FM(c3_0B,a30,b0_0B); a3 = *((volatile vector float *)(ptrA+3*M+4)); \
SPU_FM(c0_1B,a00,b0_1B); *((volatile vector float *)(ptrC+48)) = c0_0A; \
SPU_FM(c1_1B,a10,b0_1B); *((volatile vector float *)(ptrC+M+48)) = c1_0A; \
SPU_FM(c2_1B,a20,b0_1B); a00 = spu_shuffle(a0, a0, pat0); \
SPU_FM(c3_1B,a30,b0_1B); *((volatile vector float *)(ptrC+2*M+48)) = c2_0A; \
SPU_FMA(c0_0B,a01,b1_0B,c0_0B); *((volatile vector float *)(ptrC+3*M+48)) = c3_0A; \
SPU_FMA(c1_0B,a11,b1_0B,c1_0B); a10 = spu_shuffle(a1, a1, pat0); \
SPU_FMA(c2_0B,a21,b1_0B,c2_0B); *((volatile vector float *)(ptrC+52)) = c0_1A; \
SPU_FMA(c3_0B,a31,b1_0B,c3_0B); *((volatile vector float *)(ptrC+M+52)) = c1_1A; \
SPU_FMA(c0_1B,a01,b1_1B,c0_1B); a20 = spu_shuffle(a2, a2, pat0); \
SPU_FMA(c1_1B,a11,b1_1B,c1_1B); *((volatile vector float *)(ptrC+2*M+52)) = c2_1A; \
SPU_FMA(c2_1B,a21,b1_1B,c2_1B); *((volatile vector float *)(ptrC+3*M+52)) = c3_1A; \
SPU_FMA(c3_1B,a31,b1_1B,c3_1B); a30 = spu_shuffle(a3, a3, pat0); \
SPU_FMA(c0_0B,a02,b2_0B,c0_0B); c0_0A = *((volatile vector float *)(ptrC)); \
SPU_FMA(c1_0B,a12,b2_0B,c1_0B); c1_0A = *((volatile vector float *)(ptrC+M)); \
SPU_FMA(c2_0B,a22,b2_0B,c2_0B); a01 = spu_shuffle(a0, a0, pat1); \
SPU_FMA(c3_0B,a32,b2_0B,c3_0B); c2_0A = *((volatile vector float *)(ptrC+2*M)); \
SPU_FMA(c0_1B,a02,b2_1B,c0_1B); c3_0A = *((volatile vector float *)(ptrC+3*M)); \
SPU_FMA(c1_1B,a12,b2_1B,c1_1B); a11 = spu_shuffle(a1, a1, pat1); \
SPU_FMA(c2_1B,a22,b2_1B,c2_1B); b0_0A = *((volatile vector float *)(ptrB+4*M)); \
SPU_FMA(c3_1B,a32,b2_1B,c3_1B); b0_1A = *((volatile vector float *)(ptrB+4*M+4)); \
SPU_FMA(c0_0B,a03,b3_0B,c0_0B); a21 = spu_shuffle(a2, a2, pat1); \
SPU_FMA(c1_0B,a13,b3_0B,c1_0B); b1_0A = *((volatile vector float *)(ptrB+5*M)); \
SPU_FMA(c2_0B,a23,b3_0B,c2_0B); b1_1A = *((volatile vector float *)(ptrB+5*M+4)); \
SPU_FMA(c3_0B,a33,b3_0B,c3_0B); a31 = spu_shuffle(a3, a3, pat1); \
SPU_FMA(c0_1B,a03,b3_1B,c0_1B); c0_1A = *((volatile vector float *)(ptrC+4)); \
SPU_FMA(c1_1B,a13,b3_1B,c1_1B); c1_1A = *((volatile vector float *)(ptrC+M+4)); \
SPU_FMA(c2_1B,a23,b3_1B,c2_1B); a02 = spu_shuffle(a0, a0, pat2); \
SPU_FMA(c3_1B,a33,b3_1B,c3_1B); c2_1A = *((volatile vector float *)(ptrC+2*M+4)); \
SPU_FMA(c0_0A,a00,b0_0A,c0_0A); c3_1A = *((volatile vector float *)(ptrC+3*M+4)); \
SPU_FMA(c1_0A,a10,b0_0A,c1_0A); a12 = spu_shuffle(a1, a1, pat2); \
SPU_FMA(c2_0A,a20,b0_0A,c2_0A); b2_0A = *((volatile vector float *)(ptrB+6*M)); \
SPU_FMA(c3_0A,a30,b0_0A,c3_0A); b2_1A = *((volatile vector float *)(ptrB+6*M+4)); \
SPU_FMA(c0_1A,a00,b0_1A,c0_1A); a22 = spu_shuffle(a2, a2, pat2); \
SPU_FMA(c1_1A,a10,b0_1A,c1_1A); b3_0A = *((volatile vector float *)(ptrB+7*M)); \
SPU_FMA(c2_1A,a20,b0_1A,c2_1A); b3_1A = *((volatile vector float *)(ptrB+7*M+4)); \
SPU_FMA(c3_1A,a30,b0_1A,c3_1A); a32 = spu_shuffle(a3, a3, pat2); \
SPU_FMA(c0_0A,a01,b1_0A,c0_0A); *((volatile vector float *)(ptrC+56)) = c0_0B; \
SPU_FMA(c1_0A,a11,b1_0A,c1_0A); *((volatile vector float *)(ptrC+M+56)) = c1_0B; \
SPU_FMA(c2_0A,a21,b1_0A,c2_0A); a03 = spu_shuffle(a0, a0, pat3); \
SPU_FMA(c3_0A,a31,b1_0A,c3_0A); *((volatile vector float *)(ptrC+2*M+56)) = c2_0B; \
SPU_FMA(c0_1A,a01,b1_1A,c0_1A); *((volatile vector float *)(ptrC+3*M+56)) = c3_0B; \
SPU_FMA(c1_1A,a11,b1_1A,c1_1A); a13 = spu_shuffle(a1, a1, pat3); \
SPU_FMA(c2_1A,a21,b1_1A,c2_1A); *((volatile vector float *)(ptrC+60)) = c0_1B; \
SPU_FMA(c3_1A,a31,b1_1A,c3_1A); *((volatile vector float *)(ptrC+M+60)) = c1_1B; \
SPU_FMA(c0_0A,a02,b2_0A,c0_0A); a23 = spu_shuffle(a2, a2, pat3); \
SPU_FMA(c1_0A,a12,b2_0A,c1_0A); *((volatile vector float *)(ptrC+2*M+60)) = c2_1B; \
SPU_FMA(c2_0A,a22,b2_0A,c2_0A); *((volatile vector float *)(ptrC+3*M+60)) = c3_1B; \
SPU_FMA(c3_0A,a32,b2_0A,c3_0A); a33 = spu_shuffle(a3, a3, pat3); \
SPU_FMA(c0_1A,a02,b2_1A,c0_1A); b0_0B = *((volatile vector float *)(ptrB+4*M+8)); \
SPU_FMA(c1_1A,a12,b2_1A,c1_1A); b0_1B = *((volatile vector float *)(ptrB+4*M+12)); \
SPU_FMA(c2_1A,a22,b2_1A,c2_1A); b1_0B = *((volatile vector float *)(ptrB+5*M+8)); \
SPU_FMA(c3_1A,a32,b2_1A,c3_1A); b1_1B = *((volatile vector float *)(ptrB+5*M+12)); \
SPU_FMA(c0_0A,a03,b3_0A,c0_0A); c0_0B = *((volatile vector float *)(ptrC+8)); \
SPU_FMA(c1_0A,a13,b3_0A,c1_0A); c1_0B = *((volatile vector float *)(ptrC+M+8)); \
SPU_FMA(c2_0A,a23,b3_0A,c2_0A); c2_0B = *((volatile vector float *)(ptrC+2*M+8)); \
SPU_FMA(c3_0A,a33,b3_0A,c3_0A); c3_0B = *((volatile vector float *)(ptrC+3*M+8)); \
SPU_FMA(c0_1A,a03,b3_1A,c0_1A); b2_0B = *((volatile vector float *)(ptrB+6*M+8)); \
SPU_FMA(c1_1A,a13,b3_1A,c1_1A); b2_1B = *((volatile vector float *)(ptrB+6*M+12)); \
SPU_FMA(c2_1A,a23,b3_1A,c2_1A); b3_0B = *((volatile vector float *)(ptrB+7*M+8)); \
SPU_FMA(c3_1A,a33,b3_1A,c3_1A); b3_1B = *((volatile vector float *)(ptrB+7*M+12)); \
ALIGN8B; \
c0_1B = *((volatile vector float *)(ptrC+12)); \
c1_1B = *((volatile vector float *)(ptrC+M+12)); \
c2_1B = *((volatile vector float *)(ptrC+2*M+12)); \
c3_1B = *((volatile vector float *)(ptrC+3*M+12)); \
}
#define Loads4RegSetA(OFFSET) \
{ \
c0_0A = *((volatile vector float *)(ptrC+OFFSET)); \
c1_0A = *((volatile vector float *)(ptrC+M+OFFSET)); \
c2_0A = *((volatile vector float *)(ptrC+2*M+OFFSET)); \
c3_0A = *((volatile vector float *)(ptrC+3*M+OFFSET)); \
c0_1A = *((volatile vector float *)(ptrC+OFFSET+4)); \
c1_1A = *((volatile vector float *)(ptrC+M+OFFSET+4)); \
c2_1A = *((volatile vector float *)(ptrC+2*M+OFFSET+4)); \
c3_1A = *((volatile vector float *)(ptrC+3*M+OFFSET+4)); \
b0_0A = *((volatile vector float *)(ptrB+OFFSET)); \
b1_0A = *((volatile vector float *)(ptrB+M+OFFSET)); \
b2_0A = *((volatile vector float *)(ptrB+2*M+OFFSET)); \
b3_0A = *((volatile vector float *)(ptrB+3*M+OFFSET)); \
b0_1A = *((volatile vector float *)(ptrB+OFFSET+4)); \
b1_1A = *((volatile vector float *)(ptrB+M+OFFSET+4)); \
b2_1A = *((volatile vector float *)(ptrB+2*M+OFFSET+4)); \
b3_1A = *((volatile vector float *)(ptrB+3*M+OFFSET+4)); \
}
#define Loads4RegSetAClr(OFFSET) \
{ \
b0_0A = *((volatile vector float *)(ptrB+OFFSET)); \
b1_0A = *((volatile vector float *)(ptrB+M+OFFSET)); \
b2_0A = *((volatile vector float *)(ptrB+2*M+OFFSET)); \
b3_0A = *((volatile vector float *)(ptrB+3*M+OFFSET)); \
b0_1A = *((volatile vector float *)(ptrB+OFFSET+4)); \
b1_1A = *((volatile vector float *)(ptrB+M+OFFSET+4)); \
b2_1A = *((volatile vector float *)(ptrB+2*M+OFFSET+4)); \
b3_1A = *((volatile vector float *)(ptrB+3*M+OFFSET+4)); \
}
#define Ops4RegSetAClr() \
{ \
c0_0A = spu_mul( a00, b0_0A); \
c1_0A = spu_mul( a10, b0_0A); \
c2_0A = spu_mul( a20, b0_0A); \
c3_0A = spu_mul( a30, b0_0A); \
c0_1A = spu_mul( a00, b0_1A); \
c1_1A = spu_mul( a10, b0_1A); \
c2_1A = spu_mul( a20, b0_1A); \
c3_1A = spu_mul( a30, b0_1A); \
c0_0A = spu_madd(a01, b1_0A, c0_0A); \
c1_0A = spu_madd(a11, b1_0A, c1_0A); \
c2_0A = spu_madd(a21, b1_0A, c2_0A); \
c3_0A = spu_madd(a31, b1_0A, c3_0A); \
c0_1A = spu_madd(a01, b1_1A, c0_1A); \
c1_1A = spu_madd(a11, b1_1A, c1_1A); \
c2_1A = spu_madd(a21, b1_1A, c2_1A); \
c3_1A = spu_madd(a31, b1_1A, c3_1A); \
c0_0A = spu_madd(a02, b2_0A, c0_0A); \
c1_0A = spu_madd(a12, b2_0A, c1_0A); \
c2_0A = spu_madd(a22, b2_0A, c2_0A); \
c3_0A = spu_madd(a32, b2_0A, c3_0A); \
c0_1A = spu_madd(a02, b2_1A, c0_1A); \
c1_1A = spu_madd(a12, b2_1A, c1_1A); \
c2_1A = spu_madd(a22, b2_1A, c2_1A); \
c3_1A = spu_madd(a32, b2_1A, c3_1A); \
c0_0A = spu_madd(a03, b3_0A, c0_0A); \
c1_0A = spu_madd(a13, b3_0A, c1_0A); \
c2_0A = spu_madd(a23, b3_0A, c2_0A); \
c3_0A = spu_madd(a33, b3_0A, c3_0A); \
c0_1A = spu_madd(a03, b3_1A, c0_1A); \
c1_1A = spu_madd(a13, b3_1A, c1_1A); \
c2_1A = spu_madd(a23, b3_1A, c2_1A); \
c3_1A = spu_madd(a33, b3_1A, c3_1A); \
}
#define Ops4RegSetA() \
{ \
c0_0A = spu_madd(a00, b0_0A, c0_0A); \
c1_0A = spu_madd(a10, b0_0A, c1_0A); \
c2_0A = spu_madd(a20, b0_0A, c2_0A); \
c3_0A = spu_madd(a30, b0_0A, c3_0A); \
c0_1A = spu_madd(a00, b0_1A, c0_1A); \
c1_1A = spu_madd(a10, b0_1A, c1_1A); \
c2_1A = spu_madd(a20, b0_1A, c2_1A); \
c3_1A = spu_madd(a30, b0_1A, c3_1A); \
c0_0A = spu_madd(a01, b1_0A, c0_0A); \
c1_0A = spu_madd(a11, b1_0A, c1_0A); \
c2_0A = spu_madd(a21, b1_0A, c2_0A); \
c3_0A = spu_madd(a31, b1_0A, c3_0A); \
c0_1A = spu_madd(a01, b1_1A, c0_1A); \
c1_1A = spu_madd(a11, b1_1A, c1_1A); \
c2_1A = spu_madd(a21, b1_1A, c2_1A); \
c3_1A = spu_madd(a31, b1_1A, c3_1A); \
c0_0A = spu_madd(a02, b2_0A, c0_0A); \
c1_0A = spu_madd(a12, b2_0A, c1_0A); \
c2_0A = spu_madd(a22, b2_0A, c2_0A); \
c3_0A = spu_madd(a32, b2_0A, c3_0A); \
c0_1A = spu_madd(a02, b2_1A, c0_1A); \
c1_1A = spu_madd(a12, b2_1A, c1_1A); \
c2_1A = spu_madd(a22, b2_1A, c2_1A); \
c3_1A = spu_madd(a32, b2_1A, c3_1A); \
c0_0A = spu_madd(a03, b3_0A, c0_0A); \
c1_0A = spu_madd(a13, b3_0A, c1_0A); \
c2_0A = spu_madd(a23, b3_0A, c2_0A); \
c3_0A = spu_madd(a33, b3_0A, c3_0A); \
c0_1A = spu_madd(a03, b3_1A, c0_1A); \
c1_1A = spu_madd(a13, b3_1A, c1_1A); \
c2_1A = spu_madd(a23, b3_1A, c2_1A); \
c3_1A = spu_madd(a33, b3_1A, c3_1A); \
}
#define Stores4RegSetA(OFFSET) \
{ \
*((volatile vector float *)(ptrC+OFFSET)) = c0_0A; \
*((volatile vector float *)(ptrC+M+OFFSET)) = c1_0A; \
*((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0A; \
*((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0A; \
*((volatile vector float *)(ptrC+OFFSET+4)) = c0_1A; \
*((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1A; \
*((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1A; \
*((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1A; \
}
#define Loads4RegSetB(OFFSET) \
{ \
c0_0B = *((volatile vector float *)(ptrC+OFFSET)); \
c1_0B = *((volatile vector float *)(ptrC+M+OFFSET)); \
c2_0B = *((volatile vector float *)(ptrC+2*M+OFFSET)); \
c3_0B = *((volatile vector float *)(ptrC+3*M+OFFSET)); \
c0_1B = *((volatile vector float *)(ptrC+OFFSET+4)); \
c1_1B = *((volatile vector float *)(ptrC+M+OFFSET+4)); \
c2_1B = *((volatile vector float *)(ptrC+2*M+OFFSET+4)); \
c3_1B = *((volatile vector float *)(ptrC+3*M+OFFSET+4)); \
b0_0B = *((volatile vector float *)(ptrB+OFFSET)); \
b1_0B = *((volatile vector float *)(ptrB+M+OFFSET)); \
b2_0B = *((volatile vector float *)(ptrB+2*M+OFFSET)); \
b3_0B = *((volatile vector float *)(ptrB+3*M+OFFSET)); \
b0_1B = *((volatile vector float *)(ptrB+OFFSET+4)); \
b1_1B = *((volatile vector float *)(ptrB+M+OFFSET+4)); \
b2_1B = *((volatile vector float *)(ptrB+2*M+OFFSET+4)); \
b3_1B = *((volatile vector float *)(ptrB+3*M+OFFSET+4)); \
}
#define Loads4RegSetBClr(OFFSET) \
{ \
b0_0B = *((volatile vector float *)(ptrB+OFFSET)); \
b1_0B = *((volatile vector float *)(ptrB+M+OFFSET)); \
b2_0B = *((volatile vector float *)(ptrB+2*M+OFFSET)); \
b3_0B = *((volatile vector float *)(ptrB+3*M+OFFSET)); \
b0_1B = *((volatile vector float *)(ptrB+OFFSET+4)); \
b1_1B = *((volatile vector float *)(ptrB+M+OFFSET+4)); \
b2_1B = *((volatile vector float *)(ptrB+2*M+OFFSET+4)); \
b3_1B = *((volatile vector float *)(ptrB+3*M+OFFSET+4)); \
}
#define Ops4RegSetB() \
{ \
c0_0B = spu_madd(a00, b0_0B, c0_0B); \
c1_0B = spu_madd(a10, b0_0B, c1_0B); \
c2_0B = spu_madd(a20, b0_0B, c2_0B); \
c3_0B = spu_madd(a30, b0_0B, c3_0B); \
c0_1B = spu_madd(a00, b0_1B, c0_1B); \
c1_1B = spu_madd(a10, b0_1B, c1_1B); \
c2_1B = spu_madd(a20, b0_1B, c2_1B); \
c3_1B = spu_madd(a30, b0_1B, c3_1B); \
c0_0B = spu_madd(a01, b1_0B, c0_0B); \
c1_0B = spu_madd(a11, b1_0B, c1_0B); \
c2_0B = spu_madd(a21, b1_0B, c2_0B); \
c3_0B = spu_madd(a31, b1_0B, c3_0B); \
c0_1B = spu_madd(a01, b1_1B, c0_1B); \
c1_1B = spu_madd(a11, b1_1B, c1_1B); \
c2_1B = spu_madd(a21, b1_1B, c2_1B); \
c3_1B = spu_madd(a31, b1_1B, c3_1B); \
c0_0B = spu_madd(a02, b2_0B, c0_0B); SPU_LNOP; \
c1_0B = spu_madd(a12, b2_0B, c1_0B); \
c2_0B = spu_madd(a22, b2_0B, c2_0B); \
c3_0B = spu_madd(a32, b2_0B, c3_0B); \
c0_1B = spu_madd(a02, b2_1B, c0_1B); \
c1_1B = spu_madd(a12, b2_1B, c1_1B); \
c2_1B = spu_madd(a22, b2_1B, c2_1B); \
c3_1B = spu_madd(a32, b2_1B, c3_1B); \
c0_0B = spu_madd(a03, b3_0B, c0_0B); \
c1_0B = spu_madd(a13, b3_0B, c1_0B); \
c2_0B = spu_madd(a23, b3_0B, c2_0B); \
c3_0B = spu_madd(a33, b3_0B, c3_0B); \
c0_1B = spu_madd(a03, b3_1B, c0_1B); \
c1_1B = spu_madd(a13, b3_1B, c1_1B); \
c2_1B = spu_madd(a23, b3_1B, c2_1B); \
c3_1B = spu_madd(a33, b3_1B, c3_1B); \
}
#define Stores4RegSetB(OFFSET) \
{ \
*((volatile vector float *)(ptrC+OFFSET)) = c0_0B; \
*((volatile vector float *)(ptrC+M+OFFSET)) = c1_0B; \
*((volatile vector float *)(ptrC+2*M+OFFSET)) = c2_0B; \
*((volatile vector float *)(ptrC+3*M+OFFSET)) = c3_0B; \
*((volatile vector float *)(ptrC+OFFSET+4)) = c0_1B; \
*((volatile vector float *)(ptrC+M+OFFSET+4)) = c1_1B; \
*((volatile vector float *)(ptrC+2*M+OFFSET+4)) = c2_1B; \
*((volatile vector float *)(ptrC+3*M+OFFSET+4)) = c3_1B; \
}
float blkA0[M*M] __attribute__((aligned(128)));
float blkB0[M*M] __attribute__((aligned(128)));
float blkC0[M*M] __attribute__((aligned(128)));
float blkA1[M*M] __attribute__((aligned(128)));
float blkB1[M*M] __attribute__((aligned(128)));
float blkC1[M*M] __attribute__((aligned(128)));
// For synchronnization
char lsbuf[128] __attribute__((aligned(128)));
static void Block_Fetch(volatile float *blkA, volatile float *blkB,
unsigned int by, unsigned int bx, unsigned int idx, unsigned int rtag)
{
unsigned int baseA, baseB;
baseA = A_AREA + 4*(by*M*N+idx*M*M);
baseB = B_AREA + 4*(bx*M*M+idx*M*N);
mfc_get(blkA, baseA, sizeof(float)*M*M, rtag, 0, 0);
mfc_get(blkB, baseB, sizeof(float)*M*M, rtag, 0, 0);
}
static void MatInit_MxM(volatile float *blkC, volatile float *blkA, volatile float *blkB)
{
unsigned int i;
volatile float *ptrA, *ptrB, *ptrC;
vector float a0, a1, a2, a3;
vector float a00, a01, a02, a03;
vector float a10, a11, a12, a13;
vector float a20, a21, a22, a23;
vector float a30, a31, a32, a33;
vector float b0_0A, b1_0A, b2_0A, b3_0A;
vector float c0_0A, c1_0A, c2_0A, c3_0A;
vector float b0_1A, b1_1A, b2_1A, b3_1A;
vector float c0_1A, c1_1A, c2_1A, c3_1A;
vector float b0_0B, b1_0B, b2_0B, b3_0B;
vector float c0_0B, c1_0B, c2_0B, c3_0B;
vector float b0_1B, b1_1B, b2_1B, b3_1B;
vector float c0_1B, c1_1B, c2_1B, c3_1B;
vector float b0_0C, b1_0C, b2_0C, b3_0C;
vector float c0_0C, c1_0C, c2_0C, c3_0C;
vector float b0_1C, b1_1C, b2_1C, b3_1C;
vector float c0_1C, c1_1C, c2_1C, c3_1C;
const vector unsigned char pat0 = VEC_LITERAL(vector unsigned char,
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03);
const vector unsigned char pat1 = VEC_LITERAL(vector unsigned char,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07);
const vector unsigned char pat2 = VEC_LITERAL(vector unsigned char,
0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b,
0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b);
const vector unsigned char pat3 = VEC_LITERAL(vector unsigned char,
0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f,
0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f);
for(i=0; i<M; i+=4){
ptrA = &blkA[i*M];
ptrB = &blkB[0];
ptrC = &blkC[i*M];
a0 = *((volatile vector float *)(ptrA));
a1 = *((volatile vector float *)(ptrA+M));
a2 = *((volatile vector float *)(ptrA+2*M));
a3 = *((volatile vector float *)(ptrA+3*M));
a00 = spu_shuffle(a0, a0, pat0);
a01 = spu_shuffle(a0, a0, pat1);
a02 = spu_shuffle(a0, a0, pat2);
a03 = spu_shuffle(a0, a0, pat3);
a10 = spu_shuffle(a1, a1, pat0);
a11 = spu_shuffle(a1, a1, pat1);
a12 = spu_shuffle(a1, a1, pat2);
a13 = spu_shuffle(a1, a1, pat3);
a20 = spu_shuffle(a2, a2, pat0);
a21 = spu_shuffle(a2, a2, pat1);
a22 = spu_shuffle(a2, a2, pat2);
a23 = spu_shuffle(a2, a2, pat3);
a30 = spu_shuffle(a3, a3, pat0);
a31 = spu_shuffle(a3, a3, pat1);
a32 = spu_shuffle(a3, a3, pat2);
a33 = spu_shuffle(a3, a3, pat3);
Loads4RegSetAClr(0);
Ops4RegSetAClr();
Loads4RegSetBClr(8);
StageCBAclr(0);
StageACBclr(8);
StageBACclr(16);
StageCBAclr(24);
StageACBclr(32);
StageBACclr(40);
StageMISCclr();
StageCBA(0,4);
StageACB(8,4);
StageBAC(16,4);
StageCBA(24,4);
StageACB(32,4);
StageBAC(40,4);
StageMISC(4,4);
StageCBAmod(0,8);
StageACB(8,8);
StageBAC(16,8);
StageCBA(24,8);
StageACB(32,8);
StageBAC(40,8);
StageMISC(8,8);
StageCBAmod(0,12);
StageACB(8,12);
StageBAC(16,12);
StageCBA(24,12);
StageACB(32,12);
StageBAC(40,12);
StageMISC(12,12);
StageCBAmod(0,16);
StageACB(8,16);
StageBAC(16,16);
StageCBA(24,16);
StageACB(32,16);
StageBAC(40,16);
StageMISC(16,16);
StageCBAmod(0,20);
StageACB(8,20);
StageBAC(16,20);
StageCBA(24,20);
StageACB(32,20);
StageBAC(40,20);
StageMISC(20,20);
StageCBAmod(0,24);
StageACB(8,24);
StageBAC(16,24);
StageCBA(24,24);
StageACB(32,24);
StageBAC(40,24);
StageMISCmod(24,24);
StageCBA(0,4);
StageACB(8,4);
StageBAC(16,4);
StageCBA(24,4);
StageACB(32,4);
StageBAC(40,4);
StageMISC(28,4);
StageCBAmod(0,8);
StageACB(8,8);
StageBAC(16,8);
StageCBA(24,8);
StageACB(32,8);
StageBAC(40,8);
StageMISC(32,8);
StageCBAmod(0,12);
StageACB(8,12);
StageBAC(16,12);
StageCBA(24,12);
StageACB(32,12);
StageBAC(40,12);
StageMISC(36,12);
StageCBAmod(0,16);
StageACB(8,16);
StageBAC(16,16);
StageCBA(24,16);
StageACB(32,16);
StageBAC(40,16);
StageMISC(40,16);
StageCBAmod(0,20);
StageACB(8,20);
StageBAC(16,20);
StageCBA(24,20);
StageACB(32,20);
StageBAC(40,20);
StageMISC(44,20);
StageCBAmod(0,24);
StageACB(8,24);
StageBAC(16,24);
StageCBA(24,24);
StageACB(32,24);
StageBAC(40,24);
StageMISCmod(48,24);
StageCBA(0,4);
StageACB(8,4);
StageBAC(16,4);
StageCBA(24,4);
StageACB(32,4);
StageBAC(40,4);
StageMISC(52,4);
StageCBAmod(0,8);
StageACB(8,8);
StageBAC(16,8);
StageCBA(24,8);
StageACB(32,8);
StageBAC(40,8);
StageMISC(56,8);
StageCBAmod(0,12);
StageACB(8,12);
StageBAC(16,12);
StageCBA(24,12);
StageACB(32,12);
StageBAC(40,12);
Ops4RegSetB();
Stores4RegSetA(48);
Stores4RegSetB(56);
}
}
static void MatMult_MxM(volatile float *blkC, volatile float *blkA, volatile float *blkB)
{
unsigned int i;
volatile float *ptrA, *ptrB, *ptrC;
vector float a0, a1, a2, a3;
vector float a00, a01, a02, a03;
vector float a10, a11, a12, a13;
vector float a20, a21, a22, a23;
vector float a30, a31, a32, a33;
vector float b0_0A, b1_0A, b2_0A, b3_0A;
vector float c0_0A, c1_0A, c2_0A, c3_0A;
vector float b0_1A, b1_1A, b2_1A, b3_1A;
vector float c0_1A, c1_1A, c2_1A, c3_1A;
vector float b0_0B, b1_0B, b2_0B, b3_0B;
vector float c0_0B, c1_0B, c2_0B, c3_0B;
vector float b0_1B, b1_1B, b2_1B, b3_1B;
vector float c0_1B, c1_1B, c2_1B, c3_1B;
vector float b0_0C, b1_0C, b2_0C, b3_0C;
vector float c0_0C, c1_0C, c2_0C, c3_0C;
vector float b0_1C, b1_1C, b2_1C, b3_1C;
vector float c0_1C, c1_1C, c2_1C, c3_1C;
const vector unsigned char pat0 = VEC_LITERAL(vector unsigned char,
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03);
const vector unsigned char pat1 = VEC_LITERAL(vector unsigned char,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07);
const vector unsigned char pat2 = VEC_LITERAL(vector unsigned char,
0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b,
0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b);
const vector unsigned char pat3 = VEC_LITERAL(vector unsigned char,
0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f,
0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f);
for(i=0; i<M; i+=4){
ptrA = &blkA[i*M];
ptrB = &blkB[0];
ptrC = &blkC[i*M];
a0 = *((volatile vector float *)(ptrA));
a1 = *((volatile vector float *)(ptrA+M));
a2 = *((volatile vector float *)(ptrA+2*M));
a3 = *((volatile vector float *)(ptrA+3*M));
a00 = spu_shuffle(a0, a0, pat0);
a01 = spu_shuffle(a0, a0, pat1);
a02 = spu_shuffle(a0, a0, pat2);
a03 = spu_shuffle(a0, a0, pat3);
a10 = spu_shuffle(a1, a1, pat0);
a11 = spu_shuffle(a1, a1, pat1);
a12 = spu_shuffle(a1, a1, pat2);
a13 = spu_shuffle(a1, a1, pat3);
a20 = spu_shuffle(a2, a2, pat0);
a21 = spu_shuffle(a2, a2, pat1);
a22 = spu_shuffle(a2, a2, pat2);
a23 = spu_shuffle(a2, a2, pat3);
a30 = spu_shuffle(a3, a3, pat0);
a31 = spu_shuffle(a3, a3, pat1);
a32 = spu_shuffle(a3, a3, pat2);
a33 = spu_shuffle(a3, a3, pat3);
Loads4RegSetA(0);
Ops4RegSetA();
Loads4RegSetB(8);
StageCBA(0,0);
StageACB(8,0);
StageBAC(16,0);
StageCBA(24,0);
StageACB(32,0);
StageBAC(40,0);
StageMISC(0,0);
StageCBAmod(0,4);
StageACB(8,4);
StageBAC(16,4);
StageCBA(24,4);
StageACB(32,4);
StageBAC(40,4);
StageMISC(4,4);
StageCBAmod(0,8);
StageACB(8,8);
StageBAC(16,8);
StageCBA(24,8);
StageACB(32,8);
StageBAC(40,8);
StageMISC(8,8);
StageCBAmod(0,12);
StageACB(8,12);
StageBAC(16,12);
StageCBA(24,12);
StageACB(32,12);
StageBAC(40,12);
StageMISC(12,12);
StageCBAmod(0,16);
StageACB(8,16);
StageBAC(16,16);
StageCBA(24,16);
StageACB(32,16);
StageBAC(40,16);
StageMISC(16,16);
StageCBAmod(0,20);
StageACB(8,20);
StageBAC(16,20);
StageCBA(24,20);
StageACB(32,20);
StageBAC(40,20);
StageMISC(20,20);
StageCBAmod(0,24);
StageACB(8,24);
StageBAC(16,24);
StageCBA(24,24);
StageACB(32,24);
StageBAC(40,24);
StageMISCmod(24,24);
StageCBA(0,4);
StageACB(8,4);
StageBAC(16,4);
StageCBA(24,4);
StageACB(32,4);
StageBAC(40,4);
StageMISC(28,4);
StageCBAmod(0,8);
StageACB(8,8);
StageBAC(16,8);
StageCBA(24,8);
StageACB(32,8);
StageBAC(40,8);
StageMISC(32,8);
StageCBAmod(0,12);
StageACB(8,12);
StageBAC(16,12);
StageCBA(24,12);
StageACB(32,12);
StageBAC(40,12);
StageMISC(36,12);
StageCBAmod(0,16);
StageACB(8,16);
StageBAC(16,16);
StageCBA(24,16);
StageACB(32,16);
StageBAC(40,16);
StageMISC(40,16);
StageCBAmod(0,20);
StageACB(8,20);
StageBAC(16,20);
StageCBA(24,20);
StageACB(32,20);
StageBAC(40,20);
StageMISC(44,20);
StageCBAmod(0,24);
StageACB(8,24);
StageBAC(16,24);
StageCBA(24,24);
StageACB(32,24);
StageBAC(40,24);
StageMISCmod(48,24);
StageCBA(0,4);
StageACB(8,4);
StageBAC(16,4);
StageCBA(24,4);
StageACB(32,4);
StageBAC(40,4);
StageMISC(52,4);
StageCBAmod(0,8);
StageACB(8,8);
StageBAC(16,8);
StageCBA(24,8);
StageACB(32,8);
StageBAC(40,8);
StageMISC(56,8);
StageCBAmod(0,12);
StageACB(8,12);
StageBAC(16,12);
StageCBA(24,12);
StageACB(32,12);
StageBAC(40,12);
Ops4RegSetB();
Stores4RegSetA(48);
Stores4RegSetB(56);
}
}
static void Block_Store(volatile float *blkC, int by, int bx, int wtag)
{
unsigned int baseC;
baseC = C_AREA + 4*(by*M*N+bx*M*M);
mfc_put(blkC, baseC, sizeof(float)*M*M, wtag, 0, 0);
}
int main()
{
unsigned int i, j, k;
unsigned int i_next, j_next;
volatile float *InA, *InB;
volatile float *OpA, *OpB, *OpC;
volatile float *OutC;
unsigned int InTag = DIN_TAG;
unsigned int OutTag = DOUT_TAG;
unsigned int blkid = 0;
unsigned int shift, mask;
unsigned int blocks, tiles;
// Double Buffer Initialization
InA = blkA0;
InB = blkB0;
OpA = blkA0;
OpB = blkB0;
OpC = blkC0;
OutC = blkC0;
// Fetch working parameters from mailbox. The input
// parameters include:
// N = the size of the matrix
// ITER = number of times to perform matrix multiply
// FINC_AREA2 = pointer to synchronization counter
// A_AREA = pointer to input matrix A
// B_AREA = pointer to input matrix B
// C_AREA = pointer to output matrix C
N = spu_read_in_mbox();
ITER = spu_read_in_mbox();
FINC_AREA2 = (atomic_ea_t)spu_read_in_mbox();
A_AREA = spu_read_in_mbox();
B_AREA = spu_read_in_mbox();
C_AREA = spu_read_in_mbox();
tiles = N / M;
blocks = tiles * tiles;
mask = tiles - 1;
shift = 32 - spu_extract(spu_cntlz(spu_promote(mask, 0)), 0);
// Matrix Multiply with block partitioning
blkid = _atomic_inc_return(FINC_AREA2);
i = (blkid >> shift) & mask;
j = (blkid) & mask;
if (blkid >= ITER*blocks) exit(0);
Block_Fetch(InA, InB, i, j, 0, InTag);
while(1){
SwapInBuf();
Block_Fetch(InA, InB, i, j, 1, InTag);
DMA_Wait(InTag^1);
DMA_Wait(OutTag);
MatInit_MxM(OpC, OpA, OpB);
for(k=1;k<(N/M)-1;k++){
SwapInBuf();
Block_Fetch(InA, InB, i, j, k+1, InTag);
DMA_Wait(InTag^1);
MatMult_MxM(OpC, OpA, OpB);
}
// Epilogue for k==(N/M)-1
SwapInBuf();
blkid = _atomic_inc_return(FINC_AREA2);
i_next = (blkid >> shift) & mask;
j_next = (blkid) & mask;
if (blkid >= ITER*blocks) {
DMA_Wait(InTag^1);
MatMult_MxM(OpC, OpA, OpB);
Block_Store(OutC, i, j, OutTag);
DMA_Wait(OutTag);
break;
}
Block_Fetch(InA, InB, i_next, j_next, 0, InTag);
DMA_Wait(InTag^1);
MatMult_MxM(OpC, OpA, OpB);
Block_Store(OutC, i, j, OutTag);
i = i_next;
j = j_next;
SwapOutBuf();
}
return (0);
}