Cell SDK Code Sample: DMA Mechanisms
%% --------------------------------------------------------------
%% (C) Copyright 2001,2005,
%% International Business Machines Corporation,
%% Sony Computer Entertainment Incorporated,
%% Toshiba Corporation.
%%
%% All Rights Reserved.
%% --------------------------------------------------------------
%% PROLOG END TAG zYx
Target:
CBE-Linux (HW or simulator)
Description:
This directory contains a sample program demonstrating some non-trivial
DMA calls within the SPEs.
Notes:
The actual executable resides in the ppu subdirectory. It's called
'dma_sample'. It's a full CBE executable, with both PPE and SPE code.
The job of this program is to get the SPEs to increment every element
of a pre-loaded large array, such that array[i] = i+1;
Here's what happens when you run "dma_sample":
* You specify the size of the large array as a runtime parameter: the
base-2 logarithm of the number of elements.
Permitted range for this value is 19 <= log2(#elements) <= 24.
* The PPE assigns a[i] = i for every element of the large array.
* The PPE then fires up all eight SPEs, pointing them at specific sections
of the big array.
* The SPEs each use four different DMA styles to load pieces of their sections,
increment them, and write them back to main memory.
The four styles are:
a) single-buffered DMA
b) double-buffered DMA
c) single-buffered DMA List
d) double-buffered DMA List
* The PPE checks these results, and prints whether any errors were found.
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#ifndef __dma_sample_h__
#define __dma_sample_h__
#include <stdlib.h>
/* A 64-bit quantity that can be read either as one value or as two   */
/* 32-bit words. The SPE entry point receives its arguments this way. */
typedef union {
  unsigned long long ull;   /* the whole 64-bit value */
  unsigned int ui[2];       /* the same storage viewed as two 32-bit words */
} addr64;
/* The "control block" carries the data needed by one SPE.            */
/* When the SPE starts executing its main() code, the first thing it  */
/* does is DMA in the contents of this structure. The five 4-byte     */
/* fields plus the padding add up to one 128-byte cache line.         */
typedef struct _control_block {
  /* size, in bytes, of each of the four array pieces below */
  unsigned int chunk_size;
  /* main-memory address of the piece handled by single-buffered DMA */
  unsigned int addrSB;
  /* main-memory address of the piece handled by double-buffered DMA */
  unsigned int addrDB;
  /* main-memory address of the piece handled by single-buffered DMA list */
  unsigned int addrSBL;
  /* main-memory address of the piece handled by double-buffered DMA list */
  unsigned int addrDBL;
  /* pad to a full cache line (128 bytes) */
  unsigned char pad[108];
} control_block;
#endif /* __dma_sample_h__ */
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include "../dma_sample.h"
#include <sched.h>
#include <libspe.h>
#include <stdio.h>
#include <errno.h>
/* there are eight control blocks, one for each SPE. */
/* each block is 128-byte aligned; its address is handed to the SPE thread */
/* at creation time, and the SPE DMAs the block in when it starts. */
control_block cb[8] __attribute__ ((aligned (128)));
/* this is the pointer to the SPE code, to be used at thread creation time */
extern spe_program_handle_t dma_sample_spu;
/* before the threads are created, we create a "group" environment for them. */
/* this "gid" is the handle for that group. */
spe_gid_t gid;
/* these are the handles returned by "spe_create_thread", one per SPE */
speid_t speids[8];
/* this variable is used to return data regarding an abnormal return from the SPE */
/* (one entry per SPE, written by spe_wait) */
int status[8];
/* here is the variable which will hold the big array */
/* main() points it at a 128-byte-aligned address inside a malloc'ed region */
int *data;
/* PPE entry point.                                                          */
/*   argv[1] : log2 of the number of elements in the big array (19..24).     */
/* Fills the array with data[i] = i, starts eight SPE threads that each      */
/* increment four chunks of it, waits for them, then verifies that           */
/* data[i] == i+1 everywhere and prints the number of mismatches.            */
/* Returns -1 on a usage or setup error, 0 once the check has been run.      */
int main(int argc, char *argv[]) {
  int i, error_count;
  int log_array_size, array_size, chunk_size;
  void *raw;   /* pointer returned by malloc, kept so it can be freed */

  /* user specifies the size of the large array */
  if (argc != 2) {
    printf("usage: dma_sample <log of #elements in big array (19 <= x <= 24) >\n");
    return -1;
  }
  log_array_size = atoi(argv[1]);
  /* reject anything outside the advertised range; at 19 each task's chunk */
  /* is four 16 KB DMA blocks, smaller values give the SPEs fewer blocks   */
  if (log_array_size < 19 || log_array_size > 24) {
    printf("usage: dma_sample <log of #elements in big array (19 <= x <= 24) >\n");
    return -1;
  }

  /* compute array size from input parameter */
  array_size = 1 << log_array_size;
  /* compute how many elements each DMA task should handle */
  /* there are four tasks for each of eight SPUs, so the */
  /* size of each task is 1/32nd the size of the full array */
  chunk_size = array_size >> 5;

  printf("Initializing the array...\n");
  /* the big array needs to be aligned on a 128-byte cache line, so   */
  /* over-allocate by 127 bytes and round the start address upwards   */
  raw = malloc(127 + (size_t) array_size * sizeof(int));
  if (raw == NULL) {
    fprintf(stderr, "Failed to allocate the big array (errno=%d)\n", errno);
    return -1;
  }
  data = (int *) (((unsigned long) raw + 127UL) & ~127UL);

  /* load the big array with initial values */
  for (i=0; i<array_size; ++i) data[i] = i;

  fprintf(stderr, "ready to call (create) SPE threads\n"); fflush(stderr);
  /* Create an SPE group which enables SPE events. */
  gid = spe_create_group (SCHED_OTHER, 0, 1);
  if (gid == NULL) {
    fprintf(stderr, "Failed spe_create_group(errno=%d)\n", errno);
    free(raw);
    return -1;
  }
  if (spe_group_max (gid) < 8) {
    fprintf(stderr, "System doesn't have eight working SPEs. I'm leaving.\n");
    free(raw);
    return -1;
  }

  /* load the control blocks for each SPE with data */
  /* SPE i gets chunks 4*i .. 4*i+3, one per DMA style */
  for (i = 0; i < 8; i++) {
    cb[i].chunk_size = chunk_size * sizeof(int); /* convert to units of bytes */
    cb[i].addrSB = (unsigned int) &data[chunk_size*(4*i+0)];
    cb[i].addrDB = (unsigned int) &data[chunk_size*(4*i+1)];
    cb[i].addrSBL = (unsigned int) &data[chunk_size*(4*i+2)];
    cb[i].addrDBL = (unsigned int) &data[chunk_size*(4*i+3)];
  }

  /* allocate SPE tasks */
  for (i = 0; i < 8; i++) {
    speids[i] = spe_create_thread (gid, &dma_sample_spu, (unsigned long long *) &cb[i], NULL, -1, 0);
    if (speids[i] == NULL) {
      fprintf (stderr, "FAILED: spe_create_thread(num=%d, errno=%d)\n", i, errno);
      exit (3+i);
    }
  }
  printf("SPEs are now computing...\n");

  /* wait for SPEs to all finish */
  for (i=0; i<8; ++i) spe_wait(speids[i], &status[i], 0);
  /* Issue a sync, just to be safe. */
  __asm__ __volatile__ ("sync" : : : "memory");

  printf("SPEs done. Now checking results...\n"); fflush(stdout);
  error_count = 0;
  /* the task for the SPEs was to increment the data in every array element */
  /* check to see if any of the array elements was not properly incremented */
  for (i=0; i<array_size; ++i) {
    if (data[i] != i+1) {
      printf("error! data[%x] = %x\n", i, data[i]);
      ++error_count;
    }
  }
  printf("total errors detected = %d\n", error_count);

  free(raw);
  return 0;
}
Target: SPE [This code section runs on the SPE side]
/* -------------------------------------------------------------- */
/* (C) Copyright 2001,2005, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment Incorporated, */
/* Toshiba Corporation. */
/* */
/* All Rights Reserved. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#include "../dma_sample.h"
#include <cbe_mfc.h>
#include <spu_mfcio.h>
#include <stdio.h>
/* here we define the data buffer in local memory, to hold the data we DMA in. */
/* We will DMA in 4096 elements at a time, as that's the largest size which */
/* fits in a standard DMA (16,384 bytes). */
/* Since we're sometimes going to be double-buffering, we'll allocate 8192 */
/* elements, i.e. room for two such 4096-element transfers. */
int databuffer[8192] __attribute__ ((aligned (128)));
/* Here we define the pointers which will point at the upper and lower parts */
/* of this data buffer. main() sets data[0] to element 0 and data[1] to */
/* element 4096 before any of the DMA tasks run. */
int *data[2];
/* here we define buffers to hold DMA List data */
/* there are four buffers to allow us to multi-buffer more effectively */
/* each buffer holds 128 (length, address) pairs = 256 words = 1024 bytes. */
/* The SIMD list-filling macros store whole 16-byte vectors into these */
/* buffers, so the array is explicitly aligned to a 16-byte boundary */
/* rather than relying on the compiler's default placement. */
volatile unsigned int dma_list[4][256] __attribute__ ((aligned (16)));
/* this variable tells us how many DMA cycles each task must perform */
/* main() sets it to the chunk size in bytes divided by 16384 */
int loopcount;
/* control structure */
/* local-store copy of this SPE's control block; main() fills it by DMA */
control_block cb __attribute__ ((aligned (128)));
/* the next three macros build the DMA lists used by the list-based tasks */
/* for each DMA List task, it's necessary to compute the addresses of */
/* each piece of address to read in. Here are three different macros */
/* which all do the same thing. They load the DMA List buffer. */
/* The first macro is scalar. */
/* The second macro is converted to SIMD code. */
/* The third macro is like the second, but hand-unrolled. */
/* This sample ships using the third macro, but the user can re-code */
/* the sample to use any of these three. */
/* DMA Lists allow you to gather data from arbitrary locations in */
/* main memory. This sample simply demonstrates the use of DMA Lists */
/* in gathering sequential cache lines of 128 bytes each. This is */
/* a somewhat silly use of the DMA List, as the memory could have */
/* been collected with a standard DMA call (without a list), but we */
/* show the code here to help the user understand how DMA Lists work. */
/* The list itself is a sequence of 4-byte words, each pair of which */
/* specifies a length in bytes, and an address in main memory. */
/* Since this sample is specifying exactly one cache line for each */
/* element of the DMA list, the lengths will always be 128. */
/* Scalar list builder. Fills _list_addr with 128 (length, address) pairs  */
/* describing 16 KB block number _offset (counted from _base) as 128       */
/* consecutive 128-byte pieces: even words hold the length (128), odd      */
/* words hold the address. _base and _offset are parenthesized and         */
/* evaluated once, so expressions such as "i+1" can be passed safely.      */
/* The expansion is a plain { } block, so it may be used with or without   */
/* a trailing semicolon.                                                   */
#define FILL_DMA_LIST(_base, _list_addr, _offset) { \
  unsigned int _b = (_base) + (_offset) * 16384; \
  int _k; \
  for (_k=0; _k<128; ++_k) { \
    (_list_addr)[2*_k] = 128; \
    (_list_addr)[2*_k+1] = _b + 128 * _k; \
  } \
}
/* SIMD list builder. Produces the same 128-entry list as the scalar       */
/* version, but writes it one 4-word vector (two list entries) at a time:  */
/* _w starts as (128, b, 128, b+128) and each of the 64 iterations         */
/* advances both address lanes by 256. _list_addr is accessed through a    */
/* vector pointer. Arguments are parenthesized, and every local carries a  */
/* leading underscore so it cannot capture a name used in an argument.     */
#define FILL_DMA_LIST_SIMD(_base, _list_addr, _offset) { \
  unsigned int _b, _k; \
  vector unsigned int _length = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int _addend = (vector unsigned int) (0, 256, 0, 256); \
  vector unsigned int *_p, _w; \
  _b = (_base) + (_offset) * 16384; \
  _p = (vector unsigned int *) (_list_addr); \
  _w = spu_insert(_b, _length, 1); \
  _w = spu_insert(_b+128, _w, 3); \
  for (_k=0; _k<64; ++_k) { \
    _p[_k] = _w; \
    _w = spu_add(_w, _addend); \
  } \
}
/* Hand-unrolled SIMD list builder. Same result as FILL_DMA_LIST_SIMD, but */
/* four vectors (eight list entries) are written per iteration: _w0.._w3   */
/* start 256 bytes apart in their address lanes, and the stride is         */
/* quadrupled to 1024 so that 16 iterations cover all 64 vectors.          */
/* Arguments are parenthesized, and every local carries a leading          */
/* underscore so it cannot capture a name used in an argument.             */
#define FILL_DMA_LIST_SIMD_UNROLLED(_base, _list_addr, _offset) { \
  unsigned int _b, _k; \
  vector unsigned int _length = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int _addend = (vector unsigned int) (0, 256, 0, 256); \
  vector unsigned int *_p, _w0, _w1, _w2, _w3; \
  _b = (_base) + (_offset) * 16384; \
  _p = (vector unsigned int *) (_list_addr); \
  _w0 = spu_insert(_b, _length, 1); \
  _w0 = spu_insert(_b+128, _w0, 3); \
  _w1 = spu_add(_w0, _addend); \
  _w2 = spu_add(_w1, _addend); \
  _w3 = spu_add(_w2, _addend); \
  _addend = spu_add(_addend, _addend); \
  _addend = spu_add(_addend, _addend); \
  for (_k=0; _k<64; _k+=4) { \
    _p[0] = _w0; \
    _p[1] = _w1; \
    _p[2] = _w2; \
    _p[3] = _w3; \
    _w0 = spu_add(_w0, _addend); \
    _w1 = spu_add(_w1, _addend); \
    _p+=4; \
    _w2 = spu_add(_w2, _addend); \
    _w3 = spu_add(_w3, _addend); \
  } \
}
/* here are two versions of the code that actually does the incrementing */
/* of the data we have read in. There are two versions, the first is */
/* scalar, and the second is SIMD. */
/* Scalar version: add one to each of the 4096 ints starting at dest. */
void load_data(int *dest) {
  int *cursor = dest;
  int *const stop = dest + 4096;

  while (cursor != stop) {
    *cursor += 1;
    ++cursor;
  }
}
void load_data_SIMD(int *dest) {
int i;
vector unsigned int *vdest;
vector unsigned int v1 = (vector unsigned int) (1, 1, 1, 1);
vdest = (vector unsigned int *) dest;
for (i=0; i<1024; ++i) {
vdest[i] = spu_add(vdest[i], v1);
}
}
/* Here we have the four DMA task modules. */
/* Each of them processes the data in the assigned regions in 16 kbyte chunks */
/* Single-buffered plain DMA. For each of the loopcount 16 KB blocks      */
/* starting at main-memory address addr: fetch the block into data[0],    */
/* wait for it, increment it, write it back to the same address, and wait */
/* for the write before touching the buffer again. All on tag group 20.   */
void load_singlebuffer(unsigned int addr)
{
  const int tag = 20;
  unsigned int ea = addr;   /* main-memory address of the current block */
  int remaining;

  for (remaining = loopcount; remaining > 0; --remaining) {
    /* bring the block in and wait for it to arrive */
    mfc_get(data[0], ea, 16384, tag, 0, 0);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();

    load_data_SIMD(data[0]);

    /* send it back and wait until the buffer is free again */
    mfc_put(data[0], ea, 16384, tag, 0, 0);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();

    ea += 16384;
  }
}
/* Double-buffered plain DMA over the loopcount 16 KB blocks at addr.      */
/* Block i lives in data[i&1] and uses tag group 20+(i&1). While block i   */
/* is being fetched, block i-1 is incremented and written back.            */
/*                                                                         */
/* - The get for block i targets the buffer whose put (block i-2) was      */
/*   issued just before it on the same tag. A plain get is not ordered     */
/*   behind that put, so the fenced form mfc_getf is used: it is held      */
/*   back until earlier commands in its tag group have completed.          */
/* - The final block is drained from whichever buffer it actually landed   */
/*   in, so odd values of loopcount (including 1) work too.                */
/* - Both tag groups are waited for before returning, so no put from this  */
/*   function is still reading a buffer when the caller reuses it.         */
void load_doublebuffer(unsigned int addr)
{
  int i, cur, prev, last;

  if (loopcount <= 0) return;

  /* prime the pipeline with block 0 */
  mfc_get(data[0], addr, 16384, 20, 0, 0);

  for (i=1; i<loopcount; ++i) {
    cur = i & 1;      /* buffer/tag receiving block i */
    prev = cur ^ 1;   /* buffer/tag holding block i-1 */

    /* start fetching block i, ordered after the earlier put from data[cur] */
    mfc_getf(data[cur], addr+16384*i, 16384, 20+cur, 0, 0);

    /* wait for block i-1, process it, and start writing it back */
    mfc_write_tag_mask(1<<(20+prev));
    mfc_read_tag_status_all();
    load_data_SIMD(data[prev]);
    mfc_put(data[prev], addr+16384*(i-1), 16384, 20+prev, 0, 0);
  }

  /* drain: the last block is still unprocessed in its buffer */
  last = (loopcount-1) & 1;
  mfc_write_tag_mask(1<<(20+last));
  mfc_read_tag_status_all();
  load_data_SIMD(data[last]);
  mfc_put(data[last], addr+16384*(loopcount-1), 16384, 20+last, 0, 0);

  /* wait for every outstanding put on both tag groups */
  mfc_write_tag_mask((1<<20) | (1<<21));
  mfc_read_tag_status_all();
}
/* Single-buffered DMA-list version. For each of the loopcount 16 KB      */
/* blocks at addr: build the 128-entry list for that block in dma_list[0], */
/* gather it into data[0], wait, increment it, scatter it back with the   */
/* same list, and wait again. All transfers use tag group 20.             */
void load_singlebuffer_list(unsigned int addr)
{
  const int tag = 20;
  int block = 0;

  while (block < loopcount) {
    /* describe block number "block" as 128 cache-line-sized pieces */
    FILL_DMA_LIST_SIMD_UNROLLED(addr, dma_list[0], block)

    /* gather and wait (1024 = size of the list in bytes) */
    mfc_getl(data[0], 0, dma_list[0], 1024, tag, 0, 0);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();

    load_data_SIMD(data[0]);

    /* scatter back through the same list and wait */
    mfc_putl(data[0], 0, dma_list[0], 1024, tag, 0, 0);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();

    ++block;
  }
}
void load_doublebuffer_list(unsigned int addr)
{
int i;
FILL_DMA_LIST_SIMD_UNROLLED(addr, dma_list[0], 0)
mfc_getl(data[0], 0, dma_list[0], 1024, 20, 0, 0);
for (i=1; i<loopcount; ++i) {
mfc_write_tag_mask(1<<(20+(i&3)));
mfc_read_tag_status_all();
FILL_DMA_LIST_SIMD_UNROLLED(addr, dma_list[i&3], i)
mfc_getl(data[i&1], 0, dma_list[i&3], 1024, 20+(i&3), 0, 0);
mfc_write_tag_mask(1<<(20+((i-1)&3)));
mfc_read_tag_status_all();
load_data_SIMD(data[(i-1)&1]);
mfc_putl(data[(i-1)&1], 0, dma_list[(i-1)&3], 1024, 20+((i-1)&3), 0, 0);
}
mfc_write_tag_mask(1<<23);
mfc_read_tag_status_all();
load_data_SIMD(data[1]);
mfc_putl(data[1], 0, dma_list[3], 1024, 23, 0, 0);
mfc_write_tag_mask(1<<23);
mfc_read_tag_status_all();
}
/* here is the location where the SPE begins execution, once its thread is created */
/*   speid : not used by this program                                       */
/*   argp  : argp.ui[1] is used as the main-memory address of this SPE's    */
/*           control block                                                  */
/*   envp  : not used by this program                                       */
/* Fetches the control block, derives the number of 16 KB DMA cycles per    */
/* task from it, and runs the four DMA tasks in turn. Returns 0.            */
int main(unsigned long long speid, addr64 argp, addr64 envp) {
  (void) speid;
  (void) envp;

  /* load the pointers so they point to the right part of the local store buffer */
  data[0] = &databuffer[0];
  data[1] = &databuffer[4096];

  /* DMA control block information from system memory. */
  /* The mask uses an unsigned constant: 1<<31 would shift into the sign bit of int. */
  mfc_get(&cb, argp.ui[1], sizeof(cb), 31, 0, 0);
  mfc_write_tag_mask(1u<<31);
  mfc_read_tag_status_all();
  printf("addrs = %x %x %x %x\n", cb.addrSB, cb.addrDB, cb.addrSBL, cb.addrDBL);

  /* compute how many DMA cycles will be needed by each task */
  /* (chunk size in bytes divided by 16384) */
  loopcount = cb.chunk_size >> 14;

  /* run the four tasks, indicating which portion of memory they should work with */
  load_singlebuffer (cb.addrSB);
  load_doublebuffer (cb.addrDB);
  load_singlebuffer_list(cb.addrSBL);
  load_doublebuffer_list(cb.addrDBL);
  return 0;
}