Cell SDK Code Sample: DMA Mechanisms



%% --------------------------------------------------------------

%% (C) Copyright 2001,2005,                                      

%% International Business Machines Corporation,                  

%% Sony Computer Entertainment Incorporated,                      

%% Toshiba Corporation.                                          

%%                                                               

%% All Rights Reserved.                                          

%% --------------------------------------------------------------

%% PROLOG END TAG zYx                                             

Target:

        CBE-Linux (HW or simulator)

 

Description:

            This directory contains a sample program demonstrating some non-trivial

        DMA calls within the SPEs.

 

Notes:

            The actual executable resides in the ppu subdirectory.  It's called

            'dma_sample'. It's a full CBE executable, with both PPE and SPE code.

 

        The job of this program is to get the SPEs to increment every element

        of a pre-loaded large array, such that array[i] = i+1;

       

        Here's what happens when you run "dma_sample":

 

        * You specify the size of the large array, in elements, as a runtime

          parameter (actually, the base-2 log of the number of elements).

           Permitted range for this log is 19 <= log2(#elements) <= 24.

 

        * The PPE assigns a[i] = i for every element of the large array.

 

        * The PPE then fires up all eight SPEs, pointing them at specific sections

          of the big array.

 

         * The SPEs each use four different DMA styles to load pieces of their sections,

          increment them, and write them back to main memory.

          The four styles are:

           a) single-buffered DMA

           b) double-buffered DMA

           c) single-buffered DMA List

           d) double-buffered DMA List

 

        * The PPE checks these results, and prints whether any errors were found.

 

 


 

/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                        */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                           */

/*                                                                 */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                              */

#ifndef __dma_sample_h__

#define __dma_sample_h__

 

#include <stdlib.h>

 

/* This union helps clarify calling parameters between the PPE and the SPE. */

 

/* A 64-bit quantity that can also be read as two 32-bit words. */
/* Both members share the same storage.                         */
typedef union {
  unsigned long long ull;   /* the value as one 64-bit integer    */
  unsigned int       ui[2]; /* the same bytes as two 32-bit words */
} addr64;

 

/* This "control block" contains data needed by the SPE.   */

/* When the SPE starts executing its main() code, the      */

/* first thing it will do is DMA in the contents of this   */

/* control block structure.                                */

 

/* Per-SPE parameter block.  The five 32-bit fields are followed by    */
/* padding so that the whole structure occupies exactly 128 bytes.     */
typedef struct _control_block {
  unsigned int  chunk_size; /* size, in bytes, of each of these array pieces */
  unsigned int  addrSB;     /* address to be filled by single-buffered DMA */
  unsigned int  addrDB;     /* address to be filled by double-buffered DMA */
  unsigned int  addrSBL;    /* address to be filled by single-buffered DMA list */
  unsigned int  addrDBL;    /* address to be filled by double-buffered DMA list */

  /* pad to a full cache line (128 bytes): 128 minus the five fields above */
  unsigned char pad[128 - 5 * sizeof(unsigned int)];
} control_block;

 

#endif /* __dma_sample_h__ */

 


 


Target: PPE [This code section runs on the PPE side]


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                       */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                           */

/*                                                                */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                              */

#include "../dma_sample.h"

#include <sched.h>

#include <libspe.h>

#include <stdio.h>

#include <errno.h>

 

/* There are eight control blocks, one for each SPE.  main() fills     */
/* cb[i] and passes its address to the i-th SPE thread at creation.    */
/* The array is aligned to 128 bytes.                                  */
control_block cb[8] __attribute__ ((aligned (128)));

/* this is the pointer to the SPE code, to be used at thread creation time */
/* (defined outside this file, hence the extern)                           */
extern spe_program_handle_t dma_sample_spu;

/* before the threads are created, we create a "group" environment for them. */
/* this "gid" is the handle for that group.                                  */
spe_gid_t gid;

/* these are the handles returned by "spe_create_thread", one per SPE */
speid_t speids[8];

/* this variable is used to return data regarding an abnormal return from the SPE */
/* status[i] is the slot handed to spe_wait() for speids[i]                       */
int status[8];

/* here is the variable which will hold the big array;              */
/* main() points it at the 128-byte-aligned start of the allocation */
int *data;

 

int main(int argc, char *argv[]) {

  int i, error_count;

  int log_array_size, array_size, chunk_size;

 

  /* user specifies the size of the large array */

 

  if (argc != 2) {

    printf("usage: dma_sample <log of #elements in big array (19 <= x <= 24) >\n");

    return -1;

  }

 

  log_array_size = atoi(argv[1]);

 

  if (log_array_size < 18 || log_array_size > 26) {

    printf("usage: dma_sample <log of #elements in big array (19 <= x <= 24) >\n");

    return -1;

  }

 

  /* compute array size from input parameter */

  array_size = 1 << log_array_size;

 

  /* compute how many elements each DMA task should handle  */

  /* there are four tasks for each of eight SPUs, so the    */

  /* size of each task is 1/32nd the size of the full array */

  chunk_size = array_size >> 5;

 

  printf("Initializing the array...\n");

 

  /* the big array needs to be aligned on a 128-byte cache line */

  data = (int *) malloc(127 + array_size*sizeof(int));

  while (((int) data) & 0x7f) ++data;

 

  /* load the big array with initial values */

  for (i=0; i<array_size; ++i) data[i] = i;

 

  fprintf(stderr, "ready to call (create) SPE threads\n"); fflush(stderr);

 

  /* Create an SPE group which enables SPE events. */

  gid = spe_create_group (SCHED_OTHER, 0, 1);

  if (gid == NULL) {

    fprintf(stderr, "Failed spe_create_group(errno=%d)\n", errno);

    return -1;

  }

 

  if (spe_group_max (gid) < 8) {

    fprintf(stderr, "System doesn't have eight working SPEs.  I'm leaving.\n");

    return -1;

  }

 

  /* load the control blocks for each SPE with data */

  for (i = 0; i < 8; i++) {

    cb[i].chunk_size = chunk_size * sizeof(int); /* convert to units of bytes */

    cb[i].addrSB     = (unsigned int) &data[chunk_size*(4*i+0)];

    cb[i].addrDB     = (unsigned int) &data[chunk_size*(4*i+1)];

    cb[i].addrSBL    = (unsigned int) &data[chunk_size*(4*i+2)];

    cb[i].addrDBL    = (unsigned int) &data[chunk_size*(4*i+3)];

  }

 

  /* allocate SPE tasks */

  for (i = 0; i < 8; i++) {

    speids[i] = spe_create_thread (gid, &dma_sample_spu, (unsigned long long *) &cb[i], NULL, -1, 0);

    if (speids[i] == NULL) {

      fprintf (stderr, "FAILED: spe_create_thread(num=%d, errno=%d)\n", i, errno);

      exit (3+i);

    }

  }

 

  printf("SPEs are now computing...\n");

 

  /* wait for SPEs to all finish */

  for (i=0; i<8; ++i) spe_wait(speids[i], &status[i], 0);

 

  /* Issue a sync, just to be safe. */

  __asm__ __volatile__ ("sync" : : : "memory");

 

  printf("SPEs done.  Now checking results...\n"); fflush(stdout);

 

  error_count = 0;

 

  /* the task for the SPEs was to increment the data in every array element */

  /* check to see if any of the array elements was not properly incremented */

  for (i=0; i<array_size; ++i) {

    if (data[i] != i+1) {

      printf("error! data[%x] = %x\n", i, data[i]);

      ++error_count;

    }

  }

 

  printf("total errors detected = %d\n", error_count);

 

  return 0;

}

 


Target: SPE [This code section runs on the SPE side]


/* -------------------------------------------------------------- */

/* (C) Copyright 2001,2005,                                       */

/* International Business Machines Corporation,                   */

/* Sony Computer Entertainment Incorporated,                      */

/* Toshiba Corporation.                                            */

/*                                                                */

/* All Rights Reserved.                                           */

/* -------------------------------------------------------------- */

/* PROLOG END TAG zYx                                              */

#include "../dma_sample.h"

#include <cbe_mfc.h>

#include <spu_mfcio.h>

#include <stdio.h>

 

/* here we define the data buffer in local memory, to hold the data we DMA in. */
/* We will DMA in 4096 elements at a time, as that's the largest size which    */
/* fits in a standard DMA (16,384 bytes).                                      */
/* Since we're sometimes going to be double-buffering, we'll allocate 8192.    */
int databuffer[8192] __attribute__ ((aligned (128)));

/* Here we define the pointers which will point at the upper and lower parts   */
/* of this data buffer.                                                        */
int *data[2];

/* here we define buffers to hold DMA List data */
/* there are four buffers to allow us to multi-buffer more effectively */
/* Each buffer holds 128 list entries of two words (length, address).  */
/* The lists are filled through "vector unsigned int" pointers and are */
/* handed to the MFC, so request 16-byte alignment explicitly rather   */
/* than relying on the compiler's default placement.                   */
volatile unsigned int dma_list[4][256] __attribute__ ((aligned (16)));

/* this variable tells us how many DMA cycles each task must perform */
int loopcount;

/* control structure; filled by DMA from main memory when main() starts */
control_block cb __attribute__ ((aligned (128)));

 

/* The macros below build the DMA lists used by the list-based tasks. */

 

/* for each DMA List task, it's necessary to compute the addresses of */

/* each piece of memory to read in.  Here are three different macros  */

/* which all do the same thing.  They load the DMA List buffer.       */

/* The first macro is scalar.                                         */

/* The second macro is converted to SIMD code.                        */

/* The third macro is like the second, but hand-unrolled.              */

/* This sample ships using the third macro, but the user can re-code  */

/* the sample to use any of these three.                              */

 

/* DMA Lists allow you to gather data from arbitrary locations in     */

/* main memory.  This sample simply demonstrates the use of DMA Lists */

/* in gathering sequential cache lines of 128 bytes each.  This is    */

/* a somewhat silly use of the DMA List, as the memory could have     */

/* been collected with a standard DMA call (without a list), but we   */

/* show the code here to help the user understand how DMA Lists work. */

 

/* The list itself is a sequence of 4-byte words, each pair of which  */

/* specifies a length in bytes, and an address in main memory.        */

/* Since this sample is specifying exactly one cache line for each    */

/* element of the DMA list, the lengths will always be 128.           */

 

/* Scalar DMA-list fill.                                                  */
/*   _base      address of the start of the task's region                 */
/*   _list_addr array receiving 256 words (128 length/address pairs)      */
/*   _offset    index of the 16,384-byte piece to describe                */
/* Entry k gets length 128 and address _base + _offset*16384 + 128*k.     */
/* Every argument is parenthesized so that expressions such as "i+1" can  */
/* be passed safely.  The expansion is a brace block (not do/while) so    */
/* that call sites written without a trailing semicolon keep compiling.   */
#define FILL_DMA_LIST(_base, _list_addr, _offset) {                     \
  int _k;                                                               \
  for (_k=0; _k<128; ++_k) {                                            \
    (_list_addr)[2*_k]   = 128;                                         \
    (_list_addr)[2*_k+1] = (_base) + (_offset) * 16384 + 128 * _k;      \
  }                                                                     \
}

 

/* SIMD DMA-list fill: writes the same 128 (length, address) pairs as the */
/* scalar macro, two pairs per vector store, 64 stores in all.            */
/*   _base      address of the start of the task's region                 */
/*   _list_addr list buffer; it is accessed through a                     */
/*              "vector unsigned int" pointer                             */
/*   _offset    index of the 16,384-byte piece to describe                */
/* Each vector holds (128, addr, 128, addr+128); adding (0,256,0,256)     */
/* advances both addresses to the next two cache lines.                   */
/* All locals carry a leading underscore and all arguments are            */
/* parenthesized so that caller expressions and names cannot be captured. */
#define FILL_DMA_LIST_SIMD(_base, _list_addr, _offset) {                \
  unsigned int _b, _k;                                                  \
  vector unsigned int _length = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int _addend = (vector unsigned int) (0, 256, 0, 256); \
  vector unsigned int *_p, _w;                                          \
  _b = (_base) + (_offset) * 16384;                                     \
  _p = (vector unsigned int *) (_list_addr);                            \
  _w = spu_insert(_b,     _length, 1);                                  \
  _w = spu_insert(_b+128, _w, 3);                                       \
  for (_k=0; _k<64; ++_k) {                                             \
    _p[_k] = _w;                                                        \
    _w = spu_add(_w, _addend);                                          \
  }                                                                     \
}

 

/* SIMD DMA-list fill, unrolled by four: 16 iterations of four vector     */
/* stores produce the same 128 (length, address) pairs as the other two   */
/* fill macros.                                                           */
/*   _base      address of the start of the task's region                 */
/*   _list_addr list buffer; it is accessed through a                     */
/*              "vector unsigned int" pointer                             */
/*   _offset    index of the 16,384-byte piece to describe                */
/* _w0.._w3 start as four consecutive vectors (each covering two cache    */
/* lines); the addend is doubled twice to (0,1024,0,1024) so that each    */
/* pass steps all four vectors forward by eight cache lines.              */
/* All locals carry a leading underscore and all arguments are            */
/* parenthesized so that caller expressions and names cannot be captured. */
#define FILL_DMA_LIST_SIMD_UNROLLED(_base, _list_addr, _offset) {       \
  unsigned int _b, _k;                                                  \
  vector unsigned int _length = (vector unsigned int) (128, 0, 128, 0); \
  vector unsigned int _addend = (vector unsigned int) (0, 256, 0, 256); \
  vector unsigned int *_p, _w0, _w1, _w2, _w3;                          \
  _b = (_base) + (_offset) * 16384;                                     \
  _p = (vector unsigned int *) (_list_addr);                            \
  _w0 = spu_insert(_b,     _length, 1);                                 \
  _w0 = spu_insert(_b+128, _w0, 3);                                     \
  _w1 = spu_add(_w0, _addend);                                          \
  _w2 = spu_add(_w1, _addend);                                          \
  _w3 = spu_add(_w2, _addend);                                          \
  _addend = spu_add(_addend, _addend);                                  \
  _addend = spu_add(_addend, _addend);                                  \
  for (_k=0; _k<64; _k+=4) {                                            \
    _p[0] = _w0;                                                        \
    _p[1] = _w1;                                                        \
    _p[2] = _w2;                                                        \
    _p[3] = _w3;                                                        \
    _w0 = spu_add(_w0, _addend);                                        \
    _w1 = spu_add(_w1, _addend);                                        \
    _p+=4;                                                              \
    _w2 = spu_add(_w2, _addend);                                        \
    _w3 = spu_add(_w3, _addend);                                        \
  }                                                                     \
}

 

/* here are two versions of the code that actually does the incrementing */

/* of the data we have read in.  There are two versions, the first is    */

/* scalar, and the second is SIMD.                                        */

 

/* Scalar version: add one to each of the 4096 ints starting at dest. */
void load_data(int *dest) {
  int *cursor = dest;
  int *const stop = dest + 4096;

  while (cursor != stop) {
    *cursor += 1;
    ++cursor;
  }
}

 

void load_data_SIMD(int *dest) {

  int i;

  vector unsigned int *vdest;

  vector unsigned int v1 = (vector unsigned int) (1, 1, 1, 1);

  vdest = (vector unsigned int *) dest;

  for (i=0; i<1024; ++i) {

    vdest[i] = spu_add(vdest[i], v1);

  }

}

 

/* Here we have the four DMA task modules. */

/* Each of them processes the data in the assigned regions in 16 kbyte chunks */

 

/* Single-buffered task: for each of the loopcount 16,384-byte pieces   */
/* starting at addr, fetch the piece into data[0], wait for it,         */
/* increment it, write it back to the same place and wait again.        */
/* Everything uses tag 20.                                              */
void load_singlebuffer(unsigned int addr)
{
  unsigned int ea = addr;      /* main-memory address of the current piece */
  int remaining;

  for (remaining = loopcount; remaining > 0; --remaining) {
    /* bring the piece in and wait for it */
    mfc_get(data[0], ea, 16384, 20, 0, 0);
    mfc_write_tag_mask(1<<20);
    mfc_read_tag_status_all();

    load_data_SIMD(data[0]);

    /* send it back and wait before the buffer is reused */
    mfc_put(data[0], ea, 16384, 20, 0, 0);
    mfc_write_tag_mask(1<<20);
    mfc_read_tag_status_all();

    ea += 16384;
  }
}

 

/* Double-buffered task: piece i is fetched into data[i&1] under tag      */
/* 20+(i&1) while piece i-1 is being incremented and written back.        */
/*   addr  address of the first 16,384-byte piece; loopcount pieces are   */
/*         processed.                                                     */
/* Differences from a naive version:                                      */
/*  - the fetch that reuses a buffer is issued with a fence (mfc_getf),   */
/*    so it is ordered after the write-back previously queued from that   */
/*    same buffer on the same tag;                                        */
/*  - the final piece is handled by parity, so loopcount need not be even;*/
/*  - both tags are drained before returning, so no write-back is still   */
/*    reading a buffer when the caller reuses it.                         */
void load_doublebuffer(unsigned int addr)
{
  int i, last;

  /* nothing to do (and nothing to prefetch) for an empty region */
  if (loopcount <= 0) return;

  /* prime the pipeline with piece 0 */
  mfc_get(data[0], addr, 16384, 20, 0, 0);

  for (i=1; i<loopcount; ++i) {
    /* start fetching piece i; fenced against the earlier put on this tag */
    mfc_getf(data[i&1], addr+16384*i, 16384, 20+(i&1), 0, 0);

    /* wait for the other tag: piece i-1 has arrived */
    mfc_write_tag_mask(1<<(21-(i&1)));
    mfc_read_tag_status_all();

    load_data_SIMD(data[(i-1)&1]);
    mfc_put(data[(i-1)&1], addr+16384*(i-1), 16384, 21-(i&1), 0, 0);
  }

  /* the last piece sits in the buffer selected by its parity */
  last = (loopcount-1) & 1;
  mfc_write_tag_mask(1<<(20+last));
  mfc_read_tag_status_all();
  load_data_SIMD(data[last]);
  mfc_put(data[last], addr+16384*(loopcount-1), 16384, 20+last, 0, 0);

  /* drain both tags so every write-back has completed */
  mfc_write_tag_mask((1<<20) | (1<<21));
  mfc_read_tag_status_all();
}

 

/* Single-buffered DMA-list task: for each of the loopcount pieces,      */
/* build the list for that piece in dma_list[0], gather it into data[0], */
/* increment it, and scatter it back through the same list.  Every       */
/* transfer uses tag 20 and is waited for before the next step.          */
void load_singlebuffer_list(unsigned int addr)
{
  int piece = 0;

  while (piece < loopcount) {
    /* describe piece number "piece" as 128 cache-line entries */
    FILL_DMA_LIST_SIMD_UNROLLED(addr, dma_list[0], piece)

    /* gather, then wait */
    mfc_getl(data[0], 0, dma_list[0], 1024, 20, 0, 0);
    mfc_write_tag_mask(1<<20);
    mfc_read_tag_status_all();

    load_data_SIMD(data[0]);

    /* scatter back, then wait */
    mfc_putl(data[0], 0, dma_list[0], 1024, 20, 0, 0);
    mfc_write_tag_mask(1<<20);
    mfc_read_tag_status_all();

    ++piece;
  }
}

 

/* Double-buffered DMA-list task.  Piece i uses data buffer i&1, list     */
/* buffer i&3 and tag 20+(i&3); piece i is gathered while piece i-1 is    */
/* incremented and scattered back.                                        */
/*   addr  address of the first 16,384-byte piece; loopcount pieces are   */
/*         processed.                                                     */
/* Differences from a naive version:                                      */
/*  - before gathering into data[i&1], the scatter of piece i-2 (same     */
/*    data buffer, different tag) is waited for, so the gather cannot     */
/*    overwrite data that is still being written back;                    */
/*  - the final piece is located from loopcount-1 instead of assuming     */
/*    list 3 / tag 23, so loopcount need not be a multiple of four;       */
/*  - all four tags are drained before returning.                         */
void load_doublebuffer_list(unsigned int addr)
{
  int  i, last;

  /* nothing to do (and nothing to prefetch) for an empty region */
  if (loopcount <= 0) return;

  /* prime the pipeline with piece 0 */
  FILL_DMA_LIST_SIMD_UNROLLED(addr, dma_list[0], 0)
  mfc_getl(data[0], 0, dma_list[0], 1024, 20, 0, 0);

  for (i=1; i<loopcount; ++i) {
    /* tag i&3: list buffer i&3 is free to be refilled;              */
    /* tag (i+2)&3 == (i-2)&3: data[i&1] is no longer being written. */
    mfc_write_tag_mask((1<<(20+(i&3))) | (1<<(20+((i+2)&3))));
    mfc_read_tag_status_all();

    FILL_DMA_LIST_SIMD_UNROLLED(addr, dma_list[i&3], i)
    mfc_getl(data[i&1], 0, dma_list[i&3], 1024, 20+(i&3), 0, 0);

    /* wait for piece i-1 to arrive, then process and scatter it */
    mfc_write_tag_mask(1<<(20+((i-1)&3)));
    mfc_read_tag_status_all();
    load_data_SIMD(data[(i-1)&1]);
    mfc_putl(data[(i-1)&1], 0, dma_list[(i-1)&3], 1024, 20+((i-1)&3), 0, 0);
  }

  /* finish the last piece using the buffers selected by its index */
  last = loopcount - 1;
  mfc_write_tag_mask(1<<(20+(last&3)));
  mfc_read_tag_status_all();
  load_data_SIMD(data[last&1]);
  mfc_putl(data[last&1], 0, dma_list[last&3], 1024, 20+(last&3), 0, 0);

  /* drain tags 20..23 so every scatter has completed */
  mfc_write_tag_mask(0xf << 20);
  mfc_read_tag_status_all();
}

 

/* here is the location where the SPE begins execution, once its thread is created */

/* SPE entry point.                                                       */
/*   speid  not used here                                                 */
/*   argp   word 1 (argp.ui[1]) is used as the main-memory address of     */
/*          this SPE's control block                                      */
/*   envp   not used here                                                 */
/* Fetches the control block, derives the number of 16,384-byte DMA       */
/* cycles per task, and runs the four tasks in turn.  Always returns 0.   */
int main(unsigned long long speid, addr64 argp, addr64 envp) {

  /* DMA control block information from system memory. */
  /* The tag mask is built with an unsigned constant: shifting a signed */
  /* 1 left by 31 would overflow into the sign bit.                     */
  mfc_get(&cb, argp.ui[1], sizeof(cb), 31, 0, 0);
  mfc_write_tag_mask(1u<<31);
  mfc_read_tag_status_all();

  printf("addrs = %x %x %x %x\n", cb.addrSB, cb.addrDB, cb.addrSBL, cb.addrDBL);

  /* compute how many DMA cycles will be needed by each task */
  /* (chunk_size is in bytes; one cycle moves 2^14 bytes)    */
  loopcount = cb.chunk_size >> 14;

  /* load the pointers so they point to the right part of the local store buffer */
  data[0] = &databuffer[0];
  data[1] = &databuffer[4096];

  /* run the four tasks, indicating which portion of memory they should work with */
  load_singlebuffer     (cb.addrSB);
  load_doublebuffer     (cb.addrDB);
  load_singlebuffer_list(cb.addrSBL);
  load_doublebuffer_list(cb.addrDBL);

  return 0;
}