Difference between revisions of "Guide:Opteron NUMA Analysis"

From Tau Wiki
Jump to: navigation, search
(Instrumentation)
(Instrumentation)
Line 135: Line 135:
 
== Instrumentation ==
 
== Instrumentation ==
  
Now, we add some instrumentation (manual).
+
Now, we add some (manual) instrumentation using phases and dynamic timers.
 
 
  
 
  #include <sched.h>
 
  #include <sched.h>

Revision as of 03:53, 20 January 2007

Introduction

NUMA (Non-Uniform Memory Access) machines are widespread in the HPC community. Explanations of NUMA can be found elsewhere on the web. Here we will examine performance analysis of the AMD Opteron NUMA architecture.

Details on the AMD Opteron NUMA architecture are given here:

http://developer.amd.com/article_print.jsp?id=18

http://developer.amd.com/articlex.jsp?id=30

Each Opteron CPU (not core) has its own memory controller and access to local RAM. To access other sockets' RAM, it must use the HyperTransport bus. It is important, therefore, to ensure that memory accesses are made to local RAM as often as possible.


Code

Following is a short C++ program to test NUMA speeds.

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>

// Size of the benchmark buffer, in megabytes.
#define MEM_MB 512
// Size of the benchmark buffer, in bytes. The expansion is fully
// parenthesized so that the macro stays correct inside larger expressions
// such as x / MEM_SIZE or x % MEM_SIZE.
#define MEM_SIZE ((MEM_MB)*1024L*1024L)
// Number of memcpy passes performed per timed measurement.
#define ITER 40

// Returns the current time of day, as reported by gettimeofday(), expressed
// in microseconds and stored in a double.
double getTime() {
  struct timeval tp;
  gettimeofday (&tp, 0);
  // Whole seconds scaled to microseconds, plus the microsecond remainder.
  return (double) tp.tv_sec * 1e6 + tp.tv_usec;
}

// Counts the CPUs that are set in the caller's affinity mask.
// If the mask cannot be read, an error is printed to stderr and the program
// terminates with exit status 1.
int getNumCPU() {
  cpu_set_t allowed;
  if (sched_getaffinity(0, sizeof(allowed), &allowed) != 0) {
    fprintf (stderr, "Unable to retrieve affinity\n");
    exit(1);
  }

  // Walk every slot of the set and tally the ones that are present.
  int count = 0;
  int slot = 0;
  while (slot < CPU_SETSIZE) {
    if (CPU_ISSET(slot, &allowed)) {
      ++count;
    }
    ++slot;
  }
  return count;
}

// Copies the upper half of the buffer onto its lower half, ITER times.
// ptr must reference at least MEM_SIZE bytes.
void memtest(char *ptr) {
  const long half = MEM_SIZE/2;
  int pass = 0;
  while (pass < ITER) {
    memcpy(ptr, ptr + half, half);
    pass++;
  }
}

// Restricts the caller's affinity mask to the single CPU `cpu`.
// The result of sched_setaffinity() is checked: if the pin fails, the timings
// printed afterwards would be labelled with a CPU the code is not actually
// running on, so the failure is reported and the program exits with status 1.
void setCPU(int cpu) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
    fprintf (stderr, "Unable to set affinity to cpu %d\n", cpu);
    exit(1);
  }
}

// Allocates and fills a MEM_SIZE-byte buffer after pinning to `cpu`, then
// times memtest() on that buffer once from each of the CPUs 0..nproc-1 and
// prints one result line per CPU. Exits with status 1 if malloc fails.
void test(int cpu, int nproc) {
  setCPU(cpu);
  char *buffer = (char*) malloc (MEM_SIZE);
  if (buffer == NULL) {
    fprintf (stderr, "failed to malloc\n");
    exit(1);
  }
  printf ("\nMemory allocated on cpu %d\n", cpu);

  // Write every byte so the whole buffer is paged in before timing starts.
  long offset = 0;
  while (offset < MEM_SIZE) {
    buffer[offset] = offset;
    offset++;
  }

  int runner = 0;
  while (runner < nproc) {
    setCPU(runner);
    const double began = getTime();
    memtest(buffer);
    const double finished = getTime();
    // getTime() reports microseconds; convert the difference to seconds.
    const double elapsedUsec = finished - began;
    printf ("%d: time = %G seconds\n", runner, elapsedUsec / (1000*1000));
    runner++;
  }
  free (buffer);
}

// Entry point of the uninstrumented benchmark: calls test() once for every
// CPU index below the count returned by getNumCPU().
// This listing does not include <TAU.h>, so it carries no TAU macros; those
// belong to the instrumented variant of the program.
int main (int argc, char **argv) {
  int nproc = getNumCPU();
  for (int i = 0; i < nproc; i++) {
    test(i,nproc);
  }
  return 0;
}

Output

Following is the output of this program on a dual Opteron 285 (dual-core) system.

Memory allocated on cpu 0
0: time = 7.95079 seconds
1: time = 7.93437 seconds
2: time = 10.2906 seconds
3: time = 10.3224 seconds

Memory allocated on cpu 1
0: time = 7.94089 seconds
1: time = 7.95811 seconds
2: time = 10.2629 seconds
3: time = 10.3479 seconds

Memory allocated on cpu 2
0: time = 10.3138 seconds
1: time = 10.3115 seconds
2: time = 7.88206 seconds
3: time = 7.961 seconds

Memory allocated on cpu 3
0: time = 10.3013 seconds
1: time = 10.3484 seconds
2: time = 7.9039 seconds
3: time = 7.97128 seconds

Instrumentation

Now, we add some (manual) instrumentation using phases and dynamic timers.

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <TAU.h>
// Size of the benchmark buffer, in megabytes.
#define MEM_MB 512
// Size of the benchmark buffer, in bytes. The expansion is fully
// parenthesized so that the macro stays correct inside larger expressions
// such as x / MEM_SIZE or x % MEM_SIZE.
#define MEM_SIZE ((MEM_MB)*1024L*1024L)
// Number of memcpy passes performed per timed measurement.
#define ITER 40
// Returns the current time of day, as reported by gettimeofday(), expressed
// in microseconds and stored in a double.
double getTime() {
  struct timeval tp;
  gettimeofday (&tp, 0);
  // Whole seconds scaled to microseconds, plus the microsecond remainder.
  return (double) tp.tv_sec * 1e6 + tp.tv_usec;
}
// Reports how many CPUs are set in the caller's affinity mask.
// On failure to read the mask, prints an error to stderr and exits with
// status 1.
int getNumCPU() {
  cpu_set_t affinity;
  const int rc = sched_getaffinity(0, sizeof(cpu_set_t), &affinity);
  if (rc != 0) {
    fprintf (stderr, "Unable to retrieve affinity\n");
    exit(1);
  }

  int total = 0;
  for (int idx = 0; idx != CPU_SETSIZE; ++idx) {
    // Skip slots that are not part of the mask.
    if (!CPU_ISSET(idx, &affinity)) {
      continue;
    }
    total += 1;
  }
  return total;
}
// Performs ITER copies of the second half of the buffer over its first half.
// ptr must reference at least MEM_SIZE bytes.
void memtest(char *ptr) {
  char *const upperHalf = ptr + (MEM_SIZE/2);
  for (int remaining = ITER; remaining > 0; --remaining) {
    memcpy(ptr, upperHalf, MEM_SIZE/2);
  }
}
// Restricts the caller's affinity mask to the single CPU `cpu`.
// The result of sched_setaffinity() is checked: if the pin fails, the timings
// and profile entries recorded afterwards would be labelled with a CPU the
// code is not actually running on, so the failure is reported and the program
// exits with status 1.
void setCPU(int cpu) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
    fprintf (stderr, "Unable to set affinity to cpu %d\n", cpu);
    exit(1);
  }
}
void test(int cpu, int nproc) { setCPU(cpu); char *ptr = (char*) malloc (MEM_SIZE); if (!ptr) { fprintf (stderr, "failed to malloc\n"); exit(1); } printf ("\nMemory allocated on cpu %d\n", cpu); // make sure it all gets paged in for (long j=0; j<MEM_SIZE; j++) { ptr[j] = j; } for (int i = 0; i < nproc; i++) { char buf[128]; sprintf (buf, "Test on CPU %d", i); TAU_PROFILE_TIMER_DYNAMIC(timer, buf, "", TAU_USER); setCPU(i); double start = getTime(); TAU_PROFILE_START(timer); memtest(ptr); TAU_PROFILE_STOP(timer); double end = getTime(); printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000)); } free (ptr); }
// Entry point of the instrumented benchmark. For every CPU index below the
// count returned by getNumCPU(), runs test() inside a dynamically named TAU
// phase ("Memory allocated on CPU <n>").
int main (int argc, char **argv) {
  TAU_PROFILE("main()", "", TAU_DEFAULT);
  TAU_PROFILE_SET_NODE(0);
  const int cpuCount = getNumCPU();
  int cpu = 0;
  while (cpu < cpuCount) {
    // Phase name identifies which CPU the buffer is allocated on.
    char phaseName[128];
    sprintf (phaseName, "Memory allocated on CPU %d", cpu);
    TAU_PHASE_CREATE_DYNAMIC(phase, phaseName, "", TAU_USER);
    TAU_PHASE_START(phase);
    test(cpu, cpuCount);
    TAU_PHASE_STOP(phase);
    cpu++;
  }
  return 0;
}