Difference between revisions of "Guide:Opteron NUMA Analysis"

From Tau Wiki
Jump to: navigation, search
 
Line 7: Line 7:
  
 
http://developer.amd.com/article_print.jsp?id=18
 
http://developer.amd.com/article_print.jsp?id=18
 +
 
http://developer.amd.com/articlex.jsp?id=30
 
http://developer.amd.com/articlex.jsp?id=30
  
Line 26: Line 27:
 
#include <sys/time.h>
 
#include <sys/time.h>
 
#include <string.h>
 
#include <string.h>
#include <TAU.h>
 
  
 
#define MEM_MB 512
 
#define MEM_MB 512
Line 82: Line 82:
 
   }
 
   }
 
   for (int i = 0; i < nproc; i++) {
 
   for (int i = 0; i < nproc; i++) {
    char buf[128];
 
    sprintf (buf, "Test on CPU %d", i);
 
    TAU_PROFILE_TIMER_DYNAMIC(timer, buf, "", TAU_USER);
 
 
     setCPU(i);
 
     setCPU(i);
 
     double start = getTime();
 
     double start = getTime();
    TAU_PROFILE_START(timer);
 
 
     memtest(ptr);
 
     memtest(ptr);
    TAU_PROFILE_STOP(timer);
 
 
     double end = getTime();
 
     double end = getTime();
 
     printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000));
 
     printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000));
Line 101: Line 96:
 
   int nproc = getNumCPU();
 
   int nproc = getNumCPU();
 
   for (int i = 0; i < nproc; i++) {
 
   for (int i = 0; i < nproc; i++) {
    char buf[128];
 
    sprintf (buf, "Memory allocated on CPU %d", i);
 
    TAU_PHASE_CREATE_DYNAMIC(phase, buf, "", TAU_USER);
 
    TAU_PHASE_START(phase);
 
 
     test(i,nproc);
 
     test(i,nproc);
    TAU_PHASE_STOP(phase);
 
 
   }
 
   }
 
   return 0;
 
   return 0;
Line 141: Line 131:
 
2: time = 7.9039 seconds
 
2: time = 7.9039 seconds
 
3: time = 7.97128 seconds
 
3: time = 7.97128 seconds
 +
</pre>
 +
 +
== Instrumentation ==
 +
 +
Now, we add some instrumentation (manual).
 +
 +
<pre>
 +
#include <sched.h>
 +
#include <stdio.h>
 +
#include <stdlib.h>
 +
#include <sys/time.h>
 +
#include <string.h>
 +
#include <TAU.h>
 +
 +
#define MEM_MB 512
 +
#define MEM_SIZE MEM_MB*1024L*1024L
 +
#define ITER 40
 +
 +
double getTime() {
 +
  struct timeval tp;
 +
  static double last_timestamp = 0.0;
 +
  double timestamp;
 +
  gettimeofday (&tp, 0);
 +
  timestamp = (double) tp.tv_sec * 1e6 + tp.tv_usec;
 +
  return timestamp;
 +
}
 +
 +
int getNumCPU() {
 +
  cpu_set_t mask;
 +
  if (sched_getaffinity(0,sizeof(cpu_set_t),&mask)) {
 +
    fprintf (stderr, "Unable to retrieve affinity\n");
 +
    exit(1);
 +
  }
 +
  int nproc = 0;
 +
  for(int i=0; i<CPU_SETSIZE; i++) {
 +
    if( CPU_ISSET(i,&mask) ) {
 +
      nproc++;
 +
    }
 +
  }
 +
  return nproc;
 +
}
 +
 +
void memtest(char *ptr) {
 +
  for (int i=0; i<ITER; i++) {
 +
    memcpy(ptr, ptr+(MEM_SIZE/2),MEM_SIZE/2);
 +
  }
 +
}
 +
 +
void setCPU(int cpu) {
 +
  cpu_set_t mask;
 +
  CPU_ZERO(&mask);
 +
  CPU_SET(cpu, &mask);
 +
  sched_setaffinity(0, sizeof(cpu_set_t), &mask);
 +
}
 +
 +
void test(int cpu, int nproc) {
 +
  setCPU(cpu);
 +
  char *ptr = (char*) malloc (MEM_SIZE);
 +
  if (!ptr) {
 +
    fprintf (stderr, "failed to malloc\n");
 +
    exit(1);
 +
  }
 +
  printf ("\nMemory allocated on cpu %d\n", cpu);
 +
  // make sure it all gets paged in
 +
  for (long j=0; j<MEM_SIZE; j++) {
 +
    ptr[j] = j;
 +
  }
 +
  for (int i = 0; i < nproc; i++) {
 +
    char buf[128];
 +
    sprintf (buf, "Test on CPU %d", i);
 +
    TAU_PROFILE_TIMER_DYNAMIC(timer, buf, "", TAU_USER);
 +
    setCPU(i);
 +
    double start = getTime();
 +
    TAU_PROFILE_START(timer);
 +
    memtest(ptr);
 +
    TAU_PROFILE_STOP(timer);
 +
    double end = getTime();
 +
    printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000));
 +
  }
 +
  free (ptr);
 +
}
 +
 +
int main (int argc, char **argv) {
 +
  TAU_PROFILE("main()", "", TAU_DEFAULT);
 +
  TAU_PROFILE_SET_NODE(0);
 +
  int nproc = getNumCPU();
 +
  for (int i = 0; i < nproc; i++) {
 +
    char buf[128];
 +
    sprintf (buf, "Memory allocated on CPU %d", i);
 +
    TAU_PHASE_CREATE_DYNAMIC(phase, buf, "", TAU_USER);
 +
    TAU_PHASE_START(phase);
 +
    test(i,nproc);
 +
    TAU_PHASE_STOP(phase);
 +
  }
 +
  return 0;
 +
}
 +
 
</pre>
 
</pre>

Revision as of 03:40, 20 January 2007

Introduction

NUMA (Non-Uniform Memory Access) machines are widespread in the HPC community. Explanations of NUMA can be found elsewhere on the web. Here we will examine performance analysis of the AMD Opteron NUMA architecture.

Details on the AMD Opteron NUMA architecture are given here:

http://developer.amd.com/article_print.jsp?id=18

http://developer.amd.com/articlex.jsp?id=30

Each Opteron CPU (not core) has its own memory controller and access to local RAM. To access other sockets' RAM, it must use the HyperTransport bus. It is important, therefore, to ensure that memory accesses are made to local RAM as often as possible.


Code

Following is a short C++ program to test NUMA speeds.

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>

/* Size of the test buffer in mebibytes. */
#define MEM_MB 512
/* Size of the test buffer in bytes. Fully parenthesized so the macro expands
   safely inside larger expressions such as x / MEM_SIZE or x % MEM_SIZE. */
#define MEM_SIZE ((MEM_MB) * 1024L * 1024L)
/* Number of half-buffer copies performed by one memtest() call. */
#define ITER 40

/*
 * Return the current wall-clock time in microseconds, as a double.
 *
 * The value is tv_sec * 1e6 + tv_usec as reported by gettimeofday(), so the
 * difference of two calls is an elapsed time in microseconds.
 */
double getTime() {
  struct timeval tp;
  gettimeofday (&tp, NULL);
  return (double) tp.tv_sec * 1e6 + tp.tv_usec;
}

/*
 * Return the number of CPUs in the calling process's affinity mask.
 *
 * Prints a message to stderr and exits with status 1 if the mask cannot be
 * retrieved.
 *
 * NOTE(review): callers in this file pass 0..result-1 to setCPU(), which
 * assumes the permitted CPU ids are contiguous and start at 0 -- confirm
 * this holds on the target machine.
 */
int getNumCPU() {
  cpu_set_t mask;
  if (sched_getaffinity(0,sizeof(cpu_set_t),&mask)) {
    fprintf (stderr, "Unable to retrieve affinity\n");
    exit(1);
  }
  /* CPU_COUNT returns the number of CPUs set in the mask. */
  return CPU_COUNT(&mask);
}

/*
 * Memory-traffic kernel: copies the upper half of the MEM_SIZE-byte buffer
 * at ptr onto its lower half, ITER times in a row.
 *
 * ptr must point to at least MEM_SIZE bytes.
 */
void memtest(char *ptr) {
  char *upper_half = ptr + (MEM_SIZE/2);
  int remaining = ITER;

  while (remaining > 0) {
    memcpy(ptr, upper_half, MEM_SIZE/2);
    --remaining;
  }
}

/*
 * Restrict the calling process to run only on the given CPU.
 *
 * cpu: CPU id to pin to.
 *
 * Prints a message to stderr and exits with status 1 if the affinity cannot
 * be set; without a successful pin the timings printed by the benchmark
 * would not correspond to the CPU they are labelled with.
 */
void setCPU(int cpu) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
    fprintf (stderr, "Unable to set affinity to CPU %d\n", cpu);
    exit(1);
  }
}

/*
 * Allocate and touch a MEM_SIZE-byte buffer while pinned to `cpu`, then time
 * memtest() on that buffer from each CPU 0..nproc-1 in turn.
 *
 * cpu:   CPU the process is pinned to while the buffer is allocated and
 *        first written.
 * nproc: number of CPUs to run the timed copy on.
 *
 * Prints one header line followed by one timing line per CPU. Exits with
 * status 1 if the allocation fails.
 */
void test(int cpu, int nproc) {
  const long total_bytes = MEM_SIZE;

  setCPU(cpu);

  char *buffer = (char *) malloc(total_bytes);
  if (buffer == NULL) {
    fprintf(stderr, "failed to malloc\n");
    exit(1);
  }
  printf("\nMemory allocated on cpu %d\n", cpu);

  /* Write every byte once so the whole buffer is paged in before timing. */
  long offset = 0;
  while (offset < total_bytes) {
    buffer[offset] = offset;
    ++offset;
  }

  for (int target = 0; target < nproc; ++target) {
    setCPU(target);

    const double began = getTime();
    memtest(buffer);
    const double elapsed_us = getTime() - began;

    /* getTime() reports microseconds; convert to seconds for display. */
    printf("%d: time = %G seconds\n", target, elapsed_us / (1000*1000));
  }

  free(buffer);
}

/*
 * Entry point of the uninstrumented benchmark: for every CPU reported by
 * getNumCPU(), allocate the buffer while pinned to that CPU and time the
 * copy from every CPU.
 *
 * This listing does not include <TAU.h>, so it contains no TAU macros.
 * Command-line arguments are not used. Always returns 0.
 */
int main (int argc, char **argv) {
  (void) argc;
  (void) argv;
  int nproc = getNumCPU();
  for (int i = 0; i < nproc; i++) {
    test(i,nproc);
  }
  return 0;
}

Output

Following is the output of this program on a dual Opteron 285 (dual-core) system.

Memory allocated on cpu 0
0: time = 7.95079 seconds
1: time = 7.93437 seconds
2: time = 10.2906 seconds
3: time = 10.3224 seconds

Memory allocated on cpu 1
0: time = 7.94089 seconds
1: time = 7.95811 seconds
2: time = 10.2629 seconds
3: time = 10.3479 seconds

Memory allocated on cpu 2
0: time = 10.3138 seconds
1: time = 10.3115 seconds
2: time = 7.88206 seconds
3: time = 7.961 seconds

Memory allocated on cpu 3
0: time = 10.3013 seconds
1: time = 10.3484 seconds
2: time = 7.9039 seconds
3: time = 7.97128 seconds

Instrumentation

Now we add manual TAU instrumentation: a dynamic timer around each memtest() call and a dynamic phase around each allocation test.

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <TAU.h>

/* Size of the test buffer in mebibytes. */
#define MEM_MB 512
/* Size of the test buffer in bytes. Fully parenthesized so the macro expands
   safely inside larger expressions such as x / MEM_SIZE or x % MEM_SIZE. */
#define MEM_SIZE ((MEM_MB) * 1024L * 1024L)
/* Number of half-buffer copies performed by one memtest() call. */
#define ITER 40

/*
 * Return the current wall-clock time in microseconds, as a double.
 *
 * The value is tv_sec * 1e6 + tv_usec as reported by gettimeofday(), so the
 * difference of two calls is an elapsed time in microseconds.
 */
double getTime() {
  struct timeval tp;
  gettimeofday (&tp, NULL);
  return (double) tp.tv_sec * 1e6 + tp.tv_usec;
}

/*
 * Return the number of CPUs in the calling process's affinity mask.
 *
 * Prints a message to stderr and exits with status 1 if the mask cannot be
 * retrieved.
 *
 * NOTE(review): callers in this file pass 0..result-1 to setCPU(), which
 * assumes the permitted CPU ids are contiguous and start at 0 -- confirm
 * this holds on the target machine.
 */
int getNumCPU() {
  cpu_set_t mask;
  if (sched_getaffinity(0,sizeof(cpu_set_t),&mask)) {
    fprintf (stderr, "Unable to retrieve affinity\n");
    exit(1);
  }
  /* CPU_COUNT returns the number of CPUs set in the mask. */
  return CPU_COUNT(&mask);
}

/*
 * Memory-traffic kernel: copies the upper half of the MEM_SIZE-byte buffer
 * at ptr onto its lower half, ITER times in a row.
 *
 * ptr must point to at least MEM_SIZE bytes.
 */
void memtest(char *ptr) {
  char *upper_half = ptr + (MEM_SIZE/2);
  int remaining = ITER;

  while (remaining > 0) {
    memcpy(ptr, upper_half, MEM_SIZE/2);
    --remaining;
  }
}

/*
 * Restrict the calling process to run only on the given CPU.
 *
 * cpu: CPU id to pin to.
 *
 * Prints a message to stderr and exits with status 1 if the affinity cannot
 * be set; without a successful pin the timings printed by the benchmark
 * would not correspond to the CPU they are labelled with.
 */
void setCPU(int cpu) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
    fprintf (stderr, "Unable to set affinity to CPU %d\n", cpu);
    exit(1);
  }
}

/*
 * Allocate and touch a MEM_SIZE-byte buffer while pinned to `cpu`, then time
 * memtest() on that buffer from each CPU 0..nproc-1 in turn, wrapping each
 * memtest() call in a per-CPU TAU timer.
 *
 * cpu:   CPU the process is pinned to while the buffer is allocated and
 *        first written.
 * nproc: number of CPUs to run the timed copy on.
 *
 * Prints one header line followed by one timing line per CPU. Exits with
 * status 1 if the allocation fails.
 */
void test(int cpu, int nproc) {
  setCPU(cpu);
  char *ptr = (char*) malloc (MEM_SIZE);
  if (!ptr) {
    fprintf (stderr, "failed to malloc\n");
    exit(1);
  }
  printf ("\nMemory allocated on cpu %d\n", cpu);
  // Write every byte once, while still pinned to `cpu`, so the whole buffer
  // is paged in before any timing starts.
  // NOTE(review): presumably this relies on first-touch placement to put the
  // pages on `cpu`'s local memory -- confirm for the target kernel.
  for (long j=0; j<MEM_SIZE; j++) {
    ptr[j] = j;  // long narrowed to char; the stored value is irrelevant
  }
  for (int i = 0; i < nproc; i++) {
    // Timer name is built per iteration so each CPU gets its own entry.
    char buf[128];
    sprintf (buf, "Test on CPU %d", i);
    // NOTE(review): TAU macro semantics are not visible here; `timer` appears
    // to be an identifier introduced by this macro -- see the TAU API docs.
    TAU_PROFILE_TIMER_DYNAMIC(timer, buf, "", TAU_USER);
    setCPU(i);
    // start/end bracket the TAU start/stop calls, so the printed time also
    // includes whatever those two macros cost.
    double start = getTime();
    TAU_PROFILE_START(timer);
    memtest(ptr);
    TAU_PROFILE_STOP(timer);
    double end = getTime();
    // getTime() reports microseconds; divide by 1e6 to print seconds.
    printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000));
  }
  free (ptr);
}

/*
 * Entry point of the TAU-instrumented benchmark: for every CPU reported by
 * getNumCPU(), run test() inside a TAU phase named after that CPU.
 *
 * Command-line arguments are not used. Always returns 0.
 */
int main (int argc, char **argv) {
  // NOTE(review): TAU macro semantics are not visible in this file; these two
  // presumably register a profile entry for main() and set the TAU node id
  // to 0 -- confirm against the TAU API documentation.
  TAU_PROFILE("main()", "", TAU_DEFAULT);
  TAU_PROFILE_SET_NODE(0);
  int nproc = getNumCPU();
  for (int i = 0; i < nproc; i++) {
    // Phase name is built per iteration so each allocation CPU is reported
    // separately.
    char buf[128];
    sprintf (buf, "Memory allocated on CPU %d", i);
    TAU_PHASE_CREATE_DYNAMIC(phase, buf, "", TAU_USER);
    TAU_PHASE_START(phase);
    test(i,nproc);
    TAU_PHASE_STOP(phase);
  }
  return 0;
}