Guide:Opteron NUMA Analysis
Contents
Introduction
NUMA (Non-Uniform Memory Access) machines are widespread in the HPC community. Explanations of NUMA can be found elsewhere on the web. Here we will examine performance analysis of the AMD Opteron NUMA architecture.
Details on the AMD Opteron NUMA architecture are given here:
http://developer.amd.com/article_print.jsp?id=18
http://developer.amd.com/articlex.jsp?id=30
Each Opteron CPU (not core) has its own memory controller and access to local RAM. To access other sockets' RAM, it must use the HyperTransport bus. It is important, therefore, to ensure that memory accesses are made to local RAM as often as possible.
Code
Following is a short C++ program to test NUMA speeds.
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>

#define MEM_MB 512
// Parenthesized so MEM_SIZE behaves as a single value inside larger expressions.
#define MEM_SIZE (MEM_MB*1024L*1024L)
#define ITER 40

// Return the current wall-clock time in microseconds.
double getTime() {
  struct timeval tp;
  gettimeofday (&tp, 0);
  return (double) tp.tv_sec * 1e6 + tp.tv_usec;
}

// Return the number of CPUs in this process's affinity mask.
// Exits with status 1 if the mask cannot be retrieved.
int getNumCPU() {
  cpu_set_t mask;
  if (sched_getaffinity(0,sizeof(cpu_set_t),&mask)) {
    fprintf (stderr, "Unable to retrieve affinity\n");
    exit(1);
  }
  return CPU_COUNT(&mask);
}

// Copy the upper half of the MEM_SIZE-byte buffer at ptr onto its lower
// half, ITER times.
void memtest(char *ptr) {
  for (int i=0; i<ITER; i++) {
    memcpy(ptr, ptr+(MEM_SIZE/2), MEM_SIZE/2);
  }
}

// Pin the calling process to the single CPU `cpu`.
// Exits with status 1 if the affinity cannot be set: timings taken on an
// unintended CPU would be meaningless.
void setCPU(int cpu) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
    fprintf (stderr, "Unable to set affinity to cpu %d\n", cpu);
    exit(1);
  }
}

// Allocate and touch a MEM_SIZE-byte buffer while pinned to `cpu`, then time
// memtest() on that buffer from each of CPUs 0..nproc-1 and print the results.
void test(int cpu, int nproc) {
  setCPU(cpu);
  char *ptr = (char*) malloc (MEM_SIZE);
  if (!ptr) {
    fprintf (stderr, "failed to malloc\n");
    exit(1);
  }
  printf ("\nMemory allocated on cpu %d\n", cpu);

  // make sure it all gets paged in
  for (long j=0; j<MEM_SIZE; j++) {
    ptr[j] = j;
  }

  for (int i = 0; i < nproc; i++) {
    setCPU(i);
    double start = getTime();
    memtest(ptr);
    double end = getTime();
    // getTime() is in microseconds; convert to seconds for display.
    printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000));
  }
  free (ptr);
}

// Run the test once per CPU in the initial affinity mask.
// NOTE: CPUs are addressed as 0..nproc-1, so this assumes the affinity mask
// holds exactly those CPU numbers.
int main (int argc, char **argv) {
  int nproc = getNumCPU();
  for (int i = 0; i < nproc; i++) {
    test(i,nproc);
  }
  return 0;
}
Output
Following is the output of this program on a dual Opteron 285 (dual-core) system.
Memory allocated on cpu 0 0: time = 7.95079 seconds 1: time = 7.93437 seconds 2: time = 10.2906 seconds 3: time = 10.3224 seconds Memory allocated on cpu 1 0: time = 7.94089 seconds 1: time = 7.95811 seconds 2: time = 10.2629 seconds 3: time = 10.3479 seconds Memory allocated on cpu 2 0: time = 10.3138 seconds 1: time = 10.3115 seconds 2: time = 7.88206 seconds 3: time = 7.961 seconds Memory allocated on cpu 3 0: time = 10.3013 seconds 1: time = 10.3484 seconds 2: time = 7.9039 seconds 3: time = 7.97128 seconds
Instrumentation
Now we add some manual instrumentation using the TAU API.
#include <sched.h> #include <stdio.h> #include <stdlib.h> #include <sys/time.h> #include <string.h> #include <TAU.h>
// Size of the test buffer in megabytes.
#define MEM_MB 512
// Size of the test buffer in bytes. Parenthesized so it behaves as a single
// value inside larger expressions (e.g. x / MEM_SIZE).
#define MEM_SIZE (MEM_MB*1024L*1024L)
// Number of half-buffer copies performed per timing run.
#define ITER 40
// Return the current wall-clock time in microseconds, as reported by
// gettimeofday().
double getTime() {
  struct timeval tp;
  gettimeofday (&tp, 0);
  return (double) tp.tv_sec * 1e6 + tp.tv_usec;
}
// Return the number of CPUs in the calling process's affinity mask.
// Prints an error and exits with status 1 if the mask cannot be retrieved.
int getNumCPU() {
  cpu_set_t mask;
  if (sched_getaffinity(0,sizeof(cpu_set_t),&mask)) {
    fprintf (stderr, "Unable to retrieve affinity\n");
    exit(1);
  }
  // CPU_COUNT returns the number of CPUs set in the mask.
  return CPU_COUNT(&mask);
}
// Memory-copy workload: copies the upper half of the buffer at ptr onto its
// lower half, repeated ITER times. ptr must refer to at least MEM_SIZE bytes.
void memtest(char *ptr) {
  char *const lower = ptr;
  const char *const upper = ptr + (MEM_SIZE/2);
  int remaining = ITER;
  while (remaining-- > 0) {
    memcpy(lower, upper, MEM_SIZE/2);
  }
}
// Pin the calling process to the single CPU `cpu`.
// Prints an error and exits with status 1 if the affinity cannot be set:
// timings taken on an unintended CPU would be meaningless.
void setCPU(int cpu) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
    fprintf (stderr, "Unable to set affinity to cpu %d\n", cpu);
    exit(1);
  }
}
void test(int cpu, int nproc) { setCPU(cpu); char *ptr = (char*) malloc (MEM_SIZE); if (!ptr) { fprintf (stderr, "failed to malloc\n"); exit(1); } printf ("\nMemory allocated on cpu %d\n", cpu); // make sure it all gets paged in for (long j=0; j<MEM_SIZE; j++) { ptr[j] = j; } for (int i = 0; i < nproc; i++) { char buf[128]; sprintf (buf, "Test on CPU %d", i); TAU_PROFILE_TIMER_DYNAMIC(timer, buf, "", TAU_USER); setCPU(i); double start = getTime(); TAU_PROFILE_START(timer); memtest(ptr); TAU_PROFILE_STOP(timer); double end = getTime(); printf ("%d: time = %G seconds\n", i, (end - start) / (1000*1000)); } free (ptr); }
// Entry point: runs test() once per CPU in the initial affinity mask, each
// run wrapped in a TAU phase labelled "Memory allocated on CPU <i>".
// NOTE: CPUs are addressed as 0..nproc-1, so this assumes the affinity mask
// holds exactly those CPU numbers.
int main (int argc, char **argv) {
  TAU_PROFILE("main()", "", TAU_DEFAULT);
  TAU_PROFILE_SET_NODE(0);
  int nproc = getNumCPU();
  for (int i = 0; i < nproc; i++) {
    char buf[128];
    // Bounded formatting: can never write past the end of buf.
    snprintf (buf, sizeof(buf), "Memory allocated on CPU %d", i);
    TAU_PHASE_CREATE_DYNAMIC(phase, buf, "", TAU_USER);
    TAU_PHASE_START(phase);
    test(i,nproc);
    TAU_PHASE_STOP(phase);
  }
  return 0;
}