Romain Guy
December 15, 2014
110

# Performance Matters

Learn about the importance of playing nice with the CPU cache. This presentation shows how a simple algorithm can be rewritten to be more cache-friendly to greatly improve performance.

## Romain Guy

December 15, 2014

## Transcript

80 kHz

— —— —

32. ### DataSeries& getDataSeries() {  return m_series[m_current];  }    // For each

data series  for (size_t i = 0; i < m_buffer.size(); i++) {  getDataSeries()->getValue(i);  }

34. ### Why this example? 3D graphics, 2D graphics, UI toolkits, simulations,

perception, simulations…

36. ### // c = a x b  private void multiply(float[] a,

float[] b, float[] c) {  for (int i = 0; i < m; i++) {  for (int j = 0; j < m; j++) {  for (int k = 0; k < n; k++) {  c[i * m + j] += a[i * n + k] * b[k * m + j];  }  }  }  }
37. ### Testing conditions Two 1024x1024 matrices Intel Core i7 3667U (2

cores @ 2 GHz) OS X 10.10 Oracle JDK 1.8

45. ### #define INDEX(i, j, stride) ((i) * (stride) + (j))

// c = a x b  void multiplyMatrices(float* a, float* b, float* c) {  for (size_t i = 0; i < MAT_M; i++) {  for (size_t j = 0; j < MAT_M; j++) {  for (size_t k = 0; k < MAT_N; k++) {  c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(k, j, MAT_M)];  }  }  }  }

50. ### Performance counters # instructions IPC (instructions per cycle) CPI (cycles

per instruction)

52. ### Performance counters # instructions IPC (instructions per cycle) CPI (cycles

per instruction) L2 loads L2 hit rate

56. ### Test environment L1 32 kB/core L2 256 kB/core L3 4

MB
57. ### Test environment L1 32 kB/core L2 256 kB/core L3 4

MB clang (LLVM) 3.5
58. ### Test environment L1 32 kB/core L2 256 kB/core L3 4

MB clang (LLVM) 3.5 compile flag -Os
59. ### # Instructions 11 B IPC 0.35 CPI 2.8 L2 loads

1.1 B L2 hit rate 8.8%
60. ### # Instructions 11 B IPC 0.35 CPI 2.8 L2 loads

1.1 B L2 hit rate 8.8%
61. ### # Instructions 11 B IPC 0.35 CPI 2.8 L2 loads

1.1 B L2 hit rate 8.8%
62. ### # Instructions 11 B IPC 0.35 CPI 2.8 L2 loads

1.1 B L2 hit rate 8.8%

LINE

cache miss!

69. ### Latency, actual → scaled so that 1 CPU cycle = 1 s

1 CPU cycle: 0.3 ns → 1 s; L1 access: 0.9 ns → 3 s; L2 access: 2.8 ns → 9 s; L3 access: 12.9 ns → 43 s; RAM access: 120 ns → 6 min

72. ### void transpose(float* a, float* b) {  for (size_t i =

0; i < MAT_N; i++) {  for (size_t j = 0; j < MAT_M; j++) {  b[INDEX(j, i, MAT_N)] = a[INDEX(i, j, MAT_M)];  }  }  }
73. ### void multiplyMatricesT(float* a, float* b, float* c) {  for (size_t

i = 0; i < MAT_M; i++) {  for (size_t j = 0; j < MAT_M; j++) {  for (size_t k = 0; k < MAT_N; k++) {  c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(j, k, MAT_N)];  }  }  }  }
74. ### Time: 9.6 s / 1.17 s; # Instructions: 11 B / 8.5 B

IPC: 0.35 / 2.22; CPI: 2.8 / 0.45; L2 loads: 1.1 B / 67 M; L2 hit rate: 8.8% / 94.6%

77. ### void multiplyMatricesT(float* a, float* b, float* c) {  for (size_t

i = 0; i < MAT_M; i++) {  for (size_t j = 0; j < MAT_M; j++) {  float s = 0;  for (size_t k = 0; k < MAT_N; k += 4) {  s += a[INDEX(i, k + 0, MAT_N)] * b[INDEX(j, k + 0, MAT_N)] +  a[INDEX(i, k + 1, MAT_N)] * b[INDEX(j, k + 1, MAT_N)] +  a[INDEX(i, k + 2, MAT_N)] * b[INDEX(j, k + 2, MAT_N)] +  a[INDEX(i, k + 3, MAT_N)] * b[INDEX(j, k + 3, MAT_N)];  }  c[INDEX(i, j, MAT_M)] = s;  }  }  }
78. ### Time: 9.6 s / 1.17 s / 0.5 s; # Instructions: 11 B / 8.5 B / 4.7 B

IPC: 0.35 / 2.22 / 2.7; CPI: 2.8 / 0.45 / 0.38; L2 loads: 1.1 B / 67 M / 65 M; L2 hit rate: 8.8% / 94.6% / 95%

82. ### Time: 9.6 s / 1.17 s / 0.5 s / 19 s; # Instructions: 11 B / 8.5 B / 4.7 B / 26 B

IPC: 0.35 / 2.22 / 2.7 / 0.44; CPI: 2.8 / 0.45 / 0.38 / 2.27; L2 loads: 1.1 B / 67 M / 65 M / 1.1 B; L2 hit rate: 8.8% / 94.6% / 95% / 2.4%

84. ### Time: 9.6 s / 1.17 s / 0.5 s / 19 s / 0.25 s; # Instructions: 11 B / 8.5 B / 4.7 B / 26 B / 1.1 B

IPC: 0.35 / 2.22 / 2.7 / 0.44 / 1.2; CPI: 2.8 / 0.45 / 0.38 / 2.27 / 0.8; L2 loads: 1.1 B / 67 M / 65 M / 1.1 B / 58 M; L2 hit rate: 8.8% / 94.6% / 95% / 2.4% / 76%

88. ### // c = a x b  private void multiply(float[] a,

float[] b, float[] c) {  for (int i = 0; i < m; i++) {  for (int j = 0; j < m; j++) {  for (int k = 0; k < n; k++) {  c[i * m + j] += a[i * n + k] * b[j * n + k];  }  }  }  }

92. ### final int coreCount = Runtime.getRuntime().availableProcessors();  List<Callable<Void>> tasks = new ArrayList<>(coreCount);

for (int i = m / coreCount; i <= m; i += m / coreCount) {  tasks.add(createTask(i));  }    ExecutorService executor = Executors.newFixedThreadPool(coreCount);  List <Future<Void>> results = executor.invokeAll(tasks);  for (Future<Void> result : results) {  result.get();  }

98. ### pushq %rbp movq %rsp, %rbp movq %rdi, -0x8(%rbp) movq %rsi,

-0x10(%rbp) movq \$0x0, -0x18(%rbp) cmpq \$0x4, -0x18(%rbp) jae 0x1000016bd movq \$0x0, -0x20(%rbp) cmpq \$0x4, -0x20(%rbp) jae 0x1000016a5 movq -0x18(%rbp), %rax shlq \$0x2, %rax addq -0x20(%rbp), %rax movq -0x8(%rbp), %rcx movss (%rcx,%rax,4), %xmm0 movq -0x20(%rbp), %rax shlq \$0x2, %rax addq -0x18(%rbp), %rax movq -0x10(%rbp), %rcx movss %xmm0, (%rcx,%rax,4) movq -0x20(%rbp), %rax addq \$0x1, %rax movq %rax, -0x20(%rbp) jmp 0x10000165a jmp 0x1000016aa movq -0x18(%rbp), %rax addq \$0x1, %rax movq %rax, -0x18(%rbp) jmp 0x100001644 popq %rbp retq -O0 vmovss %xmm1, -0xb0(%rbp) vmovss -0xb8(%rbp), %xmm1 vmovss %xmm1, -0xa0(%rbp) vmovss -0xbc(%rbp), %xmm1 vmovss %xmm1, -0x90(%rbp) vmovss -0xc0(%rbp), %xmm1 vmovss %xmm1, -0x80(%rbp) vmovss -0xc4(%rbp), %xmm1 vmovss %xmm1, -0xac(%rbp) vmovss -0xc8(%rbp), %xmm1 vmovss %xmm1, -0x9c(%rbp) vmovss -0xcc(%rbp), %xmm1 vmovss %xmm1, -0x8c(%rbp) vmovss -0xd0(%rbp), %xmm1 vmovss %xmm1, -0x7c(%rbp) vmovss -0xd4(%rbp), %xmm1 vmovss %xmm1, -0xa8(%rbp) vmovss -0xd8(%rbp), %xmm1 vmovss %xmm1, -0x98(%rbp) vmovss -0xdc(%rbp), %xmm1 vmovss %xmm1, -0x88(%rbp) vmovss -0xe0(%rbp), %xmm1 vmovss %xmm1, -0x78(%rbp) vmovss -0xe4(%rbp), %xmm1 vmovss %xmm1, -0xa4(%rbp) vmovss -0xe8(%rbp), %xmm1 vmovss %xmm1, -0x94(%rbp) vmovss -0xec(%rbp), %xmm1 vmovss %xmm1, -0x84(%rbp) vmovss %xmm0, -0x34(%rbp) vmovss %xmm0, -0x74(%rbp) -Ofast
99. ### ARM NEON vld1.32 {d0-d3}, [r1]! vld1.32 {d4-d7}, [r1]! vtrn.32 q0,

q1 vtrn.32 q2, q3 vswp d1, d4 vswp d3, d6 vst1.32 {d0-d3}, [r0]! vst1.32 {d4-d7}, [r0]! Written by hand

101. ### DataSeries& getDataSeries() {  return m_series[m_current];  }    // For each

data series  for (size_t i = 0; i < m_buffer.size(); i++) {  getDataSeries()->getValue(i);  }
102. ### // For each data series  const DataSeries& series(m_series[m_current]);  for (size_t

i = 0; i < m_buffer.size(); i++) {  series.getValue(i);  }

Node Node

109. ### BaseClass: virtual void foo()

ChildClass: virtual void foo(); OtherChildClass: virtual void foo()
110. ### mutable bool m_dirty;    const Sphere& getBoundingSphere() const {  if

(m_dirty) {  m_world_sphere = m_sphere * m_world_transform; m_dirty = false;  }  return m_world_sphere;  }