Performance Matters

Romain Guy
December 15, 2014

Learn about the importance of playing nice with the CPU cache. This presentation shows how a simple algorithm can be rewritten to be more cache-friendly to greatly improve performance.

Transcript

  1. let’s talk about… A bunch of STUFF ? ? ?

  2. Romain Guy

  3. Romain Guy

  4. Romain Guy

  5. Romain Guy Google Android Robotics

  6. curious-creature.com @romainguy

  7. Performance matters

  8. Why does it matter?

  9. Why does it matter? LATENCY

  10. Why does it matter? LATENCY SCALABILITY

  11. Why does it matter? LATENCY POWER SCALABILITY

  12. 4:40

  13. None
  14. None
  15. EASY MEDIUM HARD

  16. EASY MEDIUM HARD >>>HARD

  17. We have this awesome, super useful, telemetry application

  18. None
  19. It does 2D

  20. It does 2D It does 3D

  21. It does 2D It does 3D It goes fast

  22. How fast?

  23. None
  24. Cameras → 30 Hz

  25. Cameras → 30 Hz Motors → 0.2/3 kHz

  26. Cameras → 30 Hz Motors → 0.2/3 kHz Boards → 80 kHz

  27. It processes a lot of data!

  28. None
  29. several GB/s

  30. Our users love the app

  31. Our users love the app

  32. Then we got an email It read like this…

  33. art by haretrinity.deviantart.com very much SORROW

  34. art by haretrinity.deviantart.com very much SORROW

  35. <16ms per frame TARGET

  36. <16ms per frame TARGET 150ms per frame ACTUAL

  37. DataSeries& getDataSeries() {
        return m_series[m_current];
      }

      // For each data series
      for (size_t i = 0; i < m_buffer.size(); i++) {
        getDataSeries().getValue(i);
      }

  38. in-depth look MATRIX MULTIPLICATION

  39. Why this example? 3D graphics, 2D graphics, UI toolkits, simulations, perception…

  40. None
  41. m = m1 × m2

  42. // c = a x b
      private void multiply(float[] a, float[] b, float[] c) {
        for (int i = 0; i < m; i++) {
          for (int j = 0; j < m; j++) {
            for (int k = 0; k < n; k++) {
              c[i * m + j] += a[i * n + k] * b[k * m + j];
            }
          }
        }
      }
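
Not part of the deck: a minimal, self-contained harness that could be used to reproduce a measurement of this kind, assuming the sizes from the testing-conditions slide that follows (m = n = 1024, matrices stored row-major in flat float arrays). The class and variable names are illustrative only, and a single cold run like this ignores JIT warm-up, so treat the result as an order of magnitude.

      import java.util.Random;

      public class NaiveMultiplyBenchmark {
          static final int m = 1024;   // rows of a and c, columns of b and c
          static final int n = 1024;   // columns of a, rows of b

          // Same triple loop as the slide above, made static for the sketch.
          static void multiply(float[] a, float[] b, float[] c) {
              for (int i = 0; i < m; i++) {
                  for (int j = 0; j < m; j++) {
                      for (int k = 0; k < n; k++) {
                          c[i * m + j] += a[i * n + k] * b[k * m + j];
                      }
                  }
              }
          }

          public static void main(String[] args) {
              Random random = new Random(42);
              float[] a = new float[m * n];
              float[] b = new float[n * m];
              float[] c = new float[m * m];
              for (int i = 0; i < a.length; i++) a[i] = random.nextFloat();
              for (int i = 0; i < b.length; i++) b[i] = random.nextFloat();

              long start = System.nanoTime();
              multiply(a, b, c);
              System.out.println("multiply took " + (System.nanoTime() - start) / 1_000_000 + " ms");
          }
      }
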
  43. Testing conditions Two 1024x1024 matrices Intel Core i7 3667U (2 cores @ 2 GHz) OS X 10.10 Oracle JDK 1.8

  44. 10,286 ms

  45. Is this a good result? ? ? ?

  46. instructions per second

  47. 6 MIPS

  48. My CPU ≈ 8,000 MIPS

  49. 8,000 >> 6 We can optimize!

  50. C++

  51. None
  52. None
  53. #define INDEX(i, j, stride) ((i) * (stride) + (j))

      // c = a x b
      void multiplyMatrices(float* a, float* b, float* c) {
        for (size_t i = 0; i < MAT_M; i++) {
          for (size_t j = 0; j < MAT_M; j++) {
            for (size_t k = 0; k < MAT_N; k++) {
              c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(k, j, MAT_M)];
            }
          }
        }
      }

  54. 9.6 s

  55. Performance counters

  56. Performance counters # instructions

  57. Performance counters # instructions IPC (instructions per cycle)

  58. Performance counters # instructions IPC (instructions per cycle) CPI (cycles per instruction)

  59. Performance counters # instructions IPC (instructions per cycle) CPI (cycles per instruction) L2 loads

  60. Performance counters # instructions IPC (instructions per cycle) CPI (cycles per instruction) L2 loads L2 hit rate

  61. Test environment

  62. Test environment L1 32 kB/core

  63. Test environment L1 32 kB/core L2 256 kB/core

  64. Test environment L1 32 kB/core L2 256 kB/core L3 4 MB

  65. Test environment L1 32 kB/core L2 256 kB/core L3 4 MB clang (LLVM) 3.5

  66. Test environment L1 32 kB/core L2 256 kB/core L3 4 MB clang (LLVM) 3.5 compile flag -Os

  67. # Instructions 11 B   IPC 0.35   CPI 2.8   L2 loads 1.1 B   L2 hit rate 8.8%

  68. # Instructions 11 B   IPC 0.35   CPI 2.8   L2 loads 1.1 B   L2 hit rate 8.8%

  69. # Instructions 11 B   IPC 0.35   CPI 2.8   L2 loads 1.1 B   L2 hit rate 8.8%

  70. # Instructions 11 B   IPC 0.35   CPI 2.8   L2 loads 1.1 B   L2 hit rate 8.8%

  71. Memory layout & access

  72. None
  73. Data is fetched by CACHE LINE

  74. Contiguous accesses are always better Data is fetched by CACHE LINE

  75. None
  76. CACHE LINE 64 bytes 16 floats

  77. Each read is 1024 floats away Each access is a cache miss!
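
Not from the deck: a small Java sketch of the two access patterns being contrasted here. Walking a row-major 1024x1024 float array along its rows touches consecutive addresses, so each 64-byte cache line serves 16 floats; walking it down a column jumps 1024 floats (4 KB) between accesses, so every access touches a different cache line, just like reading b column by column in the naive multiply. The class and method names are made up for illustration.

      public class StrideDemo {
          static final int N = 1024;

          // Row order: consecutive addresses, one cache line per 16 floats.
          static float sumRowOrder(float[] m) {
              float s = 0;
              for (int i = 0; i < N; i++)
                  for (int j = 0; j < N; j++)
                      s += m[i * N + j];
              return s;
          }

          // Column order: each access is 4 KB away from the previous one,
          // so it lands on a different cache line every time.
          static float sumColumnOrder(float[] m) {
              float s = 0;
              for (int j = 0; j < N; j++)
                  for (int i = 0; i < N; i++)
                      s += m[i * N + j];
              return s;
          }

          public static void main(String[] args) {
              float[] m = new float[N * N];
              java.util.Arrays.fill(m, 1.0f);   // both sums are exactly N * N
              long t0 = System.nanoTime();
              float a = sumRowOrder(m);
              long t1 = System.nanoTime();
              float b = sumColumnOrder(m);
              long t2 = System.nanoTime();
              System.out.println("row order:    " + (t1 - t0) / 1_000_000 + " ms, sum = " + a);
              System.out.println("column order: " + (t2 - t1) / 1_000_000 + " ms, sum = " + b);
          }
      }
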
  78. How bad is a CACHE MISS anyway?

  79. Latency, and the same latency scaled so that one CPU cycle lasts one second:

      1 CPU cycle    0.3 ns    1 s
      L1 access      0.9 ns    3 s
      L2 access      2.8 ns    9 s
      L3 access     12.9 ns   43 s
      RAM access    120 ns     6 min

  80. Let’s try to MINIMIZE MISSES

  81. Memory layout & access

  82. void transpose(float* a, float* b) {
        for (size_t i = 0; i < MAT_N; i++) {
          for (size_t j = 0; j < MAT_M; j++) {
            b[INDEX(j, i, MAT_N)] = a[INDEX(i, j, MAT_M)];
          }
        }
      }

  83. void multiplyMatricesT(float* a, float* b, float* c) {
        for (size_t i = 0; i < MAT_M; i++) {
          for (size_t j = 0; j < MAT_M; j++) {
            for (size_t k = 0; k < MAT_N; k++) {
              c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(j, k, MAT_N)];
            }
          }
        }
      }

  84. Time            9.6 s     1.17 s
      # Instructions  11 B      8.5 B
      IPC             0.35      2.22
      CPI             2.8       0.45
      L2 loads        1.1 B     67 M
      L2 hit rate     8.8%      94.6%

  85. 10x

  86. 10x

  87. void multiplyMatricesT(float* a, float* b, float* c) {
        for (size_t i = 0; i < MAT_M; i++) {
          for (size_t j = 0; j < MAT_M; j++) {
            float s = 0;
            for (size_t k = 0; k < MAT_N; k += 4) {
              s += a[INDEX(i, k + 0, MAT_N)] * b[INDEX(j, k + 0, MAT_N)] +
                   a[INDEX(i, k + 1, MAT_N)] * b[INDEX(j, k + 1, MAT_N)] +
                   a[INDEX(i, k + 2, MAT_N)] * b[INDEX(j, k + 2, MAT_N)] +
                   a[INDEX(i, k + 3, MAT_N)] * b[INDEX(j, k + 3, MAT_N)];
            }
            c[INDEX(i, j, MAT_M)] = s;
          }
        }
      }

  88. Time            9.6 s     1.17 s    0.5 s
      # Instructions  11 B      8.5 B     4.7 B
      IPC             0.35      2.22      2.7
      CPI             2.8       0.45      0.38
      L2 loads        1.1 B     67 M      65 M
      L2 hit rate     8.8%      94.6%     95%

  89. 20x

  90. 20x

  91. $ clang++ -O0 -std=c++11 -o main main.cpp

  92. Time            9.6 s     1.17 s    0.5 s     19 s
      # Instructions  11 B      8.5 B     4.7 B     26 B
      IPC             0.35      2.22      2.7       0.44
      CPI             2.8       0.45      0.38      2.27
      L2 loads        1.1 B     67 M      65 M      1.1 B
      L2 hit rate     8.8%      94.6%     95%       2.4%

  93. $ clang++ -Ofast -mavx -std=c++11 -o main main.cpp

  94. Time         9.6 s     1.17 s    0.5 s     19 s      0.25 s
      # Inst.      11 B      8.5 B     4.7 B     26 B      1.1 B
      IPC          0.35      2.22      2.7       0.44      1.2
      CPI          2.8       0.45      0.38      2.27      0.8
      L2 loads     1.1 B     67 M      65 M      1.1 B     58 M
      L2 hit rate  8.8%      94.6%     95%       2.4%      76%

  95. 40x

  96. 40x

  97. What about Java? ? ? ?

  98. // c = a x b
      private void multiply(float[] a, float[] b, float[] c) {
        for (int i = 0; i < m; i++) {
          for (int j = 0; j < m; j++) {
            for (int k = 0; k < n; k++) {
              c[i * m + j] += a[i * n + k] * b[j * n + k];
            }
          }
        }
      }
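
Note that this version indexes b as b[j * n + k], so it assumes b has already been copied into a transposed array, mirroring the C++ transpose on slide 82; that step is not shown in the transcript. A minimal sketch of it, reusing the same field names m and n as the multiply above, might look like this:

      // Hypothetical helper (not shown in the deck): copy the n x m matrix b
      // (row-major, stride m) into the m x n matrix bT (row-major, stride n),
      // so that the multiply can stream both operands along contiguous rows.
      private void transpose(float[] b, float[] bT) {
          for (int i = 0; i < n; i++) {
              for (int j = 0; j < m; j++) {
                  bT[j * n + i] = b[i * m + j];
              }
          }
      }
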
  99. 1,173 ms (10x)

  100. m = m1 × m2

  101. m = m1 × m2, with the output rows split between Thread 1 and Thread 2

  102. final int coreCount = Runtime.getRuntime().availableProcessors();
       List<Callable<Void>> tasks = new ArrayList<>(coreCount);

       for (int i = m / coreCount; i <= m; i += m / coreCount) {
         tasks.add(createTask(i));
       }

       ExecutorService executor = Executors.newFixedThreadPool(coreCount);
       List<Future<Void>> results = executor.invokeAll(tasks);
       for (Future<Void> result : results) {
         result.get();
       }
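
createTask is not shown in the transcript. Given the row split sketched on slide 101, each task presumably computes one horizontal band of output rows using the same transposed-b inner loop; the sketch below is a guess at what it could look like, with the field names (a, b, c, m, n, coreCount) and the band boundaries assumed rather than taken from the deck.

      // Hypothetical sketch: the task for boundary endRow computes the band of
      // output rows [endRow - m / coreCount, endRow).
      private Callable<Void> createTask(final int endRow) {
          final int startRow = endRow - m / coreCount;
          return () -> {
              for (int i = startRow; i < endRow; i++) {
                  for (int j = 0; j < m; j++) {
                      float s = 0;
                      for (int k = 0; k < n; k++) {
                          s += a[i * n + k] * b[j * n + k];
                      }
                      c[i * m + j] = s;
                  }
              }
              return null;
          };
      }
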
  103. 517 ms (20x)

  104. What about SIMD? ? ? ?

  105. If a VM can’t do it…

  106. A human can do it…

  107. 4x4 transpose

  108. -O0:
       pushq %rbp  movq %rsp, %rbp  movq %rdi, -0x8(%rbp)  movq %rsi, -0x10(%rbp)  movq $0x0, -0x18(%rbp)
       cmpq $0x4, -0x18(%rbp)  jae 0x1000016bd  movq $0x0, -0x20(%rbp)  cmpq $0x4, -0x20(%rbp)  jae 0x1000016a5
       movq -0x18(%rbp), %rax  shlq $0x2, %rax  addq -0x20(%rbp), %rax  movq -0x8(%rbp), %rcx  movss (%rcx,%rax,4), %xmm0
       movq -0x20(%rbp), %rax  shlq $0x2, %rax  addq -0x18(%rbp), %rax  movq -0x10(%rbp), %rcx  movss %xmm0, (%rcx,%rax,4)
       movq -0x20(%rbp), %rax  addq $0x1, %rax  movq %rax, -0x20(%rbp)  jmp 0x10000165a  jmp 0x1000016aa
       movq -0x18(%rbp), %rax  addq $0x1, %rax  movq %rax, -0x18(%rbp)  jmp 0x100001644  popq %rbp  retq

       -Ofast:
       vmovss %xmm1, -0xb0(%rbp)  vmovss -0xb8(%rbp), %xmm1  vmovss %xmm1, -0xa0(%rbp)  vmovss -0xbc(%rbp), %xmm1
       vmovss %xmm1, -0x90(%rbp)  vmovss -0xc0(%rbp), %xmm1  vmovss %xmm1, -0x80(%rbp)  vmovss -0xc4(%rbp), %xmm1
       vmovss %xmm1, -0xac(%rbp)  vmovss -0xc8(%rbp), %xmm1  vmovss %xmm1, -0x9c(%rbp)  vmovss -0xcc(%rbp), %xmm1
       vmovss %xmm1, -0x8c(%rbp)  vmovss -0xd0(%rbp), %xmm1  vmovss %xmm1, -0x7c(%rbp)  vmovss -0xd4(%rbp), %xmm1
       vmovss %xmm1, -0xa8(%rbp)  vmovss -0xd8(%rbp), %xmm1  vmovss %xmm1, -0x98(%rbp)  vmovss -0xdc(%rbp), %xmm1
       vmovss %xmm1, -0x88(%rbp)  vmovss -0xe0(%rbp), %xmm1  vmovss %xmm1, -0x78(%rbp)  vmovss -0xe4(%rbp), %xmm1
       vmovss %xmm1, -0xa4(%rbp)  vmovss -0xe8(%rbp), %xmm1  vmovss %xmm1, -0x94(%rbp)  vmovss -0xec(%rbp), %xmm1
       vmovss %xmm1, -0x84(%rbp)  vmovss %xmm0, -0x34(%rbp)  vmovss %xmm0, -0x74(%rbp)

  109. ARM NEON, written by hand:
       vld1.32 {d0-d3}, [r1]!
       vld1.32 {d4-d7}, [r1]!
       vtrn.32 q0, q1
       vtrn.32 q2, q3
       vswp d1, d4
       vswp d3, d6
       vst1.32 {d0-d3}, [r0]!
       vst1.32 {d4-d7}, [r0]!

  110. Back to our telemetry application

  111. DataSeries& getDataSeries() {
         return m_series[m_current];
       }

       // For each data series
       for (size_t i = 0; i < m_buffer.size(); i++) {
         getDataSeries().getValue(i);
       }

  112. // For each data series
       const DataSeries& series(m_series[m_current]);
       for (size_t i = 0; i < m_buffer.size(); i++) {
         series.getValue(i);
       }

  113. 3ms per frame AFTER

  114. 3ms per frame AFTER 150ms per frame BEFORE

  115. All because of the L1/L2 cache

  116. I don’t care?! 1024x1024 MATRIX ☹

  117. Node { bool  bool  float[16] }   Node { bool  bool  float[16] }   Node { bool  bool  float[16] }

  118. bool bool bool bool bool bool float[48]
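
These layouts are drawn as C++ structs, but the same idea applies on the JVM: an array of small Node objects scatters the hot fields (and their float[16] payloads) across the heap, while grouping the fields into parallel primitive arrays keeps them dense and contiguous. A rough Java analog, with made-up class and field names:

      // One object per node: every Node and every float[16] is a separate
      // allocation, so traversing many nodes chases pointers.
      class Node {
          boolean visible;
          boolean dirty;
          float[] transform = new float[16];
      }

      // Field-grouped layout in the spirit of this slide: all flags together,
      // all matrix data in one contiguous array (16 floats per node).
      class Nodes {
          final boolean[] visible;
          final boolean[] dirty;
          final float[] transforms;   // node i starts at 16 * i

          Nodes(int count) {
              visible = new boolean[count];
              dirty = new boolean[count];
              transforms = new float[16 * count];
          }
      }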

  119. BaseClass { virtual void foo() }   ChildClass { virtual void foo() }   OtherChildClass { virtual void foo() }

  120. mutable bool m_dirty;

       const Sphere& getBoundingSphere() const {
         if (m_dirty) {
           m_world_sphere = m_sphere * m_world_transform;
           m_dirty = false;
         }
         return m_world_sphere;
       }

  121. Discussion