Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Performance Matters

Sponsored · Ship Features Fearlessly Turn features on and off without deploys. Used by thousands of Ruby developers.
Avatar for Romain Guy Romain Guy
December 15, 2014

Performance Matters

Learn about the importance of playing nice with the CPU cache. This presentation shows how a simple algorithm can be rewritten to be more cache-friendly to greatly improve performance.

Avatar for Romain Guy

Romain Guy

December 15, 2014
Tweet

More Decks by Romain Guy

Other Decks in Programming

Transcript

  1. DataSeries& getDataSeries() {
 // Returns a reference to the element of m_series selected by m_current.
 return m_series[m_current];
 }
 
 // For each data series
 for (size_t i = 0; i < m_buffer.size(); i++) {
 // The accessor is invoked again on every iteration of the loop.
 // NOTE(review): getDataSeries() is declared as returning DataSeries&, so `->`
 // only compiles if DataSeries provides operator-> — confirm (slide 31 uses `.`).
 getDataSeries()->getValue(i);
 }
  2. // c = a x b
 // Naive i/j/k product over flat arrays: a is indexed as i * n + k, b as k * m + j,
 // and c as i * m + j. Products are accumulated with +=, so whatever c already
 // holds is added to.
 private void multiply(float[] a,

    float[] b, float[] c) {
 for (int i = 0; i < m; i++) {
 for (int j = 0; j < m; j++) {
 for (int k = 0; k < n; k++) {
 // Per step of k, a's index moves by 1 while b's index moves by m.
 c[i * m + j] += a[i * n + k] * b[k * m + j];
 }
 }
 }
 }
  3. Testing conditions: two 1024x1024 matrices; Intel Core i7 3667U (2 cores @ 2 GHz); OS X 10.10; Oracle JDK 1.8
  4. C++

  5. #define INDEX(i, j, stride) ((i) * (stride) + (j))
 // INDEX maps (i, j) to the flat offset i * stride + j; every argument is parenthesized in the expansion.
 


    // c = a x b
 // Naive i/j/k product over flat arrays addressed through INDEX: a uses stride MAT_N,
 // b and c use stride MAT_M. Products are accumulated with +=, so c's existing
 // contents are added to.
 void multiplyMatrices(float* a, float* b, float* c) {
 for (size_t i = 0; i < MAT_M; i++) {
 for (size_t j = 0; j < MAT_M; j++) {
 for (size_t k = 0; k < MAT_N; k++) {
 // Per step of k, a's offset moves by 1 while b's offset moves by MAT_M.
 c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(k, j, MAT_M)];
 }
 }
 }
 }
  6. Performance counters: # instructions, IPC (instructions per cycle), CPI (cycles per instruction)
  7. Performance counters: # instructions, IPC (instructions per cycle), CPI (cycles per instruction), L2 loads
  8. Performance counters: # instructions, IPC (instructions per cycle), CPI (cycles per instruction), L2 loads, L2 hit rate
  9. Test environment: L1 32 kB/core, L2 256 kB/core, L3 4 MB
  10. Test environment: L1 32 kB/core, L2 256 kB/core, L3 4 MB, clang (LLVM) 3.5
  11. Test environment: L1 32 kB/core, L2 256 kB/core, L3 4 MB, clang (LLVM) 3.5, compile flag -Os
  12. Latency (and the same latency rescaled so that one CPU cycle lasts 1 s): 1 CPU cycle 0.3 ns (1 s); L1 access 0.9 ns (3 s); L2 access 2.8 ns (9 s); L3 access 12.9 ns (43 s); RAM access 120 ns (6 min)
  13. void transpose(float* a, float* b) {
 // Copies the element of a at INDEX(i, j, MAT_M) to b at INDEX(j, i, MAT_N),
 // i.e. row i / column j of a becomes row j / column i of b.
 for (size_t i =

    0; i < MAT_N; i++) {
 for (size_t j = 0; j < MAT_M; j++) {
 b[INDEX(j, i, MAT_N)] = a[INDEX(i, j, MAT_M)];
 }
 }
 }
  14. void multiplyMatricesT(float* a, float* b, float* c) {
 // Same i/j/k accumulation into c, but b is read at INDEX(j, k, MAT_N), so both
 // a's and b's offsets move by 1 per step of k.
 // presumably b is the output of transpose() from slide 13 — confirm.
 for (size_t

    i = 0; i < MAT_M; i++) {
 for (size_t j = 0; j < MAT_M; j++) {
 for (size_t k = 0; k < MAT_N; k++) {
 c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(j, k, MAT_N)];
 }
 }
 }
 }
  15. Time: 9.6 s | 1.17 s; # Instructions: 11 B | 8.5 B; IPC: 0.35 | 2.22; CPI: 2.8 | 0.45; L2 loads: 1.1 B | 67 M; L2 hit rate: 8.8% | 94.6%
  16. 10x

  17. 10x

  18. void multiplyMatricesT(float* a, float* b, float* c) {
 // Variant that sums each (i, j) dot product in a local float and processes
 // four values of k per loop iteration; b is read at INDEX(j, k, MAT_N).
 for (size_t

    i = 0; i < MAT_M; i++) {
 for (size_t j = 0; j < MAT_M; j++) {
 float s = 0;
 // k advances by 4 and the body reads k + 0 .. k + 3 with no remainder loop,
 // so this assumes MAT_N is a multiple of 4 — TODO confirm.
 for (size_t k = 0; k < MAT_N; k += 4) {
 s += a[INDEX(i, k + 0, MAT_N)] * b[INDEX(j, k + 0, MAT_N)] +
 a[INDEX(i, k + 1, MAT_N)] * b[INDEX(j, k + 1, MAT_N)] +
 a[INDEX(i, k + 2, MAT_N)] * b[INDEX(j, k + 2, MAT_N)] +
 a[INDEX(i, k + 3, MAT_N)] * b[INDEX(j, k + 3, MAT_N)];
 }
 // Plain assignment: unlike the += versions on earlier slides, c's previous
 // contents are overwritten here.
 c[INDEX(i, j, MAT_M)] = s;
 }
 }
 }
  19. Time: 9.6 s | 1.17 s | 0.5 s; # Instructions: 11 B | 8.5 B | 4.7 B; IPC: 0.35 | 2.22 | 2.7; CPI: 2.8 | 0.45 | 0.38; L2 loads: 1.1 B | 67 M | 65 M; L2 hit rate: 8.8% | 94.6% | 95%
  20. 20x

  21. 20x

  22. Time: 9.6 s | 1.17 s | 0.5 s | 19 s; # Instructions: 11 B | 8.5 B | 4.7 B | 26 B; IPC: 0.35 | 2.22 | 2.7 | 0.44; CPI: 2.8 | 0.45 | 0.38 | 2.27; L2 loads: 1.1 B | 67 M | 65 M | 1.1 B; L2 hit rate: 8.8% | 94.6% | 95% | 2.4%
  23. Time: 9.6 s | 1.17 s | 0.5 s | 19 s | 0.25 s; # Inst.: 11 B | 8.5 B | 4.7 B | 26 B | 1.1 B; IPC: 0.35 | 2.22 | 2.7 | 0.44 | 1.2; CPI: 2.8 | 0.45 | 0.38 | 2.27 | 0.8; L2 loads: 1.1 B | 67 M | 65 M | 1.1 B | 58 M; L2 hit rate: 8.8% | 94.6% | 95% | 2.4% | 76%
  24. 40x

  25. 40x

  26. // c = a x b
 // Same i/j/k accumulation into c as the version on slide 2, except that b is
 // read at j * n + k, so both a's and b's indices move by 1 per step of k.
 // presumably the caller passes b already transposed — confirm.
 private void multiply(float[] a,

    float[] b, float[] c) {
 for (int i = 0; i < m; i++) {
 for (int j = 0; j < m; j++) {
 for (int k = 0; k < n; k++) {
 c[i * m + j] += a[i * n + k] * b[j * n + k];
 }
 }
 }
 }
  27. final int coreCount = Runtime.getRuntime().availableProcessors();
 // Builds one Callable per step of m / coreCount, runs them all on a fixed pool
 // of coreCount threads, and waits for every result.
 List<Callable<Void>> tasks = new ArrayList<>(coreCount);


    
 // createTask receives the running value i (m / coreCount, 2 * m / coreCount, ...).
 // NOTE(review): m / coreCount is an integer division — if m is not a multiple of
 // coreCount the loop stops short of m, and if m < coreCount the step is 0 and i
 // never advances. Confirm m's range against the caller.
 for (int i = m / coreCount; i <= m; i += m / coreCount) {
 tasks.add(createTask(i));
 }
 
 ExecutorService executor = Executors.newFixedThreadPool(coreCount);
 List <Future<Void>> results = executor.invokeAll(tasks);
 // get() is called on each Future in turn; no executor shutdown is visible in this snippet.
 for (Future<Void> result : results) {
 result.get();
 }
  28. pushq %rbp movq %rsp, %rbp movq %rdi, -0x8(%rbp) movq %rsi,

    -0x10(%rbp) movq $0x0, -0x18(%rbp) cmpq $0x4, -0x18(%rbp) jae 0x1000016bd movq $0x0, -0x20(%rbp) cmpq $0x4, -0x20(%rbp) jae 0x1000016a5 movq -0x18(%rbp), %rax shlq $0x2, %rax addq -0x20(%rbp), %rax movq -0x8(%rbp), %rcx movss (%rcx,%rax,4), %xmm0 movq -0x20(%rbp), %rax shlq $0x2, %rax addq -0x18(%rbp), %rax movq -0x10(%rbp), %rcx movss %xmm0, (%rcx,%rax,4) movq -0x20(%rbp), %rax addq $0x1, %rax movq %rax, -0x20(%rbp) jmp 0x10000165a jmp 0x1000016aa movq -0x18(%rbp), %rax addq $0x1, %rax movq %rax, -0x18(%rbp) jmp 0x100001644 popq %rbp retq -O0 vmovss %xmm1, -0xb0(%rbp) vmovss -0xb8(%rbp), %xmm1 vmovss %xmm1, -0xa0(%rbp) vmovss -0xbc(%rbp), %xmm1 vmovss %xmm1, -0x90(%rbp) vmovss -0xc0(%rbp), %xmm1 vmovss %xmm1, -0x80(%rbp) vmovss -0xc4(%rbp), %xmm1 vmovss %xmm1, -0xac(%rbp) vmovss -0xc8(%rbp), %xmm1 vmovss %xmm1, -0x9c(%rbp) vmovss -0xcc(%rbp), %xmm1 vmovss %xmm1, -0x8c(%rbp) vmovss -0xd0(%rbp), %xmm1 vmovss %xmm1, -0x7c(%rbp) vmovss -0xd4(%rbp), %xmm1 vmovss %xmm1, -0xa8(%rbp) vmovss -0xd8(%rbp), %xmm1 vmovss %xmm1, -0x98(%rbp) vmovss -0xdc(%rbp), %xmm1 vmovss %xmm1, -0x88(%rbp) vmovss -0xe0(%rbp), %xmm1 vmovss %xmm1, -0x78(%rbp) vmovss -0xe4(%rbp), %xmm1 vmovss %xmm1, -0xa4(%rbp) vmovss -0xe8(%rbp), %xmm1 vmovss %xmm1, -0x94(%rbp) vmovss -0xec(%rbp), %xmm1 vmovss %xmm1, -0x84(%rbp) vmovss %xmm0, -0x34(%rbp) vmovss %xmm0, -0x74(%rbp) -Ofast
  29. ARM NEON vld1.32 {d0-d3}, [r1]! vld1.32 {d4-d7}, [r1]! vtrn.32 q0,

    q1 vtrn.32 q2, q3 vswp d1, d4 vswp d3, d6 vst1.32 {d0-d3}, [r0]! vst1.32 {d4-d7}, [r0]! Written by hand
  30. DataSeries& getDataSeries() {
 // Returns a reference to the element of m_series selected by m_current.
 return m_series[m_current];
 }
 
 // For each data series
 for (size_t i = 0; i < m_buffer.size(); i++) {
 // The accessor is invoked again on every iteration of the loop.
 // NOTE(review): getDataSeries() is declared as returning DataSeries&, so `->`
 // only compiles if DataSeries provides operator-> — confirm (slide 31 uses `.`).
 getDataSeries()->getValue(i);
 }
  31. // For each data series
 // Binds m_series[m_current] to a const reference once, ahead of the loop, and
 // calls getValue through that reference on each iteration.
 const DataSeries& series(m_series[m_current]);
 for (size_t

    i = 0; i < m_buffer.size(); i++) {
 series.getValue(i);
 }
  32. mutable bool m_dirty;
 
 // Returns m_world_sphere. When m_dirty is set, it is first recomputed as
 // m_sphere * m_world_transform and the flag is cleared.
 // NOTE(review): this method is const yet assigns m_world_sphere; only m_dirty is
 // shown as mutable in this snippet — confirm m_world_sphere's declaration.
 const Sphere& getBoundingSphere() const {
 if

    (m_dirty) {
 m_world_sphere = m_sphere * m_world_transform; m_dirty = false;
 }
 return m_world_sphere;
 }