Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Performance Matters

Romain Guy
December 15, 2014

Performance Matters

Learn about the importance of playing nice with the CPU cache. This presentation shows how a simple algorithm can be rewritten to be more cache-friendly to greatly improve performance.

Romain Guy

December 15, 2014
Tweet

More Decks by Romain Guy

Other Decks in Programming

Transcript

  1. DataSeries& getDataSeries() {
 return m_series[m_current];
 }
 
 // For each

    data series
 for (size_t i = 0; i < m_buffer.size(); i++) {
 getDataSeries()->getValue(i);
 }
  2. // c = a x b
 private void multiply(float[] a,

    float[] b, float[] c) {
 // Naive triple loop over flat row-major arrays; from the index
 // arithmetic, a is m x n, b is n x m, c is m x m.
 for (int i = 0; i < m; i++) {
 for (int j = 0; j < m; j++) {
 for (int k = 0; k < n; k++) {
 // b[k * m + j] walks column j of b: consecutive k touch elements
 // m floats apart, so b is read with very poor spatial locality.
 c[i * m + j] += a[i * n + k] * b[k * m + j];
 }
 }
 }
 }
  3. Testing conditions Two 1024x1024 matrices Intel Core i7 3667U (2

    cores @ 2 GHz) OS X 10.10 Oracle JDK 1.8 >>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
  4. C++

  5. #define INDEX(i, j, stride) ((i) * (stride) + (j))
 
 // c = a x b for row-major float matrices stored as flat arrays:
 // a is MAT_M x MAT_N, b is MAT_N x MAT_M, and the MAT_M x MAT_M
 // product is accumulated into c.
 void multiplyMatrices(float* a, float* b, float* c) {
 for (size_t row = 0; row < MAT_M; row++) {
 for (size_t col = 0; col < MAT_M; col++) {
 for (size_t k = 0; k < MAT_N; k++) {
 // b is walked down column 'col' here (stride MAT_M per step).
 c[INDEX(row, col, MAT_M)] += a[INDEX(row, k, MAT_N)] * b[INDEX(k, col, MAT_M)];
 }
 }
 }
 }
  6. Performance counters # instructions IPC (instructions per cycle) CPI (cycles

    per instructions) ………………………………………………………
  7. Performance counters # instructions IPC (instructions per cycle) CPI (cycles

    per instructions) L2 loads ………………………………………………………
  8. Performance counters # instructions IPC (instructions per cycle) CPI (cycles

    per instructions) L2 loads L2 hit rate ………………………………………………………
  9. Test environment L1 32 kB/core L2 256 kB/core L3 4

    MB …………………………………………..……
  10. Test environment L1 32 kB/core L2 256 kB/core L3 4

    MB clang (LLVM) 3.5 …………………………………………..……
  11. Test environment L1 32 kB/core L2 256 kB/core L3 4

    MB clang (LLVM) 3.5 compile flag -Os …………………………………………..……
  12. 1 CPU cycle 0.3 ns 1 s L1 access 0.9

    ns 3 s L2 access 2.8 ns 9 s L3 access 12.9 ns 43 s RAM access 120 ns 6 min
  13. void transpose(float* a, float* b) {
 // b = transpose(a): per the indexing, a is MAT_N x MAT_M and b is
 // MAT_M x MAT_N, both flat row-major arrays.
 for (size_t src_row = 0; src_row < MAT_N; src_row++) {
 for (size_t src_col = 0; src_col < MAT_M; src_col++) {
 b[INDEX(src_col, src_row, MAT_N)] = a[INDEX(src_row, src_col, MAT_M)];
 }
 }
 }
  14. void multiplyMatricesT(float* a, float* b, float* c) {
 // Same product as multiplyMatrices, but b is expected to already hold
 // the right-hand matrix transposed (see transpose()): both operands
 // are then read sequentially along rows, which is cache-friendly.
 for (size_t row = 0; row < MAT_M; row++) {
 for (size_t col = 0; col < MAT_M; col++) {
 for (size_t k = 0; k < MAT_N; k++) {
 c[INDEX(row, col, MAT_M)] += a[INDEX(row, k, MAT_N)] * b[INDEX(col, k, MAT_N)];
 }
 }
 }
 }
  15. Time 9.6 s 1.17 s # Instructions 11 B 8.5

    B IPC 0.35 2.22 CPI 2.8 0.45 L2 loads 1.1 B 67 M L2 hit rate 8.8% 94.6%
  16. 10x

  17. 10x

  18. void multiplyMatricesT(float* a, float* b, float* c) {
 for (size_t

    i = 0; i < MAT_M; i++) {
 for (size_t j = 0; j < MAT_M; j++) {
 // Accumulate the dot product in a local so it can live in a
 // register instead of being loaded/stored through c each step.
 float s = 0;
 // Unrolled by 4 — assumes MAT_N is a multiple of 4 (true for the
 // 1024x1024 test matrices); any remainder would read out of bounds.
 for (size_t k = 0; k < MAT_N; k += 4) {
 s += a[INDEX(i, k + 0, MAT_N)] * b[INDEX(j, k + 0, MAT_N)] +
 a[INDEX(i, k + 1, MAT_N)] * b[INDEX(j, k + 1, MAT_N)] +
 a[INDEX(i, k + 2, MAT_N)] * b[INDEX(j, k + 2, MAT_N)] +
 a[INDEX(i, k + 3, MAT_N)] * b[INDEX(j, k + 3, MAT_N)];
 }
 // Note: '=' here (not '+='), so unlike the earlier versions this
 // overwrites c rather than accumulating into it.
 c[INDEX(i, j, MAT_M)] = s;
 }
 }
 }
  19. Time 9.6 s 1.17 s 0.5 s # Instructions 11

    B 8.5 B 4.7 B IPC 0.35 2.22 2.7 CPI 2.8 0.45 0.38 L2 loads 1.1 B 67 M 65 M L2 hit rate 8.8% 94.6% 95%
  20. 20x

  21. 20x

  22. Time 9.6 s 1.17 s 0.5 s 19 s #

    Instructions 11 B 8.5 B 4.7 B 26 B IPC 0.35 2.22 2.7 0.44 CPI 2.8 0.45 0.38 2.27 L2 loads 1.1 B 67 M 65 M 1.1 B L2 hit rate 8.8% 94.6% 95% 2.4%
  23. Time 9.6 s 1.17 s 0.5 s 19 s 0.25

    s # Inst. 11 B 8.5 B 4.7 B 26 B 1.1 B IPC 0.35 2.22 2.7 0.44 1.2 CPI 2.8 0.45 0.38 2.27 0.8 L2 loads 1.1 B 67 M 65 M 1.1 B 58 M L2 hit rate 8.8% 94.6% 95% 2.4% 76%
  24. 40x

  25. 40x

  26. // c = a x b
 private void multiply(float[] a,

    float[] b, float[] c) {
 for (int i = 0; i < m; i++) {
 for (int j = 0; j < m; j++) {
 for (int k = 0; k < n; k++) {
 // b[j * n + k]: b holds the transposed matrix, so consecutive k
 // are adjacent floats — sequential, cache-friendly reads.
 c[i * m + j] += a[i * n + k] * b[j * n + k];
 }
 }
 }
 }
  27. final int coreCount = Runtime.getRuntime().availableProcessors();
 List<Callable<Void>> tasks = new ArrayList<>(coreCount);


    
 // Split the m rows into coreCount chunks, one task per chunk.
 // NOTE(review): when m % coreCount != 0 the last i never reaches m,
 // leaving the trailing rows unprocessed — confirm m is a multiple of
 // coreCount (it is for the 1024x1024 test on 2/4 cores).
 for (int i = m / coreCount; i <= m; i += m / coreCount) {
 tasks.add(createTask(i));
 }
 
 ExecutorService executor = Executors.newFixedThreadPool(coreCount);
 // invokeAll blocks until every task has completed; get() rethrows any
 // exception a task raised.
 List <Future<Void>> results = executor.invokeAll(tasks);
 for (Future<Void> result : results) {
 result.get();
 }
 // NOTE(review): executor is never shutdown() here — verify the caller
 // handles that, or the pool's threads keep the JVM alive.
  28. pushq %rbp movq %rsp, %rbp movq %rdi, -0x8(%rbp) movq %rsi,

    -0x10(%rbp) movq $0x0, -0x18(%rbp) cmpq $0x4, -0x18(%rbp) jae 0x1000016bd movq $0x0, -0x20(%rbp) cmpq $0x4, -0x20(%rbp) jae 0x1000016a5 movq -0x18(%rbp), %rax shlq $0x2, %rax addq -0x20(%rbp), %rax movq -0x8(%rbp), %rcx movss (%rcx,%rax,4), %xmm0 movq -0x20(%rbp), %rax shlq $0x2, %rax addq -0x18(%rbp), %rax movq -0x10(%rbp), %rcx movss %xmm0, (%rcx,%rax,4) movq -0x20(%rbp), %rax addq $0x1, %rax movq %rax, -0x20(%rbp) jmp 0x10000165a jmp 0x1000016aa movq -0x18(%rbp), %rax addq $0x1, %rax movq %rax, -0x18(%rbp) jmp 0x100001644 popq %rbp retq -O0 vmovss %xmm1, -0xb0(%rbp) vmovss -0xb8(%rbp), %xmm1 vmovss %xmm1, -0xa0(%rbp) vmovss -0xbc(%rbp), %xmm1 vmovss %xmm1, -0x90(%rbp) vmovss -0xc0(%rbp), %xmm1 vmovss %xmm1, -0x80(%rbp) vmovss -0xc4(%rbp), %xmm1 vmovss %xmm1, -0xac(%rbp) vmovss -0xc8(%rbp), %xmm1 vmovss %xmm1, -0x9c(%rbp) vmovss -0xcc(%rbp), %xmm1 vmovss %xmm1, -0x8c(%rbp) vmovss -0xd0(%rbp), %xmm1 vmovss %xmm1, -0x7c(%rbp) vmovss -0xd4(%rbp), %xmm1 vmovss %xmm1, -0xa8(%rbp) vmovss -0xd8(%rbp), %xmm1 vmovss %xmm1, -0x98(%rbp) vmovss -0xdc(%rbp), %xmm1 vmovss %xmm1, -0x88(%rbp) vmovss -0xe0(%rbp), %xmm1 vmovss %xmm1, -0x78(%rbp) vmovss -0xe4(%rbp), %xmm1 vmovss %xmm1, -0xa4(%rbp) vmovss -0xe8(%rbp), %xmm1 vmovss %xmm1, -0x94(%rbp) vmovss -0xec(%rbp), %xmm1 vmovss %xmm1, -0x84(%rbp) vmovss %xmm0, -0x34(%rbp) vmovss %xmm0, -0x74(%rbp) -Ofast
  29. ARM NEON vld1.32 {d0-d3}, [r1]! vld1.32 {d4-d7}, [r1]! vtrn.32 q0,

    q1 vtrn.32 q2, q3 vswp d1, d4 vswp d3, d6 vst1.32 {d0-d3}, [r0]! vst1.32 {d4-d7}, [r0]! Written by hand
  30. DataSeries& getDataSeries() {
 return m_series[m_current];
 }
 
 // For each

    data series
 for (size_t i = 0; i < m_buffer.size(); i++) {
 getDataSeries()->getValue(i);
 }
  31. // For each data series
 // Hoisted version of the previous slide: the lookup through
 // m_series[m_current] is resolved once, outside the loop, instead of
 // on every iteration.
 const DataSeries& series(m_series[m_current]);
 for (size_t

    i = 0; i < m_buffer.size(); i++) {
 series.getValue(i);
 }
  32. mutable bool m_dirty;
 
 // Lazily rebuilds the cached m_world_sphere from m_sphere and
 // m_world_transform the first time it is requested after an
 // invalidation (m_dirty set); 'mutable' lets the cache be updated from
 // this const accessor.
 // NOTE(review): the unsynchronized write to mutable state makes this
 // unsafe if called from multiple threads — confirm single-threaded use.
 const Sphere& getBoundingSphere() const {
 if

    (m_dirty) {
 m_world_sphere = m_sphere * m_world_transform; m_dirty = false;
 }
 return m_world_sphere;
 }