Learn about the importance of playing nice with the CPU cache. This presentation shows how a simple algorithm can be rewritten to be more cache-friendly to greatly improve performance.
float[] b, float[] c) { for (int i = 0; i < m; i++) { for (int j = 0; j < m; j++) { for (int k = 0; k < n; k++) { c[i * m + j] += a[i * n + k] * b[k * m + j]; } } } }
// c = a x b void multiplyMatrices(float* a, float* b, float* c) { for (size_t i = 0; i < MAT_M; i++) { for (size_t j = 0; j < MAT_M; j++) { for (size_t k = 0; k < MAT_N; k++) { c[INDEX(i, j, MAT_M)] += a[INDEX(i, k, MAT_N)] * b[INDEX(k, j, MAT_M)]; } } } }
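[…] s

| Metric      | Col 1 | Col 2 | Col 3 | Col 4 | Col 5 |
|-------------|-------|-------|-------|-------|-------|
| # Inst.     | 11 B  | 8.5 B | 4.7 B | 26 B  | 1.1 B |
| IPC         | 0.35  | 2.22  | 2.7   | 0.44  | 1.2   |
| CPI         | 2.8   | 0.45  | 0.38  | 2.27  | 0.8   |
| L2 loads    | 1.1 B | 67 M  | 65 M  | 1.1 B | 58 M  |
| L2 hit rate | 8.8%  | 94.6% | 95%   | 2.4%  | 76%   |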
float[] b, float[] c) { for (int i = 0; i < m; i++) { for (int j = 0; j < m; j++) { for (int k = 0; k < n; k++) { c[i * m + j] += a[i * n + k] * b[j * n + k]; } } } }
// Queue one task for every multiple of (m / coreCount) that does not exceed m,
// handing that multiple to createTask.
// NOTE(review): assuming integer operands, the last multiple falls short of m
// whenever m is not a multiple of (m / coreCount), and the step is 0 when
// coreCount > m (the loop would then never advance) — confirm createTask and
// the callers account for both cases.
for (int mark = m / coreCount; mark <= m; mark += m / coreCount) {
    tasks.add(createTask(mark));
}

// Run every queued task on a pool with coreCount threads; invokeAll returns
// only after all of the tasks have completed.
// NOTE(review): no executor.shutdown() is visible in this snippet — confirm it
// happens in the surrounding code.
ExecutorService executor = Executors.newFixedThreadPool(coreCount);
List<Future<Void>> results = executor.invokeAll(tasks);

// The tasks produce no value; get() is called so that a failure inside any
// task is rethrown here instead of being silently dropped.
for (int idx = 0; idx < results.size(); idx++) {
    results.get(idx).get();
}