kntk
March 19, 2024
130

# Meet high-performance image filtering in Swift

Yakatabune.swift 2024/3/21

March 19, 2024

## Transcript

7. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Note: The code for boundary condition is omitted
8. ### let input: [[UInt8]] = … var output: [[UInt8]] = …

for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
9. ### let input: [[UInt8]] = … var output: [[UInt8]] = …

for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
10. ### let input: [[UInt8]] = … var output: [[UInt8]] = …

for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
11. ### let input: [[UInt8]] = … var output: [[UInt8]] = …

for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
12. ### let input: [[UInt8]] = … var output: [[UInt8]] = …

for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter R
13. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Average
14. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
15. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
16. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
17. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
18. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
19. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
20. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
21. ### Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

… for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }

25. ### Accelerate image filtering • Algorithm • Naive O(R^2) • Separable Filtering O(R)

 • Integral Image O(1)
26. ### Accelerate image filtering • Algorithm • Naive O(R^2) • Separable Filtering O(R)

 • Integral Image O(1)
27. ### Accelerate image filtering • Algorithm • Naive O(R^2) • Separable Filtering O(R)

 • Integral Image O(1) kind time[ms] Naive … Separable … (… x … grayscale, R = …, M… Pro, … Core) …x [figures lost in transcript]

29. ### SIMD • Single Instruction Multiple Data • Instructions for … bits • X86_64: SSE…, AVX…

AVX… • ARM: NEON…

32. ### Swift SIMD Vector Types let a0: UInt16 = 1 let a1: UInt16 =

2 … let b0: UInt16 = 9 let b1: UInt16 = 10 … a1 + b1 a2 + b2 … a7 + b7 ldrh w0, … ldrh w1, … ldrh w2, … ldrh w3, … … ldrh w15, … add w0, w0, w1 add w2, w2, w3 … add w14, w14, w15
33. ### Swift SIMD Vector Types let a = SIMD8<UInt16>(1, 2, 3, …) let b

= SIMD8<UInt16>(9, 10, 11, …) a &+ b ldr q0 … ldr q1 … add.8h v0, v0, v1 let a0: UInt16 = 1 let a1: UInt16 = 2 … let b0: UInt16 = 9 let b1: UInt16 = 10 … a1 + b1 a2 + b2 … a7 + b7 ldrh w0, … ldrh w1, … ldrh w2, … ldrh w3, … … ldrh w15, … add w0, w0, w1 add w2, w2, w3 … add w14, w14, w15
34. ### Box Filter using SIMD for y in 0..<height { // yfilter (omitted) …

// xfilter do { var x = 0 while x < width { var sum = SIMD8<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD8<UInt16>(yresult[startIndex..<startIndex+8]) } sum /= weightSIMD for k in 0..<8 { output[y][x+k] = sum[k] } x += 8
35. ### Box Filter using SIMD with Pointer let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> =

… let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + offset 
+ k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate()
36. ### Box Filter using SIMD with Pointer let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> =

… let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + offset 
+ k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate() (… x … grayscale, R = …) kind time[ms] Naive … Separable … Separable SIMD Pointer … …x …x [figures lost in transcript]
37. ### Box Filter using SIMD with Pointer let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> =

… let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + offset 
+ k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate() (… x … grayscale, R = …) …x kind time[ms] Naive … Separable … Separable SIMD Pointer … [figures lost in transcript]
38. ### kind time[ms] Naive … Separable … Separable SIMD Pointer … Box Filter using SIMD with Pointer

let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> = … let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= 
SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + offset + k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate() (… x … grayscale, R = …) …x …FPS …ms, …FPS ≒ …ms [figures lost in transcript]

40. ### References • SE-0229 SIMD Vectors • https://github.com/apple/swift-evolution/blob/main/proposals/0229-simd.md • SIMD • https://developer.apple.com/documentation/swift/simd •

Sample Code • https://github.com/kntkymt/swift-image-processing-sample