Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Meet high-performance image filtering in Swift

kntk
March 19, 2024

Meet high-performance image filtering in Swift

Yakatabune.swift 2024/3/21

kntk

March 19, 2024
Tweet

More Decks by kntk

Other Decks in Programming

Transcript

  1. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Note: The code for boundary condition is omitted
  2. let input: [[UInt8]] = … var output: [[UInt8]] = …

    for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
  3. let input: [[UInt8]] = … var output: [[UInt8]] = …

    for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
  4. let input: [[UInt8]] = … var output: [[UInt8]] = …

    for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
  5. let input: [[UInt8]] = … var output: [[UInt8]] = …

    for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter
  6. let input: [[UInt8]] = … var output: [[UInt8]] = …

    for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Box Filter R
  7. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } } Average
  8. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  9. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  10. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  11. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  12. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  13. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  14. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  15. Box Filter let input: [[UInt8]] = … var output: [[UInt8]] =

    … for j in 0..<height { for i in 0..<width { var sum = 0 for ry in -R...R { for rx in -R...R { sum += Int(input[j + ry][i + rx]) } } output[j][i] = UInt8(sum / (2 * R + 1) * (2 * R + 1)) } }
  16. Accelerate image filtering • Algorithm • Naive O(R^2) • Separable Filtering O(R)

     • Integral Image O(1) kind time[ms] Naive  Separable  x grayscale R M Pro Core x
  17. Swift SIMD Vector Types let a0: UInt16 = 1 let a1: UInt16 =

    2 … let b0: UInt16 = 9 let b1: UInt16 = 10 … a1 + b1 a2 + b2 … a7 + b7 ldrh w0, … ldrh w1, … ldrh w2, … ldrh w3, … … ldrh w15, … add w0, w0, w1 add w2, w2, w3 … add w14, w14, w15
  18. Swift SIMD Vector Types let a = SIMD8<UInt16>(1, 2, 3, …) let b

    = SIMD8<UInt16>(9, 10, 11, …) a &+ b ldr q0 … ldr q1 … add.8h v0, v0, v1 let a0: UInt16 = 1 let a1: UInt16 = 2 … let b0: UInt16 = 9 let b1: UInt16 = 10 … a1 + b1 a2 + b2 … a7 + b7 ldrh w0, … ldrh w1, … ldrh w2, … ldrh w3, … … ldrh w15, … add w0, w0, w1 add w2, w2, w3 … add w14, w14, w15
  19. Box Filter using SIMD for y in 0..<height { // yfilter (omitted) …

    // xfilter do { var x = 0 while x < width { var sum = SIMD8<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD8<UInt16>(yresult[startIndex..<startIndex+8]) } sum /= weightSIMD for k in 0..<8 { output[y][x+k] = sum[k] } x += 8
  20. Box Filter using SIMD with Pointer let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> =

    … let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + 
offset + k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate()
  21. Box Filter using SIMD with Pointer let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> =

    … let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + 
offset + k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate() x grayscale R kind time[ms] Naive  Separable  Separable SIMD Pointer  x x
  22. Box Filter using SIMD with Pointer let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> =

    … let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + 
offset + k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate() x grayscale R x kind time[ms] Naive  Separable  Separable SIMD Pointer 
  23. kind time[ms] Naive  Separable  Separable SIMD Pointer  Box Filter using SIMD with Pointer

    let imagePointer: UnsafeMutablePointer<UInt16> = … var resultPointer: UnsafeMutableBufferPointer<UInt16> = … let weightSIMD = SIMD16<UInt16>(repeating: UInt16(L * L)) let widthExtended = width + 2 * radius // 毎回deallocateしないとメモリリークするが // 一番最後の実験結果は必要なため先頭で行う resultPointer.deinitialize() resultPointer.deallocate() resultPointer = .allocate(capacity: width * height) resultPointer.initialize(repeating: .zero) let extendedPointer: UnsafeMutableBufferPointer<UnsafeMutableBufferPointer<UInt16>> = .allocate(capacity: L) for k in 0..<L { extendedPointer[k] = .allocate(capacity: widthExtended) } for k in 0..<L-1 { extendImage( from: imagePointer.advanced(by: max(0, k - radius) * width), srcWidth: width, extendTo: extendedPointer[k], extendRadius: radius ) } let yresultPointer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: widthExtended) yresultPointer.initialize(repeating: .zero) for y in 0..<height { extendImage( from: imagePointer.advanced(by: min(height - 1, y + radius) * width), srcWidth: width, extendTo: extendedPointer[L-1], extendRadius: radius ) // yfilter do { var x = 0 while x < widthExtended - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][x..<x+16]) } for k in 0..<16 { yresultPointer[x+k] = sum[k] } x += 16 } } // yfilter あまり処理 do { let offset = widthExtended - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { sum &+= SIMD16<UInt16>(extendedPointer[k][offset..<offset+16]) } // これをSIMDのstoreにしたい for k in 0..<16 { yresultPointer[offset+k] = sum[k] } } // xfilter do { var x = 0 while x < width - 16 { var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = x + k sum &+= SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD for k in 0..<16 { // ここどうにかしたい resultPointer[width * y + x + k] = sum[k] } x += 16 } } // xfilter あまり処理 do { let offset = width - 16 var sum = SIMD16<UInt16>.zero for k in 0..<L { let startIndex = offset + k sum &+= 
SIMD16<UInt16>(yresultPointer[startIndex..<startIndex+16]) } sum /= weightSIMD // これをSIMDのstoreにしたい for k in 0..<16 { resultPointer[width * y + offset + k] = sum[k] } } // ringBuffering let temp = extendedPointer.moveElement(from: 0) for k in 0..<L-1 { extendedPointer[k] = extendedPointer[k+1] } extendedPointer[L-1] = temp } yresultPointer.deinitialize() yresultPointer.deallocate() for k in 0..<L { extendedPointer[k].deinitialize() extendedPointer[k].deallocate() } extendedPointer.deinitialize() extendedPointer.deallocate() x grayscale R x '14TNT '14T㲈NT