Upgrade to Pro — share decks privately, control downloads, hide ads and more …

用 Go 語言打造多台機器 Scale 架構

265bcbb56e831266de7a9f9281aab57a?s=47 Bo-Yi Wu
September 08, 2020

用 Go 語言打造多台機器 Scale 架構

由於公司內部有分多個網路環境架構,各自有不同的限制,以及背後都有各自的運算伺服器資源,那該如何用設計同一份 Go 語言架構來進行部署,讓使用者可以將檔案上傳,並自動部署到後端任意運算伺服器處理,最後將結果傳回前端給使用者。

265bcbb56e831266de7a9f9281aab57a?s=128

Bo-Yi Wu

September 08, 2020
Tweet

Transcript

  1. 用 Go 語言打造多台機器 Scale 架構 Bo-Yi Wu 2020/09/08

  2. About me • Software Engineer in Mediatek • Member of

    Drone CI/CD Platform • Member of Gitea Platform • Member of Gin Golang Framework • Maintain Some GitHub Actions Plugins. • Teacher of Udemy Platform: Golang + Drone
  3. NeuroPilot MediaTek Ecosystem for AI Development https://neuropilot.mediatek.com/

  4. 專案需求 • 客戶單機版 (Docker 版本) • 內建簡易的 Queue 機制 •

    公司內部架構 (軟體 + 硬體) • 多台 Queue 機制 + 硬體模擬 每個 Job 吃 2core 8GB 記憶體
  5. 為什麼選 Go 語言 • 公司環境限制 • 保護程式邏輯 • 跨平台編譯 (Windows,

    Linux) • 強大 Concurrency
  6. 客戶單機版

  7. 導入 Queue 機制 RabbitMQ NSQ

  8. Service 部分元件 • Database: SQLite (不需要 MySQL, Postgres) • Cache:

    Memory (不需要 Redis) • Queue: 自行開發
  9. 客戶 IT 環境

  10. 如何實作簡易的 Queue 機制 每個 Job 吃 2core 8GB 記憶體

  11. 先了解 Channel Blocking

  12. https://utcc.utoronto.ca/~cks/space/blog/programming/GoConcurrencyStillNotEasy

  13. Limit Concurrency Issue

  14. found := make(chan int) limitCh := make(chan struct{}, concurrencyProcesses) for

    i := 0; i < jobCount; i++ { limitCh <- struct{}{} go func(val int) { defer func() { wg.Done() <-limitCh }() found <- val }(i) } jobCount = 100 concurrencyProcesses = 10
  15. found := make(chan int) limitCh := make(chan struct{}, concurrencyProcesses) for

    i := 0; i < jobCount; i++ { limitCh <- struct{}{} go func(val int) { defer func() { wg.Done() <-limitCh }() found <- val }(i) } jobCount = 100 concurrencyProcesses = 10
  16. 解決方案 將 limitCh 丟到背景處理?

  17. found := make(chan int) limitCh := make(chan struct{}, concurrencyProcesses) for

    i := 0; i < jobCount; i++ { go func() { limitCh <- struct{}{} }() go func(val int) { defer func() { <-limitCh wg.Done() }() found <- val }(i) } jobCount = 100 concurrencyProcesses = 10
  18. found := make(chan int) limitCh := make(chan struct{}, concurrencyProcesses) for

    i := 0; i < jobCount; i++ { go func() { limitCh <- struct{}{} }() go func(val int) { defer func() { <-limitCh wg.Done() }() found <- val }(i) } 無法解決 Limit Concurrency jobCount = 100 concurrencyProcesses = 10
  19. 解決方案 重新改寫架構

  20. found := make(chan int) queue := make(chan int) go func(queue

    chan<- int) { for i := 0; i < jobCount; i++ { queue <- i } close(queue) }(queue) for i := 0; i < concurrencyProcesses; i++ { go func(queue <-chan int, found chan<- int) { for val := range queue { defer wg.Done() found <- val } }(queue, found) } jobCount = 100 concurrencyProcesses = 10
  21. Internal Queue 單機版

  22. None
  23. Setup Consumer

  24. type Consumer struct { inputChan chan int jobsChan chan int

    } const PoolSize = 200 func main() { // create the consumer consumer := Consumer{ inputChan: make(chan int, 1), jobsChan: make(chan int, PoolSize), } }
  25. None
  26. func (c *Consumer) queue(input int) { fmt.Println("send input value:", input)

    c.jobsChan <- input } func (c *Consumer) worker(num int) { for job := range c.jobsChan { fmt.Println("worker:", num, " job value:", job) } } for i := 0; i < WorkerSize; i++ { go consumer.worker(i) }
  27. rewrite queue func func (c *Consumer) queue(input int) bool {

    fmt.Println("send input value:", input) select { case c.jobsChan <- input: return true default: return false } } 避免使用者大量送資料進來
  28. Shutdown with Sigterm Handling

  29. func WithContextFunc(ctx context.Context, f func()) context.Context { ctx, cancel :=

    context.WithCancel(ctx) go func() { c := make(chan os.Signal) signal.Notify(c, syscall.SIGINT, syscall.SIGTERM) defer signal.Stop(c) select { case <-ctx.Done(): case <-c: f() cancel() } }() return ctx }
  30. func (c Consumer) startConsumer(ctx context.Context) { for { select {

    case job := <-c.inputChan: if ctx.Err() != nil { close(c.jobsChan) return } c.jobsChan <- job case <-ctx.Done(): close(c.jobsChan) return } } } select 不保證讀取 Channel 的順序性
  31. Cancel by ctx.Done() event func (c *Consumer) worker(num int) {

    for job := range c.jobsChan { fmt.Println("worker:", num, " job value:", job) } } Channel 關閉後,還是可以讀取資料到結束
  32. Graceful shutdown with worker sync.WaitGroup

  33. None
  34. wg := &sync.WaitGroup{} wg.Add(WorkerSize) // Start [PoolSize] workers for i

    := 0; i < WorkerSize; i++ { go consumer.worker(i) }
  35. WaitGroup WaitGroup WaitGroup WaitGroup

  36. func (c Consumer) worker(wg *sync.WaitGroup) { defer wg.Done() for job

    := range c.jobsChan { // handle the job event } }
  37. Add WaitGroup after Cancel Function

  38. func WithContextFunc(ctx context.Context, f func()) context.Context { ctx, cancel :=

    context.WithCancel(ctx) go func() { c := make(chan os.Signal) signal.Notify(c, syscall.SIGINT, syscall.SIGTERM) defer signal.Stop(c) select { case <-ctx.Done(): case <-c: cancel() f() } }() return ctx } Add WaitGroup after Cancel Function
  39. wg := &sync.WaitGroup{} wg.Add(numberOfWorkers) ctx := signal.WithContextFunc( context.Background(), func() {

    wg.Wait() close(finishChan) }, ) go consumer.startConsumer(ctx)
  40. End of Program select { case <-finished: case err :=

    <-errChannel: if err != nil { return err } }
  41. 單機版限制 系統資源不足

  42. 系統架構

  43. Server - Agent

  44. Server 跟 Agent 溝通方式 https://github.com/hashicorp/go-retryablehttp

  45. r := e.Group("/rpc") r.Use(rpc.Check()) { r.POST("/v1/healthz", web.RPCHeartbeat) r.POST("/v1/request", web.RPCRquest) r.POST("/v1/accept",

    web.RPCAccept) r.POST("/v1/details", web.RPCDetails) r.POST("/v1/updateStatus", web.RPCUpdateStatus) r.POST("/v1/upload", web.RPCUploadBytes) r.POST("/v1/reset", web.RPCResetStatus) } Check RPC Secret
  46. /rpc/v1/accept Update jobs set version = (oldVersion + 1) where

    machine = "fooBar" and version = oldVersion
  47. Create multiple worker

  48. if r.Capacity != 0 { var g errgroup.Group for i

    := 0; i < r.Capacity; i++ { g.Go(func() error { return r.start(ctx, 0) }) time.Sleep(1 * time.Second) } return g.Wait() } 單機版設定多個 Worker
  49. for { var ( id int64 err error ) if

    id, err = r.request(ctx); err != nil { time.Sleep(1 * time.Second) continue } go func() { if err := r.start(ctx, id); err != nil { log.Error().Err(err).Msg("runner: cannot start the job") } }() } 公司內部 + Submit Job
  50. Break for and select loop func (r *Runner) start(ctx context.Context,

    id int64) error { LOOP: for { select { case <-ctx.Done(): return ctx.Err() default: r.poll(ctx, id) if r.Capacity == 0 { break LOOP } } time.Sleep(1 * time.Second) } return nil }
  51. 即時取消正在執行的任務?

  52. None
  53. None
  54. None
  55. Context with Cancel or Timeout ctx, cancel := context.WithCancel(context.Background()) defer

    cancel() timeout, cancel := context.WithTimeout(ctx, 60*time.Minute) defer cancel() Job03 context
  56. Context with Cancel or Timeout ctx, cancel := context.WithCancel(context.Background()) defer

    cancel() timeout, cancel := context.WithTimeout(ctx, 60*time.Minute) defer cancel() Job03 context Job05 context
  57. Watch the Cancel event (Agent) go func() { done, _

    := r.Manager.Watch(ctx, id) if done { cancel() } }()
  58. Handle cancel event on Server subscribers: make(map[chan struct{}]int64), cancelled: make(map[int64]time.Time),

  59. User cancel running job c.Lock() c.cancelled[id] = time.Now().Add(time.Minute * 5)

    for subscriber, build := range c.subscribers { if id == build { close(subscriber) } } c.Unlock()
  60. Agent subscribe the cancel event for { select { case

    <-ctx.Done(): return false, ctx.Err() case <-time.After(time.Minute): c.Lock() _, ok := c.cancelled[id] c.Unlock() if ok { return true, nil } case <-subscriber: return true, nil } }
  61. case <-time.After(time.Minute): c.Lock() _, ok := c.cancelled[id] c.Unlock() if ok

    { return true, nil }
  62. case <-time.After(time.Minute): c.Lock() _, ok := c.cancelled[id] c.Unlock() if ok

    { return true, nil } 1 Cancel
  63. case <-time.After(time.Minute): c.Lock() _, ok := c.cancelled[id] c.Unlock() if ok

    { return true, nil } 1 2 Reconnect Server Cancel
  64. 感謝參與