Upgrade to Pro — share decks privately, control downloads, hide ads and more …

自動化監控伺服器工具 - Gatus

Bo-Yi Wu
July 26, 2022

自動化監控伺服器工具 - Gatus

1. Why not Prometheus Alertmanager, CloudWatch, or even Splunk?
2. 為什麼我選擇 Gatus?
3. Gatus 運作流程
4. Gatus 系統架構
5. Scalability (Distributed Approach)

Bo-Yi Wu

July 26, 2022
Tweet

More Decks by Bo-Yi Wu

Other Decks in Technology

Transcript

  1. About me • Software Engineer at MediaTek (AIDE) • Member

    of Drone CI/CD Platform • Member of Gitea Platform • Member of Gin Golang Framework • Maintain Some GitHub Actions Plugins.
  2. Conditions • [STATUS] == 200 • [STATUS] < 300 •

    [RESPONSE_TIME] < 500 • [BODY].user.name == John • len([BODY].data) < 10
  3. 客製化監控協定 • HTTP (REST API, GraphQL) • ICMP (ping) •

    DNS (A, AAAA, CNAME, MX, NS) • TCP (Database) • TLS (LDAP, HTTPS, mail servers) • STARTTLS (mail servers)
  4. 支援多種 Alert 機制 • Discord • Email • Google Chat

    • Matrix • Mattermost • Slack • Teams • Telegram • Twilio • PagerDuty • Opsgenie • Custom
  5. endpoints: - name: monitoring group: internal url: "https://example.org/" interval: 5m

    conditions: - "[STATUS] == 200" - name: example-dns-query url: "1.1.1.1" interval: 5m dns: query-name: "example.com" query-type: "A" conditions: - "[BODY] == 93.184.216.34" - "[DNS_RCODE] == NOERROR" - name: icmp-ping url: "icmp://example.org" interval: 1m conditions: - "[CONNECTED] == true" Memory SQLite Postgres 新增資料
  6. endpoints: - name: monitoring group: internal url: "https://example.org/" interval: 5m

    conditions: - "[STATUS] == 200" - name: example-dns-query url: "1.1.1.1" interval: 5m dns: query-name: "example.com" query-type: "A" conditions: - "[BODY] == 93.184.216.34" - "[DNS_RCODE] == NOERROR" Memory SQLite Postgres 移除資料
  7. err := store.Initialize(cfg.Storage) if err != nil { panic(err) }

    var keys []string for _, endpoint := range cfg.Endpoints { keys = append(keys, endpoint.Key()) } numberOfDeleted := store.Get().DeleteAllEndpointStatusesNotInKeys(keys) if numberOfDeleted > 0 { }
  8. endpoints: - name: front-end group: core url: "https://twin.sh/health" interval: 5m

    conditions: - "[STATUS] == 200" - "[BODY].status == UP" - "[RESPONSE_TIME] < 150" - name: back-end group: core url: "https://example.org/" interval: 5m conditions: - "[STATUS] == 200" - "[CERTIFICATE_EXPIRATION] > 48h"
  9. endpoints: - name: front-end group: core url: "https://twin.sh/health" interval: 5m

    conditions: - "[STATUS] == 200" - "[BODY].status == UP" - "[RESPONSE_TIME] < 150" - name: back-end group: core url: "https://example.org/" interval: 5m conditions: - "[STATUS] == 200" - "[CERTIFICATE_EXPIRATION] > 48h"
  10. for _, endpoint := range cfg.Endpoints { if endpoint.IsEnabled() {

    go monitor(endpoint, cfg.Alerting, cfg.Maintenance, …) } } 讀取所有 Endpoint 資料 背景處理資料
  11. // Run it immediately on start execute(endpoint, alertingConfig …) //

    Loop for the next executions for { select { case <-ctx.Done(): return case <-time.After(endpoint.Interval): execute(endpoint, alertingConfig …) } }
  12. for _, endpoint := range cfg.Endpoints { if endpoint.IsEnabled() {

    time.Sleep(777 * time.Millisecond) go monitor(endpoint, cfg.Alerting, cfg.Maintenance, …) } }
  13. var router http.Handler = handler.CreateRouter(ui.StaticFolder, securityConfig …) server = &http.Server{

    Addr: fmt.Sprintf("%s:%d", webConfig.Address, webConfig.Port), Handler: router, ReadTimeout: 15 * time.Second, WriteTimeout: 15 * time.Second, IdleTimeout: 15 * time.Second, } log.Println("[controller][Handle] Listening on " + webConfig.SocketAddress()) log.Println("[controller][Handle]", server.ListenAndServe())
  14. for { time.Sleep(30 * time.Second) if cfg.HasLoadedConfigurationFileBeenModified() { stop() time.Sleep(time.Second)

    save() updatedConfig, err := loadConfiguration() if err != nil { if cfg.SkipInvalidConfigUpdate { cfg.UpdateLastFileModTime() continue } else { panic(err) } } initializeStorage(updatedConfig) start(updatedConfig) return } }   
  15. ctx, cancel = context.WithCancel(context.Background()) for _, endpoint := range cfg.Endpoints

    { if endpoint.IsEnabled() { time.Sleep(777 * time.Millisecond) go monitor(endpoint, cfg.Alerting, ctx) } } Shutdown WatchDog
  16. // Loop for the next executions for { select {

    case <-ctx.Done(): return case <-time.After(endpoint.Interval): execute(endpoint, alertingConfig …) } } Shutdown WatchDog