Building Resilient Elixir Systems

Building Resilient Elixir Systems

Presented at GigCity Elixir - 2018

This was my attempt at describing a methodology for building systems in Elixir that can handle failures at all levels. It touches on technology solutions as well as how to engage humans in those solutions.

06f8b41980eb4c577fa40c41d5030c19?s=128

Chris Keathley

October 27, 2018
Tweet

Transcript

  1. 4.

    How to build reliable systems with your face (and not

    on your face) Chris Keathley / @ChrisKeathley / c@keathey.io
  2. 6.
  3. 7.
  4. 9.

    Resilience: an ability to recover from or adjust easily to misfortune or change /ri-ˈzil-yən(t)s/
  5. 10.
  6. 11.

    Complex systems run in degraded mode. “…complex systems run as

    broken systems. The system continues to function because it contains so many redundancies and because people can make it function, despite the presence of many flaws… System operations are dynamic, with components (organizational, human, technical) failing and being replaced continuously.”
  7. 14.
  8. 22.
  9. 24.
  10. 43.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  11. 44.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  12. 45.
  13. 49.
  14. 52.
  15. 60.
  16. 61.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Boots the top-level supervisor, which has no children yet.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # The supervisor is registered as Jenga.Supervisor.
        Supervisor.start_link([], strategy: :one_for_one, name: Jenga.Supervisor)
      end
    end
  17. 62.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Declares which environment variable backs each setting and boots the
      (still empty) top-level supervisor.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # Setting name => name of the environment variable it is read from.
        # Nothing consumes this list yet.
        _config = [port: "PORT", db_url: "DB_URL"]

        Supervisor.start_link([], strategy: :one_for_one, name: Jenga.Supervisor)
      end
    end
  18. 63.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Starts `Jenga.Config` under the top-level supervisor.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # Setting name => name of the environment variable it is read from.
        env_vars = [port: "PORT", db_url: "DB_URL"]

        Supervisor.start_link(
          [{Jenga.Config, env_vars}],
          strategy: :one_for_one,
          name: Jenga.Supervisor
        )
      end
    end
  19. 66.

    defmodule Jenga.Config do
      @moduledoc """
      GenServer that owns the `:jenga_config` ETS table.

      The table is created `:protected`, so only this process can write to it.
      """

      use GenServer

      @doc """
      Starts the config server, registered under this module's name.

      `desired_config` is passed through to `init/1` unchanged.
      """
      def start_link(desired_config) do
        GenServer.start_link(__MODULE__, desired_config, name: __MODULE__)
      end

      @impl true
      def init(desired) do
        # With :named_table, :ets.new/2 returns the table name, so this match
        # doubles as an assertion that the table was created.
        :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table])

        # init/1 must return {:ok, state}; returning the bare table name would
        # make start_link fail with a bad return value.
        {:ok, %{table: :jenga_config, desired: desired}}
      end
    end
  20. 67.

    defmodule Jenga.Config do
      @moduledoc """
      GenServer that owns the `:jenga_config` ETS table and fills it while booting.
      """

      use GenServer

      @doc """
      Starts the config server, registered under this module's name.
      """
      def start_link(desired_config) do
        GenServer.start_link(__MODULE__, desired_config, name: __MODULE__)
      end

      @impl true
      def init(desired) do
        :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table])

        # load_config/2 is not part of this snippet. Only the :ok outcome is
        # handled, so any other result raises a CaseClauseError.
        case load_config(:jenga_config, desired) do
          :ok ->
            state = %{table: :jenga_config, desired: desired}
            {:ok, state}
        end
      end
    end
  21. 68.

    defmodule Jenga.Config do
      @moduledoc """
      GenServer that owns the `:jenga_config` ETS table and fills it while booting.

      Startup is refused when the configuration cannot be loaded.
      """

      use GenServer

      @doc """
      Starts the config server, registered under this module's name.
      """
      def start_link(desired_config) do
        GenServer.start_link(__MODULE__, desired_config, name: __MODULE__)
      end

      @impl true
      def init(desired) do
        :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table])

        # load_config/2 is not part of this snippet.
        case load_config(:jenga_config, desired) do
          :ok ->
            state = %{table: :jenga_config, desired: desired}
            {:ok, state}

          :error ->
            # Stopping here makes start_link/1 return an error.
            {:stop, :could_not_load_config}
        end
      end
    end
  22. 69.

    defmodule Jenga.Config do
      @moduledoc """
      GenServer that loads configuration from environment variables into the
      `:jenga_config` ETS table at boot and refuses to start when a variable
      is missing.
      """

      use GenServer

      @table :jenga_config
      @max_retries 10

      @doc """
      Starts the config server, registered under this module's name.

      `desired_config` is a keyword list mapping each setting to the name of
      the environment variable that holds it, e.g. `[port: "PORT"]`.
      """
      def start_link(desired_config) do
        GenServer.start_link(__MODULE__, desired_config, name: __MODULE__)
      end

      @doc """
      Returns the value stored for `key`, or `nil` when it was never loaded.

      Reads the table directly instead of calling the server; a `:protected`
      table is readable from any process.
      """
      @spec get(atom()) :: String.t() | nil
      def get(key) do
        case :ets.lookup(@table, key) do
          [{^key, value}] -> value
          [] -> nil
        end
      end

      @impl true
      def init(desired) do
        :jenga_config = :ets.new(@table, [:set, :protected, :named_table])

        case load_config(@table, desired) do
          :ok -> {:ok, %{table: @table, desired: desired}}
          :error -> {:stop, :could_not_load_config}
        end
      end

      # Copies each {key, env_var} pair into the table. A missing variable is
      # retried until the shared retry budget is used up, then :error is returned.
      # NOTE(review): retries run back-to-back with no delay, so they only help
      # if the environment changes concurrently. Confirm whether a pause was
      # intended.
      defp load_config(table, config, retry_count \\ 0)
      defp load_config(_table, [], _), do: :ok
      defp load_config(_table, _, @max_retries), do: :error

      defp load_config(table, [{k, v} | tail], retry_count) do
        case System.get_env(v) do
          nil ->
            load_config(table, [{k, v} | tail], retry_count + 1)

          value ->
            :ets.insert(table, {k, value})
            load_config(table, tail, retry_count)
        end
      end
    end
  23. 70.

    ** (Mix) Could not start application jenga: Jenga.Application.start(:normal, []) returned

    an error: shutdown: failed to start child: Jenga.Config ** (EXIT) :could_not_load_config
  24. 71.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  25. 72.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  26. 73.

    App

  27. 76.

    App

  28. 77.

    App

  29. 79.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Supervises `Jenga.Config`, handing it the environment-variable mapping.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # Setting name => name of the environment variable it is read from.
        env_vars = [port: "PORT", db_url: "DB_URL"]
        child_specs = [{Jenga.Config, env_vars}]

        Supervisor.start_link(child_specs, strategy: :one_for_one, name: Jenga.Supervisor)
      end
    end
  30. 80.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Supervises `Jenga.Config` and `JengaWeb.Endpoint`, in that order.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # Setting name => name of the environment variable it is read from.
        env_vars = [port: "PORT", db_url: "DB_URL"]

        # Children start in list order, so the config server comes up before
        # the endpoint.
        child_specs = [
          {Jenga.Config, env_vars},
          JengaWeb.Endpoint
        ]

        Supervisor.start_link(child_specs, strategy: :one_for_one, name: Jenga.Supervisor)
      end
    end
  31. 81.

    defmodule JengaWeb.Endpoint do
      use Phoenix.Endpoint, otp_app: :jenga

      # Overrides the :http options with the port held by Jenga.Config, and
      # enables :inet6.
      def init(_key, config) do
        http_opts = [:inet6, port: Jenga.Config.get(:port)]
        {:ok, Keyword.put(config, :http, http_opts)}
      end
    end
  32. 82.

    defmodule JengaWeb.UpController do
      use JengaWeb, :controller

      # Health-check action: replies with the status code and JSON body
      # produced by status/0.
      def up(conn, _params) do
        {code, message} = status()

        conn
        |> Plug.Conn.put_status(code)
        |> json(message)
      end

      # Always reports 500/"LOADING" for now.
      # The string uses plain ASCII double quotes; typographic quotes are not
      # valid string delimiters in Elixir.
      defp status do
        {500, %{status: "LOADING"}}
      end
    end
  33. 83.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  34. 84.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  35. 96.

    defmodule Jenga.DemoConnection do
      use GenServer

      # Starts in the :disconnected state and schedules the first connection
      # attempt instead of connecting inside init/1.
      # backoff/0 and jitter/0 are not part of this snippet.
      @impl true
      def init(opts) do
        delay_ms = 3_000 + backoff() + jitter()
        Process.send_after(self(), {:try_connect, opts}, delay_ms)

        {:ok, %{state: :disconnected}}
      end
    end
  36. 97.

    defmodule Jenga.DemoConnection do
      use GenServer

      # Starts in the :disconnected state and schedules the first connection
      # attempt instead of connecting inside init/1.
      # backoff/0, jitter/0 and do_connect/1 are not part of this snippet.
      @impl true
      def init(opts) do
        wait_for = 3_000 + backoff() + jitter()
        Process.send_after(self(), {:try_connect, opts}, wait_for)
        {:ok, %{state: :disconnected}}
      end

      # The state argument must be bound here: the reply tuple returns it, and
      # matching it as `_` leaves `state` undefined.
      @impl true
      def handle_info({:try_connect, opts}, state) do
        do_connect(opts)
        {:noreply, state}
      end
    end
  37. 98.

    defmodule Jenga.DemoConnection do
      use GenServer

      # Starts in the :disconnected state and schedules the first connection
      # attempt instead of connecting inside init/1.
      @impl true
      def init(opts) do
        schedule_connect(opts)
        {:ok, %{state: :disconnected}}
      end

      # Tries to connect. On failure another attempt is scheduled and the
      # state is left untouched.
      # The message shape matches what schedule_connect/1 sends; init used to
      # send {:try_connect, opts} while this clause only matched :try_connect,
      # so the first attempt was never handled.
      # NOTE(review): do_connect/1 is not part of this snippet; it is assumed
      # to take the connection opts and return :ok | :error. Confirm.
      @impl true
      def handle_info({:try_connect, opts}, state) do
        case do_connect(opts) do
          :ok ->
            {:noreply, %{state | state: :connected}}

          :error ->
            schedule_connect(opts)
            {:noreply, state}
        end
      end

      # Sends {:try_connect, opts} to this process after the base delay plus
      # backoff/0 and jitter/0 (neither is part of this snippet).
      defp schedule_connect(opts) do
        wait_for = 3_000 + backoff() + jitter()
        Process.send_after(self(), {:try_connect, opts}, wait_for)
      end
    end
  38. 99.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  39. 100.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  40. 107.

    defmodule JengaWeb.UpController do
      use JengaWeb, :controller

      # Health-check action: replies with the status code and JSON body
      # produced by status/0.
      def up(conn, _params) do
        {code, message} = status()

        conn
        |> Plug.Conn.put_status(code)
        |> json(message)
      end

      # Always reports 500/"LOADING" for now.
      # The string uses plain ASCII double quotes; typographic quotes are not
      # valid string delimiters in Elixir.
      defp status do
        {500, %{status: "LOADING"}}
      end
    end
  41. 108.

    defmodule JengaWeb.UpController do
      use JengaWeb, :controller

      # Health-check action: 200/"OK" when the database check passes,
      # otherwise 500/"LOADING".
      def up(conn, _params) do
        {http_code, body} = status()

        conn
        |> Plug.Conn.put_status(http_code)
        |> json(body)
      end

      # NOTE(review): `Database` is not aliased in this snippet. Presumably it
      # resolves to Jenga.Database; confirm.
      defp status, do: response_for(Database.check_status())

      # Anything other than :ok counts as not ready.
      defp response_for(:ok), do: {200, %{status: "OK"}}
      defp response_for(_other), do: {500, %{status: "LOADING"}}
    end
  42. 113.
  43. 119.

    defmodule Jenga.Database.Watchdog do
      use GenServer

      # Starts out :degraded with no passing checks recorded.
      # schedule_check/0 and change_state/2 are not part of this snippet.
      @impl true
      def init(:ok) do
        schedule_check()

        initial = %{status: :degraded, passing_checks: 0}
        {:ok, initial}
      end

      # Runs the database check, folds the result into the state, then
      # schedules the next check.
      @impl true
      def handle_info(:check_db, state) do
        next_state = change_state(Jenga.Database.check_status(), state)
        schedule_check()

        {:noreply, next_state}
      end
    end
  44. 120.

    defmodule Jenga.Database.Watchdog do
      use GenServer

      # Starts out :degraded with no passing checks recorded.
      # schedule_check/0 is not part of this snippet.
      @impl true
      def init(:ok) do
        schedule_check()

        initial = %{status: :degraded, passing_checks: 0}
        {:ok, initial}
      end

      # Runs the database check, folds the result into the state, then
      # schedules the next check.
      @impl true
      def handle_info(:check_db, state) do
        next_state = change_state(Jenga.Database.check_status(), state)
        schedule_check()

        {:noreply, next_state}
      end

      # Placeholder: the transition rules are not written yet, so this
      # evaluates to nil for any state map that carries both keys.
      defp change_state(_result, %{status: _status, passing_checks: _count}) do
        nil
      end
    end
  45. 121.

    defmodule Jenga.Database.Watchdog do
      use GenServer

      # Id under which the database alarm is cleared. Without this definition
      # the attribute would evaluate to nil.
      # NOTE(review): the value is chosen to match the :database_disconnected
      # id that Jenga.AlarmHandler handles. Confirm.
      @alarm_id :database_disconnected

      # Starts out :degraded with no passing checks recorded.
      # schedule_check/0 is not part of this snippet.
      @impl true
      def init(:ok) do
        schedule_check()
        {:ok, %{status: :degraded, passing_checks: 0}}
      end

      # Runs the database check, folds the result into the state, then
      # schedules the next check.
      @impl true
      def handle_info(:check_db, state) do
        status = Jenga.Database.check_status()
        state = change_state(status, state)
        schedule_check()
        {:noreply, state}
      end

      # Transition rules for passing checks. Failing checks are not handled
      # yet and raise a CaseClauseError.
      defp change_state(result, %{status: status, passing_checks: count}) do
        case {result, status, count} do
          {:ok, :connected, count} ->
            # The alarm is cleared on the check that finds the counter at 3.
            if count == 3 do
              :alarm_handler.clear_alarm(@alarm_id)
            end

            %{status: :connected, passing_checks: count + 1}

          {:ok, :degraded, _} ->
            %{status: :connected, passing_checks: 0}
        end
      end
    end
  46. 122.

    defmodule Jenga.Database.Watchdog do
      use GenServer

      # Id under which the database alarm is set and cleared. Without this
      # definition the attribute would evaluate to nil.
      # NOTE(review): the value is chosen to match the :database_disconnected
      # id that Jenga.AlarmHandler handles. Confirm.
      @alarm_id :database_disconnected

      # Starts out :degraded with no passing checks recorded.
      # schedule_check/0 is not part of this snippet.
      @impl true
      def init(:ok) do
        schedule_check()
        {:ok, %{status: :degraded, passing_checks: 0}}
      end

      # Runs the database check, folds the result into the state, then
      # schedules the next check.
      @impl true
      def handle_info(:check_db, state) do
        status = Jenga.Database.check_status()
        state = change_state(status, state)
        schedule_check()
        {:noreply, state}
      end

      # State machine over {check result, current status, passing checks}.
      defp change_state(result, %{status: status, passing_checks: count}) do
        case {result, status, count} do
          {:ok, :connected, count} ->
            # The alarm is cleared on the check that finds the counter at 3.
            if count == 3 do
              :alarm_handler.clear_alarm(@alarm_id)
            end

            %{status: :connected, passing_checks: count + 1}

          {:ok, :degraded, _} ->
            %{status: :connected, passing_checks: 0}

          {:error, :connected, _} ->
            # The alarm is raised only on the connected -> degraded transition.
            # The description is delimited with plain ASCII quotes; a
            # typographic closing quote leaves the string unterminated.
            :alarm_handler.set_alarm({@alarm_id, "We cannot connect to the database"})
            %{status: :degraded, passing_checks: 0}

          {:error, :degraded, _} ->
            %{status: :degraded, passing_checks: 0}
        end
      end
    end
  47. 124.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Supervises `Jenga.Config`, `JengaWeb.Endpoint` and
      `Jenga.Database.Supervisor`, in that order.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # Setting name => name of the environment variable it is read from.
        # Both strings use plain ASCII double quotes; a typographic opening
        # quote is not a valid string delimiter.
        config = [
          port: "PORT",
          db_url: "DB_URL"
        ]

        children = [
          {Jenga.Config, config},
          JengaWeb.Endpoint,
          Jenga.Database.Supervisor
        ]

        opts = [strategy: :one_for_one, name: Jenga.Supervisor]
        Supervisor.start_link(children, opts)
      end
    end
  48. 125.

    defmodule Jenga.Application do
      @moduledoc """
      OTP application callback module for Jenga.

      Swaps in `Jenga.AlarmHandler` as the alarm handler, then supervises
      `Jenga.Config`, `JengaWeb.Endpoint` and `Jenga.Database.Supervisor`.
      """

      use Application

      @impl true
      def start(_type, _args) do
        # Setting name => name of the environment variable it is read from.
        # Both strings use plain ASCII double quotes; a typographic opening
        # quote is not a valid string delimiter.
        config = [
          port: "PORT",
          db_url: "DB_URL"
        ]

        # Replace the default handler before any child starts.
        :gen_event.swap_handler(
          :alarm_handler,
          {:alarm_handler, :swap},
          {Jenga.AlarmHandler, :ok}
        )

        children = [
          {Jenga.Config, config},
          JengaWeb.Endpoint,
          Jenga.Database.Supervisor
        ]

        opts = [strategy: :one_for_one, name: Jenga.Supervisor]
        Supervisor.start_link(children, opts)
      end
    end
  49. 126.

    defmodule Jenga.AlarmHandler do
      @moduledoc """
      `:gen_event` handler for `:alarm_handler` events.

      Database alarms are passed to `send_alert_to_slack/1` and
      `send_recovery_to_slack/1` (neither is part of this snippet). Every other
      event is logged and ignored.
      """

      require Logger

      # Matches the argument shape produced by swapping this handler in with
      # :ok as its argument: {:ok, {:alarm_handler, old_alarms}}.
      def init({:ok, {:alarm_handler, _old_alarms}}) do
        Logger.info("Installing alarm handler")
        {:ok, %{}}
      end

      # :alarm_handler.set_alarm({id, description}) emits
      # {:set_alarm, {id, description}}, so the id is matched inside the tuple.
      def handle_event({:set_alarm, {:database_disconnected, _description}}, alarms) do
        send_alert_to_slack(database_alarm())
        {:ok, alarms}
      end

      # Kept so an alarm raised with a bare id is still reported.
      def handle_event({:set_alarm, :database_disconnected}, alarms) do
        send_alert_to_slack(database_alarm())
        {:ok, alarms}
      end

      # :alarm_handler.clear_alarm(id) emits {:clear_alarm, id}.
      def handle_event({:clear_alarm, :database_disconnected}, alarms) do
        send_recovery_to_slack(database_alarm())
        {:ok, alarms}
      end

      # Catch-all so unknown events never crash the handler.
      def handle_event(event, state) do
        Logger.info("Unhandled alarm event: #{inspect(event)}")
        {:ok, state}
      end
    end
  50. 127.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  51. 128.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  52. 137.

    defmodule Jenga.ExternalService do
      # Calls the external service only while the fuse reports :ok.
      # @fuse and make_call/1 are not part of this snippet.
      #
      # Returns:
      #   {:ok, result}               - the call succeeded
      #   {:error, reason}            - a step returned an error; the fuse is melted
      #   {:error, :service_is_down}  - the fuse is blown, no call was attempted
      def fetch(params) do
        with :ok <- :fuse.ask(@fuse, :async_dirty),
             {:ok, _result} = success <- make_call(params) do
          success
        else
          :blown ->
            {:error, :service_is_down}

          {:error, reason} ->
            # Record the failure against the fuse before handing the error back.
            :ok = :fuse.melt(@fuse)
            {:error, reason}
        end
      end
    end
  53. 146.

    Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  54. 148.