Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Building Resilient Elixir Systems

Building Resilient Elixir Systems

Presented at GigCity Elixir - 2018

This was my attempt at describing a methodology for building systems in Elixir that can handle failures at all levels. It touches on technology solutions as well as how to engage humans in those solutions.

Chris Keathley

October 27, 2018
Tweet

More Decks by Chris Keathley

Other Decks in Programming

Transcript

  1. How to build reliable systems with your face (and not

    on your face) Chris Keathley / @ChrisKeathley / [email protected]
  2. Resilience: an ability to recover from or adjust easily to

    misfortune or change /ri-ˈzil-yən(t)s/
  3. Complex systems run in degraded mode. “…complex systems run as

    broken systems. The system continues to function because it contains so many redundancies and because people can make it function, despite the presence of many flaws… System operations are dynamic, with components (organizational, human, technical) failing and being replaced continuously.”
  4. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  5. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  6. defmodule Jenga.Application do use Application def start(_type, _args) do children

    = [ ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  7. defmodule Jenga.Application do use Application def start(_type, _args) do config

    = [ port: "PORT", db_url: "DB_URL", ] children = [ ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  8. defmodule Jenga.Application do use Application def start(_type, _args) do config

    = [ port: "PORT", db_url: "DB_URL", ] children = [ {Jenga.Config, config}, ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  9. defmodule Jenga.Config do use GenServer def start_link(desired_config) do GenServer.start_link(__MODULE__, desired_config,

    name: __MODULE__) end def init(desired) do :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table]) end end
  10. defmodule Jenga.Config do use GenServer def start_link(desired_config) do GenServer.start_link(__MODULE__, desired_config,

    name: __MODULE__) end def init(desired) do :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table]) case load_config(:jenga_config, desired) do :ok -> {:ok, %{table: :jenga_config, desired: desired}} end end end
  11. defmodule Jenga.Config do use GenServer def start_link(desired_config) do GenServer.start_link(__MODULE__, desired_config,

    name: __MODULE__) end def init(desired) do :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table]) case load_config(:jenga_config, desired) do :ok -> {:ok, %{table: :jenga_config, desired: desired}} :error -> {:stop, :could_not_load_config} end end end
  12. defmodule Jenga.Config do use GenServer def start_link(desired_config) do GenServer.start_link(__MODULE__, desired_config,

    name: __MODULE__) end def init(desired) do :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table]) case load_config(:jenga_config, desired) do :ok -> {:ok, %{table: :jenga_config, desired: desired}} :error -> {:stop, :could_not_load_config} end end defp load_config(table, config, retry_count \\ 0) defp load_config(_table, [], _), do: :ok defp load_config(_table, _, 10), do: :error defp load_config(table, [{k, v} | tail], retry_count) do case System.get_env(v) do nil -> load_config(table, [{k, v} | tail], retry_count + 1) value -> :ets.insert(table, {k, value}) load_config(table, tail, retry_count) end end end
  13. ** (Mix) Could not start application jenga: Jenga.Application.start(:normal, []) returned

    an error: shutdown: failed to start child: Jenga.Config ** (EXIT) :could_not_load_config
  14. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  15. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  16. App

  17. App

  18. App

  19. defmodule Jenga.Application do use Application def start(_type, _args) do config

    = [ port: "PORT", db_url: "DB_URL", ] children = [ {Jenga.Config, config}, ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  20. defmodule Jenga.Application do use Application def start(_type, _args) do config

    = [ port: "PORT", db_url: "DB_URL", ] children = [ {Jenga.Config, config}, JengaWeb.Endpoint, ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  21. defmodule JengaWeb.Endpoint do use Phoenix.Endpoint, otp_app: :jenga def init(_key, config)

    do port = Jenga.Config.get(:port) {:ok, Keyword.put(config, :http, [:inet6, port: port])} end end
  22. defmodule JengaWeb.UpController do use JengaWeb, :controller def up(conn, _params) do

    {code, message} = status() conn |> Plug.Conn.put_status(code) |> json(message) end defp status do {500, %{status: "LOADING"}} end end
  23. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  24. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  25. defmodule Jenga.DemoConnection do use GenServer def init(opts) do wait_for =

    3_000 + backoff() + jitter() Process.send_after(self(), {:try_connect, opts}, wait_for) {:ok, %{state: :disconnected}} end end
  26. defmodule Jenga.DemoConnection do use GenServer def init(opts) do wait_for =

    3_000 + backoff() + jitter() Process.send_after(self(), {:try_connect, opts}, wait_for) {:ok, %{state: :disconnected}} end def handle_info({:try_connect, opts}, _) do do_connect(opts) {:noreply, state} end end
  27. defmodule Jenga.DemoConnection do use GenServer def init(opts) do wait_for =

    3_000 + backoff() + jitter() Process.send_after(self(), {:try_connect, opts}, wait_for) {:ok, %{state: :disconnected}} end def handle_info(:try_connect, state) do case do_connect do :ok -> {:noreply, %{state | state: :connected}} :error -> wait_for = 3_000 + backoff() + jitter() Process.send_after(self(), :try_connect, wait_for) {:noreply, state} end end end
  28. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  29. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  30. defmodule JengaWeb.UpController do use JengaWeb, :controller def up(conn, _params) do

    {code, message} = status() conn |> Plug.Conn.put_status(code) |> json(message) end defp status do {500, %{status: "LOADING"}} end end
  31. defmodule JengaWeb.UpController do use JengaWeb, :controller def up(conn, _params) do

    {code, message} = status() conn |> Plug.Conn.put_status(code) |> json(message) end defp status do case Database.check_status() do :ok -> {200, %{status: "OK"}} _ -> {500, %{status: "LOADING"}} end end end
  32. defmodule Jenga.Database.Watchdog do use GenServer def init(:ok) do schedule_check() {:ok,

    %{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end end
  33. defmodule Jenga.Database.Watchdog do use GenServer def init(:ok) do schedule_check() {:ok,

    %{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end defp change_state(result, %{status: status, passing_checks: count}) do end end
  34. defmodule Jenga.Database.Watchdog do use GenServer def init(:ok) do schedule_check() {:ok,

    %{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end defp change_state(result, %{status: status, passing_checks: count}) do case {result, status, count} do {:ok, :connected, count} -> if count == 3 do :alarm_handler.clear_alarm(@alarm_id) end %{status: :connected, passing_checks: count + 1} {:ok, :degraded, _} -> %{status: :connected, passing_checks: 0} end end end
  35. defmodule Jenga.Database.Watchdog do use GenServer def init(:ok) do schedule_check() {:ok,

    %{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end defp change_state(result, %{status: status, passing_checks: count}) do case {result, status, count} do {:ok, :connected, count} -> if count == 3 do :alarm_handler.clear_alarm(@alarm_id) end %{status: :connected, passing_checks: count + 1} {:ok, :degraded, _} -> %{status: :connected, passing_checks: 0} {:error, :connected, _} -> :alarm_handler.set_alarm({@alarm_id, "We cannot connect to the database"}) %{status: :degraded, passing_checks: 0} {:error, :degraded, _} -> %{status: :degraded, passing_checks: 0} end end end
  36. defmodule Jenga.Application do use Application def start(_type, _args) do config

    = [ port: "PORT", db_url: "DB_URL", ] children = [ {Jenga.Config, config}, JengaWeb.Endpoint, Jenga.Database.Supervisor, ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  37. defmodule Jenga.Application do use Application def start(_type, _args) do config

    = [ port: "PORT", db_url: "DB_URL", ] :gen_event.swap_handler( :alarm_handler, {:alarm_handler, :swap}, {Jenga.AlarmHandler, :ok}) children = [ {Jenga.Config, config}, JengaWeb.Endpoint, Jenga.Database.Supervisor, ] opts = [strategy: :one_for_one, name: Jenga.Supervisor] Supervisor.start_link(children, opts) end end
  38. defmodule Jenga.AlarmHandler do require Logger def init({:ok, {:alarm_handler, _old_alarms}}) do

    Logger.info("Installing alarm handler") {:ok, %{}} end def handle_event({:set_alarm, :database_disconnected}, alarms) do send_alert_to_slack(database_alarm()) {:ok, alarms} end def handle_event({:clear_alarm, :database_disconnected}, alarms) do send_recovery_to_slack(database_alarm()) {:ok, alarms} end def handle_event(event, state) do Logger.info("Unhandled alarm event: #{inspect(event)}") {:ok, state} end end
  39. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  40. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control
  41. defmodule Jenga.ExternalService do def fetch(params) do with :ok <- :fuse.ask(@fuse,

    :async_dirty), {:ok, result} <- make_call(params) do {:ok, result} else {:error, e} -> :ok = :fuse.melt(@fuse) {:error, e} :blown -> {:error, :service_is_down} end end end
  42. Let's talk about… Booting the runtime & Configuration Starting dependencies

    Connecting to external systems Alarms and feedback Communicating with services we don’t control