This was my attempt at describing a methodology for building systems in Elixir that can handle failures at all levels. It touches on technology solutions as well as how to engage humans in those solutions.
broken systems. The system continues to function because it contains so many redundancies and because people can make it function, despite the presence of many flaws… System operations are dynamic, with components (organizational, human, technical) failing and being replaced continuously.”
name: __MODULE__) end def init(desired) do :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table]) case load_config(:jenga_config, desired) do :ok -> {:ok, %{table: :jenga_config, desired: desired}} end end end
name: __MODULE__) end def init(desired) do :jenga_config = :ets.new(:jenga_config, [:set, :protected, :named_table]) case load_config(:jenga_config, desired) do :ok -> {:ok, %{table: :jenga_config, desired: desired}} :error -> {:stop, :could_not_load_config} end end end
{code, message} = status() conn |> Plug.Conn.put_status(code) |> json(message) end defp status do case Database.check_status() do :ok -> {200, %{status: "OK"}} _ -> {500, %{status: "LOADING"}} end end end
%{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end end
%{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end defp change_state(result, %{status: status, passing_checks: count}) do end end
%{status: :degraded, passing_checks: 0}} end def handle_info(:check_db, state) do status = Jenga.Database.check_status() state = change_state(status, state) schedule_check() {:noreply, state} end defp change_state(result, %{status: status, passing_checks: count}) do case {result, status, count} do {:ok, :connected, count} -> if count == 3 do :alarm_handler.clear_alarm(@alarm_id) end %{status: :connected, passing_checks: count + 1} {:ok, :degraded, _} -> %{status: :connected, passing_checks: 0} end end end
Logger.info("Installing alarm handler") {:ok, %{}} end def handle_event({:set_alarm, :database_disconnected}, alarms) do send_alert_to_slack(database_alarm()) {:ok, alarms} end def handle_event({:clear_alarm, :database_disconnected}, alarms) do send_recovery_to_slack(database_alarm()) {:ok, alarms} end def handle_event(event, state) do Logger.info("Unhandled alarm event: #{inspect(event)}") {:ok, state} end end