Building Resilient Systems with OTP: Let It Crash

tags: #Elixir #Otp #Erlang #Architecture #Resilience #Fault-Tolerance
categories: Development Architecture
published: 2024-03-15
reading time: 7 minutes

OTP isn’t just a framework - it’s a philosophy for building systems that survive failures. Learn how supervisors, GenServers, and the “let it crash” mentality create applications that heal themselves.

The Problem with Traditional Error Handling

Most programming languages teach you to catch every error, handle every edge case, and never let your program crash. This leads to defensive code littered with try-catch blocks, null checks, and complex error recovery logic that often makes things worse.

What if there was a better way? What if crashes were good?

Enter OTP: Open Telecom Platform

OTP was built by Ericsson to run telephone switches - systems that need 99.9999999% uptime (31ms of downtime per year). The secret isn’t preventing crashes. It’s recovering from them instantly.

Core OTP Principles:

Let it crash - Don’t try to handle every error. Let processes fail fast.
Isolation - Failures in one process don’t affect others.
Supervision - Parent processes monitor and restart failed children.
State separation - Keep state minimal and recoverable.

The Building Blocks

GenServer: Your Workhorse Process

GenServer is a generic server process. It handles synchronous calls, asynchronous casts, and maintains state.

defmodule PaymentProcessor do
  @moduledoc """
  Named GenServer that charges cards and keeps running counters of successful
  and failed charge attempts.
  """
  use GenServer

  # --- Client API ---

  @doc "Starts the processor and registers it under the module name."
  def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__)

  @doc """
  Synchronously charges `amount` against `card_token`.

  Returns `{:ok, transaction_id}` or `{:error, reason}` as produced by the charge.
  """
  def process_payment(amount, card_token),
    do: GenServer.call(__MODULE__, {:process, amount, card_token})

  # --- Server callbacks ---

  @impl true
  def init(_opts), do: {:ok, %{processed: 0, failed: 0}}

  @impl true
  def handle_call({:process, amount, card_token}, _from, counters) do
    # Only the two tagged outcomes are matched; anything else (or a raise
    # inside charge_card/2) crashes this process on purpose.
    case charge_card(amount, card_token) do
      {:ok, _transaction_id} = reply ->
        {:reply, reply, Map.update!(counters, :processed, &(&1 + 1))}

      {:error, _reason} = reply ->
        {:reply, reply, Map.update!(counters, :failed, &(&1 + 1))}
    end
  end

  # This might fail - network issues, invalid card, etc.
  defp charge_card(amount, card_token), do: StripeAPI.charge(amount, card_token)
end

If charge_card/2 raises an exception, the GenServer crashes. That’s okay - we’ll handle it at the supervision level.

Supervisors: The Safety Net

Supervisors monitor processes and restart them when they crash.

defmodule MyApp.Application do
  @moduledoc """
  Application entry point: starts the top-level supervisor with a
  `:one_for_one` strategy.
  """
  use Application

  @impl true
  def start(_type, _args) do
    Supervisor.start_link(children(), strategy: :one_for_one, name: MyApp.Supervisor)
  end

  # Children are started in list order.
  defp children do
    [
      # Database connection pool
      {MyApp.Repo, []},
      # Payment processor - restart if it crashes
      {PaymentProcessor, []},
      # Worker pool for background jobs
      {Task.Supervisor, name: MyApp.TaskSupervisor},
      # Web endpoint
      MyAppWeb.Endpoint
    ]
  end
end

Supervision Strategies:

:one_for_one - If a child crashes, restart only that child
:one_for_all - If a child crashes, restart all children
:rest_for_one - If a child crashes, restart it and all children started after it

Real-World Example: API Client with Circuit Breaker

Let’s build a resilient external API client that handles failures gracefully.

defmodule ExternalAPI.Client do
  @moduledoc """
  Circuit-breaking client for the external user API.

  A single named GenServer serialises requests and counts consecutive
  failures. Once `@max_failures` is reached the circuit opens and calls are
  rejected with `{:error, :circuit_open}` until `@reset_timeout` has elapsed;
  the next call is then let through as a trial request (half-open).

  HTTP errors and non-200 responses are counted as failures. A malformed JSON
  body makes `Jason.decode!/1` raise, which crashes the process on purpose so
  the supervisor can restart it with a closed circuit.
  """
  use GenServer
  require Logger

  @max_failures 5
  @reset_timeout :timer.seconds(60)

  # HTTPoison's `:timeout` only bounds connection establishment; waiting for
  # the response is bounded separately by `:recv_timeout`.
  @connect_timeout 5_000
  @recv_timeout 5_000

  # GenServer.call/2 defaults to 5_000 ms, which is shorter than the worst-case
  # HTTP attempt (connect + receive). With the default, callers would exit with
  # a timeout while the server is still blocked in the request, so the call
  # timeout is kept strictly above that worst case.
  @call_timeout @connect_timeout + @recv_timeout + 1_000

  defmodule State do
    @moduledoc false
    defstruct [
      :circuit_state,  # :closed | :open | :half_open
      :failure_count,
      :last_failure_time
    ]
  end

  # Client API

  @doc "Starts the client and registers it under the module name."
  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @doc """
  Fetches the user with `user_id`.

  Returns `{:ok, decoded_body}` on HTTP 200, `{:error, :circuit_open}` while
  the circuit is open (or when this call trips it), and `{:error, reason}` for
  any other failed attempt.
  """
  def fetch_user(user_id) do
    GenServer.call(__MODULE__, {:fetch_user, user_id}, @call_timeout)
  end

  # Server Callbacks

  @impl true
  def init(_opts) do
    state = %State{
      circuit_state: :closed,
      failure_count: 0,
      last_failure_time: nil
    }

    {:ok, state}
  end

  @impl true
  def handle_call({:fetch_user, user_id}, _from, %State{circuit_state: :open} = state) do
    if should_attempt_reset?(state) do
      # Try half-open state: let exactly this request through as a probe.
      new_state = %{state | circuit_state: :half_open}
      attempt_request(user_id, new_state)
    else
      {:reply, {:error, :circuit_open}, state}
    end
  end

  def handle_call({:fetch_user, user_id}, _from, state) do
    attempt_request(user_id, state)
  end

  # Performs the HTTP request and turns the outcome into a GenServer reply.
  defp attempt_request(user_id, state) do
    request_opts = [timeout: @connect_timeout, recv_timeout: @recv_timeout]

    case HTTPoison.get("https://api.example.com/users/#{user_id}", [], request_opts) do
      {:ok, %{status_code: 200, body: body}} ->
        # Success - reset circuit
        new_state = %{state | circuit_state: :closed, failure_count: 0}
        {:reply, {:ok, Jason.decode!(body)}, new_state}

      {:ok, %{status_code: status}} ->
        handle_failure(state, "HTTP #{status}")

      {:error, %HTTPoison.Error{reason: reason}} ->
        handle_failure(state, reason)
    end
  end

  # Records a failed attempt and opens the circuit once the threshold is hit.
  # A failed half-open probe still carries the old count, so it re-opens the
  # circuit immediately and refreshes last_failure_time.
  defp handle_failure(state, reason) do
    Logger.warning("API request failed: #{inspect(reason)}")

    new_failure_count = state.failure_count + 1

    new_state = %{
      state
      | failure_count: new_failure_count,
        last_failure_time: System.monotonic_time(:second)
    }

    if new_failure_count >= @max_failures do
      Logger.error("Circuit breaker opened after #{new_failure_count} failures")
      {:reply, {:error, :circuit_open}, %{new_state | circuit_state: :open}}
    else
      {:reply, {:error, reason}, new_state}
    end
  end

  # Only called while the circuit is open, so last_failure_time is always set.
  # @reset_timeout is in milliseconds; the monotonic clock is read in seconds.
  defp should_attempt_reset?(%{last_failure_time: last_failure}) do
    System.monotonic_time(:second) - last_failure > div(@reset_timeout, 1000)
  end
end

Now supervise it:

defmodule ExternalAPI.Supervisor do
  @moduledoc """
  Supervises the API client and its health check, restarting each one
  independently (at most 10 restarts within 60 seconds).
  """
  use Supervisor

  @doc "Starts the supervisor and registers it under the module name."
  def start_link(opts), do: Supervisor.start_link(__MODULE__, opts, name: __MODULE__)

  @impl true
  def init(_opts) do
    restart_policy = [strategy: :one_for_one, max_restarts: 10, max_seconds: 60]

    Supervisor.init(
      [
        {ExternalAPI.Client, []},
        # Add a periodic health check
        {ExternalAPI.HealthCheck, []}
      ],
      restart_policy
    )
  end
end

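If the API client crashes (a malformed JSON body, an unexpected bug, etc.), the supervisor restarts it with a fresh state. Network errors and timeouts don’t crash it - they come back as {:error, reason} and are counted by the circuit breaker. The circuit breaker state is lost on a restart, but that’s fine - we start with a closed circuit and learn again.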

Advanced Pattern: Dynamic Supervisors

Sometimes you need to spawn processes dynamically. Use DynamicSupervisor:

defmodule WebSocket.ConnectionSupervisor do
  @moduledoc """
  Dynamic supervisor that starts one `WebSocket.Connection` child per user on
  demand.
  """
  use DynamicSupervisor

  @doc "Starts the supervisor and registers it under the module name."
  def start_link(init_arg),
    do: DynamicSupervisor.start_link(__MODULE__, init_arg, name: __MODULE__)

  @doc "Starts a supervised connection process for `user_id`."
  def start_connection(user_id) do
    DynamicSupervisor.start_child(__MODULE__, {WebSocket.Connection, user_id: user_id})
  end

  @impl true
  def init(_init_arg), do: DynamicSupervisor.init(strategy: :one_for_one)
end

Each WebSocket connection runs in its own supervised process. If one crashes, others are unaffected.

defmodule WebSocket.Connection do
  @moduledoc """
  One process per user connection, registered in `WebSocket.Registry` under
  the user id.
  """
  use GenServer
  require Logger

  @doc "Starts a connection process; `opts` must contain `:user_id`."
  def start_link(opts) do
    user_id = Keyword.fetch!(opts, :user_id)
    registered_name = {:via, Registry, {WebSocket.Registry, user_id}}

    GenServer.start_link(__MODULE__, user_id, name: registered_name)
  end

  @impl true
  def init(user_id) do
    Logger.info("WebSocket connection started for user #{user_id}")

    {:ok, %{user_id: user_id, connected_at: DateTime.utc_now()}}
  end

  @impl true
  def handle_info({:message, payload}, conn) do
    # Process message - if this crashes, only this connection dies.
    # NOTE(review): process_message/2 is not defined in this snippet; its
    # return value is discarded and the state is left unchanged.
    process_message(payload, conn)

    {:noreply, conn}
  end

  @impl true
  def terminate(reason, %{user_id: user_id}) do
    Logger.info("WebSocket connection terminated for user #{user_id}: #{inspect(reason)}")

    :ok
  end
end

Task Supervision: Fire and Forget Safely

For one-off async work, use supervised tasks:

# In your application supervisor
{Task.Supervisor, name: MyApp.TaskSupervisor}

# Fire and forget: the task is linked to the supervisor, not to the caller
Task.Supervisor.start_child(MyApp.TaskSupervisor, fn ->
  send_welcome_email(user)
end)

# When you need the result. Task.Supervisor.async/2 would link the task to the
# caller, so a crashing task would crash the caller too; async_nolink/2 avoids
# the link and reports the failure as a value instead.
task =
  Task.Supervisor.async_nolink(MyApp.TaskSupervisor, fn ->
    expensive_computation()
  end)

# Task.yield/2 returns nil on timeout; Task.shutdown/1 then stops the task and
# picks up a reply that arrived in the meantime.
result =
  case Task.yield(task, 30_000) || Task.shutdown(task) do
    {:ok, value} -> {:ok, value}
    {:exit, reason} -> {:error, reason}
    nil -> {:error, :timeout}
  end

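A task started with start_child is not linked to the caller, so if it crashes it doesn’t take down your caller; the supervisor handles cleanup. A task started with Task.Supervisor.async is linked to the caller, so its crash propagates to the caller - use Task.Supervisor.async_nolink when the caller must survive a failing task.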

Handling State Recovery

When a process crashes and restarts, state is lost. Design for this:

Bad: Keeping everything in memory

# Anti-pattern: everything lives only in this process's memory.
def init(_args) do
  # If this crashes, we lose all user sessions
  in_memory_only = %{sessions: %{}, active_users: []}
  {:ok, in_memory_only}
end

Good: Recoverable state

# State is rebuilt from an external store every time the process (re)starts.
def init(_args) do
  # Load from database or cache on restart
  {:ok, %{sessions: load_sessions_from_redis()}}
end

Better: Separate state storage

# Keep shared session state in an ETS table instead of in a worker's own
# process state.
# NOTE(review): an ETS table is owned by the process that calls :ets.new/2 and
# is deleted when that owner exits. The table therefore only outlives worker
# crashes if this init/1 belongs to a dedicated, long-lived owner process (or
# an :heir is configured) - confirm which process runs this.
def init(_) do
  # :named_table + :public let other processes read and write the table through
  # the :sessions name; the table identifier returned here is not kept.
  :ets.new(:sessions, [:named_table, :public, read_concurrency: true])
  {:ok, %{}}
end

Monitoring and Telemetry

Add observability to your OTP processes:

defmodule PaymentProcessor do
  use GenServer
  require Logger

  # Charges the card, emits one [:payment, :processed] telemetry event carrying
  # the amount and the elapsed time (in native monotonic time units), then
  # replies with the charge result and the updated metrics state.
  #
  # If charge_card/2 raises or returns anything other than {:ok, _} or
  # {:error, _}, the process crashes and no event is emitted.
  @impl true
  def handle_call({:process, amount, card_token}, _from, state) do
    start_time = System.monotonic_time()
    result = charge_card(amount, card_token)
    duration = System.monotonic_time() - start_time

    :telemetry.execute(
      [:payment, :processed],
      %{amount: amount, duration: duration},
      telemetry_metadata(result)
    )

    {:reply, result, update_metrics(state, result)}
  end

  # Maps a charge result to the metadata attached to the telemetry event.
  defp telemetry_metadata({:ok, _transaction_id}), do: %{status: :success}
  defp telemetry_metadata({:error, reason}), do: %{status: :error, reason: reason}
end

Testing Resilience

Test that your system handles failures:

defmodule PaymentProcessorTest do
  use ExUnit.Case

  # Upper bound for waiting on the supervisor: 50 polls x 10 ms.
  @restart_attempts 50
  @restart_poll_ms 10

  test "supervisor restarts crashed payment processor" do
    # Get the original PID (fail clearly if the process is not running at all)
    original_pid = Process.whereis(PaymentProcessor)
    assert is_pid(original_pid)

    # Kill the process and wait until it is really gone, instead of sleeping
    # for a fixed time and hoping the restart has happened by then.
    ref = Process.monitor(original_pid)
    Process.exit(original_pid, :kill)
    assert_receive {:DOWN, ^ref, :process, ^original_pid, :killed}

    # The restart is asynchronous, so poll until the name is registered again
    new_pid = await_restart(PaymentProcessor, original_pid)

    # Verify it's running with a new PID
    assert new_pid != original_pid
    assert Process.alive?(new_pid)
  end

  test "circuit breaker opens after repeated failures" do
    # NOTE(review): nothing here stubs the HTTP layer, so this relies on real
    # requests failing - confirm how failures are meant to be simulated.
    for _ <- 1..5 do
      assert {:error, _} = ExternalAPI.Client.fetch_user(123)
    end

    # Circuit should be open now
    assert {:error, :circuit_open} = ExternalAPI.Client.fetch_user(123)
  end

  # Polls until `name` is registered to a pid other than `old_pid` and returns
  # that pid; fails the test once the attempts are exhausted.
  defp await_restart(name, old_pid, attempts_left \\ @restart_attempts)

  defp await_restart(name, _old_pid, 0) do
    flunk("#{inspect(name)} was not restarted in time")
  end

  defp await_restart(name, old_pid, attempts_left) do
    case Process.whereis(name) do
      pid when is_pid(pid) and pid != old_pid ->
        pid

      _not_restarted_yet ->
        Process.sleep(@restart_poll_ms)
        await_restart(name, old_pid, attempts_left - 1)
    end
  end
end

Production Patterns

1. Graceful Degradation

# Returns the user's profile, degrading to cached or default data when the
# external API cannot be used.
def get_user_profile(user_id) do
  lookup = ExternalAPI.Client.fetch_user(user_id)

  case lookup do
    {:error, :circuit_open} ->
      # Serve cached data when circuit is open
      Cache.get_user_profile(user_id) || default_profile()

    {:error, _other} ->
      default_profile()

    {:ok, profile} ->
      profile
  end
end

2. Backpressure with GenStage

defmodule EventProducer do
  @moduledoc """
  GenStage producer that hands out queued events only as fast as consumers ask
  for them, remembering demand it could not satisfy yet.
  """
  use GenStage

  @doc "Starts the producer and registers it under the module name."
  def start_link(_) do
    GenStage.start_link(__MODULE__, :ok, name: __MODULE__)
  end

  @impl true
  def init(:ok) do
    {:producer, %{queue: :queue.new(), demand: 0}}
  end

  # Consumers do not re-ask for demand they have already sent, so demand that
  # the queue cannot satisfy right now must be carried over in the state
  # rather than dropped.
  @impl true
  def handle_demand(incoming_demand, %{queue: queue, demand: pending_demand} = state) do
    total_demand = incoming_demand + pending_demand

    # Only produce what consumers can handle.
    # NOTE(review): assumes take_events/2 returns {list_of_events, rest_of_queue}
    # with at most `total_demand` events - confirm against its definition.
    {events, new_queue} = take_events(queue, total_demand)
    unmet_demand = max(total_demand - length(events), 0)

    {:noreply, events, %{state | queue: new_queue, demand: unmet_demand}}
  end
end

3. Process Pooling

# Use poolboy or nimble_pool for expensive resources
defmodule DatabasePool do
  @moduledoc """
  Supervisor wrapping a poolboy pool of `DatabaseWorker` processes: 10
  permanent workers plus up to 5 overflow workers.
  """
  use Supervisor

  @pool_id :db_pool

  @doc "Starts the pool supervisor and registers it under the module name."
  def start_link(_), do: Supervisor.start_link(__MODULE__, :ok, name: __MODULE__)

  @impl true
  def init(:ok) do
    pool_spec = :poolboy.child_spec(@pool_id, pool_settings())

    Supervisor.init([pool_spec], strategy: :one_for_one)
  end

  # Options handed to poolboy; the pool is registered locally under @pool_id.
  defp pool_settings do
    [
      name: {:local, @pool_id},
      worker_module: DatabaseWorker,
      size: 10,
      max_overflow: 5
    ]
  end
end

The OTP Mindset Shift

Traditional programming: “Prevent all errors.” OTP programming: “Errors will happen; recover quickly.”

Key Takeaways:

Isolate failures - Use processes as failure boundaries
Fail fast - Don’t try to recover from everything
Supervise everything - Let supervisors handle recovery
Design for restarts - State should be recoverable
Monitor and observe - Know when things crash and why

Real-World Impact

At a previous company, we rebuilt a monolithic payment system using OTP principles:

Before: One payment failure could lock the entire system. Required manual restarts.
After: Individual payment processes crash and restart. System self-heals. 99.99% uptime.

The code was simpler, more maintainable, and more reliable.

Conclusion

OTP isn’t magic - it’s decades of hard-won wisdom about building reliable systems. The “let it crash” philosophy feels wrong at first, but it leads to simpler, more resilient code.

Stop trying to handle every error. Start building systems that survive failures.

That’s the OTP way.