Upgrade to Pro — share decks privately, control downloads, hide ads and more …

GitHub Pages on Riak and Webmachine

GitHub Pages on Riak and Webmachine

GitHub Pages, a feature allowing users to publish content to the web by simply pushing content to one of their GitHub-hosted repositories, has had lackluster performance and uptime in recent years. In this talk, Jesse will discuss the core requirements of the GitHub Pages application, why Erlang, Riak, and Webmachine were chosen for the development, and how they were used to fulfill those requirements now and for years to come with minimal development and operational maintenance.

Jesse Newland

March 30, 2012
Tweet

More Decks by Jesse Newland

Other Decks in Technology

Transcript

  1. riak &
    webmachine
    on
    github:pages

    View Slide

  2. github.com/jnewland
    @jnewland

    View Slide

  3. riak &
    webmachine
    on
    github:pages

    View Slide

  4. what
    is
    pages?

    View Slide

  5. simple
    static
    file
    hosting*

    View Slide

  6. View Slide

  7. github.com/user/repo/gh-pages

    http://user.github.com/repo

    View Slide

  8. View Slide

  9. github.com/user/user.github.com

    http://user.github.com

    View Slide

  10. View Slide

  11. View Slide

  12. $ cat CNAME
    example.com
    custom domain support

    View Slide

  13. what
    is wrong with
    pages?

    View Slide

  14. SLOW

    View Slide

  15. 1 node
    ext3
    gigs and gigs of HTML

    View Slide

  16. IO bound

    View Slide

  17. fscking downtime

    View Slide

  18. not HA

    View Slide

  19. LOTS
    TO
    FIX
    THROW
    IT ALL
    OUT

    View Slide

  20. what
    can power
    pages 2.0?

    View Slide

  21. 1. grab content from git
    2. run through jekyll
    3. write somewhere
    4. serve over HTTP

    View Slide

  22. what
    currently powers
    pages?

    View Slide

  23. 1. ruby
    2. ruby
    3. ext3
    4. nginx

    View Slide

  24. $ wc -l pages_map.conf
    57615 pages_map.conf

    View Slide

  25. 1. ruby
    2. ruby
    3. riak_kv
    4. riak_kv*

    View Slide

  26. *
    read-only
    transactional builds
    CNAMEs & redirects
    index.html fallback
    custom 404.html per-repo
    almost

    View Slide

  27. 1. ruby
    2. ruby
    3. riak_kv
    4. webmachine resource

    View Slide

  28. why
    not
    x?

    View Slide

  29. squid
    +
    nginx/apache
    +
    filesystem

    View Slide

  30. should work fine
    what happens when you need N > 1?
    just shard ‘em
    how do you populate new partitions?
    just rsync stuff around

    View Slide

  31. building a
    distributed
    system
    ASS
    FIRST

    View Slide

  32. remember, I do ops
    low maintenance, resilient
    systems make lazy
    sysadmins love you

    View Slide

  33. View Slide

  34. schema
    design

    View Slide

  35. 2 buckets
    hosts
    pages

    View Slide

  36. hosts
    key: HTTP Host Header
    value: redirect or repo/sha map
    use: data key prefix lookup
    index: user_id

    View Slide

  37. hosts
    {
    “repos”: {
    “jnewland.github.com”: “deadbeef”,
    “pages”: “beadfeed”
    }
    }
    jnewland.github.com

    View Slide

  38. pages
    key: sha/URI
    value: HTML / other data
    use: data storage
    index: repo_id

    View Slide




  39. content="text/html; charset=utf-8" />
    Jesse Newland

    View Slide

  40. how does
    data get in?

    View Slide

  41. riak-ruby-client
    read files from disk
    write to riak
    update sites object
    gc old builds

    View Slide

  42. how does
    data get out?

    View Slide

  43. curl jnewland.github.com/raptor.gif
    GET /riak/hosts/jnewland.github.com
    GET /riak/pages/deadbeef/raptor.gif

    View Slide

  44. curl jnewland.github.com/pages/flow.png
    GET /riak/hosts/jnewland.github.com
    GET /riak/pages/beadfeed/flow.png

    View Slide

  45. what
    about
    CNAMEs?

    View Slide

  46. hosts
    example.github.com
    {
    “redirect”: “example.com”
    }

    View Slide

  47. hosts
    example.com
    {
    “repos”: {
    “example.com”: “deadbeef”
    }
    }

    View Slide

  48. curl example.github.com/raptor.gif
    GET /riak/hosts/example.github.com
    301 example.com/raptor.gif
    GET /riak/hosts/example.com
    GET /riak/pages/deadbeef/raptor.gif

    View Slide

  49. pages_wm_resource.erl

    View Slide

  50. webmachine
    is
    SO DAMN COOL

    View Slide

  51. View Slide

  52. %% webmachine resource exports
    -export([
    init/1,
    service_available/2,
    malformed_request/2,
    content_types_provided/2,
    resource_exists/2,
    previously_existed/2,
    moved_permanently/2,
    last_modified/2,
    generate_etag/2,
    produce_doc_body/2
    ]).

    View Slide

  53. grab local riak client
    check app config var
    service_available/2

    View Slide

  54. service_available(RD, Ctx=#ctx{riak=RiakProps,req_id=ReqId}) ->
    IdRD = wrq:set_resp_header("X-Request-Id", ReqId, RD),
    BrandedRD = wrq:set_resp_header(
    "X-GitHub-Pages-Version",
    release_handler_util:app_version(pages),
    IdRD),
    case application:get_env(pages, disabled) of
    {ok, true} ->
    {false, BrandedRD, Ctx};
    _ ->
    case riak_kv_wm_utils:get_riak_client(
    RiakProps,
    riak_kv_wm_utils:get_client_id(RD)) of
    {ok, C} ->
    {true, BrandedRD, Ctx#ctx{client=C}};
    _Error ->
    {false, BrandedRD, Ctx}
    end
    end.

    View Slide

  55. parse Host header, URI
    malformed_request/2

    View Slide

  56. malformed_request(RD, Ctx) ->
    try
    Host = wrq:get_req_header("Host", RD),
    HostWithoutPort = re:replace(
    Host,
    "\:.*",
    "",
    [{return,list}]),
    Tokens = [
    riak_kv_wm_utils:maybe_decode_uri(RD, X) ||
    X <- wrq:path_tokens(RD)],
    ParsedCtx = Ctx#ctx{tokens=Tokens,host=HostWithoutPort},
    {false, RD, ParsedCtx}
    catch
    Exception:Reason ->
    log_error({exception, Exception, Reason}, RD, Ctx),
    {true, RD, Ctx}
    end.

    View Slide

  57. guess mime type from path
    assume text/html for / URIs
    content_types_provided/2

    View Slide

  58. content_types_provided(RD, Ctx) ->
    Filename = lists:last(Ctx#ctx.tokens),
    Extension = filename:extension(Filename),
    case mochiweb_mime:from_extension(Extension) of
    undefined ->
    {[{"text/html", produce_doc_body}], RD, Ctx};
    Mime ->
    {[{Mime, produce_doc_body}], RD, Ctx}
    end.

    View Slide

  59. hit hosts bucket (r=1)
    stash redirect or sha
    404 if no hosts data
    resource_exists/2

    View Slide

  60. resource_exists(RD, Ctx) ->
    RedirectOrSha = redirect_or_sha(Ctx),
    case RedirectOrSha of
    {redirect, Redirect} ->
    {true, RD, Ctx#ctx{redirect={redirect, Redirect}}};
    {sha, Sha} ->
    page_data_exists(RD, Ctx#ctx{sha={sha, Sha}});
    _ ->
    {false, RD, Ctx}
    end.

    View Slide

  61. previously_existed/2
    moved_permanently/2

    View Slide

  62. previously_existed(RD, Ctx) ->
    case Ctx#ctx.redirect of
    {redirect, _} ->
    {true, RD, Ctx};
    _ ->
    {false, RD, Ctx}
    end.
    moved_permanently(RD, Ctx) ->
    case Ctx#ctx.redirect of
    {redirect, RedirectHost} ->
    MovedURI = list_join(lists:append(
    [RedirectHost],
    Ctx#ctx.tokens),
    "/"),
    {{true}, MovedURI, RD, Ctx};
    _ ->
    {false, RD, Ctx}
    end.

    View Slide

  63. hit pages bucket (r=1)
    fallback for index.html
    fallback for 404.html
    page_data_exists/2

    View Slide

  64. curl foo.github.com/
    GET /riak/hosts/foo.github.com
    GET /riak/pages/f0f0f0f0/
    GET /riak/pages/f0f0f0f0/index.html
    GET /riak/pages/f0f0f0f0/index.htm
    GET /riak/pages/f0f0f0f0/index.xhtml
    GET /riak/pages/f0f0f0f0/index.xml
    GET /riak/pages/f0f0f0f0/404.html

    View Slide

  65. < 300 lines of erlang
    simple

    View Slide

  66. {webmachine, [{
    dispatch_list, [{
    %% riak_kv stuff
    {["pages",'*'],pages_wm_resource,[]},
    {["pages"],pages_wm_resource,[]}]}}
    ]}
    nginx proxies / to /pages

    View Slide

  67. remember, I do ops
    one system service
    data store and api
    predictable performance
    busy ops best friend

    View Slide

  68. what’s
    next

    View Slide

  69. metrics with folsom
    graphite / gaug.es
    logging with lager
    HTTP caching
    ???

    View Slide

  70. private beta soon
    turn on / off with DNS
    repo access
    @jnewland

    View Slide

  71. erlang
    ruby
    ops
    c
    work with me

    View Slide

  72. thanks

    View Slide