Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Programs that Write Programs: How Compilers Work

Programs that Write Programs: How Compilers Work

As presented at Momentum Dev Con, 19 April 2018
Compilers are the bridge between the code you write and the applications you run. While production compilers can be quite complicated, the principles of compiler design are not too hard to learn, and are broadly applicable to many seemingly difficult programming problems. In this session you will learn how every phase of a real compiler works, including lexing, parsing, type checking, optimization, and code generation. The lessons learned here will help you with many common programming problems, such as deserialization, maintaining large amounts of legacy code, static analysis, testing, and validation. In contrast to common Java-based compiler tutorials, I'll demonstrate how to build a compiler using functional programming techniques in F#. Full source code for a working compiler targeting the .NET CLR will be included!

56e5c49368a2e0ab999848a8d9e3c116?s=128

Craig Stuntz

April 20, 2018
Tweet

Transcript

  1. Programs that Write Programs Craig Stuntz https://speakerdeck.com/craigstuntz https://github.com/CraigStuntz/TinyLanguage

  2. What Do Developers Do?

  3. –Steve Yegge “You're actually surrounded by compilation problems. You run

    into them almost every day.” http://steve-yegge.blogspot.ca/2007/06/rich-programmer-food.html
  4. –Greenspun’s Tenth Rule “Any sufficiently complicated C or Fortran program

    contains an ad hoc, informally-specified, bug-ridden, slow implementation of half of Common Lisp.” https://commons.wikimedia.org/wiki/File:Philip_Greenspun_and_Alex_the_dog.jpg
  5. The Hoover Dam

  6. Generalize the Problem

  7. –Eugene Wallingford “…compilers ultimately depend on a single big idea

    from the theory of computer science: that a certain kind of machine can simulate anything — including itself. As a result, this certain kind of machine, the Turing machine, is the very definition of computability.” http://www.cs.uni.edu/~wallingf/blog/archives/monthly/2015-09.html#e2015-09-03T15_26_47.htm
  8. Compiler

  9. Compiler Interpreter

  10. code exe

  11. code exe

  12. Useful Bits • Regular Expressions (lexing) • Deserializers (parsing) •

    Linters, static analysis (syntax, type checking) • Solvers, theorem provers (optimization) • Code migration tools (compilers!)
  13. A → B

  14. Source code → Program JPEG file → Image on screen

    Source code → Potential style error list JSON → Object graph Code with 2 digit years → Y2K compliant code VB6 → C# Object graph → User interface markup Algorithm → Faster, equivalent algorithm
  15. Designing with Formal Methods

  16. #define D define #D Y return #D R for #D

    e while #D I printf #D l int #D W if #D C y=v+111;H(x,v)*y++= *x #D H(a,b)R(a=b+11;a<b+89;a++) #D s(a)t=scanf("%d",&a) #D U Z I #D Z I("123\ 45678\n");H(x,V){putchar(".XO"[*x]);W((x-V)%10==8){x+=2;I("%d\n",(x-V)/10-1);}} l V[1600],u,r[]={-1,-11,-10,-9,1,11,10,9},h[]={11,18,81,88},ih[]={22,27,72,77}, bz,lv=60,*x,*y,m,t;S(d,v,f,_,a,b)l*v;{l c=0,*n=v+100,j=d<u-1?a:-9000,w,z,i,g,q= 3-f;W(d>u){R(w=i=0;i<4;i++)w+=(m=v[h[i]])==f?300:m==q?-300:(t=v[ih[i]])==f?-50: t==q?50:0;Y w;}H(z,0){W(E(v,z,f,100)){c++;w= -S(d+1,n,q,0,-b,-j);W(w>j){g=bz=z; j=w;W(w>=b||w>=8003)Y w;}}}W(!c){g=0;W(_){H(x,v)c+= *x==f?1:*x==3-f?-1:0;Y c>0? 8000+c:c-8000;}C;j= -S(d+1,n,q,1,-b,-j);}bz=g;Y d>=u-1?j+(c<<3):j;}main(){R(;t< 1600;t+=100)R(m=0;m<100;m++)V[t+m]=m<11||m>88||(m+1)%10<2?3:0;I("Level:");V[44] =V[55]=1;V[45]=V[54]=2;s(u);e(lv>0){Z do{I("You:");s(m);}e(!E(V,m,2,0)&&m!=99); W(m!=99)lv--;W(lv<15&&u<10)u+=2;U("Wait\n");I("Value:%d\n",S(0,V,1,0,-9000,9000 ));I("move: %d\n",(lv-=E(V,bz,1,0),bz));}}E(v,z,f,o)l*v;{l*j,q=3-f,g=0,i,w,*k=v +z;W(*k==0)R(i=7;i>=0;i--){j=k+(w=r[i]);e(*j==q)j+=w;W(*j==f&&j-w!=k){W(!g){g=1 ;C;}e(j!=k)*((j-=w)+o)=f;}}Y g;}
  17. Duff’s Device There Are No Edge Cases In Programming Languages

    send(to, from, count) register short *to, *from; register count; { register n = (count + 7) / 8; switch (count % 8) { case 0: do { *to = *from++; case 7: *to = *from++; case 6: *to = *from++; case 5: *to = *from++; case 4: *to = *from++; case 3: *to = *from++; case 2: *to = *from++; case 1: *to = *from++; } while (--n > 0); } }
  18. None
  19. 1 + 2 + 3 + … + 100 =

    100 * 101 / 2 = 5050
  20. None
  21. Lexer → Regular Expressions Parser → Context Free Grammar Optimizer

    → Algebra Type Checker → Logical Inference Rules Code Generator → Denotational Semantics
  22. –Leslie Lamport “You don’t achieve simplicity by thinking in terms

    of complicated languages. Simplicity requires thinking abstractly before you start implementing.” http://www.heidelberg-laureate-forum.org/blog/video/lecture-monday-august-24-2015-leslie-lamport/ https://commons.wikimedia.org/wiki/File:Leslie_Lamport.jpg
  23. A Few Important Concepts

  24. Syntax x = x + 1; alert(x); Sequence Assign Invoke

    x add x 1 alert x
  25. Semantics name = "Nate" # => "Nate" String.upcase(name) # =>

    "NATE" name # => "Nate" name = "Nate" # => "Nate" name.upcase! # => "NATE" name # => "NATE" http://www.natescottwest.com/elixir-for-rubyists-part-2/
  26. Semantics Imports System Namespace Hello Class HelloWorld Overloads Shared Sub

    Main(ByVal args() As String) Dim name As String = "VB.NET" 'See if argument passed If args.Length = 1 Then name = args(0) Console.WriteLine("Hello, " & name & "!") End Sub End Class End Namespace using System; namespace Hello { public class HelloWorld { public static void Main(string[] args) { string name = "C#"; // See if argument passed if (args.Length == 1) name = args[0]; Console.WriteLine("Hello, " + name + "!"); } } } http://www.harding.edu/fmccown/vbnet_csharp_comparison.html
  27. Front End: Understand Language Back End: Emit Code

  28. Lexer IL Generator Parser Type Checker Optimizer Optimizer Object Code

    Generator Binder
  29. OK, so let’s compile something already! module Compiler let compile

    = Lexer.lex >> Parser.parse >> Binder.bind >> OptimizeBinding.optimize >> IlGenerator.codegen >> Railway.map OptimizeIl.optimize >> Railway.map Il.toAssemblyBuilder
  30. (inc -1)

  31. (inc -1) Ldc.i4 -1 Ldc.i4 1 Add

  32. (inc -1) Ldc.i4 -1 Ldc.i4 1 Add Ldc.i4.0

  33. (inc -1) Lex LeftParen, Identifier(inc), Number(-1), RightParen Parse Apply “inc”

    to -1 Type check “inc” exists and takes an int argument, and -1 is an int. Great! Optimize -1 + 1 = 0, so just emit int 0! IL generate Ldc.i4 0 Optimize Ldc.i4 0 → Ldc.i4.0 Object code Produce assembly with entry point which contains the IL generated
  34. (defun add-1 (int x) (inc x)) (defun main () (print

    (add-1 2)))
  35. Lexer What Problem Are We Solving? String → Sequence of

    tokens Non-Compiler Example Text search
  36. Lexer Search “am” I am. You are.

  37. Regular Expressions leftParenthesis = ‘(‘ rightParenthesis = ‘)’ letter =

    ‘A’ | ‘B’ | ‘C’ | … digit = ‘0’ | ‘1’ | ‘2’ | … number = (‘+’digit|‘-’digit|digit) digit* alphanumeric = letter | number // …
  38. Lexer

  39. Lexer type Lexeme = | LeftParenthesis | RightParenthesis | Identifier

    of string | LiteralInt of int | LiteralString of string | Unrecognized of char
  40. Lexer type Lexeme = | LeftParenthesis | RightParenthesis | Identifier

    of string | LiteralInt of int | LiteralString of string | Unrecognized of char
  41. Lexer let private prettyPrint (lexeme: Lexeme) = match lexeme with

    | LeftParenthesis -> "(" | RightParenthesis -> ")" | Identifier identifier -> identifier | LiteralInt num -> num.ToString() | LiteralString str -> str | Unrecognized ch -> ch.ToString()
  42. Lexer (inc -1)

  43. Lexer (inc -1) “(“ “inc” “-1” “)” LeftParenthesis Identifier(“inc”) LiteralInt(-1)

    RightParenthesis
  44. Lexer ( -1)

  45. Lexer let rec private lexChars (source: char list) : Lexeme

    list = match source with | '(' :: rest -> LeftParenthesis :: lexChars rest | ')' :: rest -> RightParenthesis :: lexChars rest | '"' :: rest -> lexString(rest, "") | c :: rest when isIdentifierStart c -> lexName (source, "") | d :: rest when System.Char.IsDigit d -> lexNumber(source, "") | [] -> [] | w :: rest when System.Char.IsWhiteSpace w -> lexChars rest | c :: rest -> Unrecognized c :: lexChars rest
  46. Lexer http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags

  47. Lexer http://www.regular-expressions.info/email.html “So even when following official standards, there are

    still trade-offs to be made. Don't blindly copy regular expressions from online libraries or discussion forums.” -Jan Goyvaerts, regular-expressions.info
  48. Parser What Problem Are We Solving? Sequence of tokens →

    Syntax tree Non-Compiler Example Deserialization
  49. PEMDAS 1 + 2 * 3 1 + (2 *

    3)
  50. None
  51. Grammar <program> := <statement> | <program> <statement> <statement> := <defun>

    | <expr> <defun> := “(defun” identifier <expr> <expr> “)” <expr> := number | string | <invoke> <invoke> := “(” identifier <expr> “)”
  52. Grammar type Expression = | IntExpr of int | StringExpr

    of string | DefunExpr of name: string * argument: ArgumentExp | InvokeExpr of name: string * argument: Expression | IdentifierExpr of string | ErrorExpr of string | EmptyListExpr
  53. Parser LeftParenthesis Identifier(“inc”) LiteralInt(-1) RightParenthesis

  54. Parser LeftParenthesis Identifier(“inc”) LiteralInt(-1) RightParenthesis Invoke “inc” -1

  55. Parser LeftParenthesis Identifier(“inc”) LiteralInt(-1) LiteralInt(-1)

  56. Parser LeftParenthesis Identifier(“inc”) LiteralInt(-1) LiteralInt(-1) “Expected ‘)’”

  57. Parser let rec private parseExpression (state : ParseState): ParseState =

    match state.Remaining with | LeftParenthesis :: Identifier "defun" :: Identifier name :: rest -> let defun = parseDefun (name, { state with Remaining = rest }) match defun.Expressions, defun.Remaining with | [ ErrorExpr _ ], _ -> defun | _, RightParenthesis :: remaining -> { defun with Remaining = remaining } | _, [] -> error ("Expected ')'.") | _, wrong :: _ -> error (sprintf "Expected ')'; found %A." wrong) | LeftParenthesis :: Identifier name :: argumentsAndBody -> let invoke = parseInvoke (name, { state with Remaining = argumentsAndBody }) match invoke.Remaining with | RightParenthesis :: remaining -> { invoke with Remaining = remaining } | [] -> error ("Expected ')'.") | wrong :: _ -> error (sprintf "Expected ')'; found %A." wrong) | LeftParenthesis :: wrong -> error (sprintf "%A cannot follow '('." wrong)
  58. Parser

  59. –Guy Steele “If it's worth telling another programmer, it's worth

    telling the compiler, I think.” https://joshvarty.wordpress.com/2015/08/03/learn-roslyn-now-part-11-introduction-to-code-fixes/
  60. Parser https://joshvarty.wordpress.com/2015/08/03/learn-roslyn-now-part-11-introduction-to-code-fixes/

  61. Scope What Problem Are We Solving? What does “x” mean

    right now? Non-Compiler Example Bounded Context in Domain Driven Design
  62. Scope https://msujaws.wordpress.com/2011/05/03/static-vs-dynamic-scoping/

  63. Binding InvokeExpr “inc” -1

  64. InvokeBinding { FunctionName = "inc" Function = Inc Argument =

    IntBinding -1} Binding InvokeExpr “inc” -1
  65. InvokeExpr { Name = "not-a-function" Argument = StringExpr "" }

    Binding
  66. InvokeExpr { Name = "not-a-function" Argument = StringExpr "" }

    Binding “Undefined function ‘not-a-function’.”
  67. https://msdn.microsoft.com/en-us/library/ms228296.aspx?f=255&MSPPError=-2147217396

  68. None
  69. About Those Errors [<Test>] member this.``should return error for unbound

    invocation``() = let source = "(bad-method 2)" let expected = ErrorBinding ( "Undefined function 'bad-method'.", EmptyBinding) let actual = bind source actual |> should equal expected
  70. About Those Errors http://www.drdobbs.com/architecture-and-design/so-you-want-to-write-your-own-language/240165488?pgno=2

  71. About Those Errors • Die in a fire http://www.drdobbs.com/architecture-and-design/so-you-want-to-write-your-own-language/240165488?pgno=2

  72. About Those Errors • Die in a fire • Guess

    what I meant, not what I said http://www.drdobbs.com/architecture-and-design/so-you-want-to-write-your-own-language/240165488?pgno=2
  73. About Those Errors • Die in a fire • Guess

    what I meant, not what I said • Poisoning http://www.drdobbs.com/architecture-and-design/so-you-want-to-write-your-own-language/240165488?pgno=2
  74. Type Checking What Problem Are We Solving? AST → Boolean

    “Is it valid?” Non-Compiler Example Linter
  75. Type Checking ldstr "Hi" ldstr "Hi" div This is bad.

    Don’t do this.
  76. Type Inference Rules Γ ⊢ A Γ ⊢ B Γ

    ⊢ A×B Γ ⊢ v1 :Int Γ ⊢ v2 :Int Γ ⊢ v1 +v2 :Int
  77. Type Checking • Statically typed • Unityped (“dynamic language”) •

    Untyped
  78. Type Checking let rec private toBinding (environment: Map<string, Binding>) match

    expression with | IntExpr n -> IntBinding n | StringExpr str -> StringBinding str
  79. Type Checking | InvokeExpr (name, argument) !→ match environment.TryFind name

    with | Some (FunctionBinding func) -> let argumentBinding = toInvokedArgumentBinding environment argument match argumentTypeError argumentBinding func with | None -> InvokeBinding { FunctionName = name Function = func Argument = argumentBinding } | Some argumentTypeErrorMessage -> ErrorBinding (argumentTypeErrorMessage, EmptyBinding) | Some bindingType -> ErrorBinding (sprintf "Expected function; found %A" bindingType, EmptyBinding) | None -> ErrorBinding (sprintf "Undefined function '%s'." name, EmptyBinding)
  80. InvokeExpr { Name = "inc" Argument = StringExpr "Oops!" }

    Type Checking
  81. InvokeExpr { Name = "inc" Argument = StringExpr "Oops!" }

    Type Checking “Expected integer; found ‘Oops!’.”
  82. Optimizers What Problem Are We Solving? Program → Faster, but

    equivalent program Non-Compiler Example Theorem prover
  83. Optimization (I) InvokeBinding “inc” -1

  84. Optimization (I) InvokeBinding “inc” -1 IntBinding 0

  85. Optimization (I) Invoke “some-method” -1

  86. Optimization (I) Invoke “some-method” -1 Invoke “some-method” -1

  87. Optimization (I) let private optimizeInc (binding: Binding) : Binding =

    match binding with | IncBinding (IntBinding number) -> IntBinding (number + 1) | IncBinding _ | BoolBinding _ | IntBinding _ | StringBinding _ | VariableBinding _ | FunctionBinding _ | InvokeBinding _ | DefBinding _ | ErrorBinding _ | EmptyBinding _ -> binding
  88. IL Generation IntBinding 0

  89. IL Generation IntBinding 0 Ldc.i4 0

  90. IL Generation IntBinding 0 Ldc.i4 0 Ldc.i4.0

  91. IL Generation let rec private codegenBinding (binding : Binding) =

    match binding with | BoolBinding b -> match b with | true -> [Ldc_I4_1] | false -> [Ldc_I4_0] | IntBinding n -> [Ldc_I4 n] | StringBinding s -> [Ldstr s] | // …
  92. IL Generation let private writeLineMethod = typeof<System.Console>.GetMethod( "WriteLine", [| typeof<System.Int32>

    |] let private codegenOper = function | IncInt -> [ Instruction.Ldc_I4_1 Instruction.Add ] | WriteLine -> [ Instruction.Call writeLineMethod ]
  93. Optimization (II) Ldc.i4 0

  94. Optimization (II) Ldc.i4 0 Ldc.i4.0

  95. Optimization (II) let private optimalShortEncodingFor = function | Ldc_I4 0

    -> Ldc_I4_0 | Ldc_I4 1 -> Ldc_I4_1 | Ldc_I4 2 -> Ldc_I4_2 | Ldc_I4 3 -> Ldc_I4_3 | Ldc_I4 4 -> Ldc_I4_4 | Ldc_I4 5 -> Ldc_I4_5 | Ldc_I4 6 -> Ldc_I4_6 | Ldc_I4 7 -> Ldc_I4_7 | Ldc_I4 8 -> Ldc_I4_8 | Ldloc 0 -> Ldloc_0 | Ldloc 1 -> Ldloc_1 | Ldloc 2 -> Ldloc_2 | Ldloc 3 -> Ldloc_3 | Ldloc i when i <= maxByte -> Ldloc_S(Convert.ToByte(i))
  96. Special Tools!

  97. Compare!

  98. https://www.ece.cmu.edu/~ganger/712.fall02/papers/p761-thompson.pdf

  99. Trusting Trust Compiler Executable Compiler Source Code Compiler Executable

  100. Trusting Trust Compiler Executable Compiler Source Code Trojaned Compiler Executable

    Trojan Code
  101. Trusting Trust Trojaned Compiler Executable Benign App Source Code Trojaned

    App Executable
  102. Trusting Trust Trojaned Compiler Executable (Benign!) Compiler Source Code Trojaned

    Compiler Executable
  103. Conclusion

  104. Further Reading

  105. Further Reading • Programming Language Concepts, by Peter Sestoft •

    Modern Compiler Implementation in ML, by Andrew W. Appel • miniml (608 line implementation of ML subset), by Andrej Bauer • Coursera Compilers Course, by Alex Aiken
  106. Craig Stuntz @craigstuntz Craig.Stuntz@Improving.com https://www.craigstuntz.com https://www.meetup.com/Papers-We-Love-Columbus/ https://speakerdeck.com/craigstuntz https://github.com/CraigStuntz/TinyLanguage