Parser combinators: a type-driven approach to input processiEOF

Parser combinators: a type-driven approach to input processiEOF

Parsing is a truly fundamental problem. How can we extract and refine intelligible information out of the chaotic and hostile world we operate in? The absurd nature of the task and the breadth of possible solutions can quickly turn into an obsession, watch out.
The composability of first-class functions paired with powerful types can offer an extremely elegant and satisfying approach to parsing. In this talk, we will build our own parser-combinators library from scratch using Scala. We will make the most of the language by using all kinds of types, data structures, and other tricks.
May contain traces of Idris, it depends.

D3399964c2083a09ade9234017fe263d?s=128

Bastien Louërat

February 28, 2018
Tweet

Transcript

  1. 3.

    import validation._ def validateAddress(form: Form) Either[AddressError, Address] = for {

    number validateNumber(form) street validateStreet(form) city validateCity(form) postCode validatePostCode(form) } yield Address(number, street, city, postCode) 2 / 39
  2. 5.
  3. 6.
  4. 7.
  5. 8.

    EOF

  6. 9.
  7. 10.
  8. 11.
  9. 12.
  10. 13.

    sealed trait Parser[A] case class Exactly(char: Char) extends Parser[Char] val

    a: Parser[Char] = Exactly('a') val b: Parser[Char] = Exactly('b') def run[A](parser: Parser[A])(input: String) Option[A] = parser match { case Exactly(char) input.headOption.filter(_ char) } scala> run(a)("") res1 Option[Char] = None scala> run(a)("z") res2 Option[Char] = None scala> run(a)("a") res3 Option[Char] = Some(a) 12 / 39
  11. 14.

    sealed trait Error case object EOF extends Error case class

    Unexpected(char: Char) extends Error def run[A](parser: Parser[A])(input: String) Either[Error, A] = parser match { case Exactly(char) input.headOption.toRight(EOF).flatMap { head if (head char) Right(char) else Left(Unexpected(head)) } } scala> run(a)("") res4 Either[Error,Char] = Left(EOF) scala> run(a)("z") res5 Either[Error,Char] = Left(Unexpected(z)) scala> run(a)("a") res6 Either[Error,Char] = Right(a) 13 / 39
  12. 15.

    case class Or[A](parser1 Parser[A], parser2 Parser[A]) extends Parser[A] val aOrB

    Parser[Char] = Or(a, b) def run[A](parser: Parser[A])(input: String) Either[Error, A] = parser match { case Exactly(char) input.headOption.toRight(EOF).flatMap { head if (head char) Right(char) else Left(Unexpected(head)) } case Or(parser1, parser2) run(parser1)(input) match { case Left(error) run(parser2)(input) case Right(result) Right(result) } } scala> run(aOrB)("z") res7 Either[Error,Char] = Left(Unexpected(z)) scala> run(aOrB)("a") res8 Either[Error,Char] = Right(a) scala> run(aOrB)("b") res9 Either[Error,Char] = Right(b) 14 / 39
  13. 16.

    case class NEL[A](head: A, tail: List[A]) { def map[B](f: A

    B) NEL[B] = NEL(f(head), tail.map(f)) } def oneOf[A](parsers: NEL[Parser[A]]) Parser[A] = parsers.tail.foldLeft(parsers.head)(Or(_, _)) val allDigits: NEL[Char] = NEL('0', ('1' to '9').toList) val digit: Parser[Char] = oneOf(allDigits.map(Exactly(_))) scala> run(digit)("") res11 Either[Error,Char] = Left(EOF) scala> run(digit)("z") res12 Either[Error,Char] = Left(Unexpected(z)) scala> run(digit)("0") res13 Either[Error,Char] = Right(0) scala> run(digit)("1") res14 Either[Error,Char] = Right(1) scala> run(digit)("2") res15 Either[Error,Char] = Right(2) scala> run(digit)("9") res16 Either[Error,Char] = Right(9) 15 / 39
  14. 17.

    scala> run(a)("abcd") res17 Either[Error,Char] = Right(a) How about bcd? def

    run[A](parser: Parser[A])(input: String) Either[Error, (A, String)] = parser match { case Exactly(char) input.headOption.toRight(EOF).flatMap { head if (head char) Right((char, input.tail)) is safe, trust me else Left(Unexpected(head)) } case Or(parser1, parser2) run(parser1)(input) match { case Left(error) run(parser2)(input) case Right(result) Right(result) } } scala> run(a)("") res18 Either[Error,(Char, String)] = Left(EOF) scala> run(a)("z") res19 Either[Error,(Char, String)] = Left(Unexpected(z)) scala> run(a)("abcd") res20 Either[Error,(Char, String)] = Right((a,bcd)) 16 / 39
  15. 18.

    case class And[A, B]( parserA Parser[A], parserB Parser[B] ) extends

    Parser[(A, B)] val aAndB Parser[(Char, Char)] = And(a, b) def run[A](parser: Parser[A])(input: String) Either[Error, (A, String)] = parser match { case Exactly(char) input.headOption.toRight(EOF).flatMap { head if (head char) Right((char, input.tail)) else Left(Unexpected(head)) } case Or(parser1, parser2) run(parser1)(input) match { case Left(error) run(parser2)(input) case Right(result) Right(result) } case And(parserA, parserB) for { resultA run(parserA)(input) resultB run(parserB)(resultA._2) } yield ((resultA._1, resultB._1), resultB._2) } 17 / 39
  16. 19.

    scala> run(aAndB)("") res21 Either[Error,((Char, Char), String)] = Left(EOF) scala> run(aAndB)("z")

    res22 Either[Error,((Char, Char), String)] = Left(Unexpected(z)) scala> run(aAndB)("a") res23 Either[Error,((Char, Char), String)] = Left(EOF) scala> run(aAndB)("abcd") res24 Either[Error,((Char, Char), String)] = Right(((a,b),cd)) 18 / 39
  17. 20.

    import scalaz._ import Scalaz._ def run[F[_], A]( parser: Parser[A] )(

    MS: MonadState[F, String], ME: MonadError[F, Error] ) F[A] = ??? 19 / 39
  18. 21.

    trait MonadStateError[F[_], S, E] extends MonadState[F, S] with MonadError[F, object

    MonadStateError { def apply[F[_], S, E]( MS: MonadState[F, S], ME: MonadError[F, E] ) MonadStateError[F, S, E] = new MonadStateError[F, S, E] { def point[A](a: A) F[A] = MS.point(a) def bind[A, B](fa: F[A])(f: A F[B]) F[B] = MS.bind(fa)(f) def handleError[A](fa: F[A])(f: E F[A]) F[A] = ME.handleError(fa)(f def raiseError[A](e: E) F[A] = ME.raiseError(e) def get: F[S] = MS.get def init: F[S] = MS.init def put(s: S) F[Unit] = MS.put(s) } } 20 / 39
  19. 22.

    def next[F[_]]( implicit M: MonadStateError[F, String, Error] ) F[Char] =

    M.get.flatMap { case "" M.raiseError(EOF) case input M.put(input.tail) >| input.head } def run[F[_], A]( parser: Parser[A] )( implicit M: MonadStateError[F, String, Error] ) F[A] = parser match { case Exactly(char) for { head next _ whenM(head char)(M.raiseError(Unexpected(head))) } yield head case Or(parser1, parser2) M.get.flatMap { input M.handleError(run(parser1)) { _ M.put(input) run(parser2) } } case And(parserA, parserB) for { a run(parserA) b run(parserB) } yield (a, b) } 21 / 39
  20. 23.

    type Result[A] = Either[Error, A] type ResultST[A] = StateT[Result, String,

    A] val toResultST Parser ResultST = new (Parser ResultST) { val MS: MonadState[ResultST, String] = StateT.stateTMonadState[String, Result] val ME: MonadError[ResultST, Error] = StateT.stateTMonadError[String, Result, Error] def apply[A](parser: Parser[A]) ResultST[A] = run(parser)(MonadStateError(MS, ME)) } def eval[A](parser: Parser[A])(input: String) Result[A] = toResultST(parser).eval(input) scala> val result = toResultST(aAndB) result: ResultST[(Char, Char)] = scalaz.IndexedStateT$$anon$12@71a3769d scala> result.run("") res28 Result[(String, (Char, Char))] = Left(EOF) scala> result.run("z") res29 Result[(String, (Char, Char))] = Left(Unexpected(z)) scala> result.run("abcd") res30 Result[(String, (Char, Char))] = Right((cd,(a,b))) scala> result.eval("abcd") res31 Result[(Char, Char)] = Right((a,b)) 22 / 39
  21. 24.

    sealed trait Parser[A] case class Exactly(char: Char) extends Parser[Char] case

    class Pure[A](value: A) extends Parser[A] case class Or[A]( parser1 Parser[A], parser2 Parser[A] ) extends Parser[A] case class Bind[A, B]( parser: Parser[A], f: A Parser[B] ) extends Parser[B] implicit val parserMonad: Monad[Parser] = new Monad[Parser] { def point[A](a: A) Parser[A] = Pure(a) def bind[A, B](fa: Parser[A])(f: A Parser[B]) Parser[B] = Bind(fa, f) } implicit val parserPlus: Plus[Parser] = new Plus[Parser] { def plus[A](a: Parser[A], b: Parser[A]) Parser[A] = Or(a, b) } implicit def parserSemigroup[A] Semigroup[Parser[A]] = parserPlus.semigroup[A] 23 / 39
  22. 25.

    def and[A, B](parserA Parser[A], parserB Parser[B]) Parser[(A, B)] = parserA.tuple(parserB)

    val allDigits: NonEmptyList[Char] = NonEmptyList('0', ('1' to '9') _*) def charToInt(c: Char) Int = c.toString.toInt ¯\_( ツ)_/¯ val digit: Parser[Int] = allDigits.foldMap1(d Exactly(d).map(charToInt)) def maybe[A](parser: Parser[A]) Parser[Option[A]] = Or(parser.map(Option(_)), Pure(Option.empty[A])) object someMany { def some[A](parser: Parser[A]) Parser[List[A]] = for { head parser tail many(parser) } yield head tail def many[A](parser: Parser[A]) Parser[List[A]] = Or(some(parser), Pure(List.empty[A])) } import someMany._ 24 / 39
  23. 26.

    def run[F[_], A]( parser: Parser[A] )( implicit M: MonadStateError[F, String,

    Error] ) F[A] = parser match { case Exactly(char) for { head next _ whenM(head char)(M.raiseError(Unexpected(head))) } yield head case Pure(value) M.point(value) case Or(parser1, parser2) M.get.flatMap { input M.handleError(run(parser1)) { _ M.put(input) run(parser2) } } case Bind(parser, f) run(parser).flatMap(a run(f(a))) } type Result[A] = Either[Error, A] type ResultST[A] = StateT[Result, String, A] val toResultST Parser ResultST = new (Parser ResultST) { val MS: MonadState[ResultST, String] = StateT.stateTMonadState[String, Resu val ME: MonadError[ResultST, Error] = StateT.stateTMonadError[String, Resul def apply[A](parser: Parser[A]) ResultST[A] = run(parser)(MonadStateError( } def eval[A](parser: Parser[A])(input: String) Result[A] = toResultST(parser) 25 / 39
  24. 27.

    val lowercase: Parser[Char] = NonEmptyList('a', ('b' to 'z') _*).foldMap1[Parser[Char]](Exactly(_)) val

    foo: Parser[(Char, List[Int])] = and(lowercase, some(digit)) val result = toResultST(foo) scala> result.run("") res35 Result[(String, (Char, List[Int]))] = Left(EOF) scala> result.run("A") res36 Result[(String, (Char, List[Int]))] = Left(Unexpected(A)) scala> result.run("a") res37 Result[(String, (Char, List[Int]))] = Left(EOF) scala> result.run("b42az") res38 Result[(String, (Char, List[Int]))] = Right((az,(b,List(4, 2)))) 26 / 39
  25. 28.

    val protocol: Parser[String] = for { n digit _ Exactly(':')

    chars lowercase.replicateM(n) } yield chars.mkString scala> eval(protocol)("") res39 Result[String] = Left(EOF) scala> eval(protocol)("abcdefg42") res40 Result[String] = Left(Unexpected(a)) scala> eval(protocol)("3abcdefg42") res41 Result[String] = Left(Unexpected(a)) scala> eval(protocol)("3:abcdefg42") res42 Result[String] = Right(abc) 27 / 39
  26. 29.

    Parser combinators in Idris data Grammar : (tok : Type)

    (consumes : Bool) Type Type where Empty : (val : ty) Grammar tok False ty Terminal : (tok Maybe a) Grammar tok True a NextIs : (tok Bool) Grammar tok False tok EOF : Grammar tok False () Fail : String Grammar tok c ty Commit : Grammar tok False () SeqEat : Grammar tok True a Inf (a Grammar tok c2 b) Grammar tok True b SeqEmpty : {c1, c2 : Bool} Grammar tok c1 a (a Grammar tok c2 b) Grammar tok (c1 || c2) b Alt : {c1, c2 : Bool} Grammar tok c1 ty Grammar tok c2 ty Grammar tok (c1 c2) ty ( ) : {c1 : Bool} Grammar tok c1 a inf c1 (a Grammar tok c2 b) Grammar tok (c1 || c2) b ( ) {c1 = False} = SeqEmpty ( ) {c1 = True} = SeqEat 28 / 39
  27. 30.

    sealed trait Bool { type If[T Out, F Out, Out]

    <: Out } sealed trait True extends Bool { type If[T Out, F Out, Out] = T } sealed trait False extends Bool { type If[T Out, F Out, Out] = F } type [A Bool, B Bool] = A#If[B, False, Bool] type [A Bool, B Bool] = A#If[True, B, Bool] object Bool { sealed trait Refl[A Bool, B Bool] object Refl extends Refl0 trait Refl0 extends Refl1 { implicit def reflAndTrue[A Bool] Refl[A True, A] = null implicit def reflAndFalse[A Bool] Refl[A False, False] = null } trait Refl1 { implicit def reflOrTrue[A Bool] Refl[A True, True] = null implicit def reflOrFalse[A Bool] Refl[A False, A] = null } } import Bool._ 29 / 39
  28. 31.

    sealed trait Parser[A, X Bool] case class Exactly(char: Char) extends

    Parser[Char, True] case class Pure[A](value: A) extends Parser[A, False] case class Or[A, X1 Bool, X2 Bool]( parser1 Parser[A, X1], parser2 Parser[A, X2] ) extends Parser[A, X1 X2] case class Bind[A, X1 Bool, B, X2 Bool]( parser: Parser[A, X1], f: A Parser[B, X2] ) extends Parser[B, X1 X2] 30 / 39
  29. 32.

    object Parser { sealed trait Parser[A, X Bool] { self

    def map[B](f: A B) Parser[B, X] = Bind(self, (a: A) Pure(f(a))).refl def flatMap[B, X2 Bool](f: A Parser[B, X2]) Parser[B, X X2] = Bind(self, f) def refl[X2 Bool](implicit refl: Refl[X, X2]) Parser[A, X2] = this.asInstanceOf[Parser[A, X2]] } case class Exactly(char: Char) extends Parser[Char, True] case class Pure[A](value: A) extends Parser[A, False] case class Or[A, X1 Bool, X2 Bool]( parser1 Parser[A, X1], parser2 Parser[A, X2] ) extends Parser[A, X1 X2] case class Bind[A, X1 Bool, B, X2 Bool]( parser: Parser[A, X1], f: A Parser[B, X2] ) extends Parser[B, X1 X2] } import Parser._ 31 / 39
  30. 33.

    val a: Parser[Char, True] = Exactly('a') val b: Parser[Char, True]

    = Exactly('b') val aOrB Parser[Char, True] = Or(a, b) def wrapped[A, X Bool](parser: Parser[A, X]) Parser[A, True] = for { _ Exactly('(') a parser _ Exactly(')') } yield a def maybe[A, X Bool](parser: Parser[A, X]) Parser[Option[A], False] = Or(parser.map(Option(_)), Pure(Option.empty[A])).refl object someMany { def some[A, X Bool](parser: Parser[A, X]) Parser[List[A], X] = parser.flatMap { a many(parser).map(a _) }.refl def many[A, X Bool](parser: Parser[A, X]) Parser[List[A], False] = Or(some(parser), Pure(List.empty[A])).refl } import someMany._ 32 / 39
  31. 34.

    def runForget[F[_], A]( parser: Parser[A, _] )( implicit M: MonadStateError[F,

    String, Error] ) F[A] = parser match { case Exactly(char) for { head next _ whenM(head char)(M.raiseError(Unexpected(head))) } yield head case Pure(value) M.point(value) case Or(parser1, parser2) M.get.flatMap { input M.handleError(runForget(parser1)) { _ M.put(input) runForget(parser2) } } case Bind(parser, f) runForget(parser).flatMap(a runForget(f(a))) } type Result[A] = Either[Error, A] type ResultST[A] = StateT[Result, String, A] def toResultST[A](parser: Parser[A, True]) ResultST[A] = { val MS: MonadState[ResultST, String] = StateT.stateTMonadState[String, Resu val ME: MonadError[ResultST, Error] = StateT.stateTMonadError[String, Resul runForget(parser)(MonadStateError(MS, ME)) } def eval[A](parser: Parser[A, True])(input: String) Result[A] = toResultST(p 33 / 39
  32. 35.

    scala> eval(aOrBs)("") res48 Result[List[Char]] = Left(EOF) scala> eval(aOrBs)("abcd") res49 Result[List[Char]]

    = Left(Unexpected(a)) scala> eval(aOrBs)("(a)(b)(b)(a)cd") res50 Result[List[Char]] = Right(List(a, b, b, a)) scala> eval(maybeAB)("") <console>:43: error: type mismatch; found : Parser.Parser[Option[Char],False] required: Parser.Parser[?,True] eval(maybeAB)("") ^ val aOrBs = some(wrapped(aOrB)) aOrBs: Parser.Parser[List[Char],True] = Bind(Bind(Exactly((),$$Lambda$4605 val maybeAB = maybe(aOrB) maybeAB Parser.Parser[Option[Char],False] = Or(Bind(Or(Exactly(a),Exactly 34 / 39
  33. 36.

    def stupid: Parser[Unit, False] = Pure(()).flatMap(_ stupid) def woops: Parser[Unit,

    True] = Exactly('$').flatMap(_ stupid) scala> eval(woops)("") res52 Result[Unit] = Left(EOF) scala> eval(woops)("$") java.lang.StackOverflowError at scalaz.std.EitherInstances$$anon$1.point(Either.scala:82) at scalaz.std.EitherInstances$$anon$1.point(Either.scala:67) at scalaz.Monad.$anonfun$map$1(Monad.scala:14) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:70) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:67) at scalaz.Monad.map(Monad.scala:14) at scalaz.Monad.map$(Monad.scala:14) at scalaz.std.EitherInstances$$anon$1.map(Either.scala:67) at scalaz.IndexedStateT.apply(StateT.scala:10) at scalaz.IndexedStateT.run(StateT.scala:13) at scalaz.IndexedStateT.$anonfun$mapsf$2(StateT.scala:98) at scalaz.IndexedStateT.$anonfun$flatMap$2(StateT.scala:65) at scalaz.IndexedStateT.$anonfun$flatMap$4(StateT.scala:67) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:70) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:67) at scalaz.IndexedStateT.$anonfun$flatMap$3(StateT.scala:67) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:70) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:67) at scalaz.IndexedStateT.$anonfun$flatMap$2(StateT.scala:65) at scalaz.IndexedStateT.$anonfun$flatMap$4(StateT.scala:67) at scalaz.std.EitherInstances$$anon$1.bind(Either.scala:70) 35 / 39
  34. 37.

    Cynicism: The hope that someday you will have known better

    all along. Nein. A Manifesto, Eric Jarosinski 36 / 39
  35. 39.
  36. 40.

    Reading list Functional Programming in Scala, Paul Chiusano and Runar

    Bjarnason Type-Driven Development with Idris, Edwin Brady Monadic parsing in Haskell, Graham Hutton and Erik Meijer 39 / 39