Slide 38
Slide 38 text
3. From RDDs, with manual schema definition
4. By reading (semi-)structured data files
// Build a DataFrame from an RDD by declaring the schema by hand.
// The fields are listed in the same order as the values placed in each Row below.
val schema = StructType(Array(
  StructField("level", StringType, nullable = true),
  StructField("date", DateType, nullable = true),
  // Spark SQL's 32-bit integer type is IntegerType (there is no `IntType`).
  StructField("client_id", IntegerType, nullable = true),
  StructField("stage", StringType, nullable = true),
  StructField("msg", StringType, nullable = true)
))

// Split every "#"-separated line of the log file and wrap the pieces in a Row.
// A DateType column needs java.sql.Date values; Date.valueOf expects "yyyy-[m]m-[d]d".
// NOTE(review): confirm that field 1 of the log really uses that date format.
// Lines with fewer than five fields or a non-numeric field 2 will still fail here.
val rowRdd = sc.textFile("ghtorrent-log.txt").
  map(_.split("#")).
  map(r => Row(r(0), java.sql.Date.valueOf(r(1)), r(2).toInt,
    r(3), r(4)))

// Pair the rows with the schema; the RDD name must match its definition exactly.
val logDF = spark.createDataFrame(rowRdd, schema)
// Read a JSON file into a DataFrame by naming the source format explicitly
// and then loading the path.
val df = spark.read.
  format("json").
  load("examples/src/main/resources/people.json")
# PySpark: read a comma-separated file into a DataFrame, passing the
# separator, header and schema-inference options explicitly.
# NOTE(review): the original line ended with the stray tokens "5 . 17",
# apparently a slide/page marker; kept here as a comment so the call parses.
df = sqlContext.read.csv("/datasets/pullreqs.csv", sep=",",
                         header=True, inferSchema=True)