Slide 15
Slide 15 text
+--------------------------+
|                  sentence|
+--------------------------+
|spark spark fast fast fast|
+--------------------------+
Word Count by DataFrame
df = cxt.createDataFrame([('spark spark fast fast fast',)],['sentence'])
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType,ArrayType
from pyspark.sql.functions import explode
+-----+-----+
| word|count|
+-----+-----+
|spark|    2|
| fast|    3|
+-----+-----+
.groupBy('word').count()\
.show()
groupBy
+-----------------------------+
|              split(sentence)|
+-----------------------------+
|ArrayBuffer(spark, spark, ...|
+-----------------------------+
split_udf = udf(lambda x: x.split(), ArrayType(StringType()))
UDF split
split_udf(df['sentence'])
+-----+
| word|
+-----+
|spark|
|spark|
| fast|
| fast|
| fast|
+-----+
explode
df.withColumn('word',explode(split_udf(df['sentence'])))\