// Slide 11: customer segmentation of online-retail data with Spark ML K-Means
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
/** One row of the online-retail CSV: a single invoice line item.
  *
  * Field names deliberately match the CSV header columns (UpperCamelCase)
  * so the schema derived via `Encoders.product[Data]` lines up with the file.
  */
case class Data(
    InvoiceNo: String,
    StockCode: String,
    Description: String,
    Quantity: Long,
    InvoiceDate: String,
    UnitPrice: Double,
    CustomerID: String,
    Country: String
)
// Derive the CSV schema from the Data case class so column names and types
// are declared once, then read the file and clean it: rows with no
// CustomerID cannot be attributed to a customer, and exact-duplicate rows
// are dropped.
val schema = Encoders.product[Data].schema
val df = spark.read
  .option("header", true)
  .schema(schema)
  .csv("./data.csv")
val clean = df
  .na.drop(Seq("CustomerID"))
  .dropDuplicates()
// Derive per-line-item columns. Stock code "D" marks a discount line and
// "P" a postage line rather than merchandise; an InvoiceNo starting with
// "C" marks a cancellation.
// NOTE: `!==` is deprecated since Spark 2.0 — use `=!=` (same null
// semantics: a null StockCode makes the condition null, so otherwise(0)
// applies, as in the original).
val data = clean
  // Merchandise revenue; discount lines contribute 0 here.
  .withColumn("total", when($"StockCode" =!= "D", $"UnitPrice" * $"Quantity").otherwise(0))
  // Discount amount; non-discount lines contribute 0.
  .withColumn("Discount", when($"StockCode" === "D", $"UnitPrice" * $"Quantity").otherwise(0))
  // Flag postage lines.
  .withColumn("Postage", when($"StockCode" === "P", 1).otherwise(0))
  // Strip the cancellation prefix so a cancellation groups with its invoice.
  .withColumn("Invoice", regexp_replace($"InvoiceNo", "^C", ""))
  // Spark SQL substring() is 1-based, so the original substring(col, 0, 1)
  // only worked because Spark treats position 0 like 1; startsWith says
  // what is meant.
  .withColumn("Cancelled", when($"InvoiceNo".startsWith("C"), 1).otherwise(0))
// Roll line items up to one row per invoice (keyed also by country and
// customer): total revenue, total discount, and whether the invoice was
// cancelled (max of the per-line flag).
val aggregated = data
  .groupBy($"Invoice", $"Country", $"CustomerID")
  .agg(
    sum($"Discount").as("Discount"),
    sum($"total").as("Total"),
    max($"Cancelled").as("Cancelled")
  )
// One row per customer: lifetime totals and invoice counts, the four
// numeric features used for clustering below.
// Fix: the original had the string literal "Invoices" broken across a hard
// line wrap ("In / voices"), which does not compile.
val customers = aggregated
  .groupBy($"CustomerID")
  .agg(
    sum($"Total").as("Total"),
    sum($"Discount").as("Discount"),
    sum($"Cancelled").as("Cancelled"),
    count($"Invoice").as("Invoices")
  )
import org.apache.spark.ml.feature.VectorAssembler
// Pack the four numeric customer columns into the single vector column
// that Spark ML estimators expect.
val assembler = new VectorAssembler()
  .setInputCols(Array("Total", "Discount", "Cancelled", "Invoices"))
  .setOutputCol("features")
val features = assembler.transform(customers)
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
// Hold out 30% of customers, fit K-Means (k = 12) on the remaining 70%,
// then score the held-out set.
// Fixes: seed both the split and KMeans so runs are reproducible, and
// actually use the imported ClusteringEvaluator to report silhouette.
val Array(test, train) = features.randomSplit(Array(0.3, 0.7), seed = 42L)
val kmeans = new KMeans()
  .setK(12)
  .setSeed(42L)
  .setFeaturesCol("features")
  .setPredictionCol("prediction")
val model = kmeans.fit(train)
model.clusterCenters.foreach(println)
val predictions = model.transform(test)
// Silhouette with squared euclidean distance (the evaluator default);
// values closer to 1 indicate better-separated clusters.
val silhouette = new ClusteringEvaluator().evaluate(predictions)
println(s"Silhouette on held-out customers = $silhouette")