Slide 23
Slide 23 text
/** Canonical form applied to a value before it is matched against a token: lower-cased. */
val normalize: Column => Column = value => lower(value)
/** Unpivots the given columns of `df` into distinct, normalized (k, v) pairs.
  *
  * Each row contributes one `struct(k, v)` per entry in `columns`, where `k` is the
  * column name and `v` is that column's value. The structs are flattened into rows,
  * `v` is passed through `normalize`, and duplicate (k, v) pairs are removed.
  *
  * @param columns names of the columns whose values should be collected
  * @param df      source DataFrame containing every column named in `columns`
  * @return a DataFrame with columns `k` and `v`, one row per distinct pair
  */
def fanoutOriginals(columns: Seq[String], df: DataFrame): DataFrame =
  df.withColumn("wrapper", array(columns.map(c => struct(lit(c) as "k", col(c) as "v")): _*))
    // NOTE(review): the flattening step was missing from the extracted slide text and is
    // reconstructed here; the later references to "k" and "v" require the array of
    // structs to be exploded into top-level columns first.
    .selectExpr("explode(wrapper) as kv")
    .select("kv.*")
    .withColumn("v", normalize('v))
    .dropDuplicates("k", "v")
/** Replaces the values of each listed column with the matching token.
  *
  * For every column `c`, the value is normalized and moved to `${c}_normalized`, then
  * left-joined against the rows of `tokens` whose `k` equals `c`; the joined `token`
  * value takes over the name `c`. Values with no matching token end up null because
  * the join is a left join. The result keeps only the columns named in `columns`.
  *
  * @param columns names of the columns to tokenize
  * @param df      DataFrame holding the original values
  * @param tokens  lookup DataFrame with columns `k`, `v` and `token`
  * @return a DataFrame with exactly the columns in `columns`, holding tokens
  */
def replaceOriginalsWithTokens(columns: Seq[String],
                               df: DataFrame, tokens: DataFrame): DataFrame =
  columns.foldLeft(df) { (acc, c) =>
    val normalizedName = s"${c}_normalized"
    // NOTE(review): the `.join(tokens.where(` / `.withColumnRenamed(` call heads were lost
    // in the extracted slide text; they are reconstructed from the surviving arguments.
    val lookup = tokens.where('k === c)
      .withColumnRenamed("v", normalizedName)
      .withColumnRenamed("token", c)
    acc.withColumn(c, normalize(col(c)))
      .withColumnRenamed(c, normalizedName)
      .join(lookup, Seq(normalizedName), "left")
  }
    // Drops the helper `*_normalized` columns and anything carried over from `tokens`.
    .select(columns.map(col(_)): _*)
/** Tokenizes the given columns of a DataFrame in one pass.
  *
  * Collects the distinct normalized values of `columns`, assigns each a `token` from
  * `row_number()` over a window ordered by `rand()` (so token numbers do not follow the
  * order of the input), and then substitutes the tokens for the original values.
  * The token mapping is built per call and is not stored anywhere by this method.
  *
  * @param columns names of the columns to tokenize
  * @param df      DataFrame to tokenize; the second parameter list lets a partially
  *                applied call be used where a `DataFrame => DataFrame` is expected
  * @return the result of `replaceOriginalsWithTokens` for the generated tokens
  */
def tokenizeSnapshot(columns: String*)(df: DataFrame): DataFrame = {
  val newToken = row_number() over Window.orderBy(rand())
  // NOTE(review): the `.withColumn(` call head was missing from the extracted slide text
  // and is reconstructed from the surviving ("token", newToken) arguments.
  val tokens = fanoutOriginals(columns, df)
    .withColumn("token", newToken)
  replaceOriginalsWithTokens(columns, df, tokens)
}
// Applies tokenizeSnapshot to the email, name, joindate, city and industry columns.
// NOTE(review): this statement looks truncated by the slide extraction — the argument of
// spark.table( and the call that applies tokenizeSnapshot to the table (presumably
// `.transform(`) are missing. Restore both from the original deck; TODO confirm.
val generic = spark.table(
tokenizeSnapshot("email", "name", "joindate", "city", "industry"))
● protects from sequence attacks by
randomizing token allocation within append
● good for a quick demonstration of concepts
● doesn't do the most important thing - persist
token <-> value relationships in a vault
● generically applies tokenization to specified