Slide 23
import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Normalization applied to every value before it is tokenized.
val normalize = (x: Column) => lower(x)

// Melt the selected columns into one (k, v) row per column name and
// normalized value, deduplicated so each distinct value gets one token.
def fanoutOriginals(columns: Seq[String], df: DataFrame) =
  df.withColumn("wrapper", array(
      columns.map(c => struct(lit(c) as "k", col(c) as "v")): _*))
    .select(explode(col("wrapper")))
    .selectExpr("col.*")
    .withColumn("v", normalize(col("v")))
    .dropDuplicates("k", "v")

// For each column, left-join its slice of the token table and swap the
// normalized value for its token, then restore the original column set.
def replaceOriginalsWithTokens(columns: Seq[String], df: DataFrame,
                               tokens: DataFrame) =
  columns.foldLeft(df)((acc, c) =>
    acc.withColumn(c, normalize(col(c)))
      .withColumnRenamed(c, s"${c}_normalized")
      .join(tokens
          .where(col("k") === c)
          .withColumnRenamed("v", s"${c}_normalized")
          .withColumnRenamed("token", c),
        Seq(s"${c}_normalized"), "left"))
    .select(df.columns.map(col): _*)

// Tokens are row numbers over a random ordering, so allocation within a
// batch carries no information about the underlying values.
def tokenizeSnapshot(columns: String*)(df: DataFrame) = {
  val newToken = row_number() over Window.orderBy(rand())
  val tokens = fanoutOriginals(columns, df).withColumn("token", newToken)
  replaceOriginalsWithTokens(columns, df, tokens)
}
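For intuition, here is a minimal, hypothetical run of fanoutOriginals; the input data and column names are illustrative, not from the slide, and row order after dropDuplicates may vary:

import spark.implicits._  // notebook-style implicits for toDF

val demo = Seq(
  ("Ann@Example.com", "Oslo"),
  ("BOB@example.com", "Oslo")
).toDF("email", "city")

fanoutOriginals(Seq("email", "city"), demo).show()
// +-----+---------------+
// |    k|              v|
// +-----+---------------+
// |email|ann@example.com|
// |email|bob@example.com|
// | city|           oslo|   <- the two "Oslo" rows collapse into one pair
// +-----+---------------+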
val generic = spark.table("resellers")
  .transform(tokenizeSnapshot("email", "name", "joindate", "city", "industry"))
display(generic)
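The result keeps the schema of resellers, but each listed column now holds an integer token in place of the original value; because values are normalized before the join, equal values up to case (for example, a hypothetical "Oslo" and "oslo" in city) receive the same token within the batch.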
● protects from sequence attacks by randomizing token allocation within the append microbatch
● good for a quick demonstration of the concepts
● doesn't do the most important thing: persist the token <-> value relationships in a vault (a sketch follows this list)
● generically applies tokenization to the specified columns
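A minimal sketch of what vault persistence could look like, assuming Delta Lake and a pre-created token_vault table with columns (k, v, token); the table name, the persistTokens helper, and the token scheme (random row numbers offset by the vault's current maximum) are assumptions for illustration, not part of the original slide:

import io.delta.tables.DeltaTable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

def persistTokens(columns: Seq[String], df: DataFrame): DataFrame = {
  val vault = DeltaTable.forName(spark, "token_vault")  // assumed table
  // Mint tokens only for (k, v) pairs the vault has not seen before.
  val unseen = fanoutOriginals(columns, df)
    .join(vault.toDF, Seq("k", "v"), "left_anti")
  val maxToken = vault.toDF
    .agg(coalesce(max("token"), lit(0L))).first.getLong(0)
  val minted = unseen.withColumn(
    "token", (row_number() over Window.orderBy(rand())) + lit(maxToken))
  // MERGE keeps inserts idempotent even with concurrent writers.
  vault.as("t")
    .merge(minted.as("u"), "t.k = u.k AND t.v = u.v")
    .whenNotMatched().insertAll()
    .execute()
  vault.toDF  // the full, now-stable value -> token mapping
}

With this in place, tokenizeSnapshot would call replaceOriginalsWithTokens(columns, df, persistTokens(columns, df)), so a given value maps to the same token across snapshots instead of being re-tokenized on every run.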