Case: Metrics collection

import pyspark.sql.functions as F
from pyspark.sql import Row
stats = []
# age
age_stats = df.groupby("region").agg(
    F.avg("age").alias("avg"), F.max("age").alias("max"), F.min("age").alias("min")
)
for row in age_stats.toLocalIterator():
    stats.append({"col": "age", "region": row.region, "metric": "avg", "value": row["avg"]})
    stats.append({"col": "age", "region": row.region, "metric": "max", "value": row["max"]})
    stats.append({"col": "age", "region": row.region, "metric": "min", "value": row["min"]})
# device
device_counts = df.groupby("region", "device").agg(F.count("device").alias("count"))
# device_counts has one row per (region, device), so counting its rows per
# region yields the number of distinct devices.
device_unique = device_counts.groupby("region").agg(F.count("count").alias("unique"))
for row in device_counts.toLocalIterator():
    stats.append({"col": "device", "region": row.region, "metric": "count", "value": row["count"], "device": row.device})
for row in device_unique.toLocalIterator():
    stats.append({"col": "device", "region": row.region, "metric": "unique", "value": row["unique"]})
# ratings
def truncate(df, col, k):
    """Keep only the first k elements of an array column, via an RDD round-trip."""
    def _(row):
        dic = row.asDict()
        dic[col] = dic[col][:k]
        return Row(**dic)
    return df.rdd.map(_).toDF()
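# truncate() round-trips every row through Python and re-infers the schema on
# the way back, which is slow for wide tables. On Spark 2.4+ the truncation
# can stay in the JVM; a sketch using F.slice (note the 1-based start index):
def truncate_native(df, col, k):
    return df.withColumn(col, F.slice(F.col(col), 1, k))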
ratings_stats_all = (
    df.select("region", F.explode("ratings").alias("ratings"))
    .groupby("region")
    .agg(F.avg("ratings").alias("avg"), F.max("ratings").alias("max"), F.min("ratings").alias("min"))
)
for row in ratings_stats_all.toLocalIterator():
    stats.append({"col": "ratings", "region": row.region, "metric": "avg", "value": row["avg"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "max", "value": row["max"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "min", "value": row["min"]})
ratings_stats_top5 = (
    truncate(df, "ratings", 5)
    .select("region", F.explode("ratings").alias("ratings"))
    .groupby("region")
    .agg(F.avg("ratings").alias("avg"), F.max("ratings").alias("max"), F.min("ratings").alias("min"))
)
for row in ratings_stats_top5.toLocalIterator():
    stats.append({"col": "ratings", "region": row.region, "metric": "avg@5", "value": row["avg"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "max@5", "value": row["max"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "min@5", "value": row["min"]})
interests_count_all = (
    df.select("region", F.explode("interests").alias("interests"))
    .groupby("region", "interests")
    .agg(F.count("interests").alias("count"))
)
interests_unique_all = interests_count_all.groupby("region").agg(F.count("count").alias("unique"))
for row in interests_count_all.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "count", "value": row["count"], "interests": row.interests})
for row in interests_unique_all.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "unique", "value": row["unique"]})
interests_count_top5 = (
    truncate(df, "interests", 5)
    .select("region", F.explode("interests").alias("interests"))
    .groupby("region", "interests")
    .agg(F.count("interests").alias("count"))
)
interests_unique_top5 = interests_count_top5.groupby("region").agg(F.count("count").alias("unique"))
for row in interests_count_top5.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "count@5", "value": row["count"], "interests": row.interests})
for row in interests_unique_top5.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "unique@5", "value": row["unique"]})
Library support

from lupus.processor.spark import DistributionProcessor

processor = DistributionProcessor(
    df,
    group_columns=["region"],
    column_metrics={
        "age": ["avg", "p25", "p50", "p75"],
        "device": ["count", "unique"],
        "ratings": ["avg", "avg@5", "min", "max"],
        "interests": ["count", "unique", "count@5"],
    },
)
metrics = processor.get_metrics()
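For comparison with the hand-rolled version, a minimal inspection step; this assumes get_metrics() returns long-format records shaped like the manual stats list, which is an assumption about lupus and not confirmed above:

for m in metrics[:5]:  # hypothetical: treats `metrics` as a list of dicts
    print(m["col"], m["region"], m["metric"], m.get("value"))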