```python
# Continues the hand-rolled example: `df`, the `stats` list, `F` (pyspark.sql.functions)
# and `Row` (pyspark.sql) are already defined/imported above.

# age
age_stats = df.groupby("region").agg(
    F.avg("age").alias("avg"),
    F.max("age").alias("max"),
    F.min("age").alias("min"),
)
for row in age_stats.toLocalIterator():
    stats.append({"col": "age", "region": row.region, "metric": "avg", "value": row["avg"]})
    stats.append({"col": "age", "region": row.region, "metric": "max", "value": row["max"]})
    stats.append({"col": "age", "region": row.region, "metric": "min", "value": row["min"]})

# device
device_counts = df.groupby("region", "device").agg(F.count("device").alias("count"))
device_unique = device_counts.groupby("region").agg(F.count("count").alias("unique"))
for row in device_counts.toLocalIterator():
    stats.append({"col": "device", "region": row.region, "metric": "count", "value": row["count"], "device": row.device})
for row in device_unique.toLocalIterator():
    stats.append({"col": "device", "region": row.region, "metric": "count", "unique": row["unique"]})

# ratings
def truncate(df, col, k):
    """Keep only the first k elements of an array column, via an RDD round-trip."""
    def _(row):
        dic = row.asDict()
        dic[col] = dic[col][:k]
        return Row(**dic)
    return df.rdd.map(_).toDF()

ratings_stats_all = (
    df.select("region", F.explode("ratings").alias("ratings"))
    .groupby("region")
    .agg(F.avg("ratings").alias("avg"), F.max("ratings").alias("max"), F.min("ratings").alias("min"))
)
for row in ratings_stats_all.toLocalIterator():
    stats.append({"col": "ratings", "region": row.region, "metric": "avg", "value": row["avg"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "max", "value": row["max"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "min", "value": row["min"]})

ratings_stats_top5 = (
    truncate(df, "ratings", 5)
    .select("region", F.explode("ratings").alias("ratings"))
    .groupby("region")
    .agg(F.avg("ratings").alias("avg"), F.max("ratings").alias("max"), F.min("ratings").alias("min"))
)
for row in ratings_stats_top5.toLocalIterator():
    stats.append({"col": "ratings", "region": row.region, "metric": "avg@5", "value": row["avg"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "max@5", "value": row["max"]})
    stats.append({"col": "ratings", "region": row.region, "metric": "min@5", "value": row["min"]})

# interests
interests_count_all = (
    df.select("region", F.explode("interests").alias("interests"))
    .groupby("region", "interests")
    .agg(F.count("interests").alias("count"))
)
interests_unique_all = interests_count_all.groupby("region").agg(F.count("count").alias("unique"))
for row in interests_count_all.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "count", "value": row["count"], "interests": row.interests})
for row in interests_unique_all.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "count", "unique": row["unique"]})

interests_count_top5 = (
    truncate(df, "interests", 5)
    .select("region", F.explode("interests").alias("interests"))
    .groupby("region", "interests")
    .agg(F.count("interests").alias("count"))
)
interests_unique_top5 = interests_count_top5.groupby("region").agg(F.count("count").alias("unique"))
for row in interests_count_top5.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "count", "value": row["count"], "interests": row.interests})
for row in interests_unique_top5.toLocalIterator():
    stats.append({"col": "interests", "region": row.region, "metric": "count", "unique": row["unique"]})
```
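As an aside (not part of the original example), the RDD round-trip in `truncate` can usually be avoided by slicing the array column directly; the sketch below assumes the same `df` and its `ratings` array column, and uses `F.slice`, which is 1-indexed and available in `pyspark.sql.functions` since Spark 2.4:

```python
from pyspark.sql import functions as F

# Illustrative alternative to truncate(df, "ratings", 5):
# keep the first 5 elements of the "ratings" array column without leaving the DataFrame API.
ratings_top5_df = df.withColumn("ratings", F.slice("ratings", 1, 5))
```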
Library support for the same case (metrics collection):

```python
from lupus.processor.spark import DistributionProcessor

processor = DistributionProcessor(
    df,
    group_columns=["region"],
    column_metrics={
        "age": ["avg", "p25", "p50", "p75"],
        "device": ["count", "unique"],
        "ratings": ["avg", "avg@5", "min", "max"],
        "interests": ["count", "unique", "unique@3"],
    },
)
metrics = processor.get_metrics()
```
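The `"p25"`/`"p50"`/`"p75"` entries in `column_metrics` presumably refer to the 25th, 50th, and 75th percentiles. For comparison with the hand-rolled code above, a plain PySpark sketch of those quartiles (an illustration only, assuming Spark 3.1+ for `percentile_approx`) could look like:

```python
from pyspark.sql import functions as F

# Approximate quartiles of "age" per region, mirroring the "p25"/"p50"/"p75" metric names.
age_percentiles = df.groupby("region").agg(
    F.percentile_approx("age", 0.25).alias("p25"),
    F.percentile_approx("age", 0.50).alias("p50"),
    F.percentile_approx("age", 0.75).alias("p75"),
)
```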