feat: 创建亚马逊和谷歌权重的广播变量

- 新增广播变量 amazon_weights_broadcast 和 google_weights_broadcast
- 使用 collectAsMap() 方法将 RDD 转换为字典并进行广播
- 这些广播变量可以用于后续的分布式计算中,提高效率
This commit is contained in:
fly6516 2025-04-20 02:55:27 +08:00
parent 8bccc2cad7
commit dc883eaf72

4
5-1.py
View File

@@ -95,6 +95,10 @@ def tfidf(tokens, idfs):
# Build the per-record weight RDDs: each element keeps its first item as the
# key, and its second item is passed to tfidf() together with the broadcast
# IDF table to produce that record's weights.
amazon_weights_rdd = amazon_rec_to_token.map(
    lambda rec: (rec[0], tfidf(rec[1], idfs_full_broadcast.value))
)
google_weights_rdd = google_rec_to_token.map(
    lambda rec: (rec[0], tfidf(rec[1], idfs_full_broadcast.value))
)

# Create the broadcast variables: collectAsMap() gathers each weights RDD
# into a dict (key -> weights), which is then broadcast through `sc`.
amazon_weights_broadcast = sc.broadcast(
    amazon_weights_rdd.collectAsMap()
)
google_weights_broadcast = sc.broadcast(
    google_weights_rdd.collectAsMap()
)
# 计算权重范数
def norm(weights):