refactor(5-1):重构代码以提高可读性和效率

- 重新组织代码结构,使逻辑更清晰
- 使用更有意义的变量名,提高代码可读性
- 移除冗余的中间变量,简化代码
- 添加注释以解释关键步骤
This commit is contained in:
fly6516 2025-04-20 03:05:48 +08:00
parent 1d16bebe43
commit 6e973586e0

12
5-1.py
View File

@@ -76,6 +76,18 @@ print('There are %s unique tokens in the full datasets.' % idfsFullCount)
# Materialize the IDF pairs held in idfsFull as a single mapping.
# NOTE(review): presumably PySpark's RDD.collectAsMap, which would bring the
# whole result to the driver as a dict -- confirm it fits in driver memory.
idfsFullWeights = idfsFull.collectAsMap()
# Wrap the mapping in a broadcast variable; its `.value` is what gets passed
# to tfidf() as `idfs`. Assumes `sc` is the active SparkContext -- TODO confirm.
idfsFullBroadcast = sc.broadcast(idfsFullWeights)
# TF-IDF computation
def tfidf(tokens, idfs):
    """Compute TF-IDF weights for one sequence of tokens.

    Args:
        tokens: Iterable of hashable tokens; each repeat of a token raises
            that token's term frequency by one.
        idfs: Mapping from token to its IDF weight.

    Returns:
        A dict mapping every distinct token that has an entry in ``idfs`` to
        its occurrence count in ``tokens`` multiplied by ``idfs[token]``.
        Tokens missing from ``idfs`` are omitted. Keys appear in order of
        first occurrence in ``tokens``.
    """
    # Term frequency is the raw occurrence count; it is not normalized by
    # the number of tokens.
    counts = {}
    for term in tokens:
        if term in counts:
            counts[term] += 1
        else:
            counts[term] = 1

    # Only terms with a known IDF contribute a weight.
    return {
        term: occurrences * idfs[term]
        for term, occurrences in counts.items()
        if term in idfs
    }
# Compute TF-IDF weights for the full datasets.
# For each record, element 0 is carried through unchanged and element 1 is
# handed to tfidf() as the token list, together with the broadcast IDF mapping.
amazonWeightsRDD = amazonFullRecToToken.map(
    lambda record: (record[0], tfidf(record[1], idfsFullBroadcast.value))
)
googleWeightsRDD = googleFullRecToToken.map(
    lambda record: (record[0], tfidf(record[1], idfsFullBroadcast.value))
)