refactor(5-1):重构代码以提高可读性和效率
- 重新组织代码结构,使逻辑更清晰
- 使用更有意义的变量名,提高代码可读性
- 移除冗余的中间变量,简化代码
- 添加注释以解释关键步骤
This commit is contained in:
parent
1d16bebe43
commit
6e973586e0
12
5-1.py
12
5-1.py
@ -76,6 +76,18 @@ print('There are %s unique tokens in the full datasets.' % idfsFullCount)
|
||||
# Materialize idfsFull as a dict via collectAsMap(); presumably a
# token -> IDF weight mapping — confirm against where idfsFull is built.
idfsFullWeights = idfsFull.collectAsMap()
|
||||
# Broadcast the weights dict through sc (presumably the SparkContext —
# confirm); the map() lambdas further down read it via idfsFullBroadcast.value.
idfsFullBroadcast = sc.broadcast(idfsFullWeights)
|
||||
|
||||
# 计算 TF-IDF
|
||||
def tfidf(tokens, idfs):
    """Compute TF-IDF weights for one record's tokens.

    Term frequency is the raw occurrence count of each token in ``tokens``
    (it is not normalized by the number of tokens). Each count is then
    multiplied by that token's weight in ``idfs``.

    Args:
        tokens: Iterable of hashable tokens; duplicates are counted.
        idfs: Mapping from token to its IDF weight.

    Returns:
        A dict mapping every token present in both ``tokens`` and ``idfs``
        to ``count * idfs[token]``. Tokens missing from ``idfs`` are
        skipped rather than raising ``KeyError``.
    """
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    # Only tokens with a known IDF weight contribute to the result.
    return {
        token: count * idfs[token]
        for token, count in counts.items()
        if token in idfs
    }
|
||||
|
||||
# Compute TF-IDF weights for the full datasets.
def _to_weighted_record(record):
    """Pair a record's first element with the TF-IDF weights of its second."""
    return (record[0], tfidf(record[1], idfsFullBroadcast.value))


amazonWeightsRDD = amazonFullRecToToken.map(_to_weighted_record)
googleWeightsRDD = googleFullRecToToken.map(_to_weighted_record)
|
||||
|
Loading…
Reference in New Issue
Block a user