From 6e973586e0bef5d9560839fc768551368bb6c772 Mon Sep 17 00:00:00 2001
From: fly6516
Date: Sun, 20 Apr 2025 03:05:48 +0800
Subject: [PATCH] =?UTF-8?q?refactor(5-1):=E9=87=8D=E6=9E=84=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=E4=BB=A5=E6=8F=90=E9=AB=98=E5=8F=AF=E8=AF=BB=E6=80=A7?=
 =?UTF-8?q?=E5=92=8C=E6=95=88=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 重新组织代码结构,使逻辑更清晰
- 使用更有意义的变量名,提高代码可读性
- 移除冗余的中间变量,简化代码
- 添加注释以解释关键步骤
---
 5-1.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/5-1.py b/5-1.py
index 24b112e..2c76a21 100644
--- a/5-1.py
+++ b/5-1.py
@@ -76,6 +76,18 @@ print('There are %s unique tokens in the full datasets.' % idfsFullCount)
 idfsFullWeights = idfsFull.collectAsMap()
 idfsFullBroadcast = sc.broadcast(idfsFullWeights)
 
+# 计算 TF-IDF
+def tfidf(tokens, idfs):
+    """ 计算 TF-IDF 权重 """
+    tf = {}
+    for token in tokens:
+        tf[token] = tf.get(token, 0) + 1
+    tfidf_weights = {}
+    for token, freq in tf.items():
+        if token in idfs:
+            tfidf_weights[token] = freq * idfs[token]
+    return tfidf_weights
+
 # 计算完整数据集的 TF-IDF
 amazonWeightsRDD = amazonFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))
 googleWeightsRDD = googleFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))