From 6e973586e0bef5d9560839fc768551368bb6c772 Mon Sep 17 00:00:00 2001
From: fly6516 <fly6516@outlook.com>
Date: Sun, 20 Apr 2025 03:05:48 +0800
Subject: [PATCH] =?UTF-8?q?refactor(5-1):=E9=87=8D=E6=9E=84=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=E4=BB=A5=E6=8F=90=E9=AB=98=E5=8F=AF=E8=AF=BB=E6=80=A7?=
 =?UTF-8?q?=E5=92=8C=E6=95=88=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 重新组织代码结构，使逻辑更清晰
- 使用更有意义的变量名，提高代码可读性
- 移除冗余的中间变量，简化代码- 添加注释以解释关键步骤
---
 5-1.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/5-1.py b/5-1.py
index 24b112e..2c76a21 100644
--- a/5-1.py
+++ b/5-1.py
@@ -76,6 +76,18 @@ print('There are %s unique tokens in the full datasets.' % idfsFullCount)
 idfsFullWeights = idfsFull.collectAsMap()
 idfsFullBroadcast = sc.broadcast(idfsFullWeights)
 
+# 计算 TF-IDF
+def tfidf(tokens, idfs):
+    """ 计算 TF-IDF 权重 """
+    tf = {}
+    for token in tokens:
+        tf[token] = tf.get(token, 0) + 1
+    tfidf_weights = {}
+    for token, freq in tf.items():
+        if token in idfs:
+            tfidf_weights[token] = freq * idfs[token]
+    return tfidf_weights
+
 # 计算完整数据集的 TF-IDF
 amazonWeightsRDD = amazonFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))
 googleWeightsRDD = googleFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))