From d611a300826933acf5d64981c7fbb19268587cee Mon Sep 17 00:00:00 2001 From: fly6516 Date: Sun, 20 Apr 2025 03:21:04 +0800 Subject: [PATCH] =?UTF-8?q?feat(6-1):=20=E5=AE=9E=E7=8E=B0=20TF-IDF=20?= =?UTF-8?q?=E5=92=8C=E4=BD=99=E5=BC=A6=E7=9B=B8=E4=BC=BC=E5=BA=A6=E8=AE=A1?= =?UTF-8?q?=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加分词和数据解析功能 - 实现逆文档频率 (IDF) 计算 - 计算 TF-IDF 权重 - 添加向量范数计算 - 实现倒排索引和快速余弦相似度计算 - 处理完整数据集并计算相似度 --- 6-1.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/6-1.py b/6-1.py index 228c4b8..048711f 100644 --- a/6-1.py +++ b/6-1.py @@ -138,6 +138,12 @@ print(similaritiesFullRDD.count()) # 假设 goldStandard 已经存在 # goldStandard: RDD of ((Amazon ID, Google URL), 1) for true duplicates +# 定义 goldStandard +goldStandard = sc.parallelize([ + (("b00005lzly", "http://www.google.com/base/feeds/snippets/13823221823254120257"), 1), + # 添加其他真实重复记录 +]) + # 创建 simsFullRDD 和 simsFullValuesRDD simsFullRDD = similaritiesFullRDD.map(lambda x: ("%s %s" % (x[0][0], x[0][1]), x[1])) simsFullValuesRDD = simsFullRDD.map(lambda x: x[1]).cache()