diff --git a/6-1.py b/6-1.py
index 228c4b8..048711f 100644
--- a/6-1.py
+++ b/6-1.py
@@ -138,6 +138,13 @@ print(similaritiesFullRDD.count())
 
 # 假设 goldStandard 已经存在
 # goldStandard: RDD of ((Amazon ID, Google URL), 1) for true duplicates
+# Define goldStandard explicitly; each record is ((Amazon ID, Google URL), 1).
+# NOTE(review): simsFullRDD below is keyed by "AmazonID GoogleURL" strings, not tuples -- confirm the key shapes agree before joining the two.
+goldStandard = sc.parallelize([
+    (("b00005lzly", "http://www.google.com/base/feeds/snippets/13823221823254120257"), 1),
+    # TODO: add the remaining true-duplicate records
+])
+
 # 创建 simsFullRDD 和 simsFullValuesRDD
 simsFullRDD = similaritiesFullRDD.map(lambda x: ("%s %s" % (x[0][0], x[0][1]), x[1]))
 simsFullValuesRDD = simsFullRDD.map(lambda x: x[1]).cache()