feat(6-1.py): 新增文本分析功能

- 创建 SparkContext 和相关 RDD - 计算真阳性相似度和累加器 - 定义精度、召回率和 F 度量的计算方法 - 生成阈值列表并计算对应指标值 - 绘制精度、召回率和 F 度量的折线图 - 停止 SparkContext
2025-04-20 03:16:29 +08:00 · 2025-04-20 03:16:29 +08:00 · 6f703860a6
commit 6f703860a6
parent 14552b200f
1 changed files with 113 additions and 0 deletions
--- a/6-1.py
+++ b/6-1.py
@ -0,0 +1,113 @@
+from pyspark import SparkContext
+from pyspark.accumulators import AccumulatorParam
+import matplotlib.pyplot as plt
+
+# 创建 SparkContext
+sc = SparkContext(appName="TextAnalysis")
+
+# 假设 similaritiesFullRDD 和 goldStandard 已经存在
+# similaritiesFullRDD: RDD of ((Amazon ID, Google URL), Similarity)
+# goldStandard: RDD of ((Amazon ID, Google URL), 1) for true duplicates
+
+# 创建 simsFullRDD 和 simsFullValuesRDD
+simsFullRDD = similaritiesFullRDD.map(lambda x: ("%s %s" % (x[0][0], x[0][1]), x[1]))
+simsFullValuesRDD = simsFullRDD.map(lambda x: x[1]).cache()
+
+# 计算真阳性的相似度
+def gs_value(record):
+    if record[1][1] is None:
+        return 0
+    else:
+        return record[1][1]
+
+trueDupSimsRDD = (goldStandard
+                  .leftOuterJoin(simsFullRDD)
+                  .map(gs_value)
+                  .cache())
+print('There are %s true duplicates.' % trueDupSimsRDD.count())
+
+# 定义累加器
+class VectorAccumulatorParam(AccumulatorParam):
+    def zero(self, value):
+        return [0] * len(value)
+    def addInPlace(self, val1, val2):
+        for i in range(len(val1)):
+            val1[i] += val2[i]
+        return val1
+
+def set_bit(x, value, length):
+    bits = []
+    for y in range(length):
+        if x == y:
+            bits.append(value)
+        else:
+            bits.append(0)
+    return bits
+
+BINS = 101
+nthresholds = 100
+def bin(similarity):
+    return int(similarity * nthresholds)
+
+zeros = [0] * BINS
+fpCounts = sc.accumulator(zeros, VectorAccumulatorParam())
+
+def add_element(score):
+    global fpCounts
+    b = bin(score)
+    fpCounts += set_bit(b, 1, BINS)
+
+simsFullValuesRDD.foreach(add_element)
+
+def sub_element(score):
+    global fpCounts
+    b = bin(score)
+    fpCounts += set_bit(b, -1, BINS)
+
+trueDupSimsRDD.foreach(sub_element)
+
+def falsepos(threshold):
+    fpList = fpCounts.value
+    return sum([fpList[b] for b in range(0, BINS) if float(b) / nthresholds >= threshold])
+
+def falseneg(threshold):
+    return trueDupSimsRDD.filter(lambda x: x < threshold).count()
+
+def truepos(threshold):
+    return trueDupSimsRDD.count() - falseneg(threshold)
+
+# 计算准确率、召回率和F度量
+def precision(threshold):
+    tp = truepos(threshold)
+    return float(tp) / (tp + falsepos(threshold))
+
+def recall(threshold):
+    tp = truepos(threshold)
+    return float(tp) / (tp + falseneg(threshold))
+
+def fmeasure(threshold):
+    r = recall(threshold)
+    p = precision(threshold)
+    if r == 0 and p == 0:
+        return 1
+    else:
+        return 2 * r * p / (r + p)
+
+# 生成阈值列表
+thresholds = [float(n) / nthresholds for n in range(0, nthresholds)]
+
+# 计算准确率、召回率和F度量的值
+precisions = [precision(t) for t in thresholds]
+recalls = [recall(t) for t in thresholds]
+fmeasures = [fmeasure(t) for t in thresholds]
+
+# 绘制折线图
+fig = plt.figure()
+plt.plot(thresholds, precisions)
+plt.plot(thresholds, recalls)
+plt.plot(thresholds, fmeasures)
+plt.legend(['Precision', 'Recall', 'F-measure'])
+plt.show()
+
+# 停止 SparkContext
+sc.stop()