feat(6-1.py): 新增文本分析功能

- 创建 SparkContext 和相关 RDD
- 计算真阳性相似度和累加器
- 定义精度、召回率和 F 度量的计算方法
- 生成阈值列表并计算对应指标值
- 绘制精度、召回率和 F 度量的折线图
- 停止 SparkContext
This commit is contained in:
fly6516 2025-04-20 03:16:29 +08:00
parent 14552b200f
commit 6f703860a6

113
6-1.py Normal file
View File

@ -0,0 +1,113 @@
from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam
import matplotlib.pyplot as plt
# 创建 SparkContext
sc = SparkContext(appName="TextAnalysis")
# 假设 similaritiesFullRDD 和 goldStandard 已经存在
# similaritiesFullRDD: RDD of ((Amazon ID, Google URL), Similarity)
# goldStandard: RDD of ((Amazon ID, Google URL), 1) for true duplicates
# 创建 simsFullRDD 和 simsFullValuesRDD
simsFullRDD = similaritiesFullRDD.map(lambda x: ("%s %s" % (x[0][0], x[0][1]), x[1]))
simsFullValuesRDD = simsFullRDD.map(lambda x: x[1]).cache()
# 计算真阳性的相似度
def gs_value(record):
if record[1][1] is None:
return 0
else:
return record[1][1]
trueDupSimsRDD = (goldStandard
.leftOuterJoin(simsFullRDD)
.map(gs_value)
.cache())
print('There are %s true duplicates.' % trueDupSimsRDD.count())
# 定义累加器
class VectorAccumulatorParam(AccumulatorParam):
def zero(self, value):
return [0] * len(value)
def addInPlace(self, val1, val2):
for i in range(len(val1)):
val1[i] += val2[i]
return val1
def set_bit(x, value, length):
bits = []
for y in range(length):
if x == y:
bits.append(value)
else:
bits.append(0)
return bits
BINS = 101
nthresholds = 100
def bin(similarity):
return int(similarity * nthresholds)
zeros = [0] * BINS
fpCounts = sc.accumulator(zeros, VectorAccumulatorParam())
def add_element(score):
global fpCounts
b = bin(score)
fpCounts += set_bit(b, 1, BINS)
simsFullValuesRDD.foreach(add_element)
def sub_element(score):
global fpCounts
b = bin(score)
fpCounts += set_bit(b, -1, BINS)
trueDupSimsRDD.foreach(sub_element)
def falsepos(threshold):
fpList = fpCounts.value
return sum([fpList[b] for b in range(0, BINS) if float(b) / nthresholds >= threshold])
def falseneg(threshold):
return trueDupSimsRDD.filter(lambda x: x < threshold).count()
def truepos(threshold):
return trueDupSimsRDD.count() - falseneg(threshold)
# 计算准确率、召回率和F度量
def precision(threshold):
tp = truepos(threshold)
return float(tp) / (tp + falsepos(threshold))
def recall(threshold):
tp = truepos(threshold)
return float(tp) / (tp + falseneg(threshold))
def fmeasure(threshold):
r = recall(threshold)
p = precision(threshold)
if r == 0 and p == 0:
return 1
else:
return 2 * r * p / (r + p)
# 生成阈值列表
thresholds = [float(n) / nthresholds for n in range(0, nthresholds)]
# 计算准确率、召回率和F度量的值
precisions = [precision(t) for t in thresholds]
recalls = [recall(t) for t in thresholds]
fmeasures = [fmeasure(t) for t in thresholds]
# 绘制折线图
fig = plt.figure()
plt.plot(thresholds, precisions)
plt.plot(thresholds, recalls)
plt.plot(thresholds, fmeasures)
plt.legend(['Precision', 'Recall', 'F-measure'])
plt.show()
# 停止 SparkContext
sc.stop()