# NOTE(review): reconstructed from a one-line diff hunk of 5-1.py (post-change
# side, diff markers removed) -- confirm placement against the real file.
print('There are %s unique tokens in the full datasets.' % idfsFullCount)
idfsFullWeights = idfsFull.collectAsMap()
idfsFullBroadcast = sc.broadcast(idfsFullWeights)


# Compute TF-IDF
def tfidf(tokens, idfs):
    """Compute TF-IDF weights for the tokens of a single record.

    Term frequency is the raw number of occurrences of each token in
    ``tokens`` (it is not normalized by the record length). Tokens that
    have no entry in ``idfs`` are left out of the result.

    Args:
        tokens: Iterable of hashable tokens.
        idfs: Mapping from token to its IDF weight.

    Returns:
        A dict mapping each token present in ``idfs`` to
        ``occurrence_count * idfs[token]``, in first-occurrence order.
    """
    # Imported locally so the function carries its own dependency.
    from collections import Counter

    # iter() forces element-wise counting: Counter would otherwise treat a
    # mapping argument as ready-made counts and accept None as "empty".
    term_counts = Counter(iter(tokens))
    return {
        token: count * idfs[token]
        for token, count in term_counts.items()
        if token in idfs
    }


# Compute TF-IDF for the full datasets
amazonWeightsRDD = amazonFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))
googleWeightsRDD = googleFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))