feat(4-1): Refactor code to compute cosine similarity between the Amazon and Google datasets

- Reorganize the code structure; clean up imports and variable definitions
- Add TF-IDF and cosine similarity functions
- Parse and process the Amazon and Google datasets
- Compute and output the similarity for a specified pair of IDs
parent 15fcc21975
commit 33687f9fcd
4-1.py (133 changes)
@@ -1,71 +1,86 @@
-import re
-import os
 from pyspark import SparkContext
+import re
+import math
 
 # Initialize SparkContext
-sc = SparkContext(appName="TextAnalysis")
+sc = SparkContext()
 
-# Define data file paths
-GOOGLE_PATH = 'Google.csv'
-GOOGLE_SMALL_PATH = 'Google_small.csv'
-AMAZON_PATH = 'Amazon.csv'
-AMAZON_SMALL_PATH = 'Amazon_small.csv'
-STOPWORDS_PATH = 'stopwords.txt'
-
-# Regular expression pattern for parsing data lines
-DATAFILE_PATTERN = '^(.+),"(.+)",(.*),(.*),(.*)'
+# Define paths for Amazon and Google datasets
+amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"
+google_path = "hdfs://master:9000/user/root/Google_small.csv"
+
+# Load the datasets
+amazonData = sc.textFile(amazon_path)
+googleData = sc.textFile(google_path)
 
-def removeQuotes(s):
-    """ Remove quotation marks from the input string """
-    return ''.join(i for i in s if i != '"')
-
-def parseDatafileLine(datafileLine):
-    """ Parse a single line of the data file """
-    match = re.search(DATAFILE_PATTERN, str(datafileLine))
-    if match is None:
-        print('Invalid datafile line: %s' % datafileLine)
-        return (datafileLine, -1)
-    elif match.group(1) == '"id"':
-        print('Header datafile line: %s' % datafileLine)
-        return (datafileLine, 0)
-    else:
-        product = '%s %s %s' % (match.group(2), match.group(3), match.group(4))
-        return ((removeQuotes(match.group(1)), product), 1)
-
-def parseData(filename):
-    """ Parse the data file """
-    return (sc
-            .textFile(filename, 4, 0)
-            .map(parseDatafileLine)
-            .cache())
-
-def loadData(path):
-    """ Load the data file """
-    filename = path
-    raw = parseData(filename).cache()
-    failed = (raw
-              .filter(lambda s: s[1] == -1)
-              .map(lambda s: s[0]))
-    for line in failed.take(1):
-        print('{0} - Invalid datafile line: {1}'.format(path, line))
-    valid = (raw
-             .filter(lambda s: s[1] == 1)
-             .map(lambda s: s[0])
-             .cache())
-    print('{0} - Read {1} lines, successfully parsed {2} lines, failed to parse {3} lines'.format(path, raw.count(), valid.count(), failed.count()))
-    return valid
-
-# Load the data
-googleSmall = loadData(GOOGLE_SMALL_PATH)
-google = loadData(GOOGLE_PATH)
-amazonSmall = loadData(AMAZON_SMALL_PATH)
-amazon = loadData(AMAZON_PATH)
-
-# Print a few records as a sanity check
-for line in googleSmall.take(3):
-    print('google: %s: %s\n' % (line[0], line[1]))
-
-for line in amazonSmall.take(3):
-    print('amazon: %s: %s\n' % (line[0], line[1]))
-
-# Assuming the data has now loaded correctly, the subsequent analysis can proceed
+# Define a function to tokenize a string (splitting by non-alphanumeric characters)
+def tokenize(text):
+    return re.findall(r'\w+', text.lower())
+
+# Sample IDF weights dictionary (use real IDF calculation in actual code)
+idfsSmallWeights = {"foo": 1.5, "bar": 1.2, "baz": 1.3}
+
+# TF-IDF function
+def tfidf(tokens, idfs):
+    tf = {}
+    for token in tokens:
+        tf[token] = tf.get(token, 0) + 1
+    tfidf_values = {token: tf[token] * idfs.get(token, 0) for token in tf}
+    return tfidf_values
+
+# Cosine similarity functions
+def dotprod(a, b):
+    return sum(a.get(k, 0) * b.get(k, 0) for k in a if k in b)
+
+def norm(a):
+    return math.sqrt(dotprod(a, a))
+
+def cossim(a, b):
+    return dotprod(a, b) / (norm(a) * norm(b)) if norm(a) > 0 and norm(b) > 0 else 0
+
+# Calculate cosine similarity between two strings
+def cosineSimilarity(string1, string2, idfsDictionary):
+    w1 = tfidf(tokenize(string1), idfsDictionary)
+    w2 = tfidf(tokenize(string2), idfsDictionary)
+    return cossim(w1, w2)
+
+# Parse the Amazon dataset: split by commas and extract title and description
+def parse_amazon(line):
+    parts = line.split(",")
+    return (parts[0], parts[1], parts[2])  # Returning ID, title, and description
+
+# Parse the Google dataset: split by commas and extract name and description
+def parse_google(line):
+    parts = line.split(",")
+    return (parts[0], parts[1], parts[2])  # Returning ID, name, and description
+
+# Process Amazon data: combine title and description
+amazonProcessed = amazonData.map(parse_amazon).map(lambda x: (x[0], x[1] + " " + x[2]))
+# Process Google data: combine name and description
+googleProcessed = googleData.map(parse_google).map(lambda x: (x[0], x[1] + " " + x[2]))
+
+# Cartesian join between Amazon and Google datasets
+crossSmall = amazonProcessed.cartesian(googleProcessed)
+
+# Compute similarity for each (Amazon, Google) pair, returning (googleID, amazonID, score)
+def computeSimilarity(record):
+    amazonRec = record[0]
+    googleRec = record[1]
+    amazonID = amazonRec[0]
+    googleID = googleRec[0]
+    amazonValue = amazonRec[1]
+    googleValue = googleRec[1]
+    cs = cosineSimilarity(googleValue, amazonValue, idfsSmallWeights)
+    return (googleID, amazonID, cs)
+
+# Compute similarities for all pairs
+similarities = crossSmall.map(computeSimilarity)
+
+# Example to get the similarity for a specific Amazon ID and Google URL
+def similar(amazonID, googleID):
+    return similarities.filter(lambda record: (record[0] == googleID and record[1] == amazonID)).collect()
+
+# Test similarity for a specific pair (replace with actual IDs)
+similarResult = similar("b'b000o24l3q", "b'http://www.google.com/base/feeds/snippets/17242822440574356561")
+print("Requested similarity is %s." % similarResult)
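The committed script stubs out the IDF weights with the placeholder idfsSmallWeights dictionary, and its own comment notes that a real IDF calculation belongs there. A minimal sketch of what that could look like over the records built above, assuming the (ID, text) pairs produced by amazonProcessed and googleProcessed and the common weighting idf(t) = N / df(t); this union-and-count approach is illustrative, not the commit's actual method:

    # Illustrative sketch only: derive IDF weights from the combined corpus,
    # assuming amazonProcessed/googleProcessed hold (ID, text) pairs as above.
    corpus = amazonProcessed.union(googleProcessed)
    N = corpus.count()  # total number of documents

    idfsSmallWeights = (corpus
        .flatMap(lambda rec: set(tokenize(rec[1])))  # unique tokens per document
        .map(lambda token: (token, 1))
        .reduceByKey(lambda a, b: a + b)             # document frequency df(t)
        .mapValues(lambda df: float(N) / df)         # idf(t) = N / df(t)
        .collectAsMap())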
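Because dotprod, norm, cossim, and cosineSimilarity are plain Python functions, they can also be sanity-checked locally without a Spark cluster. The weights and strings below are invented purely for illustration:

    # Hypothetical IDF weights and strings, for a quick local check only.
    toy_idfs = {"digital": 1.8, "camera": 1.5, "case": 1.2}
    score = cosineSimilarity("Digital camera case", "case for a digital camera", toy_idfs)
    print(score)  # ~1.0: 'for' and 'a' get IDF 0, so both vectors share the same non-zero terms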