From 33687f9fcdc39a7c144ad3435561fce8b54c51c5 Mon Sep 17 00:00:00 2001
From: fly6516
Date: Sun, 20 Apr 2025 02:38:15 +0800
Subject: [PATCH] feat(4-1): refactor code to compute cosine similarity for
 the Amazon and Google datasets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reorganize the code structure; clean up imports and variable definitions
- Add TF-IDF and cosine similarity helper functions
- Parse and process the Amazon and Google datasets
- Compute and print the similarity for a specified ID pair
---
 4-1.py | 133 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 74 insertions(+), 59 deletions(-)

diff --git a/4-1.py b/4-1.py
index 4a2ed22..f494baf 100644
--- a/4-1.py
+++ b/4-1.py
@@ -1,71 +1,86 @@
-import re
-import os
 from pyspark import SparkContext
+import re
+import math
 
-# Initialize SparkContext
-sc = SparkContext(appName="TextAnalysis")
+# Initialize SparkContext
+sc = SparkContext()
 
-# Define data file paths
-GOOGLE_PATH = 'Google.csv'
-GOOGLE_SMALL_PATH = 'Google_small.csv'
-AMAZON_PATH = 'Amazon.csv'
-AMAZON_SMALL_PATH = 'Amazon_small.csv'
-STOPWORDS_PATH = 'stopwords.txt'
+# Define paths for the Amazon and Google datasets
+amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"
+google_path = "hdfs://master:9000/user/root/Google_small.csv"
 
-# Regular expression pattern used to parse data lines
-DATAFILE_PATTERN = '^(.+),"(.+)",(.*),(.*),(.*)'
+# Load the datasets
+amazonData = sc.textFile(amazon_path)
+googleData = sc.textFile(google_path)
 
-def removeQuotes(s):
-    """ Strip quote characters from the input string """
-    return ''.join(i for i in s if i!='"')
+# Tokenize a string by splitting on non-alphanumeric characters
+def tokenize(text):
+    return re.findall(r'\w+', text.lower())
 
-def parseDatafileLine(datafileLine):
-    """ Parse a single line of the data file """
-    match = re.search(DATAFILE_PATTERN, str(datafileLine))
-    if match is None:
-        print('Invalid datafile line: %s' % datafileLine)
-        return (datafileLine, -1)
-    elif match.group(1) == '"id"':
-        print('Header datafile line: %s' % datafileLine)
-        return (datafileLine, 0)
-    else:
-        product = '%s %s %s' % (match.group(2), match.group(3), match.group(4))
-        return ((removeQuotes(match.group(1)), product), 1)
+# Sample IDF weights dictionary (replace with IDF weights computed from the corpus)
+idfsSmallWeights = {"foo": 1.5, "bar": 1.2, "baz": 1.3}
 
-def parseData(filename):
-    """ Parse the data file """
-    return (sc
-            .textFile(filename, 4, 0)
-            .map(parseDatafileLine)
-            .cache())
+# TF-IDF: weight each token's raw count by its inverse document frequency
+def tfidf(tokens, idfs):
+    tf = {}
+    for token in tokens:
+        tf[token] = tf.get(token, 0) + 1
+    tfidf_values = {token: tf[token] * idfs.get(token, 0) for token in tf}
+    return tfidf_values
 
-def loadData(path):
-    """ Load the data file """
-    filename = path
-    raw = parseData(filename).cache()
-    failed = (raw
-              .filter(lambda s: s[1] == -1)
-              .map(lambda s: s[0]))
-    for line in failed.take(1):
-        print ('{0} - Invalid datafile line: {1}'.format(path, line))
-    valid = (raw
-             .filter(lambda s: s[1] == 1)
-             .map(lambda s: s[0])
-             .cache())
-    print ('{0} - Read {1} lines, successfully parsed {2} lines, failed to parse {3} lines'.format(path,raw.count(),valid.count(),failed.count()))
-    return valid
+# Cosine similarity helpers
+def dotprod(a, b):
+    return sum(a.get(k, 0) * b.get(k, 0) for k in a if k in b)
 
-# Load the data
-googleSmall = loadData(GOOGLE_SMALL_PATH)
-google = loadData(GOOGLE_PATH)
-amazonSmall = loadData(AMAZON_SMALL_PATH)
-amazon = loadData(AMAZON_PATH)
+def norm(a):
+    return math.sqrt(dotprod(a, a))
 
-# Print a few records for inspection
-for line in googleSmall.take(3):
-    print ('google: %s: %s\n' % (line[0], line[1]))
+def cossim(a, b):
+    return dotprod(a, b) / (norm(a) * norm(b)) if norm(a) > 0 and norm(b) > 0 else 0
 
-for line in amazonSmall.take(3):
-    print ('amazon: %s: %s\n' % (line[0], line[1]))
+# Cosine similarity between two strings under a given IDF dictionary
+def cosineSimilarity(string1, string2, idfsDictionary):
+    w1 = tfidf(tokenize(string1), idfsDictionary)
+    w2 = tfidf(tokenize(string2), idfsDictionary)
+    return cossim(w1, w2)
+
+# Parse an Amazon line: naive comma split (assumes fields contain no embedded commas)
+def parse_amazon(line):
+    parts = line.split(",")
+    return (parts[0], parts[1], parts[2])  # ID, title, description
+
+# Parse a Google line: naive comma split (assumes fields contain no embedded commas)
+def parse_google(line):
+    parts = line.split(",")
+    return (parts[0], parts[1], parts[2])  # ID, name, description
+
+# Process Amazon data: combine title and description into a single text field
+amazonProcessed = amazonData.map(parse_amazon).map(lambda x: (x[0], x[1] + " " + x[2]))
+# Process Google data: combine name and description into a single text field
+googleProcessed = googleData.map(parse_google).map(lambda x: (x[0], x[1] + " " + x[2]))
+
+# Cartesian join between the Amazon and Google datasets
+crossSmall = amazonProcessed.cartesian(googleProcessed)
+
+# Compute similarity for each (Amazon, Google) record pair
+def computeSimilarity(record):
+    amazonRec = record[0]
+    googleRec = record[1]
+    amazonID = amazonRec[0]
+    googleID = googleRec[0]
+    amazonValue = amazonRec[1]
+    googleValue = googleRec[1]
+    cs = cosineSimilarity(googleValue, amazonValue, idfsSmallWeights)
+    return (googleID, amazonID, cs)
+
+# Compute similarities for all pairs
+similarities = crossSmall.map(computeSimilarity)
+
+# Look up the similarity for a specific Amazon ID and Google URL
+def similar(amazonID, googleID):
+    return similarities.filter(lambda record: (record[0] == googleID and record[1] == amazonID)).collect()
+
+# Test similarity for a specific pair (replace with actual IDs)
+similarResult = similar("b000o24l3q", "http://www.google.com/base/feeds/snippets/17242822440574356561")
+print("Requested similarity is %s." % similarResult)
-# Assuming the data is now loaded correctly, you can continue with the subsequent analysis
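
Note: idfsSmallWeights in this patch is a stand-in, as its in-code comment
says. A minimal sketch of computing real IDF weights from the corpus itself,
assuming the amazonProcessed/googleProcessed RDDs defined above and the
simple convention IDF(t) = N / df(t); the exact formula is an assumption,
not part of this patch:

    # Build one corpus of (id, text) records from both datasets
    corpus = amazonProcessed.union(googleProcessed)
    N = corpus.count()  # total number of documents
    idfsSmallWeights = (corpus
                        .flatMap(lambda rec: set(tokenize(rec[1])))  # unique tokens per document
                        .map(lambda token: (token, 1))
                        .reduceByKey(lambda a, b: a + b)             # document frequency df(t)
                        .map(lambda tc: (tc[0], float(N) / tc[1]))   # IDF(t) = N / df(t)
                        .collectAsMap())                             # small corpus, safe to collect

The rest of the patch would stay unchanged: cosineSimilarity picks the
weights up through the idfsSmallWeights dictionary.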