init code

fly6516 2025-04-16 01:40:52 +00:00
commit 8e93a04382
3 changed files with 110 additions and 0 deletions

1-1.py  (new file, +29 lines)
@@ -0,0 +1,29 @@
# 1-1.py
from pyspark import SparkContext
sc = SparkContext()
def parse_line(line):
    """Parse one CSV line into an (id, combined text) pair."""
    try:
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except Exception:
        # Malformed lines become None and are dropped by the filter below
        return None
google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"
google_raw = sc.textFile(google_path)
amazon_raw = sc.textFile(amazon_path)
# Keep only successfully parsed records and drop the CSV header row (id == 'id')
google_parsed = google_raw.map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon_parsed = amazon_raw.map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
print("Google ", google_parsed.take(1))
print("Amazon ", amazon_parsed.take(1))

2-1.py  (new file, +42 lines)
@@ -0,0 +1,42 @@
# 2-1.py
from pyspark import SparkContext
import re

sc = SparkContext()

# Stop words are few enough to collect() onto the driver and close over
stopwords = set(sc.textFile("hdfs://master:9000/user/root/stopwords.txt").collect())

def tokenize(text):
    """Lower-case the text, split on word characters, and drop stop words."""
    words = re.findall(r'\w+', text.lower())
    return [word for word in words if word not in stopwords]

def to_token_rdd(record):
    """Map a parsed (id, content) pair to (id, [token, ...])."""
    return (record[0], tokenize(record[1]))
google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"
def parse_line(line):
    """Parse one CSV line into an (id, combined text) pair."""
    try:
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except Exception:
        return None

# Parse both corpora; drop unparseable lines and the CSV header row (id == 'id')
google = sc.textFile(google_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon = sc.textFile(amazon_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
google_tokens = google.map(to_token_rdd)
amazon_tokens = amazon.map(to_token_rdd)
# Total number of tokens across both corpora
token_count = google_tokens.union(amazon_tokens).flatMap(lambda x: x[1]).count()
# Amazon record with the most tokens, as (id, token_count)
longest_amazon_doc = amazon_tokens.map(lambda x: (x[0], len(x[1]))).takeOrdered(1, key=lambda x: -x[1])
print("total tokens:", token_count)
print("longest Amazon record:", longest_amazon_doc)

3-1.py  (new file, +39 lines)
@@ -0,0 +1,39 @@
# 3-1.py
# Builds on 2-1.py: google_tokens and amazon_tokens are the (id, [token, ...])
# RDDs defined there and must already exist in this driver session.
from pyspark import SparkContext
from collections import defaultdict

sc = SparkContext.getOrCreate()  # reuse the context created in 2-1.py

corpus = google_tokens.union(amazon_tokens)
N = corpus.count()  # total number of documents in the combined corpus
def compute_tf(record):
    """Term frequency: each token's count divided by the document length."""
    doc_id, tokens = record
    tf = defaultdict(float)
    for token in tokens:
        tf[token] += 1.0
    total = float(len(tokens))
    for key in tf:
        tf[key] = tf[key] / total
    return (doc_id, tf)
tf_rdd = corpus.map(compute_tf)

# Document frequency: number of distinct documents containing each token
token_docs = corpus.flatMap(lambda x: [(token, x[0]) for token in set(x[1])])
doc_freq = token_docs.groupByKey().mapValues(lambda x: len(set(x)))
# IDF here is the plain ratio N / df (no logarithm), broadcast to the executors
idf_dict = doc_freq.map(lambda x: (x[0], float(N) / x[1])).collectAsMap()
idf_bcast = sc.broadcast(idf_dict)
def compute_tfidf(record):
    """Weight each token's TF by the broadcast IDF value."""
    doc_id, tf_map = record
    idf_map = idf_bcast.value
    tfidf = {}
    for token in tf_map:
        tfidf[token] = tf_map[token] * idf_map.get(token, 0.0)
    return (doc_id, tfidf)
tfidf_rdd = tf_rdd.map(compute_tfidf)
print("TF-IDF sample: ", tfidf_rdd.take(1))