BD-exp-9/3-1.py
fly6516 c699e3d21e feat(3-1.py): refactor the code to build an inverted index
- Restructure the code into functions for readability and maintainability
- Add stopword handling to improve index accuracy
- Parse CSV records with the csv module for more robust data handling
- Streamline the tokenization and field-extraction logic
- Build the inverted index and save it to HDFS
2025-04-16 09:43:34 +08:00


# coding=utf-8
from pyspark import SparkContext
import csv
import re


# Python 3.5 has no f-strings, so str.format() is used throughout.
def tokenize(text):
    # Split into lowercase word tokens, keeping letters and digits
    return re.findall(r'\w+', text.lower())
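# Example (hypothetical input): tokenize("Dell XPS-13, 8GB RAM") returns
# ['dell', 'xps', '13', '8gb', 'ram']; punctuation is dropped, case is folded.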


def load_stopwords(sc):
    try:
        return set(sc.textFile("hdfs:///user/root/stopwords.txt").collect())
    except Exception:
        # Fall back to a local copy of the stopword list
        with open("stopwords.txt", "r") as f:
            return set(line.strip() for line in f)


def parse_csv_line(line):
    # Use csv.reader so quoted fields containing commas are parsed correctly
    reader = csv.reader([line])
    return next(reader)
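# Example (hypothetical input): parse_csv_line('b1,"Laptop, 15 inch",acme')
# returns ['b1', 'Laptop, 15 inch', 'acme']; the quoted comma does not split.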


def extract_info(line, source):
    try:
        fields = parse_csv_line(line)
        if source == "google":
            # Google schema: id, name, description, manufacturer, ...
            pid = fields[0].strip()
            text = "{} {} {}".format(fields[1], fields[2], fields[3])
        else:
            # Amazon schema: id, title, description, manufacturer, ...
            pid = fields[0].strip()
            text = "{} {} {}".format(fields[1], fields[2], fields[3])
        return (pid, text)
    except Exception:
        # Malformed or truncated lines are dropped by the filter below
        return (None, None)


if __name__ == "__main__":
    sc = SparkContext(appName="InvertedIndex")
    stopwords = load_stopwords(sc)

    # Load the two product datasets
    google = sc.textFile("hdfs://master:9000/user/root/GoogleProducts.csv")
    amazon = sc.textFile("hdfs://master:9000/user/root/AmazonProducts.csv")

    # Extract (product_id, text) pairs, dropping unparseable lines
    google_rdd = google.map(lambda line: extract_info(line, "google")) \
        .filter(lambda x: x[0] is not None)
    amazon_rdd = amazon.map(lambda line: extract_info(line, "amazon")) \
        .filter(lambda x: x[0] is not None)

    # Merge the two datasets
    all_data = google_rdd.union(amazon_rdd)

    # Build the inverted index: token -> list of product ids containing it
    inverted_index = all_data.flatMap(lambda x: [(word, x[0]) for word in tokenize(x[1]) if word not in stopwords]) \
        .groupByKey() \
        .mapValues(lambda ids: list(set(ids)))
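    # E.g. two hypothetical records ("p1", "red camera") and ("p2", "camera bag")
    # flatMap to ("red", "p1"), ("camera", "p1"), ("camera", "p2"), ("bag", "p2"),
    # which group to ("camera", ["p1", "p2"]), ("red", ["p1"]), ("bag", ["p2"]),
    # assuming none of the words is in the stopword list.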
    # Save the index to HDFS (one text part file per partition)
    inverted_index.saveAsTextFile("hdfs:///user/root/output/inverted_index")
    sc.stop()
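
Below is a minimal follow-up sketch (not part of the committed file) showing one way to read the saved index back and query it. The query word "camera" is hypothetical, and the ast.literal_eval round trip is an assumption based on saveAsTextFile writing each (word, [product_id, ...]) pair via str().

# coding=utf-8
from pyspark import SparkContext
import ast

sc = SparkContext(appName="InvertedIndexLookup")
# Each saved line is the repr of a (word, [product_id, ...]) tuple,
# so ast.literal_eval turns it back into a key-value pair.
pairs = sc.textFile("hdfs:///user/root/output/inverted_index") \
    .map(ast.literal_eval)
# lookup() returns the value lists stored under the given key
print(pairs.lookup("camera"))  # hypothetical query word
sc.stop()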