# BD-exp-9/3-1.py

# coding=utf-8
from pyspark import SparkContext
import csv
import re
# Python 3.5 has no f-strings; use str.format() instead.
def tokenize(text):
    """Lower-case *text* and return its alphanumeric tokens in order."""
    lowered = text.lower()
    # \w+ keeps letters, digits and underscores; punctuation splits tokens.
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]
def load_stopwords(sc):
    """Return the stop-word set, preferring HDFS with a local-file fallback.

    Tries ``hdfs://master:9000/user/root/stopwords.txt`` first; if that read
    fails for any reason (cluster unreachable, file missing), falls back to
    a ``stopwords.txt`` file in the driver's working directory.

    :param sc: an active SparkContext used for the HDFS read.
    :return: set of stop words (local lines are stripped of whitespace).
    """
    try:
        return set(sc.textFile("hdfs://master:9000/user/root/stopwords.txt").collect())
    except Exception:
        # Deliberate best-effort fallback to the local copy. Narrowed from a
        # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        with open("stopwords.txt", "r", encoding="utf-8") as f:
            return {line.strip() for line in f}
def parse_csv_line(line):
    """Split one CSV record into fields, honouring quoted commas."""
    # csv.reader copes with quoted fields that contain commas.
    records = csv.reader([line])
    first_record = next(records)
    return first_record
def extract_info(line, source):
    """Parse one CSV record into a (product_id, searchable_text) pair.

    Both the Google row layout (id, name, description, manufacturer) and
    the Amazon layout (id, title, description, manufacturer) put the same
    information in the same columns, so the extraction is identical;
    ``source`` is kept for interface compatibility with existing callers.

    :param line: one raw CSV line.
    :param source: ``"google"`` or ``"amazon"`` (currently unused).
    :return: ``(id, "field1 field2 field3")`` or ``(None, None)`` for a
        malformed row, so callers can filter bad rows out.
    """
    try:
        # Inline CSV parse (handles quoted fields containing commas).
        fields = next(csv.reader([line]))
        pid = fields[0].strip()
        # Concatenate name/title, description and manufacturer.
        text = "{} {} {}".format(fields[1], fields[2], fields[3])
        return (pid, text)
    except (csv.Error, StopIteration, IndexError):
        # Narrowed from a bare `except:`; these cover unparseable CSV,
        # empty input and rows with fewer than four fields.
        return (None, None)
if __name__ == "__main__":
    sc = SparkContext(appName="InvertedIndex")
    try:
        stopwords = load_stopwords(sc)
        # Load both product catalogues from HDFS.
        google = sc.textFile("hdfs://master:9000/user/root/Google.csv")
        amazon = sc.textFile("hdfs://master:9000/user/root/Amazon.csv")
        # Extract (product_id, text) pairs, dropping unparseable rows.
        google_rdd = google.map(lambda line: extract_info(line, "google")) \
                           .filter(lambda x: x[0] is not None)
        amazon_rdd = amazon.map(lambda line: extract_info(line, "amazon")) \
                           .filter(lambda x: x[0] is not None)
        # Merge the two datasets.
        all_data = google_rdd.union(amazon_rdd)
        # Build the inverted index: word -> list of unique product ids.
        inverted_index = all_data.flatMap(
            lambda x: [(word, x[0]) for word in tokenize(x[1]) if word not in stopwords]
        ) \
            .groupByKey() \
            .mapValues(lambda ids: list(set(ids)))
        # NOTE(review): the output URI uses the default filesystem
        # ("hdfs:///...") while the inputs name the namenode explicitly
        # ("hdfs://master:9000/...") — confirm both resolve to the same cluster.
        inverted_index.saveAsTextFile("hdfs:///user/root/output/inverted_index")
    finally:
        # Always release the Spark context, even if a stage fails.
        sc.stop()