# 2-1.py
import re

from pyspark import SparkContext

sc = SparkContext()

# Load the stopword list once on the driver; the resulting set is shipped to the
# workers inside the closure of tokenize().
stopwords = set(sc.textFile("hdfs://master:9000/user/root/stopwords.txt").collect())


def tokenize(text):
    """Lowercase the text, split it into word tokens, and drop stopwords."""
    words = re.findall(r'\w+', text.lower())
    return [word for word in words if word not in stopwords]


def to_token_rdd(record):
    """Turn an (id, content) pair into an (id, token list) pair."""
    return (record[0], tokenize(record[1]))


google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"


def parse_line(line):
    """Parse one CSV line into (id, "title description manufacturer");
    return None for lines that cannot be parsed."""
    try:
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except Exception:
        return None


# Parse both files, dropping unparseable lines and the header row (id == 'id').
google = sc.textFile(google_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon = sc.textFile(amazon_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')

google_tokens = google.map(to_token_rdd)
amazon_tokens = amazon.map(to_token_rdd)

# Total number of tokens across both datasets.
token_count = google_tokens.union(amazon_tokens).flatMap(lambda x: x[1]).count()

# Amazon record with the most tokens, as a one-element list of (id, token count).
longest_amazon_doc = amazon_tokens.map(lambda x: (x[0], len(x[1]))).takeOrdered(1, key=lambda x: -x[1])

print("total tokens:", token_count)
print("longest Amazon record (id, token count):", longest_amazon_doc)
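
# Optional, a minimal sketch only (not part of the original script): parse_line above
# splits on '","', which assumes no field ever contains that sequence. A more
# defensive alternative is to parse each line with Python's csv module; the helper
# name parse_line_csv and the column layout (id, title, description, manufacturer, ...)
# are assumptions used here for illustration.
import csv
from io import StringIO


def parse_line_csv(line):
    try:
        # csv.reader handles quoting and embedded commas; one reader row per line.
        fields = next(csv.reader(StringIO(line)))
        content = "{} {} {}".format(fields[1], fields[2], fields[3]).strip()
        return (fields[0], content)
    except (StopIteration, IndexError):
        return None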