# 2-1.py
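# Tokenize the Google and Amazon product catalogs stored on HDFS, drop
# stopwords, and report corpus-level token statistics.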

import re

from pyspark import SparkContext

sc = SparkContext()
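
# Collect the stopword list to the driver as a set for O(1) lookups; the set
# is shipped to the executors inside the tokenize closure.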
stopwords = set(sc.textFile("hdfs://master:9000/user/root/stopwords.txt").collect())


def tokenize(text):
    """Lowercase the text, split it into word tokens, and drop stopwords."""
    words = re.findall(r'\w+', text.lower())
    return [word for word in words if word not in stopwords]


def to_token_rdd(record):
    """Map an (id, content) record to an (id, token_list) pair."""
    return (record[0], tokenize(record[1]))


google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"


def parse_line(line):
    """Parse a quoted CSV line into an (id, "title description manufacturer")
    pair; return None for malformed lines so they can be filtered downstream."""
    try:
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except IndexError:  # line has fewer than four fields
        return None
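
# Parse both catalogs, dropping malformed rows and the CSV header (id == 'id').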
google = sc.textFile(google_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon = sc.textFile(amazon_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
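
# Tokenize each dataset: (id, content) -> (id, tokens).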
google_tokens = google.map(to_token_rdd)
amazon_tokens = amazon.map(to_token_rdd)
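
# Total number of tokens across both catalogs.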
token_count = google_tokens.union(amazon_tokens).flatMap(lambda x: x[1]).count()
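# The Amazon record with the most tokens, as a one-element [(id, n_tokens)] list.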
longest_amazon_doc = amazon_tokens.map(lambda x: (x[0], len(x[1]))).takeOrdered(1, key=lambda x: -x[1])

print("total tokens:", token_count)
print("longest Amazon record:", longest_amazon_doc)