30 lines
899 B
Python
30 lines
899 B
Python
# 1-1.py
|
|
from pyspark import SparkContext
|
|
|
|
# Create the SparkContext with default settings; the script uses it below
# to read the input files via sc.textFile().
sc = SparkContext()
|
|
|
|
def parse_line(line):
    """Parse one quoted CSV line into an ``(id, content)`` pair.

    The line is split on the ``","`` separator that sits between quoted
    fields. The first field becomes the id (with every double quote
    removed); the next three fields are stripped and joined with single
    spaces to form the content string.

    Args:
        line: A single line of text.

    Returns:
        A tuple ``(id_, content)``, or ``None`` when ``line`` is not a text
        string or contains fewer than four ``","``-separated fields.
    """
    try:
        parts = line.strip().split('","')
    except (AttributeError, TypeError):
        # Not a text line (e.g. None or bytes): treat it as unparseable
        # instead of hiding every possible error behind a broad except.
        return None

    if len(parts) < 4:
        # Too few fields to build the content string.
        return None

    id_ = parts[0].replace('"', '')
    title = parts[1].strip()
    desc = parts[2].strip()

    manufacturer = parts[3]
    # When the line has exactly four fields, the fourth one is the last on
    # the line and still carries the closing quote; drop it so it does not
    # leak into the content.
    if len(parts) == 4 and manufacturer.endswith('"'):
        manufacturer = manufacturer[:-1]
    manufacturer = manufacturer.strip()

    content = "{} {} {}".format(title, desc, manufacturer).strip()
    return (id_, content)
|
|
|
|
# HDFS locations of the Google and Amazon CSV inputs.
google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"

# Build one RDD of text lines per input file (Google first, then Amazon).
google_raw, amazon_raw = sc.textFile(google_path), sc.textFile(amazon_path)
|
|
|
|
def _is_data_row(record):
    """Return True for parsed records that are neither failures nor the header.

    ``parse_line`` yields ``None`` for lines it cannot parse, and the header
    line parses to a record whose first element is ``'id'``.
    """
    if record is None:
        return False
    return record[0] != 'id'


# Parse every raw line, keeping only successfully parsed data rows.
google_parsed = google_raw.map(parse_line).filter(_is_data_row)
amazon_parsed = amazon_raw.map(parse_line).filter(_is_data_row)

# Show the first parsed record of each dataset as a quick sanity check.
for label, parsed in (("Google ", google_parsed), ("Amazon ", amazon_parsed)):
    print(label, parsed.take(1))
|