# BD-exp-9/1-1.py

# 1-1.py
import csv

from pyspark import SparkContext

# Driver-side Spark entry point used for every RDD created below.
# No master or application name is passed here, so they must be supplied from
# outside this file (presumably spark-submit or Spark defaults -- TODO confirm).
sc = SparkContext()

def parse_line(line):
    """Parse one CSV record into an ``(id, content)`` pair.

    The record is decoded with the standard ``csv`` reader rather than by
    splitting on the literal ``","`` sequence, so quoted, unquoted and empty
    fields are all accepted and enclosing quotes never leak into the
    extracted values.  Columns 1-3 (title, description, manufacturer) are
    stripped and joined with single spaces to form ``content``.

    Args:
        line: A single line of text from the CSV file.

    Returns:
        A tuple ``(id, content)``, or ``None`` when ``line`` is not a string,
        cannot be decoded as CSV, or has fewer than four columns.
    """
    try:
        fields = next(csv.reader([line.strip()]))
    except (AttributeError, TypeError, StopIteration, csv.Error):
        # Not a string / not decodable as CSV: skip the record instead of
        # failing the whole job, as the caller filters out None results.
        return None

    # Columns 0-3 are required; any further columns are ignored.
    if len(fields) < 4:
        return None

    id_ = fields[0].strip()
    title, desc, manufacturer = (field.strip() for field in fields[1:4])
    content = "{} {} {}".format(title, desc, manufacturer).strip()
    return (id_, content)

# HDFS locations of the two CSV inputs.
google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"

# One RDD element per line of text in each file.
google_raw = sc.textFile(google_path)
amazon_raw = sc.textFile(amazon_path)


def _is_data_record(record):
    """Return True for a parsed row that is neither rejected nor the header."""
    if record is None:
        return False
    # The header row parses to a record whose id column is the literal 'id'.
    return record[0] != 'id'


google_parsed = google_raw.map(parse_line).filter(_is_data_record)
amazon_parsed = amazon_raw.map(parse_line).filter(_is_data_record)

# Show the first parsed record from each source.
print("Google ", google_parsed.take(1))
print("Amazon ", amazon_parsed.take(1))