# 1-1.py
from pyspark import SparkContext

sc = SparkContext()

def parse_line(line):
    """Parse one CSV line into an (id, content) pair, or None if the line is malformed."""
    try:
        # Naive CSV split on the '","' delimiter used by these files.
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        # Combine the text fields into a single content string.
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except Exception:
        return None

google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"

google_raw = sc.textFile(google_path)
amazon_raw = sc.textFile(amazon_path)

# Parse every line, then drop failed parses and the header row (whose id field is 'id').
google_parsed = google_raw.map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon_parsed = amazon_raw.map(parse_line).filter(lambda x: x is not None and x[0] != 'id')

print("Google ", google_parsed.take(1))
print("Amazon ", amazon_parsed.take(1))