init code
commit 8e93a04382
29  1-1.py  Normal file
@@ -0,0 +1,29 @@
# 1-1.py
from pyspark import SparkContext

sc = SparkContext()


def parse_line(line):
    # Each record is a quoted CSV row: "id","title","description","manufacturer",...
    try:
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except Exception:
        return None


google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"

google_raw = sc.textFile(google_path)
amazon_raw = sc.textFile(amazon_path)

# Drop unparseable rows and the CSV header row (whose id field is the literal 'id').
google_parsed = google_raw.map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon_parsed = amazon_raw.map(parse_line).filter(lambda x: x is not None and x[0] != 'id')

print("Google ", google_parsed.take(1))
print("Amazon ", amazon_parsed.take(1))
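A quick way to sanity-check parse_line outside Spark is to call it on a single row in the same session. The row below is made up, with a trailing fifth field, matching the quoted id/title/description/manufacturer layout the indexing in parse_line implies:

# Hypothetical row in the quoted-CSV layout the script expects.
sample = '"id123","Title text","Description text","Maker Inc","0.0"'
print(parse_line(sample))
# -> ('id123', 'Title text Description text Maker Inc')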
42  2-1.py  Normal file
@@ -0,0 +1,42 @@
# 2-1.py
from pyspark import SparkContext

sc = SparkContext()

# The stopword list is small enough to collect to the driver and ship in the task closure.
stopwords = set(sc.textFile("hdfs://master:9000/user/root/stopwords.txt").collect())


def tokenize(text):
    import re
    words = re.findall(r'\w+', text.lower())
    return [word for word in words if word not in stopwords]


def to_token_rdd(record):
    return (record[0], tokenize(record[1]))


google_path = "hdfs://master:9000/user/root/Google_small.csv"
amazon_path = "hdfs://master:9000/user/root/Amazon_small.csv"


def parse_line(line):
    try:
        parts = line.strip().split('","')
        id_ = parts[0].replace('"', '')
        title = parts[1].strip()
        desc = parts[2].strip()
        manufacturer = parts[3].strip()
        content = "{} {} {}".format(title, desc, manufacturer).strip()
        return (id_, content)
    except Exception:
        return None


google = sc.textFile(google_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')
amazon = sc.textFile(amazon_path).map(parse_line).filter(lambda x: x is not None and x[0] != 'id')

google_tokens = google.map(to_token_rdd)
amazon_tokens = amazon.map(to_token_rdd)

# Total token count across both datasets, and the Amazon record with the most tokens.
token_count = google_tokens.union(amazon_tokens).flatMap(lambda x: x[1]).count()
longest_amazon_doc = amazon_tokens.map(lambda x: (x[0], len(x[1]))).takeOrdered(1, key=lambda x: -x[1])

print("total tokens: ", token_count)
print("longest Amazon record:", longest_amazon_doc)
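For reference, a minimal local sketch of the same tokenization logic. The stopword set here is a made-up stand-in, since the real list lives in stopwords.txt on HDFS:

import re

stopwords = {"the", "and", "of"}  # hypothetical stand-in for stopwords.txt

def tokenize(text):
    words = re.findall(r'\w+', text.lower())
    return [word for word in words if word not in stopwords]

print(tokenize("The Art of Programming, 2nd Edition"))
# -> ['art', 'programming', '2nd', 'edition']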
39  3-1.py  Normal file
@@ -0,0 +1,39 @@
# 3-1.py
from pyspark import SparkContext
from collections import defaultdict

sc = SparkContext()

# google_tokens and amazon_tokens are the (id, [tokens]) RDDs built in 2-1.py;
# they are referenced here without being redefined, so this script assumes they
# already exist in the running environment.
corpus = google_tokens.union(amazon_tokens)
N = corpus.count()


def compute_tf(record):
    # Term frequency: raw count of each token divided by the document length.
    doc_id, tokens = record
    tf = defaultdict(float)
    for token in tokens:
        tf[token] += 1.0
    total = float(len(tokens)) or 1.0  # guard against empty documents
    for key in tf:
        tf[key] = tf[key] / total
    return (doc_id, tf)


tf_rdd = corpus.map(compute_tf)

# Inverse document frequency: N divided by the number of distinct documents containing the token.
token_docs = corpus.flatMap(lambda x: [(token, x[0]) for token in set(x[1])])
doc_freq = token_docs.groupByKey().mapValues(lambda x: len(set(x)))
idf_dict = doc_freq.map(lambda x: (x[0], float(N) / x[1])).collectAsMap()
idf_bcast = sc.broadcast(idf_dict)


def compute_tfidf(record):
    doc_id, tf_map = record
    idf_map = idf_bcast.value
    tfidf = {}
    for token in tf_map:
        tfidf[token] = tf_map[token] * idf_map.get(token, 0.0)
    return (doc_id, tfidf)


tfidf_rdd = tf_rdd.map(compute_tfidf)

print("TF-IDF sample: ", tfidf_rdd.take(1))
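As a worked example of the weighting used above (TF = token count / document length, IDF = N / document frequency, with no log), on made-up numbers:

# Hypothetical corpus of N = 4 documents; "premium" appears in 2 of them.
N = 4
idf_premium = float(N) / 2       # 2.0

# In a 5-token document where "premium" occurs once:
tf_premium = 1.0 / 5             # 0.2
print(tf_premium * idf_premium)  # -> 0.4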