from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating
import math
# Initialize Spark context
sc = SparkContext(appName="MovieLensALS")
# Load ratings data: format (userId, movieId, rating)
rawData = sc.textFile("hdfs://master:9000/user/root/ratings.csv")
header = rawData.first()
rawData = rawData.filter(lambda line: line != header)
# Convert to Rating objects and filter incomplete lines
ratingsRDD = rawData.map(lambda line: line.split(",")) \
    .filter(lambda fields: len(fields) >= 3) \
    .map(lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
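# Optional: ratingsRDD feeds randomSplit() and several count() actions below, so caching
# it avoids re-reading and re-parsing the CSV for every action (assumes the ratings fit
# in executor memory).
ratingsRDD.cache()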
# Split data into training, validation, and test sets (60%, 20%, 20%)
trainingRDD, validationRDD, testRDD = ratingsRDD.randomSplit([6, 2, 2], seed=0)
print("Training: %d, validation: %d, test: %d" % (trainingRDD.count(), validationRDD.count(), testRDD.count()))
# Set ALS model parameters
rank = 10           # number of latent factors
numIterations = 10  # number of ALS iterations
lambda_ = 0.1       # regularization parameter
# Train ALS model on the training data
myModel = ALS.train(trainingRDD, rank, iterations=numIterations, lambda_=lambda_)
# Function to compute RMSE between predicted and actual ratings
def computeError(predictedRDD, actualRDD):
    # Key both RDDs by (userId, movieId) so predicted and actual ratings can be joined
    predictedReformattedRDD = predictedRDD.filter(lambda r: len(r) == 3) \
        .map(lambda r: ((r[0], r[1]), r[2]))
    actualReformattedRDD = actualRDD.filter(lambda r: len(r) == 3) \
        .map(lambda r: ((r[0], r[1]), r[2]))
    joined = predictedReformattedRDD.join(actualReformattedRDD)
    squaredErrorsRDD = joined.map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
    numRatings = squaredErrorsRDD.count()
    # reduce() raises on an empty RDD, so return NaN when the join produced no pairs
    if numRatings == 0:
        return float('nan')
    totalError = squaredErrorsRDD.reduce(lambda a, b: a + b)
    return math.sqrt(float(totalError) / numRatings)
# Predict and evaluate on the validation set
validationPairs = validationRDD.map(lambda rating: (rating[0], rating[1]))
predictedValRDD = myModel.predictAll(validationPairs)
print("\nSample validation predictions:")
print(predictedValRDD.take(3))
print(validationRDD.take(3))
try:
    validationRMSE = computeError(predictedValRDD, validationRDD)
    print("Validation RMSE: %f" % validationRMSE)
except Exception as e:
    print("Error computing validation RMSE:", str(e))
# Predict and evaluate on the test set
testPairs = testRDD.map(lambda rating: (rating[0], rating[1]))
predictedTestRDD = myModel.predictAll(testPairs)
print("\nSample test predictions:")
print(predictedTestRDD.take(3))
print(testRDD.take(3))
print("Predicted count: %d" % predictedTestRDD.count())
print("Test count: %d" % testRDD.count())
try:
    testRMSE = computeError(predictedTestRDD, testRDD)
    print("Test RMSE: %f" % testRMSE)
except Exception as e:
    print("Error computing test RMSE:", str(e))
# Stop the Spark context
sc.stop()