commit 2912a4f86a957a6d10db7b465724cf233afabec0 Author: fly6516 Date: Sun Mar 16 19:01:24 2025 +0800 docs(BD-exp-3): 添加项目文档和示例代码 - 新增 App.java 和 AppTest.java 文件作为项目示例 - 添加 command.md、origin.md 和 sql.md 文件,包含项目相关命令和 SQL 语句 - 创建 pom.xml 文件定义项目依赖 diff --git a/command.md b/command.md new file mode 100644 index 0000000..406300a --- /dev/null +++ b/command.md @@ -0,0 +1,15 @@ +sed 's/::/,/g' users.dat > users_cleaned.dat +sed 's/::/,/g' movies.dat > movies_cleaned.dat +sed 's/::/,/g' ratings.dat > ratings_cleaned.dat + +hadoop fs -rm /user/hadoop/movie_data/users.dat +hadoop fs -rm /user/hadoop/movie_data/movies.dat +hadoop fs -rm /user/hadoop/movie_data/ratings.dat + +hadoop fs -mkdir /user/hadoop/movie_data/users +hadoop fs -mkdir /user/hadoop/movie_data/movies +hadoop fs -mkdir /user/hadoop/movie_data/ratings + +hadoop fs -put users_cleaned.dat /user/hadoop/movie_data/users/users.dat +hadoop fs -put movies_cleaned.dat /user/hadoop/movie_data/movies/movies.dat +hadoop fs -put ratings_cleaned.dat /user/hadoop/movie_data/ratings/ratings.dat diff --git a/origin.md b/origin.md new file mode 100644 index 0000000..7312030 --- /dev/null +++ b/origin.md @@ -0,0 +1,251 @@ +好的,我们可以使用 **HDFS** 存储数据,并利用 **Hive** 进行 SQL 查询分析。以下是实验步骤和指令: + +--- + +## **1. 数据准备** +### **1.1 上传数据到 HDFS** +1. **解压数据包** + ```bash + unzip data.zip + ``` + +2. **创建 HDFS 目录并上传数据** + ```bash + hdfs dfs -mkdir -p /user/hadoop/movie_data + hdfs dfs -put users.dat /user/hadoop/movie_data/ + hdfs dfs -put movies.dat /user/hadoop/movie_data/ + hdfs dfs -put ratings.dat /user/hadoop/movie_data/ + ``` + +--- + +## **2. 创建 Hive 数据库和表** +### **2.1 进入 Hive** +```bash +hive +``` +### **2.2 创建数据库** +```sql +CREATE DATABASE IF NOT EXISTS movie_analysis; +USE movie_analysis; +``` + +### **2.3 创建表** +```sql +-- 用户表 +CREATE EXTERNAL TABLE users ( + UserID BIGINT, + Gender STRING, + Age INT, + Occupation STRING, + Zipcode STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '::' +STORED AS TEXTFILE +LOCATION '/user/hadoop/movie_data/'; + +-- 电影表 +CREATE EXTERNAL TABLE movies ( + MovieID BIGINT, + Title STRING, + Genres STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '::' +STORED AS TEXTFILE +LOCATION '/user/hadoop/movie_data/'; + +-- 评分表 +CREATE EXTERNAL TABLE ratings ( + UserID BIGINT, + MovieID BIGINT, + Rating DOUBLE, + Timestamp STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '::' +STORED AS TEXTFILE +LOCATION '/user/hadoop/movie_data/'; +``` + +### **2.4 验证数据是否正确导入** +```sql +SELECT * FROM users LIMIT 5; +SELECT * FROM movies LIMIT 5; +SELECT * FROM ratings LIMIT 5; +``` + +--- + +## **3. 数据分析任务** +### **3.1 评分次数最多的10部电影** +```sql +SELECT m.Title, COUNT(r.MovieID) AS RatingCount +FROM ratings r +JOIN movies m ON r.MovieID = m.MovieID +GROUP BY m.Title +ORDER BY RatingCount DESC +LIMIT 10; +``` + +### **3.2 男性、女性评分最高的10部电影** +```sql +-- 男性 +SELECT u.Gender, m.Title, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN users u ON r.UserID = u.UserID +JOIN movies m ON r.MovieID = m.MovieID +WHERE u.Gender = 'M' +GROUP BY u.Gender, m.Title +ORDER BY AvgRating DESC +LIMIT 10; + +-- 女性 +SELECT u.Gender, m.Title, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN users u ON r.UserID = u.UserID +JOIN movies m ON r.MovieID = m.MovieID +WHERE u.Gender = 'F' +GROUP BY u.Gender, m.Title +ORDER BY AvgRating DESC +LIMIT 10; +``` + +### **3.3 电影 ID 为 2116 的各年龄段平均影评分** +```sql +SELECT u.Age, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN users u ON r.UserID = u.UserID +WHERE r.MovieID = 2116 +GROUP BY u.Age +ORDER BY u.Age; +``` + +### **3.4 观看次数最多的女性评最高分的10部电影** +```sql +WITH MostActiveFemale AS ( + SELECT UserID + FROM ( + SELECT UserID, COUNT(MovieID) AS rating_count + FROM ratings + WHERE UserID IN (SELECT UserID FROM users WHERE Gender = 'F') + GROUP BY UserID + ORDER BY rating_count DESC + LIMIT 1 + ) t +) +SELECT r.UserID, m.Title, AVG(r.Rating) AS AvgRating +FROM ratings r + JOIN movies m ON r.MovieID = m.MovieID + JOIN MostActiveFemale u ON r.UserID = u.UserID +GROUP BY r.UserID, m.Title +ORDER BY AvgRating DESC + LIMIT 10; +``` + +### **3.5 好片(评分≥4.0)最多的年份的前10部电影** +```sql + +WITH YearlyGoodMovies AS ( + SELECT + YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))) AS Year, + COUNT(DISTINCT r.MovieID) AS GoodMovieCount +FROM ratings r +WHERE r.Rating >= 4.0 +GROUP BY YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))) +ORDER BY GoodMovieCount DESC + LIMIT 1 + ), + + + TopMovies AS ( +SELECT + YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))) AS Year, + m.Title, + COUNT(r.MovieID) AS RatingCount, + AVG(r.Rating) AS AvgRating +FROM ratings r + JOIN movies m ON r.MovieID = m.MovieID +WHERE r.Rating >= 4.0 +GROUP BY YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))), m.Title + ) + +SELECT t.Year, t.Title, t.RatingCount, t.AvgRating +FROM TopMovies t + JOIN YearlyGoodMovies y ON t.Year = y.Year +ORDER BY t.RatingCount DESC, t.AvgRating DESC +LIMIT 10; +``` + +### **3.6 1997年上映的评分最高的10部Comedy电影** +```sql +SELECT m.Title, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN movies m ON r.MovieID = m.MovieID +WHERE m.Title LIKE '%(1997)%' AND m.Genres LIKE '%Comedy%' +GROUP BY m.Title +ORDER BY AvgRating DESC +LIMIT 10; +``` + +### **3.7 各类型电影中评价最高的5部电影** +```sql +SELECT m.Genres, m.Title, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN movies m ON r.MovieID = m.MovieID +GROUP BY m.Genres, m.Title +ORDER BY m.Genres, AvgRating DESC +LIMIT 5; +``` + +### **3.8 各年评分最高的电影类型** +```sql +SELECT YEAR(FROM_UNIXTIME(CAST(r.Timestamp AS BIGINT))) AS Year, m.Genres, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN movies m ON r.MovieID = m.MovieID +GROUP BY Year, m.Genres +ORDER BY Year, AvgRating DESC; +``` + +### **3.9 每个地区最高评分的电影名** +```sql +INSERT OVERWRITE DIRECTORY '/user/hadoop/movie_results' +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +SELECT u.Zipcode, m.Title, AVG(r.Rating) AS AvgRating +FROM ratings r +JOIN users u ON r.UserID = u.UserID +JOIN movies m ON r.MovieID = m.MovieID +GROUP BY u.Zipcode, m.Title +ORDER BY u.Zipcode, AvgRating DESC; +``` + +--- + +## **4. 结果存入 HDFS** +执行后,查询结果会存储在 `/user/hadoop/movie_results/` 目录下,可用以下命令查看: +```bash +hdfs dfs -ls /user/hadoop/movie_results/ +hdfs dfs -cat /user/hadoop/movie_results/part-00000 +``` + +--- + +### **总结** +1. **数据准备** + - 解压并上传数据到 HDFS + - 创建 Hive 数据库和表 + - 验证数据是否正确导入 + +2. **数据分析** + - 计算评分最多的电影 + - 计算男女评分最高的电影 + - 计算不同年龄段对某部电影的评分 + - 找出最活跃女性的最爱电影 + - 计算评分最高的电影年份及类别 + - 存储地区最高评分电影至 HDFS + +--- + +这就是完整的实验报告步骤和指令,你可以直接复制执行。如果有任何问题,欢迎交流!🚀 \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..253383f --- /dev/null +++ b/pom.xml @@ -0,0 +1,25 @@ + + 4.0.0 + + org.example + BD-exp-3 + 1.0-SNAPSHOT + jar + + BD-exp-3 + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 3.8.1 + test + + + diff --git a/sql.md b/sql.md new file mode 100644 index 0000000..c4a028f --- /dev/null +++ b/sql.md @@ -0,0 +1,42 @@ +DROP TABLE IF EXISTS ratings; +DROP TABLE IF EXISTS movies; +DROP TABLE IF EXISTS users; + +CREATE EXTERNAL TABLE users ( +UserID BIGINT, +Gender STRING, +Age INT, +Occupation STRING, +Zipcode STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +STORED AS TEXTFILE +LOCATION '/user/hadoop/movie_data/users'; + + +CREATE EXTERNAL TABLE movies ( +MovieID BIGINT, +Title STRING, +Genres STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +STORED AS TEXTFILE +LOCATION '/user/hadoop/movie_data/movies'; + + +CREATE EXTERNAL TABLE ratings ( +UserID BIGINT, +MovieID BIGINT, +Rating DOUBLE, +Timestamped STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +STORED AS TEXTFILE +LOCATION '/user/hadoop/movie_data/ratings'; + +SELECT * FROM users LIMIT 5; +SELECT * FROM movies LIMIT 5; +SELECT * FROM ratings LIMIT 5; diff --git a/src/main/java/org/example/App.java b/src/main/java/org/example/App.java new file mode 100644 index 0000000..5f21d2e --- /dev/null +++ b/src/main/java/org/example/App.java @@ -0,0 +1,13 @@ +package org.example; + +/** + * Hello world! + * + */ +public class App +{ + public static void main( String[] args ) + { + System.out.println( "Hello World!" ); + } +} diff --git a/src/test/java/org/example/AppTest.java b/src/test/java/org/example/AppTest.java new file mode 100644 index 0000000..d5f435d --- /dev/null +++ b/src/test/java/org/example/AppTest.java @@ -0,0 +1,38 @@ +package org.example; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * Unit test for simple App. + */ +public class AppTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public AppTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( AppTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +}