docs(BD-exp-3): 添加项目文档和示例代码
- 新增 App.java 和 AppTest.java 文件作为项目示例 - 添加 command.md、origin.md 和 sql.md 文件,包含项目相关命令和 SQL 语句 - 创建 pom.xml 文件定义项目依赖
This commit is contained in:
commit
2912a4f86a
15
command.md
Normal file
15
command.md
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
sed 's/::/,/g' users.dat > users_cleaned.dat
|
||||||
|
sed 's/::/,/g' movies.dat > movies_cleaned.dat
|
||||||
|
sed 's/::/,/g' ratings.dat > ratings_cleaned.dat
|
||||||
|
|
||||||
|
hadoop fs -rm /user/hadoop/movie_data/users.dat
|
||||||
|
hadoop fs -rm /user/hadoop/movie_data/movies.dat
|
||||||
|
hadoop fs -rm /user/hadoop/movie_data/ratings.dat
|
||||||
|
|
||||||
|
hadoop fs -mkdir /user/hadoop/movie_data/users
|
||||||
|
hadoop fs -mkdir /user/hadoop/movie_data/movies
|
||||||
|
hadoop fs -mkdir /user/hadoop/movie_data/ratings
|
||||||
|
|
||||||
|
hadoop fs -put users_cleaned.dat /user/hadoop/movie_data/users/users.dat
|
||||||
|
hadoop fs -put movies_cleaned.dat /user/hadoop/movie_data/movies/movies.dat
|
||||||
|
hadoop fs -put ratings_cleaned.dat /user/hadoop/movie_data/ratings/ratings.dat
|
251
origin.md
Normal file
251
origin.md
Normal file
@ -0,0 +1,251 @@
|
|||||||
|
好的,我们可以使用 **HDFS** 存储数据,并利用 **Hive** 进行 SQL 查询分析。以下是实验步骤和指令:
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## **1. 数据准备**
|
||||||
|
### **1.1 上传数据到 HDFS**
|
||||||
|
1. **解压数据包**
|
||||||
|
```bash
|
||||||
|
unzip data.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **创建 HDFS 目录并上传数据**
|
||||||
|
```bash
|
||||||
|
hdfs dfs -mkdir -p /user/hadoop/movie_data
|
||||||
|
hdfs dfs -put users.dat /user/hadoop/movie_data/
|
||||||
|
hdfs dfs -put movies.dat /user/hadoop/movie_data/
|
||||||
|
hdfs dfs -put ratings.dat /user/hadoop/movie_data/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## **2. 创建 Hive 数据库和表**
|
||||||
|
### **2.1 进入 Hive**
|
||||||
|
```bash
|
||||||
|
hive
|
||||||
|
```
|
||||||
|
### **2.2 创建数据库**
|
||||||
|
```sql
|
||||||
|
CREATE DATABASE IF NOT EXISTS movie_analysis;
|
||||||
|
USE movie_analysis;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **2.3 创建表**
|
||||||
|
```sql
|
||||||
|
-- 用户表
|
||||||
|
CREATE EXTERNAL TABLE users (
|
||||||
|
UserID BIGINT,
|
||||||
|
Gender STRING,
|
||||||
|
Age INT,
|
||||||
|
Occupation STRING,
|
||||||
|
Zipcode STRING
|
||||||
|
)
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY '::'
|
||||||
|
STORED AS TEXTFILE
|
||||||
|
LOCATION '/user/hadoop/movie_data/';
|
||||||
|
|
||||||
|
-- 电影表
|
||||||
|
CREATE EXTERNAL TABLE movies (
|
||||||
|
MovieID BIGINT,
|
||||||
|
Title STRING,
|
||||||
|
Genres STRING
|
||||||
|
)
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY '::'
|
||||||
|
STORED AS TEXTFILE
|
||||||
|
LOCATION '/user/hadoop/movie_data/';
|
||||||
|
|
||||||
|
-- 评分表
|
||||||
|
CREATE EXTERNAL TABLE ratings (
|
||||||
|
UserID BIGINT,
|
||||||
|
MovieID BIGINT,
|
||||||
|
Rating DOUBLE,
|
||||||
|
Timestamp STRING
|
||||||
|
)
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY '::'
|
||||||
|
STORED AS TEXTFILE
|
||||||
|
LOCATION '/user/hadoop/movie_data/';
|
||||||
|
```
|
||||||
|
|
||||||
|
### **2.4 验证数据是否正确导入**
|
||||||
|
```sql
|
||||||
|
SELECT * FROM users LIMIT 5;
|
||||||
|
SELECT * FROM movies LIMIT 5;
|
||||||
|
SELECT * FROM ratings LIMIT 5;
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## **3. 数据分析任务**
|
||||||
|
### **3.1 评分次数最多的10部电影**
|
||||||
|
```sql
|
||||||
|
SELECT m.Title, COUNT(r.MovieID) AS RatingCount
|
||||||
|
FROM ratings r
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
GROUP BY m.Title
|
||||||
|
ORDER BY RatingCount DESC
|
||||||
|
LIMIT 10;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.2 男性、女性评分最高的10部电影**
|
||||||
|
```sql
|
||||||
|
-- 男性
|
||||||
|
SELECT u.Gender, m.Title, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN users u ON r.UserID = u.UserID
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
WHERE u.Gender = 'M'
|
||||||
|
GROUP BY u.Gender, m.Title
|
||||||
|
ORDER BY AvgRating DESC
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
-- 女性
|
||||||
|
SELECT u.Gender, m.Title, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN users u ON r.UserID = u.UserID
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
WHERE u.Gender = 'F'
|
||||||
|
GROUP BY u.Gender, m.Title
|
||||||
|
ORDER BY AvgRating DESC
|
||||||
|
LIMIT 10;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.3 电影 ID 为 2116 的各年龄段平均影评分**
|
||||||
|
```sql
|
||||||
|
SELECT u.Age, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN users u ON r.UserID = u.UserID
|
||||||
|
WHERE r.MovieID = 2116
|
||||||
|
GROUP BY u.Age
|
||||||
|
ORDER BY u.Age;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.4 观看次数最多的女性评最高分的10部电影**
|
||||||
|
```sql
|
||||||
|
WITH MostActiveFemale AS (
|
||||||
|
SELECT UserID
|
||||||
|
FROM (
|
||||||
|
SELECT UserID, COUNT(MovieID) AS rating_count
|
||||||
|
FROM ratings
|
||||||
|
WHERE UserID IN (SELECT UserID FROM users WHERE Gender = 'F')
|
||||||
|
GROUP BY UserID
|
||||||
|
ORDER BY rating_count DESC
|
||||||
|
LIMIT 1
|
||||||
|
) t
|
||||||
|
)
|
||||||
|
SELECT r.UserID, m.Title, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
JOIN MostActiveFemale u ON r.UserID = u.UserID
|
||||||
|
GROUP BY r.UserID, m.Title
|
||||||
|
ORDER BY AvgRating DESC
|
||||||
|
LIMIT 10;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.5 好片(评分≥4.0)最多的年份的前10部电影**
|
||||||
|
```sql
|
||||||
|
|
||||||
|
WITH YearlyGoodMovies AS (
|
||||||
|
SELECT
|
||||||
|
YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))) AS Year,
|
||||||
|
COUNT(DISTINCT r.MovieID) AS GoodMovieCount
|
||||||
|
FROM ratings r
|
||||||
|
WHERE r.Rating >= 4.0
|
||||||
|
GROUP BY YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT)))
|
||||||
|
ORDER BY GoodMovieCount DESC
|
||||||
|
LIMIT 1
|
||||||
|
),
|
||||||
|
|
||||||
|
|
||||||
|
TopMovies AS (
|
||||||
|
SELECT
|
||||||
|
YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))) AS Year,
|
||||||
|
m.Title,
|
||||||
|
COUNT(r.MovieID) AS RatingCount,
|
||||||
|
AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
WHERE r.Rating >= 4.0
|
||||||
|
GROUP BY YEAR(FROM_UNIXTIME(CAST(r.Timestamped AS BIGINT))), m.Title
|
||||||
|
)
|
||||||
|
|
||||||
|
SELECT t.Year, t.Title, t.RatingCount, t.AvgRating
|
||||||
|
FROM TopMovies t
|
||||||
|
JOIN YearlyGoodMovies y ON t.Year = y.Year
|
||||||
|
ORDER BY t.RatingCount DESC, t.AvgRating DESC
|
||||||
|
LIMIT 10;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.6 1997年上映的评分最高的10部Comedy电影**
|
||||||
|
```sql
|
||||||
|
SELECT m.Title, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
WHERE m.Title LIKE '%(1997)%' AND m.Genres LIKE '%Comedy%'
|
||||||
|
GROUP BY m.Title
|
||||||
|
ORDER BY AvgRating DESC
|
||||||
|
LIMIT 10;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.7 各类型电影中评价最高的5部电影**
|
||||||
|
```sql
|
||||||
|
SELECT m.Genres, m.Title, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
GROUP BY m.Genres, m.Title
|
||||||
|
ORDER BY m.Genres, AvgRating DESC
|
||||||
|
LIMIT 5;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.8 各年评分最高的电影类型**
|
||||||
|
```sql
|
||||||
|
SELECT YEAR(FROM_UNIXTIME(CAST(r.Timestamp AS BIGINT))) AS Year, m.Genres, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
GROUP BY Year, m.Genres
|
||||||
|
ORDER BY Year, AvgRating DESC;
|
||||||
|
```
|
||||||
|
|
||||||
|
### **3.9 每个地区最高评分的电影名**
|
||||||
|
```sql
|
||||||
|
INSERT OVERWRITE DIRECTORY '/user/hadoop/movie_results'
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY ','
|
||||||
|
SELECT u.Zipcode, m.Title, AVG(r.Rating) AS AvgRating
|
||||||
|
FROM ratings r
|
||||||
|
JOIN users u ON r.UserID = u.UserID
|
||||||
|
JOIN movies m ON r.MovieID = m.MovieID
|
||||||
|
GROUP BY u.Zipcode, m.Title
|
||||||
|
ORDER BY u.Zipcode, AvgRating DESC;
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## **4. 结果存入 HDFS**
|
||||||
|
执行后,查询结果会存储在 `/user/hadoop/movie_results/` 目录下,可用以下命令查看:
|
||||||
|
```bash
|
||||||
|
hdfs dfs -ls /user/hadoop/movie_results/
|
||||||
|
hdfs dfs -cat /user/hadoop/movie_results/part-00000
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### **总结**
|
||||||
|
1. **数据准备**
|
||||||
|
- 解压并上传数据到 HDFS
|
||||||
|
- 创建 Hive 数据库和表
|
||||||
|
- 验证数据是否正确导入
|
||||||
|
|
||||||
|
2. **数据分析**
|
||||||
|
- 计算评分最多的电影
|
||||||
|
- 计算男女评分最高的电影
|
||||||
|
- 计算不同年龄段对某部电影的评分
|
||||||
|
- 找出最活跃女性的最爱电影
|
||||||
|
- 计算评分最高的电影年份及类别
|
||||||
|
- 存储地区最高评分电影至 HDFS
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
这就是完整的实验报告步骤和指令,你可以直接复制执行。如果有任何问题,欢迎交流!🚀
|
25
pom.xml
Normal file
25
pom.xml
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<groupId>org.example</groupId>
|
||||||
|
<artifactId>BD-exp-3</artifactId>
|
||||||
|
<version>1.0-SNAPSHOT</version>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<name>BD-exp-3</name>
|
||||||
|
<url>http://maven.apache.org</url>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>3.8.1</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</project>
|
42
sql.md
Normal file
42
sql.md
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
DROP TABLE IF EXISTS ratings;
|
||||||
|
DROP TABLE IF EXISTS movies;
|
||||||
|
DROP TABLE IF EXISTS users;
|
||||||
|
|
||||||
|
CREATE EXTERNAL TABLE users (
|
||||||
|
UserID BIGINT,
|
||||||
|
Gender STRING,
|
||||||
|
Age INT,
|
||||||
|
Occupation STRING,
|
||||||
|
Zipcode STRING
|
||||||
|
)
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY ','
|
||||||
|
STORED AS TEXTFILE
|
||||||
|
LOCATION '/user/hadoop/movie_data/users';
|
||||||
|
|
||||||
|
|
||||||
|
CREATE EXTERNAL TABLE movies (
|
||||||
|
MovieID BIGINT,
|
||||||
|
Title STRING,
|
||||||
|
Genres STRING
|
||||||
|
)
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY ','
|
||||||
|
STORED AS TEXTFILE
|
||||||
|
LOCATION '/user/hadoop/movie_data/movies';
|
||||||
|
|
||||||
|
|
||||||
|
CREATE EXTERNAL TABLE ratings (
|
||||||
|
UserID BIGINT,
|
||||||
|
MovieID BIGINT,
|
||||||
|
Rating DOUBLE,
|
||||||
|
Timestamped STRING
|
||||||
|
)
|
||||||
|
ROW FORMAT DELIMITED
|
||||||
|
FIELDS TERMINATED BY ','
|
||||||
|
STORED AS TEXTFILE
|
||||||
|
LOCATION '/user/hadoop/movie_data/ratings';
|
||||||
|
|
||||||
|
SELECT * FROM users LIMIT 5;
|
||||||
|
SELECT * FROM movies LIMIT 5;
|
||||||
|
SELECT * FROM ratings LIMIT 5;
|
13
src/main/java/org/example/App.java
Normal file
13
src/main/java/org/example/App.java
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
package org.example;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hello world!
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class App
|
||||||
|
{
|
||||||
|
public static void main( String[] args )
|
||||||
|
{
|
||||||
|
System.out.println( "Hello World!" );
|
||||||
|
}
|
||||||
|
}
|
38
src/test/java/org/example/AppTest.java
Normal file
38
src/test/java/org/example/AppTest.java
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
package org.example;
|
||||||
|
|
||||||
|
import junit.framework.Test;
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import junit.framework.TestSuite;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit test for simple App.
|
||||||
|
*/
|
||||||
|
public class AppTest
|
||||||
|
extends TestCase
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Create the test case
|
||||||
|
*
|
||||||
|
* @param testName name of the test case
|
||||||
|
*/
|
||||||
|
public AppTest( String testName )
|
||||||
|
{
|
||||||
|
super( testName );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the suite of tests being tested
|
||||||
|
*/
|
||||||
|
public static Test suite()
|
||||||
|
{
|
||||||
|
return new TestSuite( AppTest.class );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rigourous Test :-)
|
||||||
|
*/
|
||||||
|
public void testApp()
|
||||||
|
{
|
||||||
|
assertTrue( true );
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user