refactor(2-9): 优化日志数据分析代码

- 将 RDD 转换为 DataFrame,提高数据处理效率
- 使用 DataFrame API 进行数据过滤和处理,代码更简洁
- 添加小时信息提取,便于后续分析
This commit is contained in:
fly6516 2025-04-14 04:04:57 +08:00
parent 5190f1fb90
commit eb42506ea8

5
2-9.py
View File

@@ -58,8 +58,11 @@ def main():
# 过滤出有效日志行
access_logs = parsed_logs.filter(lambda s: s[1] == 1).map(lambda s: s[0]).cache()
# 将RDD转换为DataFrame
access_logs_df = spark.createDataFrame(access_logs)
# 过滤出404响应代码的日志
access_logs_404 = access_logs.filter(access_logs.response_code == 404)
access_logs_404 = access_logs_df.filter(access_logs_df.response_code == 404)
# 提取小时信息
access_logs_with_hour = access_logs_404.withColumn("hour", F.hour(access_logs_404.date_time))