refactor(2-9):优化日志数据分析代码
- 将 RDD 转换为 DataFrame,提高数据处理效率
- 使用 DataFrame API 进行数据过滤和处理,代码更简洁
- 添加小时信息提取,便于后续分析
This commit is contained in:
parent
5190f1fb90
commit
eb42506ea8
5
2-9.py
5
2-9.py
@@ -58,8 +58,11 @@ def main():
|
||||
# 过滤出有效日志行
|
||||
access_logs = parsed_logs.filter(lambda s: s[1] == 1).map(lambda s: s[0]).cache()
|
||||
|
||||
# 将RDD转换为DataFrame
|
||||
access_logs_df = spark.createDataFrame(access_logs)
|
||||
|
||||
# 过滤出404响应代码的日志
|
||||
access_logs_404 = access_logs.filter(access_logs.response_code == 404)
|
||||
access_logs_404 = access_logs_df.filter(access_logs_df.response_code == 404)
|
||||
|
||||
# 提取小时信息
|
||||
access_logs_with_hour = access_logs_404.withColumn("hour", F.hour(access_logs_404.date_time))
|
||||
|
Loading…
Reference in New Issue
Block a user