From eb42506ea8ac462ec19ef9cf88c2721b09b0b083 Mon Sep 17 00:00:00 2001 From: fly6516 Date: Mon, 14 Apr 2025 04:04:57 +0800 Subject: [PATCH] =?UTF-8?q?refactor(2-9):=E4=BC=98=E5=8C=96=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E6=95=B0=E6=8D=AE=E5=88=86=E6=9E=90=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将 RDD 转换为 DataFrame,提高数据处理效率 - 使用 DataFrame API 进行数据过滤和处理,代码更简洁 - 添加小时信息提取,便于后续分析 --- 2-9.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/2-9.py b/2-9.py index 47d3c0b..454ca96 100644 --- a/2-9.py +++ b/2-9.py @@ -58,8 +58,11 @@ def main(): # 过滤出有效日志行 access_logs = parsed_logs.filter(lambda s: s[1] == 1).map(lambda s: s[0]).cache() + # 将RDD转换为DataFrame + access_logs_df = spark.createDataFrame(access_logs) + # 过滤出404响应代码的日志 - access_logs_404 = access_logs.filter(access_logs.response_code == 404) + access_logs_404 = access_logs_df.filter(access_logs_df.response_code == 404) # 提取小时信息 access_logs_with_hour = access_logs_404.withColumn("hour", F.hour(access_logs_404.date_time))