diff --git a/2-9.py b/2-9.py index 47d3c0b..454ca96 100644 --- a/2-9.py +++ b/2-9.py @@ -58,8 +58,11 @@ def main(): # 过滤出有效日志行 access_logs = parsed_logs.filter(lambda s: s[1] == 1).map(lambda s: s[0]).cache() + # 将RDD转换为DataFrame + access_logs_df = spark.createDataFrame(access_logs) + # 过滤出404响应代码的日志 - access_logs_404 = access_logs.filter(access_logs.response_code == 404) + access_logs_404 = access_logs_df.filter(access_logs_df.response_code == 404) # 提取小时信息 access_logs_with_hour = access_logs_404.withColumn("hour", F.hour(access_logs_404.date_time))