From 28d60018af86b1f0b23119ad3b9ea7091d8d1afd Mon Sep 17 00:00:00 2001 From: fly6516 Date: Mon, 14 Apr 2025 01:51:33 +0800 Subject: [PATCH] =?UTF-8?q?fix(1-1.py):=E4=BC=98=E5=8C=96=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E8=A7=A3=E6=9E=90=E5=92=8C=E7=A9=BA=20RDD=E5=A4=84?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构日志解析逻辑,提高代码可读性 - 改进空 RDD检查方法,使用 count() 替代 isEmpty() - 优化输出格式,使用格式化字符串替代 f-string --- 1-1.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/1-1.py b/1-1.py index b27440b..4c12a5f 100644 --- a/1-1.py +++ b/1-1.py @@ -29,11 +29,13 @@ def parse_log_line(line): logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT" raw_logs = sc.textFile(logFile) -access_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None).cache() -# 加入一个保护,防止 access_logs 空时报错 -if access_logs.isEmpty(): - print("日志文件为空或解析失败") +parsed_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None) +access_logs = parsed_logs.cache() + +# 判断是否为空 RDD(兼容性写法:count() == 0) +if access_logs.count() == 0: + print("日志文件为空或格式不匹配") else: endpoint_counts = (access_logs .map(lambda log: (log['endpoint'], 1)) @@ -42,7 +44,8 @@ else: .take(10)) print("Top 10 most visited endpoints:") - for endpoint, count in endpoint_counts: - print(f"{endpoint}: {count} hits") + for item in endpoint_counts: + endpoint, count = item[0], item[1] + print("{0}: {1} hits".format(endpoint, count)) sc.stop()