fix(1-1.py): 优化日志解析和空 RDD 处理

- 重构日志解析逻辑,提高代码可读性
- 改进空 RDD 检查方法,使用 count() == 0 替代 isEmpty()
- 优化输出格式,使用 str.format() 替代 f-string
This commit is contained in:
fly6516 2025-04-14 01:51:33 +08:00
parent c48a91f11e
commit 28d60018af

15
1-1.py
View File

@@ -29,11 +29,13 @@ def parse_log_line(line):
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)
access_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None).cache()
# 加入一个保护,防止 access_logs 空时报错
if access_logs.isEmpty():
print("日志文件为空或解析失败")
parsed_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
access_logs = parsed_logs.cache()
# 判断是否为空 RDD(兼容性写法:count() == 0)
if access_logs.count() == 0:
print("日志文件为空或格式不匹配")
else:
endpoint_counts = (access_logs
.map(lambda log: (log['endpoint'], 1))
@@ -42,7 +44,8 @@ else:
.take(10))
print("Top 10 most visited endpoints:")
for endpoint, count in endpoint_counts:
print(f"{endpoint}: {count} hits")
for item in endpoint_counts:
endpoint, count = item[0], item[1]
print("{0}: {1} hits".format(endpoint, count))
sc.stop()