fix(1-1.py): 优化日志解析和空 RDD 处理
- 重构日志解析逻辑,提高代码可读性
- 改进空 RDD 检查方法,使用 count() 替代 isEmpty()
- 优化输出格式,使用 str.format() 替代 f-string
This commit is contained in:
parent
c48a91f11e
commit
28d60018af
15
1-1.py
15
1-1.py
@@ -29,11 +29,13 @@ def parse_log_line(line):
|
||||
|
||||
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
|
||||
raw_logs = sc.textFile(logFile)
|
||||
access_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None).cache()
|
||||
|
||||
# 加入一个保护,防止 access_logs 空时报错
|
||||
if access_logs.isEmpty():
|
||||
print("日志文件为空或解析失败")
|
||||
parsed_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
|
||||
access_logs = parsed_logs.cache()
|
||||
|
||||
# 判断是否为空 RDD(兼容性写法:count() == 0)
|
||||
if access_logs.count() == 0:
|
||||
print("日志文件为空或格式不匹配")
|
||||
else:
|
||||
endpoint_counts = (access_logs
|
||||
.map(lambda log: (log['endpoint'], 1))
|
||||
@@ -42,7 +44,8 @@ else:
|
||||
.take(10))
|
||||
|
||||
print("Top 10 most visited endpoints:")
|
||||
for endpoint, count in endpoint_counts:
|
||||
print(f"{endpoint}: {count} hits")
|
||||
for item in endpoint_counts:
|
||||
endpoint, count = item[0], item[1]
|
||||
print("{0}: {1} hits".format(endpoint, count))
|
||||
|
||||
sc.stop()
|
||||
|
Loading…
Reference in New Issue
Block a user