fix(1-1.py): 优化日志解析和空 RDD 处理
- 重构日志解析逻辑,提高代码可读性
- 改进空 RDD 检查方法,使用 count() 替代 isEmpty()
- 优化输出格式,使用 str.format() 替代 f-string
This commit is contained in:
parent
c48a91f11e
commit
28d60018af
15
1-1.py
15
1-1.py
@ -29,11 +29,13 @@ def parse_log_line(line):
|
|||||||
|
|
||||||
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
|
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
|
||||||
raw_logs = sc.textFile(logFile)
|
raw_logs = sc.textFile(logFile)
|
||||||
access_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None).cache()
|
|
||||||
|
|
||||||
# 加入一个保护,防止 access_logs 空时报错
|
parsed_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
|
||||||
if access_logs.isEmpty():
|
access_logs = parsed_logs.cache()
|
||||||
print("日志文件为空或解析失败")
|
|
||||||
|
# 判断是否为空 RDD(兼容性写法:count() == 0)
|
||||||
|
if access_logs.count() == 0:
|
||||||
|
print("日志文件为空或格式不匹配")
|
||||||
else:
|
else:
|
||||||
endpoint_counts = (access_logs
|
endpoint_counts = (access_logs
|
||||||
.map(lambda log: (log['endpoint'], 1))
|
.map(lambda log: (log['endpoint'], 1))
|
||||||
@ -42,7 +44,8 @@ else:
|
|||||||
.take(10))
|
.take(10))
|
||||||
|
|
||||||
print("Top 10 most visited endpoints:")
|
print("Top 10 most visited endpoints:")
|
||||||
for endpoint, count in endpoint_counts:
|
for item in endpoint_counts:
|
||||||
print(f"{endpoint}: {count} hits")
|
endpoint, count = item[0], item[1]
|
||||||
|
print("{0}: {1} hits".format(endpoint, count))
|
||||||
|
|
||||||
sc.stop()
|
sc.stop()
|
||||||
|
Loading…
Reference in New Issue
Block a user