Patch for 1-1.py (Spark access-log script), two hunks.

Summary of what the hunks below change:
  1. The map/filter pipeline is assigned to parsed_logs, and access_logs
     becomes parsed_logs.cache() on its own line.
  2. The emptiness guard access_logs.isEmpty() becomes
     access_logs.count() == 0; the added code comment (in Chinese) calls
     this a compatibility form. The message printed for the empty case
     is also changed.
  3. The print loop stops using tuple unpacking in the for-header and an
     f-string; it indexes item[0] / item[1] and uses str.format().

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Blank context lines and continuation-line
indentation were reconstructed to satisfy the hunk line counts; confirm
them against 1-1.py before applying.

diff --git a/1-1.py b/1-1.py
index b27440b..4c12a5f 100644
--- a/1-1.py
+++ b/1-1.py
@@ -29,11 +29,13 @@ def parse_log_line(line):
 
 logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
 raw_logs = sc.textFile(logFile)
-access_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None).cache()
 
-# 加入一个保护,防止 access_logs 空时报错
-if access_logs.isEmpty():
-    print("日志文件为空或解析失败")
+parsed_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
+access_logs = parsed_logs.cache()
+
+# 判断是否为空 RDD(兼容性写法:count() == 0)
+if access_logs.count() == 0:
+    print("日志文件为空或格式不匹配")
 else:
     endpoint_counts = (access_logs
                        .map(lambda log: (log['endpoint'], 1))
@@ -42,7 +44,8 @@ else:
                        .take(10))
 
     print("Top 10 most visited endpoints:")
-    for endpoint, count in endpoint_counts:
-        print(f"{endpoint}: {count} hits")
+    for item in endpoint_counts:
+        endpoint, count = item[0], item[1]
+        print("{0}: {1} hits".format(endpoint, count))
 
 sc.stop()