web-analyze/2-1.py
fly6516 beb743f10d feat: add log parsing and 404 error counting
- Add log-parsing function parse_log_line for parsing Apache access logs
- Add filter_404 function for filtering 404 errors
- Implement the full pipeline of reading logs from HDFS, parsing, filtering, and counting 404 errors
- Print the number of 404 error records
2025-04-14 03:29:09 +08:00

import re
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)
# Regex for the Apache access log format (Common Log Format)
LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')
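# Capture groups: 1 ip, 2 user identity, 3 user id, 4 timestamp,
# 5 method, 6 endpoint, 7 protocol, 8 status code, 9 content size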
# Parse one log line into a dict of fields; return None if the line does not match
def parse_log_line(line):
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    # The size field may be "-" for responses without a body; treat it as 0
    content_size_str = match.group(9)
    content_size = int(content_size_str) if content_size_str.isdigit() else 0
    return {
        'ip': match.group(1),
        'user_identity': match.group(2),
        'user_id': match.group(3),
        'timestamp': match.group(4),
        'method': match.group(5),
        'endpoint': match.group(6),
        'protocol': match.group(7),
        'status_code': int(match.group(8)),
        'content_size': content_size
    }
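# Sanity check on a hypothetical sample line (assumption: illustrative, not taken from the real log)
_sample = '127.0.0.1 - - [01/Aug/1995:00:00:01 -0400] "GET /index.html HTTP/1.0" 404 -'
assert parse_log_line(_sample)['status_code'] == 404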
# Keep only log records with a 404 response code
def filter_404(log):
    return log['status_code'] == 404
# Parse the raw lines and drop any that failed to match the pattern
parsed_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
# Filter out the 404 records and cache them, since they are reused below
error_404_logs = parsed_logs.filter(filter_404).cache()
# Count the 404 errors
count_404 = error_404_logs.count()
# Print the result (using .format instead of an f-string)
print("The log contains {} records with a 404 response code.".format(count_404))
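# Optional sketch, not part of the original commit: a common follow-up is to list
# the endpoints that produce the most 404s, reusing the cached error_404_logs RDD.
top_404_endpoints = (error_404_logs
                     .map(lambda log: (log['endpoint'], 1))
                     .reduceByKey(lambda a, b: a + b)
                     .takeOrdered(10, key=lambda pair: -pair[1]))
print(top_404_endpoints)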
sc.stop()