web-analyze/2-5.py
fly6516 9469f76e1a feat: count daily 404 error records
- Add 2-5.py, implementing Apache log parsing and 404 error counting
- Use the Spark computing framework to process large-scale log data
- Extract the date from each log record and count 404 errors per day
- Sort the results by date and print them
2025-04-14 03:49:08 +08:00

import re

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Apache Common Log Format: host identity user [time] "method endpoint protocol" status size
LOG_PATTERN = re.compile(
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'
)
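
# As an illustration only (this sample line is hypothetical, not taken from the
# project log), LOG_PATTERN matches lines such as:
#   127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 404 2326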


def parse_log_line(line):
    """Parse one Apache access-log line into a dict; return None if it does not match."""
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    content_size_str = match.group(9)
    # A missing content size is logged as "-"; treat it as 0
    content_size = int(content_size_str) if content_size_str.isdigit() else 0
    return {
        'ip': match.group(1),
        'user_identity': match.group(2),
        'user_id': match.group(3),
        'timestamp': match.group(4),
        'method': match.group(5),
        'endpoint': match.group(6),
        'protocol': match.group(7),
        'status_code': int(match.group(8)),
        'content_size': content_size
    }
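
# For the hypothetical sample line above, parse_log_line would return:
#   {'ip': '127.0.0.1', 'user_identity': '-', 'user_id': 'frank',
#    'timestamp': '10/Oct/2000:13:55:36 -0700', 'method': 'GET',
#    'endpoint': '/apache_pb.gif', 'protocol': 'HTTP/1.0',
#    'status_code': 404, 'content_size': 2326}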


def extract_day(log):
    # Timestamp format: 10/Oct/2000:13:55:36 -0700
    full_date = log['timestamp']
    # Keep only the day of month, as an int so that sortByKey sorts numerically
    # (note: identical day numbers from different months collapse into one key)
    day = int(full_date.split('/')[0])
    return day


if __name__ == "__main__":
    logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
    raw_logs = sc.textFile(logFile)
    access_logs = raw_logs.map(parse_log_line).filter(lambda x: x is not None).cache()

    # Keep only records with a 404 status code
    error_404_logs = access_logs.filter(lambda log: log['status_code'] == 404).cache()

    # Count 404 errors per day, sorted by day
    errDateSorted = (
        error_404_logs
        .map(lambda log: (extract_day(log), 1))
        .reduceByKey(lambda a, b: a + b)
        .sortByKey(True)
        .cache()
    )

    # Collect the results to the driver and print them
    daily_404_stats = errDateSorted.collect()
    print("Number of 404 error records per day:")
    for day, count in daily_404_stats:
        print("Day {}: {} 404 errors".format(day, count))

    sc.stop()
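
# A minimal way to run this script, assuming a Spark installation whose default
# master can reach the HDFS path above (spark-submit ships with Spark):
#   spark-submit 2-5.py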