"""Count HTTP 404 responses per hour in an Apache access log and plot them."""

import datetime
import re

import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F

# Regular expression for one line of an Apache Common Log Format access log.
# Groups: host, identd, user, timestamp, method, endpoint, protocol,
# status code, response size.  The protocol group is optional (``\s*(\S*)``)
# so that requests such as ``"GET /"`` are still matched.
APACHE_ACCESS_LOG_PATTERN = (
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] '
    r'"(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
)

# Month abbreviations as they appear in Apache timestamps -> month number.
month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
             'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def parse_apache_time(s):
    """Convert an Apache timestamp string into a naive ``datetime``.

    Args:
        s: Timestamp laid out as ``dd/Mon/yyyy:HH:MM:SS +zzzz``; fields are
            read by fixed character position.

    Returns:
        A ``datetime.datetime`` built from the date and time fields.  The
        trailing UTC offset is not read, so the result carries no timezone.

    Raises:
        KeyError: If the month abbreviation is not in ``month_map``.
        ValueError: If a numeric field is malformed or out of range.
    """
    return datetime.datetime(int(s[7:11]),
                             month_map[s[3:6]],
                             int(s[0:2]),
                             int(s[12:14]),
                             int(s[15:17]),
                             int(s[18:20]))


def parseApacheLogLine(logline):
    """Parse one line in the Apache Common Log format.

    Args:
        logline: A single raw line from the access log.

    Returns:
        ``(Row, 1)`` when the line was parsed successfully, otherwise
        ``(logline, 0)`` so callers can separate good lines from bad ones.
    """
    match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
    if match is None:
        return (logline, 0)

    size_field = match.group(9)
    try:
        # A size of '-' means no body was sent; count it as zero bytes.
        size = int(size_field) if size_field != '-' else 0
        date_time = parse_apache_time(match.group(4))
    except (KeyError, ValueError):
        # The line matched the overall shape but holds an unparsable size or
        # timestamp; report it as a failed line instead of failing the job.
        return (logline, 0)

    return (Row(
        host=match.group(1),
        client_identd=match.group(2),
        user_id=match.group(3),
        date_time=date_time,
        method=match.group(5),
        endpoint=match.group(6),
        protocol=match.group(7),
        response_code=int(match.group(8)),
        content_size=size
    ), 1)


def _decorate_hourly_axes():
    """Apply the title, axis labels, ticks, grid and legend shared by both charts."""
    plt.title("Hourly 404 Response Code Counts")
    plt.xlabel("Hour of the Day")
    plt.ylabel("Count of 404 Responses")
    plt.xticks(range(24))  # one tick for each hour of the day
    plt.grid(True)
    plt.legend()


def main():
    """Read the access log, count 404 responses per hour, and plot the counts."""
    # Create the SparkSession.
    spark = (SparkSession.builder
             .appName("Apache Log Analysis")
             .getOrCreate())

    try:
        # Read the log file.
        logFile = 'apache.access.log.PROJECT'  # replace with your log file path
        rdd = spark.sparkContext.textFile(logFile)

        # Parse every line, then keep only the successfully parsed rows.
        parsed_logs = rdd.map(parseApacheLogLine)
        access_logs = (parsed_logs
                       .filter(lambda s: s[1] == 1)
                       .map(lambda s: s[0])
                       .cache())

        # Schema inference needs at least one row, so stop early when no
        # line could be parsed.
        if access_logs.isEmpty():
            print("No log lines could be parsed from %s" % logFile)
            return

        # The column-based operations below need a DataFrame, not an RDD.
        access_logs_df = spark.createDataFrame(access_logs)

        # Keep only the rows with a 404 response code.
        access_logs_404 = access_logs_df.filter(
            access_logs_df.response_code == 404)

        # Extract the hour of each request.
        access_logs_with_hour = access_logs_404.withColumn(
            "hour", F.hour(access_logs_404.date_time))

        # Count the 404 responses for each hour.
        hourly_404_counts = (access_logs_with_hour
                             .groupBy("hour")
                             .count()
                             .orderBy("hour"))

        # Collect the results on the driver for plotting.
        hourly_counts = hourly_404_counts.collect()
        hours = [row["hour"] for row in hourly_counts]
        counts = [row["count"] for row in hourly_counts]

        # Line chart.
        plt.figure(figsize=(10, 6))
        plt.plot(hours, counts, marker='o', linestyle='-', color='b',
                 label='404 Responses')
        _decorate_hourly_axes()
        plt.show()

        # Bar chart.
        plt.figure(figsize=(10, 6))
        plt.bar(hours, counts, color='orange', label='404 Responses')
        _decorate_hourly_axes()
        plt.show()
    finally:
        # Always release the SparkSession, even if an earlier step failed.
        spark.stop()


if __name__ == "__main__":
    main()