From 2f47d1d55bc0fae6df6d3f56cbb5973d9d86859a Mon Sep 17 00:00:00 2001
From: fly6516
Date: Mon, 14 Apr 2025 02:31:43 +0800
Subject: [PATCH] feat: add 1-6.py to compute the daily average request count and plot it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add 1-6.py, which implements the following:
  - read the log file from HDFS
  - parse the logs to get daily request counts and unique host counts
  - compute the average number of requests per host for each day
  - plot the result as a line chart with Matplotlib
- The script prepares the data for the follow-up 1-7, which draws the average-request line chart
---
 1-6.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 1-6.py

diff --git a/1-6.py b/1-6.py
new file mode 100644
index 0000000..8c54b9b
--- /dev/null
+++ b/1-6.py
@@ -0,0 +1,70 @@
+import re
+import matplotlib.pyplot as plt
+from pyspark import SparkContext
+
+sc = SparkContext.getOrCreate()
+
+logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
+raw_logs = sc.textFile(logFile)
+
+LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')
+
+# Parse the day and the requesting host from a log line
+def parse_day_ip(line):
+    match = LOG_PATTERN.match(line)
+    if not match:
+        return None
+    ip = match.group(1)
+    full_time = match.group(4)
+    day = full_time.split('/')[0]  # day of month, e.g. "10"
+    return (day, ip)
+
+# 1. Total requests per day
+def parse_day(line):
+    match = LOG_PATTERN.match(line)
+    if not match:
+        return None
+    full_time = match.group(4)
+    day = full_time.split('/')[0]
+    return (day, 1)
+
+# Total number of requests per day
+dailyRequests = raw_logs.map(parse_day).filter(lambda x: x is not None) \
+    .reduceByKey(lambda a, b: a + b)
+
+# Number of distinct hosts per day (same logic as in 1.3)
+day_ip_pairs = raw_logs.map(parse_day_ip).filter(lambda x: x is not None)
+dailyUniqueHosts = day_ip_pairs.distinct() \
+    .map(lambda x: (x[0], 1)) \
+    .reduceByKey(lambda a, b: a + b)
+
+# 2. Join the two RDDs and compute the average (integer division)
+dailyReqJoinHost = dailyRequests.join(dailyUniqueHosts)
+
+avgDailyReqPerHost = dailyReqJoinHost.map(
+    lambda x: (x[0], x[1][0] // x[1][1])  # (day, total_requests // unique_hosts)
+).sortByKey().cache()  # cache the result for reuse in later steps
+
+# Collect the data for plotting
+daysWithAvg = []
+avgs = []
+
+for day, avg in avgDailyReqPerHost.collect():
+    daysWithAvg.append(day)
+    avgs.append(avg)
+
+# Draw the line chart
+plt.figure(figsize=(10, 6))
+plt.plot(daysWithAvg, avgs, marker='o', linestyle='-', color='b', label='Average Requests per Host')
+
+plt.xlabel('Day')
+plt.ylabel('Average Requests per Host')
+plt.title('Average Daily Requests per Host')
+
+plt.xticks(rotation=45)
+plt.tight_layout()
+plt.legend()
+
+plt.show()
+
+sc.stop()
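
Not part of the patch: a minimal local sanity check of LOG_PATTERN and the per-day average, assuming Common Log Format input. The sample lines and their values below are invented for illustration; the real input is the HDFS file referenced in 1-6.py, and no Spark or HDFS is needed here.

# Local sanity check (plain Python): same regex and the same per-day
# average computation as 1-6.py, on a few made-up sample lines.
import re
from collections import defaultdict

LOG_PATTERN = re.compile(
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'
)

# Hypothetical sample lines for illustration only.
sample_lines = [
    '10.0.0.1 - - [10/Aug/1995:00:00:01 -0400] "GET /index.html HTTP/1.0" 200 1839',
    '10.0.0.1 - - [10/Aug/1995:00:01:07 -0400] "GET /images/logo.gif HTTP/1.0" 200 786',
    '10.0.0.2 - - [11/Aug/1995:08:12:33 -0400] "GET /docs/a.html HTTP/1.0" 404 -',
]

requests_per_day = defaultdict(int)   # day -> total request count
hosts_per_day = defaultdict(set)      # day -> set of distinct hosts

for line in sample_lines:
    match = LOG_PATTERN.match(line)
    if not match:
        continue
    host = match.group(1)
    day = match.group(4).split('/')[0]   # day of month, e.g. "10"
    requests_per_day[day] += 1
    hosts_per_day[day].add(host)

for day in sorted(requests_per_day):
    # Same integer division as the Spark job: total requests // unique hosts
    avg = requests_per_day[day] // len(hosts_per_day[day])
    print(day, avg)   # expected output: "10 2" then "11 1"

Because the Spark job uses integer division (//), fractional averages are truncated in the plotted values; switching to true division (/) in avgDailyReqPerHost would preserve them for the 1-7 chart if exact values are wanted.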