web-analyze/1-6.py
fly6516 2f47d1d55b feat: 添加 1-6.py 以计算每日平均请求量并绘图
- 新增 1-6.py 文件,实现以下功能:
  - 从 HDFS读取日志文件
  - 解析日志以获取每日请求数和唯一主机数
  - 计算每日平均请求量
  - 使用 Matplotlib 绘制折线图展示结果
- 该脚本为后续的1-7 绘制平均请求量折线图提供了数据准备
2025-04-14 02:31:43 +08:00

71 lines
1.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import matplotlib.pyplot as plt
from pyspark import SparkContext
# Regex for one access-log line (nine capture groups). The parsers in this
# file use group 1 (the first whitespace-delimited token, treated as the
# host) and group 4 (the bracketed timestamp ending in a signed four-digit
# offset).
LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')

# HDFS location of the access log to analyse.
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"

# Obtain the SparkContext (an existing one is reused, otherwise one is created).
sc = SparkContext.getOrCreate()

# RDD of the raw text lines of the log file.
raw_logs = sc.textFile(logFile)
# Parse the day and the host out of a log line.
def parse_day_ip(line):
    """Extract a ``(day, host)`` pair from one access-log line.

    Args:
        line: A single raw line of the log.

    Returns:
        ``(day, host)``, where ``host`` is capture group 1 of ``LOG_PATTERN``
        and ``day`` is the text before the first ``/`` of the bracketed
        timestamp (e.g. ``"10"``); ``None`` when the line does not match
        ``LOG_PATTERN``.
    """
    parsed = LOG_PATTERN.match(line)
    if parsed is None:
        return None
    host, timestamp = parsed.group(1, 4)
    # Only the leading timestamp field is kept, so the key is a string and
    # carries nothing after the first '/' (presumably month/year/time).
    leading_field, _, _ = timestamp.partition('/')
    return (leading_field, host)
# Step 1: total number of requests per day.
def parse_day(line):
    """Turn one access-log line into a ``(day, 1)`` counting pair.

    Args:
        line: A single raw line of the log.

    Returns:
        ``(day, 1)``, where ``day`` is the text before the first ``/`` of the
        bracketed timestamp (capture group 4 of ``LOG_PATTERN``); ``None``
        when the line does not match ``LOG_PATTERN``.
    """
    hit = LOG_PATTERN.match(line)
    if hit is None:
        return None
    # The constant 1 lets a later reduceByKey sum the requests per day.
    return (hit.group(4).partition('/')[0], 1)
# Parse the log once into (day, host) pairs and cache them. Both aggregates
# below are derived from this RDD, so the file does not have to be re-read
# and re-matched against the regex for each of them.
day_ip_pairs = raw_logs.map(parse_day_ip) \
    .filter(lambda pair: pair is not None) \
    .cache()

# Total requests per day: every parsed line contributes 1 to its day.
# parse_day_ip and parse_day return None for exactly the same lines (only
# when LOG_PATTERN fails to match), so these counts equal the ones obtained
# by mapping parse_day over raw_logs.
dailyRequests = day_ip_pairs \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b)

# Distinct hosts per day: de-duplicate the (day, host) pairs, then count the
# remaining pairs per day.
dailyUniqueHosts = day_ip_pairs.distinct() \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b)

# Step 2: join the two per-day RDDs -> (day, (total_requests, unique_hosts)).
dailyReqJoinHost = dailyRequests.join(dailyUniqueHosts)

# Average requests per host for each day, using integer (floor) division on
# purpose. Day keys are strings, so sortByKey orders them lexicographically.
# The result is cached so later steps can reuse it without recomputation.
avgDailyReqPerHost = dailyReqJoinHost.map(
    lambda x: (x[0], x[1][0] // x[1][1])  # (day, total_requests // unique_hosts)
).sortByKey().cache()
# Collect the (day, average) pairs on the driver and plot them. The whole
# step runs under try/finally so the SparkContext is stopped even when
# collecting or plotting raises.
try:
    dailyAverages = avgDailyReqPerHost.collect()
    daysWithAvg = [day for day, _ in dailyAverages]
    avgs = [avg for _, avg in dailyAverages]

    # Line chart: one point per day, x = day key, y = average requests per host.
    plt.figure(figsize=(10, 6))
    plt.plot(daysWithAvg, avgs, marker='o', linestyle='-', color='b', label='Average Requests per Host')
    plt.xlabel('Day')
    plt.ylabel('Average Requests per Host')
    plt.title('Average Daily Requests per Host')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend()
    plt.show()
finally:
    # Always release the Spark resources held by this script.
    sc.stop()