plot(dailyHosts): plot the number of unique hosts per day
- Extract dates and IP addresses from the Apache log file
- Compute the number of unique hosts per day
- Draw a line chart with Matplotlib
- Add a chart title, axis labels, and a grid
parent 9e9d18b78e
commit 60c3a1e4ca
1-4.py  48 additions  Normal file
@@ -0,0 +1,48 @@
import re

import matplotlib.pyplot as plt
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Rebuild the dailyHosts RDD that was cached in the previous exercise
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)

# Apache Common Log Format: host identity user [timestamp] "method resource protocol" status size
LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')


def parse_log_line(line):
    """Return a (day, ip) pair for a valid log line, or None if it does not match."""
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    ip = match.group(1)
    full_time = match.group(4)       # e.g. "10/Aug/1995:00:00:01 -0400"
    day = full_time.split('/')[0]    # e.g. "10"
    return (day, ip)


date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
unique_daily_hosts = date_ip_pairs.distinct()

# Count unique hosts per day and cache the result as dailyHosts
dailyHosts = unique_daily_hosts \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .cache()

# Sort by day and collect into a Python list
sorted_daily_hosts = dailyHosts.sortByKey().collect()

# Split into two lists: days and unique-host counts
daysWithHosts = [day for (day, _) in sorted_daily_hosts]
hosts = [count for (_, count) in sorted_daily_hosts]

# Plot the line chart
plt.figure(figsize=(10, 5))
plt.plot(daysWithHosts, hosts, marker='o', linestyle='-', color='blue')
plt.title("Number of Unique Hosts per Day")
plt.xlabel("Day of Month")
plt.ylabel("Unique Hosts")
plt.grid(True)
plt.tight_layout()
plt.show()

sc.stop()
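A possible follow-up, sketched here as an assumption and not part of the committed file: plt.show() needs an interactive display, so if 1-4.py is submitted to a headless cluster node the figure can be written to disk instead. The Agg backend and the output filename below are illustrative choices, and the data lists are dummy stand-ins for the daysWithHosts and hosts built above.

import matplotlib
matplotlib.use("Agg")  # non-interactive backend; select it before importing pyplot
import matplotlib.pyplot as plt

# Dummy stand-ins for daysWithHosts and hosts (illustration only)
daysWithHosts = ["01", "02", "03"]
hosts = [120, 150, 98]

plt.figure(figsize=(10, 5))
plt.plot(daysWithHosts, hosts, marker='o', linestyle='-', color='blue')
plt.title("Number of Unique Hosts per Day")
plt.savefig("daily_unique_hosts.png")  # assumed output filename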