From 60c3a1e4ca33a623d741c1f4b4506df5ba0fe839 Mon Sep 17 00:00:00 2001
From: fly6516
Date: Mon, 14 Apr 2025 02:19:26 +0800
Subject: [PATCH] plot(dailyHosts): plot the number of unique hosts per day
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Extract the date and IP address from each Apache log line
- Count the number of unique hosts per day
- Draw a line chart with Matplotlib
- Add a chart title, axis labels, and a grid
---
 1-4.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 1-4.py

diff --git a/1-4.py b/1-4.py
new file mode 100644
index 0000000..b94e4c3
--- /dev/null
+++ b/1-4.py
@@ -0,0 +1,48 @@
+import re
+import matplotlib.pyplot as plt
+from pyspark import SparkContext
+
+sc = SparkContext.getOrCreate()
+
+# Rebuild the dailyHosts RDD from the previous exercise, starting from the raw log file
+logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
+raw_logs = sc.textFile(logFile)
+
+LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')
+
+def parse_log_line(line):
+    match = LOG_PATTERN.match(line)
+    if not match:
+        return None
+    ip = match.group(1)
+    full_time = match.group(4)
+    day = full_time.split('/')[0]  # e.g. "10" from "10/Oct/2000:13:55:36 -0700"
+    return (day, ip)
+
+date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
+unique_daily_hosts = date_ip_pairs.distinct()
+
+# Count unique hosts per day and cache dailyHosts for reuse
+dailyHosts = unique_daily_hosts \
+    .map(lambda pair: (pair[0], 1)) \
+    .reduceByKey(lambda a, b: a + b) \
+    .cache()
+
+# Sort by day and collect into a Python list
+sorted_daily_hosts = dailyHosts.sortByKey().collect()
+
+# Split into two parallel lists for plotting
+daysWithHosts = [day for (day, _) in sorted_daily_hosts]
+hosts = [count for (_, count) in sorted_daily_hosts]
+
+# Plot the line chart
+plt.figure(figsize=(10, 5))
+plt.plot(daysWithHosts, hosts, marker='o', linestyle='-', color='blue')
+plt.title("Number of Unique Hosts per Day")
+plt.xlabel("Day of Month")
+plt.ylabel("Unique Hosts")
+plt.grid(True)
+plt.tight_layout()
+plt.show()
+
+sc.stop()
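
Note (not part of the patch): the date/IP extraction above can be sanity-checked
locally without Spark or HDFS. The snippet below is a minimal sketch; the sample
log line is made up for illustration and only exercises the same LOG_PATTERN
regex and day/IP extraction that 1-4.py applies to the real dataset.

    import re

    LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')

    # A made-up Common Log Format line, used only to exercise the regex.
    sample = '10.0.0.1 - - [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326'

    match = LOG_PATTERN.match(sample)
    if match:
        day = match.group(4).split('/')[0]   # "10"
        ip = match.group(1)                  # "10.0.0.1"
        print((day, ip))                     # ('10', '10.0.0.1')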