import re

import matplotlib.pyplot as plt
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Rebuild the dailyHosts RDD that was cached in the previous exercise.
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)

# Apache Common Log Format: host, identity, user, [timestamp], "request", status, size
LOG_PATTERN = re.compile(
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'
)

def parse_log_line(line):
    """Parse one log line into a (day, ip) pair, or None if it does not match."""
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    ip = match.group(1)
    full_time = match.group(4)     # e.g. "10/Aug/1995:00:00:01 -0400"
    day = full_time.split('/')[0]  # day of month, e.g. "10"
    return (day, ip)

# Parse every line and drop the ones that failed to match.
date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
unique_daily_hosts = date_ip_pairs.distinct()

# Count distinct hosts per day and cache dailyHosts for reuse.
dailyHosts = unique_daily_hosts \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .cache()

# Sort by day and collect into a Python list. Days are zero-padded two-digit
# strings ("01".."31") in Common Log Format, so string order equals numeric order.
sorted_daily_hosts = dailyHosts.sortByKey().collect()

# Split into two parallel lists for plotting.
daysWithHosts = [day for (day, _) in sorted_daily_hosts]
hosts = [count for (_, count) in sorted_daily_hosts]

# Plot the number of unique hosts per day.
plt.figure(figsize=(10, 5))
plt.plot(daysWithHosts, hosts, marker='o', linestyle='-', color='blue')
plt.title("Number of Unique Hosts per Day")
plt.xlabel("Day of Month")
plt.ylabel("Unique Hosts")
plt.grid(True)
plt.tight_layout()
plt.show()

sc.stop()
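
# --- Optional sanity check: a minimal sketch that exercises parse_log_line on a
# hypothetical Common Log Format entry (the sample line below is illustrative,
# not taken from apache.access.log.PROJECT). It is pure Python with no Spark
# dependency, so it can run before the job or after sc.stop().
sample = ('in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] '
          '"GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839')
assert parse_log_line(sample) == ('01', 'in24.inetnebr.com')  # (day, ip)
assert parse_log_line('not a log line') is None               # non-matching lines yield None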