"""Plot the number of unique hosts per day from an Apache access log.

Reads the project access log from HDFS, extracts (day-of-month, host) pairs,
counts the distinct hosts seen on each day, and renders the daily counts as
a line chart with matplotlib.
"""
import re

import matplotlib.pyplot as plt
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# HDFS path to the Apache access log analyzed in the previous exercise
# (the dailyHosts RDD built below mirrors the cached RDD from that exercise).
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)

# Apache Common Log Format:
#   host ident user [day/Mon/year:HH:MM:SS zone] "method url proto" status size
LOG_PATTERN = re.compile(
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'
)


def parse_log_line(line):
    """Parse one CLF log line into a (day_of_month, host_ip) tuple.

    Returns None for lines that do not match the pattern so callers can
    filter them out. The day is converted to int so that the downstream
    sortByKey() orders days numerically (a string key would sort
    lexicographically: "1", "10", "11", ..., "2") and so matplotlib
    treats the x-axis as numeric rather than categorical.
    """
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    ip = match.group(1)
    full_time = match.group(4)            # e.g. "10/Oct/1995:00:00:00 -0400"
    day = int(full_time.split('/')[0])    # day of month, e.g. 10
    return (day, ip)


# Drop unparseable lines, then deduplicate so each host counts once per day.
date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
unique_daily_hosts = date_ip_pairs.distinct()

# Count distinct hosts per day; cached because this is the reusable
# dailyHosts RDD from the previous exercise.
dailyHosts = (
    unique_daily_hosts
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda a, b: a + b)
    .cache()
)

# Sort by day and collect the small per-day summary to the driver.
sorted_daily_hosts = dailyHosts.sortByKey().collect()

# Split the (day, count) pairs into two parallel lists for plotting.
daysWithHosts = [day for (day, _) in sorted_daily_hosts]
hosts = [count for (_, count) in sorted_daily_hosts]

# Render the daily unique-host counts as a line chart.
plt.figure(figsize=(10, 5))
plt.plot(daysWithHosts, hosts, marker='o', linestyle='-', color='blue')
plt.title("Number of Unique Hosts per Day")
plt.xlabel("Day of Month")
plt.ylabel("Unique Hosts")
plt.grid(True)
plt.tight_layout()
plt.show()

sc.stop()