import re

import matplotlib.pyplot as plt
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Rebuild the dailyHosts RDD that was cached in the previous exercise.
logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)

# Apache Common Log Format: host, identity, user, [timestamp], "request", status, size
LOG_PATTERN = re.compile(
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'
)

def parse_log_line(line):
    """Parse one log line into a (day, ip) pair, or None if it does not match."""
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    ip = match.group(1)
    full_time = match.group(4)     # e.g. "10/Aug/1995:00:00:01 -0400"
    day = full_time.split('/')[0]  # day of month, e.g. "10"
    return (day, ip)

# Parse every line and drop the ones that failed to match.
date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)
unique_daily_hosts = date_ip_pairs.distinct()

# Count distinct hosts per day and cache dailyHosts for reuse.
dailyHosts = unique_daily_hosts \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .cache()

# Sort by day and collect into a Python list. Days are zero-padded two-digit
# strings ("01".."31") in Common Log Format, so string order equals numeric order.
sorted_daily_hosts = dailyHosts.sortByKey().collect()

# Split into two parallel lists for plotting.
daysWithHosts = [day for (day, _) in sorted_daily_hosts]
hosts = [count for (_, count) in sorted_daily_hosts]

# Plot the number of unique hosts per day.
plt.figure(figsize=(10, 5))
plt.plot(daysWithHosts, hosts, marker='o', linestyle='-', color='blue')
plt.title("Number of Unique Hosts per Day")
plt.xlabel("Day of Month")
plt.ylabel("Unique Hosts")
plt.grid(True)
plt.tight_layout()
plt.show()

sc.stop()
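
# --- Optional sanity check: a minimal sketch that exercises parse_log_line on a
# hypothetical Common Log Format entry (the sample line below is illustrative,
# not taken from apache.access.log.PROJECT). It is pure Python with no Spark
# dependency, so it can run before the job or after sc.stop().
sample = ('in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] '
          '"GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839')
assert parse_log_line(sample) == ('01', 'in24.inetnebr.com')  # (day, ip)
assert parse_log_line('not a log line') is None               # non-matching lines yield None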