feat: count unique visitors per day
- Read the Apache log file from HDFS
- Parse the logs, extracting the date and IP address
- Deduplicate, then count unique visitors per day
- Sort the results by date and print them
This commit is contained in:
parent bed459e4c5
commit 9e9d18b78e
1-3.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import re

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Regex for the Apache common log format
LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')
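# For illustration, a (hypothetical) record such as
#   127.0.0.1 - - [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326
# matches with group(1) = "127.0.0.1" (the client IP) and
# group(4) = "10/Oct/2000:13:55:36 -0700" (the timestamp).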

def parse_log_line(line):
    match = LOG_PATTERN.match(line)
    if not match:
        return None

    ip = match.group(1)
    # Extract the "day" part of the date string, formatted like: 10/Oct/2000:13:55:36 -0700
    full_time = match.group(4)
    day = int(full_time.split('/')[0])  # 10 (as an int, so days sort numerically rather than lexically)

    return (day, ip)

logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)

# Extract (day, IP) pairs, filtering out lines that failed to parse
date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)

# Deduplicate the (day, ip) pairs
unique_daily_hosts = date_ip_pairs.distinct()

# Count unique hosts per day
daily_host_counts = unique_daily_hosts \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b)

# Cache dailyHosts for later use
dailyHosts = daily_host_counts.cache()

# Sort by date in ascending order and collect the results
sorted_daily_hosts = dailyHosts.sortByKey().collect()

# Print the results
for day, count in sorted_daily_hosts:
    print("Day {0}: {1} unique hosts".format(day, count))

sc.stop()
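
As a quick sanity check (not part of the commit), the same parse-and-count pipeline can be exercised locally with sc.parallelize; the log lines, IPs, and dates below are made up for illustration, and the snippet assumes a local PySpark installation:

import re
from pyspark import SparkContext

# Same pattern and parser as in 1-3.py (abridged)
LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')

def parse_log_line(line):
    match = LOG_PATTERN.match(line)
    if not match:
        return None
    return (int(match.group(4).split('/')[0]), match.group(1))

sc = SparkContext.getOrCreate()

# Hypothetical sample: two distinct hosts on day 10, one (repeated) host on day 11
sample = sc.parallelize([
    '1.1.1.1 - - [10/Oct/2000:13:55:36 -0700] "GET /a HTTP/1.0" 200 100',
    '2.2.2.2 - - [10/Oct/2000:14:01:02 -0700] "GET /b HTTP/1.0" 200 100',
    '1.1.1.1 - - [11/Oct/2000:09:00:00 -0700] "GET /a HTTP/1.0" 200 100',
    '1.1.1.1 - - [11/Oct/2000:09:05:00 -0700] "GET /c HTTP/1.0" 200 100',
])

counts = (sample.map(parse_log_line)
                .filter(lambda x: x is not None)
                .distinct()
                .map(lambda pair: (pair[0], 1))
                .reduceByKey(lambda a, b: a + b)
                .sortByKey()
                .collect())

print(counts)  # expected: [(10, 2), (11, 1)]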