feat: count unique visitors per day

- Read Apache log files from HDFS
- Parse the logs and extract the date and IP address
- Deduplicate, then count unique visitors per day
- Sort the results by date and print them
parent bed459e4c5
commit 9e9d18b78e

1-3.py · 46 lines · new file

@@ -0,0 +1,46 @@
import re

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)')
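
# Illustrative only: a hypothetical Apache Common Log Format line that this
# pattern is meant to match (host, identity, user, timestamp, request, status, size):
#   127.0.0.1 - - [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326
# For such a line, parse_log_line below returns ("10", "127.0.0.1").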


def parse_log_line(line):
    match = LOG_PATTERN.match(line)
    if not match:
        return None

    ip = match.group(1)
    # Extract the day-of-month from the timestamp, formatted like: 10/Oct/2000:13:55:36 -0700
    full_time = match.group(4)
    day = full_time.split('/')[0]  # "10"

    return (day, ip)


logFile = "hdfs://master:9000/user/root/apache.access.log.PROJECT"
raw_logs = sc.textFile(logFile)

# Extract date and IP, filtering out lines that fail to parse
date_ip_pairs = raw_logs.map(parse_log_line).filter(lambda x: x is not None)

# Deduplicate the (day, ip) pairs
unique_daily_hosts = date_ip_pairs.distinct()
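
# What distinct() does at this step, shown on made-up data:
#   [("10", "1.2.3.4"), ("10", "1.2.3.4"), ("10", "5.6.7.8")]
#   -> [("10", "1.2.3.4"), ("10", "5.6.7.8")]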

# Count the unique hosts per day
daily_host_counts = unique_daily_hosts \
    .map(lambda pair: (pair[0], 1)) \
    .reduceByKey(lambda a, b: a + b)
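
# The map/reduceByKey stages on the same made-up data:
#   [("10", "1.2.3.4"), ("10", "5.6.7.8"), ("11", "1.2.3.4")]
#   -> map -> [("10", 1), ("10", 1), ("11", 1)]
#   -> reduceByKey -> [("10", 2), ("11", 1)]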

# Cache dailyHosts for later reuse
dailyHosts = daily_host_counts.cache()

# Sort by date in ascending order and collect the results
sorted_daily_hosts = dailyHosts.sortByKey().collect()
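
# Note (assumption): Common Log Format zero-pads the day ("01".."31"), so the
# lexicographic sortByKey() above matches chronological order within one month.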

# Print the results
for day, count in sorted_daily_hosts:
    print("Day {0}: {1} unique hosts".format(day, count))
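
# Illustrative output shape only (these counts are made up, not real results):
#   Day 01: 2582 unique hosts
#   Day 02: 2345 unique hosts
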
sc.stop()