104 lines
3.2 KiB
Python
104 lines
3.2 KiB
Python
|
# -*- coding: utf-8 -*-
|
|||
|
from __future__ import print_function
|
|||
|
import pandas as pd
|
|||
|
from collections import defaultdict
|
|||
|
|
|||
|
|
|||
|
class FPNode:
    """One node of an FP-tree.

    Attributes:
        item: The item this node stands for (``None`` by default, which is
            how the tree root is created elsewhere in this module).
        count: Counter stored on the node.
        parent: The node directly above this one, or ``None``.
        children: Dict of child nodes; the tree-building code keys it by item.
        next: Link to another node, initially ``None``; the header-table code
            uses it to chain nodes that carry the same item.
    """

    def __init__(self, item=None, count=0, parent=None):
        # Values supplied by the caller.
        self.item, self.count, self.parent = item, count, parent
        # Links that always start out empty and are filled in while the
        # tree is being built.
        self.next = None
        self.children = dict()
|
|||
|
|
|||
|
|
|||
|
def build_fp_tree(data, min_support):
    """Build an FP-tree and its header table from a collection of transactions.

    Args:
        data: Iterable of transactions, each an iterable of hashable items.
            It may be a one-shot iterable (e.g. a generator); it is consumed
            exactly once.
        min_support: Minimum number of transactions an item must occur in to
            be kept (an absolute count, compared with ``>=``).

    Returns:
        ``(root, header_table)`` where ``header_table`` maps each frequent
        item to ``[support, first_node]``, or ``(None, None)`` when no item
        reaches ``min_support``.
    """
    # First pass: count supports while materializing the transactions, so a
    # generator is not exhausted before the tree is built. Repeated items
    # inside one transaction are dropped (keeping first-seen order) because
    # support counts transactions, not occurrences.
    support = defaultdict(int)
    transactions = []
    for transaction in data:
        seen = set()
        unique_items = []
        for item in transaction:
            if item not in seen:
                seen.add(item)
                unique_items.append(item)
                support[item] += 1
        transactions.append(unique_items)

    # Keep only the items that meet the minimum support; each header entry
    # is [support, head of the node-link chain].
    header_table = {k: [v, None] for k, v in support.items() if v >= min_support}
    if not header_table:
        return None, None

    # One global rank (descending support, ties broken by header order).
    # Sorting every transaction by this rank guarantees that items with equal
    # support always appear in the same relative order; sorting by the count
    # alone would leave ties in per-transaction order and split identical
    # itemsets across different branches.
    by_support = sorted(header_table, key=lambda k: header_table[k][0], reverse=True)
    rank = {item: position for position, item in enumerate(by_support)}

    # Second pass: insert the frequent part of every transaction.
    root = FPNode()
    for unique_items in transactions:
        frequent_items = [item for item in unique_items if item in rank]
        if frequent_items:
            frequent_items.sort(key=rank.__getitem__)
            update_fp_tree(frequent_items, root, header_table)
    return root, header_table
|
|||
|
|
|||
|
|
|||
|
def update_fp_tree(items, node, header_table):
    """Insert one ordered transaction into the FP-tree below ``node``.

    Walks down from ``node`` one item at a time. An existing child has its
    count incremented; a missing child is created with count 1 and appended
    to that item's chain in ``header_table``.

    Args:
        items: Sequence of items, already filtered and ordered by the caller.
            An empty sequence is a no-op.
        node: Tree node under which the path is inserted.
        header_table: Mapping ``item -> [support, first_node]``; only touched
            when a new node has to be created.
    """
    # Iterative descent: no recursion-depth limit on long transactions and
    # no repeated `items[1:]` copies.
    current = node
    for item in items:
        child = current.children.get(item)
        if child is None:
            child = FPNode(item=item, count=1, parent=current)
            current.children[item] = child
            update_header_table(header_table, item, child)
        else:
            child.count += 1
        current = child
|
|||
|
|
|||
|
|
|||
|
def update_header_table(header_table, item, target_node):
    """Append ``target_node`` to the node chain kept for ``item``.

    ``header_table[item]`` is a two-element list whose second slot holds the
    first node of the chain. When that slot is still ``None`` the new node
    becomes the first node; otherwise the chain is followed through the
    ``next`` links and the new node is attached after the last one.
    """
    entry = header_table[item]
    head = entry[1]
    if head is None:
        # First node seen for this item.
        entry[1] = target_node
        return

    # Walk to the end of the existing chain and hook the node on there.
    tail = head
    while tail.next:
        tail = tail.next
    tail.next = target_node
|
|||
|
|
|||
|
|
|||
|
def mine_fp_tree(header_table, prefix, min_support, frequent_itemsets):
    """Recursively collect frequent itemsets from a header table.

    Items are visited in ascending order of the support stored in
    ``header_table``. For each item, ``prefix`` plus that item is appended to
    ``frequent_itemsets``; the item's prefix paths are then turned into a
    conditional tree, and mining continues on that tree when it has a
    non-empty header table.

    Args:
        header_table: Mapping ``item -> [support, first_node]``.
        prefix: Set of items already fixed on this branch of the recursion;
            it is copied, never modified.
        min_support: Threshold passed on to ``build_fp_tree``.
        frequent_itemsets: Output list, extended in place.
    """
    ascending = sorted(header_table, key=lambda key: header_table[key][0])
    for base in ascending:
        # Each visited item yields the itemset "prefix + item".
        itemset = prefix.copy()
        itemset.add(base)
        frequent_itemsets.append(itemset)

        # Build the conditional tree for this item; only its header table
        # is needed to keep mining.
        pattern_bases = find_prefix_paths(base, header_table)
        _, sub_header = build_fp_tree(pattern_bases, min_support)
        if not sub_header:
            continue
        mine_fp_tree(sub_header, itemset, min_support, frequent_itemsets)
|
|||
|
|
|||
|
|
|||
|
def find_prefix_paths(base_item, header_table):
    """Collect the conditional pattern base of ``base_item``.

    Follows the chain of nodes for ``base_item`` that starts at
    ``header_table[base_item][1]`` and, for every node, gathers the items on
    the way up to the root with ``ascend_tree``. Nodes with no ancestors
    contribute nothing.

    Each prefix path is emitted ``node.count`` times, so a consumer that
    counts one occurrence per path (such as ``build_fp_tree``) sees the real
    support of the path instead of 1. Without this weighting, itemsets shared
    by several transactions are under-counted and lost.

    Args:
        base_item: Item whose prefix paths are wanted.
        header_table: Mapping ``item -> [support, first_node]``.

    Returns:
        List of prefix paths (lists of items, nearest ancestor first).
        Repeated entries for one node are the same list object, so treat the
        paths as read-only.
    """
    paths = []
    node = header_table[base_item][1]
    while node is not None:
        path = []
        ascend_tree(node, path)
        if path:
            # Weight the path by the number of transactions that pass
            # through this node.
            paths.extend([path] * node.count)
        node = node.next
    return paths
|
|||
|
|
|||
|
|
|||
|
def ascend_tree(node, path):
    """Append the items of all ancestors of ``node`` to ``path``.

    Climbs the ``parent`` links starting at ``node``'s parent and stops at
    the first ancestor whose ``item`` is ``None`` (the tree root) or when
    there is no parent. ``node``'s own item is not included.

    Args:
        node: Node to start from.
        path: List that receives the ancestor items, nearest ancestor first;
            modified in place.
    """
    # Compare against None explicitly: a truthiness test would wrongly end
    # the climb at legitimate falsy items such as 0 or "".
    parent = node.parent
    while parent is not None and parent.item is not None:
        path.append(parent.item)
        parent = parent.parent
|
|||
|
|
|||
|
|
|||
|
def find_frequent_itemsets(data, min_support):
    """Entry point: mine frequent itemsets from ``data`` with FP-Growth.

    Builds the FP-tree for ``data`` and, when a tree was produced, mines it
    starting from an empty prefix.

    Args:
        data: Transactions to mine, passed straight to ``build_fp_tree``.
        min_support: Support threshold used for both building and mining.

    Returns:
        List of itemsets (sets of items); empty when no tree could be built.
    """
    tree_root, header = build_fp_tree(data, min_support)
    found = []
    if tree_root:
        mine_fp_tree(header, set(), min_support, found)
    return found
|