- Add an implementation of the Apriori algorithm for mining association rules
- Add an implementation of the FP-Growth algorithm for mining frequent itemsets
- Add the corresponding data preprocessing and result-saving code
- Restructure the code to improve readability and maintainability
104 lines · 3.2 KiB · Python
# -*- coding: utf-8 -*-
from __future__ import print_function

import pandas as pd
from collections import defaultdict


class FPNode:
    def __init__(self, item=None, count=0, parent=None):
        self.item = item        # item stored at this node (None for the root)
        self.count = count      # number of transactions sharing this path
        self.parent = parent    # parent node, used to ascend prefix paths
        self.children = {}      # item -> child FPNode
        self.next = None        # next node holding the same item (header-table chain)


def build_fp_tree(data, min_support):
    # Build the FP-tree: the first pass counts the support of every item
    header_table = defaultdict(int)
    for transaction in data:
        for item in transaction:
            header_table[item] += 1

    # Drop items that do not meet the minimum support
    header_table = {k: v for k, v in header_table.items() if v >= min_support}
    if not header_table:
        return None, None

    # Initialise the header table: item -> [support count, head of the node-link chain]
    for k in header_table:
        header_table[k] = [header_table[k], None]

    root = FPNode()
    for transaction in data:
        filtered_items = [item for item in transaction if item in header_table]
        if filtered_items:
            # Sort by descending support, breaking ties on the item itself so that
            # every transaction uses the same global item order
            filtered_items.sort(key=lambda x: (header_table[x][0], x), reverse=True)
            update_fp_tree(filtered_items, root, header_table)
    return root, header_table


def update_fp_tree(items, node, header_table):
    # Insert one sorted transaction into the tree, reusing shared prefixes
    if items[0] in node.children:
        node.children[items[0]].count += 1
    else:
        new_node = FPNode(item=items[0], count=1, parent=node)
        node.children[items[0]] = new_node
        update_header_table(header_table, items[0], new_node)
    if len(items) > 1:
        update_fp_tree(items[1:], node.children[items[0]], header_table)


def update_header_table(header_table, item, target_node):
    # Append the new node to the end of the item's node-link chain
    if header_table[item][1] is None:
        header_table[item][1] = target_node
    else:
        current = header_table[item][1]
        while current.next:
            current = current.next
        current.next = target_node


def mine_fp_tree(header_table, prefix, min_support, frequent_itemsets):
    # Mine frequent itemsets from the FP-tree, starting with the least frequent items
    sorted_items = [item[0] for item in sorted(header_table.items(), key=lambda x: x[1][0])]
    for item in sorted_items:
        new_prefix = prefix.copy()
        new_prefix.add(item)
        frequent_itemsets.append(new_prefix)
        # Build the item's conditional pattern base and recurse on its conditional FP-tree
        conditional_pattern_bases = find_prefix_paths(item, header_table)
        conditional_fp_tree, conditional_header_table = build_fp_tree(conditional_pattern_bases, min_support)
        if conditional_header_table:
            mine_fp_tree(conditional_header_table, new_prefix, min_support, frequent_itemsets)


def find_prefix_paths(base_item, header_table):
    # Collect the conditional pattern base: every prefix path leading to base_item
    paths = []
    node = header_table[base_item][1]
    while node:
        path = []
        ascend_tree(node, path)
        if path:
            # Repeat each prefix path according to the node's count so that
            # supports in the conditional FP-tree are counted correctly
            paths.extend(list(path) for _ in range(node.count))
        node = node.next
    return paths


def ascend_tree(node, path):
    # Walk from a node up towards the root, collecting ancestor items (the root itself is skipped)
    while node.parent is not None and node.parent.item is not None:
        path.append(node.parent.item)
        node = node.parent


def find_frequent_itemsets(data, min_support):
    # Entry point: mine all frequent itemsets with the FP-Growth algorithm
    root, header_table = build_fp_tree(data, min_support)
    if not root:
        return []
    frequent_itemsets = []
    mine_fp_tree(header_table, set(), min_support, frequent_itemsets)
    return frequent_itemsets
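
The snippet below is a minimal usage sketch, not part of the original file: the sample transactions, the min_support value of 2, and the `__main__` guard are assumptions added only to show how `find_frequent_itemsets` is called.

if __name__ == "__main__":
    # Hypothetical example data: each inner list is one transaction
    transactions = [
        ["milk", "bread", "butter"],
        ["bread", "butter"],
        ["milk", "bread"],
        ["milk", "bread", "butter", "eggs"],
    ]
    # An itemset is reported as frequent if it occurs in at least 2 transactions
    itemsets = find_frequent_itemsets(transactions, min_support=2)
    for itemset in itemsets:
        print(itemset)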