feat: add a C4.5 decision tree experiment on the Iris dataset
- Fetch the Iris dataset from the UCI Machine Learning Repository
- Implement entropy and information-gain calculation functions
- Define functions to choose the best feature, build the decision tree, and make predictions
- Merge the feature and target data and build the decision tree
- Print the decision tree structure and run a prediction test
parent 1d2138e0d2
commit 24bdddcc63
iris_c45_experiment.py (new file, +91 lines)
@@ -0,0 +1,91 @@
# Import the required libraries
from ucimlrepo import fetch_ucirepo  # fetches datasets from the UCI Machine Learning Repository
import pandas as pd  # data handling
import math  # math utilities (log2)

# fetch dataset
iris = fetch_ucirepo(id=53)  # fetch the Iris dataset

# data (as pandas dataframes)
X = iris.data.features  # feature data
y = iris.data.targets  # target data

# metadata
print(iris.metadata)  # print the dataset's metadata

# variable information
print(iris.variables)  # print the dataset's variable information

# Function to compute entropy
def entropy(data):
    # Compute the entropy of a series of labels
    label_counts = data.value_counts()  # count of each label
    probabilities = label_counts / len(data)  # probability of each label
    return -sum(probabilities * probabilities.apply(math.log2))  # H = -sum(p * log2(p))

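# Quick sanity check (illustrative, with made-up labels): a balanced
# two-class series carries exactly 1 bit of entropy.
assert entropy(pd.Series(['a', 'a', 'b', 'b'])) == 1.0
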
# Function to compute information gain
def information_gain(data, feature, target):
    # Information gain obtained by splitting `data` on `feature`
    total_entropy = entropy(data[target])  # entropy of the target column
    weighted_entropy = 0  # weighted entropy after the split
    for value in data[feature].unique():  # iterate over all observed feature values
        subset = data[data[feature] == value]  # rows where feature == value
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])  # accumulate weighted entropy
    return total_entropy - weighted_entropy  # information gain

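# Quick sanity check (illustrative toy data, not from the Iris set): a feature
# that perfectly separates two balanced classes recovers the full 1 bit.
_toy = pd.DataFrame({'f': [0, 0, 1, 1], 't': ['a', 'a', 'b', 'b']})
assert information_gain(_toy, 'f', 't') == 1.0
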
# Function to choose the best feature
def choose_best_feature(data, features, target):
    # Pick the feature with the highest information gain
    best_feature = None  # best feature so far
    max_gain = -1  # highest information gain so far
    for feature in features:  # iterate over all features
        gain = information_gain(data, feature, target)  # information gain of this feature
        if gain > max_gain:  # found a better feature
            max_gain = gain  # update the highest gain
            best_feature = feature  # update the best feature
    return best_feature  # return the best feature

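# Note: C4.5 proper ranks features by gain ratio (information gain normalised
# by the entropy of the split itself), whereas raw information gain is ID3's
# criterion. A minimal sketch of the gain-ratio criterion, reusing the helpers
# above (split_information and gain_ratio are hypothetical names, not part of
# this file):
def split_information(data, feature):
    # Entropy of the partition that `feature` itself induces on the data
    proportions = data[feature].value_counts() / len(data)
    return -sum(proportions * proportions.apply(math.log2))

def gain_ratio(data, feature, target):
    # Gain ratio = information gain / split information
    split_info = split_information(data, feature)
    if split_info == 0:  # feature takes a single value; splitting is useless
        return 0
    return information_gain(data, feature, target) / split_info
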
# Function to build the decision tree
def build_tree(data, features, target):
    # Recursively build the decision tree
    labels = data[target]  # target column for this node
    if len(labels.unique()) == 1:  # all labels are identical
        return labels.iloc[0]  # leaf: return that label
    if len(features) == 0:  # no features left to split on
        return labels.mode()[0]  # leaf: return the majority label
    best_feature = choose_best_feature(data, features, target)  # choose the split feature
    tree = {best_feature: {}}  # internal node: {feature: {value: subtree}}
    for value in data[best_feature].unique():  # one branch per observed feature value
        subset = data[data[best_feature] == value]  # rows where best_feature == value
        remaining_features = [f for f in features if f != best_feature]  # drop the used feature
        subtree = build_tree(subset, remaining_features, target)  # recursively build the subtree
        tree[best_feature][value] = subtree  # attach the subtree to this branch
    return tree  # return the decision tree

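# The returned tree is a nested dict keyed first by feature name, then by
# feature value. Since the Iris measurements are continuous, every distinct
# observed value gets its own branch, so the dict is very wide, e.g.
# (illustrative shape only): {'petal width': {0.2: 'Iris-setosa', 1.8: {...}}}
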
# Function to make predictions
def predict(tree, sample):
    # Predict a sample's class with the decision tree
    feature = list(tree.keys())[0]  # feature tested at the current node
    value = sample[feature]  # the sample's value for that feature
    subtree = tree[feature][value]  # follow the matching branch
    if isinstance(subtree, dict):  # internal node
        return predict(subtree, sample)  # recurse into the subtree
    else:  # leaf node
        return subtree  # return the predicted label

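# Caveat: tree[feature][value] raises a KeyError for any feature value that
# never appeared in the training data, so predicting unseen samples would need
# a fallback (e.g. returning the majority class at that node).
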
# Merge the feature and target data
data = pd.concat([X, y], axis=1)  # features and target side by side
data.columns = list(X.columns) + ['target']  # rename the target column to 'target'

# Build the decision tree
features = list(X.columns)  # list of feature names
target = 'target'  # name of the target column
tree = build_tree(data, features, target)  # build the decision tree

# Print the decision tree
print(tree)  # nested-dict representation of the tree

# Test a prediction
sample = data.iloc[0].drop('target')  # use the first sample as a test case
prediction = predict(tree, sample)  # run the prediction
print(f'Predicted class: {prediction}')  # print the predicted class
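
# A possible extension (sketch): score the tree on the training data itself.
# Because the tree branches on every distinct feature value, it effectively
# memorises the training set, so this accuracy is optimistic and says nothing
# about generalisation.
correct = sum(
    predict(tree, row.drop('target')) == row['target']
    for _, row in data.iterrows()
)
print(f'Training accuracy: {correct / len(data):.2%}')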