"""Decision-tree experiment on the UCI Iris dataset.

Fetches the Iris dataset from the UCI ML repository, builds a decision tree
by recursive information-gain splitting, prints the tree, and classifies the
first sample as a smoke test.

NOTE(review): despite the "C4.5" commit title, this implements the ID3
criterion — plain information gain with one branch per distinct feature
value.  True C4.5 would use gain ratio and threshold splits; Iris features
are continuous floats, so each unique value becomes its own branch here and
the tree effectively memorizes the training data.  Confirm intent.
"""

import math

import pandas as pd


def entropy(labels):
    """Return the Shannon entropy (base 2) of a pandas Series of labels."""
    probabilities = labels.value_counts(normalize=True)
    return -sum(p * math.log2(p) for p in probabilities)


def information_gain(data, feature, target):
    """Return the information gain of splitting `data` on `feature`.

    Gain = H(target) - sum over feature values of (subset weight) * H(subset target).
    """
    total_entropy = entropy(data[target])
    weighted_entropy = 0.0
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])
    return total_entropy - weighted_entropy


def choose_best_feature(data, features, target):
    """Return the feature in `features` with the highest information gain.

    Returns None when `features` is empty (gain can never beat the initial -1).
    """
    best_feature = None
    max_gain = -1
    for feature in features:
        gain = information_gain(data, feature, target)
        if gain > max_gain:
            max_gain = gain
            best_feature = feature
    return best_feature


def build_tree(data, features, target):
    """Recursively build a decision tree.

    Internal nodes are dicts {feature: {value: subtree}}; leaves are class
    labels.  Stops when labels are pure or no features remain (majority vote).
    """
    labels = data[target]
    if len(labels.unique()) == 1:  # pure node: all samples share one label
        return labels.iloc[0]
    if len(features) == 0:  # no features left: fall back to majority label
        return labels.mode()[0]
    best_feature = choose_best_feature(data, features, target)
    tree = {best_feature: {}}
    # Hoisted out of the loop: the remaining feature list does not depend on `value`.
    remaining_features = [f for f in features if f != best_feature]
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree[best_feature][value] = build_tree(subset, remaining_features, target)
    return tree


def predict(tree, sample, default=None):
    """Classify `sample` (mapping of feature -> value) with `tree`.

    Bug fix: the original raised KeyError when a sample carried a feature
    value never seen during training; now returns `default` instead
    (backward-compatible added parameter).
    """
    feature = next(iter(tree))  # the single feature this node splits on
    value = sample[feature]
    branches = tree[feature]
    if value not in branches:  # unseen feature value: no branch to follow
        return default
    subtree = branches[value]
    if isinstance(subtree, dict):  # internal node: keep descending
        return predict(subtree, sample, default)
    return subtree  # leaf: the predicted class label


def main():
    """Run the full experiment: fetch data, build the tree, print, predict."""
    # Third-party dependency imported lazily so the module itself stays
    # importable (and testable) without network access or ucimlrepo installed.
    from ucimlrepo import fetch_ucirepo

    iris = fetch_ucirepo(id=53)  # Iris dataset, UCI id 53
    X = iris.data.features
    y = iris.data.targets

    print(iris.metadata)
    print(iris.variables)

    # Merge features and target into one frame; rename the target column.
    data = pd.concat([X, y], axis=1)
    data.columns = list(X.columns) + ['target']

    features = list(X.columns)
    tree = build_tree(data, features, 'target')
    print(tree)

    # Smoke test: predict the class of the first training sample.
    sample = data.iloc[0].drop('target')
    prediction = predict(tree, sample)
    print(f'Predicted class: {prediction}')


if __name__ == '__main__':
    main()