feat: add a C4.5 decision tree experiment on the Iris dataset
- Fetch the Iris dataset from the UCI Machine Learning Repository
- Implement entropy and information-gain calculation functions
- Define functions to choose the best feature, build the decision tree, and make predictions
- Merge the feature and target data and build the decision tree
- Print the decision tree structure and run a prediction test
parent 1d2138e0d2
commit 24bdddcc63
iris_c45_experiment.py (new file, +91 lines)
@@ -0,0 +1,91 @@
# Import the required libraries
from ucimlrepo import fetch_ucirepo  # fetches datasets from the UCI Machine Learning Repository
import pandas as pd  # data handling
import math  # math utilities (log2)

# fetch dataset
iris = fetch_ucirepo(id=53)  # fetch the Iris dataset

# data (as pandas dataframes)
X = iris.data.features  # feature data
y = iris.data.targets  # target data

# metadata
print(iris.metadata)  # print the dataset's metadata

# variable information
print(iris.variables)  # print the dataset's variable information

# Function to compute entropy
def entropy(data):
    # Compute the entropy of a series of labels
    label_counts = data.value_counts()  # count of each label
    probabilities = label_counts / len(data)  # probability of each label
    return -sum(probabilities * probabilities.apply(math.log2))  # H = -sum(p * log2(p))

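# Quick sanity check (illustrative, with made-up labels): a balanced
# two-class series carries exactly 1 bit of entropy.
assert entropy(pd.Series(['a', 'a', 'b', 'b'])) == 1.0
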
# Function to compute information gain
def information_gain(data, feature, target):
    # Information gain obtained by splitting `data` on `feature`
    total_entropy = entropy(data[target])  # entropy of the target column
    weighted_entropy = 0  # weighted entropy after the split
    for value in data[feature].unique():  # iterate over all observed feature values
        subset = data[data[feature] == value]  # rows where feature == value
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])  # accumulate weighted entropy
    return total_entropy - weighted_entropy  # information gain

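# Quick sanity check (illustrative toy data, not from the Iris set): a feature
# that perfectly separates two balanced classes recovers the full 1 bit.
_toy = pd.DataFrame({'f': [0, 0, 1, 1], 't': ['a', 'a', 'b', 'b']})
assert information_gain(_toy, 'f', 't') == 1.0
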
# Function to choose the best feature
def choose_best_feature(data, features, target):
    # Pick the feature with the highest information gain
    best_feature = None  # best feature so far
    max_gain = -1  # highest information gain so far
    for feature in features:  # iterate over all features
        gain = information_gain(data, feature, target)  # information gain of this feature
        if gain > max_gain:  # found a better feature
            max_gain = gain  # update the highest gain
            best_feature = feature  # update the best feature
    return best_feature  # return the best feature

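# Note: C4.5 proper ranks features by gain ratio (information gain normalised
# by the entropy of the split itself), whereas raw information gain is ID3's
# criterion. A minimal sketch of the gain-ratio criterion, reusing the helpers
# above (split_information and gain_ratio are hypothetical names, not part of
# this file):
def split_information(data, feature):
    # Entropy of the partition that `feature` itself induces on the data
    proportions = data[feature].value_counts() / len(data)
    return -sum(proportions * proportions.apply(math.log2))

def gain_ratio(data, feature, target):
    # Gain ratio = information gain / split information
    split_info = split_information(data, feature)
    if split_info == 0:  # feature takes a single value; splitting is useless
        return 0
    return information_gain(data, feature, target) / split_info
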
# Function to build the decision tree
def build_tree(data, features, target):
    # Recursively build the decision tree
    labels = data[target]  # target column for this node
    if len(labels.unique()) == 1:  # all labels are identical
        return labels.iloc[0]  # leaf: return that label
    if len(features) == 0:  # no features left to split on
        return labels.mode()[0]  # leaf: return the majority label
    best_feature = choose_best_feature(data, features, target)  # choose the split feature
    tree = {best_feature: {}}  # internal node: {feature: {value: subtree}}
    for value in data[best_feature].unique():  # one branch per observed feature value
        subset = data[data[best_feature] == value]  # rows where best_feature == value
        remaining_features = [f for f in features if f != best_feature]  # drop the used feature
        subtree = build_tree(subset, remaining_features, target)  # recursively build the subtree
        tree[best_feature][value] = subtree  # attach the subtree to this branch
    return tree  # return the decision tree

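# The returned tree is a nested dict keyed first by feature name, then by
# feature value. Since the Iris measurements are continuous, every distinct
# observed value gets its own branch, so the dict is very wide, e.g.
# (illustrative shape only): {'petal width': {0.2: 'Iris-setosa', 1.8: {...}}}
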
# Function to make predictions
def predict(tree, sample):
    # Predict a sample's class with the decision tree
    feature = list(tree.keys())[0]  # feature tested at the current node
    value = sample[feature]  # the sample's value for that feature
    subtree = tree[feature][value]  # follow the matching branch
    if isinstance(subtree, dict):  # internal node
        return predict(subtree, sample)  # recurse into the subtree
    else:  # leaf node
        return subtree  # return the predicted label

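# Caveat: tree[feature][value] raises a KeyError for any feature value that
# never appeared in the training data, so predicting unseen samples would need
# a fallback (e.g. returning the majority class at that node).
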
# Merge the feature and target data
data = pd.concat([X, y], axis=1)  # features and target side by side
data.columns = list(X.columns) + ['target']  # rename the target column to 'target'

# Build the decision tree
features = list(X.columns)  # list of feature names
target = 'target'  # name of the target column
tree = build_tree(data, features, target)  # build the decision tree

# Print the decision tree
print(tree)  # nested-dict representation of the tree

# Test a prediction
sample = data.iloc[0].drop('target')  # use the first sample as a test case
prediction = predict(tree, sample)  # run the prediction
print(f'Predicted class: {prediction}')  # print the predicted class
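
# A possible extension (sketch): score the tree on the training data itself.
# Because the tree branches on every distinct feature value, it effectively
# memorises the training set, so this accuracy is optimistic and says nothing
# about generalisation.
correct = sum(
    predict(tree, row.drop('target')) == row['target']
    for _, row in data.iterrows()
)
print(f'Training accuracy: {correct / len(data):.2%}')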