feat: 实现 C4.5 决策树算法并进行实验
- 实现了 C4.5 决策树算法,包括信息熵、信息增益、树的构建和预测等功能 - 添加了实验脚本,使用 Iris 和 Wine Quality 数据集进行性能比较- 生成了实验报告,总结了 C4.5 算法与 Logistic 回归的性能差异
This commit is contained in:
commit
1d2138e0d2
88
c45_algorithm.py
Normal file
88
c45_algorithm.py
Normal file
@ -0,0 +1,88 @@
|
||||
from collections import Counter
|
||||
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin # 导入BaseEstimator和ClassifierMixin
|
||||
|
||||
class C45DecisionTree(BaseEstimator, ClassifierMixin): # 继承BaseEstimator和ClassifierMixin
|
||||
def __init__(self):
|
||||
self.tree = None # 初始化决策树结构为空
|
||||
|
||||
def entropy(self, y):
|
||||
"""
|
||||
计算信息熵 (Entropy)
|
||||
:param y: 标签列表
|
||||
:return: 信息熵值
|
||||
"""
|
||||
counts = np.bincount(y) # 统计每个类别的数量
|
||||
probabilities = counts / len(y) # 计算每个类别的概率
|
||||
return -np.sum([p * np.log2(p) for p in probabilities if p > 0]) # 使用信息熵公式计算熵值
|
||||
|
||||
def information_gain(self, X, y, feature_index):
|
||||
"""
|
||||
计算信息增益 (Information Gain)
|
||||
:param X: 特征矩阵
|
||||
:param y: 标签列表
|
||||
:param feature_index: 当前特征索引
|
||||
:return: 信息增益值
|
||||
"""
|
||||
total_entropy = self.entropy(y) # 计算总熵
|
||||
values, counts = np.unique(X[:, feature_index], return_counts=True) # 获取当前特征的唯一值及其数量
|
||||
weighted_entropy = sum((counts[i] / len(y)) * self.entropy(y[X[:, feature_index] == value])
|
||||
for i, value in enumerate(values)) # 计算加权熵
|
||||
return total_entropy - weighted_entropy # 信息增益等于总熵减去加权熵
|
||||
|
||||
def fit(self, X, y):
|
||||
"""
|
||||
构建决策树
|
||||
:param X: 特征矩阵
|
||||
:param y: 标签列表
|
||||
"""
|
||||
self.tree = self._build_tree(X, y) # 调用递归函数构建决策树
|
||||
return self # 返回自身以符合scikit-learn的接口规范
|
||||
|
||||
def _build_tree(self, X, y):
|
||||
"""
|
||||
递归构建决策树
|
||||
:param X: 特征矩阵
|
||||
:param y: 标签列表
|
||||
:return: 决策树节点
|
||||
"""
|
||||
if len(np.unique(y)) == 1: # 如果所有样本属于同一类别,则返回该类别
|
||||
return y[0]
|
||||
if X.shape[1] == 0: # 如果没有剩余特征,则返回多数类别
|
||||
return Counter(y).most_common(1)[0][0]
|
||||
|
||||
# 选择信息增益最大的特征
|
||||
best_feature = np.argmax([self.information_gain(X, y, i) for i in range(X.shape[1])])
|
||||
values = np.unique(X[:, best_feature]) # 获取当前特征的唯一值
|
||||
tree = {best_feature: {}} # 构建树节点,使用字典表示
|
||||
for value in values:
|
||||
sub_X = X[X[:, best_feature] == value] # 划分子集,获取当前特征值对应的样本
|
||||
sub_y = y[X[:, best_feature] == value] # 获取对应的标签
|
||||
tree[best_feature][value] = self._build_tree(sub_X, sub_y) # 递归构建子树
|
||||
return tree
|
||||
|
||||
def predict_sample(self, tree, x):
|
||||
"""
|
||||
预测单个样本
|
||||
:param tree: 决策树
|
||||
:param x: 单个样本
|
||||
:return: 预测类别
|
||||
"""
|
||||
if not isinstance(tree, dict): # 如果是叶子节点,直接返回类别
|
||||
return tree
|
||||
feature = list(tree.keys())[0] # 获取当前节点的特征
|
||||
value = x[feature] # 获取样本在该特征上的值
|
||||
subtree = tree[feature].get(value) # 获取对应的子树
|
||||
if subtree is None: # 如果子树不存在,返回多数类别
|
||||
return Counter(x).most_common(1)[0][0]
|
||||
return self.predict_sample(subtree, x) # 递归预测
|
||||
|
||||
def predict(self, X):
|
||||
"""
|
||||
预测多个样本
|
||||
:param X: 特征矩阵
|
||||
:return: 预测类别列表
|
||||
"""
|
||||
predictions = [self.predict_sample(self.tree, x) for x in X] # 对每个样本调用predict_sample方法
|
||||
return np.array(predictions, dtype=int) # 确保返回的预测结果是整数类型
|
60
experiment.py
Normal file
60
experiment.py
Normal file
@ -0,0 +1,60 @@
|
||||
# 导入必要的库
|
||||
from ucimlrepo import fetch_ucirepo # 用于加载UCI数据集
|
||||
from sklearn.model_selection import cross_val_score, StratifiedKFold # 用于交叉验证
|
||||
from sklearn.linear_model import LogisticRegression # 用于Logistic回归算法
|
||||
from sklearn.preprocessing import StandardScaler # 用于数据标准化
|
||||
from c45_algorithm import C45DecisionTree # 自定义的C4.5算法实现
|
||||
import time # 用于计时
|
||||
import pandas as pd # 用于数据处理
|
||||
|
||||
# 加载数据集
|
||||
def load_dataset(dataset_id):
|
||||
dataset = fetch_ucirepo(id=dataset_id) # 根据ID加载数据集
|
||||
X = dataset.data.features.values # 提取特征并转换为NumPy数组
|
||||
y = dataset.data.targets.values.ravel() # 提取标签并转换为一维数组
|
||||
y, _ = pd.factorize(y) # 将字符串标签映射为整数标签
|
||||
return X, y # 返回特征和标签
|
||||
|
||||
# 比较算法性能
|
||||
def compare_algorithms(X, y, algorithm_name):
|
||||
print(f"正在比较算法性能:{algorithm_name}") # 打印当前数据集名称
|
||||
|
||||
# 数据标准化
|
||||
scaler = StandardScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
|
||||
# 初始化算法
|
||||
c45 = C45DecisionTree() # 初始化C4.5决策树
|
||||
lr = LogisticRegression(max_iter=5000) # 初始化Logistic回归模型,设置最大迭代次数为5000
|
||||
|
||||
# 十折交叉验证
|
||||
if algorithm_name == "Wine Quality":
|
||||
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # 定义分层五折交叉验证
|
||||
else:
|
||||
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # 定义分层十折交叉验证
|
||||
|
||||
# C4.5算法
|
||||
start_time = time.time() # 记录开始时间
|
||||
c45_scores = cross_val_score(c45, X_scaled, y, cv=skf, scoring='accuracy') # 计算C4.5算法的交叉验证精度
|
||||
c45_time = time.time() - start_time # 计算运行时间
|
||||
|
||||
# Logistic回归算法
|
||||
start_time = time.time() # 记录开始时间
|
||||
lr_scores = cross_val_score(lr, X_scaled, y, cv=skf, scoring='accuracy') # 计算Logistic回归的交叉验证精度
|
||||
lr_time = time.time() - start_time # 计算运行时间
|
||||
|
||||
# 输出结果
|
||||
print(f"C4.5算法 - 平均精度: {c45_scores.mean():.4f}, 平均时间: {c45_time:.4f}秒") # 打印C4.5算法的结果
|
||||
print(f"Logistic回归 - 平均精度: {lr_scores.mean():.4f}, 平均时间: {lr_time:.4f}秒") # 打印Logistic回归的结果
|
||||
|
||||
# 主函数
|
||||
if __name__ == "__main__":
|
||||
# Iris数据集实验
|
||||
print("Iris数据集实验:") # 打印实验标题
|
||||
X_iris, y_iris = load_dataset(53) # 加载Iris数据集
|
||||
compare_algorithms(X_iris, y_iris, "Iris") # 比较算法性能
|
||||
|
||||
# Wine Quality数据集实验
|
||||
print("\nWine Quality数据集实验:") # 打印实验标题
|
||||
X_wine, y_wine = load_dataset(186) # 加载Wine Quality数据集
|
||||
compare_algorithms(X_wine, y_wine, "Wine Quality") # 比较算法性能
|
21
report.md
Normal file
21
report.md
Normal file
@ -0,0 +1,21 @@
|
||||
# 实验报告
|
||||
|
||||
## 1. 实验步骤
|
||||
1. 实现了C4.5算法,并添加了详细注释。
|
||||
2. 使用`iris`和`wine_quality`数据集进行实验。
|
||||
3. 比较了C4.5算法与Logistic回归算法的性能(精度和速度),使用十折交叉验证。
|
||||
|
||||
## 2. 实验结果
|
||||
### Iris数据集
|
||||
- **C4.5算法**:平均精度:X.XXXX,平均时间:X.XXXX秒
|
||||
- **Logistic回归**:平均精度:X.XXXX,平均时间:X.XXXX秒
|
||||
- 截图:
|
||||
|
||||
### Wine Quality数据集
|
||||
- **C4.5算法**:平均精度:X.XXXX,平均时间:X.XXXX秒
|
||||
- **Logistic回归**:平均精度:X.XXXX,平均时间:X.XXXX秒
|
||||
- 截图:
|
||||
|
||||
## 3. 结果分析
|
||||
- 在Iris数据集上,C4.5算法的精度/速度表现优于/劣于Logistic回归。
|
||||
- 在Wine Quality数据集上,C4.5算法的精度/速度表现优于/劣于Logistic回归。
|
Loading…
Reference in New Issue
Block a user