Decision trees are a classic family of classification algorithms in machine learning, and ID3 is one of their foundational variants. This post implements an ID3 decision tree from scratch in **pure Python (no third-party libraries)** and documents the common problems hit along the way, together with their fixes.
I. Core Principles of the ID3 Algorithm
ID3 selects splits by maximizing information gain. The steps are as follows (a small worked example follows the list):
- Compute the overall entropy of the dataset (a measure of sample uncertainty);
- Split the dataset on each feature and compute the resulting conditional entropy;
- Information gain = total entropy - conditional entropy; pick the feature with the largest gain as the current decision node;
- Recurse into each child node until every sample in a node belongs to the same class or no features remain to split on.
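As a quick sanity check on these definitions, here is a minimal sketch (using a hypothetical five-sample label list and a toy binary feature, not the loan data below) that computes the entropy and the information gain of one split by hand:

```python
import math

def entropy(labels):
    """Shannon entropy of a label list, in bits."""
    total = len(labels)
    return -sum(
        (labels.count(c) / total) * math.log2(labels.count(c) / total)
        for c in set(labels)
    )

y = [0, 0, 0, 1, 1]        # three negatives, two positives
feature = [0, 0, 1, 1, 1]  # a toy binary feature aligned with y

total_entropy = entropy(y)  # ≈ 0.971
# Conditional entropy: weight each subset's entropy by its share of the data
conditional = sum(
    (len(subset) / len(y)) * entropy(subset)
    for subset in (
        [y[i] for i in range(len(y)) if feature[i] == v] for v in set(feature)
    )
)
print(f"gain = {total_entropy - conditional:.3f}")  # gain = 0.420
```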
II. A Pure-Python ID3 Decision Tree (Full Code)
1. Data preparation
First, create the training set dataset.txt and the test set testset.txt. Each row holds four feature columns followed by one label column (1 = approve the loan, 0 = deny the loan):
Training set (dataset.txt):
0,0,0,0,0
0,0,0,1,0
0,1,0,1,1
0,1,1,0,1
0,0,0,0,0
1,0,0,0,0
1,0,0,1,0
1,1,1,1,1
1,0,1,2,1
1,0,1,2,1
2,0,1,2,1
2,0,1,1,1
2,1,0,1,1
2,1,0,2,1
2,0,0,0,0
Test set (testset.txt):
0,0,1,2,1
1,1,0,1,1
2,0,0,2,0
0,1,0,0,1
2. Full implementation code
import math
# ---------------------- Core utility functions ----------------------
def calculate_entropy(y):
    """Compute the entropy (uncertainty) of a label list"""
    # Count samples per class
    class_counts = {}
    for label in y:
        label = int(label)
        class_counts[label] = class_counts.get(label, 0) + 1
    # Probability of each class
    total = len(y)
    probabilities = [count / total for count in class_counts.values()]
    # Shannon entropy, log base 2
    entropy = 0.0
    for p in probabilities:
        if p > 0:
            entropy -= p * math.log2(p)
    return entropy
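# Example (hypothetical call): calculate_entropy([0, 0, 0, 1, 1]) ≈ 0.971
# (a 3:2 class split over five samples)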
def split_dataset(X, y, feature_idx, value):
    """Split the dataset on a feature index and value"""
    X_subset = []
    y_subset = []
    for i in range(len(X)):
        if int(X[i][feature_idx]) == value:
            # Drop the current feature column (so it is not reused)
            reduced_X = X[i][:feature_idx] + X[i][feature_idx+1:]
            X_subset.append(reduced_X)
            y_subset.append(y[i])
    return X_subset, y_subset
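# Example: split_dataset([[0, 1], [1, 1]], [0, 1], 0, 1) returns ([[1]], [1]),
# i.e. the rows whose feature 0 equals 1, with that column removed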
def calculate_information_gain(X, y, feature_idx):
    """Compute the information gain of the given feature"""
    total_entropy = calculate_entropy(y)
    # Unique values of the feature
    feature_values = [int(row[feature_idx]) for row in X]
    unique_values = list(set(feature_values))
    # Conditional entropy
    conditional_entropy = 0.0
    for value in unique_values:
        X_subset, y_subset = split_dataset(X, y, feature_idx, value)
        weight = len(y_subset) / len(y)
        conditional_entropy += weight * calculate_entropy(y_subset)
    # Information gain = total entropy - conditional entropy
    return total_entropy - conditional_entropy
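# On the 15-sample training set above, the gains are approximately:
# age_group 0.083, has_job 0.324, owns_house 0.420, credit_rating 0.363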
def choose_best_feature(X, y):
    """Pick the feature with the largest information gain as the decision node"""
    n_features = len(X[0]) if X else 0
    max_gain = -1
    best_feature_idx = -1
    for idx in range(n_features):
        gain = calculate_information_gain(X, y, idx)
        if gain > max_gain:
            max_gain = gain
            best_feature_idx = idx
    return best_feature_idx
def majority_vote(y):
    """Majority vote for the class of a leaf node"""
    class_counts = {}
    for label in y:
        label = int(label)
        class_counts[label] = class_counts.get(label, 0) + 1
    # Return the most frequent class
    return max(class_counts, key=class_counts.get)
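# Example: majority_vote([1, 1, 0]) returns 1; on a tie, max() keeps the
# class it encounters first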
# ---------------------- Tree construction (recursive) ----------------------
def build_id3_tree(X, y, feature_names):
    """
    Recursively build an ID3 decision tree
    X: feature matrix (list of lists)
    y: label vector (list)
    feature_names: list of feature names
    """
    # Stop condition 1: all samples share one class
    if len(set(y)) == 1:
        return {'type': 'leaf', 'class': int(y[0])}
    # Stop condition 2: no features left to split on, return the majority class
    if len(feature_names) == 0:
        return {'type': 'leaf', 'class': majority_vote(y)}
    # Pick the feature with the largest information gain
    best_feature_idx = choose_best_feature(X, y)
    best_feature_name = feature_names[best_feature_idx]
    # Build the decision node
    tree = {'type': 'node', 'feature': best_feature_name, 'feature_idx': best_feature_idx}
    tree['children'] = {}
    # Unique values of the chosen feature
    feature_values = [int(row[best_feature_idx]) for row in X]
    unique_values = list(set(feature_values))
    # Recurse into each branch
    for value in unique_values:
        # Split the dataset
        X_subset, y_subset = split_dataset(X, y, best_feature_idx, value)
        # Drop the used feature's name
        new_feature_names = feature_names[:best_feature_idx] + feature_names[best_feature_idx+1:]
        # Build the subtree recursively
        tree['children'][value] = build_id3_tree(X_subset, y_subset, new_feature_names)
    return tree
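# A node looks like {'type': 'node', 'feature': 'has_job', 'feature_idx': 1,
#                    'children': {0: {...}, 1: {...}}}
# and a leaf like   {'type': 'leaf', 'class': 1}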
# ---------------------- Prediction ----------------------
def predict_sample(tree, sample, feature_names):
    """Predict a single sample with the built tree"""
    if tree['type'] == 'leaf':
        return tree['class']
    # Locate the node's feature in this sample
    feature_name = tree['feature']
    feature_idx = feature_names.index(feature_name)
    sample_value = int(sample[feature_idx])
    # Handle feature values never seen during training
    if sample_value not in tree['children']:
        return majority_vote(y_train)  # y_train must be global or passed in as a parameter
    # Recurse into the matching branch
    return predict_sample(tree['children'][sample_value], sample, feature_names)
def predict_tree(tree, X, feature_names):
    """Predict the whole test set"""
    return [predict_sample(tree, sample, feature_names) for sample in X]
# ---------------------- Text visualization ----------------------
def print_tree(tree, indent=""):
    """Recursively print the decision tree as text"""
    if tree['type'] == 'leaf':
        print(f"{indent}→ Class: {'approve loan' if tree['class'] == 1 else 'deny loan'}")
        return
    # Print the decision node
    print(f"{indent}{tree['feature']}?")
    # Print each branch recursively
    feature_value_map = {0: 'no/low', 1: 'yes/medium', 2: 'high'}
    for value, child in tree['children'].items():
        print(f"{indent}  value {feature_value_map[value]}:")
        print_tree(child, indent + "    ")
# ---------------------- Data loading ----------------------
def load_data(file_path):
    """Load a dataset (pure Python, in place of np.loadtxt)"""
    X = []
    y = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            values = list(map(float, line.split(',')))
            X.append(values[:-1])
            y.append(values[-1])
    return X, y
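# Example: the line "0,1,1,0,1" parses to features [0.0, 1.0, 1.0, 0.0]
# and label 1.0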
# ---------------------- Evaluation ----------------------
def calculate_accuracy(y_true, y_pred):
    """Compute accuracy as a percentage"""
    correct = 0
    for true, pred in zip(y_true, y_pred):
        if int(true) == pred:
            correct += 1
    return (correct / len(y_true)) * 100 if y_true else 0.0
# ---------------------- Main (train + test + evaluate) ----------------------
if __name__ == "__main__":
    # 1. Load the data
    feature_names = ['age_group', 'has_job', 'owns_house', 'credit_rating']
    X_train, y_train = load_data('dataset.txt')
    X_test, y_test = load_data('testset.txt')
    # 2. Build two ID3 trees
    # Tree 1: full feature set
    tree1 = build_id3_tree(X_train, y_train, feature_names.copy())
    print("=" * 80)
    print("Decision tree 1 (full feature set):")
    print_tree(tree1)
    # Tree 2: drop the 'owns_house' feature
    feature_names_wo_house = ['age_group', 'has_job', 'credit_rating']
    # Manually remove the 'owns_house' column (index 2)
    X_train_wo_house = []
    for row in X_train:
        new_row = row[:2] + row[3:]
        X_train_wo_house.append(new_row)
    tree2 = build_id3_tree(X_train_wo_house, y_train, feature_names_wo_house.copy())
    print("\n" + "=" * 80)
    print("Decision tree 2 (without the 'owns_house' feature):")
    print_tree(tree2)
    # 3. Predict and evaluate on the test set
    # Tree 1 predictions
    y_pred1 = predict_tree(tree1, X_test, feature_names)
    acc1 = calculate_accuracy(y_test, y_pred1)
    # Tree 2 predictions (the same column must be removed from the test set)
    X_test_wo_house = []
    for row in X_test:
        new_row = row[:2] + row[3:]
        X_test_wo_house.append(new_row)
    y_pred2 = predict_tree(tree2, X_test_wo_house, feature_names_wo_house)
    acc2 = calculate_accuracy(y_test, y_pred2)
    # 4. Report the results
    print("\n" + "=" * 80)
    print("Decision tree 1 (full feature set) test results:")
    print(f"True labels: {[int(label) for label in y_test]}")
    print(f"Predicted labels: {y_pred1}")
    print(f"Accuracy: {acc1:.2f}%")
    print("\n" + "=" * 80)
    print("Decision tree 2 (without the 'owns_house' feature) test results:")
    print(f"True labels: {[int(label) for label in y_test]}")
    print(f"Predicted labels: {y_pred2}")
    print(f"Accuracy: {acc2:.2f}%")
    print("=" * 80)
III. Problems Encountered During Implementation
1. Floats have no bit_length attribute
Error: AttributeError: 'float' object has no attribute 'bit_length'
Cause: an early version tried (p**-1).bit_length() as a stand-in for log2(p), but p ** -1 is a float, and bit_length() exists only on integers.
Fix: import the math module and use math.log2(p) to compute the base-2 logarithm correctly.
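A minimal reproduction of the mistake and the fix (the value 0.4 is just an arbitrary probability):

```python
import math

p = 0.4
# (p ** -1).bit_length()  # AttributeError: p ** -1 is the float 2.5,
#                         # and bit_length() is defined only on int
print(math.log2(p))       # correct: ≈ -1.3219
```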
IV. Running the Code
After running the script, the console prints both tree structures and the test results:
================================================================================
Decision tree 1 (full feature set):
owns_house?
  value no/low:
    has_job?
      value no/low:
        → Class: deny loan
      value yes/medium:
        → Class: approve loan
  value yes/medium:
    → Class: approve loan

================================================================================
Decision tree 2 (without the 'owns_house' feature):
credit_rating?
  value no/low:
    has_job?
      value no/low:
        → Class: deny loan
      value yes/medium:
        → Class: approve loan
  value yes/medium:
    has_job?
      value no/low:
        age_group?
          value no/low:
            → Class: deny loan
          value yes/medium:
            → Class: deny loan
          value high:
            → Class: approve loan
      value yes/medium:
        → Class: approve loan
  value high:
    → Class: approve loan

================================================================================
Decision tree 1 (full feature set) test results:
True labels: [1, 1, 0, 1]
Predicted labels: [1, 1, 0, 1]
Accuracy: 100.00%

================================================================================
Decision tree 2 (without the 'owns_house' feature) test results:
True labels: [1, 1, 0, 1]
Predicted labels: [1, 1, 1, 1]
Accuracy: 75.00%
================================================================================
V. Summary
This post implemented the ID3 decision-tree algorithm end to end in pure Python, covering data loading, entropy computation, tree construction, prediction, and evaluation, and recorded concrete fixes for the problems encountered along the way. Because the implementation relies on nothing beyond the standard library, it is a useful way to understand the core principles and implementation details of ID3.

