如何通过超简易模型演示推理流程?

摘要:前提: 数据集概念 模型的推理规则,由数据训练而来 流程: 以下是一个最基础、无需训练的 K近邻(KNN)分类器,并用它来对经典的鸢尾花数据集进行分类 数据集:经典的鸢尾花数据集,它包含150朵花的4个特征(花萼长宽、花瓣长宽)和3个类别标签。
# =============================================================================
# Premise:
#   - Dataset: the classic Iris dataset -- 150 flowers, 4 features
#     (sepal length/width, petal length/width) and 3 class labels.
#   - Feature matrix (X): 2-D array of shape (150, 4), float.
#   - Label vector (y): 1-D vector, mapped to int.
#   - Inference rule: the KNN algorithm (no if/then/else logic branches); a
#     two-step inference based on spatial geometry and statistics:
#     [distance metric -- Euclidean (or other)] -> [voting -- neighborhood consensus]
#
# Example code: simple_knn.py
# =============================================================================
"""
K-Nearest-Neighbors (KNN) classifier.

Pure Python/NumPy implementation of the KNN algorithm, evaluated on the
Iris dataset.
"""
import sys
import io

# Force UTF-8 stdout so the Chinese output renders on consoles that default
# to another encoding.  reconfigure() (Python 3.7+) is safer than re-wrapping
# sys.stdout.buffer, which raises if stdout has no .buffer attribute (already
# wrapped, or replaced by a test harness / redirect).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

import numpy as np
from collections import Counter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


class SimpleKNN:
    """K-nearest-neighbors classifier.

    Instance-based learning: there is no parameter optimization, the model
    simply memorizes the training data and infers by neighborhood voting.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        """
        Initialize the KNN model.

        Parameters:
        - k: number of neighbors, default 3
        - distance_metric: 'euclidean' or 'manhattan'
        """
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None  # memorized training features, set by fit()
        self.y_train = None  # memorized training labels, set by fit()

    def fit(self, X, y):
        """
        "Train" the model: actually just memorize the training data.

        Parameters:
        - X: training features, shape (n_samples, n_features)
        - y: training labels, shape (n_samples,)

        Returns:
        - self, to allow call chaining
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        print(f"[训练完成] 模型已记住 {len(X)} 个训练样本")
        return self

    def _compute_distance(self, x1, x2):
        """
        Distance between two sample feature vectors.

        Raises:
        - ValueError: if the configured metric is not supported
        """
        if self.distance_metric == 'euclidean':
            # Euclidean distance: sqrt(sum((x1_i - x2_i)^2))
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.distance_metric == 'manhattan':
            # Manhattan distance: sum(|x1_i - x2_i|)
            return np.sum(np.abs(x1 - x2))
        else:
            raise ValueError(f"不支持的距离度量方式: {self.distance_metric}")

    def _predict_one(self, x):
        """
        Predict the class of a single sample.

        Steps: distance to every training sample -> k nearest -> majority vote.
        """
        # 1. distance from x to every training sample
        dists = np.array([self._compute_distance(x, s) for s in self.X_train])
        # 2. indices of the k nearest neighbors.  kind='stable' preserves the
        #    original tie-break (lower training index wins), matching a stable
        #    sort of (distance, index) pairs.  (The original sorted tuples
        #    with `key=lambda x: x[0]`, shadowing the parameter `x`.)
        k_nearest_indices = np.argsort(dists, kind='stable')[:self.k]
        # 3. labels of those neighbors
        k_nearest_labels = [self.y_train[idx] for idx in k_nearest_indices]
        # 4. vote: return the most common label (Counter breaks ties by
        #    first-encountered order, i.e. nearest neighbor first)
        return Counter(k_nearest_labels).most_common(1)[0][0]

    def predict(self, X):
        """
        Predict classes for multiple samples.

        Parameters:
        - X: feature matrix of samples to classify, shape (n_samples, n_features)

        Returns:
        - array of predicted class labels
        """
        return np.array([self._predict_one(x) for x in X])

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance on a test set.

        Parameters:
        - X_test: test features
        - y_test: test labels

        Returns:
        - (accuracy, predictions)
        """
        predictions = self.predict(X_test)
        accuracy = np.sum(predictions == y_test) / len(y_test)
        return accuracy, predictions


def main():
    """Run the complete machine-learning flow: load, split, fit, evaluate."""
    print("=" * 60)
    print("K近邻(KNN)分类器 - 完整实现")
    print("=" * 60)

    # ==================== 1. Data preparation ====================
    print("\n1. 加载数据集...")
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    feature_names = iris.feature_names
    target_names = iris.target_names
    print(f" 数据集: {iris.DESCR[:200]}...")
    print(f" 特征: {feature_names}")
    print(f" 类别: {target_names}")
    print(f" 数据形状: X={X.shape}, y={y.shape}")
    print(f" 样本数: {X.shape[0]}, 特征数: {X.shape[1]}")

    # ==================== 2. Train/test split ====================
    print("\n2. 分割训练集和测试集...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f" 训练集: {X_train.shape[0]} 个样本")
    print(f" 测试集: {X_test.shape[0]} 个样本")

    # ==================== 3. Build and "train" the model ====================
    print("\n3. 创建并训练KNN模型...")
    knn_model = SimpleKNN(k=3, distance_metric='euclidean')
    knn_model.fit(X_train, y_train)

    # ==================== 4. Predict and evaluate ====================
    print("\n4. 在测试集上进行预测...")
    accuracy, predictions = knn_model.evaluate(X_test, y_test)
    print(f" 模型准确率: {accuracy:.2%}")

    # Compare true vs. predicted labels (first 10 rows only)
    print("\n 测试集详细结果:")
    print(" " + "-" * 50)
    print(" 序号 | 真实类别 | 预测类别 | 是否正确")
    print(" " + "-" * 50)
    for i in range(min(10, len(y_test))):
        true_name = target_names[y_test[i]]
        pred_name = target_names[predictions[i]]
        is_correct = y_test[i] == predictions[i]
        print(f" {i+1:4d} | {true_name:10s} | {pred_name:10s} | {'✓' if is_correct else '✗'}")
    if len(y_test) > 10:
        print(f" ... (其余 {len(y_test)-10} 个样本)")

    # ==================== 5. Dissect one prediction ====================
    print("\n5. 深入分析一次预测过程:")
    sample_idx = 0
    sample = X_test[sample_idx]
    true_label = y_test[sample_idx]
    print(f"\n 测试样本 {sample_idx+1}:")
    for name, value in zip(feature_names, sample):
        print(f" {name}: {value:.2f}")
    print(f" 真实类别: {target_names[true_label]}")

    # Show distances to the first 5 training samples by hand
    print("\n 计算与训练集前5个样本的距离:")
    for i in range(min(5, len(knn_model.X_train))):
        dist = knn_model._compute_distance(sample, knn_model.X_train[i])
        train_label = knn_model.y_train[i]
        print(f" 训练样本{i+1} ({target_names[train_label]}): 距离 = {dist:.4f}")

    # Run the prediction
    pred_label = knn_model._predict_one(sample)
    print(f"\n 最终预测: {target_names[pred_label]}")
    print(f" 预测是否正确: {'是' if true_label == pred_label else '否'}")

    # ==================== 6. Parameter experiments ====================
    print("\n6. 实验不同参数对性能的影响:")
    print(" " + "-" * 50)

    # Different k values.
    # BUG FIX: the experiments use a separate name `acc` so the section-4
    # `accuracy` (reported again in section 7) is not clobbered by the last
    # experiment's result.
    k_values = [1, 3, 5, 7, 9]
    print("\n 不同k值的准确率:")
    for k in k_values:
        test_model = SimpleKNN(k=k)
        test_model.fit(X_train, y_train)
        acc, _ = test_model.evaluate(X_test, y_test)
        print(f" k={k}: {acc:.2%}")

    # Different distance metrics
    print("\n 不同距离度量的准确率:")
    for metric in ['euclidean', 'manhattan']:
        test_model = SimpleKNN(k=3, distance_metric=metric)
        test_model.fit(X_train, y_train)
        acc, _ = test_model.evaluate(X_test, y_test)
        print(f" {metric}: {acc:.2%}")

    # ==================== 7. Compare with sklearn's KNN ====================
    print("\n7. 与sklearn官方KNN实现对比:")
    try:
        from sklearn.neighbors import KNeighborsClassifier
        sklearn_knn = KNeighborsClassifier(n_neighbors=3)
        sklearn_knn.fit(X_train, y_train)
        sklearn_accuracy = sklearn_knn.score(X_test, y_test)
        print(f" KNN准确率: {accuracy:.2%}")
        print(f" sklearn KNN准确率: {sklearn_accuracy:.2%}")
        if abs(accuracy - sklearn_accuracy) < 0.01:
            print(" ✓ 两者性能接近,实现正确!")
        else:
            print(" ⚠ 性能有差异,检查距离计算或投票逻辑")
    except ImportError:
        print(" 无法导入sklearn,跳过对比")

    print("\n" + "=" * 60)
    print("\n脚本执行完成!")
    print("=" * 60)


# =============================================================================
# PS: previewing the dataset itself (originally a separate standalone script;
# its entry point is renamed here so it does not shadow main() above).
# =============================================================================
import pandas as pd


def preview_dataset():
    """Display the Iris dataset as a pandas DataFrame."""
    iris = datasets.load_iris()
    print("\n--- 使用Pandas DataFrame查看数据集---")
    # 1. build a DataFrame from the feature data
    iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # 2. append the corresponding label column
    iris_df['target'] = iris.target
    print("数据表预览(前10行):")
    print(iris_df.head(10))
    # iris_df.to_csv('iris_data.csv', index=False, encoding='utf-8')
    print("\n数据表基本信息:")
    # BUG FIX: DataFrame.info() prints to stdout and returns None; wrapping
    # it in print() emitted a spurious trailing "None" line.
    iris_df.info()


if __name__ == "__main__":
    main()
    preview_dataset()