KNN 算法优缺点:
优点:精度高,对异常值不敏感
缺点:计算复杂度高,空间复杂度高
适用数据范围:数值型和标称型
KNN 是一种有监督(样本带标签)的分类算法:输入一条没有标签的新数据后,将它的每个特征与样本集中带标签数据的对应特征进行比较,提取样本集中特征最相似(距离最近)的 K 条数据的分类标签,最后选择这 K 个标签中出现次数最多的类别,作为新数据的分类。
sklearn 实现KNN 算法
def sklearn_test():
    """Fit scikit-learn's k-NN classifier on the iris data and print a report.

    The iris samples are split with ``train_test_split`` (10% held out,
    ``random_state=42``), a ``KNeighborsClassifier`` with default settings is
    fitted on the training part, and the predicted labels, the true labels,
    the accuracy and the per-class probabilities of the held-out samples are
    printed.

    Returns:
        None. All results are written to standard output.
    """
    from sklearn import datasets
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import train_test_split
    import numpy as np

    # Seeds NumPy's global RNG; the split below is already made reproducible
    # by its own ``random_state``.
    np.random.seed(0)

    iris = datasets.load_iris()
    iris_x, iris_y = iris.data, iris.target
    iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(
        iris_x, iris_y, test_size=0.1, random_state=42)

    knn = KNeighborsClassifier()
    knn.fit(iris_x_train, iris_y_train)

    iris_y_predict = knn.predict(iris_x_test)
    probability = knn.predict_proba(iris_x_test)
    score = knn.score(iris_x_test, iris_y_test, sample_weight=None)

    # print() joins its arguments with spaces, so no %-placeholder belongs in
    # the label strings.
    print('then predict result of iris is:', iris_y_predict,
          'and the real result of iris is:', iris_y_test)
    print('the accuracy is: %.2f' % score)
    print("the probability is:", probability)
KNN 算法的代码实现步骤:
import numpy as np


def create_data():
    """Return the toy training set.

    Returns:
        tuple: ``(x_train, y_train)`` where ``x_train`` is a 9 x 2 float array
        of points and ``y_train`` is an array with the label (``'a'`` or
        ``'b'``) of each point.
    """
    x_train = np.array([[1, 1.1], [1.3, 0.8], [1.4, 1.2], [1.1, 0.9],
                        [0.8, 1.5], [2.5, 2], [3.4, 2.5], [3.7, 2.5],
                        [2, 3]])
    y_train = np.array(['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'])
    return x_train, y_train


# The point to classify. calculate_dis() and plot_() read it as a module global.
x_test = np.array([2, 2])


def calculate_dis(x_train, k=3):
    """Rank the training points by Euclidean distance to ``x_test``.

    Args:
        x_train: 2-D array with one training point per row.
        k: number of nearest neighbours to keep.

    Returns:
        tuple: ``(order, small_k)``. ``order`` holds the row indices of
        ``x_train`` sorted from nearest to farthest (indices, not distances);
        ``small_k`` holds the first ``k`` of those indices.
    """
    distances = ((x_train - x_test) ** 2).sum(axis=1) ** 0.5
    # argsort() returns the indices that would sort the values ascending.
    order = distances.argsort()
    return order, order[:k]


def pre_result(small_k, y_train):
    """Return the majority label among the selected neighbours.

    Args:
        small_k: indices of the nearest neighbours, nearest first.
        y_train: label of every training point, indexable by those indices.

    Returns:
        The label that occurs most often among the neighbours. When several
        labels are tied, the one met first in ``small_k`` wins.

    Raises:
        ValueError: if ``small_k`` is empty.
    """
    votes = {}
    for i in small_k:
        label = y_train[i]
        votes[label] = votes.get(label, 0) + 1
    if not votes:
        raise ValueError("small_k is empty: at least one neighbour is required")
    # max() returns the first maximal key in insertion order, which gives the
    # tie-breaking rule documented above.
    return max(votes, key=votes.get)


def to_array(cla):
    """Return the rows of the toy training set whose label equals ``cla``.

    Args:
        cla: the class label to select.

    Returns:
        A 2-D array with one matching training point per row.
    """
    x_train, y_train = create_data()
    return x_train[y_train == cla]


def plot_(x_train, pre, small_k):
    """Plot the training points, the test point and its nearest neighbours.

    Class ``'a'`` points are drawn as blue circles and class ``'b'`` points as
    red circles. ``x_test`` is drawn as a star in the colour of its predicted
    class (black for any other label), and a cyan line joins it to every
    neighbour listed in ``small_k``.

    Args:
        x_train: 2-D array of training points.
        pre: predicted label of ``x_test``.
        small_k: indices into ``x_train`` of the nearest neighbours.
    """
    # Imported here so the numeric helpers above stay importable on machines
    # without matplotlib.
    import matplotlib.pyplot as plt

    x_train_a = to_array('a')
    x_train_b = to_array('b')
    plt.scatter(x_train_a[:, 0], x_train_a[:, 1], c='b', marker='o',
                label='train_class_a')
    plt.scatter(x_train_b[:, 0], x_train_b[:, 1], c='r', marker='o',
                label='train_class_b')

    class_colours = {'a': 'b', 'b': 'r'}
    test_class = class_colours.get(pre, 'k')
    plt.scatter(x_test[0], x_test[1], c=test_class, marker='*',
                label='test_class')

    for i in small_k:
        xs = [x_test[0], x_train[i, :][0]]
        ys = [x_test[1], x_train[i, :][1]]
        print(xs, ys)
        plt.plot(xs, ys, c='c')

    plt.legend(loc='best')
    plt.show()


def main():
    """Classify ``x_test`` against the toy training set and plot the result."""
    x_train, y_train = create_data()
    dis, small_k = calculate_dis(x_train)
    pre = pre_result(small_k, y_train)
    plot_(x_train, pre, small_k)


if __name__ == '__main__':
    main()
画图结果展示:
以下是完整的代码实现过程
"""k-nearest-neighbours (kNN) examples.

Pros: high accuracy, insensitive to outliers, no assumptions about the input
data.
Cons: computationally expensive and memory hungry.
Works with: numeric and nominal values.

kNN is a supervised classifier: an unlabelled sample is compared with the
labelled samples of the data set, the labels of the K most similar samples are
collected, and the label that occurs most often among them is the prediction.
"""
import numpy as np
import operator
import os

# Machine-specific location of the sample data files.
DATA_DIR = r'C:\Users\Administrator\Desktop\ML\machinelearninginaction-master\machinelearninginaction-master\Ch02'
DIGITS_DIR = os.path.join(DATA_DIR, 'digits')
TRAINING_DIGITS_DIR = os.path.join(DIGITS_DIR, 'trainingDigits')
TEST_DIGITS_DIR = os.path.join(DIGITS_DIR, 'testDigits')

# Numeric code of each textual label. The codes index ``result_list`` in
# classify_preson() as ``code - 1``.
_DATING_LABELS = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}


def _resolve(directory, filename):
    """Join ``directory`` and ``filename``, ignoring leading path separators."""
    # Callers pass names such as '/datingTestSet.txt'. os.path.join would
    # treat that as an absolute path and drop the directory.
    return os.path.join(directory, filename.lstrip('/\\'))


def _label_to_int(label):
    """Translate a textual or numeric class label into its integer code."""
    if label in _DATING_LABELS:
        return _DATING_LABELS[label]
    # Files that already store numeric labels are accepted as well; anything
    # else raises ValueError.
    return int(label)


def _digit_from_filename(filename):
    """Return the integer before the first '_' of a '<digit>_<n>.txt' name."""
    return int(filename.split('.')[0].split('_')[0])


def create_dataSet():
    """Return a four-point toy data set.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4 x 2 array and
        ``labels`` the list of the four class labels.
    """
    group = np.array([[1, 1.1], [1, 1], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(in_x, data_set, labels, k):
    """Classify ``in_x`` by majority vote of its ``k`` nearest neighbours.

    Args:
        in_x: feature vector to classify.
        data_set: 2-D array with one training sample per row.
        labels: label of every row of ``data_set``.
        k: number of neighbours that vote; values larger than the number of
            samples use every sample.

    Returns:
        The most frequent label among the nearest neighbours. On a tie the
        label met first (the one with the closest neighbour) is returned.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``data_set`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if data_set.shape[0] == 0:
        raise ValueError("data_set must contain at least one sample")

    # Broadcasting subtracts in_x from every row.
    diff_mat = np.asarray(in_x) - data_set
    dist = (diff_mat ** 2).sum(axis=1) ** 0.5
    sorted_dist_index = dist.argsort()

    class_count = {}
    for index in sorted_dist_index[:k]:
        votelabel = labels[index]
        class_count[votelabel] = class_count.get(votelabel, 0) + 1
    # max() keeps the first maximal item, i.e. the label that was seen first.
    return max(class_count.items(), key=operator.itemgetter(1))[0]


# dating data
def file2matrix(filename, data_dir=DATA_DIR):
    """Load a tab-separated dating data file.

    Every non-blank line holds three numeric features followed by a class
    label in the last column.

    Args:
        filename: file name, optionally with a leading path separator.
        data_dir: directory that contains the file.

    Returns:
        tuple: ``(return_mat, class_label)``: an ``n x 3`` float array with
        the features and a list with the integer label of every row
        (didntLike = 1, smallDoses = 2, largeDoses = 3; numeric labels are
        taken as they are).

    Raises:
        ValueError: if a feature or a label cannot be converted.
    """
    with open(_resolve(data_dir, filename)) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    return_mat = np.zeros((len(rows), 3))
    class_label = []
    for index, fields in enumerate(rows):
        return_mat[index, :] = [float(value) for value in fields[0:3]]
        class_label.append(_label_to_int(fields[-1]))
    return return_mat, class_label


# normalise the feature values
def auto_norm(data_set):
    """Rescale every column of ``data_set`` with ``(x - min) / (max - min)``.

    Args:
        data_set: 2-D numeric array with one sample per row.

    Returns:
        tuple: ``(norm_data_set, ranges, min_val)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    min_val = data_set.min(0)  # column-wise minimum
    max_val = data_set.max(0)
    ranges = max_val - min_val
    norm_data_set = (data_set - min_val) / ranges
    return norm_data_set, ranges, min_val


# hold-out test of the classifier on the dating data
def dating_class_test():
    """Print the hold-out error rate of classify0() on the dating data.

    The first 10% of the rows of ``datingTestSet.txt`` are classified against
    the remaining rows with ``k = 3``. One line is printed per prediction,
    followed by the total error rate.
    """
    h = 0.1  # fraction of the rows used as test samples
    dating_data_mat, dating_labels = file2matrix('/datingTestSet.txt')
    norm_mat, ranges, min_val = auto_norm(dating_data_mat)
    m = norm_mat.shape[0]
    num_test_vect = int(m*h)
    error_count = 0.0
    for i in range(num_test_vect):
        # Query and training rows both come from the normalised matrix so
        # that they share one scale.
        classify_result = classify0(norm_mat[i, :],
                                    norm_mat[num_test_vect:, :],
                                    dating_labels[num_test_vect:], 3)
        print("the classifier come back with: %d, the real answer is : %d"
              % (int(classify_result), int(dating_labels[i])))
        if classify_result != dating_labels[i]:
            error_count += 1
    print("the total error rate is: %.2f%%"
          % (error_count*100/float(num_test_vect)))


def classify_preson():
    """Ask for three feature values and print the predicted liking level.

    The values are read from standard input, rescaled with the minimum and
    range of the data in ``datingTestSet.txt`` and classified with ``k = 3``.
    """
    result_list = ['not at all', 'in small does', 'in large does']
    percent_tats = float(input("percentage of time spent playing video games:"))
    ff_miles = float(input("frequent flier miles earned games:"))
    ice_cream = float(input("liters of ice cream consumed per year:"))
    dating_mat, dating_labels = file2matrix('/datingTestSet.txt')
    norm_mat, ranges, min_val = auto_norm(dating_mat)
    # NOTE(review): the order of these values must match the column order of
    # the data file, which is not visible here -- confirm.
    in_x = np.array((percent_tats, ff_miles, ice_cream))
    # The query must be rescaled exactly like the training rows.
    classifier_result = classify0((in_x - min_val) / ranges, norm_mat,
                                  dating_labels, 3)
    print("you will probably like this person:",
          result_list[classifier_result - 1])


# load a handwritten-digit file
def img2vector(filename, dirpath=TRAINING_DIGITS_DIR):
    """Read a text image of digit characters into a flat vector.

    Args:
        filename: file name, optionally with a leading path separator.
        dirpath: directory that contains the file; defaults to the training
            digits directory.

    Returns:
        A 1-D array holding every character of the file converted with
        ``int``, line after line.

    Raises:
        ValueError: if the file contains a character that is not a digit.
    """
    return_vect = []
    with open(_resolve(dirpath, filename)) as fr:
        for line in fr:
            return_vect.extend(int(char) for char in line.strip())
    return np.array(return_vect)


def handwriteing_classtest():
    """Print the error count and error rate of classify0() on the digit files.

    Every file of the training directory becomes one 1024-value row of the
    training matrix, labelled with the digit at the start of its file name.
    Every file of the test directory is then classified with ``k = 3``.
    """
    hw_labels = []
    training_filelist = os.listdir(TRAINING_DIGITS_DIR)
    m = len(training_filelist)
    training_mat = np.zeros((m, 1024))
    for i, filename_str in enumerate(training_filelist):
        hw_labels.append(_digit_from_filename(filename_str))
        training_mat[i, :] = img2vector(filename_str, TRAINING_DIGITS_DIR)

    test_filelist = os.listdir(TEST_DIGITS_DIR)
    error_count = 0.0
    m_t = len(test_filelist)
    for filename_str in test_filelist:
        class_num = _digit_from_filename(filename_str)
        # Test images are read from the test directory, not from the
        # training directory.
        vector_undertest = img2vector(filename_str, TEST_DIGITS_DIR)
        classifierresult = classify0(vector_undertest, training_mat,
                                     hw_labels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierresult, class_num))
        if classifierresult != class_num:
            error_count += 1.0
    print("\nthe total number of errors is: %d" % error_count)
    print("\nthe total error rate is: %f" % (error_count/float(m_t)))


# exploration code
def test():
    """Scatter-plot the toy data set and two columns of the dating data."""
    from matplotlib import pyplot as plt

    group, _ = create_dataSet()
    plt.scatter(group[:, 0], group[:, 1])
    datingDataMat, datingLabels = file2matrix('/datingTestSet2.txt')
    plt.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    # Colour and size of every marker follow the numeric class label.
    plt.scatter(x=datingDataMat[:, 1], y=datingDataMat[:, 2],
                c=np.array(datingLabels), s=np.array(datingLabels))
    plt.show()


# The name matches pytest's default test-function prefix; this flag keeps the
# plotting helper from being collected as a test.
test.__test__ = False