Machine Learning in action –kNN
最近在自学机器学习,应导师要求,先把《Machine Learning with R》动手刷了一遍,感觉R真不能算是一门计算机语言,感觉也就是一个功能复杂的计算器。所以这次就决定使用经典教材《Machine Learning in action》。因为开学得换work station ,怕到时候代码又丢了,所以就索性开个博客,把代码上传上来。
Talk is cheap. Show me the code.
函数定义代码
import operator
import os

import matplotlib
import matplotlib.pyplot as plt
from numpy import *
def classify0(inX, dataset, lables, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, measured between ``inX`` and every row of
    ``dataset``.

    Args:
        inX: Feature vector to classify; anything that broadcasts against
            the rows of ``dataset`` (a length-n sequence or a (1, n) array).
        dataset: Training samples, one sample per row.
        lables: Class labels, indexed in step with the rows of ``dataset``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples are clamped to the number of samples.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first while walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataset`` is not a
            non-empty 2-D array.
    """
    samples = asarray(dataset, dtype=float)
    if samples.ndim != 2 or samples.shape[0] == 0:
        raise ValueError("dataset must be a non-empty 2-D array")
    if k < 1:
        raise ValueError("k must be at least 1")

    dataSetSize = samples.shape[0]
    # Never ask for more voters than there are training samples.
    voters = k if k < dataSetSize else dataSetSize

    # Broadcasting subtracts inX from every row; the sign of the difference
    # is irrelevant because it is squared.
    diffMat = samples - asarray(inX, dtype=float)
    distance = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distance.argsort()

    classCount = {}
    for index in sortedDistIndicies[:voters]:
        votelable = lables[index]
        classCount[votelable] = classCount.get(votelable, 0) + 1

    # sorted() is stable even with reverse=True, so tied labels keep their
    # insertion (nearest-first) order.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
def file2matrix(filename, num_features=3):
    """Parse a tab-separated data file into a feature matrix and labels.

    Each non-blank line holds the feature values followed by an integer
    class label in the last column.

    Args:
        filename: Path of the text file to read.
        num_features: Number of leading columns to store as features
            (defaults to 3, the value the original code hard-coded).

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: a float array with one
        row per data line and ``num_features`` columns, and a list of int
        labels taken from the last column of each line.
    """
    # The context manager closes the file even if parsing fails; blank
    # lines (e.g. a trailing empty line) are skipped instead of crashing.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), num_features))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(v) for v in listFromLine[0:num_features]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a range of zero;
    it is mapped to all zeros instead of producing NaN from 0/0.

    Args:
        dataset: 2-D array-like with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised data,
        the per-column ``max - min`` and the per-column minimum.
    """
    data = asarray(dataset)
    minVals = data.min(0)
    maxVals = data.max(0)
    ranges = maxVals - minVals
    # Divide by 1 where the range is zero; the numerator is zero there too,
    # so the column becomes all zeros. ``ranges`` itself is returned as-is.
    divisors = where(ranges == 0, 1.0, ranges)
    normDataSet = (data - minVals) / divisors
    return normDataSet, ranges, minVals
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a (1, 1024) vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        filename: Path of the text image file.

    Returns:
        A float array of shape (1, 1024).
    """
    side = 32
    returnVect = zeros((1, side * side))
    # The context manager guarantees the file is closed after reading.
    with open(filename) as fr:
        for i in range(side):
            lineStr = fr.readline()
            returnVect[0, side * i:side * (i + 1)] = [
                int(lineStr[j]) for j in range(side)
            ]
    return returnVect
def _digit_label(fileNameStr):
    """Return the int before the first '_' in the file's base name."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def _load_digit_dir(dirname):
    """Load every file in ``dirname`` into a (m, 1024) matrix plus labels."""
    fileList = os.listdir(dirname)
    mat = zeros((len(fileList), 1024))
    labels = []
    for i, fileNameStr in enumerate(fileList):
        labels.append(_digit_label(fileNameStr))
        mat[i, :] = img2vector(os.path.join(dirname, fileNameStr))
    return mat, labels


def handwritingClassTest():
    """Run the kNN classifier over the handwritten-digit test set.

    Every file in ``trainingDigits`` becomes one training row; every file
    in ``testDigits`` is classified with ``classify0`` using k = 3. The
    expected label is the integer before the first underscore of the file
    name. One line is printed per test file, followed by the error count
    and the error rate.

    Returns:
        None; the results are only printed.
    """
    trainingMat, hwlabels = _load_digit_dir('trainingDigits')

    testFileList = os.listdir('testDigits')
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        classNumStr = _digit_label(fileNameStr)
        vectorUnderTest = img2vector(os.path.join('testDigits', fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwlabels, 3)
        print("the classifier came back with :%d ,the real answer is :%d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("\n the total number of error is %d" % errorCount)
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\n the total number rate is :%f" % errorRate)
在撸代码的时候,犯了一个 subtle and deadly 的错误:在 file2matrix 中,误将 readlines() 写成了 readline()。虽然两者只相差一个 s,但含义却有天壤之别:前者读入所有行,后者只读入一行。就这一点差别,让我 debug 了好几个小时,以后还得细心啊。
上面代码块只是定义了主要的函数,离运行还差一点。由于书原文中,采用了使用 iPython 命令行的运行方式,但是博主比较懒,所以干脆舍弃掉原来的方式,直接在代码最后添加
代码块
if __name__ == "__main__":
废话不多说,直接上代码
实验1 -分类
# Experiment 1: classify a single sample with kNN (k = 3).
if __name__ == "__main__":
    # NOTE(review): createDataSet() is not defined in the listing above;
    # presumably it returns a small (dataset, labels) pair -- confirm
    # against the full kNN.py before running.
    dataset, lables = createDataSet()
    inX = [0.1, 0.01]
    className = classify0(inX, dataset, lables, 3)
    print("the class of test sample is %s" % className)
实验2 :file2matrix
# Experiment 2: load the dating data set and print the normalised matrix,
# the per-column ranges and the per-column minimums.
if __name__ == "__main__":
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normData, ranges, minVals = autoNorm(datingDataMat)
    print(normData)
    # '\n' (not the literal text '/n') prints a blank separator.
    print('\n')
    print(ranges)
    print('\n')
    print(minVals)
实验3 :使用Matplotlib创建散点图
# Experiment 3: scatter plot of feature columns 1 and 2 of the dating data;
# marker size and colour are both scaled by the class label.
if __name__ == "__main__":
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    labelScale = 15.0 * array(datingLabels)
    ax.scatter(
        datingDataMat[:, 1],
        datingDataMat[:, 2],
        s=labelScale,
        c=labelScale,
    )
    plt.show()
实验4 :归一化特征值
# Experiment 4: normalise the dating features and print the normalised
# matrix, the per-column ranges and the per-column minimums.
if __name__ == "__main__":
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normData, ranges, minVals = autoNorm(datingDataMat)
    print(normData)
    print('\n')
    print(ranges)
    print('\n')
    print(minVals)
实验5 :手写识别系统
# Experiment 5: run the handwritten-digit recognition test.
if __name__ == "__main__":
    handwritingClassTest()
更多请戳github https://github.com/Edgis/Machine-learning-in-action/blob/master/kNN.py