最近在自学机器学习,应导师要求,先把《Machine Learning with R》动手刷了一遍,感觉R真不能算是一门计算机语言,感觉也就是一个功能复杂的计算器。所以这次就决定使用经典教材《Machine Learning in action》。因为开学得换work station ,怕到时候代码又丢了,所以就索性开个博客,把代码上传上来。
因为书上的原代码有很多错误,并且网上的许多博客的代码也是没有改正的,这次我把修正过的代码po上来
edition:python3.5
Talk is cheap. Show me the code.
函数定义代码
from numpy import *
def loadDataSet():
    """Return the toy message-board corpus and its labels.

    Returns:
        postingList: six tokenized posts (lists of word strings).
        classVec: parallel labels, 1 = abusive post, 0 = normal post.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocabList(dataSet):
    """Build the vocabulary: every unique word across all documents.

    Args:
        dataSet: iterable of documents, each a list/iterable of word strings.

    Returns:
        list of the unique words (ordering unspecified, as before —
        set iteration order is arbitrary).
    """
    # set().union(*docs) folds all documents into one set in a single
    # C-level call, replacing the original manual loop with repeated
    # `vocabSet = vocabSet | set(document)` re-assignment.  Also handles
    # an empty dataSet (returns []).
    return list(set().union(*dataSet))
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; vector positions mirror it.
        inputSet: the document's words.

    Returns:
        list of ints, 1 where the vocabulary word occurs in the document.
        Unknown words are reported on stdout and otherwise ignored.
    """
    indicator = [0 for _ in vocabList]
    for token in inputSet:
        # EAFP: list.index raises ValueError for words outside the
        # vocabulary, mirroring the original `in` membership test.
        try:
            indicator[vocabList.index(token)] = 1
        except ValueError:
            print("the word %s is not in Vocabulary" % token)
    return indicator
def trainNBO(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model with Laplace smoothing.

    Args:
        trainMatrix: 2-D array/list of 0-1 word vectors, one row per document.
        trainCategory: parallel labels, 1 = abusive, anything else = class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log-probability vectors for
        class 0 and class 1, and the prior probability of class 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)
    # Laplace smoothing: word counts start at 1 and totals at 2 so an
    # unseen word never produces a zero probability (and log never gets 0).
    class0Counts = ones(vocabSize)
    class1Counts = ones(vocabSize)
    class0Total = 2.0
    class1Total = 2.0
    for wordVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            class1Counts += wordVec
            class1Total += sum(wordVec)
        else:
            class0Counts += wordVec
            class0Total += sum(wordVec)
    # Log-space probabilities avoid underflow when many are multiplied.
    p1Vect = log(class1Counts / class1Total)
    p0Vect = log(class0Counts / class0Total)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector: 1 if the abusive class is more likely.

    Args:
        vec2Classify: 0/1 word vector (numpy array) for the document.
        p0Vec, p1Vec: per-word log-probability vectors from trainNBO.
        pClass1: prior probability of class 1.

    Returns:
        1 when class 1 is strictly more likely, else 0 (ties go to 0).
    """
    # Log-posterior for each class, up to a shared normalizing constant:
    # sum of the present words' log-probabilities plus the log prior.
    logPosterior1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logPosterior0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if logPosterior1 > logPosterior0 else 0
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: the raw text of one email/message.

    Returns:
        list of lowercased tokens with len > 2 (drops short words like
        'is', 'to' and any punctuation-only fragments).

    Bug fix: the original pattern r'\W*' can match the empty string; on
    Python 3.7+ re.split then splits between every character, shattering
    the text into single letters that the len > 2 filter discards
    entirely.  r'\W+' splits on runs of non-word characters as intended.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate naive Bayes spam filtering on the email/ corpus.

    Reads 25 spam and 25 ham messages from disk, holds out 10 randomly
    chosen documents as a test set, trains on the remaining 40, and
    prints the classification error rate on the held-out set.

    NOTE(review): `random.randint` here is numpy's (upper bound
    exclusive) via the file's `from numpy import *`; with the stdlib
    `random` module the same call would be inclusive and could index
    past the end of trainingSet — confirm the star-import is kept.
    """
    docList = []
    classList = []
    fullText = []
    # Load the corpus: files 1..25 from email/spam and email/ham.
    # The files are GBK-encoded; bytes that still fail to decode are
    # silently skipped (errors='ignore') to avoid UnicodeDecodeError.
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i, encoding='gbk', errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # 1 = spam
        wordList = textParse(open('email/ham/%d.txt' % i, encoding='gbk', errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # 0 = ham
    vocabList = createVocabList(docList)
    # list(...) is required in Python 3: a bare range object does not
    # support item deletion (del below would raise TypeError).
    trainingSet = list(range(50))
    testSet = []
    # Randomly hold out 10 document indices for testing.
    for i in range(10):
        randIndex = int(random.randint(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    # Vectorize the remaining 40 documents and train the model.
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    # Classify each held-out document and count misclassifications.
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount)/len(testSet))
在spamTest()中,主要有以下几个错误: 1. 'range' object doesn't support item deletion —> 这是因为 Python 3 中 range 不返回数组对象,而是返回 range 对象。改正方法:http://blog.csdn.net/dillon2015/article/details/52987792 2. UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence —> 这个具体什么原因,我也是一头雾水,后来查了下,因为原文件是 gbk 格式,所以改成以下形式读取
# Illustrative snippet (duplicated from spamTest): read a GBK-encoded
# email file, skipping undecodable bytes instead of raising
# UnicodeDecodeError, then tokenize it.
wordList = textParse(
    open(
        'email/spam/%d.txt' % i, encoding=
        'gbk', errors=
        'ignore').
    read())
上面代码块只是定义了主要的函数,离运行还差一点。由于书原文中,采用了使用 iPython 命令行的运行方式,但是博主比较懒,所以干脆舍弃掉原来的方式。
废话不多说,直接上代码
实验1
if __name__ == "__main__":
    # Experiment 1: build the vocabulary from the toy corpus and show
    # the 0/1 word vectors for two sample posts.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print(sum(listClasses))
    print(listClasses)
    print(myVocabList)
    firstPostVec = setOfWords2Vec(myVocabList, listOPosts[0])
    fourthPostVec = setOfWords2Vec(myVocabList, listOPosts[3])
    print(firstPostVec)
    print(fourthPostVec)
实验2 :
if __name__ == "__main__":
    # Experiment 2: train the naive Bayes model on the toy corpus and
    # print the per-class log-probability vectors and the class prior.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNBO(trainMat, listClasses)
    print(p0V)
    print(p1V)
    print(pAb)
实验3 :
if __name__ == "__main__":
    # Experiment 3: train on the toy corpus, then classify a new post.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNBO(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
更多请戳github https://github.com/Edgis/Machine-learning-in-action/blob/master/bayes.py