Implementing the naive Bayes algorithm in Python
This code implements a naive Bayes classifier (the version that assumes the features are conditionally independent given the class), which is commonly used for spam filtering, and applies Laplace smoothing.
For the theory behind naive Bayes, see the posts in the theory section of this blog.
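As a quick refresher: given the words $w_1, \dots, w_n$ of a document, naive Bayes picks the class $c$ that maximizes

$$P(c \mid w_1, \dots, w_n) \propto P(c) \prod_{i=1}^{n} P(w_i \mid c),$$

where the "naive" part is assuming the words are conditionally independent given the class. With the smoothing used in trainNB0 below, each per-word probability is estimated as

$$P(w_i \mid c) = \frac{\mathrm{count}(w_i, c) + 1}{\text{total word count in class } c + 2},$$

so a word never seen in a class gets a small nonzero probability instead of zeroing out the whole product.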
#!/usr/bin/python
# -*- coding: utf-8 -*-
from numpy import *
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = not abusive
    return postingList, classVec
def createVocabList(dataSet):
    vocabSet = set([])  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
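A quick sanity check of setOfWords2Vec with a hand-made vocabulary (a hypothetical example; in practice the vector's order follows whatever createVocabList returns):

vocab = ['dog', 'stupid', 'my']
print setOfWords2Vec(vocab, ['my', 'dog', 'cat'])
# prints the warning for 'cat', then [1, 0, 1]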
def trainNB0(trainMatrix, trainCategory):  # train the model
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)  # Laplace smoothing: start every word count at 1
    p0Denom = 2.0; p1Denom = 2.0                    # Laplace smoothing: add 2 to each denominator
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # use log() to avoid floating-point underflow when multiplying probabilities
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
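To see why trainNB0 returns log-probabilities: the product of many small per-word probabilities underflows to 0.0 in floating point, while the equivalent sum of logs stays well-behaved. A minimal illustration (my addition, not part of the original listing):

p = 1.0
for i in range(400):
    p *= 0.01          # product of 400 small probabilities
print p                # 0.0 -- underflowed
print 400 * log(0.01)  # about -1842.1 -- the log-space sum is fine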
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # log P(c) plus the sum of log P(w_i|c) over the words present in the document
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
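The only difference from setOfWords2Vec is += 1 instead of = 1: repeated words are counted (a bag-of-words, multinomial-style model) rather than merely flagged. With the same hypothetical vocabulary as above:

vocab = ['dog', 'stupid', 'my']
print setOfWords2Vec(vocab, ['dog', 'dog', 'stupid'])    # [1, 1, 0]
print bagOfWords2VecMN(vocab, ['dog', 'dog', 'stupid'])  # [2, 1, 0]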
def testingNB():  # test the trained model on two hand-made posts
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)
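Since the toy training data is fixed, testingNB() always produces the same two labels: ['love', 'my', 'dalmation'] classified as: 0 and ['stupid', 'garbage'] classified as: 1 (only the vocabulary order varies between runs).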
def textParse(bigString):  # split a long string into a list of lowercase tokens
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
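For example, textParse turns a raw string into lowercase tokens, dropping anything of length 2 or less (so 'is', 'to', 'M', 'L' all disappear):

print textParse('This book is the best book on Python or M.L. I have ever laid eyes upon.')
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']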
def spamTest():  # cross-validate on the spam corpus (needs the email/ data files)
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = range(50); testSet = []
    for i in range(10):  # hold out 10 random documents for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error", docList[docIndex]
    print 'the error rate is:', float(errorCount) / len(testSet)
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print myVocabList, '\n'
#print setOfWords2Vec(myVocabList, listOPosts[0]), '\n'
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
print trainMat
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print pAb
print p0V, '\n', p1V
testingNB()
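Note that the driver code above only exercises the toy data. spamTest() additionally needs the corpus from Machine Learning in Action: 25 spam messages in email/spam/1.txt through email/spam/25.txt and 25 ham messages in email/ham/1.txt through email/ham/25.txt, relative to the working directory. A minimal guard before calling it (my addition, assuming that layout):

import os
if os.path.isdir('email/spam') and os.path.isdir('email/ham'):
    spamTest()
else:
    print "email/ data not found; skipping spamTest()"

Because the ten test documents are drawn at random, the reported error rate varies from run to run; averaging over several calls gives a steadier estimate.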
That concludes this article. I hope it helps with your studies, and thanks for supporting 毛票票.