python提取内容关键词的方法
本文实例讲述了python提取内容关键词的方法。分享给大家供大家参考。具体分析如下:
这是一段非常高效的提取内容关键词的 Python 代码。这段代码只能用于英文文章内容;中文因为需要先分词,这段代码无法直接处理,不过只要加上分词功能,效果和英文是一样的。
# coding=UTF-8
import nltk
from nltk.corpus import brown

# This is a fast and simple noun phrase extractor (based on NLTK).
# Feel free to use it, just keep a link back to this post:
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# Created by Shlomi Babluki
# May, 2013
#
# NOTE: running this module needs the NLTK "brown" corpus and the tokenizer
# models used by nltk.word_tokenize to be installed (see nltk.download()).


#############################################################################
# This is our fast Part of Speech tagger.
#
# A bigram tagger trained on the Brown "news" category backs off to a unigram
# tagger, which in turn backs off to a handful of regular-expression rules.
# The final catch-all rule ('.*' -> 'NN') means every token receives a tag.
#############################################################################
brown_train = brown.tagged_sents(categories='news')
regexp_tagger = nltk.RegexpTagger([
    # Cardinal numbers; the dot is escaped so it only matches a literal
    # decimal point (an unescaped '.' would also accept e.g. "1x5").
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'(-|:|;)$', ':'),
    (r'\'*$', 'MD'),
    (r'(The|the|A|a|An|an)$', 'AT'),
    (r'.*able$', 'JJ'),
    (r'^[A-Z].*$', 'NNP'),
    (r'.*ness$', 'NN'),
    (r'.*ly$', 'RB'),
    (r'.*s$', 'NNS'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*', 'NN'),
])
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)


#############################################################################
# This is our semi-CFG; extend it according to your own needs.
#
# Keys are "<tag of left token>+<tag of right token>"; the value is the tag
# given to the token produced by merging the two neighbours.
#############################################################################
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}


class NPExtractor(object):
    """Extract the main topics (noun phrases) of a single English sentence."""

    def __init__(self, sentence):
        """Store the sentence that extract() will analyse."""
        self.sentence = sentence

    def tokenize_sentence(self, sentence):
        """Split the sentence into single words/tokens."""
        return nltk.word_tokenize(sentence)

    def normalize_tags(self, tagged):
        """Normalize Brown corpus tags, e.g. "NN-TL" and "NNS" become "NN".

        `tagged` is a list of (word, tag) pairs; a new list of (word, tag)
        pairs is returned and the input list is left untouched.
        """
        n_tagged = []
        for word, tag in tagged:
            if tag == "NP-TL" or tag == "NP":
                # Brown marks proper nouns as NP; the grammar uses NNP.
                tag = "NNP"
            elif tag.endswith("-TL"):
                # Drop the "-TL" suffix.
                tag = tag[:-3]
            elif tag.endswith("S"):
                # Drop the trailing "S" so "NNS" is treated like "NN".
                tag = tag[:-1]
            n_tagged.append((word, tag))
        return n_tagged

    def extract(self):
        """Return the list of main topics found in self.sentence.

        Adjacent tokens whose tag pair appears in `cfg` are merged repeatedly
        until no rule applies; the tokens that end up tagged "NNP" or "NNI"
        are returned as strings, in sentence order.
        """
        tokens = self.tokenize_sentence(self.sentence)
        tags = self.normalize_tags(bigram_tagger.tag(tokens))

        merged = True
        while merged:
            merged = False
            for i in range(len(tags) - 1):
                first, second = tags[i], tags[i + 1]
                new_tag = cfg.get("%s+%s" % (first[1], second[1]), '')
                if new_tag:
                    # Replace the pair with one merged token, keeping the
                    # words separated by a space, then rescan from the
                    # start because the list was modified.
                    phrase = "%s %s" % (first[0], second[0])
                    tags[i:i + 2] = [(phrase, new_tag)]
                    merged = True
                    break

        # NOTE: add "NN" to the accepted tags to also report single nouns.
        return [word for word, tag in tags if tag == "NNP" or tag == "NNI"]


def main():
    """Main method, just run "python np_extractor.py"."""
    sentence = ("Swayy is a beautiful new dashboard for discovering and "
                "curating online content.")
    np_extractor = NPExtractor(sentence)
    result = np_extractor.extract()
    # A parenthesised single argument prints the same on Python 2 and 3.
    print("This sentence is about: %s" % ", ".join(result))


if __name__ == '__main__':
    main()
希望本文所述对大家的Python程序设计有所帮助。