Python 中 scikit-learn 机器学习代码实例
我们给大家带来了 Python 中 scikit-learn 机器学习代码的具体实例,以下是全部代码内容:
#-*-coding:utf-8-*- importnumpy fromsklearnimportmetrics fromsklearn.svmimportLinearSVC fromsklearn.naive_bayesimportMultinomialNB fromsklearnimportlinear_model fromsklearn.datasetsimportload_iris fromsklearn.cross_validationimporttrain_test_split fromsklearn.preprocessingimportOneHotEncoder,StandardScaler fromsklearnimportcross_validation fromsklearnimportpreprocessing #importiris_data defload_data(): iris=load_iris() x,y=iris.data,iris.target x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42) returnx_train,y_train,x_test,y_test deftrain_clf3(train_data,train_tags): clf=LinearSVC(C=1100.0)#defaultwith'rbf' clf.fit(train_data,train_tags) returnclf deftrain_clf(train_data,train_tags): clf=MultinomialNB(alpha=0.01) printnumpy.asarray(train_tags) clf.fit(train_data,numpy.asarray(train_tags)) returnclf defevaluate(actual,pred): m_precision=metrics.precision_score(actual,pred) m_recall=metrics.recall_score(actual,pred) print'precision:{0:.3f}'.format(m_precision) print'recall:{0:0.3f}'.format(m_recall) print'f1-score:{0:.8f}'.format(metrics.f1_score(actual,pred)); x_train,y_train,x_test,y_test=load_data() clf=train_clf(x_train,y_train) pred=clf.predict(x_test) evaluate(numpy.asarray(y_test),pred) printmetrics.classification_report(y_test,pred) 使用自定义数据 #coding:utf-8 importnumpy fromsklearnimportmetrics fromsklearn.feature_extraction.textimportHashingVectorizer fromsklearn.feature_extraction.textimportTfidfVectorizer fromsklearn.naive_bayesimportMultinomialNB fromsklearn.feature_extraction.textimportCountVectorizer,TfidfTransformer fromsklearn.neighborsimportKNeighborsClassifier fromsklearn.svmimportSVC fromsklearn.svmimportLinearSVC importcodecs fromsklearn.ensembleimportRandomForestClassifier fromsklearnimportcross_validation fromsklearnimportlinear_model train_corpus=[ '我们我们好孩子认证。就是', '我们好孩子认证。中国', '我们好孩子认证。孤独', '我们好孩子认证。', ] test_corpus=[ '我菲律宾韩国', '我们好孩子认证。中国', ] definput_data(train_file,test_file): train_words=[] train_tags=[] 
test_words=[] test_tags=[] f1=codecs.open(train_file,'r','utf-8','ignore') forlineinf1: tks=line.split(':',1) word_list=tks[1] word_array=word_list[1:(len(word_list)-3)].split(",") train_words.append("".join(word_array)) train_tags.append(tks[0]) f2=codecs.open(test_file,'r','utf-8','ignore') forlineinf2: tks=line.split(':',1) word_list=tks[1] word_array=word_list[1:(len(word_list)-3)].split(",") test_words.append("".join(word_array)) test_tags.append(tks[0]) returntrain_words,train_tags,test_words,test_tags defvectorize(train_words,test_words): #v=HashingVectorizer(n_features=25000,non_negative=True) v=HashingVectorizer(non_negative=True) #v=CountVectorizer(min_df=1) train_data=v.fit_transform(train_words) test_data=v.fit_transform(test_words) returntrain_data,test_data defvectorize1(train_words,test_words): tv=TfidfVectorizer(sublinear_tf=False,use_idf=True); train_data=tv.fit_transform(train_words); tv2=TfidfVectorizer(vocabulary=tv.vocabulary_); test_data=tv2.fit_transform(test_words); returntrain_data,test_data defvectorize2(train_words,test_words): count_v1=CountVectorizer(stop_words='english',max_df=0.5); counts_train=count_v1.fit_transform(train_words); count_v2=CountVectorizer(vocabulary=count_v1.vocabulary_); counts_test=count_v2.fit_transform(test_words); tfidftransformer=TfidfTransformer(); train_data=tfidftransformer.fit(counts_train).transform(counts_train); test_data=tfidftransformer.fit(counts_test).transform(counts_test); returntrain_data,test_data defevaluate(actual,pred): m_precision=metrics.precision_score(actual,pred) m_recall=metrics.recall_score(actual,pred) print'precision:{0:.3f}'.format(m_precision) print'recall:{0:0.3f}'.format(m_recall) print'f1-score:{0:.8f}'.format(metrics.f1_score(actual,pred)); deftrain_clf(train_data,train_tags): clf=MultinomialNB(alpha=0.01) clf.fit(train_data,numpy.asarray(train_tags)) returnclf deftrain_clf1(train_data,train_tags): #KNNClassifier clf=KNeighborsClassifier()#defaultwithk=5 
clf.fit(train_data,numpy.asarray(train_tags)) returnclf deftrain_clf2(train_data,train_tags): clf=linear_model.LogisticRegression(C=1e5) clf.fit(train_data,train_tags) returnclf deftrain_clf3(train_data,train_tags): clf=LinearSVC(C=1100.0)#defaultwith'rbf' clf.fit(train_data,train_tags) returnclf deftrain_clf4(train_data,train_tags): """ 随机森林,不可使用稀疏矩阵 """ clf=RandomForestClassifier(n_estimators=10) clf.fit(train_data.todense(),train_tags) returnclf #使用codecs逐行读取 defcodecs_read_label_line(filename): label_list=[] f=codecs.open(filename,'r','utf-8','ignore') line=f.readline() whileline: #label_list.append(line[0:len(line)-2]) label_list.append(line[0:len(line)-1]) line=f.readline() f.close() returnlabel_list defsave_test_features(test_url,test_label): test_feature_list=codecs_read_label_line('test.dat') fw=open('test_labeded.dat',"w+") for(url,label)inzip(test_feature_list,test_label): fw.write(url+'\t'+label) fw.write('\n') fw.close() defmain(): train_file=u'..\\file\\py_train.txt' test_file=u'..\\file\\py_test.txt' train_words,train_tags,test_words,test_tags=input_data(train_file,test_file) #printlen(train_words),len(train_tags),len(test_words),len(test_words), train_data,test_data=vectorize1(train_words,test_words) printtype(train_data) printtrain_data.shape printtest_data.shape printtest_data[0].shape printnumpy.asarray(test_data[0]) clf=train_clf3(train_data,train_tags) scores=cross_validation.cross_val_score( clf,train_data,train_tags,cv=5,scoring="f1_weighted") printscores #predicted=cross_validation.cross_val_predict(clf,train_data,train_tags,cv=5) ''' ''' pred=clf.predict(test_data) error_list=[] for(true_tag,predict_tag)inzip(test_tags,pred): iftrue_tag!=predict_tag: printtrue_tag,predict_tag error_list.append(true_tag+''+predict_tag) printlen(error_list) evaluate(numpy.asarray(test_tags),pred) ''' #输出打标签结果 test_feature_list=codecs_read_label_line('test.dat') save_test_features(test_feature_list,pred) ''' if__name__=='__main__': main()