Tags: multi-class classification

Share: Multi-class local model training___LR
xulimei1986 2017-10-12 10:57
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 22 17:12:38 2017
@author: g6591
"""
## Import the required libraries
import random
import re
import time
import pickle
import sys
from collections import Counter

import numpy as np
import jieba
import jieba.analyse
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

jieba.load_userdict('g37_ciku.txt')

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: default str<->unicode conversions to UTF-8

# Drop records whose content consists entirely of letters
def data_pro1(path1, path2):
    with open(path1) as f1, open(path2, 'w+') as f2:
        for i in f1:
            i = i.replace('"', '')
            i = i.strip().split('\t')
            if len(i) <= 1:          # need at least content + label
                continue
            content = i[0]
            type1 = i[1]
            if len(content) > 10:
                # Keep only the non-letter characters; if nothing is left, the
                # record is all letters and is skipped.  (The original pattern
                # was lost in extraction; u'[^a-zA-Z]' is an assumption
                # consistent with the function's purpose.)
                regex1 = re.compile(u'[^a-zA-Z]')
                aa = u''.join(regex1.findall(content.decode('utf8'))).encode('utf8')
                if len(aa) < 1:
                    continue
            f2.write('%s\t%s\n' % (content, type1))
    return 'Data beginning processing is OK!'

## Stopword-removal helper (the original body was lost; this is the obvious reconstruction)
def qu_stopword(words, stopwords):
    return [w for w in words if w not in stopwords]

## Build and return the low-frequency vocabulary together with the stopword list
def deal_train_data(path2, dipin_path):
    ## Load the stopwords (the file name was lost; 'stopwords.txt' is a placeholder)
    stopkeys = [line.strip().decode('utf8') for line in open('stopwords.txt')]
    ## Collect the low-frequency vocabulary
    f2 = open(path2, "r")
    # The original punctuation list was lost; a minimal stand-in:
    punctuation = [u',', u'。', u'!', u'?', u'、', u':', u';', u'“', u'”']
    # Characters to keep (the original alternation of character classes was
    # lost; a single class covering CJK plus alphanumerics is an assumption)
    regex = re.compile(u'[\u4e00-\u9fa5a-zA-Z0-9]')
    word_list = []
    word_set = set()
    word_dipin = set()
    for line in f2.readlines():
        try:
            lines = line.strip().split('\t')
            content = lines[0].decode('utf8')
            if len(content) > 10:
                # Skip long records that are made up entirely of letters
                aa = u''.join(re.compile(u'[^a-zA-Z]').findall(content))
                if len(aa) < 1:
                    continue
            elif len(content) < 1:
                continue
            content = ''.join(regex.findall(content))
            if len(content) < 1:
                continue
            fenci_list = jieba.cut_for_search(content)
            fenci_list1 = qu_stopword(fenci_list, stopkeys)
            fenci_list2 = qu_stopword(fenci_list1, punctuation)
            for k in fenci_list2:
                word_list.append(k)
                word_set.add(k)
        except:
            print line
    f2.close()
    f3 = open(dipin_path, "w")
    word_count = Counter(word_list)
    for k in word_set:
        if word_count[k] == 1:       # words that occur exactly once
            word_dipin.add(k)
            f3.write(k + "\n")
    f3.close()
    print "OK!"
    return word_dipin, stopkeys

def rebuild_trainset(path1, path2, word_dipin, stopkeys):
    f2 = open(path1, "r")
    f3 = open(path2, "w")
    punctuation = [u',', u'。', u'!', u'?', u'、', u':', u';', u'“', u'”']  # same stand-in as above
    regex = re.compile(u'[\u4e00-\u9fa5a-zA-Z0-9]')  # same assumption as above
    for line in f2.readlines():
        if len(line) >= 1:
            try:
                lines = line.strip().split('\t')
                content = lines[0].replace("\t", "").decode('utf8')
                type1 = lines[1]
                content = ''.join(regex.findall(content))
                if len(content) < 1:
                    continue
                fenci_list = jieba.cut_for_search(content)
                fenci_list1 = qu_stopword(fenci_list, word_dipin)    ## drop low-frequency words
                fenci_list2 = qu_stopword(fenci_list1, punctuation)  ## drop punctuation
                fenci_list3 = qu_stopword(fenci_list2, stopkeys)     ## drop stopwords
                if len(fenci_list3) != 0:    ## keep only non-empty results
                    fenci_list4 = " ".join(fenci_list3)
                    ss = fenci_list4 + "\t" + content + "\t" + type1
                    f3.write(ss + "\n")
            except:
                print 'False!'
    f2.close()
    f3.close()
    return 'Data processing is OK!'

## Split into training and test sets at a 7:3 ratio.
## train_pro1 reads the segmented file: column 0 = tokens, 1 = raw text, 2 = label
def train_pro1(path2):
    m = 0
    texts_content1 = []
    texts_content2 = []
    texts_type1 = []
    for line in open(path2):
        try:
            fields = line.strip().split("\t")
            content1 = fields[0]
            content2 = fields[1]
            type1 = fields[2]
            texts_content1.append(content1)
            texts_content2.append(content2)
            texts_type1.append(type1)
        except:
            m += 1          # count malformed lines
    print "OK!"
    return texts_content1, texts_content2, texts_type1
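# A quick illustration of what the segmentation pipeline above produces:
# jieba.cut_for_search tokenises, and qu_stopword filters the result.  The
# sample sentence and two-word stopword list below are invented for this
# demo; call demo_segment() manually to try it.
def demo_segment():
    words = jieba.cut_for_search(u'这个模型的训练速度很快')   # made-up sample sentence
    print ' '.join(qu_stopword(words, [u'的', u'了']))        # space-joined tokens, as written to base_fenci.txt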
def train_pro2(path2):
    texts_content1, texts_content2, texts_type1 = train_pro1(path2)
    # 7:3 training/test split by random index sampling
    sample_num = int(len(texts_type1) * 0.3)
    test_index = random.sample(range(len(texts_type1)), sample_num)
    train_type1 = []
    train_content1 = []
    train_content2 = []
    test_type1 = []
    test_content1 = []
    test_content2 = []
    for k in range(len(texts_type1)):
        if k in test_index:
            test_type1.append(texts_type1[k])
            test_content1.append(texts_content1[k])
            test_content2.append(texts_content2[k])
        else:
            train_type1.append(texts_type1[k])
            train_content1.append(texts_content1[k])
            train_content2.append(texts_content2[k])
    return train_type1, train_content1, train_content2, test_type1, test_content1, test_content2

def train_pro3(path3, path4, path5):
    train_type1, train_content1, train_content2, test_type1, test_content1, test_content2 = train_pro2(path3)
    print "step 1 is OK!"
    with open(path4, 'w+') as f1, open(path5, 'w+') as f2:
        for i in range(len(train_type1)):
            ss = train_type1[i] + '\t' + train_content1[i] + '\t' + train_content2[i]
            f1.write('%s\n' % ss)
        for j in range(len(test_type1)):
            mm = test_type1[j] + '\t' + test_content1[j] + '\t' + test_content2[j]
            f2.write('%s\n' % mm)

## Load the split files and map the category names to integer labels
def train_pro4(path3, path4):
    with open(path3, 'r') as f1, open(path4, 'r') as f2:
        train_Data = []
        train_content = []
        train_stype = []
        test_Data = []
        test_content = []
        test_stype = []
        for line in f1:
            line = line.strip().split('\t')
            stype = line[0]
            content1 = line[1]
            content2 = line[2]
            if stype == "辱骂类":        # abusive
                type1 = 1
            elif stype == "广告":        # advertising
                type1 = 2
            elif stype == "政治":        # political
                type1 = 3
            elif stype == "非辱骂类":    # non-abusive
                type1 = 4
            else:
                continue
            train_Data.append(content1)
            train_content.append(content2)
            train_stype.append(type1)
        for line in f2:
            line = line.strip().split('\t')
            stype = line[0]
            content1 = line[1]
            content2 = line[2]
            if stype == "辱骂类":
                type1 = 1
            elif stype == "广告":
                type1 = 2
            elif stype == "政治":
                type1 = 3
            elif stype == "非辱骂类":
                type1 = 4
            else:
                continue
            test_Data.append(content1)
            test_content.append(content2)
            test_stype.append(type1)
    return train_Data, train_content, train_stype, test_Data, test_content, test_stype

## Train the model
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model
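# The vectorize-then-fit pattern used by train_model() below, shown on toy
# data: CountVectorizer turns the space-joined token strings into a sparse
# bag-of-words matrix, and LogisticRegression is fit on that matrix.  The
# four documents and their label ids are invented for this demo; call
# demo_vectorize_fit() manually to try it.
def demo_vectorize_fit():
    docs = ['cheap ads here', 'hello my friend', 'buy cheap ads now', 'nice to meet you']
    labels = [2, 4, 2, 4]                    # 2 = advertising, 4 = non-abusive
    vec = CountVectorizer()
    x = vec.fit_transform(docs)              # sparse token-count matrix
    model = logistic_regression_classifier(x, labels)
    print model.predict(vec.transform(['cheap ads']))   # most likely [2]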
def train_model(path1, path2):
    ## Train the classifier and persist it to disk
    print('reading training and testing data...')
    # Prepare the training and test sets
    trainData_s, train_content_s, trainLabel_s, testData_s, test_content_s, testLabel_s = train_pro4(path1, path2)
    vectorizer = CountVectorizer()
    train_sx = vectorizer.fit_transform(trainData_s)
    train_sy = np.array(trainLabel_s)
    test_sx = vectorizer.transform(testData_s)
    test_sy = np.array(testLabel_s)
    print "Model training: ----LR--------------"
    start_time = time.time()
    model_s = logistic_regression_classifier(train_sx, train_sy)
    pickle.dump(model_s, open('classifier.pkl', 'w'))
    print('training took %fs!' % (time.time() - start_time))
    predict_s = model_s.predict(test_sx)
    # Multi-class labels need an explicit averaging mode here; 'weighted'
    # mirrors the old scikit-learn default for multi-class targets
    precision_s = metrics.precision_score(test_sy, predict_s, average='weighted')
    recall_s = metrics.recall_score(test_sy, predict_s, average='weighted')
    print('precision: %.2f%%, recall: %.2f%%' % (100 * precision_s, 100 * recall_s))
    accuracy = metrics.accuracy_score(test_sy, predict_s)
    print('accuracy: %.2f%%' % (100 * accuracy))
    pre_s = predict_s.tolist()
    file1 = open('lr_result.txt', 'w+')
    for i in range(len(pre_s)):
        mm = testData_s[i] + '\t' + test_content_s[i] + '\t' + str(testLabel_s[i]) + '\t' + str(pre_s[i]) + '\n'
        file1.write('%s' % mm)
    file1.close()

########################################### main ###########################################
if __name__ == '__main__':
    path1 = 'base_data.txt'        # raw data
    path2 = 'base_data1.txt'       # after the all-letter filtering
    path3 = 'base_fenci.txt'       # after word segmentation
    path4 = 'train_data.txt'       # training split
    path5 = 'test_data.txt'        # test split
    dipin_path = 'dipin_word.txt'  # low-frequency words

    print 'step1: filter out all-letter records'
    aa = data_pro1(path1, path2)
    print 'step2: segment the text; drop stopwords, low-frequency words, etc.'
    word_dipin, stopkeys = deal_train_data(path2, dipin_path)
    bb = rebuild_trainset(path2, path3, word_dipin, stopkeys)
    print bb
    print 'step3: split into training and test sets'
    train_pro3(path3, path4, path5)
    print 'step4: train and test the model'
    train_model(path4, path5)
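One caveat if you want to reuse the trained model on new data: train_model() pickles only the LogisticRegression object, but scoring new text later also needs the fitted CountVectorizer, whose vocabulary defines the feature columns. A minimal sketch of persisting and reloading both (the file name vectorizer.pkl and the sample input are invented for this example; the dump line assumes it runs inside train_model, where vectorizer is in scope):

import pickle

pickle.dump(vectorizer, open('vectorizer.pkl', 'w'))    # save alongside classifier.pkl

model = pickle.load(open('classifier.pkl', 'r'))
vec = pickle.load(open('vectorizer.pkl', 'r'))
new_x = vec.transform(['some segmented text'])          # space-joined tokens, as in base_fenci.txt
print model.predict(new_x)                              # predicted label id (1-4)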