Source: http://www.cnblogs.com/hrlnw/p/3850459.html
#include <cv.h>      // OpenCV general include file
#include <ml.h>      // OpenCV machine learning include file
#include <stdio.h>
#include <iostream>  // for std::cout (used in main)
#include <math.h>    // for fabs
#include <float.h>   // for FLT_EPSILON
using namespace cv; // OpenCV API is in the C++ "cv" namespace
/******************************************************************************/
// global definitions (for speed and ease of use)
// handwritten digit recognition (MNIST)
#define NUMBER_OF_TRAINING_SAMPLES 60000
#define ATTRIBUTES_PER_SAMPLE 784   // 28 x 28 pixels
#define NUMBER_OF_TESTING_SAMPLES 10000
#define NUMBER_OF_CLASSES 10
// N.B. classes are integer handwritten digits in range 0-9
/******************************************************************************/
// loads the sample database from file (MNIST binary IDX format, not CSV,
// despite the function name below)
inline void revertInt(int& x)
{
    // MNIST headers are stored big-endian; swap the byte order
    x = ((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8) |
        ((x & 0x00ff0000) >> 8)  | ((x & 0xff000000) >> 24);
}
int read_data_from_csv(const char* samplePath, const char* labelPath,
                       Mat data, Mat classes, int n_samples)
{
    // N.B. Mat headers share their underlying data, so writes through
    // 'data' and 'classes' here are visible to the caller
    FILE* sampleFile = fopen(samplePath, "rb");
    FILE* labelFile = fopen(labelPath, "rb");
    if (!sampleFile || !labelFile)
    {
        printf("ERROR: cannot open %s or %s\n", samplePath, labelPath);
        return 0; // failure
    }
    int mbs = 0, number = 0, col = 0, row = 0;
    // IDX image file header: magic number, sample count, rows, cols
    fread(&mbs, 4, 1, sampleFile);
    fread(&number, 4, 1, sampleFile);
    fread(&row, 4, 1, sampleFile);
    fread(&col, 4, 1, sampleFile);
    revertInt(mbs);
    revertInt(number);
    revertInt(row);
    revertInt(col);
    // IDX label file header: magic number, label count
    fread(&mbs, 4, 1, labelFile);
    fread(&number, 4, 1, labelFile);
    revertInt(mbs);
    revertInt(number);
    unsigned char temp;
    for (int line = 0; line < n_samples; line++)
    {
        // for each sample: read its pixels, then its label
        for (int attribute = 0; attribute < (ATTRIBUTES_PER_SAMPLE + 1); attribute++)
        {
            if (attribute < ATTRIBUTES_PER_SAMPLE)
            {
                // the first 784 elements (0-783) are the pixel attributes
                fread(&temp, 1, 1, sampleFile);
                data.at<float>(line, attribute) = static_cast<float>(temp);
            }
            else // attribute == ATTRIBUTES_PER_SAMPLE
            {
                // the final element is the class label {0 ... 9}
                fread(&temp, 1, 1, labelFile);
                classes.at<float>(line, 0) = static_cast<float>(temp);
            }
        }
    }
    fclose(sampleFile);
    fclose(labelFile);
    return 1; // all OK
}
/******************************************************************************/
int main( int argc, char** argv )
{
    if (argc < 5)
    {
        printf("usage: %s <train-images> <train-labels> <test-images> <test-labels>\n", argv[0]);
        return -1;
    }
    // echo the command-line arguments
    for (int i = 0; i < argc; i++)
        std::cout << argv[i] << std::endl;

    // let's just check the version first
    printf("OpenCV version %s (%d.%d.%d)\n",
           CV_VERSION,
           CV_MAJOR_VERSION, CV_MINOR_VERSION, CV_SUBMINOR_VERSION);

    // define the training data and label matrices
    Mat training_data = Mat(NUMBER_OF_TRAINING_SAMPLES, ATTRIBUTES_PER_SAMPLE, CV_32FC1);
    Mat training_classifications = Mat(NUMBER_OF_TRAINING_SAMPLES, 1, CV_32FC1);
    // define the testing data and label matrices
    Mat testing_data = Mat(NUMBER_OF_TESTING_SAMPLES, ATTRIBUTES_PER_SAMPLE, CV_32FC1);
    Mat testing_classifications = Mat(NUMBER_OF_TESTING_SAMPLES, 1, CV_32FC1);
    // define all the attributes as numerical
    // alternatives are CV_VAR_CATEGORICAL or CV_VAR_ORDERED(=CV_VAR_NUMERICAL)
    // that can be assigned on a per attribute basis
    Mat var_type = Mat(ATTRIBUTES_PER_SAMPLE + 1, 1, CV_8U);
    var_type.setTo(Scalar(CV_VAR_NUMERICAL)); // all inputs are numerical
    // this is a classification problem (i.e. predict a discrete number of class
    // outputs) so reset the last (+1) output var_type element to CV_VAR_CATEGORICAL
    var_type.at<uchar>(ATTRIBUTES_PER_SAMPLE, 0) = CV_VAR_CATEGORICAL;
    double result; // value returned from a prediction
    // load the training and testing data sets
    if (read_data_from_csv(argv[1], argv[2], training_data, training_classifications, NUMBER_OF_TRAINING_SAMPLES) &&
        read_data_from_csv(argv[3], argv[4], testing_data, testing_classifications, NUMBER_OF_TESTING_SAMPLES))
    {
        /******************** Step 1: define and initialise the Random Trees parameters ********************/
        float priors[] = {1,1,1,1,1,1,1,1,1,1}; // weights of each classification for classes
        CvRTParams params = CvRTParams(20, // max depth
                                       50, // min sample count
                                       0, // regression accuracy: N/A here
                                       false, // compute surrogate split, no missing data
                                       15, // max number of categories (use sub-optimal algorithm for larger numbers)
                                       priors, // the array of priors
                                       false, // calculate variable importance
                                       50, // number of variables randomly selected at node and used to find the best split(s)
                                       100, // max number of trees in the forest
                                       0.01f, // forest accuracy
                                       CV_TERMCRIT_ITER | CV_TERMCRIT_EPS // termination criteria
                                       );
        /******************** Step 2: train the Random Decision Forest (RDF) classifier ********************/
        printf("\nUsing training database: %s\n\n", argv[1]);
        CvRTrees* rtree = new CvRTrees;
        bool train_result = rtree->train(training_data, CV_ROW_SAMPLE, training_classifications,
                                         Mat(), Mat(), var_type, Mat(), params);
        if (!train_result)
        {
            printf("ERROR: training failed\n");
            delete rtree;
            return -1;
        }
        // perform classifier testing and report results
        Mat test_sample;
        int correct_class = 0;
        int wrong_class = 0;
        int false_positives[NUMBER_OF_CLASSES] = {0,0,0,0,0,0,0,0,0,0};
        printf("\nUsing testing database: %s\n\n", argv[3]);
        for (int tsample = 0; tsample < NUMBER_OF_TESTING_SAMPLES; tsample++)
        {
            // extract a row from the testing matrix
            test_sample = testing_data.row(tsample);
            /******************** Step 3: prediction ********************/
            result = rtree->predict(test_sample, Mat());
            printf("Testing Sample %i -> class result (digit %d)\n", tsample, (int) result);
            // if the prediction and the (true) testing classification are the same
            // (N.B. OpenCV uses a floating point decision tree implementation!)
            if (fabs(result - testing_classifications.at<float>(tsample, 0))
                    >= FLT_EPSILON)
            {
                // if they differ more than floating point error => wrong class
                wrong_class++;
                false_positives[(int) result]++;
            }
            else
            {
                // otherwise correct
                correct_class++;
            }
        }
        printf("\nResults on the testing database: %s\n"
               "\tCorrect classification: %d (%g%%)\n"
               "\tWrong classifications: %d (%g%%)\n",
               argv[3],
               correct_class, (double) correct_class*100/NUMBER_OF_TESTING_SAMPLES,
               wrong_class, (double) wrong_class*100/NUMBER_OF_TESTING_SAMPLES);
        for (int i = 0; i < NUMBER_OF_CLASSES; i++)
        {
            printf("\tClass (digit %d) false positives %d (%g%%)\n", i,
                   false_positives[i],
                   (double) false_positives[i]*100/NUMBER_OF_TESTING_SAMPLES);
        }
        delete rtree; // free the forest
        // all matrix memory freed by destructors
        // all OK : main returns 0
        return 0;
    }
    // not OK : main returns -1
    return -1;
}
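For reference, the program above expects the four MNIST files as command-line arguments, in the order training images, training labels, test images, test labels. Assuming the standard MNIST file names and a hypothetical binary name mnist_rtrees, an invocation would look like:

./mnist_rtrees train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte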
3. How to design your own random forest program
Sometimes the existing libraries cannot meet your requirements, and you have to design a classifier algorithm yourself. This part discusses how to design your own random forest classifier; I will not post the code implementation, because it is used in my work and is therefore somewhat sensitive.
First, you need a RandomForest class that stores the parameters the whole forest needs, including but not limited to: the number of training samples, the number of test samples, the feature dimensionality, the number of features randomly drawn at each node, the number of CART trees, the maximum tree depth, the number of classes (for a classification problem), various termination conditions, pointers to all the trees, pointers to the training and test sets, a pointer to the training labels, and so on. It also needs some member functions, at the very least train and predict. train can simply call each tree's train method; predict does the same, but it must combine the per-tree predictions to produce the forest's output.
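A minimal skeleton of such a class, purely illustrative (all names, members, the Tree stand-in, and the majority-vote combination below are my assumptions, not the author's implementation):

// Illustrative sketch only, not the author's code.
#include <algorithm>
#include <vector>

struct Tree {  // stand-in for one CART tree, assumed implemented elsewhere
    void train(const float* data, const int* labels,
               const std::vector<int>& sampleIdx) { /* grow the tree */ }
    int predict(const float* sample) const { return 0; /* placeholder */ }
};

class RandomForest {
public:
    RandomForest(int numTrees, int maxDepth, int numClasses, int numFeatures)
        : trees_(numTrees), maxDepth_(maxDepth), numClasses_(numClasses),
          numFeatures_(numFeatures), trainData_(0), trainLabels_(0) {}

    // train each tree on its own bootstrap sample of the training set
    void train(const float* data, const int* labels, int numSamples) {
        trainData_ = data;
        trainLabels_ = labels;
        for (size_t t = 0; t < trees_.size(); t++)
            trees_[t].train(data, labels, std::vector<int>() /* bootstrap indices */);
    }

    // combine the per-tree predictions by majority vote
    int predict(const float* sample) const {
        std::vector<int> votes(numClasses_, 0);
        for (size_t t = 0; t < trees_.size(); t++)
            votes[trees_[t].predict(sample)]++;
        return (int)(std::max_element(votes.begin(), votes.end()) - votes.begin());
    }

private:
    std::vector<Tree> trees_;    // all CART trees in the forest
    int maxDepth_;               // D: maximum tree depth
    int numClasses_;             // number of classes
    int numFeatures_;            // N: feature dimensionality
    const float* trainData_;     // pointer to the training set (not owned)
    const int* trainLabels_;     // pointer to the training labels (not owned)
};

In practice the constructor would also take the per-node feature count and the termination conditions listed above; they are omitted here to keep the sketch short.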
Second, you need a Sample class. This class is not for storing the training set and its labels. The reason is that every tree and every node has its own sample subset; if you stored a copy of every subset, the memory required would be enormous. Suppose there are M samples with feature dimensionality N: the whole training set is of size M×N, every level of every tree holds this many samples, and with tree depth D and S trees you would need on the order of M×N×D×S storage. For example, with MNIST's M = 60000 and N = 784, a depth of D = 20 and S = 100 trees, that is roughly 9.4×10^10 entries. That is far too much. Therefore, for the training samples and features each node uses during training, we substitute arrays of indices; that is what the Sample class is for. Sample basically needs only two functions: the first randomly draws, with replacement, a new training set from the existing one, containing of course only the sample indices; the second randomly draws, without replacement, a fixed number of features from the existing features, likewise returning only the feature indices.
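A sketch of that Sample class, again illustrative only (the names, the static-method design, and the use of rand() are my assumptions):

// Illustrative sketch only, not the author's code.
#include <algorithm>
#include <cstdlib>
#include <vector>

class Sample {
public:
    // draw, with replacement, numSamples indices into the training set:
    // a bootstrap sample that stores only sample indices, never the data
    static std::vector<int> bootstrap(int numSamples) {
        std::vector<int> idx(numSamples);
        for (int i = 0; i < numSamples; i++)
            idx[i] = std::rand() % numSamples;
        return idx;
    }

    // draw, without replacement, numChosen of the numFeatures feature
    // indices, using a partial Fisher-Yates shuffle
    static std::vector<int> chooseFeatures(int numFeatures, int numChosen) {
        std::vector<int> idx(numFeatures);
        for (int i = 0; i < numFeatures; i++)
            idx[i] = i;
        for (int i = 0; i < numChosen; i++) {
            int j = i + std::rand() % (numFeatures - i);
            std::swap(idx[i], idx[j]);
        }
        idx.resize(numChosen);
        return idx;
    }
};

Each node then accesses the data indirectly through these index arrays, so the per-node cost is one integer per selected sample or feature rather than a copy of the rows themselves.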