|
这个决策树的例子里面,涉及到熵值法的计算,可以参考一下——
// Walk every sample index available to the current node and tally the counts
// that the per-attribute entropy computation relies on.
for (int theDataPos = 0; theDataPos < PNode.DataIndexes.Count; theDataPos++)
{
    int theSampleIdx = PNode.DataIndexes[theDataPos];
    for (int thePropPos = 0; thePropPos < TestProperties.Count; thePropPos++)
    {
        int thePropIdx = TestProperties[thePropPos];
        var thePropName = PropertyNames[thePropIdx];
        var theFactor = Inputs[thePropIdx][theSampleIdx];
        var theCountNode = thePropertyCount[thePropName];

        // One more sample counted for this attribute.
        theCountNode.CalcCount++;
        // Count the sample under the attribute's current factor (its value for this sample).
        theCountNode.AddChildren(theFactor, "测试属性因子", theSampleIdx, 1);
        // Beneath that factor, count the sample's main-class factor (read from Inputs[0]).
        theCountNode.Children[theFactor].AddChildren(Inputs[0][theSampleIdx], "主分类因子", theSampleIdx, 1);

        // Gather each attribute's distinct factors in the same pass to avoid another loop.
        var theKnownFactors = PropertyFactors[thePropName];
        if (!theKnownFactors.Contains(theFactor))
        {
            theKnownFactors.Add(theFactor);
        }
    }
}
// Compute the information gain of every attribute listed in TestProperties and
// remember the attribute whose gain is largest.
string theDefaultClassFactor = DefaultClassFactor;
// Largest gain seen so far; starts below any value the loop can produce.
double theMaxEA = double.MinValue;
// Attribute index (an entry of TestProperties) that currently holds the largest gain.
// NOTE(review): this initial read throws when TestProperties has fewer than two
// entries - confirm that callers always pass at least two.
int theMaxPropertyIndex = TestProperties[1];
// Entropy of the main classification attribute (attribute index 0).
double theTotalEA = 0.0;
// Total sample count, used as the denominator when estimating probabilities.
double theTotalSimple = 0;
for (int theI = 0; theI < TestProperties.Count; theI++)
{
    int thePIndex_1 = TestProperties[theI];
    var thePName_1 = PropertyNames[thePIndex_1];
    CalcNode theCalcNode = thePropertyCount[thePName_1];
    if (thePIndex_1 == 0)
    {
        // Main classification attribute: entropy = sum over class factors of -Pj * log2(Pj).
        double theCount = theCalcNode.CalcCount;
        theTotalSimple = theCount;
        double theMaxSubCount = -1;
        theTotalEA = 0.0;
        foreach (var theSubNode in theCalcNode.Children)
        {
            if (theSubNode.Value.CalcCount > 0)
            {
                double thePj = theSubNode.Value.CalcCount / theCount;
                theTotalEA -= thePj * Math.Log(thePj, 2);
            }
            // The class factor with the highest count becomes the default class factor.
            if (theMaxSubCount < theSubNode.Value.CalcCount)
            {
                theMaxSubCount = theSubNode.Value.CalcCount;
                theDefaultClassFactor = theSubNode.Key;
            }
            // Trace output for following the calculation path.
            OutContents += "\r\n" + thePrefix + theCalcNode.CalcCount + ":: " + thePName_1 + ":: " + theSubNode.Value.Type + " :: " + theSubNode.Key + " :: " + theSubNode.Value.CalcCount;
        }
    }
    else
    {
        // Test attribute: weighted entropy = sum over factors of (Sj / S) * entropy(Sj),
        // and gain = class entropy - weighted entropy.
        // NOTE(review): uses theTotalSimple and theTotalEA as set by the attribute-0 branch,
        // so attribute 0 presumably has to precede every test attribute in TestProperties - confirm.
        double theJEA = 0.0;
        foreach (var theSubNode_1 in theCalcNode.Children)
        {
            if (theSubNode_1.Value.CalcCount > 0)
            {
                double theSjCount = theSubNode_1.Value.CalcCount;
                double theSj_1 = theSjCount / theTotalSimple;
                double theSj_2 = 0.0;
                foreach (var theSubNode_2 in theSubNode_1.Value.Children)
                {
                    if (theSubNode_2.Value.CalcCount > 0)
                    {
                        double thePj_1 = theSubNode_2.Value.CalcCount / theSjCount;
                        theSj_2 -= thePj_1 * Math.Log(thePj_1, 2);
                    }
                    // Trace output; a " :: " separator now sits between the factor's count and
                    // the child node's type so the two fields no longer run together.
                    OutContents += "\r\n" + thePrefix + theCalcNode.CalcCount + ":: " + thePName_1 + " :: " + theSubNode_1.Value.Type + " :: " + theSubNode_1.Key + " :: " + theSubNode_1.Value.CalcCount
                        + " :: " + theSubNode_2.Value.Type + " :: " + theSubNode_2.Key + " :: " + theSubNode_2.Value.CalcCount;
                }
                theJEA += theSj_1 * theSj_2;
            }
        }
        theJEA = theTotalEA - theJEA;
        // Keep only the attribute with the largest gain.
        if (theMaxEA < theJEA)
        {
            theMaxEA = theJEA;
            theMaxPropertyIndex = thePIndex_1;
        }
    }
}
整个例子可以参考:http://www.tuicool.com/articles/Vj6vYn
|