
DecisionTree using Julia


Scalachen (OP) posted on 2015-03-31 21:11:59

module DecisionTree

# Julia 0.3-era source: `immutable`, `Union(...)`, `FloatingPoint`, `int()`,
# and the start/next/done iteration protocol below are all pre-0.4 syntax.

import Base: length, convert, promote_rule, show, start, next, done

export Leaf, Node, Ensemble, print_tree, depth,
       build_stump, build_tree, prune_tree, apply_tree, nfoldCV_tree,
       build_forest, apply_forest, nfoldCV_forest,
       build_adaboost_stumps, apply_adaboost_stumps, nfoldCV_stumps,
       majority_vote, ConfusionMatrix, confusion_matrix,
       mean_squared_error, R2

# measures.jl is not included in this post; judging from the names used below,
# it evidently supplies _hist, _hist_shift!, _info_gain, _neg_z1_loss,
# _mse_loss, _weighted_error, majority_vote, the nfoldCV_* helpers and the
# ConfusionMatrix utilities.
include("measures.jl")

immutable Leaf
    majority::Any
    values::Vector
end

immutable Node
    featid::Integer
    featval::Any
    left::Union(Leaf,Node)
    right::Union(Leaf,Node)
end

immutable Ensemble
    trees::Vector{Node}
end

convert(::Type{Node}, x::Leaf) = Node(0, nothing, x, Leaf(nothing,[nothing]))
promote_rule(::Type{Node}, ::Type{Leaf}) = Node
promote_rule(::Type{Leaf}, ::Type{Node}) = Node
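
# The convert/promote_rule definitions let a bare Leaf be stored where a Node
# is expected (e.g. pushing a no-split stump into an Ensemble's Vector{Node}):
# the Leaf is wrapped in a dummy Node with featid 0 and featval nothing, which
# apply_tree below recognizes and routes straight to the left child.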

immutable UniqueRanges
    v::AbstractVector
end

start(u::UniqueRanges) = 1
done(u::UniqueRanges, s) = done(u.v, s)
next(u::UniqueRanges, s) = (val = u.v[s];
                            t = searchsortedlast(u.v, val, s, length(u.v), Base.Order.Forward);
                            ((val, s:t), t+1))
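
# Worked example: iterating UniqueRanges([1, 1, 2, 3, 3]) yields
# (1, 1:2), (2, 3:3), (3, 4:5) -- each distinct value of the (already sorted)
# vector paired with the index range it occupies.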

length(leaf::Leaf) = 1
length(tree::Node) = length(tree.left) + length(tree.right)
length(ensemble::Ensemble) = length(ensemble.trees)

depth(leaf::Leaf) = 0
depth(tree::Node) = 1 + max(depth(tree.left), depth(tree.right))

function print_tree(leaf::Leaf, depth=-1, indent=0)
    matches = find(leaf.values .== leaf.majority)
    ratio = string(length(matches)) * "/" * string(length(leaf.values))
    println("$(leaf.majority) : $(ratio)")
end

function print_tree(tree::Node, depth=-1, indent=0)
    if depth == indent
        println()
        return
    end
    println("Feature $(tree.featid), Threshold $(tree.featval)")
    print("    " ^ indent * "L-> ")
    print_tree(tree.left, depth, indent + 1)
    print("    " ^ indent * "R-> ")
    print_tree(tree.right, depth, indent + 1)
end

const NO_BEST = (0,0)


### Classification ###

# weights == [0] is a sentinel meaning "unweighted": split on information
# gain; otherwise split on negative weighted zero-one loss (used by AdaBoost).
function _split(labels::Vector, features::Matrix, nsubfeatures::Int, weights::Vector)
    if weights == [0]
        _split_info_gain(labels, features, nsubfeatures)
    else
        _split_neg_z1_loss(labels, features, weights)
    end
end

function _split_info_gain(labels::Vector, features::Matrix, nsubfeatures::Int)
    nf = size(features, 2)
    N = length(labels)

    best = NO_BEST
    best_val = -Inf

    # consider either a random subset of the features (for forests) or all of them
    if nsubfeatures > 0
        r = randperm(nf)
        inds = r[1:nsubfeatures]
    else
        inds = 1:nf
    end

    for i in inds
        ord = sortperm(features[:,i])
        features_i = features[ord,i]
        labels_i = labels[ord]

        # hist1/hist2 hold the class counts left/right of the candidate
        # threshold; sweeping the sorted feature lets each threshold be
        # scored incrementally instead of re-counting from scratch
        hist1 = _hist(labels_i, 1:0)
        hist2 = _hist(labels_i)
        N1 = 0
        N2 = N

        for (d, range) in UniqueRanges(features_i)
            value = _info_gain(N1, hist1, N2, hist2)
            if value > best_val
                best_val = value
                best = (i, d)
            end

            deltaN = length(range)

            _hist_shift!(hist2, hist1, labels_i, range)
            N1 += deltaN
            N2 -= deltaN
        end
    end
    return best
end
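
# _hist presumably builds a class-count histogram and _hist_shift! moves the
# counts for an index range from one histogram to the other; both live in
# measures.jl, which this post does not include. Whatever the exact form of
# _info_gain, maximizing information gain is equivalent to minimizing the
# size-weighted child entropy (N1*H1 + N2*H2)/(N1 + N2), since the parent
# entropy is constant across the candidate splits of one node.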

function _split_neg_z1_loss(labels::Vector, features::Matrix, weights::Vector)
    best = NO_BEST
    best_val = -Inf
    for i in 1:size(features,2)
        domain_i = sort(unique(features[:,i]))
        for thresh in domain_i[2:end]
            cur_split = features[:,i] .< thresh
            value = _neg_z1_loss(labels[cur_split], weights[cur_split]) + _neg_z1_loss(labels[!cur_split], weights[!cur_split])
            if value > best_val
                best_val = value
                best = (i, thresh)
            end
        end
    end
    return best
end

function build_stump(labels::Vector, features::Matrix, weights=[0])
    S = _split(labels, features, 0, weights)
    if S == NO_BEST
        return Leaf(majority_vote(labels), labels)
    end
    id, thresh = S
    split = features[:,id] .< thresh
    return Node(id, thresh,
                Leaf(majority_vote(labels[split]), labels[split]),
                Leaf(majority_vote(labels[!split]), labels[!split]))
end

function build_tree(labels::Vector, features::Matrix, nsubfeatures=0)
    S = _split(labels, features, nsubfeatures, [0])
    if S == NO_BEST
        return Leaf(majority_vote(labels), labels)
    end
    id, thresh = S
    split = features[:,id] .< thresh
    labels_left = labels[split]
    labels_right = labels[!split]
    pure_left = all(labels_left .== labels_left[1])
    pure_right = all(labels_right .== labels_right[1])
    # recurse only into impure partitions; pure ones become leaves immediately
    if pure_right && pure_left
        return Node(id, thresh,
                    Leaf(labels_left[1], labels_left),
                    Leaf(labels_right[1], labels_right))
    elseif pure_left
        return Node(id, thresh,
                    Leaf(labels_left[1], labels_left),
                    build_tree(labels_right, features[!split,:], nsubfeatures))
    elseif pure_right
        return Node(id, thresh,
                    build_tree(labels_left, features[split,:], nsubfeatures),
                    Leaf(labels_right[1], labels_right))
    else
        return Node(id, thresh,
                    build_tree(labels_left, features[split,:], nsubfeatures),
                    build_tree(labels_right, features[!split,:], nsubfeatures))
    end
end

function prune_tree(tree::Union(Leaf,Node), purity_thresh=1.0)
    function _prune_run(tree::Union(Leaf,Node), purity_thresh::Real)
        N = length(tree)
        if N == 1        ## a Leaf
            return tree
        elseif N == 2    ## a stump
            all_labels = [tree.left.values; tree.right.values]
            majority = majority_vote(all_labels)
            matches = find(all_labels .== majority)
            purity = length(matches) / length(all_labels)
            if purity >= purity_thresh
                return Leaf(majority, all_labels)
            else
                return tree
            end
        else
            return Node(tree.featid, tree.featval,
                        _prune_run(tree.left, purity_thresh),
                        _prune_run(tree.right, purity_thresh))
        end
    end
    pruned = _prune_run(tree, purity_thresh)
    while length(pruned) < length(tree)
        tree = pruned
        pruned = _prune_run(tree, purity_thresh)
    end
    return pruned
end
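
# A single _prune_run pass only merges leaf pairs (stumps) at the bottom of
# the tree, so prune_tree repeats it until the leaf count stops shrinking:
# each merge can turn an interior node into a newly prunable stump.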

apply_tree(leaf::Leaf, feature::Vector) = leaf.majority

function apply_tree(tree::Node, features::Vector)
    # featval == nothing marks the dummy Node produced by convert(Node, leaf)
    if tree.featval == nothing
        return apply_tree(tree.left, features)
    elseif features[tree.featid] < tree.featval
        return apply_tree(tree.left, features)
    else
        return apply_tree(tree.right, features)
    end
end

function apply_tree(tree::Union(Leaf,Node), features::Matrix)
    N = size(features,1)
    predictions = Array(Any,N)
    for i in 1:N
        predictions[i] = apply_tree(tree, squeeze(features[i,:],1))
    end
    if typeof(predictions[1]) <: FloatingPoint
        return float(predictions)
    else
        return predictions
    end
end

function build_forest(labels::Vector, features::Matrix, nsubfeatures::Integer, ntrees::Integer, partialsampling=0.7)
    partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
    Nlabels = length(labels)
    Nsamples = int(partialsampling * Nlabels)
    # each tree is grown on Nsamples row indices drawn with replacement
    forest = @parallel (vcat) for i in 1:ntrees
        inds = rand(1:Nlabels, Nsamples)
        build_tree(labels[inds], features[inds,:], nsubfeatures)
    end
    return Ensemble([forest;])
end

function apply_forest(forest::Ensemble, features::Vector)
    ntrees = length(forest)
    votes = Array(Any,ntrees)
    for i in 1:ntrees
        votes[i] = apply_tree(forest.trees[i], features)
    end
    # regression forests average their trees; classification forests vote
    if typeof(votes[1]) <: FloatingPoint
        return mean(votes)
    else
        return majority_vote(votes)
    end
end

function apply_forest(forest::Ensemble, features::Matrix)
    N = size(features,1)
    predictions = Array(Any,N)
    for i in 1:N
        predictions[i] = apply_forest(forest, squeeze(features[i,:],1))
    end
    if typeof(predictions[1]) <: FloatingPoint
        return float(predictions)
    else
        return predictions
    end
end

function build_adaboost_stumps(labels::Vector, features::Matrix, niterations::Integer)
    N = length(labels)
    weights = ones(N) / N
    stumps = Node[]
    coeffs = FloatingPoint[]
    for i in 1:niterations
        new_stump = build_stump(labels, features, weights)
        predictions = apply_tree(new_stump, features)
        err = _weighted_error(labels, predictions, weights)
        new_coeff = 0.5 * log((1.0 + err) / (1.0 - err))
        matches = labels .== predictions
        weights[!matches] *= exp(new_coeff)
        weights[matches] *= exp(-new_coeff)
        weights /= sum(weights)
        push!(coeffs, new_coeff)
        push!(stumps, new_stump)
        if err < 1e-6
            break
        end
    end
    return (Ensemble(stumps), coeffs)
end
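
# Each round re-weights the training set: samples the new stump misclassifies
# are scaled by exp(new_coeff), correctly classified ones by exp(-new_coeff),
# and the weights are renormalized before the next stump is built; the loop
# stops early once the weighted error drops below 1e-6.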

function apply_adaboost_stumps(stumps::Ensemble, coeffs::Vector{FloatingPoint}, features::Vector)
    nstumps = length(stumps)
    # weighted vote: each stump's prediction counts with its boosting coefficient
    counts = Dict()
    for i in 1:nstumps
        prediction = apply_tree(stumps.trees[i], features)
        counts[prediction] = get(counts, prediction, 0.0) + coeffs[i]
    end
    top_prediction = stumps.trees[1].left.majority
    top_count = -Inf
    for (k,v) in counts
        if v > top_count
            top_prediction = k
            top_count = v
        end
    end
    return top_prediction
end

function apply_adaboost_stumps(stumps::Ensemble, coeffs::Vector{FloatingPoint}, features::Matrix)
    N = size(features,1)
    predictions = Array(Any,N)
    for i in 1:N
        predictions[i] = apply_adaboost_stumps(stumps, coeffs, squeeze(features[i,:],1))
    end
    return predictions
end

function show(io::IO, leaf::Leaf)
    println(io, "Decision Leaf")
    println(io, "Majority: $(leaf.majority)")
    print(io,   "Samples:  $(length(leaf.values))")
end

function show(io::IO, tree::Node)
    println(io, "Decision Tree")
    println(io, "Leaves: $(length(tree))")
    print(io,   "Depth:  $(depth(tree))")
end

function show(io::IO, ensemble::Ensemble)
    println(io, "Ensemble of Decision Trees")
    println(io, "Trees:      $(length(ensemble))")
    println(io, "Avg Leaves: $(mean([length(tree) for tree in ensemble.trees]))")
    print(io,   "Avg Depth:  $(mean([depth(tree) for tree in ensemble.trees]))")
end


### Regression ###

function _split_mse{T<:FloatingPoint, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Int)
    nr, nf = size(features)
    best = NO_BEST
    best_val = -Inf

    if nsubfeatures > 0
        r = randperm(nf)
        inds = r[1:nsubfeatures]
    else
        inds = 1:nf
    end

    for i in inds
        if nr > 100
            features_i = features[:,i]
            domain_i = quantile(features_i, linspace(0.01, 0.99, 99))
            labels_i = labels
        else
            ord = sortperm(features[:,i])
            features_i = features[ord,i]
            domain_i = features_i
            labels_i = labels[ord]
        end
        for thresh in domain_i[2:end]
            value = _mse_loss(labels_i, features_i, thresh)
            if value > best_val
                best_val = value
                best = (i, thresh)
            end
        end
    end
    return best
end
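
# Note on the nr > 100 branch above: with more than 100 rows, the candidate
# thresholds are the 1%..99% quantiles of the feature rather than every
# unique value -- a speed/accuracy trade-off baked into the original code.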

function build_stump{T<:FloatingPoint, U<:Real}(labels::Vector{T}, features::Matrix{U})
    S = _split_mse(labels, features, 0)
    if S == NO_BEST
        return Leaf(mean(labels), labels)
    end
    id, thresh = S
    split = features[:,id] .< thresh
    return Node(id, thresh,
                Leaf(mean(labels[split]), labels[split]),
                Leaf(mean(labels[!split]), labels[!split]))
end

function build_tree{T<:FloatingPoint, U<:Real}(labels::Vector{T}, features::Matrix{U}, maxlabels=5, nsubfeatures=0)
    if length(labels) <= maxlabels
        return Leaf(mean(labels), labels)
    end
    S = _split_mse(labels, features, nsubfeatures)
    if S == NO_BEST
        return Leaf(mean(labels), labels)
    end
    id, thresh = S
    split = features[:,id] .< thresh
    return Node(id, thresh,
                build_tree(labels[split], features[split,:], maxlabels, nsubfeatures),
                build_tree(labels[!split], features[!split,:], maxlabels, nsubfeatures))
end

# note: with the default maxlabels=0.5 below, the length(labels) <= maxlabels
# stop in build_tree never fires, so forest trees grow until no split improves
# the MSE
function build_forest{T<:FloatingPoint, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, ntrees::Integer, maxlabels=0.5, partialsampling=0.7)
    partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
    Nlabels = length(labels)
    Nsamples = int(partialsampling * Nlabels)
    forest = @parallel (vcat) for i in 1:ntrees
        inds = rand(1:Nlabels, Nsamples)
        build_tree(labels[inds], features[inds,:], maxlabels, nsubfeatures)
    end
    return Ensemble([forest;])
end


end # module
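
For readers who want to try the code: below is a minimal usage sketch in the same Julia 0.3-era dialect, assuming the module and its measures.jl have been loaded. The synthetic data, the 0.9 pruning threshold, and the forest/boosting parameters are illustrative choices of mine, not part of the original source.

using DecisionTree

srand(1)                                    # fix the RNG so the sketch is reproducible
features = randn(1000, 10)                  # 1000 samples, 10 numeric features
labels = [features[i,1] > 0.0 ? "pos" : "neg" for i in 1:1000]

# classification tree: grow fully, then merge leaves that are >= 90% pure
model = build_tree(labels, features)
model = prune_tree(model, 0.9)
print_tree(model, 3)                        # show only the first 3 levels
preds = apply_tree(model, features)

# random forest: 3 features per split, 10 trees, 70% subsampling
forest = build_forest(labels, features, 3, 10, 0.7)
preds_rf = apply_forest(forest, features)

# adaptive boosting: 7 rounds of weighted stumps
stumps, coeffs = build_adaboost_stumps(labels, features, 7)
preds_ab = apply_adaboost_stumps(stumps, coeffs, features)

# regression: FloatingPoint labels dispatch to the MSE-based methods
rlabels = features[:,1] + 0.1 * randn(1000)
rmodel = build_tree(rlabels, features, 5)   # at most 5 samples per leaf
rpreds = apply_tree(rmodel, features)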

