Class: DecisionTree::ID3Tree

Inherits:
Object
  • Object
show all
Defined in:
lib/decisiontree/id3_tree.rb

Instance Method Summary collapse

Constructor Details

#initialize(attributes, data, default, type) ⇒ ID3Tree

Returns a new instance of ID3Tree.



39
40
41
42
# File 'lib/decisiontree/id3_tree.rb', line 39

def initialize(attributes, data, default, type)
  @used, @tree, @type = {}, {}, type
  @data, @attributes, @default = data, attributes, default
end

Instance Method Details

#build_rules(tree = @tree) ⇒ Object



144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/decisiontree/id3_tree.rb', line 144

def build_rules(tree=@tree)
  attr = tree.to_a.first
  cases = attr[1].to_a
  rules = []
  cases.each do |c,child|
    if child.is_a?(Hash) then
      build_rules(child).each do |r|
        r2 = r.clone
        r2.premises.unshift([attr.first, c])
        rules << r2
      end
    else
      rules << Rule.new(@attributes, [[attr.first, c]], child)
    end
  end
  rules
end

#fitness_for(attribute) ⇒ Object



57
58
59
60
61
62
# File 'lib/decisiontree/id3_tree.rb', line 57

def fitness_for(attribute)
  case type(attribute)
    when :discrete; fitness = proc{|a,b,c| id3_discrete(a,b,c)}
    when :continuous; fitness = proc{|a,b,c| id3_continuous(a,b,c)}
  end
end

#graph(filename) ⇒ Object



133
134
135
136
# File 'lib/decisiontree/id3_tree.rb', line 133

def graph(filename)
  dgp = DotGraphPrinter.new(build_tree)
  dgp.write_to_file("#{filename}.png", "png")
end

#id3_continuous(data, attributes, attribute) ⇒ Object

ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds)



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/decisiontree/id3_tree.rb', line 101

def id3_continuous(data, attributes, attribute)
  values, thresholds = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort, []
  return [-1, -1] if values.size == 1
  values.each_index { |i| thresholds.push((values[i]+(values[i+1].nil? ? values[i] : values[i+1])).to_f / 2) }
  thresholds.pop
  #thresholds -= used[attribute] if used.has_key? attribute

  gain = thresholds.collect { |threshold|
    sp = data.partition { |d| d[attributes.index(attribute)] >= threshold }
    pos = (sp[0].size).to_f / data.size
    neg = (sp[1].size).to_f / data.size

    [data.classification.entropy - pos*sp[0].classification.entropy - neg*sp[1].classification.entropy, threshold]
  }.max { |a,b| a[0] <=> b[0] }

  return [-1, -1] if gain.size == 0
  gain
end

#id3_discrete(data, attributes, attribute) ⇒ Object

ID3 for discrete label cases



121
122
123
124
125
126
127
# File 'lib/decisiontree/id3_tree.rb', line 121

def id3_discrete(data, attributes, attribute)
  values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
  partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } }
  remainder = partitions.collect {|p| (p.size.to_f / data.size) * p.classification.entropy}.inject(0) {|i,s| s+=i }

  [data.classification.entropy - remainder, attributes.index(attribute)]
end

#id3_train(data, attributes, default, used = {}) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/decisiontree/id3_tree.rb', line 64

def id3_train(data, attributes, default, used={})
  return default if data.empty?

  # return classification if all examples have the same classification
  return data.first.last if data.classification.uniq.size == 1

  # Choose best attribute:
  # 1. enumerate all attributes
  # 2. Pick best attribute
  # 3. If attributes all score the same, then pick a random one to avoid infinite recursion.
  performance = attributes.collect { |attribute| fitness_for(attribute).call(data, attributes, attribute) }
  max = performance.max { |a,b| a[0] <=> b[0] }
  min = performance.min { |a,b| a[0] <=> b[0] }
  max = performance.shuffle.first if max[0] == min[0]
  best = Node.new(attributes[performance.index(max)], max[1], max[0])
  best.threshold = nil if @type == :discrete
  @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
  tree, l = {best => {}}, ['>=', '<']

  fitness = fitness_for(best.attribute)
  case type(best.attribute)
    when :continuous
      data.partition { |d| d[attributes.index(best.attribute)] >= best.threshold }.each_with_index  { |examples, i|
        tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
      }
    when :discrete
      values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
      partitions = values.collect { |val| data.select { |d| d[attributes.index(best.attribute)] == val } }
      partitions.each_with_index  { |examples, i|
        tree[best][values[i]] = id3_train(examples, attributes-[values[i]], (data.classification.mode rescue 0), &fitness)
      }
    end

  tree
end

#predict(test) ⇒ Object



129
130
131
# File 'lib/decisiontree/id3_tree.rb', line 129

def predict(test)
  descend(@tree, test)
end

#rulesetObject



138
139
140
141
142
# File 'lib/decisiontree/id3_tree.rb', line 138

def ruleset
  rs = Ruleset.new(@attributes, @data, @default, @type)
  rs.rules = build_rules
  rs
end

#train(data = @data, attributes = @attributes, default = @default) ⇒ Object



44
45
46
47
48
49
50
51
# File 'lib/decisiontree/id3_tree.rb', line 44

def train(data=@data, attributes=@attributes, default=@default)
  initialize(attributes, data, default, @type)

  # Remove samples with same attributes leaving most common classification
  data2 = data.inject({}) {|hash, d| hash[d.slice(0..-2)] ||= Hash.new(0); hash[d.slice(0..-2)][d.last] += 1; hash }.map{|key,val| key + [val.sort_by{ |k, v| v }.last.first]}

  @tree = id3_train(data2, attributes, default)
end

#type(attribute) ⇒ Object



53
54
55
# File 'lib/decisiontree/id3_tree.rb', line 53

def type(attribute)
  @type.is_a?(Hash) ? @type[attribute.to_sym] : @type
end