Class: Bae::Classifier
- Inherits: Object
- Inheritance hierarchy: Object → Bae::Classifier
- Defined in:
- lib/bae/classifier.rb
Instance Attribute Summary collapse
-
#frequency_table ⇒ Object
Returns the value of attribute frequency_table.
-
#label_index ⇒ Object
Returns the value of attribute label_index.
-
#label_index_sequence ⇒ Object
Returns the value of attribute label_index_sequence.
-
#label_instance_count ⇒ Object
Returns the value of attribute label_instance_count.
-
#total_terms ⇒ Object
Returns the value of attribute total_terms.
Instance Method Summary collapse
- #classify(data) ⇒ Object
- #classify_from_hash(frequency_hash) ⇒ Object
- #classify_from_string(document) ⇒ Object
- #finish_training! ⇒ Object
-
#initialize ⇒ Classifier
constructor
A new instance of Classifier.
- #load_state(path) ⇒ Object
- #save_state(path) ⇒ Object
- #train(label, training_data) ⇒ Object
- #train_from_hash(label, frequency_hash) ⇒ Object
- #train_from_string(label, document) ⇒ Object
Constructor Details
#initialize ⇒ Classifier
Returns a new instance of Classifier.
7 8 9 10 11 12 13 |
# File 'lib/bae/classifier.rb', line 7

# Builds an empty, untrained classifier.
#
# The three lookup tables are hashes with a default block, so reading a key
# that is not present yet stores (and returns) a fresh default for it.
#
# @return [Classifier] a new instance of Classifier
def initialize
  # feature => row; an unseen feature gets its own new empty Array
  @frequency_table = ::Hash.new do |table, unseen_feature|
    table[unseen_feature] = []
  end

  # label => count; an unseen label starts at 0
  @label_instance_count = ::Hash.new do |counts, unseen_label|
    counts[unseen_label] = 0
  end

  # label => index; an unseen label is stored as 0
  @label_index = ::Hash.new do |indexes, unseen_label|
    indexes[unseen_label] = 0
  end

  # start at -1 so 0 is first value
  @label_index_sequence = -1

  # Float on purpose: the counter starts as 0.0, not 0
  @total_terms = 0.0
end
Instance Attribute Details
#frequency_table ⇒ Object
Returns the value of attribute frequency_table.
4 5 6 |
# File 'lib/bae/classifier.rb', line 4

# Reader for the per-feature frequency rows.
#
# #initialize creates this as a Hash keyed by feature whose values are
# Arrays; #classify_from_string reads a row with the index stored in
# #label_index.
#
# @return [Object] the current value of @frequency_table
def frequency_table
  @frequency_table
end
#label_index ⇒ Object
Returns the value of attribute label_index.
4 5 6 |
# File 'lib/bae/classifier.rb', line 4

# Reader for the label lookup table.
#
# #classify_from_string iterates it as label/index pairs and uses the index
# to pick a position inside each frequency-table row.
#
# @return [Object] the current value of @label_index
def label_index
  @label_index
end
#label_index_sequence ⇒ Object
Returns the value of attribute label_index_sequence.
4 5 6 |
# File 'lib/bae/classifier.rb', line 4

# Reader for the label index counter.
#
# #initialize sets it to -1 so that 0 is the first value.
#
# @return [Object] the current value of @label_index_sequence
def label_index_sequence
  @label_index_sequence
end
#label_instance_count ⇒ Object
Returns the value of attribute label_instance_count.
4 5 6 |
# File 'lib/bae/classifier.rb', line 4

# Reader for the per-label training counter.
#
# #train_from_string and #train_from_hash each add 1 to the entry of the
# label they were called with.
#
# @return [Object] the current value of @label_instance_count
def label_instance_count
  @label_instance_count
end
#total_terms ⇒ Object
Returns the value of attribute total_terms.
4 5 6 |
# File 'lib/bae/classifier.rb', line 4

# Reader for the overall training counter.
#
# It starts at 0.0 and, despite the name, #train_from_string and
# #train_from_hash add exactly 1 per call rather than 1 per word.
#
# @return [Object] the current value of @total_terms
def total_terms
  @total_terms
end
Instance Method Details
#classify(data) ⇒ Object
50 51 52 53 54 55 56 57 58 |
# File 'lib/bae/classifier.rb', line 50

# Classifies +data+, choosing the implementation from the type of +data+.
#
# @param data [String, Hash] a String goes to #classify_from_string,
#   a Hash goes to #classify_from_hash
# @return [Object] whatever the selected classify_from_* method returns
# @raise [RuntimeError] when +data+ is neither a String nor a Hash
def classify(data)
  case data
  when ::String
    classify_from_string(data)
  when ::Hash
    classify_from_hash(data)
  else
    # Nothing is being trained here, so the message names classification
    # instead of repeating the wording used by #train.
    fail 'Data to classify must either be a string or hash'
  end
end
#classify_from_hash(frequency_hash) ⇒ Object
60 61 62 63 64 |
# File 'lib/bae/classifier.rb', line 60

# Classifies a word => frequency Hash by expanding it into a document string
# and delegating to #classify_from_string.
#
# Each word is followed by a single space and repeated +frequency+ times, so
# { 'a' => 2, 'b' => 1 } becomes "a a b ".
#
# @param frequency_hash [Hash] words mapped to how often they occur
# @return [Object] the result of #classify_from_string for the expanded text
def classify_from_hash(frequency_hash)
  repeated_words = frequency_hash.map do |word, frequency|
    (word + ' ') * frequency
  end

  classify_from_string(repeated_words.join)
end
#classify_from_string(document) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/bae/classifier.rb', line 66

# Scores +document+ against every label in #label_index.
#
# The document is split on whitespace and de-duplicated. For each label the
# stored likelihood is multiplied, once per known word, by p / (1 - p), where
# p = (row[index] + 1) / (label_instance_count[label] + vocabulary size).
# The product is then weighted by the label's prior.
#
# Words without a (non-empty) row in the frequency table are skipped, and the
# lookup never writes to the table, so classifying leaves the trained state
# untouched.
#
# Reads @likelihoods and @priors; these are not assigned anywhere in the code
# visible here (presumably #finish_training! fills them - confirm).
#
# @param document [String] whitespace-separated text to classify
# @return [Object] the result of passing the label => score Hash to
#   +normalize+ (defined elsewhere in the class)
def classify_from_string(document)
  words = document.split.uniq
  likelihoods = @likelihoods.dup
  posterior = {}

  vocab_size = frequency_table.size

  label_index.each do |label, index|
    words.each do |word|
      # fetch, not []: [] would run the table's default block and store an
      # empty row for every unknown word (growing vocab_size for later
      # calls); on a table without a default block it returns nil instead.
      row = frequency_table.fetch(word, nil)
      next if row.nil? || row.empty?

      laplace_word_likelihood = (row[index] + 1.0) / (label_instance_count[label] + vocab_size)
      likelihoods[label] *= laplace_word_likelihood / (1.0 - laplace_word_likelihood)
    end

    posterior[label] = @priors[label] * likelihoods[label]
  end

  normalize(posterior)
end
#finish_training! ⇒ Object
15 16 17 18 |
# File 'lib/bae/classifier.rb', line 15

# Finalises training by running the two calculation steps in order:
# likelihoods first, then priors.
#
# Both helpers are defined elsewhere in the class; presumably they produce the
# @likelihoods and @priors that #classify_from_string reads - confirm there.
#
# @return [Object] the return value of +calculate_priors!+
def finish_training!
  calculate_likelihoods!

  calculate_priors!
end
#load_state(path) ⇒ Object
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/bae/classifier.rb', line 102

# Restores training state from the JSON file at +path+ (the format written by
# #save_state) and then calls #finish_training!.
#
# The three lookup tables are rebuilt with the same default blocks that
# #initialize installs. JSON.parse returns plain hashes, and without the
# defaults a later #train call for a new label would hit nil in
# `@label_instance_count[label] += 1`.
#
# @param path [String] location of the state file; expanded with
#   File.expand_path, so "~" and relative paths are accepted
# @return [Object] the return value of #finish_training!
# @raise [RuntimeError] "Missing <key>" when a required key is absent (or
#   null/false) in the file
# @raise [SystemCallError] when the file cannot be read
# @raise [JSON::ParserError] when the file is not valid JSON
def load_state(path)
  state = ::JSON.parse(::File.read(::File.expand_path(path)))

  # Checked in this order so the first missing key is the one reported.
  %w[frequency_table label_instance_count label_index label_index_sequence total_terms].each do |key|
    fail "Missing #{key}" unless state[key]
  end

  # merge! keeps the receiver's default block while copying the loaded pairs.
  @frequency_table = ::Hash.new { |hash, feature| hash[feature] = [] }.merge!(state['frequency_table'])
  @label_instance_count = ::Hash.new { |hash, label| hash[label] = 0 }.merge!(state['label_instance_count'])
  @label_index = ::Hash.new { |hash, label| hash[label] = 0 }.merge!(state['label_index'])
  @label_index_sequence = state['label_index_sequence']
  @total_terms = state['total_terms']

  finish_training!
end
#save_state(path) ⇒ Object
89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/bae/classifier.rb', line 89

# Serialises the training state to JSON and writes it to +path+, replacing
# any existing file.
#
# The file holds the five attributes under the keys 'frequency_table',
# 'label_instance_count', 'label_index', 'label_index_sequence' and
# 'total_terms', which are the keys #load_state requires.
#
# @param path [String] destination file; expanded with File.expand_path, so
#   "~" and relative paths are accepted
# @return [Integer] number of bytes written
# @raise [SystemCallError] when the file cannot be written
def save_state(path)
  state = {
    'frequency_table' => frequency_table,
    'label_instance_count' => label_instance_count,
    'label_index' => label_index,
    'label_index_sequence' => label_index_sequence,
    'total_terms' => total_terms
  }

  ::File.write(::File.expand_path(path), state.to_json)
end
#train(label, training_data) ⇒ Object
20 21 22 23 24 25 26 27 28 |
# File 'lib/bae/classifier.rb', line 20

# Trains the classifier with one example for +label+, choosing the
# implementation from the type of +training_data+.
#
# @param label [Object] the label the example belongs to
# @param training_data [String, Hash] a String goes to #train_from_string,
#   a Hash goes to #train_from_hash
# @return [Object] whatever the selected train_from_* method returns
# @raise [RuntimeError] when +training_data+ is neither a String nor a Hash
def train(label, training_data)
  return train_from_string(label, training_data) if training_data.is_a?(::String)
  return train_from_hash(label, training_data) if training_data.is_a?(::Hash)

  fail 'Training data must either be a string or hash'
end
#train_from_hash(label, frequency_hash) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/bae/classifier.rb', line 41

# Trains on one example given as a word => frequency Hash.
#
# For every pair the label index is updated and the pair is recorded in the
# frequency table (both helpers are defined elsewhere in the class). After
# that the example is counted once for +label+ and once in @total_terms,
# whatever the number of words.
#
# @param label [Object] the label the example belongs to
# @param frequency_hash [Hash] words mapped to how often they occur
# @return [Numeric] the new value of @total_terms
def train_from_hash(label, frequency_hash)
  frequency_hash.each do |term, occurrences|
    update_label_index(label)
    update_frequency_table(label, term, occurrences)
  end

  @label_instance_count[label] = @label_instance_count[label] + 1
  @total_terms = @total_terms + 1
end
#train_from_string(label, document) ⇒ Object
30 31 32 33 34 35 36 37 38 39 |
# File 'lib/bae/classifier.rb', line 30

# Trains on one example given as whitespace-separated text.
#
# Every word occurrence (duplicates included) updates the label index and is
# recorded in the frequency table with a frequency of 1; both helpers are
# defined elsewhere in the class. After that the example is counted once for
# +label+ and once in @total_terms, whatever the number of words.
#
# @param label [Object] the label the example belongs to
# @param document [String] the training text
# @return [Numeric] the new value of @total_terms
def train_from_string(label, document)
  document.split.each do |term|
    update_label_index(label)
    update_frequency_table(label, term, 1)
  end

  @label_instance_count[label] = @label_instance_count[label] + 1
  @total_terms = @total_terms + 1
end