Class: Zipfy::Zipf

Inherits:
Object
  • Object
show all
Defined in:
lib/zipfy.rb

Constant Summary collapse

TAB =
"\t"

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#distributionObject (readonly)

Returns the value of attribute distribution.



16
17
18
# File 'lib/zipfy.rb', line 16

# Reader for the word-frequency distribution.
#
# @return [Array, nil] the @distribution instance variable; create_distribution
#   fills it with one WordData entry per distinct word
def distribution
  @distribution
end

#totalObject (readonly)

Returns the value of attribute total.



17
18
19
# File 'lib/zipfy.rb', line 17

# Reader for the running word count.
#
# @return [Integer, nil] the @total instance variable; create_distribution
#   resets it to 0 and increments it once per word it is given
def total
  @total
end

#zipf_constantObject (readonly)

Returns the value of attribute zipf_constant.



18
19
20
# File 'lib/zipfy.rb', line 18

# Reader for the cached Zipf scaling constant.
#
# @return [Float, nil] the @zipf_constant instance variable; assigned by
#   calculate_zipf_constant and reset to nil by create_distribution
def zipf_constant
  @zipf_constant
end

Instance Method Details

#calculate_std_dev_from_regObject

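Computes the average absolute deviation of each word's zip number from its ideal value of 1/rank. The zip numbers must be calculated first (see #calculate_zipfness).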



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/zipfy.rb', line 99

# Measures how far the distribution strays from an ideal Zipf curve.
#
# The entry at index i is treated as having rank (length - i), so the last
# entry is rank 1, and its ideal zip number is 1/rank. Despite the method
# name, the value returned is the mean absolute deviation from that ideal,
# not a statistical standard deviation.
#
# Every entry's zip_number must already be set (see calculate_zipfness).
#
# @return [Float] mean of |1/rank - zip_number| over all entries, or 0.0
#   when the distribution is empty
def calculate_std_dev_from_reg
    length = @distribution.length
    # Nothing to compare against; also avoids returning NaN from 0.0 / 0.0.
    return 0.0 if length.zero?

    total_deviation = @distribution.each_with_index.inject(0.0) do |acc, (wd, i)|
        theoretical = 1.0 / (length - i)
        acc + (theoretical - wd.zip_number).abs
    end
    total_deviation / length
end

#calculate_zipf_constantObject



85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/zipfy.rb', line 85

# Computes the Zipf scaling constant and caches it in @zipf_constant.
#
# The most common word has rank 1, so its zip number should be 1/1 = 1.
# Its raw share of the text is (frequency / total), so the constant that
# scales that share up to 1 is (total / frequency of the most common word).
#
# @return [Float] total word count divided by the highest frequency
# @raise [NoMethodError] if the distribution is empty (there is no most
#   common word to read a frequency from)
def calculate_zipf_constant
    # Find the top word by frequency rather than by position, so the result
    # does not depend on sort_distribution having been called beforehand.
    most_common = @distribution.max_by { |wd| wd.frequency.to_f }
    @zipf_constant = @total.to_f / most_common.frequency.to_f
end

#calculate_zipfnessObject



56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/zipfy.rb', line 56

# Assigns a zip number to every entry in the distribution.
#
# The zip number of a word is frequency * zipf_constant / total. The constant
# is computed on demand when it has not been cached yet.
#
# @return [Array] the zip numbers that were assigned, in distribution order
def calculate_zipfness
    calculate_zipf_constant unless @zipf_constant

    # Fold the two word-independent factors into one multiplier.
    scale = @zipf_constant / @total.to_f

    @distribution.map { |entry| entry.zip_number = scale * entry.frequency }
end

#create_distribution(words) ⇒ Object

Sets the instance variable distribution to the frequency distribution of the set of words passed in.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/zipfy.rb', line 37

# Rebuilds the word-frequency distribution from a list of words.
#
# Resets @distribution, @total and @zipf_constant, counts how often each word
# occurs, then appends one WordData (word, count) per distinct word to
# @distribution. @total ends up as the number of words iterated.
#
# @param words [#each] the words to count
# @return [Array] the distinct words, in first-seen order
def create_distribution(words)
    @zipf_constant = nil
    @total = 0
    @distribution = []
    counts = Hash.new(0)

    words.each do |token|
        @total += 1
        counts[token] += 1
    end

    counts.keys.each { |token| @distribution.push(WordData.new(token, counts[token])) }
end

#load_file(file_path) ⇒ Object

Parses a file into an array of words



21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/zipfy.rb', line 21

# Reads a text file and splits it into normalized words.
#
# The text is split on whitespace, commas and hyphens; every piece is then
# stripped of non-word characters and lower-cased, and pieces left empty are
# dropped.
#
# @param file_path [String] path of the file to read
# @return [Array<String>] the cleaned words in file order
def load_file file_path
    pieces = File.read(file_path).split(/[\s,-]/)
    cleaned = pieces.map do |piece|
        # [[:word:]] is Unicode-aware, whereas \W treats every non-ASCII
        # letter as a non-word character and would strip accented letters.
        piece.gsub(/[^[:word:]]+/, '').downcase
    end
    cleaned.reject(&:empty?)
end

#puts_distributionObject



69
70
71
72
73
74
75
76
# File 'lib/zipfy.rb', line 69

# Prints the distribution as a tab-separated table, one word per line:
# word, frequency, rank (length - index) and zip number.
#
# @return [Array] the distribution that was printed
def puts_distribution
    underline " "*6 + "Word Distirbution" + " "*6
    length = @distribution.length
    @distribution.each_with_index do |wd, i|
        # One tab fewer for every 8 characters of word, to keep the columns
        # roughly aligned. Clamp to one tab so long words still get a
        # separator and never produce a negative repeat count (which raises).
        tabs = [4 - wd.word.length / 8, 1].max
        puts "#{wd.word}#{TAB * tabs}#{wd.frequency}#{TAB}#{length - i}#{TAB}#{wd.zip_number}"
    end
end

#save_dist_to_file(file_path, overwrite_file = false) ⇒ Object



78
79
80
81
82
83
# File 'lib/zipfy.rb', line 78

# Guards against accidentally overwriting an existing file.
#
# When file_path already exists and overwrite_file is not set, prints a
# message and terminates the process via exit. Otherwise it returns without
# doing anything; no writing happens in this method as shown.
#
# @param file_path [String] destination path to check
# @param overwrite_file [Boolean] true when overwriting an existing file is
#   allowed (the --force case)
# @return [nil]
def save_dist_to_file file_path, overwrite_file = false
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    return if overwrite_file || !File.exist?(file_path)

    puts "Requires --force to overwrite existing file '#{file_path}'"
    exit
end

#sort_distributionObject



52
53
54
# File 'lib/zipfy.rb', line 52

# Reorders the distribution by ascending frequency, so the most frequent word
# ends up last. Frequencies are compared as integers.
#
# @return [Array] the newly sorted distribution
def sort_distribution
    ascending = @distribution.sort_by { |entry| entry.frequency.to_i }
    @distribution = ascending
end