Top Level Namespace

Defined Under Namespace

Modules: Semtools Classes: Ontology

Instance Method Summary collapse

Instance Method Details

#binom(n, k) ⇒ Object



90
91
92
93
94
95
96
# File 'lib/semtools/math_methods.rb', line 90

# Computes the binomial coefficient "n choose k" with exact Integer arithmetic.
#
# +n+:: size of the set
# +k+:: number of elements chosen
#
# Returns 1 when +k+ is 0 or equal to +n+, 0 when +k+ lies outside 0..n
# (there is no way to make such a selection), and the usual
# n! / (k! * (n - k)!) otherwise.
def binom(n, k)
  return 1 if k == 0 || k == n
  return 0 if k < 0 || k > n
  # Only the k largest factors of n! survive the division by (n - k)!, so
  # multiply those and divide by k!. The quotient is always an exact Integer.
  (n - k + 1..n).inject(:*) / (1..k).inject(:*)
end

#complex_text_similitude(textA, textB, splitChar = ";", charsToRemove = "") ⇒ Object

Applies the WhiteSimilarity from the ‘text’ package to two given complex texts. Each complex text is split into fragments, and the fragments are compared one by one from A to B and from B to A. Param:

textA

text to be compared with textB

textB

text to be compared with textA

splitChar

character used to split the complex texts (textA and textB) into their individual names

charsToRemove

char (or chars set) to be removed from text to be compared

Returns the similarity percentage between [0,1] obtained by bidirectional all Vs all similarity



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/semtools/sim_handler.rb', line 61

# Bidirectional similarity between two "complex" texts, i.e. texts made of
# several fragments joined by +splitChar+.
#
# +textA+::         first text
# +textB+::         second text
# +splitChar+::     separator used to cut both texts into fragments
# +charsToRemove+:: characters stripped from every fragment before comparing;
#                   fragments left empty by the stripping are discarded
#
# Returns -1.0 when either text is not a non-empty String. Otherwise returns
# the mean of the two directional averages (A against B and B against A), each
# average being taken over the per-fragment scores given by ctext_AtoB.
#
# NOTE: +charsToRemove+ is interpolated as-is into a regular expression
# character class, so characters that are special inside a class are
# interpreted by the regexp engine rather than taken literally.
def complex_text_similitude(textA, textB, splitChar = ";", charsToRemove = "")
  # Guard clauses: both arguments must be non-empty Strings
  return -1.0 unless textA.is_a?(String) && textB.is_a?(String)
  return -1.0 if textA.empty? || textB.empty?

  fragmentsA = textA.split(splitChar)
  fragmentsB = textB.split(splitChar)

  # Optional cleaning step, applied in place to both fragment lists
  unless charsToRemove.empty?
    [fragmentsA, fragmentsB].each do |fragments|
      fragments.map! { |fragment| fragment.gsub(/[#{charsToRemove}]/, '') }
      fragments.reject! { |fragment| fragment.length <= 0 }
    end
  end

  # Best match of every fragment against the opposite set, in both directions
  scores_AtoB = ctext_AtoB(fragmentsA, fragmentsB)
  scores_BtoA = ctext_AtoB(fragmentsB, fragmentsA)

  average = lambda { |scores| scores.inject(:+).to_f / scores.size }
  (average.call(scores_AtoB) + average.call(scores_BtoA)) / 2
end

#compute_hyper_prob(a, b, c, d, n) ⇒ Object



82
83
84
85
86
87
88
# File 'lib/semtools/math_methods.rb', line 82

# Probability of one 2x2 contingency table with cells a, b (first row) and
# c, d (second row) out of n total elements, following the Fisher's exact test
# formulation:
# https://en.wikipedia.org/wiki/Fisher%27s_exact_test
#
# Returns binom(a + b, a) * binom(c + d, c) / binom(n, a + c) as a Float.
def compute_hyper_prob(a, b, c, d, n)
  favourable = binom(a + b, a) * binom(c + d, c)
  # fdiv keeps the numerator as an exact Integer until the final division
  favourable.fdiv(binom(n, a + c))
end

#ctext_AtoB(textsA, textsB) ⇒ Object

Applies the WhiteSimilarity from the ‘text’ package to two given text sets and returns the similitude of each element of the first set against the second set. Param:

textsA

text set to be compared with textsB

textsB

text set to be compared with textsA

Returns the maximum similarity percentage between [0,1] for each element of textsA against all elements of textsB



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/semtools/sim_handler.rb', line 28

# For every fragment of +textsA+, finds its best similarity against all the
# fragments of +textsB+ (each pair is scored with text_similitude).
#
# +textsA+:: Array of fragments to be scored
# +textsB+:: Array of fragments to score against
#
# Returns an Array with one maximum score per element of +textsA+, or [-1.0]
# when either argument is not a non-empty Array.
#
# If the scores of a fragment cannot be compared, the offending data and the
# error are written to STDERR and the process is terminated with a failure
# exit status.
def ctext_AtoB(textsA, textsB)
  # Check special cases
  return [-1.0] unless textsA.is_a?(Array) && textsB.is_a?(Array)
  return [-1.0] if textsA.empty? || textsB.empty?
  # Calculate similitude
  textsA.map do |fragA|
    frag_A_similitudes = textsB.map { |fragB| text_similitude(fragA, fragB) }
    begin
      frag_A_similitudes.max
    rescue => e
      STDERR.puts frag_A_similitudes.inspect
      STDERR.puts textsA.inspect , textsB.inspect
      STDERR.puts e.message
      STDERR.puts e.backtrace
      # Report the failure to the calling shell: a bare Process.exit would
      # finish with a success status even though an error was just printed.
      Process.exit(false)
    end
  end
end

#cummin(array) ⇒ Object



121
122
123
124
125
126
127
128
129
# File 'lib/semtools/math_methods.rb', line 121

# Cumulative minimum: element i of the result is the smallest value found in
# array[0..i].
#
# +array+:: values to scan, in order
#
# Returns a new Array of the same length as +array+ (empty for empty input).
def cummin(array)
  lowest_so_far = array.first
  array.map { |value| lowest_so_far = [value, lowest_so_far].min }
end

#get_benjaminiHochberg_pvalues(arr_pvalues) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/semtools/math_methods.rb', line 100

# Adjusts a list of p-values with the Benjamini-Hochberg procedure.
#
# +arr_pvalues+:: Array of raw p-values
#
# Returns an Array of adjusted p-values, capped at 1 by pmin, in the same
# order as the input (empty Array for empty input).
def get_benjaminiHochberg_pvalues(arr_pvalues)
  n = arr_pvalues.length
  # Positions of the input sorted from largest to smallest p-value. The
  # ranking is built from indices (not by looking values up again) so that
  # repeated p-values keep distinct positions and this is a true permutation.
  decreasing_positions = arr_pvalues.each_index.sort_by { |i| [arr_pvalues[i], i] }.reverse
  # The p-value at decreasing rank i has ascending rank (n - i): scale by n / rank
  arr_cummin_input = decreasing_positions.each_with_index.map do |position, i|
    (n / (n - i).to_f) * arr_pvalues[position]
  end
  arr_pmin = pmin(cummin(arr_cummin_input))
  # Put every adjusted value back at the position of its original p-value
  adjusted = Array.new(n)
  decreasing_positions.each_with_index { |position, i| adjusted[position] = arr_pmin[i] }
  return adjusted
end

#get_fisher_exact_test(listA, listB, all_elements_count, tail = 'two_sided', weigths = nil) ⇒ Object

TODO: Make a pull request to rubygems.org/gems/ruby-statistics with all the statistics code implemented here. Computes the Fisher exact test. Fisher => www.biostathandbook.com/fishers.html



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/semtools/math_methods.rb', line 4

# Builds the 2x2 contingency table for two lists of elements and delegates to
# the requested tail computation.
#
# +listA+::              first list of elements
# +listB+::              second list of elements
# +all_elements_count+:: size of the universe (ignored when +weigths+ is given)
# +tail+::               'two_sided' (get_two_tail) or 'less' (get_less_tail)
# +weigths+::            optional mapping element => weight; when given, every
#                        cell is the rounded-up sum of the weights of its
#                        elements and the universe is the rounded-up sum of
#                        all weights
#
# Returns whatever the selected tail method returns, or nil when +tail+ is
# neither 'two_sided' nor 'less'.
def get_fisher_exact_test(listA, listB, all_elements_count, tail = 'two_sided', weigths = nil)
  # Three of the four cells are sets of elements; the fourth is what remains
  cell_members = [listA & listB, listA - listB, listB - listA]
  if weigths.nil?
    cells = cell_members.map { |members| members.length }
    cells << all_elements_count - (listA | listB).length
    universe = all_elements_count
  else
    # Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
    # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
    weighted_size = lambda do |members|
      members.map { |member| weigths[member] }.inject(0) { |total, weight| total + weight }.ceil
    end
    cells = cell_members.map { |members| weighted_size.call(members) }
    cells << weighted_size.call(weigths.keys - (listA | listB))
    universe = weigths.values.inject(0) { |total, weight| total + weight }.ceil
  end
  case tail
  when 'two_sided' then get_two_tail(*cells, universe)
  when 'less' then get_less_tail(*cells, universe)
  end
end

#get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/semtools/math_methods.rb', line 68

# Accumulates the probability of the observed 2x2 table and of every table
# obtained by moving elements out of the A-and-B cell (the "less" tail).
#
# The four counts are the cells of the contingency table and
# +all_elements_count+ is the size of the universe; each table is scored with
# compute_hyper_prob.
#
# Returns the accumulated probability.
def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  # Elements can be shifted until either diagonal cell reaches zero
  max_shift = [listA_listB_count, nolistA_nolistB_count].min
  accumulated_prob = 0
  # Inclusive range: shift 0 is the observed table and max_shift is the most
  # extreme one. Both belong to the tail, so neither may be skipped.
  (0..max_shift).each do |n|
    accumulated_prob += compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
  end
  return accumulated_prob
end

#get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/semtools/math_methods.rb', line 30

# Two-sided Fisher exact test p-value for a 2x2 contingency table: the
# probability of the observed table plus the probability of every other table
# reachable with the same margins that is at most as probable as the observed
# one.
# https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
#
# The four counts are the cells of the table and +all_elements_count+ is the
# size of the universe; each table is scored with compute_hyper_prob.
#
# Returns the accumulated probability.
def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  ref_prob = compute_hyper_prob(
    listA_listB_count,
    listA_nolistB_count,
    nolistA_listB_count,
    nolistA_nolistB_count,
    all_elements_count
  )
  accumulated_prob = 0
  accumulated_prob += ref_prob

  # Every table on each side has to be inspected. Stopping at the first table
  # that is more probable than the observed one would miss the improbable
  # tables lying beyond the mode of the distribution.
  (1..[listA_listB_count, nolistA_nolistB_count].min).each do |n| #less
    prob = compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  (1..[listA_nolistB_count, nolistA_listB_count].min).each do |n| #greater
    prob = compute_hyper_prob(
      listA_listB_count + n,
      listA_nolistB_count - n,
      nolistA_listB_count - n,
      nolistA_nolistB_count + n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  return accumulated_prob
end

#order(array, decreasing = false) ⇒ Object



113
114
115
116
117
118
119
# File 'lib/semtools/math_methods.rb', line 113

# Returns the permutation of indices that sorts +array+.
#
# +array+::      values to rank
# +decreasing+:: pass false (the default) for ascending order; any other
#                value gives the ascending permutation reversed
#
# Every index appears exactly once in the result, also when +array+ holds
# repeated values (ties are listed by ascending index before any reversal).
def order(array, decreasing = false)
  # Sort the indices themselves: looking each sorted value up again with
  # Array#index would return the first match for every repeated value.
  ranking = array.each_index.sort_by { |index| [array[index], index] }
  decreasing == false ? ranking : ranking.reverse
end

#pmin(array) ⇒ Object



131
132
133
134
135
136
137
138
139
# File 'lib/semtools/math_methods.rb', line 131

# Caps every value of +array+ at 1.
#
# +array+:: numeric values
#
# Returns a new Array where each element is the smaller of the original value
# and 1. No further validation is needed: a value that has just been capped
# can never exceed 1.
def pmin(array)
  array.map { |value| [value, 1].min }
end

#similitude_network(items_array, splitChar = ";", charsToRemove = "", unique = false) ⇒ Object

Applies the WhiteSimilarity from the ‘text’ package to all the complex texts stored in an array. Each complex text is split into fragments, and the fragments are compared one by one from A to B and from B to A. Param:

items_array

text elements to be compared all against others

splitChar

character used to split each complex text into its individual names

charsToRemove

char (or chars set) to be removed from texts to be compared

unique

boolean flag which indicates if repeated elements must be removed

Returns the similarity percentage for every pair of elements in the array



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/semtools/sim_handler.rb', line 95

# Computes the similarity of every element of +items_array+ against all the
# elements that follow it, using complex_text_similitude.
#
# +items_array+::   Array of complex texts
# +splitChar+::     separator forwarded to complex_text_similitude
# +charsToRemove+:: characters to strip, forwarded to complex_text_similitude
# +unique+::        when true, repeated elements are dropped before comparing
#
# Returns nil when +items_array+ is not a non-empty Array. Otherwise returns a
# Hash of Hashes: sims[a][b] holds the similarity of a against a later element
# b. The last element has nothing after it and so gets no entry of its own.
#
# The caller's array is left untouched.
def similitude_network(items_array, splitChar = ";", charsToRemove = "", unique = false)
  # Special cases
  return nil unless items_array.is_a?(Array) && !items_array.empty?
  # Remove repeated elements (on a copy, never on the caller's array)
  items = unique ? items_array.uniq : items_array
  # Define hash to be filled
  sims = {}
  # Per each item into array => Calculate similitude against the following ones
  items.each_with_index do |current, position|
    followers = items.drop(position + 1)
    break if followers.empty?
    sims[current] = {}
    followers.each do |item|
      sims[current][item] = complex_text_similitude(current, item, splitChar, charsToRemove)
    end
  end
  return sims
end

#text_similitude(textA, textB) ⇒ Object

Applies the WhiteSimilarity from ‘text’ package over two given texts Param:

textA

text to be compared with textB

textB

text to be compared with textA

Returns the similarity percentage between [0,1]



11
12
13
14
15
16
17
18
19
20
# File 'lib/semtools/sim_handler.rb', line 11

# Similarity between two plain texts, computed with Text::WhiteSimilarity.
#
# +textA+:: first text
# +textB+:: second text
#
# Returns -1.0 when either argument is not a non-empty String; otherwise the
# value given by WhiteSimilarity#similarity for both texts with their leading
# whitespace removed.
def text_similitude(textA, textB)
  # Guard clauses: both arguments must be non-empty Strings
  return -1.0 unless textA.is_a?(String) && textB.is_a?(String)
  return -1.0 if textA.empty? || textB.empty?
  # The 'text' library is only loaded once a real comparison is needed
  require 'text'
  Text::WhiteSimilarity.new.similarity(textA.lstrip, textB.lstrip)
end