Module: Cassiopee

Defined in:
lib/cassiopee.rb

Defined Under Namespace

Classes: Crawler, CrawlerCache

Instance Method Summary collapse

Instance Method Details

#computeAmbiguousDistance(pattern, hamming, edit, ambiguous) ⇒ Object

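Extends String. Computes the distance to pattern using the ambiguity rules: Hamming distance when edit is 0, Levenshtein distance otherwise. Returns -1 if the allowed maximum is exceeded.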



24
25
26
27
28
29
30
# File 'lib/cassiopee.rb', line 24

# Ambiguity-aware distance between this string and +pattern+.
#
# pattern   - value to compare against.
# hamming   - maximum error count, forwarded to computeHammingAmbiguous.
# edit      - maximum edit distance; when it equals 0 the Hamming variant
#             is used instead of the Levenshtein variant.
# ambiguous - ambiguity mapping forwarded to the selected method.
#
# Returns whatever the selected method returns.
def computeAmbiguousDistance(pattern, hamming, edit, ambiguous)
  # Any non-zero edit budget selects the Levenshtein variant.
  return computeLevenshteinAmbiguous(pattern, edit, ambiguous) unless edit == 0

  computeHammingAmbiguous(pattern, hamming, ambiguous)
end

#computeDistance(pattern, hamming, edit) ⇒ Object

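Extends String. Computes the distance to pattern: Hamming distance when edit is 0, Levenshtein distance otherwise. Returns -1 if the allowed maximum is exceeded.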



12
13
14
15
16
17
18
# File 'lib/cassiopee.rb', line 12

# Distance between this string and +pattern+.
#
# pattern - value to compare against.
# hamming - maximum error count, forwarded to computeHamming.
# edit    - maximum edit distance; when it equals 0 the Hamming distance is
#           used instead of the Levenshtein distance.
#
# Returns whatever the selected method returns.
def computeDistance(pattern, hamming, edit)
  # Any non-zero edit budget selects the Levenshtein distance.
  return computeLevenshtein(pattern, edit) unless edit == 0

  computeHamming(pattern, hamming)
end

#computeHamming(pattern, hamming) ⇒ Object

Extends String. Computes the Hamming distance between the string and pattern. Returns -1 if the number of mismatches exceeds hamming.



51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/cassiopee.rb', line 51

# Counts the positions at which +pattern+ differs from this string.
#
# Only the positions of this string are visited: characters of +pattern+
# beyond self.length are not examined, and positions that +pattern+ lacks
# count as mismatches.
#
# pattern - value indexed position by position against self.
# hamming - maximum number of mismatches; converted with to_i.
#
# Returns the mismatch count, or -1 as soon as it exceeds +hamming+.
def computeHamming(pattern, hamming)
  mismatches = 0
  self.length.times do |pos|
    next if pattern[pos] == self[pos]

    mismatches += 1
    # Stop early once the budget is blown.
    return -1 if mismatches > hamming.to_i
  end
  mismatches
end

#computeHammingAmbiguous(pattern, hamming, ambiguous) ⇒ Object

Compute Hamming distance but using a mapping matrix of alphabet ambiguity



34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/cassiopee.rb', line 34

# Counts the positions at which +pattern+ differs from this string, where
# two characters are considered the same when isAmbiguousEqual accepts them.
#
# Only the positions of this string are visited.
#
# pattern   - value indexed position by position against self.
# hamming   - maximum number of mismatches; converted with to_i.
# ambiguous - ambiguity mapping forwarded to isAmbiguousEqual.
#
# Returns the mismatch count, or -1 as soon as it exceeds +hamming+.
def computeHammingAmbiguous(pattern, hamming, ambiguous)
  mismatches = 0
  self.length.times do |pos|
    next if isAmbiguousEqual(pattern[pos], self[pos], ambiguous)

    mismatches += 1
    # Stop early once the budget is blown.
    return -1 if mismatches > hamming.to_i
  end
  mismatches
end

#computeLevenshtein(pattern, edit) ⇒ Object

Extends String. Calculates the edit (Levenshtein) distance between the string and pattern. Returns -1 if the distance exceeds edit.



70
71
72
73
74
75
76
77
78
79
# File 'lib/cassiopee.rb', line 70

# Edit distance between this string and +pattern+, as computed by
# Text::Levenshtein.distance.
#
# pattern - value to compare against.
# edit    - maximum accepted distance.
#
# Returns the distance, or -1 when it is greater than +edit+.
def computeLevenshtein(pattern, edit)
  dist = Text::Levenshtein.distance(self, pattern)
  dist > edit ? -1 : dist
end

#computeLevenshteinAmbiguous(pattern, edit, ambiguous) ⇒ Object

Computes the Levenshtein distance using a mapping matrix of alphabet ambiguity. The code comes from the Text gem (Text::Levenshtein.distance), adapted for ambiguity comparison.



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/cassiopee.rb', line 85

# Levenshtein (edit) distance between this string and +pattern+, where two
# characters are considered the same when isAmbiguousEqual accepts them.
# Adapted from Text::Levenshtein.distance (Text gem).
#
# pattern   - String to compare against.
# edit      - maximum accepted distance.
# ambiguous - ambiguity mapping forwarded to isAmbiguousEqual.
#
# Returns the distance, or -1 when it is greater than +edit+. The threshold
# is applied to every result, including the case where either string is
# empty (the distance is then the length of the other one).
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
  # Convert a string into an array of integers: UTF-8 code points, or bytes
  # on a Ruby without String#encoding whose $KCODE is not UTF-8.
  prepare =
    if "ruby".respond_to?(:encoding)
      lambda { |str| str.encode(Encoding::UTF_8).unpack("U*") }
    else
      rule = $KCODE.match(/^U/i) ? "U*" : "C*"
      lambda { |str| str.unpack(rule) }
    end

  s, t = [self, pattern].map(&prepare)
  n = s.length
  m = t.length

  distance =
    if n == 0
      m
    elsif m == 0
      n
    else
      # d holds the previous row of the edit matrix; e is the cell to the
      # left in the current row; x is the cell being computed.
      d = (0..m).to_a
      x = nil

      (0...n).each do |i|
        e = i + 1
        (0...m).each do |j|
          cost = isAmbiguousEqual(s[i], t[j], ambiguous) ? 0 : 1
          x = [
            d[j + 1] + 1, # insertion
            e + 1,        # deletion
            d[j] + cost   # substitution
          ].min
          d[j] = e
          e = x
        end
        d[m] = x
      end
      x
    end

  # Single exit point so the maximum is enforced for the empty-string cases too.
  distance > edit ? -1 : distance
end

#isAmbiguousEqual(a, b, ambiguous) ⇒ Object

Checks whether two characters are equal under the ambiguity rules.

  • ambiguous is a Hash mapping a char to an Array of chars



130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/cassiopee.rb', line 130

# Tells whether two characters are equal, either literally or through the
# ambiguity mapping.
#
# a, b      - characters to compare. Both one-character Strings (as passed by
#             computeHammingAmbiguous) and Integer code points (as passed by
#             computeLevenshteinAmbiguous) are accepted.
# ambiguous - nil, or a Hash mapping a character to the collection of
#             characters it may stand for. The relation is checked in both
#             directions.
#
# Returns true or false.
def isAmbiguousEqual(a, b, ambiguous)
  return true if a == b
  return false if ambiguous.nil?

  # Hash keys are characters, so Integer code points must be turned into
  # strings first. Integer#chr without an encoding raises RangeError above
  # 255 and yields a binary string for 128..255, which would never match a
  # UTF-8 key; convert through UTF-8 whenever Encoding is available.
  to_key = lambda do |c|
    c.is_a?(Integer) && defined?(Encoding) ? c.chr(Encoding::UTF_8) : c.chr
  end
  key_a = to_key.call(a)
  key_b = to_key.call(b)

  matches_a = ambiguous[key_a]
  matches_b = ambiguous[key_b]

  (!matches_a.nil? && matches_a.include?(key_b)) ||
    (!matches_b.nil? && matches_b.include?(key_a))
end