Module: PertinentParser

Defined in:
lib/pertinent_parser.rb

Class Method Summary collapse

Class Method Details

.html(html) ⇒ Object

TODO: It would be better to write our own traversal function, so that we can manipulate the HTML representation however we like.


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/pertinent_parser.rb', line 17

# Parses an HTML string and builds a text object from its inner text, adding
# one wrap rule per non-text node encountered during traversal.
#
# @param html the HTML source handed to Hpricot
# @return the object produced by +text+ for the document's inner text
def html(html)
  doc = Hpricot(html)
  result = text(doc.inner_text)
  offset = 0 # running count of text characters consumed so far
  doc.traverse_all_element do |node|
    length = node.inner_text.size
    if node.text?
      # Text nodes only advance the running offset.
      offset += length
    else
      # Non-text nodes get a wrap rule spanning their inner text, keyed by
      # their start tag.
      # NOTE(review): the value of `+` is discarded, so this presumably relies
      # on Text#+ attaching the rule in place -- confirm.
      result + wrap_(offset...offset + length, node.stag)
    end
  end
  result
end

.new_replace(context, target, number, replacement) ⇒ Object


72
73
74
75
76
# File 'lib/pertinent_parser.rb', line 72

# Builds a replacement rule targeting the +number+-th occurrence of +target+
# within +context+.
#
# @param context the string searched for +target+
# @param target the literal text to locate
# @param number which occurrence to select (1 selects the first)
# @param replacement the value stored in the :replacement transform
# @return [Rule] a rule pairing the located range with the transform; the
#   range is nil when range_from_specification finds no such occurrence
def new_replace(context, target, number, replacement)
  span = range_from_specification(context, target, number)
  Rule.new(span, Transform.new(:replacement, replacement))
end

.new_wrap(context, target, number, tag) ⇒ Object


57
58
59
60
# File 'lib/pertinent_parser.rb', line 57

# Builds a wrap rule around the +number+-th occurrence of +target+ within
# +context+.
#
# @param context the string searched for +target+
# @param target the literal text to locate
# @param number which occurrence to select (1 selects the first)
# @param tag the opening tag passed on to +wrap_+
# @return whatever +wrap_+ returns for the located range and +tag+
def new_wrap(context, target, number, tag)
  wrap_(range_from_specification(context, target, number), tag)
end

.offset_to_r(o) ⇒ Object


41
42
43
# File 'lib/pertinent_parser.rb', line 41

# Converts a two-element [start, end] offset pair, where the end is
# exclusive, into an inclusive Range.
#
# @param o an indexable pair; o[0] is the start, o[1] the exclusive end
# @return [Range] the inclusive range o[0]..(o[1] - 1)
def offset_to_r(o)
  first = o[0]
  last = o[1] - 1
  first..last
end

.range_from_specification(context, target, number) ⇒ Object


45
46
47
48
49
50
51
52
53
54
55
# File 'lib/pertinent_parser.rb', line 45

# Locates the +number+-th occurrence of the literal string +target+ in
# +context+ and returns its character range.
#
# Occurrences are counted by distinct start position, so overlapping
# occurrences are counted separately (in "aaa", "aa" occurs at 0 and at 1).
#
# @param context the string to search
# @param target the literal text to find (escaped, never treated as a regex)
# @param number which occurrence to return (1 selects the first)
# @return [Range, nil] the inclusive range of the occurrence as produced by
#   +offset_to_r+, or nil when there are fewer than +number+ occurrences
def range_from_specification(context, target, number)
  pattern = Regexp.new(Regexp.escape(target))
  count = 0
  position = 0
  while (match = context.match(pattern, position))
    span = match.offset(0)
    count += 1
    return offset_to_r(span) if count == number

    # Resume just past this match's start: every search from an earlier
    # position would only find this same match again, and starting here
    # still catches occurrences that overlap it.
    position = span[0] + 1
  end
  nil
end

.rule(range, transform) ⇒ Object


62
63
64
# File 'lib/pertinent_parser.rb', line 62

# Convenience constructor: builds a Rule from the given range and transform.
#
# @param range the range passed straight through to Rule.new
# @param transform the transform passed straight through to Rule.new
# @return [Rule] the newly constructed rule
def rule(range, transform)
  Rule.new(range, transform)
end

.text(s) ⇒ Object


34
35
36
37
38
39
# File 'lib/pertinent_parser.rb', line 34

# Wraps a string in a Text object whose rule is an :identity transform
# spanning the whole string.
#
# @param s the source string
# @return [Text] a Text built from +s+ with its +rule+ set to a Rule covering
#   0..(s.size - 1)
def text(s)
  identity = Transform.new(:identity, ["id"])
  whole = Rule.new((0..s.size-1), identity)
  Text.new(s).tap { |wrapped| wrapped.rule = whole }
end

.wrap_(range, tag) ⇒ Object


67
68
69
70
# File 'lib/pertinent_parser.rb', line 67

# Builds a rule that wraps +range+ in +tag+ and a matching closing tag.
#
# The closing tag is derived from the tag name captured from +tag+, e.g.
# "<b>" gives "</b>" and '<a href="x">' gives "</a>".
#
# @param range the range the rule applies to
# @param tag the opening tag, e.g. "<b>"
# @return [Rule] a rule whose :wrap transform holds [opening tag, closing tag]
# @raise [NoMethodError] if +tag+ does not match the tag pattern (the match
#   is nil and is then indexed)
def wrap_(range, tag)
  tag_name = tag.match(/<(\S*)(\s|>)/)[1]
  closing = "</" + tag_name + ">"
  Rule.new(range, Transform.new(:wrap, [tag, closing]))
end