Class: Coradoc::Input::HTML::Postprocessor

Inherits:
Object
  • Object
show all
Defined in:
lib/coradoc/input/html/postprocessor.rb

Overview

The Postprocessor converts a Coradoc tree built from messy HTML input into a tree that is compatible with what Coradoc would produce if it had parsed the equivalent document directly.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(coradoc) ⇒ Postprocessor

Returns a new instance of Postprocessor.



11
12
13
# File 'lib/coradoc/input/html/postprocessor.rb', line 11

# Creates a postprocessor for the given tree.
#
# @param coradoc the Coradoc tree to post-process; stored in @tree, which
#   the processing steps of this class read and reassign
def initialize(coradoc)
  @tree = coradoc
end

Class Method Details

.process(coradoc) ⇒ Object



7
8
9
# File 'lib/coradoc/input/html/postprocessor.rb', line 7

# Convenience entry point: builds a Postprocessor around +coradoc+ and
# returns whatever its instance-level #process returns.
#
# @param coradoc the Coradoc tree to post-process
def self.process(coradoc)
  new(coradoc).process
end

Instance Method Details

#collapse_meaningless_sections ⇒ Object

Collapse DIVs that only have a title, or nest another DIV.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/coradoc/input/html/postprocessor.rb', line 16

# Replaces every collapsible section whose children are exclusively
# sections and/or titles with its bare contents, and stores the visited
# tree back into @tree.
#
# A section is left untouched when it reports that it is not safe to
# collapse, when it has no contents at all, or when any child is
# something other than exactly a Section or a Title.
def collapse_meaningless_sections
  @tree = Coradoc::Element::Base.visit(@tree) do |node, _dir|
    next node unless node.is_a?(Coradoc::Element::Section) && node.safe_to_collapse?

    kinds = Array(node.contents).map(&:class)

    # An empty section must survive: some documents use <div> as a <br>.
    next node if kinds.empty?

    transparent = [Coradoc::Element::Section, Coradoc::Element::Title]
    (kinds - transparent).empty? ? node.contents : node
  end
end

#generate_meaningful_sections ⇒ Object

The tree should now be cleaner, so we can proceed with creating meaningful sections.



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/coradoc/input/html/postprocessor.rb', line 34

# Turns flat "title, content, title, content ..." arrays into nested
# Section elements and stores the visited tree back into @tree.
#
# Only arrays that contain at least one Title are rewritten, and only on
# the :post pass of the visitor. Elements preceding the first title stay
# at the top of the rewritten array.
def generate_meaningful_sections
  @tree = Coradoc::Element::Base.visit(@tree) do |node, phase|
    candidate = phase == :post &&
      node.is_a?(Array) &&
      node.any?(Coradoc::Element::Title)
    next node unless candidate

    root = []
    current_contents = root
    # slots[n] is the array that receives a section of level n + 1.
    # Every slot initially aliases the root array.
    # NOTE(review): assumes level_int stays within 1..8 - confirm.
    slots = Array.new(8, root)

    node.each do |child|
      # Non-title elements become content of the most recent section
      # (or of the root while no title has been seen yet).
      unless child.is_a?(Coradoc::Element::Title)
        current_contents << child
        next
      end

      depth = child.level_int
      current_contents = []
      nested = []
      section = Coradoc::Element::Section.new(
        child, contents: current_contents, sections: nested
      )

      # Documents may skip levels (e.g. an H4 right after an H2), so every
      # deeper slot is redirected to this section's subsection array.
      (depth...8).each { |slot| slots[slot] = nested }
      slots[depth - 1] << section
    end

    root
  end
end

#process ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/coradoc/input/html/postprocessor.rb', line 139

# Runs the complete post-processing pipeline on @tree and returns the
# result.
#
# The returned value is @tree as left by the last step. split_sections
# leaves @tree alone when section splitting is not configured; otherwise
# it replaces @tree with a Hash of file name => content, where the nil key
# holds the main tree.
def process
  collapse_meaningless_sections
  generate_meaningful_sections
  # Do it again to simplify the document further.
  # Since the structure is changed, we may have new meaningful
  # sections as only children of some meaningless sections.
  collapse_meaningless_sections

  # Optional step; a no-op unless a split level is configured.
  split_sections

  @tree
end

#split_sections ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/coradoc/input/html/postprocessor.rb', line 75

# Moves every section whose title level does not exceed the configured
# split level into its own file entry, leaving an include directive in
# its place.
#
# Does nothing (and returns nil) when
# Coradoc::Input::HTML.config.split_sections is not set. Otherwise @tree
# is replaced with a Hash mapping each generated file name to the section
# it holds; the remaining main tree is stored under the nil key.
#
# File names have the form "sections/<style>-<id>[/<style>-<id>...].adoc",
# where <style> is the title style (default "section") and <id> counts the
# earlier sections of the same style at the same level: "01", "02", ...
# or "A", "B", ... for the "appendix" style.
def split_sections
  max_level = Coradoc::Input::HTML.config.split_sections
  return unless max_level

  files = {}
  # open_sections[n] is the most recently entered section of level n.
  open_sections = []
  # Maps a section to the section that occupied its level slot before it.
  earlier_in_slot = {}

  ordinal_for = lambda do |sec|
    kind = sec.title.style
    counter = kind == "appendix" ? "A" : 1

    cursor = earlier_in_slot[sec]
    while cursor
      counter = counter.succ if kind == cursor.title.style
      cursor = earlier_in_slot[cursor]
    end

    counter.is_a?(Integer) ? format("%02d", counter) : counter
  end

  prefix_for = ->(sec) { (sec.title.style || "section") + "-" }

  @tree = Coradoc::Element::Base.visit(@tree) do |node, phase|
    heading = node.is_a?(Coradoc::Element::Section) ? node.title : nil
    next node unless heading && heading.level_int <= max_level

    depth = heading.level_int

    if phase == :pre
      # PRE pass: record the chain of sections so that ordinals can be
      # computed later, and forget any deeper sections still open.
      earlier_in_slot[node] = open_sections[depth]
      open_sections[depth] = node
      open_sections[(depth + 1)..nil] = nil
      next node
    end

    # POST pass: swap the section for an include directive.
    # NOTE(review): if a heading level is skipped, the slice below may
    # contain nil - confirm whether that can reach this point.
    segments = open_sections[1..depth].map do |ancestor|
      prefix_for.call(ancestor) + ordinal_for.call(ancestor)
    end
    target = "sections/#{segments.join('/')}.adoc"

    files[target] = node
    climb = "../" * (depth - 1)
    "\ninclude::#{climb}#{target}[]\n"
  end

  files[nil] = @tree
  @tree = files
end