Class: Factbook::Builder

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/factbook/builder.rb

Overview

TODO: consider renaming this class to PageBuilder.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html_ascii) ⇒ Builder

Returns a new instance of Builder.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/factbook/builder.rb', line 37

# Builds the section / subsection structure for one page.
#
# Pipeline, as performed below:
#   1. run the raw page through Sanitizer#sanitize (yields html, info, errors)
#   2. replace headings by @SECTION{..} / @SUBSECTION{..} markers
#   3. split on those markers
#   4. wrap every titled chunk in a Sect holding its Subsects
#
# Chunks whose heading carries no recognizable title are skipped silently.
# Progress is printed to stdout via puts/pp along the way.
#
# @param html_ascii [String] raw page (the class-level constructors
#   document it as binary / ASCII-8BIT encoded)
def initialize( html_ascii )
  @html_ascii = html_ascii

  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )

  ## headings => markers (sections first, then subsections)
  @html_debug = map_subsects( map_sects( @html ) )

  sect_chunks = split_sects( @html_debug )
  pp sect_chunks

  @sects = []
  sect_chunks.each do |sect_head, subsect_chunks|
    puts sect_head
    puts subsect_chunks.size

    ## pull the title out of the marker, e.g.  @SECTION{Economy}  => Economy
    ## warn/fix: a chunk without a section title is dropped without notice
    next unless sect_head =~ /@SECTION{(.+?)}/

    sect_title = Regexp.last_match(1).strip
    puts sect_title

    sect = Sect.new
    sect.title = sect_title

    sect.subsects = subsect_chunks.each_with_object( [] ) do |(subsect_head, subsect_body), collected|
      ## warn/fix: a chunk without a subsection title is dropped without notice
      next unless subsect_head =~ /@SUBSECTION{(.+?)}/

      ## strip surrounding whitespace and the trailing colon (:) if present
      subsect_title = Regexp.last_match(1).strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read

      collected << subsect
    end

    @sects << sect
  end

  self  ## kept from the original; the value returned by initialize is ignored by new
end

Instance Attribute Details

#errorsObject (readonly)

third value returned by Sanitizer#sanitize for the page (assigned in the constructor)



29
30
31
# File 'lib/factbook/builder.rb', line 29

# Returns @errors, the third value handed back by Sanitizer#sanitize in
# the constructor.
# NOTE(review): presumably the problems found while sanitizing - confirm
# against Sanitizer.
def errors
  @errors
end

#htmlObject (readonly)

sanitized page: first value returned by Sanitizer#sanitize (assigned in the constructor)



29
30
31
# File 'lib/factbook/builder.rb', line 29

# Returns @html, the first value handed back by Sanitizer#sanitize in the
# constructor, i.e. the page after sanitizing and before any
# @SECTION{..} / @SUBSECTION{..} markers are inserted.
def html
  @html
end

#html_asciiObject (readonly)

full “original” 1:1 page in “original/ascii8/binary” encoding



29
30
31
# File 'lib/factbook/builder.rb', line 29

# Returns @html_ascii, the page exactly as it was passed to the
# constructor (the "original" 1:1 input, before sanitizing).
def html_ascii
  @html_ascii
end

#html_debugObject (readonly)

sanitized page with headings replaced by @SECTION{..} / @SUBSECTION{..} markers (result of map_sects followed by map_subsects)



29
30
31
# File 'lib/factbook/builder.rb', line 29

# Returns @html_debug, the sanitized page after map_sects and map_subsects
# replaced the headings with @SECTION{..} / @SUBSECTION{..} markers.
# This is the text the constructor splits into sections.
def html_debug
  @html_debug
end

#infoObject (readonly)

second value returned by Sanitizer#sanitize for the page (assigned in the constructor)



29
30
31
# File 'lib/factbook/builder.rb', line 29

# Returns @info, the second value handed back by Sanitizer#sanitize in
# the constructor.
# NOTE(review): its exact shape is defined by Sanitizer - confirm there.
def info
  @info
end

#sectsObject (readonly)

array of Sect objects built by the constructor, one per chunk with a @SECTION{..} title, each holding its title and subsections



29
30
31
# File 'lib/factbook/builder.rb', line 29

# Returns @sects, the array of Sect objects the constructor built: one
# per chunk whose heading carries a @SECTION{..} title, each with its
# title and its list of subsections assigned.
def sects
  @sects
end

Class Method Details

.from_file(path) ⇒ Object



19
20
21
22
# File 'lib/factbook/builder.rb', line 19

# Reads the page stored at +path+ and builds an instance from it.
#
# The file is read with File.binread so the bytes arrive untouched (no
# newline translation, no transcoding) and the string is tagged
# ASCII-8BIT/BINARY - the encoding from_string is documented to expect.
# (File.read would tag the string with the default external encoding.)
#
# @param path [String] path of the html file to load
# @return [Object] whatever from_string returns for the file content
def self.from_file( path )
  html_ascii = File.binread( path )
  from_string( html_ascii )
end

.from_string(html_ascii) ⇒ Object

note: expects ASCII-8BIT/BINARY encoding



24
25
26
# File 'lib/factbook/builder.rb', line 24

# Builds an instance straight from an in-memory page by handing the
# string to the constructor.
#
# note: expects ASCII-8BIT/BINARY encoding
def self.from_string( html_ascii )
  new( html_ascii )
end

Instance Method Details

#map_sects(html) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/factbook/builder.rb', line 99

# Replaces every <h2> section heading in +html+ with a "unified"
# plain-text marker of the form @SECTION{title}, padded with two newlines
# on either side. Each replacement is reported on stdout.
#
# An optional " :: NAME" tail inside the heading is dropped, e.g. both
#   <h2 sectiontitle='Introduction' ccode='au'>Introduction ::  <span class='region'>AUSTRIA </span></h2>
#   <h2>Introduction</h2>
# become @SECTION{Introduction}.
#
# @param html [String] page to convert (not modified; gsub returns a copy)
# @return [String] the page with the headings replaced
def map_sects( html )
  heading_pattern = /<h2
                 (?:\s[^>]+)?  ## allow optional attributes in h2
                 >      
                 \s*
                   ([^<>]+?)  ## note: use non-greedy; do NOT allow tags inside for now
                 \s*
                 (?:\s::\s
                   .+?       ## note: use non-greedy; allows tags inside
                 )?          ## strip optional name (e.g.  :: AUSTRIA)
                <\/h2>
              /xim

  html.gsub( heading_pattern ) do |heading|
    sect_title = Regexp.last_match(1)

    puts "** found section >#{sect_title}<:"
    puts "   >|#{heading}|<"

    "\n\n@SECTION{#{sect_title}}\n\n"
  end
end

#map_subsects(html) ⇒ Object



130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/factbook/builder.rb', line 130

# Replaces every subsection heading in +html+ with a "unified" plain-text
# marker of the form @SUBSECTION{title}, wrapped in single newlines.
# Each replacement is reported on stdout.
#
# A heading is a div carrying exactly id='field' class='category', e.g.
#   <div id='field' class='category'>Disputes - international:</div>
# which becomes @SUBSECTION{Disputes - international:}.
#
# @param html [String] page to convert (not modified; gsub returns a copy)
# @return [String] the page with the headings replaced
def map_subsects( html )
  heading_pattern = /<div \s id='field'
                     \s class='category'>
                   \s*
                   (.+?)                ## note: use non-greedy; allows tags inside - why? why not
                   \s*
                 <\/div>
               /xim

  html.gsub( heading_pattern ) do |heading|
    subsect_title = Regexp.last_match(1)

    puts "** found subsection >#{subsect_title}<:"
    puts "   >|#{heading}|<"

    "\n@SUBSECTION{#{subsect_title}}\n"
  end
end

#split_sects(html) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/factbook/builder.rb', line 157

# Splits +html+ into sections, using the @SECTION{..} markers as dividers.
#
# Result shape:
#   [[heading, subsects],
#    [heading, subsects], ...]
# where heading is the complete marker (e.g. "@SECTION{Economy}") and
# subsects is what split_subsects returns for the text up to the next
# marker.
#
# Text in front of the first marker (the prolog) is discarded.
#
# @param html [String] page with section markers already inserted
# @return [Array<Array>] one [heading, subsects] pair per marker
def split_sects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##   String#split will include all capture groups in the result array
  section_regex = /(@SECTION\{.+?\})/  ## note: non-greedy; braces escaped to make the literal intent explicit

  chunks = html.split( section_regex )

  ## String#split always returns the text in front of the first match as
  ## element 0 (an empty string when the page starts with a marker), so
  ## the prolog is dropped unconditionally. Testing it for "@SECTION"
  ## first would keep a prolog that merely mentions the word (e.g. a
  ## marker the regex could not match) and shift every pair by one.
  chunks.shift

  chunks.each_slice( 2 ).map do |heading, body|
    ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
    ## body is nil when the last marker has nothing after it (split drops
    ## the trailing empty string) - treat that as an empty section
    [heading, split_subsects( body.to_s )]
  end
end

#split_subsects(html) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/factbook/builder.rb', line 188

# Splits +html+ into subsections, using the @SUBSECTION{..} markers as
# dividers.
#
# Result shape:
#   [[heading, body],
#    [heading, body], ...]
# where heading is the complete marker (e.g. "@SUBSECTION{Area:}") and
# body is the text up to the next marker. A marker with nothing after it
# at the very end yields a one-element pair [heading].
#
# Text in front of the first marker (the prolog) is discarded.
#
# @param html [String, nil] section text; nil is treated like ""
# @return [Array<Array<String>>] one pair per marker
def split_subsects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##   String#split will include all capture groups in the result array
  subsection_regex = /(@SUBSECTION\{.+?\})/  ## note: non-greedy; braces escaped to make the literal intent explicit

  chunks = html.to_s.split( subsection_regex )

  ## String#split always returns the text in front of the first match as
  ## element 0 (an empty string when the text starts with a marker), so
  ## the prolog is dropped unconditionally. Testing it for "@SUBSECTION"
  ## first would keep a prolog that merely mentions the word and shift
  ## every pair by one.
  chunks.shift

  chunks.each_slice( 2 ).to_a
end