Class: Factbook::Builder
- Inherits:
-
Object
- Object
- Factbook::Builder
- Includes:
- LogUtils::Logging
- Defined in:
- lib/factbook/builder.rb
Overview
todo: change to PageBuilder ???
Instance Attribute Summary collapse
-
#errors ⇒ Object
readonly
errors value for the page: the third value returned by Sanitizer#sanitize.
-
#html ⇒ Object
readonly
sanitized page html: the first value returned by Sanitizer#sanitize for html_ascii.
-
#html_ascii ⇒ Object
readonly
full “original” 1:1 page in “original/ascii8/binary” encoding.
-
#html_debug ⇒ Object
readonly
sanitized html with section and subsection headings replaced by @SECTION{..} and @SUBSECTION{..} markers.
-
#info ⇒ Object
readonly
page info: the second value returned by Sanitizer#sanitize.
-
#sects ⇒ Object
readonly
sections built from the page: an array of Sect objects, each with a title and its subsects.
Class Method Summary collapse
- .from_file(path) ⇒ Object
-
.from_string(html_ascii) ⇒ Object
note: expects ASCII-8BIT/BINARY encoding.
Instance Method Summary collapse
-
#initialize(html_ascii) ⇒ Builder
constructor
A new instance of Builder.
- #map_sects(html) ⇒ Object
- #map_subsects(html) ⇒ Object
- #split_sects(html) ⇒ Object
- #split_subsects(html) ⇒ Object
Constructor Details
#initialize(html_ascii) ⇒ Builder
Returns a new instance of Builder.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/factbook/builder.rb', line 37

# Builds the page model from the raw page source.
#
# Steps, in order:
# 1. keep the raw input in @html_ascii,
# 2. run it through Sanitizer#sanitize, which yields @html, @info and @errors,
# 3. replace headings with @SECTION{..} / @SUBSECTION{..} markers (@html_debug),
# 4. split the marked-up text on those markers,
# 5. turn every chunk that carries a title into a Sect holding Subsect records.
#
# Chunks whose heading does not match the marker pattern are skipped silently.
# Note: the split structure and every title found are printed to stdout.
#
# @param html_ascii [String] raw page source; stored unchanged
def initialize( html_ascii )
  @html_ascii = html_ascii

  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )

  @html_debug = map_subsects( map_sects( @html ) )

  sect_chunks = split_sects( @html_debug )
  pp sect_chunks

  @sects = []
  sect_chunks.each do |sect_heading, subsect_chunks|
    puts sect_heading
    puts subsect_chunks.size

    ## @SECTION{Economy} => Economy
    ## warn/fix: no section title found
    next unless sect_heading =~ /@SECTION{(.+?)}/

    sect_title = Regexp.last_match(1).strip
    puts sect_title

    sect = Sect.new
    sect.title = sect_title

    collected = []
    subsect_chunks.each do |subsect_heading, subsect_body|
      ## warn/fix: no subsection title found
      next unless subsect_heading =~ /@SUBSECTION{(.+?)}/

      # strip, drop one trailing colon if present, strip again
      subsect_title = Regexp.last_match(1).strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read
      collected << subsect
    end

    sect.subsects = collected
    @sects << sect
  end

  self ## return self -- needed?? default (standard) anyway?? check and remove
end
Instance Attribute Details
#errors ⇒ Object (readonly)
errors value for the page: the third value returned by Sanitizer#sanitize
29 30 31 |
# File 'lib/factbook/builder.rb', line 29

# Reader for @errors, which the constructor fills with the third value
# returned by Sanitizer#sanitize.
#
# @return [Object] the stored value (nil if never assigned)
def errors
  @errors
end
#html ⇒ Object (readonly)
sanitized page html: the first value returned by Sanitizer#sanitize for html_ascii
29 30 31 |
# File 'lib/factbook/builder.rb', line 29

# Reader for @html, which the constructor fills with the first value
# returned by Sanitizer#sanitize.
#
# @return [Object] the stored value (nil if never assigned)
def html
  @html
end
#html_ascii ⇒ Object (readonly)
full “original” 1:1 page in “original/ascii8/binary” encoding
29 30 31 |
# File 'lib/factbook/builder.rb', line 29

# Reader for @html_ascii: the page source exactly as it was handed to the
# constructor (full "original" 1:1 page in "original/ascii8/binary" encoding).
#
# @return [Object] the stored value (nil if never assigned)
def html_ascii
  @html_ascii
end
#html_debug ⇒ Object (readonly)
sanitized html with section and subsection headings replaced by @SECTION{..} and @SUBSECTION{..} markers
29 30 31 |
# File 'lib/factbook/builder.rb', line 29

# Reader for @html_debug: the sanitized html after the constructor has run
# it through map_sects and map_subsects (headings replaced by markers).
#
# @return [Object] the stored value (nil if never assigned)
def html_debug
  @html_debug
end
#info ⇒ Object (readonly)
page info: the second value returned by Sanitizer#sanitize
29 30 31 |
# File 'lib/factbook/builder.rb', line 29

# Reader for @info, which the constructor fills with the second value
# returned by Sanitizer#sanitize.
#
# @return [Object] the stored value (nil if never assigned)
def info
  @info
end
#sects ⇒ Object (readonly)
sections built from the page: an array of Sect objects, each with a title and its subsects
29 30 31 |
# File 'lib/factbook/builder.rb', line 29

# Reader for @sects: the array the constructor fills with one Sect per
# section heading it could extract a title from.
#
# @return [Object] the stored value (nil if never assigned)
def sects
  @sects
end
Class Method Details
.from_file(path) ⇒ Object
19 20 21 22 |
# File 'lib/factbook/builder.rb', line 19

# Reads the page stored at +path+ and builds from its raw bytes.
#
# The file is read with File.binread, so the string handed to .from_string
# is tagged ASCII-8BIT (binary) and no newline or encoding conversion is
# applied. A text-mode File.read would instead tag the bytes with the
# default external encoding, which is not what .from_string expects.
#
# @param path [String] path of the page file
# @return [Object] whatever .from_string returns for the file content
def self.from_file( path )
  html_ascii = File.binread( path )
  from_string( html_ascii )
end
.from_string(html_ascii) ⇒ Object
note: expects ASCII-8BIT/BINARY encoding
24 25 26 |
# File 'lib/factbook/builder.rb', line 24

# Builds from page source that is already in memory.
#
# note: expects ASCII-8BIT/BINARY encoding
#
# @param html_ascii [String] raw page source, passed straight to .new
# @return [Object] the value returned by .new
def self.from_string( html_ascii )
  new( html_ascii )
end
Instance Method Details
#map_sects(html) ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/factbook/builder.rb', line 99

# Replaces every <h2>..</h2> section heading with a "unified" marker line.
#
# e.g.
#   <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
#   <h2>Introduction</h2>
# both become
#   "\n\n@SECTION{Introduction}\n\n"
#
# Each heading found is also printed to stdout.
#
# @param html [String] page html
# @return [String] a new string with the headings replaced; +html+ itself is not modified
def map_sects( html )
  title_regex = /<h2
                   (?:\s[^>]+)?  ## allow optional attributes in h2
                 >
                 \s*
                 ([^<>]+?)       ## note: use non-greedy; do NOT allow tags inside for now
                 \s*
                 (?:\s::\s
                    .+?          ## note: use non-greedy; allows tags inside
                 )?              ## strip optional name (e.g. :: AUSTRIA)
                 <\/h2>
                /xim

  html.gsub( title_regex ) do |matched|
    section_title = Regexp.last_match(1)
    puts "** found section >#{section_title}<:"
    puts " >|#{matched}|<"

    "\n\n@SECTION{#{section_title}}\n\n"
  end
end
#map_subsects(html) ⇒ Object
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/factbook/builder.rb', line 130

# Replaces every <div id='field' class='category'>..</div> subsection
# heading with a "unified" marker line.
#
# e.g.
#   <div id='field' class='category'>Disputes - international:</div>
# becomes
#   "\n@SUBSECTION{Disputes - international:}\n"
#
# Each heading found is also printed to stdout.
#
# @param html [String] page html
# @return [String] a new string with the headings replaced; +html+ itself is not modified
def map_subsects( html )
  title_regex = /<div \s id='field' \s class='category'>
                 \s*
                 (.+?)     ## note: use non-greedy; allows tags inside - why? why not
                 \s*
                 <\/div>
                /xim

  html.gsub( title_regex ) do |matched|
    subsection_title = Regexp.last_match(1)
    puts "** found subsection >#{subsection_title}<:"
    puts " >|#{matched}|<"

    "\n@SUBSECTION{#{subsection_title}}\n"
  end
end
#split_sects(html) ⇒ Object
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/factbook/builder.rb', line 157

# Splits marked-up html into sections (divided by @SECTION{..} headings).
#
# Result shape:
#   [[heading, subsects],
#    [heading, subsects], ...]
# where heading is the full "@SECTION{..}" marker and subsects is whatever
# split_subsects returns for the text following that marker.
#
# Text before the first marker (the prolog) is dropped; html without any
# marker yields [].
#
# @param html [String] html in which section headings were replaced by markers
# @return [Array<Array>] one [heading, subsects] pair per marker
def split_sects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##  String#split will include all capture groups in the result array
  ## braces are escaped so they are always read as literals
  section_regex = /(@SECTION\{.+?\})/   ## note: use non-greedy

  ## limit -1 keeps a trailing empty field, so a marker at the very end of
  ##  html still gets a body ("") instead of nil
  chunks = html.split( section_regex, -1 )

  ## the first chunk is always the text in front of the first marker
  ##  (possibly ""), never a heading - drop it unconditionally
  chunks.shift

  ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
  chunks.each_slice(2).map do |heading, body|
    [heading, split_subsects( body )]
  end
end
#split_subsects(html) ⇒ Object
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/factbook/builder.rb', line 188

# Splits marked-up html into subsections (divided by @SUBSECTION{..} headings).
#
# Result shape:
#   [[heading, body],
#    [heading, body], ...]
# where heading is the full "@SUBSECTION{..}" marker and body is the text up
# to the next marker (or the end of html). Every pair has both elements; a
# marker with nothing after it gets the body "".
#
# Text before the first marker (the prolog) is dropped; nil, empty input or
# html without any marker yields [].
#
# @param html [String, nil] html in which subsection headings were replaced by markers
# @return [Array<Array<String>>] one [heading, body] pair per marker
def split_subsects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##  String#split will include all capture groups in the result array
  ## braces are escaped so they are always read as literals
  subsection_regex = /(@SUBSECTION\{.+?\})/   ## note: use non-greedy

  ## limit -1 keeps a trailing empty field, so the last pair is never
  ##  left with a heading only; to_s lets a missing (nil) body through as ""
  chunks = html.to_s.split( subsection_regex, -1 )

  ## the first chunk is always the text in front of the first marker
  ##  (possibly ""), never a heading - drop it unconditionally
  chunks.shift

  chunks.each_slice(2).to_a
end