Class: Factbook::Builder

Inherits:

Object

Object
Factbook::Builder

show all

Includes:: LogUtils::Logging

Defined in:: lib/factbook/builder.rb

Overview

Builds the section / subsection structure of a single page from its raw HTML.

TODO: consider renaming this class to PageBuilder.

Instance Attribute Summary collapse

#errors ⇒ Object readonly

third value returned by Sanitizer#sanitize for the page (presumably the problems found while sanitizing).
#html ⇒ Object readonly

first value returned by Sanitizer#sanitize: the sanitized page HTML that the sections are extracted from.
#html_ascii ⇒ Object readonly

full “original” 1:1 page in “original/ascii8/binary” encoding.
#html_debug ⇒ Object readonly

sanitized HTML with the headings replaced by @SECTION{..} / @SUBSECTION{..} markers.
#info ⇒ Object readonly

second value returned by Sanitizer#sanitize for the page (presumably page meta information).
#sects ⇒ Object readonly

sections (Sect objects, each with a title and its subsects) built from the page.

Class Method Summary collapse

.from_file(path) ⇒ Object
.from_string(html_ascii) ⇒ Object

note: expects ASCII-8BIT/BINARY encoding.

Instance Method Summary collapse

#initialize(html_ascii) ⇒ Builder constructor

A new instance of Builder.
#map_sects(html) ⇒ Object
#map_subsects(html) ⇒ Object
#split_sects(html) ⇒ Object
#split_subsects(html) ⇒ Object

Constructor Details

#initialize(html_ascii) ⇒ `Builder`

Returns a new instance of Builder.

# File 'lib/factbook/builder.rb', line 37

# Builds the section / subsection model for one page.
#
# Steps, in order:
#   1. keep the raw input in @html_ascii
#   2. run Sanitizer#sanitize on it; the three values it returns are stored
#      in @html, @info and @errors
#   3. replace the headings with @SECTION{..} / @SUBSECTION{..} markers
#      (map_sects, then map_subsects); the result is kept in @html_debug
#   4. split along those markers and build one Sect per section holding one
#      Subsect per subsection; the subsection data comes from ItemBuilder#read
#
# Chunks whose heading does not match the marker pattern are skipped silently.
# Note: progress / debug output is printed to stdout via pp and puts.
#
# @param html_ascii [String] the raw page
def initialize( html_ascii )
  @html_ascii = html_ascii

  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )

  @html_debug = map_subsects( map_sects( @html ) )

  sect_chunks = split_sects( @html_debug )
  pp sect_chunks

  @sects = []
  sect_chunks.each do |sect_head, subsect_chunks|
    puts sect_head
    puts subsect_chunks.size

    ## e.g.  @SECTION{Economy}  => Economy
    ## warn/fix: no section title found => chunk is skipped
    next unless sect_head =~ /@SECTION{(.+?)}/

    sect_title = Regexp.last_match( 1 ).strip
    puts sect_title

    sect = Sect.new
    sect.title = sect_title

    subsects = []
    subsect_chunks.each do |subsect_head, subsect_body|
      ## warn/fix: no subsection title found => chunk is skipped
      next unless subsect_head =~ /@SUBSECTION{(.+?)}/

      ## drop a trailing colon (:) if present, e.g. "Area:" => "Area"
      subsect_title = Regexp.last_match( 1 ).strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read

      subsects << subsect
    end

    sect.subsects = subsects
    @sects << sect
  end

  self  ## kept as in the original; Class#new ignores the return value of initialize
end

Instance Attribute Details

#errors ⇒ `Object` (readonly)

third value returned by Sanitizer#sanitize for the page (presumably the problems found while sanitizing)



29
30
31

# File 'lib/factbook/builder.rb', line 29

# Reader for @errors: the third value returned by Sanitizer#sanitize in the
# constructor (presumably the problems found while sanitizing -- confirm
# against Sanitizer).
def errors
  @errors
end

#html ⇒ `Object` (readonly)

first value returned by Sanitizer#sanitize: the sanitized page HTML that the sections are extracted from



29
30
31

# File 'lib/factbook/builder.rb', line 29

# Reader for @html: the first value returned by Sanitizer#sanitize in the
# constructor; this is the HTML that map_sects / map_subsects work on.
def html
  @html
end

#html_ascii ⇒ `Object` (readonly)

full “original” 1:1 page in “original/ascii8/binary” encoding



29
30
31

# File 'lib/factbook/builder.rb', line 29

# Reader for @html_ascii: the raw page exactly as it was handed to the
# constructor (stored before any sanitizing or marker substitution).
def html_ascii
  @html_ascii
end

#html_debug ⇒ `Object` (readonly)

sanitized HTML with the headings replaced by @SECTION{..} / @SUBSECTION{..} markers



29
30
31

# File 'lib/factbook/builder.rb', line 29

# Reader for @html_debug: the sanitized HTML after map_sects and map_subsects
# replaced the headings with @SECTION{..} / @SUBSECTION{..} markers.
def html_debug
  @html_debug
end

#info ⇒ `Object` (readonly)

second value returned by Sanitizer#sanitize for the page (presumably page meta information)



29
30
31

# File 'lib/factbook/builder.rb', line 29

# Reader for @info: the second value returned by Sanitizer#sanitize in the
# constructor (presumably page meta information -- confirm against Sanitizer).
def info
  @info
end

#sects ⇒ `Object` (readonly)

sections (Sect objects, each with a title and its subsects) built from the page



29
30
31

# File 'lib/factbook/builder.rb', line 29

# Reader for @sects: the array of Sect objects built in the constructor, one
# per section marker found, each carrying its title and its subsects.
def sects
  @sects
end

Class Method Details

.from_file(path) ⇒ `Object`

# File 'lib/factbook/builder.rb', line 19

# Reads the page stored at +path+ and builds an instance from it.
#
# The file is read with File.binread, i.e. in binary mode: the bytes arrive
# untouched (no newline translation, no transcoding) and the resulting string
# is tagged ASCII-8BIT, which is the encoding from_string says it expects.
# File.read would instead tag the bytes with the default external encoding.
#
# @param path [String] location of the saved page
# @return [Object] whatever from_string returns for the file's content
def self.from_file( path )
  html_ascii = File.binread( path )
  from_string( html_ascii )
end

.from_string(html_ascii) ⇒ `Object`

note: expects ASCII-8BIT/BINARY encoding



24
25
26

# File 'lib/factbook/builder.rb', line 24

# Builds an instance straight from an in-memory page.
#
# note: expects the string in ASCII-8BIT/BINARY encoding
#
# @param html_ascii [String] the raw page
# @return [Object] the result of new( html_ascii )
def self.from_string( html_ascii )
  new( html_ascii )
end

Instance Method Details

#map_sects(html) ⇒ `Object`

# File 'lib/factbook/builder.rb', line 99

# Replaces every <h2> section heading with a "unified" marker on its own
# paragraph:  \n\n@SECTION{title}\n\n
#
# e.g. both
#   <h2 sectiontitle='Introduction' ccode='au'>Introduction ::  <span class='region'>AUSTRIA </span></h2>
#   <h2>Introduction</h2>
# become
#   @SECTION{Introduction}
#
# Every match is also reported on stdout via puts.
#
# @param html [String] page html
# @return [String] copy of html with the headings replaced
def map_sects( html )
  heading_pattern = /<h2
                 (?:\s[^>]+)?  ## allow optional attributes in h2
                 >      
                 \s*
                   ([^<>]+?)  ## note: use non-greedy; do NOT allow tags inside for now
                 \s*
                 (?:\s::\s
                   .+?       ## note: use non-greedy; allows tags inside
                 )?          ## strip optional name (e.g.  :: AUSTRIA)
                <\/h2>
              /xim

  html.gsub( heading_pattern ) do |matched|
    section_name = Regexp.last_match( 1 )

    puts "** found section >#{section_name}<:"
    puts "   >|#{matched}|<"

    "\n\n@SECTION{#{section_name}}\n\n"
  end
end

#map_subsects(html) ⇒ `Object`

# File 'lib/factbook/builder.rb', line 130

# Replaces every subsection heading with a "unified" marker on its own line:
#   \n@SUBSECTION{title}\n
#
# e.g.
#   <div id='field' class='category'>Disputes - international:</div>
# becomes
#   @SUBSECTION{Disputes - international:}
#
# Every match is also reported on stdout via puts.
#
# @param html [String] page html
# @return [String] copy of html with the headings replaced
def map_subsects( html )
  heading_pattern = /<div \s id='field'
                     \s class='category'>
                   \s*
                   (.+?)                ## note: use non-greedy; allows tags inside - why? why not
                   \s*
                 <\/div>
               /xim

  html.gsub( heading_pattern ) do |matched|
    subsection_name = Regexp.last_match( 1 )

    puts "** found subsection >#{subsection_name}<:"
    puts "   >|#{matched}|<"

    "\n@SUBSECTION{#{subsection_name}}\n"
  end
end

#split_sects(html) ⇒ `Object`

# File 'lib/factbook/builder.rb', line 157

# Splits html into sections along the @SECTION{..} markers.
#
# Returns one pair per marker:
#   [[heading, subsects],
#    [heading, subsects], ...]
# where heading is the marker itself (e.g. "@SECTION{Economy}") and subsects
# is what split_subsects returns for the text following that marker.
#
# @param html [String] html containing @SECTION{..} markers
# @return [Array<Array>] heading / subsections pairs; empty if no marker found
def split_sects( html )
  ## note: the regex is wrapped in (just one) capture group because
  ##   String#split includes all capture groups in the result array
  section_regex = /(@SECTION\{.+?\})/   ## note: non-greedy

  chunks = html.split( section_regex )

  ## The first chunk is always the text in front of the first marker (an empty
  ## string if html starts with a marker), never a marker itself - drop it.
  ## Testing it against /@SECTION/ instead would keep a prolog that merely
  ## mentions "@SECTION" and shift every heading/body pair by one.
  chunks.shift

  chunks.each_slice( 2 ).map do |heading, body|
    ## body is nil if the very last marker has no text after it
    ## (String#split drops a trailing empty string)
    [heading, split_subsects( body || '' )]
  end
end

#split_subsects(html) ⇒ `Object`

# File 'lib/factbook/builder.rb', line 188

# Splits html into subsections along the @SUBSECTION{..} markers.
#
# Returns one pair per marker:
#   [[heading, body],
#    [heading, body], ...]
# where heading is the marker itself (e.g. "@SUBSECTION{Area:}") and body is
# the text up to the next marker (or the end of html).
#
# @param html [String] html containing @SUBSECTION{..} markers
# @return [Array<Array<String>>] heading / body pairs; empty if no marker found
def split_subsects( html )
  ## note: the regex is wrapped in (just one) capture group because
  ##   String#split includes all capture groups in the result array
  subsection_regex = /(@SUBSECTION\{.+?\})/   ## note: non-greedy

  chunks = html.split( subsection_regex )

  ## The first chunk is always the text in front of the first marker (an empty
  ## string if html starts with a marker), never a marker itself - drop it.
  ## Testing it against /@SUBSECTION/ instead would keep a prolog that merely
  ## mentions "@SUBSECTION" and shift every heading/body pair by one.
  chunks.shift

  ## body is nil if the very last marker has no text after it
  ## (String#split drops a trailing empty string); hand out '' instead
  chunks.each_slice( 2 ).map { |heading, body| [heading, body || ''] }
end

Class: Factbook::Builder

Overview

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html_ascii) ⇒ Builder

Instance Attribute Details

#errors ⇒ Object (readonly)

#html ⇒ Object (readonly)

#html_ascii ⇒ Object (readonly)

#html_debug ⇒ Object (readonly)

#info ⇒ Object (readonly)

#sects ⇒ Object (readonly)

Class Method Details

.from_file(path) ⇒ Object

.from_string(html_ascii) ⇒ Object

Instance Method Details

#map_sects(html) ⇒ Object

#map_subsects(html) ⇒ Object

#split_sects(html) ⇒ Object

#split_subsects(html) ⇒ Object

#initialize(html_ascii) ⇒ `Builder`

#errors ⇒ `Object` (readonly)

#html ⇒ `Object` (readonly)

#html_ascii ⇒ `Object` (readonly)

#html_debug ⇒ `Object` (readonly)

#info ⇒ `Object` (readonly)

#sects ⇒ `Object` (readonly)

.from_file(path) ⇒ `Object`

.from_string(html_ascii) ⇒ `Object`

#map_sects(html) ⇒ `Object`

#map_subsects(html) ⇒ `Object`

#split_sects(html) ⇒ `Object`

#split_subsects(html) ⇒ `Object`