Class: Factbook::Sanitizer

Inherits:
Object
  • Object
show all
Includes:
Utils, LogUtils::Logging
Defined in:
lib/factbook/sanitizer.rb

Constant Summary collapse

BEGIN_FACTS_REGEX =

example match:

<ul class="expandcollapse">
/<ul \s+
   class="expandcollapse">
/xim
## Marks the end of the facts list, that is, a closing list item and a closing list
##  followed by the "end generated content" html comment e.g.
##
##    </li>
##    </ul>
##    <!-- end generated content -->
##
## note: extended mode (x) - literal whitespace inside the pattern is ignored;
##   \s* allows any whitespace between the tags and
##   every \s inside the html comment stands for exactly one whitespace character.
## find_country_profile cuts off everything starting at the first match.
END_FACTS_REGEX =
/<\/li> \s*
 <\/ul> \s*
 <!-- \s end \s generated \s content \s -->
/xim
## Matches a style attribute (incl. any leading whitespace) with a single- or
##  double-quoted value e.g.
##
##     style='color: red'    or    style=""
##
## note: the value is matched with * (not +) so that an empty value is matched on its own;
##   with + an empty value would force the match to run on to the next quote
##   and swallow the start of the following attribute.
STYLE_ATTR_REGEX =
/\s*
  style=('|").*?\1     ## note: use non-greedy match e.g. .*?
/xim
CLASS_ATTR_REGEX =

do NOT allow multi-line - why? why not?

/\s*
  class=('|")(.+?)\1     ## note: use non-greedy match e.g. .+?
/xim
COUNTRY_COMPARISON_REGEX =

todo: add enclosing div too!!!

/
<div>
 <span \s class='category'[^>]*>
   country \s comparison \s to \s the \s world: \s*
 <\/span>
  \s*
 <span \s class='category_data'[^>]*>
  \s*
    <a \s [^>]+>
     .+?
    <\/a>
  \s*
 <\/span>
 <\/div>
/xim
## Matches an embedded audio player, that is, an audio element wrapped in two divs e.g.
##
##   <div class='wrap'><div class='audio-player'><audio ...></audio></div></div>
##
## note: extended mode (x) - the indentation/newlines inside the pattern are ignored,
##   thus, the tags must follow each other directly (no whitespace in-between) to match;
##   only the \s between tag name and attribute stands for a (single) whitespace character.
AUDIO_PLAYER_REGEX =
/
        <div \s class='wrap'>
        <div \s class='audio-player'>
 <audio \s [^>]+>
 <\/audio>
        <\/div>
        <\/div>
/xim
AREA_COMP_CATEGORY_REGEX =

remove category => Area comparison map:

<div class='disTable areaComp'
 ...
until hitting: <div id='field'    -- e.g. next category/field (use lookahead e.g. (?=))
/
  <div \s class='disTable \s areaComp'
    .+?
  (?=<div \s id='field')
/xim
POP_PYRAMID_CATEGORY_REGEX =

remove category => population pyramid:

<div class='disTable popPyramid'>

...

until hitting: <div id='field'    -- e.g. next category/field (use lookahead e.g. (?=))

/
  <div \s class='disTable \s popPyramid'
    .+?
  (?=<div \s id='field')
/xim
REL_AFFILIATION_CATEGORY_REGEX =

remove category => religious affiliation:

<div class='disTable relAffiliation'>

...

until hitting: <div id='field'    -- e.g. next category/field (use lookahead e.g. (?=))

/
  <div \s class='disTable \s relAffiliation'
    .+?
  (?=<div \s id='field')
/xim

Constants included from Utils

Utils::COUNTRY_CODE_REGEX, Utils::MONTH_EN_TO_S, Utils::PAGE_INFO_REGEX, Utils::PAGE_LAST_UPDATED_REGEX

Instance Method Summary collapse

Methods included from Utils

#data_to_csv, #encode_utf8, #find_country_code, #find_page_info, #find_page_last_updated, #values_to_csv

Instance Method Details

#find_country_profile(html) ⇒ Object



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/factbook/sanitizer.rb', line 88

## Cuts the country profile (the facts list) out of a complete page.
##
##  - header: everything before the first BEGIN_FACTS_REGEX match gets dropped
##  - footer: everything starting at the first END_FACTS_REGEX match (searched
##            in the already header-less page) gets dropped and
##            replaced by a closing "</li></ul>\n"
##
## Progress and the first / last characters of the result get printed (puts / pp).
##
## html - complete page source (String)
##
## returns the profile (String);
##   fails with a RuntimeError if the begin or the end marker is missing
def find_country_profile( html )
  start_at = html.index( BEGIN_FACTS_REGEX )
  raise "*** no begin facts marker found for page"  unless start_at

  puts "  bingo - found BEGIN_FACTS on pos #{start_at}"
  profile = html[start_at..-1]

  pp profile[0..100]       ## peek at the start of the profile

  stop_at = profile.index( END_FACTS_REGEX )
  raise "*** no end facts marker found for page"  unless stop_at

  puts "  bingo - found END_FACTS on pos #{stop_at}"
  ## note: keep stop_at characters only, that is, the marker itself gets cut-off too
  profile = profile[0, stop_at] + "</li></ul>\n"

  pp profile[-200..-1]     ## peek at the end of the profile
  profile
end

#sanitize(html_ascii) ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/factbook/sanitizer.rb', line 10

## Turns a raw page into a cleaned-up country profile plus page (meta) info.
##
## html_ascii - page source as read (binary/ascii8bit - gets converted to utf-8 here)
##
## returns an array with three items:
##   1) html profile without headers, footers, scripts, etc.
##   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
##   3) errors e.g. list of errors as reported by encode_utf8 (invalid byte sequence etc.)
##
## todo: add option for (html source) encoding - why?? why not??
def sanitize( html_ascii )
  page_info = PageInfo.new

  ## todo:
  ##   make page info optional? why? why not?
  ##   not always available (if page structure changes) - check
  ##   what page info is required??
  info = find_page_info( html_ascii )
  if info
    ## copy over all known page info fields one-by-one
    [:country_code,
     :country_name,
     :country_affiliation,
     :region_code,
     :region_name].each do |field|
      page_info.public_send( "#{field}=", info[field] )
    end
  else
    ## no page info found - fallback to country code only
    page_info.country_code = find_country_code( html_ascii )
  end

  page_info.last_updated = find_page_last_updated( html_ascii )

  ## step 1: cut-off headers, footers, scripts, etc.
  profile_ascii = find_country_profile( html_ascii )

  ## step 2: change encoding to utf-8  (from binary/ascii8bit)
  ##   todo/fix: assume windows 12xx encoding!!!! for factbook - try
  profile, errors = encode_utf8( profile_ascii )

  ## step 3: clean-up the markup
  [sanitize_profile( profile ), page_info, errors]
end

#sanitize_profile(html) ⇒ Object



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/factbook/sanitizer.rb', line 215

## Strips a (utf-8 encoded) country profile down to its content markup.
##
## Steps (in order):
##   1) remove categories w/ visualizations/graphics only e.g.
##       - area comparison map
##       - population pyramid
##       - religious affiliation
##   2) remove style attributes, audio players and
##        "country comparison to the world" blocks
##   3) replace anchors (a href) by their inner text;
##        anchors wrapping an image get removed completely
##   4) remove all list tags e.g. ul/li (the list content stays)
##   5) clean-up class attributes - keep the known classes
##        (region, category, category_data) only
##
## Everything removed/replaced gets printed (puts).
##
## html - profile markup (String)
##
## returns the cleaned-up markup (String)
def sanitize_profile( html )

  ## note: order matters - regexes get applied top to bottom
  removals = [
    [AREA_COMP_CATEGORY_REGEX,       "remove category => area comparison map:"],
    [POP_PYRAMID_CATEGORY_REGEX,     "remove category => population pyramid:"],
    [REL_AFFILIATION_CATEGORY_REGEX, "remove category => religious affiliation:"],
    [STYLE_ATTR_REGEX,               "remove style attr:"],
    [AUDIO_PLAYER_REGEX,             "remove audio player:"],
    [COUNTRY_COMPARISON_REGEX,       "remove country comparison:"],
  ]

  removals.each do |regex, label|
    html = html.gsub( regex ) do |m|
      puts label
      puts "#{m}"
      ''
    end
  end

  ## remove/cleanup anchors (a href)
  ##  note: use .*? non-greedy match; use * (not +) to match empty anchors
  ##    e.g. <a name='top'></a> on their own - with + the match would run on
  ##    until the closing tag of the next anchor
  html = html.gsub( /<a\s+[^>]+>(.*?)<\/a>/im ) do |_|
    inner_text = $1.dup    ## keep a copy (note: $1 gets reset by the image check below)
    puts " replace anchor (a) >#{inner_text}<"

    if inner_text =~ /<img/i    ## if includes image remove (case-insensitive like the anchor match)
      puts "  remove image in anchor"
      ''
    else    ## keep inner text
      inner_text
    end
  end

  ## remove all list e.g. ul/li
  ##  note: \b makes sure only li/ul tags match (and not e.g. <link>)
  html = html.gsub( /<\/?(li|ul)\b[^>]*>/im ) do |m|
    puts " remove list >#{m}<"
    ''
  end

  ## clean-up class attrib e.g. remove unknown classes
  known_klasses = ['region', 'category', 'category_data']

  html = html.gsub( CLASS_ATTR_REGEX ) do |m|
    puts "cleanup class attr:"
    puts "#{m}"

    keep, drop = $2.split(' ').partition { |klass| known_klasses.include?( klass ) }
    drop.each { |klass| puts "  remove class #{klass}" }

    if keep.empty?
      ''   ## remove class attrib completely
    else
      " class='#{keep.join(' ')}'"   ## note: add leading space!!
    end
  end

  html
end