Module: ParsePage

Included in:
GeneralScraper
Defined in:
lib/parse_page.rb

Instance Method Summary collapse

Instance Method Details

#fixEncode(str) ⇒ Object



65
66
67
68
69
70
71
# File 'lib/parse_page.rb', line 65

# Re-encode a String's raw bytes as UTF-8 codepoints (treats each
# byte as a codepoint). Non-String values pass through unchanged.
def fixEncode(str)
  return str unless str.is_a?(String)
  str.unpack('C*').pack('U*')
end

#getContent(url, pagehash, html) ⇒ Object

Get the page content based on the page type (PDF vs. HTML)



14
15
16
17
18
19
20
21
22
23
24
# File 'lib/parse_page.rb', line 14

# Dispatch content extraction on page type: URLs containing ".pdf"
# go through PDF OCR extraction (nil on failure), everything else
# through HTML body-text extraction.
def getContent(url, pagehash, html)
  return getHTMLText(url, pagehash, html) unless url.include?(".pdf")

  begin
    getPDF(url, pagehash)
  rescue
    # PDF download/OCR failed — signal with nil rather than raising.
    nil
  end
end

#getHTMLText(url, pagehash, html) ⇒ Object

Extract the body text from the parsed page



27
28
29
30
# File 'lib/parse_page.rb', line 27

# Extract the <body> text from the parsed HTML document, normalize
# its encoding, and store it under :text on the page hash.
def getHTMLText(url, pagehash, html)
  body_text = html.css("body").text
  pagehash[:text] = fixEncode(body_text)
  pagehash
end

#getMetadata(url, html) ⇒ Object

Get the page metadata



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/parse_page.rb', line 45

# Get the page metadata: URL, retrieval time, page title, and all
# <meta> tag name/content pairs.
#
# NOTE(review): the method name was missing in the extracted source
# (`def (url, html)` is invalid Ruby); restored as getMetadata per the
# documented signature.
#
# url  - page URL String; %3F/%3D escapes are unescaped IN PLACE
#        (gsub! mutates the caller's string)
# html - Nokogiri-parsed document
#
# Returns a Hash of metadata fields.
def getMetadata(url, html)
  pagehash = Hash.new

  # Save URL and date retrieved
  url.gsub!("%3F", "?")
  url.gsub!("%3D", "=")
  pagehash[:url] = url
  pagehash[:date_retrieved] = Time.now

  # Get title and meta tag info
  pagehash[:page_title] = fixEncode(html.css("title").text)
  html.css("meta").each do |m|
    if m
      pagehash[m['name']] = fixEncode(m['content'])
    end
  end

  return pagehash
end

#getPageData(url) ⇒ Object

Get both page metadata and text



5
6
7
8
9
10
11
# File 'lib/parse_page.rb', line 5

# Get both page metadata and text for a URL.
#
# NOTE(review): the extracted source read `pagehash = (url, html)`,
# which is invalid Ruby; restored the getMetadata call per the
# module's documented method list.
#
# url - page URL String
#
# Returns the assembled page Hash (or nil if content extraction of a
# PDF fails — see getContent).
def getPageData(url)
  page = @requests.get_page(url)
  html = Nokogiri::HTML(page)
  pagehash = getMetadata(url, html)
  pagehash = getContent(url, pagehash, html)
  return pagehash
end

#getPDF(url, pagehash) ⇒ Object

Download and extract text from PDF



33
34
35
36
37
38
39
40
41
42
# File 'lib/parse_page.rb', line 33

# Download a PDF into public/uploads, OCR it, and merge the extracted
# fields into the page hash.
#
# Security fix: the original interpolated the URL into a backtick
# shell command (`wget ... #{url}`), allowing shell injection from a
# crafted URL. system() with an argument vector bypasses the shell
# entirely; wget's output was discarded before, so behavior is kept.
#
# url      - URL of the PDF (untrusted input)
# pagehash - Hash to merge the OCR fields into
#
# Returns the updated pagehash. Raises (caught by getContent) if the
# download or OCR step fails.
def getPDF(url, pagehash)
  system("wget", "--tries=2", "-P", "public/uploads", url)
  filename = url.split("/").last.chomp.strip

  # OCR PDF and save fields
  u = UploadConvert.new("public/uploads/" + filename)
  pdfparse = JSON.parse(u.handleDoc)
  pdfparse.each { |k, v| pagehash[k] = fixEncode(v) }
  return pagehash
end