Module: ParsePage
- Included in: GeneralScraper
- Defined in: lib/parse_page.rb
Instance Method Summary
- #fixEncode(str) ⇒ Object
- #getContent(url, pagehash, html) ⇒ Object
  Get the page content by type of page.
- #getHTMLText(url, pagehash, html) ⇒ Object
  Download the page text.
- #getMetadata(url, html) ⇒ Object
  Get the page metadata.
- #getPageData(url) ⇒ Object
  Get both page metadata and text.
- #getPDF(url, pagehash) ⇒ Object
  Download and extract text from PDF.
Instance Method Details
#fixEncode(str) ⇒ Object
# File 'lib/parse_page.rb', line 65

def fixEncode(str)
  if str.is_a?(String)
    # Reinterpret each raw byte as a Unicode codepoint,
    # i.e. transcode Latin-1 bytes into a UTF-8 string
    return str.unpack('C*').pack('U*')
  else
    return str
  end
end
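The unpack('C*')/pack('U*') idiom reads each raw byte value and re-emits it as a UTF-8 codepoint, which amounts to a Latin-1 to UTF-8 conversion. A minimal standalone sketch (the sample string is hypothetical):

# "café" with the é stored as the single Latin-1 byte 0xE9
latin1 = "caf\xE9"

# Byte values [99, 97, 102, 233] become UTF-8 codepoints
utf8 = latin1.unpack('C*').pack('U*')
puts utf8  # => "café"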
#getContent(url, pagehash, html) ⇒ Object
Get the page content by type of page
# File 'lib/parse_page.rb', line 14

def getContent(url, pagehash, html)
  if url.include? ".pdf"
    begin
      return getPDF(url, pagehash)
    rescue
      # PDF download or extraction failed; treat the page as having no content
      return nil
    end
  else
    return getHTMLText(url, pagehash, html)
  end
end
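The dispatch is a plain substring test, so any URL containing ".pdf" anywhere is routed to the PDF path. A sketch of the same check in isolation (URLs hypothetical):

["http://example.com/report.pdf",
 "http://example.com/index.html"].each do |url|
  handler = url.include?(".pdf") ? "getPDF" : "getHTMLText"
  puts "#{url} -> #{handler}"
end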
#getHTMLText(url, pagehash, html) ⇒ Object
Download the page text
# File 'lib/parse_page.rb', line 27

def getHTMLText(url, pagehash, html)
  pagehash[:text] = fixEncode(html.css("body").text)
  return pagehash
end
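Here html is an already-parsed Nokogiri document, and css("body").text collapses the body to its concatenated plain text. A standalone sketch with a hypothetical snippet:

require 'nokogiri'

doc = Nokogiri::HTML("<html><body><h1>Title</h1><p>Hello</p></body></html>")
puts doc.css("body").text  # => "TitleHello"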
#getMetadata(url, html) ⇒ Object
Get the page metadata
# File 'lib/parse_page.rb', line 45

def getMetadata(url, html)
  pagehash = Hash.new

  # Save URL and date retrieved, undoing percent-encoding of "?" and "="
  url.gsub!("%3F", "?")
  url.gsub!("%3D", "=")
  pagehash[:url] = url
  pagehash[:date_retrieved] = Time.now

  # Get title and meta tag info
  pagehash[:page_title] = fixEncode(html.css("title").text)
  html.css("meta").each do |m|
    if m
      pagehash[m['name']] = fixEncode(m['content'])
    end
  end

  return pagehash
end
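Each named meta tag becomes a hash entry keyed on its name attribute, alongside the fixed :url, :date_retrieved, and :page_title keys. A sketch of the meta-tag loop on a hypothetical document:

require 'nokogiri'

doc = Nokogiri::HTML(<<~HTML)
  <html><head>
    <title>Example</title>
    <meta name="description" content="A sample page">
  </head></html>
HTML

meta = {}
doc.css("meta").each { |m| meta[m['name']] = m['content'] }
p meta  # => {"description"=>"A sample page"}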
#getPageData(url) ⇒ Object
Get both page metadata and text
# File 'lib/parse_page.rb', line 5

def getPageData(url)
  page = @requests.get_page(url)
  html = Nokogiri::HTML(page)
  pagehash = getMetadata(url, html)
  pagehash = getContent(url, pagehash, html)
  return pagehash
end
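getPageData is the entry point, and it depends on an injected @requests object that responds to get_page(url), presumably supplied by the including class (e.g. GeneralScraper). A minimal harness with a stubbed requester; the Harness and StubRequests classes are hypothetical and stand in for a real HTTP client:

require 'nokogiri'
require_relative 'lib/parse_page'

class Harness
  include ParsePage

  # Hypothetical stub: any object responding to get_page(url) works here
  class StubRequests
    def get_page(url)
      "<html><head><title>Stub</title></head><body>Hi</body></html>"
    end
  end

  def initialize
    @requests = StubRequests.new
  end
end

pagehash = Harness.new.getPageData("http://example.com")
puts pagehash[:page_title]  # => "Stub"
puts pagehash[:text]        # => "Hi"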
#getPDF(url, pagehash) ⇒ Object
Download and extract text from PDF
# File 'lib/parse_page.rb', line 33

def getPDF(url, pagehash)
  `wget --tries=2 -P public/uploads #{url}`
  path = url.split("/")

  # OCR PDF and save fields
  u = UploadConvert.new("public/uploads/" + path[path.length - 1].chomp.strip)
  pdfparse = JSON.parse(u.handleDoc)
  pdfparse.each { |k, v| pagehash[k] = fixEncode(v) }
  return pagehash
end
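The method shells out to wget, so it assumes wget is on the PATH and that public/uploads exists; the local filename is the last path segment of the URL. A sketch of that filename derivation (URL hypothetical):

url = "http://example.com/docs/report.pdf"
path = url.split("/")
puts "public/uploads/" + path[path.length - 1].chomp.strip
# => "public/uploads/report.pdf"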