Class: Wikipedia::Page
- Inherits:
-
Object
- Object
- Wikipedia::Page
- Defined in:
- lib/wikipedia/page.rb
Instance Attribute Summary collapse
-
#json ⇒ Object
readonly
Returns the value of attribute json.
Class Method Summary collapse
-
.sanitize(s) ⇒ Object
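Converts wiki markup in s to simple HTML and returns it (returns nil when s is nil). The source comment for this method contains only lint directives: rubocop:disable Metrics/MethodLength rubocop:disable Metrics/AbcSize.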
Instance Method Summary collapse
- #categories ⇒ Object
- #content ⇒ Object
- #coordinates ⇒ Object
- #editurl ⇒ Object
- #extlinks ⇒ Object
- #fullurl ⇒ Object
- #image_descriptionurl ⇒ Object
- #image_descriptionurls ⇒ Object
- #image_metadata(options = {}) ⇒ Object
- #image_thumburl ⇒ Object
- #image_thumburls(width = nil) ⇒ Object
- #image_url ⇒ Object
- #image_urls ⇒ Object
- #images ⇒ Object
-
#initialize(json) ⇒ Page
constructor
A new instance of Page.
- #langlinks ⇒ Object
- #links ⇒ Object
- #main_image_thumburl ⇒ Object
- #main_image_url ⇒ Object
- #page ⇒ Object
- #raw_data ⇒ Object
- #redirect? ⇒ Boolean
- #redirect_title ⇒ Object
- #sanitized_content ⇒ Object
- #summary ⇒ Object
- #templates ⇒ Object
- #text ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(json) ⇒ Page
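Returns a new instance of Page. The given JSON string is kept as-is (readable through #json) and is also parsed with JSON.parse; the parsed data is what #raw_data returns and what #page reads from.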
5 6 7 8 9 |
# File 'lib/wikipedia/page.rb', line 5 def initialize(json) require 'json' @json = json @data = JSON.parse(json) end |
Instance Attribute Details
#json ⇒ Object (readonly)
Returns the value of attribute json.
3 4 5 |
# File 'lib/wikipedia/page.rb', line 3 def json @json end |
Class Method Details
.sanitize(s) ⇒ Object
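Converts wiki markup in s to simple HTML: known punctuation, language and Old Style date templates are replaced with text or HTML, remaining {{...}} templates, internal links, image/file links, <ref> tags and HTML comments are stripped, bold/italic quotes become <b>/<i>, and the result is wrapped in <p> paragraphs. Returns nil when s is nil. Note that the substitutions use gsub!/sub!/strip!, so the string passed in is modified in place. The source comment for this method contains only lint directives: rubocop:disable Metrics/MethodLength rubocop:disable Metrics/AbcSize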
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/wikipedia/page.rb', line 127 def self.sanitize(s) return unless s # Transform punctuation templates # Em dash (https://en.wikipedia.org/wiki/Template:Em_dash) s.gsub!(/\{\{(em dash|emdash)\}\}/i, '—') # En dash (https://en.wikipedia.org/wiki/Template:En_dash) s.gsub!(/\{\{(en dash|ndash|nsndns)\}\}/i, '–') # Spaced en dashes (https://en.wikipedia.org/wiki/Template:Spaced_en_dash_space) s.gsub!(/\{\{(spaced e?n\s?dash( space)?|snds?|spndsp|sndashs|spndashsp)\}\}/i, ' – ') # Bold middot s.gsub!(/\{\{(·|dot|middot|\,)\}\}/i, ' <b>·</b>') # Bullets s.gsub!(/\{\{(•|bull(et)?)\}\}/i, ' •') # Forward Slashes (https://en.wikipedia.org/wiki/Template:%5C) s.gsub!(/\{\{\\\}\}/i, ' /') # Transform language specific blocks s.gsub!(/\{\{lang[\-\|]([a-z]+)\|([^\|\{\}]+)(\|[^\{\}]+)?\}\}/i, '<span lang="\1">\2</span>') # Parse Old Style Date template blocks # Old Style Dates (https://en.wikipedia.org/wiki/Template:OldStyleDate) s.gsub!(/\{\{OldStyleDate\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i, '\1 [<abbr title="Old Style">O.S.</abbr> \3] \2') # Old Style Dates with different years (https://en.wikipedia.org/wiki/Template:OldStyleDateDY) s.gsub!(/\{\{OldStyleDateDY\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i, '\1 \2 [<abbr title="Old Style">O.S.</abbr> \3]') # Old Style Dates with no year (https://en.wikipedia.org/wiki/Template:OldStyleDateNY) s.gsub!(/\{\{OldStyleDateNY\|([^\|]*)\|([^\|]*)\}\}/i, '\1 [<abbr title="Old Style">O.S.</abbr> \2]') # strip anything else inside curly braces! 
s.gsub!(/\{\{[^\{\}]+?\}\}[\;\,]?/, '') while s =~ /\{\{[^\{\}]+?\}\}[\;\,]?/ # strip info box s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '') # strip internal links s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2') s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1') # strip images and file links s.gsub!(/\[\[Image:(.*?(?=\]\]))??\]\]/, '') s.gsub!(/\[\[File:(.*?(?=\]\]))??\]\]/, '') # convert bold/italic to html s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>') s.gsub!(/'''(.+?)'''/, '<b>\1</b>') s.gsub!(/''(.+?)''/, '<i>\1</i>') # misc s.gsub!(/(\d)<ref[^<>]*>[\s\S]*?<\/ref>(\d)/, '\1 – \2') s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '') s.gsub!(/<ref(.*?(?=\/>))??\/>/, '') s.gsub!(/<!--[^>]+?-->/, '') s.gsub!(/\(\s+/, '(') s.gsub!(' ', ' ') s.strip! # create paragraphs sections = s.split("\n\n") s = if sections.size > 1 sections.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n") else "<p>#{s}</p>" end s end |
Instance Method Details
#categories ⇒ Object
51 52 53 |
# File 'lib/wikipedia/page.rb', line 51 def categories page['categories'].map { |c| c['title'] } if page['categories'] end |
#content ⇒ Object
15 16 17 |
# File 'lib/wikipedia/page.rb', line 15 def content page['revisions'].first['*'] if page['revisions'] end |
#coordinates ⇒ Object
104 105 106 |
# File 'lib/wikipedia/page.rb', line 104 def coordinates page['coordinates'].first.values if page['coordinates'] end |
#editurl ⇒ Object
39 40 41 |
# File 'lib/wikipedia/page.rb', line 39 def editurl page['editurl'] end |
#extlinks ⇒ Object
59 60 61 |
# File 'lib/wikipedia/page.rb', line 59 def extlinks page['extlinks'].map { |c| c['*'] } if page['extlinks'] end |
#fullurl ⇒ Object
35 36 37 |
# File 'lib/wikipedia/page.rb', line 35 def fullurl page['fullurl'] end |
#image_descriptionurl ⇒ Object
79 80 81 |
# File 'lib/wikipedia/page.rb', line 79 def image_descriptionurl page['imageinfo'].first['descriptionurl'] if page['imageinfo'] end |
#image_descriptionurls ⇒ Object
92 93 94 |
# File 'lib/wikipedia/page.rb', line 92 def image_descriptionurls image_metadata.map(&:image_descriptionurl) unless image_metadata.nil? end |
#image_metadata(options = {}) ⇒ Object
112 113 114 115 116 117 118 119 |
# File 'lib/wikipedia/page.rb', line 112 def image_metadata( options = {} ) unless @image_metadata return if images.nil? filtered = images.select { |i| i =~ /:.+\.(jpg|jpeg|png|gif|svg)$/i && !i.include?('LinkFA-star') } @image_metadata = filtered.map { |title| Wikipedia.find_image(title, options) } end @image_metadata || [] end |
#image_thumburl ⇒ Object
75 76 77 |
# File 'lib/wikipedia/page.rb', line 75 def image_thumburl page['imageinfo'].first['thumburl'] if page['imageinfo'] end |
#image_thumburls(width = nil) ⇒ Object
87 88 89 90 |
# File 'lib/wikipedia/page.rb', line 87 def image_thumburls( width = nil ) options = width.nil? ? {} : { iiurlwidth: width } image_metadata( options ).map(&:image_thumburl) unless image_metadata( options ).nil? end |
#image_url ⇒ Object
71 72 73 |
# File 'lib/wikipedia/page.rb', line 71 def image_url page['imageinfo'].first['url'] if page['imageinfo'] end |
#image_urls ⇒ Object
83 84 85 |
# File 'lib/wikipedia/page.rb', line 83 def image_urls image_metadata.map(&:image_url) unless image_metadata.nil? end |
#images ⇒ Object
67 68 69 |
# File 'lib/wikipedia/page.rb', line 67 def images page['images'].map { |c| c['title'] } if page['images'] end |
#langlinks ⇒ Object
63 64 65 |
# File 'lib/wikipedia/page.rb', line 63 def langlinks Hash[page['langlinks'].collect { |c| [c['lang'], c['*']] }] if page['langlinks'] end |
#links ⇒ Object
55 56 57 |
# File 'lib/wikipedia/page.rb', line 55 def links page['links'].map { |c| c['title'] } if page['links'] end |
#main_image_thumburl ⇒ Object
100 101 102 |
# File 'lib/wikipedia/page.rb', line 100 def main_image_thumburl page['thumbnail']['source'] if page['thumbnail'] end |
#main_image_url ⇒ Object
96 97 98 |
# File 'lib/wikipedia/page.rb', line 96 def main_image_url page['thumbnail']['source'].sub(/\/thumb/, '').sub(/\/[^\/]*$/, '') if page['thumbnail'] end |
#page ⇒ Object
11 12 13 |
# File 'lib/wikipedia/page.rb', line 11 def page @data['query']['pages'].values.first if @data['query']['pages'] end |
#raw_data ⇒ Object
108 109 110 |
# File 'lib/wikipedia/page.rb', line 108 def raw_data @data end |
#redirect? ⇒ Boolean
23 24 25 |
# File 'lib/wikipedia/page.rb', line 23 def redirect? content && content.match(/\#REDIRECT\s*\[\[(.*?)\]\]/i) end |
#redirect_title ⇒ Object
27 28 29 |
# File 'lib/wikipedia/page.rb', line 27 def redirect_title redirect?[1] rescue nil end |
#sanitized_content ⇒ Object
19 20 21 |
# File 'lib/wikipedia/page.rb', line 19 def sanitized_content self.class.sanitize(content) end |
#summary ⇒ Object
47 48 49 |
# File 'lib/wikipedia/page.rb', line 47 def summary page['extract'].split('==')[0].strip if page['extract'] && page['extract'] != '' end |
#templates ⇒ Object
121 122 123 |
# File 'lib/wikipedia/page.rb', line 121 def templates page['templates'].map { |c| c['title'] } if page['templates'] end |
#text ⇒ Object
43 44 45 |
# File 'lib/wikipedia/page.rb', line 43 def text page['extract'] end |
#title ⇒ Object
31 32 33 |
# File 'lib/wikipedia/page.rb', line 31 def title page['title'] end |