Class: Probot
- Inherits: Object
- Defined in: lib/probot.rb, lib/probot/version.rb
Overview
Two main parts of this class:
- Parse a robots.txt file.
- Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
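A minimal usage sketch (the robots.txt content, paths, and sitemap URL below are made up for illustration):

require "probot"

robots = Probot.new(<<~ROBOTS)
  User-agent: *
  Disallow: /private
  Allow: /private/open
  Sitemap: https://example.com/sitemap.xml
ROBOTS

robots.allowed?("/private/open/page.html")  # => true  (the longer Allow pattern wins)
robots.disallowed?("/private/secret.html")  # => true
robots.sitemaps                             # => ["https://example.com/sitemap.xml"]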
Defined Under Namespace
Classes: ParsedLine
Constant Summary
- VERSION = "0.5.0"
Instance Attribute Summary
- #agent ⇒ Object
  Returns the value of attribute agent.
- #doc ⇒ Object (readonly)
  Returns the value of attribute doc.
- #rules ⇒ Object (readonly)
  Returns the value of attribute rules.
- #site ⇒ Object
  Returns the value of attribute site.
- #sitemaps ⇒ Object
  Returns the value of attribute sitemaps.
Class Method Summary
Instance Method Summary
- #allowed ⇒ Object
- #allowed?(url) ⇒ Boolean
  If a URL is not disallowed, it is allowed, so we check whether it is explicitly disallowed and, if not, treat it as allowed.
- #allowed_best(url) ⇒ Object
- #allowed_matches(url) ⇒ Object
- #crawl_delay ⇒ Object
- #disallowed ⇒ Object
- #disallowed?(url) ⇒ Boolean
- #disallowed_best(url) ⇒ Object
- #disallowed_matches(url) ⇒ Object
- #fetch_robots_txt(url) ⇒ Object
- #found_agents ⇒ Object
- #initialize(data, agent: "*") ⇒ Probot constructor
  A new instance of Probot.
- #matches(url) ⇒ Object
- #matching_rule(url) ⇒ Object
- #parse(doc) ⇒ Object
- #pattern_length(regexp) ⇒ Object
- #request_headers ⇒ Object
Constructor Details
#initialize(data, agent: "*") ⇒ Probot
Returns a new instance of Probot.
# File 'lib/probot.rb', line 26

def initialize(data, agent: "*")
  raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)

  @agent = agent
  @rules = {}
  @current_agents = ["*"]
  @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
  @sitemaps = []

  @site = URI(data) if data.start_with?("http")
  @doc = @site.nil? ? data : fetch_robots_txt(@site)
  parse(@doc)
end
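The data argument must be a String: either raw robots.txt content or a URL. If it starts with "http", the robots.txt is fetched from that site. A sketch, where example.com and MyCrawler are placeholders:

require "probot"

# Fetch and parse https://example.com/robots.txt, evaluating rules for "MyCrawler".
remote = Probot.new("https://example.com", agent: "MyCrawler")

# Parse a raw robots.txt string, evaluating rules for the default "*" agent.
local = Probot.new("User-agent: *\nDisallow: /admin\n")
local.disallowed?("/admin/users") # => true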
Instance Attribute Details
#agent ⇒ Object
Returns the value of attribute agent.
# File 'lib/probot.rb', line 24

def agent
  @agent
end
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
# File 'lib/probot.rb', line 23

def doc
  @doc
end
#rules ⇒ Object (readonly)
Returns the value of attribute rules.
# File 'lib/probot.rb', line 23

def rules
  @rules
end
#site ⇒ Object
Returns the value of attribute site.
# File 'lib/probot.rb', line 24

def site
  @site
end
#sitemaps ⇒ Object
Returns the value of attribute sitemaps.
# File 'lib/probot.rb', line 24

def sitemaps
  @sitemaps
end
Class Method Details
Instance Method Details
#allowed ⇒ Object
# File 'lib/probot.rb', line 54

def allowed = rules.dig(@agent, "allow") || rules.dig("*", "allow")
#allowed?(url) ⇒ Boolean
If a URL is not disallowed, it is allowed, so we check whether it is explicitly disallowed and, if not, treat it as allowed.
# File 'lib/probot.rb', line 69

def allowed?(url) = !disallowed?(url)
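For example, a URL that matches no Disallow pattern falls through to allowed (the rules and paths are illustrative):

require "probot"

robots = Probot.new("User-agent: *\nDisallow: /tmp\n")
robots.allowed?("/tmp/cache")   # => false (explicitly disallowed)
robots.allowed?("/public/page") # => true  (no rule matches, so it is allowed)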
#allowed_best(url) ⇒ Object
# File 'lib/probot.rb', line 64

def allowed_best(url) = allowed_matches(url).max_by { |k, v| v }
#allowed_matches(url) ⇒ Object
# File 'lib/probot.rb', line 58

def allowed_matches(url) = allowed.select { |allowed_url| url.match?(allowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }
#crawl_delay ⇒ Object
# File 'lib/probot.rb', line 48

def crawl_delay = rules.dig(@agent, "crawl_delay")
#disallowed ⇒ Object
# File 'lib/probot.rb', line 52

def disallowed = rules.dig(@agent, "disallow") || rules.dig("*", "disallow")
#disallowed?(url) ⇒ Boolean
# File 'lib/probot.rb', line 71

def disallowed?(url) = matching_rule(url)&.keys&.first == :disallow
#disallowed_best(url) ⇒ Object
# File 'lib/probot.rb', line 62

def disallowed_best(url) = disallowed_matches(url).max_by { |k, v| v }
#disallowed_matches(url) ⇒ Object
# File 'lib/probot.rb', line 56

def disallowed_matches(url) = disallowed.select { |disallowed_url| url.match?(disallowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }
#fetch_robots_txt(url) ⇒ Object
# File 'lib/probot.rb', line 42

def fetch_robots_txt(url)
  Net::HTTP.get(URI(url).tap { |u| u.path = "/robots.txt" }, request_headers)
rescue
  ""
end
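Whatever URL is passed in, the path is rewritten to /robots.txt before the request, and any network error is rescued into an empty document. A sketch of the URI rewrite (example.com is a placeholder):

require "uri"

uri = URI("https://example.com/blog/post-1").tap { |u| u.path = "/robots.txt" }
uri.to_s # => "https://example.com/robots.txt"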
#found_agents ⇒ Object
# File 'lib/probot.rb', line 50

def found_agents = rules.keys
#matches(url) ⇒ Object
# File 'lib/probot.rb', line 60

def matches(url) = {disallowed: disallowed_matches(url), allowed: allowed_matches(url)}
#matching_rule(url) ⇒ Object
# File 'lib/probot.rb', line 66

def matching_rule(url) = (disallowed_best(url)&.last.to_i > allowed_best(url)&.last.to_i) ? {disallow: disallowed_best(url)&.first} : {allow: allowed_best(url)&.first}
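A sketch of how the longest matching pattern wins when both an Allow and a Disallow rule match the URL (the rules are illustrative):

require "probot"

robots = Probot.new(<<~ROBOTS)
  User-agent: *
  Disallow: /downloads
  Allow: /downloads/free
ROBOTS

robots.matching_rule("/downloads/free/tool.zip")
# => an :allow rule ("/downloads/free", pattern length 15, beats "/downloads", length 10)
robots.matching_rule("/downloads/paid/tool.zip")
# => a :disallow rule (only "/downloads" matches)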
#parse(doc) ⇒ Object
# File 'lib/probot.rb', line 73

def parse(doc)
  # We need to handle consecutive user-agent lines, which are considered to be part of the same record.
  subsequent_agent = false

  doc.lines.each do |line|
    next if line.start_with?("#") || !line.include?(":") || line.split(":").length < 2

    data = ParsedLine.new(line)

    if data.agent?
      if subsequent_agent
        @current_agents << data.value
      else
        @current_agents = [data.value]
        subsequent_agent = true
      end

      @current_agents.each { |agent| rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
      next
    end

    # All Regex characters are escaped, then we unescape * and $ as they may be used in robots.txt
    if data.allow? || data.disallow?
      @current_agents.each do |agent|
        rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) unless data.value.nil?
      end

      # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
      subsequent_agent = false
      next
    end

    if data.crawl_delay?
      @current_agents.each { |agent| rules[agent][data.key] = data.value }
      next
    end

    # Ensure we have an absolute URL
    if data.sitemap?
      sitemap_uri = URI(data.value)
      sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
      @sitemaps << sitemap_uri.to_s
      @sitemaps.uniq!
      next
    end

    @current_agents.each { |agent| rules[agent][data.key] = data.value }
  end
end
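A sketch showing how consecutive User-agent lines share one record, and how * and $ survive the escaping step (the agents and paths are made up):

require "probot"

robots = Probot.new(<<~ROBOTS)
  User-agent: FooBot
  User-agent: BarBot
  Disallow: /*.pdf$

  User-agent: *
  Disallow: /drafts
ROBOTS

robots.rules.keys                  # => ["*", "FooBot", "BarBot"]
robots.rules["FooBot"]["disallow"] # => [/\/.*\.pdf$/]
robots.rules["BarBot"]["disallow"] # same record, so the same rule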
#pattern_length(regexp) ⇒ Object
# File 'lib/probot.rb', line 123

def pattern_length(regexp) = regexp.source.gsub(/(\\[\*\$\.])/, "*").length
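Escaped characters in the compiled pattern are collapsed back to a single character before measuring, so a rule's specificity stays close to its length as written in robots.txt. A sketch, building a rule the same way #parse does:

require "probot"

rule = Regexp.new(Regexp.escape("/private/file.html").gsub('\*', ".*").gsub('\$', "$"))
rule.source                         # => "/private/file\\.html"
Probot.new("").pattern_length(rule) # => 18, the length of the original "/private/file.html"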
#request_headers ⇒ Object
# File 'lib/probot.rb', line 40

def request_headers = (agent == "*") ? {} : {"User-Agent" => @agent}
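So a Probot instance created with a non-default agent identifies itself when fetching robots.txt, while the default "*" agent sends no extra headers; for example (MyCrawler is a placeholder):

require "probot"

Probot.new("", agent: "MyCrawler").request_headers # => {"User-Agent" => "MyCrawler"}
Probot.new("").request_headers                     # => {}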