Module: ScraperUtils::SpecSupport

Defined in:
lib/scraper_utils/spec_support.rb

Overview

Methods to support specs: helpers for validating scraped results, such as addresses, descriptions and info_urls.
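
For orientation, a minimal sketch of how these helpers typically combine in an authority spec. Loading results via ScraperWiki.select is an assumption about the surrounding scraper, not part of this module:

require "scraperwiki"
require "scraper_utils"

results = ScraperWiki.select("* from data")  # assumption: records saved by the scraper under test

ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)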

Constant Summary

AUSTRALIAN_STATES =
%w[ACT NSW NT QLD SA TAS VIC WA].freeze
STREET_TYPE_PATTERNS =
[
  /\bArcade\b/i,
  /\bAv(e(nue)?)?\b/i,
  /\bB(oulevard|lvd|vd)\b/i,
  /\b(Circuit|Cct)\b/i,
  /\bCir(cle)?\b/i,
  /\bCl(ose)?\b/i,
  /\bC(our|r)?t\b/i,
  /\bChase\b/i,
  /\bCorso\b/i,
  /\bCr(es(cent)?)?\b/i,
  /\bCross\b/i,
  /\bDr((ive)?|v)\b/i,
  /\bEnt(rance)?\b/i,
  /\bEsp(lanade)?\b/i,
  /\bGr(ove)?\b/i,
  /\bH(ighwa|w)y\b/i,
  /\bL(ane?|a)\b/i,
  /\bLoop\b/i,
  /\bM(ews|w)\b/i,
  /\bP(arade|de)\b/i,
  /\bParkway\b/i,
  /\bPl(ace)?\b/i,
  /\bPriv(ate)?\b/i,
  /\bProm(enade)?\b/i,
  /\bQuay\b/i,
  /\bR(oa)?d\b/i,
  /\bR(idge|dg)\b/i,
  /\bRise\b/i,
  /\bSq(uare)?\b/i,
  /\bSt(reet)?\b/i,
  /\bT(erra)?ce\b/i,
  /\bWa?y\b/i
].freeze
AUSTRALIAN_POSTCODES =
/\b\d{4}\b/.freeze
PLANNING_KEYWORDS =
[
  # Building types
  'dwelling', 'house', 'unit', 'building', 'structure', 'facility',
  # Modifications
  'addition', 'extension', 'renovation', 'alteration', 'modification',
  'replacement', 'upgrade', 'improvement',
  # Specific structures
  'carport', 'garage', 'shed', 'pool', 'deck', 'patio', 'pergola',
  'verandah', 'balcony', 'fence', 'wall', 'driveway',
  # Development types
  'subdivision', 'demolition', 'construction', 'development',
  # Services/utilities
  'signage', 'telecommunications', 'stormwater', 'water', 'sewer',
  # Approvals/certificates
  'certificate', 'approval', 'consent', 'permit'
].freeze
PLACEHOLDERS =
[
  /no description/i,
  /not available/i,
  /to be confirmed/i,
  /\btbc\b/i,
  %r{\bn/a\b}i
].freeze

Class Method Summary

  • .authority_label(results, prefix: '', suffix: '') ⇒ Object
  • .bot_protection_detected?(page) ⇒ Boolean
  • .fetch_url_with_redirects(url) ⇒ Object
  • .geocodable?(address, ignore_case: false) ⇒ Boolean
  • .placeholder?(text) ⇒ Boolean
  • .reasonable_description?(text) ⇒ Boolean
  • .validate_addresses_are_geocodable!(results, percentage: 50, variation: 3) ⇒ Object
  • .validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3) ⇒ Object
  • .validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false) {|String| ... } ⇒ Object
  • .validate_page_response(page, bot_check_expected) ⇒ Object
  • .validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false) {|String| ... } ⇒ Object

Class Method Details

.authority_label(results, prefix: '', suffix: '') ⇒ Object



# File 'lib/scraper_utils/spec_support.rb', line 71

def self.authority_label(results, prefix: '', suffix: '')
  return nil if results.nil?

  authority_labels = results.map { |record| record['authority_label'] }.compact.uniq
  return nil if authority_labels.empty?

  raise "Expected one authority_label, not #{authority_labels.inspect}" if authority_labels.size > 1
  "#{prefix}#{authority_labels.first}#{suffix}"
end
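
A usage sketch (the records are illustrative): results sharing a single authority_label return that label wrapped in the prefix and suffix, while mixed labels raise.

results = [{ 'authority_label' => 'armidale' }, { 'authority_label' => 'armidale' }]
ScraperUtils::SpecSupport.authority_label(results, suffix: '_')  # => "armidale_"
ScraperUtils::SpecSupport.authority_label(nil)                   # => nil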

.bot_protection_detected?(page) ⇒ Boolean

Check if the page response indicates bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to check

Returns:

  • (Boolean)

    True if bot protection is detected



# File 'lib/scraper_utils/spec_support.rb', line 226

def self.bot_protection_detected?(page)
  return true if %w[403 429].include?(page.code)

  return false unless page.body

  body_lower = page.body.downcase

  # Check for common bot protection indicators
  bot_indicators = [
    'recaptcha',
    'cloudflare',
    'are you human',
    'bot detection',
    'security check',
    'verify you are human',
    'access denied',
    'blocked',
    'captcha'
  ]

  bot_indicators.any? { |indicator| body_lower.include?(indicator) }
end
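
A sketch of checking a fetched page; the agent setup and info_url are assumptions. Note Mechanize raises on 4xx/5xx by default, so error pages arrive via the exception:

require "mechanize"

agent = Mechanize.new
begin
  page = agent.get(info_url)  # info_url is assumed to be defined by the spec
rescue Mechanize::ResponseCodeError => e
  page = e.page  # the response page is still available on the error
end
puts "Bot protection detected" if ScraperUtils::SpecSupport.bot_protection_detected?(page)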

.fetch_url_with_redirects(url) ⇒ Object



# File 'lib/scraper_utils/spec_support.rb', line 65

def self.fetch_url_with_redirects(url)
  agent = Mechanize.new
  # FIXME - Allow injection of a check to agree to terms if needed, to set a cookie and re-fetch the url
  agent.get(url)
end
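
Pending that FIXME, each validate_* helper below accepts a block that replaces this default fetch. A sketch of such a block, assuming the site shows an "agree to terms" form (the form detection is hypothetical and site-specific):

terms_aware_fetch = lambda do |url|
  agent = Mechanize.new
  page = agent.get(url)
  # Hypothetical terms handling: if an "agree" form is present, submit it so the
  # session cookie is set, then fetch the original url again
  agree_form = page.forms.find { |form| form.action.to_s =~ /agree|terms/i }
  if agree_form
    agree_form.submit
    page = agent.get(url)
  end
  page
end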

.geocodable?(address, ignore_case: false) ⇒ Boolean

Check if an address is likely to be geocodable by analyzing its format. This check is stricter than strictly necessary; specs typically assert that at least 75% of addresses match.

Parameters:

  • address (String)

    The address to check

  • ignore_case (Boolean) (defaults to: false)

    Whether to upcase the address before checking the state and street type patterns

Returns:

  • (Boolean)

    True if the address appears to be geocodable.



# File 'lib/scraper_utils/spec_support.rb', line 104

def self.geocodable?(address, ignore_case: false)
  return false if address.nil? || address.empty?
  check_address = ignore_case ? address.upcase : address

  # Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
  has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
  has_postcode = address.match?(AUSTRALIAN_POSTCODES)

  # Using the pre-compiled patterns
  has_street_type = STREET_TYPE_PATTERNS.any? { |pattern| check_address.match?(pattern) }

  uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
  has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }

  if ENV["DEBUG"]
    missing = []
    missing << "street type" unless has_street_type
    missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
    missing << "state" unless has_state
    puts "  address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
  end

  has_street_type && (has_postcode || has_uppercase_suburb) && has_state
end
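
Worked examples against the rules above (the addresses are illustrative):

ScraperUtils::SpecSupport.geocodable?("10 Smith St SOUTHBANK VIC 3006")
# => true (street type "St", UPPERCASE suburb, postcode and state "VIC")
ScraperUtils::SpecSupport.geocodable?("10 Smith Street, Victoria")
# => false (no state abbreviation, and no postcode or UPPERCASE suburb)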

.placeholder?(text) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/scraper_utils/spec_support.rb', line 137

def self.placeholder?(text)
  PLACEHOLDERS.any? { |placeholder| text.to_s.match?(placeholder) }
end
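
Examples against the PLACEHOLDERS patterns above:

ScraperUtils::SpecSupport.placeholder?("TBC")                      # => true
ScraperUtils::SpecSupport.placeholder?("No description provided")  # => true
ScraperUtils::SpecSupport.placeholder?("Two storey dwelling")      # => false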

.reasonable_description?(text) ⇒ Boolean

Check if the text looks like a “reasonable” description. This check is stricter than strictly necessary; specs typically assert that at least 75% of descriptions match.

Returns:

  • (Boolean)


# File 'lib/scraper_utils/spec_support.rb', line 166

def self.reasonable_description?(text)
  return false if placeholder?(text)

  # Long descriptions (3+ words) are assumed reasonable
  return true if text.to_s.split.size >= 3

  # Short descriptions must contain at least one planning keyword
  text_lower = text.to_s.downcase
  PLANNING_KEYWORDS.any? { |keyword| text_lower.include?(keyword) }
end
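
Examples covering the three branches:

ScraperUtils::SpecSupport.reasonable_description?("New two storey dwelling")  # => true (3+ words)
ScraperUtils::SpecSupport.reasonable_description?("Carport")                  # => true (planning keyword)
ScraperUtils::SpecSupport.reasonable_description?("TBC")                      # => false (placeholder)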

.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3) ⇒ Object

Validates that enough addresses are geocodable.

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

The minimum percentage of addresses expected to be geocodable (default: 50)

  • variation (Integer) (defaults to: 3)

The allowed shortfall below the percentage threshold, as a count of records (default: 3)

Raises:

  • RuntimeError if insufficient addresses are geocodable



# File 'lib/scraper_utils/spec_support.rb', line 86

def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
  return nil if results.empty?

  geocodable = results
                 .map { |record| record["address"] }
                 .uniq
                 .count { |text| ScraperUtils::SpecSupport.geocodable? text }
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
         "(#{(100.0 * geocodable / results.count).round(1)}%)"
  expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
  raise "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}" unless geocodable >= expected
  geocodable
end
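
With the defaults, at least percentage% of results.count minus variation unique addresses (but never fewer than one) must be geocodable. A sketch, assuming results holds the scraped records:

# e.g. with 100 records and the defaults, at least 50% * 100 - 3 = 47 must pass
ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results)
ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 70, variation: 0)  # stricter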

.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3) ⇒ Object

Validates that enough descriptions are reasonable.

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

The minimum percentage of descriptions expected to be reasonable (default: 50)

  • variation (Integer) (defaults to: 3)

The allowed shortfall below the percentage threshold, as a count of records (default: 3)

Raises:

  • RuntimeError if insufficient descriptions are reasonable



# File 'lib/scraper_utils/spec_support.rb', line 146

def self.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
  return nil if results.empty?

  descriptions = results
                   .map { |record| record["description"] }
                   .uniq
                   .count do |text|
    selected = ScraperUtils::SpecSupport.reasonable_description? text
    puts "  description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
    selected
  end
  puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
         "(#{(100.0 * descriptions / results.count).round(1)}%)"
  expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
  raise "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
  descriptions
end
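
Usage mirrors the address validator; setting DEBUG additionally prints each description judged unreasonable (results assumed as above):

ENV["DEBUG"] = "1"  # print each description judged unreasonable
ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 60, variation: 2)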

.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false) {|String| ... } ⇒ Object

Validates that info_urls have expected details (unique URLs with content validation)

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 75)

The minimum percentage of detail checks expected to pass (default: 75)

  • variation (Integer) (defaults to: 3)

The allowed shortfall below the percentage threshold, as a count of records (default: 3)

  • bot_check_expected (Boolean) (defaults to: false)

    Whether bot protection is acceptable

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if insufficient detail checks pass



# File 'lib/scraper_utils/spec_support.rb', line 213

def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false, &block)
  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
      check_info_url_details(results, percentage, variation, bot_check_expected, &block)
    end
  else
    check_info_url_details(results, percentage, variation, bot_check_expected, &block)
  end
end
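
A sketch of passing a custom fetcher; the block receives each URL as a String and must return the fetched page (any terms handling inside it is site-specific and assumed):

ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, bot_check_expected: true) do |url|
  agent = Mechanize.new
  agent.get(url)  # replace with site-specific logic, e.g. agreeing to terms first
end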

.validate_page_response(page, bot_check_expected) ⇒ Object

Validate page response, accounting for bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to validate

  • bot_check_expected (Boolean)

    Whether bot protection is acceptable

Raises:

  • RuntimeError if page response is invalid and bot protection not expected



# File 'lib/scraper_utils/spec_support.rb', line 253

def self.validate_page_response(page, bot_check_expected)
  if bot_check_expected && bot_protection_detected?(page)
    puts "  Bot protection detected - accepting as valid response"
    return
  end

  raise "Expected 200 response from the one expected info_url, got #{page.code}" unless page.code == "200"
end
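
A sketch, assuming expected_url is defined by the spec:

page = ScraperUtils::SpecSupport.fetch_url_with_redirects(expected_url)
ScraperUtils::SpecSupport.validate_page_response(page, false)  # raises unless page.code == "200"
ScraperUtils::SpecSupport.validate_page_response(page, true)   # bot-protection responses also accepted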

.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false) {|String| ... } ⇒ Object

Validates that all records use the expected global info_url and it returns 200

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • expected_url (String)

    The expected global info_url for this authority

  • bot_check_expected (Boolean) (defaults to: false)

    Whether bot protection is acceptable

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if records don’t use the expected URL or it doesn’t return 200



# File 'lib/scraper_utils/spec_support.rb', line 183

def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false, &block)
  info_urls = results.map { |record| record["info_url"] }.uniq

  unless info_urls.size == 1
    raise "Expected all records to use one info_url '#{expected_url}', found: #{info_urls.size}"
  end
  unless info_urls.first == expected_url
    raise "Expected all records to use global info_url '#{expected_url}', found: #{info_urls.first}"
  end

  puts "Checking the one expected info_url returns 200: #{expected_url}"

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url") do
      page = block_given? ? block.call(expected_url) : fetch_url_with_redirects(expected_url)
      validate_page_response(page, bot_check_expected)
    end
  else
    page = block_given? ? block.call(expected_url) : fetch_url_with_redirects(expected_url)
    validate_page_response(page, bot_check_expected)
  end
end
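
A sketch, assuming results came from the scraper and the URL shown stands in for the authority's single global info_url:

ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(
  results,
  "https://www.example.gov.au/planning-applications"  # illustrative URL
) do |url|
  Mechanize.new.get(url)  # optional block replaces fetch_url_with_redirects
end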