Module: ScraperUtils::SpecSupport

Defined in:
lib/scraper_utils/spec_support.rb

Overview

Methods to support specs

Constant Summary collapse

# Australian state/territory abbreviations, used to detect a state word in an address.
AUSTRALIAN_STATES =
%w[ACT NSW NT QLD SA TAS VIC WA].freeze
# Case-insensitive patterns matching common Australian street types and their
# abbreviations (e.g. "Av"/"Ave"/"Avenue", "St"/"Street"); used by geocodable?.
STREET_TYPE_PATTERNS =
[
  /\bArcade\b/i,
  /\bAv(e(nue)?)?\b/i,
  /\bB(oulevard|lvd|vd)\b/i,
  /\b(Circuit|Cct)\b/i,
  /\bCir(cle)?\b/i,
  /\bCl(ose)?\b/i,
  /\bC(our|r)?t\b/i,
  /\bChase\b/i,
  /\bCorso\b/i,
  /\bCr(es(cent)?)?\b/i,
  /\bCross\b/i,
  /\bDr((ive)?|v)\b/i,
  /\bEnt(rance)?\b/i,
  /\bEsp(lanade)?\b/i,
  /\bGr(ove)?\b/i,
  /\bH(ighwa|w)y\b/i,
  /\bL(ane?|a)\b/i,
  /\bLoop\b/i,
  /\bM(ews|w)\b/i,
  /\bP(arade|de)\b/i,
  /\bParkway\b/i,
  /\bPl(ace)?\b/i,
  /\bPriv(ate)?\b/i,
  /\bProm(enade)?\b/i,
  /\bQuay\b/i,
  /\bR(oa)?d\b/i,
  /\bR(idge|dg)\b/i,
  /\bRise\b/i,
  /\bSq(uare)?\b/i,
  /\bSt(reet)?\b/i,
  /\bT(erra)?ce\b/i,
  /\bWa?y\b/i
].freeze
# Matches any four-digit sequence (Australian postcodes are four digits).
# NOTE(review): also matches other 4-digit numbers (e.g. years) - accepted as heuristic.
AUSTRALIAN_POSTCODES =
/\b\d{4}\b/.freeze
# Keywords that allow a short (fewer than 3 words) description to count as
# "reasonable" in reasonable_description?.
PLANNING_KEYWORDS =
[
  # Building types
  "dwelling", "house", "unit", "building", "structure", "facility",
  # Modifications
  "addition", "extension", "renovation", "alteration", "modification",
  "replacement", "upgrade", "improvement",
  # Specific structures
  "carport", "garage", "shed", "pool", "deck", "patio", "pergola",
  "verandah", "balcony", "fence", "wall", "driveway",
  # Development types
  "subdivision", "demolition", "construction", "development",
  # Services/utilities
  "signage", "telecommunications", "stormwater", "water", "sewer",
  # Approvals/certificates
  "certificate", "approval", "consent", "permit"
].freeze
# Patterns identifying placeholder text (e.g. "N/A", "TBC") that should not be
# treated as a real description.
PLACEHOLDERS =
[
  /no description/i,
  /not available/i,
  /to be confirmed/i,
  /\btbc\b/i,
  %r{\bn/a\b}i
].freeze

Class Method Summary collapse

Class Method Details

.authority_label(results, prefix: "", suffix: "") ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/scraper_utils/spec_support.rb', line 77

# Returns the single authority_label shared by all records, wrapped in the
# given prefix/suffix.
#
# @param results [Array<Hash>, nil] scraped records
# @param prefix [String] text placed before the label
# @param suffix [String] text placed after the label
# @return [String, nil] nil when results is nil or contains no labels
# @raise [RuntimeError] when more than one distinct label is present
def self.authority_label(results, prefix: "", suffix: "")
  return nil unless results

  labels = results.map { |r| r["authority_label"] }.compact.uniq
  case labels.size
  when 0
    nil
  when 1
    "#{prefix}#{labels.first}#{suffix}"
  else
    raise "Expected one authority_label, not #{labels.inspect}"
  end
end

.bot_protection_detected?(page) ⇒ Boolean

Check if the page response indicates bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to check

Returns:

  • (Boolean)

    True if bot protection is detected



298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/scraper_utils/spec_support.rb', line 298

# Check if the page response indicates bot protection.
#
# @param page [Mechanize::Page] the page response to check (must respond to #code and #body)
# @return [Boolean] true if bot protection is detected
def self.bot_protection_detected?(page)
  # 403/429 are the status codes bot-protection services typically return
  return true if %w[403 429].include?(page.code)

  return false unless page.body

  # body is guaranteed non-nil by the guard above, so plain #downcase suffices
  # (original used a redundant safe-navigation call here)
  body_lower = page.body.downcase

  # Common bot protection indicators.
  # NOTE(review): broad terms like "blocked" may occasionally match legitimate content
  bot_indicators = [
    "recaptcha",
    "cloudflare",
    "are you human",
    "bot detection",
    "security check",
    "verify you are human",
    "access denied",
    "blocked",
    "captcha"
  ]

  bot_indicators.any? { |indicator| body_lower.include?(indicator) }
end

.check_info_url_details(results, percentage, variation, bot_check_expected, &block) ⇒ Object



377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
# File 'lib/scraper_utils/spec_support.rb', line 377

# Checks that a fibonacci-spaced sample of results' info_url pages contain the
# record's council_reference, address and description (allowing common
# reformatting), raising once failures exceed the percentage/variation allowance.
#
# @param results [Array<Hash>] records with "info_url", "council_reference",
#   "address" and "description" attributes
# @param percentage [Integer] min percentage of detail checks expected to pass
# @param variation [Integer] extra failures allowed on top of percentage
# @param bot_check_expected [Boolean] skip detailed validation when bot protection detected
# @yield [String] optional block to fetch the page (e.g. to agree to terms first)
# @raise [UnprocessableRecord] when a page returns a non-200 response
# @raise [RuntimeError] when too many detail checks fail
def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
  count = 0
  failed = 0
  # Spot-check fibonacci-spaced indices rather than fetching every record's page
  fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq

  fib_indices.each do |index|
    record = results[index]
    info_url = record["info_url"]
    puts "Checking info_url[#{index}]: #{info_url} has the expected reference, address and description..."

    page = block_given? ? block.call(info_url) : fetch_url_with_redirects(info_url)

    if bot_check_expected && bot_protection_detected?(page)
      puts "  Bot protection detected - skipping detailed validation"
      next
    end

    unless page.code == "200"
      raise UnprocessableRecord,
            "Expected 200 response, got #{page.code}"
    end

    # Collapse runs of whitespace so formatting differences don't cause false misses
    page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")

    %w[council_reference address description].each do |attribute|
      count += 1
      expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
      # Second accepted form: references without a leading "DA -" prefix, or
      # addresses with the last two words swapped (Lismore-style state/postcode order)
      expected2 = case attribute
                  when "council_reference"
                    expected.sub(/\ADA\s*-\s*/, "")
                  when "address"
                    expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, "") # Handle Lismore post-code/state swap
                  else
                    expected
                  end
      # Third accepted form: addresses without a trailing state, with commas and
      # hyphens normalised (the page body gets the same normalisation below)
      expected3 = case attribute
                  when "address"
                    expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, "")
                  else
                    expected
                  end.gsub(/\s*,\s*/, " ").gsub(/\s*-\s*/, "-")
      next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, " ").gsub(
        /\s*-\s*/, "-"
      ).include?(expected3)

      failed += 1
      desc2 = expected2 == expected ? "" : " or #{expected2.inspect}"
      desc3 = expected3 == expected ? "" : " or #{expected3.inspect}"
      puts "  Missing: #{expected.inspect}#{desc2}#{desc3}"
      puts "    IN: #{page_body}" if ENV["DEBUG"]

      # Abort early once the pass count can no longer reach the required minimum
      min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
      passed = count - failed
      if passed < min_required
        raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
      end
    end
  end

  return unless count > 0

  puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!"
end

.check_info_url_is_present(results, percentage, variation, &block) ⇒ Object



336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# File 'lib/scraper_utils/spec_support.rb', line 336

# Checks that the info_url of a fibonacci-spaced sample of results responds to
# a HEAD request with a 2xx status, tolerating bot protection (403/429).
# Raises once failures exceed the percentage/variation allowance.
#
# @param results [Array<Hash>] records with an "info_url" attribute
# @param percentage [Integer] min percentage of checks expected to pass
# @param variation [Integer] extra failures allowed on top of percentage
# @yield [String] optional block to fetch the page (e.g. to agree to terms first)
def self.check_info_url_is_present(results, percentage, variation, &block)
  checked = 0
  failures = 0
  # Spot-check fibonacci-spaced indices rather than every record
  sample_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq

  sample_indices.each do |idx|
    url = results[idx]["info_url"]
    puts "Checking info_url[#{idx}]: #{url} is present..."

    status =
      begin
        response = block_given? ? block.call(url) : fetch_url_head(url)
        response.code.to_i
      rescue Mechanize::ResponseCodeError => e
        e.response_code.to_i
      end

    # Bot protection responses are skipped rather than counted as failures
    if [403, 429].include?(status)
      puts "  Bot protection detected - skipping"
      next
    end

    checked += 1
    if (200..299).cover?(status)
      puts "  OK: #{status}" if ENV["DEBUG"]
    else
      failures += 1
      puts "  Failed: #{status}"
      # Abort early once passes can no longer reach the required minimum
      min_required = (((percentage.to_f / 100.0) * checked) - variation).round(0)
      passing = checked - failures
      if passing < min_required
        raise "Too many failures: #{passing}/#{checked} passed (min required: #{min_required})"
      end
    end
  end

  return unless checked > 0

  puts "#{(100.0 * (checked - failures) / checked).round(1)}% info_url checks passed (#{failures}/#{checked} failed)!"
end

.fetch_url_head(url) ⇒ Object



65
66
67
68
69
# File 'lib/scraper_utils/spec_support.rb', line 65

# Issues a HEAD request for url using a fresh Mechanize agent.
# FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
def self.fetch_url_head(url)
  Mechanize.new.head(url)
end

.fetch_url_with_redirects(url) ⇒ Object



71
72
73
74
75
# File 'lib/scraper_utils/spec_support.rb', line 71

# GETs url using a fresh Mechanize agent (Mechanize follows redirects by default).
# FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
def self.fetch_url_with_redirects(url)
  Mechanize.new.get(url)
end

.geocodable?(address, ignore_case: false, known_suburbs: []) ⇒ Boolean

Check if an address is likely to be geocodable by analyzing its format. This is a bit stricter than needed - typically assert >= 75% match

Parameters:

  • address (String)

    The address to check

  • ignore_case (Boolean) (defaults to: false)

    Ignores case which relaxes suburb check

  • known_suburbs (Array<String>) (defaults to: [])

    Known suburbs to detect in address when there is no postcode and no uppercase suburb

Returns:

  • (Boolean)

    True if the address appears to be geocodable.



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/scraper_utils/spec_support.rb', line 146

# Check if an address is likely to be geocodable by analyzing its format.
# Requires a street type, a state, and one of: postcode, uppercase suburb, or
# known suburb. This is a bit stricter than needed - typically assert >= 75% match.
#
# @param address [String] the address to check
# @param ignore_case [Boolean] upcases the address first, which relaxes the suburb check
# @param known_suburbs [Array<String>] suburbs to detect when there is no
#   postcode and no uppercase suburb
# @return [Boolean] true if the address appears to be geocodable
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
  return false if address.nil? || address.empty?

  candidate = ignore_case ? address.upcase : address

  # State must appear as a separate word, either at the end or mid-string
  state_found = AUSTRALIAN_STATES.any? do |st|
    candidate.end_with?(" #{st}") || candidate.include?(" #{st} ")
  end
  # Digits are unaffected by case, so check the original address
  postcode_found = address.match?(AUSTRALIAN_POSTCODES)

  # Pre-compiled case-insensitive street-type patterns
  street_type_found = STREET_TYPE_PATTERNS.any? { |re| candidate.match?(re) }

  # Suburbs are conventionally written in capitals; scan the original string so
  # an ignore_case upcase doesn't make every word look like a suburb
  capitalised_words = address.scan(/\b[A-Z]{2,}\b/)
  uppercase_suburb_found = capitalised_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
  known_suburb_found = known_suburbs.any? { |suburb| address.include?(suburb) }

  locality_found = postcode_found || uppercase_suburb_found || known_suburb_found

  if ENV["DEBUG"]
    missing = []
    missing << "street type" unless street_type_found
    missing << "postcode/Uppercase suburb/Known suburb" unless locality_found
    missing << "state" unless state_found
    puts "  address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
  end

  street_type_found && locality_found && state_found
end

.placeholder?(text) ⇒ Boolean

Returns:

  • (Boolean)


187
188
189
# File 'lib/scraper_utils/spec_support.rb', line 187

# @return [Boolean] true when text matches any known placeholder pattern
def self.placeholder?(text)
  normalised = text.to_s
  PLACEHOLDERS.any? { |pattern| pattern.match?(normalised) }
end

.reasonable_description?(text) ⇒ Boolean

Check if this looks like a “reasonable” description This is a bit stricter than needed - typically assert >= 75% match

Returns:

  • (Boolean)


220
221
222
223
224
225
226
227
228
229
# File 'lib/scraper_utils/spec_support.rb', line 220

# Check if this looks like a "reasonable" description.
# This is a bit stricter than needed - typically assert >= 75% match.
#
# @return [Boolean]
def self.reasonable_description?(text)
  return false if placeholder?(text)

  # Three or more words is assumed to be a real description
  word_count = text.to_s.split.size
  return true if word_count >= 3

  # Short descriptions must mention at least one planning keyword
  lowered = text.to_s.downcase
  PLANNING_KEYWORDS.any? { |keyword| lowered.include?(keyword) }
end

.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: []) ⇒ Object

Validates enough addresses are geocodable

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

    The min percentage of addresses expected to be geocodable (default:50)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

  • ignore_case (Boolean) (defaults to: false)

    Ignores case which relaxes suburb check

  • known_suburbs (Array<String>) (defaults to: [])

    Known suburbs to detect in address when there is no postcode and no uppercase suburb

Raises:

  • RuntimeError if insufficient addresses are geocodable



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/scraper_utils/spec_support.rb', line 111

# Validates that enough addresses are geocodable.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of addresses expected to be geocodable
# @param variation [Integer] variation allowed in addition to percentage
# @param ignore_case [Boolean] ignores case which relaxes the suburb check
# @param known_suburbs [Array<String>] known suburbs to detect in address when
#   there is no postcode and no uppercase suburb
# @return [Integer, nil] count of geocodable unique addresses, or nil for empty results
# @raise [UnprocessableSite] if insufficient addresses are geocodable
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
                                            ignore_case: false, known_suburbs: [])
  return nil if results.empty?

  geocodable = results
               .map { |record| record["address"] }
               .uniq
               .count do |text|
    ok = ScraperUtils::SpecSupport.geocodable? text,
                                               known_suburbs: known_suburbs, ignore_case: ignore_case
    if !ok && DebugUtils.verbose?
      # Fixed typos in log message ("geocodeable"/"know suburbs"); known_suburbs
      # defaults to [] so no safe navigation is needed for #size
      ScraperUtils::LogUtils.log(
        "Address: #{text.inspect} is not geocodable with #{known_suburbs.size} known suburbs, ignore_case: #{ignore_case.inspect}"
      )
    end

    ok
  end
  # NOTE(review): the threshold uses results.count while geocodable counts
  # unique addresses, so heavily duplicated addresses relax the check - confirm intended
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
       "(#{(100.0 * geocodable / results.count).round(1)}%)"
  expected = [(((percentage.to_f / 100.0) * results.count) - variation), 1].max
  unless geocodable >= expected
    raise UnprocessableSite,
          "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
  end

  geocodable
end

.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3) ⇒ Object

Validates enough descriptions are reasonable

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

    The min percentage of descriptions expected to be reasonable (default:50)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Raises:

  • RuntimeError if insufficient descriptions are reasonable



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/scraper_utils/spec_support.rb', line 196

# Validates that enough descriptions are reasonable.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of descriptions expected to be reasonable
# @param variation [Integer] variation allowed in addition to percentage
# @return [Integer, nil] count of reasonable unique descriptions, or nil for empty results
# @raise [UnprocessableSite] if insufficient descriptions are reasonable
def self.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
  return nil if results.empty?

  reasonable = results
               .map { |record| record["description"] }
               .uniq
               .count do |text|
    ok = ScraperUtils::SpecSupport.reasonable_description? text
    puts "  description: #{text} is not reasonable" if ENV["DEBUG"] && !ok
    ok
  end
  puts "Found #{reasonable} out of #{results.count} unique reasonable descriptions " \
       "(#{(100.0 * reasonable / results.count).round(1)}%)"
  threshold = [((percentage.to_f / 100.0) * results.count) - variation, 1].max
  unless reasonable >= threshold
    raise UnprocessableSite,
          "Expected at least #{threshold} (#{percentage}% - #{variation}) reasonable descriptions, got #{reasonable}"
  end

  reasonable
end

.validate_info_urls_are_present!(results, percentage: 75, variation: 3) {|String| ... } ⇒ Object

Validates that info_urls are present (respond to a HEAD request with a 200 to 299 status)

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 75)

    The min percentage of detail checks expected to pass (default:75)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if insufficient detail checks pass



267
268
269
270
271
272
273
274
275
# File 'lib/scraper_utils/spec_support.rb', line 267

# Validates that info_urls are present (respond to a HEAD request with a
# 200-299 status), recording HTTP interactions via VCR when it is loaded.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of checks expected to pass
# @param variation [Integer] variation allowed in addition to percentage
# @yield [String] optional block to customize URL fetching (e.g. handle terms agreement)
# @raise [RuntimeError] if insufficient checks pass
def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
  run_checks = lambda do
    check_info_url_is_present(results, percentage, variation, &block)
  end

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") { run_checks.call }
  else
    run_checks.call
  end
end

.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false) {|String| ... } ⇒ Object

Validates that info_urls have expected details (unique URLs with content validation)

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 75)

    The min percentage of detail checks expected to pass (default:75)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

  • bot_check_expected (Boolean) (defaults to: false)

    Whether bot protection is acceptable

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if insufficient detail checks pass



284
285
286
287
288
289
290
291
292
293
# File 'lib/scraper_utils/spec_support.rb', line 284

# Validates that info_urls have expected details (unique URLs with content
# validation), recording HTTP interactions via VCR when it is loaded.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of detail checks expected to pass
# @param variation [Integer] variation allowed in addition to percentage
# @param bot_check_expected [Boolean] whether bot protection is acceptable
# @yield [String] optional block to customize URL fetching (e.g. handle terms agreement)
# @raise [RuntimeError] if insufficient detail checks pass
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
                                                   bot_check_expected: false, &block)
  run_checks = lambda do
    check_info_url_details(results, percentage, variation, bot_check_expected, &block)
  end

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") { run_checks.call }
  else
    run_checks.call
  end
end

.validate_page_response(page, bot_check_expected) ⇒ Object

Validate page response, accounting for bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to validate

  • bot_check_expected (Boolean)

    Whether bot protection is acceptable

Raises:

  • RuntimeError if page response is invalid and bot protection not expected



325
326
327
328
329
330
331
332
333
334
# File 'lib/scraper_utils/spec_support.rb', line 325

# Validate page response, accounting for bot protection.
#
# @param page [Mechanize::Page] the page response to validate
# @param bot_check_expected [Boolean] whether bot protection is acceptable
# @raise [RuntimeError] if the response is not 200 and bot protection is not expected
def self.validate_page_response(page, bot_check_expected)
  if bot_check_expected && bot_protection_detected?(page)
    puts "  Bot protection detected - accepting as valid response"
  elsif page.code != "200"
    raise "Expected 200 response from the one expected info_url, got #{page.code}"
  end
end

.validate_unique_references!(records) ⇒ Object

Finds records with duplicate [authority_label, council_reference] keys.

Parameters:

  • records (Array<Hash>)

    All records to check

Raises:



93
94
95
96
97
98
99
100
101
102
# File 'lib/scraper_utils/spec_support.rb', line 93

# Finds records with duplicate [authority_label, council_reference] keys
# (council_reference is compared case-insensitively).
#
# @param records [Array<Hash>] all records to check
# @raise [UnprocessableSite] when any duplicate keys are found
def self.validate_unique_references!(records)
  groups = records.group_by do |r|
    [r["authority_label"], r["council_reference"]&.downcase]
  end
  duplicates = groups.select { |_k, g| g.size > 1 }
  return if duplicates.empty?

  # Message fixed: these are duplicate [authority_label, council_reference]
  # pairs, not merely duplicate authority labels
  raise UnprocessableSite,
        "Duplicate [authority_label, council_reference] keys: #{duplicates.keys.map(&:inspect).join(', ')}"
end

.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false) {|String| ... } ⇒ Object

Validates that all records use the expected global info_url and it returns 200

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • expected_url (String)

    The expected global info_url for this authority

  • bot_check_expected (Boolean) (defaults to: false)

    Whether bot protection is acceptable

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if records don’t use the expected URL or it doesn’t return 200



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/scraper_utils/spec_support.rb', line 237

# Validates that all records use the expected global info_url and it returns 200.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param expected_url [String] the expected global info_url for this authority
# @param bot_check_expected [Boolean] whether bot protection is acceptable
# @yield [String] optional block to customize URL fetching (e.g. handle terms agreement)
# @raise [RuntimeError] if records don't use the expected URL or it doesn't return 200
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
                                           &block)
  info_urls = results.map { |record| record["info_url"] }.uniq

  unless info_urls.size == 1
    raise "Expected all records to use one info_url '#{expected_url}', found: #{info_urls.size}"
  end
  unless info_urls.first == expected_url
    raise "Expected all records to use global info_url '#{expected_url}', found: #{info_urls.first}"
  end

  puts "Checking the one expected info_url returns 200: #{expected_url}"

  fetch_and_validate = lambda do
    page = block ? block.call(expected_url) : fetch_url_with_redirects(expected_url)
    validate_page_response(page, bot_check_expected)
  end

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url") { fetch_and_validate.call }
  else
    fetch_and_validate.call
  end
end