Class: ScraperCentral

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper_central.rb,
lib/scraper_central/version.rb

Constant Summary collapse

VERSION =
'2.0.1'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ ScraperCentral

Returns a new instance of ScraperCentral.



14
15
16
17
# File 'lib/scraper_central.rb', line 14

# Sets up the internal collaborators of a new ScraperCentral.
#
# Only the logger (writing to $stdout) and the mutex that #fetch holds for
# the duration of each call are created here. None of the configuration
# attributes (s3_key, proxy_name, headers, ...) are assigned, so their
# readers return nil until something sets them.
def initialize
  @logger = Logger.new($stdout)
  @lock = Mutex.new
end

Instance Attribute Details

#auth_config ⇒ Object

Returns the value of attribute auth_config.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @auth_config.
# #initialize does not assign it, so this is nil until something sets it.
# #cache_server forwards the value unchanged to CacheServer.new.
def auth_config
  @auth_config
end

#cache_duration ⇒ Object

Returns the value of attribute cache_duration.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @cache_duration.
# #initialize does not assign it, so this is nil until something sets it.
# #cache_server forwards it to CacheServer.new and #print_proxy_values
# always logs it.
def cache_duration
  @cache_duration
end

#cookies ⇒ Object

Returns the value of attribute cookies.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @cookies.
# #initialize does not assign it, so this is nil until something sets it.
# #fetch passes it to the proxy client as params[:cookies];
# #print_proxy_values logs it only when it is truthy.
def cookies
  @cookies
end

#enable_image_cache ⇒ Object

Returns the value of attribute enable_image_cache.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @enable_image_cache.
# #initialize does not assign it, so this is nil until something sets it.
# The value is forwarded to CacheServer.new (#cache_server), to the proxy
# client params and to CacheServer#put_cache (#fetch); it is logged by
# #print_proxy_values only when truthy.
def enable_image_cache
  @enable_image_cache
end

#enable_js ⇒ Object

Returns the value of attribute enable_js.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @enable_js.
# #initialize does not assign it, so this is nil until something sets it.
# The value is forwarded to CacheServer.new (#cache_server) and to the
# proxy client params (#fetch); #print_proxy_values always logs it.
def enable_js
  @enable_js
end

#headers ⇒ Object

Returns the value of attribute headers.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @headers.
# #initialize does not assign it, so this is nil until something sets it.
# #fetch passes it to the proxy client as params[:headers];
# #print_proxy_values logs it only when it is truthy.
def headers
  @headers
end

#proxy_name ⇒ Object

Returns the value of attribute proxy_name.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @proxy_name.
# #initialize does not assign it, so this is nil until something sets it.
# #cache_server forwards it to CacheServer.new and #print_proxy_values
# always logs it. Note that #fetch picks the proxy client from the
# 'proxyName' entry returned by the cache server, not from this reader.
def proxy_name
  @proxy_name
end

#query_params ⇒ Object

Returns the value of attribute query_params.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @query_params.
# #initialize does not assign it, so this is nil until something sets it.
# #fetch passes it to the proxy client as params[:query_params];
# #print_proxy_values logs it only when it is truthy.
def query_params
  @query_params
end

#retry_attr ⇒ Object

Returns the value of attribute retry_attr.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @retry_attr.
# #initialize does not assign it, so this is nil until something sets it.
# #fetch passes it to the proxy client as params[:retry_attr];
# #print_proxy_values logs it only when it is truthy.
def retry_attr
  @retry_attr
end

#s3_key ⇒ Object

Returns the value of attribute s3_key.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @s3_key.
# #initialize does not assign it, so this is nil until something sets it.
# #cache_server forwards it to CacheServer.new; #fetch reads
# s3_key[:country]; #print_proxy_values calls #empty? on it and reads
# :marketplace, :country, :identifier, :page_type and :page_number.
# NOTE(review): presumably a Hash with symbol keys — confirm against callers.
def s3_key
  @s3_key
end

#timeout ⇒ Object

Returns the value of attribute timeout.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @timeout.
# #initialize does not assign it, so this is nil until something sets it.
# #fetch passes it to the proxy client as params[:timeout];
# #print_proxy_values logs it only when it is truthy.
def timeout
  @timeout
end

#tls_verify ⇒ Object

Returns the value of attribute tls_verify.



11
12
13
# File 'lib/scraper_central.rb', line 11

# Reader for @tls_verify.
# #initialize does not assign it, so this is nil until something sets it.
# #fetch passes it to the proxy client as params[:tls_verify];
# #print_proxy_values logs it only when it is truthy, so an explicit
# false never appears in the log.
def tls_verify
  @tls_verify
end

Instance Method Details

#cache_server ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
# File 'lib/scraper_central.rb', line 94

# Builds a CacheServer configured from the current attribute values.
#
# A new CacheServer is constructed on every call (nothing is memoized), so
# each call reflects whatever the attributes hold at that moment. The
# settings are handed over as a single positional Hash.
#
# @return [CacheServer]
def cache_server
  CacheServer.new(
    {
      proxy_name: proxy_name,
      enable_js: enable_js,
      cache_duration: cache_duration,
      s3_key: s3_key,
      enable_image_cache: enable_image_cache,
      auth_config: auth_config
    }
  )
end

#fetch(url) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/scraper_central.rb', line 19

# Fetches +url+, preferring the cache server and falling back to the proxy
# that the cache server nominates.
#
# The whole call runs inside @lock.synchronize, so concurrent calls on one
# instance are serialized. The steps are:
#
# 1. Ask the cache server for the page. When it returns no proxy
#    descriptor, the page is treated as a cache hit and wrapped in a
#    Response with code 200 (the body itself is not inspected).
# 2. Otherwise fetch through the proxy named by the descriptor's
#    'proxyName' ('BrightData', 'CrawlBase' or 'ScraperApi').
# 3. If the name is unknown, or the proxy answers with anything other than
#    200, log the failure and return a bare Response carrying that status
#    (500 when there is no status at all).
# 4. On success, upload the page to the cache from a background thread
#    that is not joined, and return the proxy's own response object.
#
# The upload thread only uses values captured while the lock is held (the
# CacheServer built for the lookup, the cache key and the image-cache
# flag), so reconfiguring this instance after #fetch returns cannot change
# what an in-flight upload sends.
#
# @param url [Object] the address to fetch; passed through unchanged to
#   CacheServer#get_cache and to the proxy client
# @return [Response, Object] a Response on cache hit or proxy failure,
#   otherwise whatever the proxy client's #fetch returned
def fetch(url)
  @lock.synchronize do
    @url = url
    server = cache_server
    page_from_server, headers_from_server, proxy_from_server = server.get_cache(@url)

    if proxy_from_server.nil?
      print_proxy_values
      return Response.new(code: 200, body: page_from_server, headers: headers_from_server)
    end

    chosen_proxy = proxy_from_server['proxyName']
    params = {
      # s3_key is nil until assigned; a missing key simply means no country.
      country: s3_key && s3_key[:country],
      headers: headers,
      query_params: query_params,
      cookies: cookies,
      timeout: timeout,
      tls_verify: tls_verify,
      retry_attr: retry_attr,
      enable_js: enable_js,
      enable_image_cache: enable_image_cache
    }

    proxy_class =
      case chosen_proxy
      when 'BrightData' then Proxy::BrightData
      when 'CrawlBase' then Proxy::CrawlBase
      when 'ScraperApi' then Proxy::ScraperApi
      end
    # Stays nil for an unrecognised proxy name; handled as a failure below.
    proxy_response = proxy_class.new(params).fetch(@url, proxy_from_server) if proxy_class

    if proxy_response&.code != 200
      status_code = proxy_response&.code || 500
      @logger.error("Error fetching content from proxy: #{chosen_proxy}, error code: #{status_code}, params: #{s3_key}")
      return Response.new(code: status_code)
    end

    # Snapshot everything the upload needs before leaving the lock.
    cache_key = proxy_from_server['cacheKey']
    image_cache = enable_image_cache
    Thread.new do
      server.put_cache(cache_key, proxy_response.body, proxy_response.headers,
                       proxy_response.cookies, image_cache)
      @logger.info("Cache successfully sent to server key: #{cache_key}")
    rescue StandardError => e
      # Best effort: a failed upload is logged and never reaches the caller.
      @logger.error("Error uploading cache to server key: #{cache_key}, error: #{e.message}")
    end

    print_proxy_values

    proxy_response
  end
end


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/scraper_central.rb', line 71

# Logs the URL and configuration of the current fetch at info level.
#
# Output order:
# * the URL stored in @url;
# * the five s3_key parts (marketplace, country, identifier, page_type,
#   page_number), skipped when s3_key is nil or empty;
# * cache_duration, proxy_name and enable_js, always;
# * tls_verify, headers, query_params, cookies, timeout, retry_attr and
#   enable_image_cache, each only when truthy.
#
# This is a pure logging helper, so a never-assigned (nil) s3_key is
# tolerated instead of raising and aborting the fetch that called it.
def print_proxy_values
  @logger.info("url: #{@url}")

  key_parts = s3_key
  if key_parts && !key_parts.empty?
    %i[marketplace country identifier page_type page_number].each do |part|
      @logger.info("#{part}: #{key_parts[part]}")
    end
  end

  @logger.info("cache_duration: #{cache_duration}")
  @logger.info("proxy_name: #{proxy_name}")
  @logger.info("enable_js: #{enable_js}")

  # Optional settings stay out of the log when unset (nil) or false.
  optional_settings = {
    tls_verify: tls_verify,
    headers: headers,
    query_params: query_params,
    cookies: cookies,
    timeout: timeout,
    retry_attr: retry_attr,
    enable_image_cache: enable_image_cache
  }
  optional_settings.each do |label, value|
    @logger.info("#{label}: #{value}") if value
  end
end