Class: ScraperCentral
- Inherits:
-
Object
- Object
- ScraperCentral
- Defined in:
- lib/scraper_central.rb,
lib/scraper_central/version.rb
Constant Summary collapse
- VERSION =
'2.0.1'
Instance Attribute Summary collapse
-
#auth_config ⇒ Object
Returns the value of attribute auth_config.
-
#cache_duration ⇒ Object
Returns the value of attribute cache_duration.
-
#cookies ⇒ Object
Returns the value of attribute cookies.
-
#enable_image_cache ⇒ Object
Returns the value of attribute enable_image_cache.
-
#enable_js ⇒ Object
Returns the value of attribute enable_js.
-
#headers ⇒ Object
Returns the value of attribute headers.
-
#proxy_name ⇒ Object
Returns the value of attribute proxy_name.
-
#query_params ⇒ Object
Returns the value of attribute query_params.
-
#retry_attr ⇒ Object
Returns the value of attribute retry_attr.
-
#s3_key ⇒ Object
Returns the value of attribute s3_key.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
-
#tls_verify ⇒ Object
Returns the value of attribute tls_verify.
Instance Method Summary collapse
- #cache_server ⇒ Object
- #fetch(url) ⇒ Object
-
#initialize ⇒ ScraperCentral
constructor
A new instance of ScraperCentral.
- #print_proxy_values ⇒ Object
Constructor Details
#initialize ⇒ ScraperCentral
Returns a new instance of ScraperCentral.
14 15 16 17 |
# File 'lib/scraper_central.rb', line 14 def initialize @lock = Mutex.new @logger = Logger.new($stdout) end |
Instance Attribute Details
#auth_config ⇒ Object
Returns the value of attribute auth_config.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def auth_config @auth_config end |
#cache_duration ⇒ Object
Returns the value of attribute cache_duration.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def cache_duration @cache_duration end |
#cookies ⇒ Object
Returns the value of attribute cookies.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def cookies @cookies end |
#enable_image_cache ⇒ Object
Returns the value of attribute enable_image_cache.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def enable_image_cache @enable_image_cache end |
#enable_js ⇒ Object
Returns the value of attribute enable_js.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def enable_js @enable_js end |
#headers ⇒ Object
Returns the value of attribute headers.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def headers @headers end |
#proxy_name ⇒ Object
Returns the value of attribute proxy_name.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def proxy_name @proxy_name end |
#query_params ⇒ Object
Returns the value of attribute query_params.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def query_params @query_params end |
#retry_attr ⇒ Object
Returns the value of attribute retry_attr.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def retry_attr @retry_attr end |
#s3_key ⇒ Object
Returns the value of attribute s3_key.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def s3_key @s3_key end |
#timeout ⇒ Object
Returns the value of attribute timeout.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def timeout @timeout end |
#tls_verify ⇒ Object
Returns the value of attribute tls_verify.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def tls_verify @tls_verify end |
Instance Method Details
#cache_server ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/scraper_central.rb', line 94 def cache_server args = { proxy_name: proxy_name, enable_js: enable_js, cache_duration: cache_duration, s3_key: s3_key, enable_image_cache: enable_image_cache, auth_config: auth_config } CacheServer.new(args) end |
#fetch(url) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/scraper_central.rb', line 19 def fetch(url) @lock.synchronize do @url = url page_from_server, headers_from_server, proxy_from_server = cache_server.get_cache(@url) if proxy_from_server.nil? print_proxy_values return Response.new(code: 200, body: page_from_server, headers: headers_from_server) else proxy_response = nil params = { country: s3_key[:country], headers: headers, query_params: query_params, cookies: cookies, timeout: timeout, tls_verify: tls_verify, retry_attr: retry_attr, enable_js: enable_js, enable_image_cache: enable_image_cache } case proxy_from_server['proxyName'] when 'BrightData' proxy_response = Proxy::BrightData.new(params).fetch(@url, proxy_from_server) when 'CrawlBase' proxy_response = Proxy::CrawlBase.new(params).fetch(@url, proxy_from_server) when 'ScraperApi' proxy_response = Proxy::ScraperApi.new(params).fetch(@url, proxy_from_server) end if proxy_response.nil? || proxy_response&.code != 200 status_code = proxy_response&.code || 500 @logger.error("Error fetching content from proxy: #{proxy_from_server['proxyName']}, error code: #{status_code}, params: #{s3_key}") return Response.new(code: status_code) end Thread.new do cache_server.put_cache(proxy_from_server['cacheKey'], proxy_response.body, proxy_response.headers, proxy_response.cookies, enable_image_cache) @logger.info("Cache successfully sent to server key: #{proxy_from_server['cacheKey']}") rescue StandardError => e @logger.error("Error uploading cache to server key: #{proxy_from_server['cacheKey']}, error: #{e.message}") end print_proxy_values proxy_response end end end |
#print_proxy_values ⇒ Object
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/scraper_central.rb', line 71 def print_proxy_values @logger.info("url: #{@url}") unless s3_key.empty? @logger.info("marketplace: #{s3_key[:marketplace]}") @logger.info("country: #{s3_key[:country]}") @logger.info("identifier: #{s3_key[:identifier]}") @logger.info("page_type: #{s3_key[:page_type]}") @logger.info("page_number: #{s3_key[:page_number]}") end @logger.info("cache_duration: #{cache_duration}") @logger.info("proxy_name: #{proxy_name}") @logger.info("enable_js: #{enable_js}") @logger.info("tls_verify: #{tls_verify}") if tls_verify @logger.info("headers: #{headers}") if headers @logger.info("query_params: #{query_params}") if query_params @logger.info("cookies: #{cookies}") if cookies @logger.info("timeout: #{timeout}") if timeout @logger.info("retry_attr: #{retry_attr}") if retry_attr @logger.info("enable_image_cache: #{enable_image_cache}") if enable_image_cache end |