Class: SearchSolrTools::Harvesters::Base

Inherits:
Object
  • Object
show all
Includes:
SSTLogger
Defined in:
lib/search_solr_tools/harvesters/base.rb

Overview

base class for solr harvesters

Direct Known Subclasses

AutoSuggest, NsidcJson

Constant Summary collapse

DELETE_DOCUMENTS_RATIO =
0.1
XML_CONTENT_TYPE =
'text/xml; charset=utf-8'
JSON_CONTENT_TYPE =
'application/json; charset=utf-8'

Constants included from SSTLogger

SSTLogger::LOG_LEVELS

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from SSTLogger

#logger, logger

Constructor Details

#initialize(env = 'development', die_on_failure: false) ⇒ Base

Returns a new instance of Base.



26
27
28
29
# File 'lib/search_solr_tools/harvesters/base.rb', line 26

# Builds a harvester bound to one Solr environment.
#
# @param env [String] name of the SolrEnvironments configuration to use
# @param die_on_failure [Boolean] when true, request failures in
#   #get_results are re-raised instead of returning nil
def initialize(env = 'development', die_on_failure: false)
  @environment, @die_on_failure = env, die_on_failure
end

Instance Attribute Details

#environmentObject

Returns the value of attribute environment.



20
21
22
# File 'lib/search_solr_tools/harvesters/base.rb', line 20

# Reader for the configured environment name (e.g. 'development').
def environment = @environment

Instance Method Details

#create_new_solr_add_docObject

returns Nokogiri XML document with content '<?xml version="1.0"?><add/>'



203
204
205
206
207
# File 'lib/search_solr_tools/harvesters/base.rb', line 203

# Builds an empty Solr "add" envelope.
# returns Nokogiri XML document with content '<?xml version="1.0"?><add/>'
def create_new_solr_add_doc
  Nokogiri::XML::Document.new.tap do |add_doc|
    add_doc.root = Nokogiri::XML::Node.new('add', add_doc)
  end
end

#create_new_solr_add_doc_with_child(child) ⇒ Object

returns a Nokogiri XML document with content '<?xml version="1.0"?><add> <child /> </add>'



211
212
213
214
215
# File 'lib/search_solr_tools/harvesters/base.rb', line 211

# Builds a Solr "add" envelope wrapping a single child node.
# returns a Nokogiri XML document with content
# '<?xml version="1.0"?><add> <child /> </add>'
def create_new_solr_add_doc_with_child(child)
  create_new_solr_add_doc.tap { |add_doc| add_doc.root.add_child(child) }
end

#delete_old_documents(timestamp, constraints, solr_core, force: false) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/search_solr_tools/harvesters/base.rb', line 80

# Removes documents matching +constraints+ whose last_update predates
# +timestamp+, unless too many would be deleted (see #remove_documents).
#
# @param timestamp [String] ISO8601 harvest start time; older docs are stale
# @param constraints [String] raw lucene constraint string (sanitized here)
# @param solr_core [String] core to delete from
# @param force [Boolean] passed through to #remove_documents to bypass the ratio guard
def delete_old_documents(timestamp, constraints, solr_core, force: false)
  constraints = sanitize_data_centers_constraints(constraints)
  stale_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
  solr = RSolr.connect url: solr_url + "/#{solr_core}"

  # rows: 0 — we only need the hit count, not the documents themselves.
  count_response = solr.get 'select', params: { wt: :ruby, q: stale_query, rows: 0 }
  stale_count = count_response['response']['numFound'].to_i

  if stale_count.zero?
    logger.info "All documents were updated after #{timestamp}, nothing to delete"
  else
    logger.info "Begin removing documents older than #{timestamp}"
    remove_documents(solr, stale_query, constraints, force, stale_count)
  end
end

#doc_valid?(doc) ⇒ Boolean

Make sure that Solr is able to accept this doc in a POST

Returns:

  • (Boolean)


218
219
220
221
222
223
224
225
226
227
228
# File 'lib/search_solr_tools/harvesters/base.rb', line 218

# Make sure that Solr is able to accept this doc in a POST.
# Docs with no spatial_coverages field, or fewer than 4 coverage values,
# are accepted outright; otherwise defer to #valid_solr_spatial_coverage?.
#
# @param doc [Nokogiri::XML::Document] candidate Solr add document
# @return [Boolean]
def doc_valid?(doc)
  coverage_node = doc.xpath(".//field[@name='spatial_coverages']").first
  return true if coverage_node.nil?

  coverage_values = coverage_node.text.split

  # We've only seen the failure with 4 spatial coverage values
  return true if coverage_values.size < 4

  valid_solr_spatial_coverage?(coverage_values)
end

#encode_data_provider_url(url) ⇒ Object

Some data providers require encoding (such as URI.encode), while others barf on encoding. The default is to just return url, override this in the subclass if special encoding is needed.



40
41
42
# File 'lib/search_solr_tools/harvesters/base.rb', line 40

# Identity hook for provider-specific URL encoding. Some data providers
# require encoding while others barf on it; the default returns the URL
# untouched — override in a subclass if special encoding is needed.
def encode_data_provider_url(url) = url

#get_results(request_url, metadata_path, content_type = 'application/xml') ⇒ Object

Get results from an end point specified in the request_url



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/search_solr_tools/harvesters/base.rb', line 175

# Get results from an end point specified in the request_url.
#
# @param request_url [String] provider endpoint to fetch
# @param metadata_path [String] XPath expression selecting metadata nodes
#   from the parsed response
# @param content_type [String] value sent as the request Content-Type header
# @return [Nokogiri::XML::NodeSet, nil] nodes matching metadata_path, or nil
#   when all retries are exhausted and @die_on_failure is false
# @raise [OpenURI::HTTPError, Timeout::Error, Errno::ETIMEDOUT] the last
#   failure, re-raised only when @die_on_failure is set
def get_results(request_url, metadata_path, content_type = 'application/xml')
  timeout = 300
  retries_left = 3

  request_url = encode_data_provider_url(request_url)

  begin
    logger.debug "Request: #{request_url}"
    response = URI.parse(request_url).open(read_timeout: timeout, 'Content-Type' => content_type)
  rescue OpenURI::HTTPError, Timeout::Error, Errno::ETIMEDOUT => e
    retries_left -= 1
    logger.error "## REQUEST FAILED ## #{e.class} ## Retrying #{retries_left} more times..."

    retry if retries_left.positive?

    # TODO: Do we really need this "die_on_failure" anymore?  The empty return
    #  will cause the "No Documents" error to be thrown in the harvester class
    #  now, so it will pretty much always "die on failure"
    raise e if @die_on_failure

    return
  end
  doc = Nokogiri.XML(response)
  doc.xpath(metadata_path, Helpers::IsoNamespaces.namespaces(doc))
end
end

#get_serialized_doc(doc, content_type) ⇒ Object



164
165
166
167
168
169
170
171
172
# File 'lib/search_solr_tools/harvesters/base.rb', line 164

# Serializes +doc+ to match the target content type: XML via #to_xml when
# available, JSON via MultiJson, otherwise the document unchanged.
def get_serialized_doc(doc, content_type)
  case content_type
  when XML_CONTENT_TYPE
    doc.respond_to?(:to_xml) ? doc.to_xml : doc
  when JSON_CONTENT_TYPE
    MultiJson.dump(doc)
  else
    doc
  end
end

#harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name]) ⇒ Object



71
72
73
74
75
76
77
78
# File 'lib/search_solr_tools/harvesters/base.rb', line 71

# Runs a harvest, then deletes documents not touched by it.
# The timestamp is captured BEFORE the harvest so anything the harvest
# updates is excluded from deletion.
#
# @param harvest_method [#call] callable performing the harvest
# @param delete_constraints [String] constraint string for #delete_old_documents
# @param solr_core [String] target core
# @return the harvest_method call's result
def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
  harvest_started_at = Time.now.utc.iso8601
  result = harvest_method.call
  delete_old_documents(harvest_started_at, delete_constraints, solr_core)
  result
end

#insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name]) ⇒ Object

TODO: Need to return a specific type of failure:

- Bad record content identified and no ingest attempted
- Solr tries to ingest document and fails (bad content not detected prior to ingest)
- Solr cannot insert document for reasons other than the document structure and content.


136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/search_solr_tools/harvesters/base.rb', line 136

# POSTs a single document to the core's update endpoint with commit=true.
#
# @param doc serializable document (Nokogiri XML doc or JSON-able object)
# @param content_type [String] XML_CONTENT_TYPE or JSON_CONTENT_TYPE
# @param core [String] Solr core/collection receiving the document
# @return one of Helpers::HarvestStatus::INGEST_OK,
#   ::INGEST_ERR_INVALID_DOC (doc failed pre-validation),
#   or ::INGEST_ERR_SOLR_ERROR (non-200 response or REST exception)
def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
  url = solr_url + "/#{core}/update?commit=true"
  status = Helpers::HarvestStatus::INGEST_OK

  # Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
  # doesn't seem to recover.
  return Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC if content_type == XML_CONTENT_TYPE && !doc_valid?(doc)

  doc_serialized = get_serialized_doc(doc, content_type)

  # Some docs will cause solr to time out during the POST
  begin
    # Block form of RestClient.post: non-2xx responses are yielded to the
    # block instead of raising, so the status code is inspected manually.
    RestClient.post(url, doc_serialized, content_type:) do |response, _request, _result|
      success = response.code == 200
      unless success
        logger.error "Error for #{doc_serialized}\n\n response: #{response.body}"
        status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
      end
    end
  rescue StandardError => e
    # TODO: Need to provide more detail re: this failure so we know whether to
    #  exit the job with a status != 0
    logger.error "Rest exception while POSTing to Solr: #{e}, for doc: #{doc_serialized}"
    status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
  end
  status
end

#insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name]) ⇒ Object

Update Solr with an array of Nokogiri xml documents, report number of successfully added documents



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/search_solr_tools/harvesters/base.rb', line 115

# Update Solr with an array of Nokogiri xml documents; logs how many were
# added successfully and returns the accumulated harvest status.
#
# @param docs [Array] documents to insert one at a time
# @return [Helpers::HarvestStatus] per-document statuses recorded in order
def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
  status = Helpers::HarvestStatus.new
  added = 0
  rejected = 0

  docs.each do |doc|
    result = insert_solr_doc(doc, content_type, core)
    status.record_status result
    if result == Helpers::HarvestStatus::INGEST_OK
      added += 1
    else
      rejected += 1
    end
  end

  logger.info "#{added} document#{added == 1 ? '' : 's'} successfully added to Solr."
  logger.info "#{rejected} document#{rejected == 1 ? '' : 's'} not added to Solr."

  status
end

#ping_solr(core = SolrEnvironments[@environment][:collection_name]) ⇒ Object

Ping the Solr instance to ensure that it’s running. The ping query is specified to manually check the title, as it’s possible there is no “default” query in the solr instance.



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/search_solr_tools/harvesters/base.rb', line 47

# Ping the Solr instance to ensure that it's running. The ping query
# manually checks the title field, as it's possible there is no "default"
# query configured in the solr instance.
#
# @return [Boolean] true only when the ping endpoint answered 200
def ping_solr(core = SolrEnvironments[@environment][:collection_name])
  ping_url = solr_url + "/#{core}/admin/ping?df=title"
  reachable = false

  # Some docs will cause solr to time out during the POST
  begin
    RestClient.get(ping_url) do |response, _request, _result|
      reachable = response.code == 200
      logger.error "Error in ping request: #{response.body}" unless reachable
    end
  rescue StandardError => e
    logger.error "Rest exception while pinging Solr: #{e}"
  end

  reachable
end

#ping_sourceObject

This should be overridden by child classes to implement the ability to “ping” the data center. Returns true if the ping is successful (or, as in this default, no ping method was defined)



66
67
68
69
# File 'lib/search_solr_tools/harvesters/base.rb', line 66

# Default data-center "ping": child classes override this to actually probe
# their source. Logs that no probe exists and reports success.
#
# @return [true]
def ping_source
  logger.info 'Harvester does not have ping method defined, assuming true'
  true
end

#remove_documents(solr, delete_query, constraints, force, numfound) ⇒ Object



101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/search_solr_tools/harvesters/base.rb', line 101

# Deletes the stale documents, but only when they are a small fraction
# (< DELETE_DOCUMENTS_RATIO) of the data center's total — or when forced.
#
# @param solr [RSolr::Client] connection to delete through
# @param delete_query [String] query selecting the stale documents
# @param constraints [String] query selecting ALL documents for this data center
# @param force [Boolean] bypass the ratio safety check
# @param numfound [Integer] number of stale documents matched by delete_query
def remove_documents(solr, delete_query, constraints, force, numfound)
  total_count = (solr.get 'select', params: { wt: :ruby, q: constraints, rows: 0 })['response']['numFound']

  unless force || (numfound / total_count.to_f < DELETE_DOCUMENTS_RATIO)
    logger.info "Failed to delete records older than current harvest start because they exceeded #{DELETE_DOCUMENTS_RATIO} of the total records for this data center."
    logger.info "\tTotal records: #{total_count}"
    logger.info "\tNon-updated records: #{numfound}"
    return
  end

  logger.info "Deleting #{numfound} documents for #{constraints}"
  solr.delete_by_query delete_query
  solr.commit
end

#sanitize_data_centers_constraints(query_string) ⇒ Object



93
94
95
96
97
98
99
# File 'lib/search_solr_tools/harvesters/base.rb', line 93

# Strips lucene special characters from a constraint string, restores the
# field separators for the data_centers/source parameters, and compresses
# the remaining whitespace.
def sanitize_data_centers_constraints(query_string)
  cleaned = query_string.gsub(/[:&|!~\-\(\)\{\}\[\]\^\*\?\+]+/, ' ')
  cleaned = cleaned
            .gsub('data_centers ', 'data_centers:')
            .gsub('source ', 'source:')
  cleaned.squeeze(' ').strip
end

#solr_urlObject



31
32
33
34
# File 'lib/search_solr_tools/harvesters/base.rb', line 31

# Base URL of the Solr instance for the configured environment.
def solr_url
  config = SolrEnvironments[@environment]
  format('http://%s:%s/%s', config[:host], config[:port], config[:collection_path])
end

#valid_solr_spatial_coverage?(spatial_coverages) ⇒ Boolean

spatial_coverages is an array with length 4:

North, East, South, West

Returns:

  • (Boolean)


232
233
234
235
236
237
238
# File 'lib/search_solr_tools/harvesters/base.rb', line 232

# spatial_coverages is an array with length 4: North, East, South, West.
# A coverage is rejected only when it collapses to a single point at a pole
# (north == south with |latitude| == 90) while east != west.
#
# @return [Boolean]
def valid_solr_spatial_coverage?(spatial_coverages)
  north, east, south, west = spatial_coverages

  pinched_at_pole = north == south && north.to_f.abs == 90

  east == west || !pinched_at_pole
end