Class: GeoCombine::Harvester

Inherits:
Object
  • Object
show all
Defined in:
lib/geo_combine/harvester.rb

Overview

Harvests GeoBlacklight documents from OpenGeoMetadata for indexing.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'), schema_version: ENV.fetch('SCHEMA_VERSION', 'Aardvark'), logger: GeoCombine::Logger.logger) ⇒ Harvester

Returns a new instance of Harvester.



32
33
34
35
36
37
38
39
40
# File 'lib/geo_combine/harvester.rb', line 32

# Set up a harvester by storing its three settings in instance variables.
#
# @param ogm_path [String] value stored in @ogm_path; defaults to the OGM_PATH
#   environment variable, or 'tmp/opengeometadata' when that is unset
# @param schema_version [String] value stored in @schema_version; defaults to
#   the SCHEMA_VERSION environment variable, or 'Aardvark' when that is unset
# @param logger [#debug, #info, #warn] value stored in @logger; defaults to
#   GeoCombine::Logger.logger
def initialize(
  ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'),
  schema_version: ENV.fetch('SCHEMA_VERSION', 'Aardvark'),
  logger: GeoCombine::Logger.logger
)
  @ogm_path, @schema_version, @logger = ogm_path, schema_version, logger
end

Instance Attribute Details

#ogm_path ⇒ Object (readonly)

Returns the value of attribute ogm_path.



12
13
14
# File 'lib/geo_combine/harvester.rb', line 12

# Reader for @ogm_path, the value the constructor stores from its
# ogm_path: keyword argument.
def ogm_path
  @ogm_path
end

#schema_version ⇒ Object (readonly)

Returns the value of attribute schema_version.



12
13
14
# File 'lib/geo_combine/harvester.rb', line 12

# Reader for @schema_version, the value the constructor stores from its
# schema_version: keyword argument.
def schema_version
  @schema_version
end

Class Method Details

.denylist ⇒ Object

Non-metadata repositories that shouldn’t be harvested



15
16
17
18
19
20
21
22
23
24
25
# File 'lib/geo_combine/harvester.rb', line 15

# Names of repositories that carry no metadata and so should not be harvested.
#
# @return [Array<String>] a newly built array on every call
def self.denylist
  %w[
    GeoCombine
    aardvark
    metadata-issues
    ogm_utils-python
    opengeometadata.github.io
    opengeometadata-rails
    gbl-1_to_aardvark
  ]
end

.ogm_api_uri ⇒ Object

GitHub API endpoint for OpenGeoMetadata repositories



28
29
30
# File 'lib/geo_combine/harvester.rb', line 28

# Location of the GitHub API listing of the opengeometadata organization's
# repositories.
#
# NOTE(review): the query asks for per_page=1000; GitHub's REST documentation
# appears to cap per_page at 100 -- confirm whether pagination is needed.
#
# @return [URI] the parsed endpoint
def self.ogm_api_uri
  endpoint = 'https://api.github.com/orgs/opengeometadata/repos?per_page=1000'
  URI.parse(endpoint)
end

Instance Method Details

#clone(repo) ⇒ Object

Clone a repository via git. If the repository already exists, skip it.



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/geo_combine/harvester.rb', line 91

# Clone one OpenGeoMetadata repository with git, unless a directory for it
# already exists under @ogm_path.
#
# @param repo [String] repository name; used to build both the remote URL and
#   the expected local directory
# @return [String, nil] the repository name after cloning, or nil when the
#   local directory already existed and nothing was done
def clone(repo)
  repo_path = File.join(@ogm_path, repo)

  # Check the local directory first: a repository that is going to be skipped
  # should not cost a repository_info lookup, nor fail because of one.
  if File.directory? repo_path
    @logger.warn "skipping clone to #{repo_path}; directory exists"
    return nil
  end

  repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"
  repo_info = repository_info(repo)

  # Archived or empty repositories are still cloned; they only get a warning
  @logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
  @logger.warn "repository is empty: #{repo_url}" if repo_info['size'].zero?

  Git.clone(repo_url, nil, path: ogm_path, depth: 1)
  @logger.info "cloned #{repo_url} to #{repo_path}"
  repo
end

#clone_all ⇒ Object

Clone all repositories via git. Return the names of repositories cloned.



111
112
113
114
115
# File 'lib/geo_combine/harvester.rb', line 111

# Run #clone over every entry of `repositories` and report how many were
# actually cloned.
#
# @return [Array] the non-nil results of #clone, in iteration order
def clone_all
  cloned_names = repositories.each_with_object([]) do |name, collected|
    outcome = clone(name)
    # #clone answers nil for a repository it skipped
    collected << outcome unless outcome.nil?
  end

  @logger.info "cloned #{cloned_names.size} repositories"
  cloned_names
end

#docs_to_index ⇒ Object

Enumerable of docs to index, for passing to an indexer



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/geo_combine/harvester.rb', line 43

# Walk everything under @ogm_path and yield each JSON record whose schema
# version equals @schema_version, together with the file it came from.
# Called without a block, it returns an Enumerator over the same pairs.
#
# @yieldparam record [Object] one parsed record (read with string keys)
# @yieldparam path [String] path of the file the record was parsed from
# @return [Enumerator] when no block is given
def docs_to_index
  return to_enum(:docs_to_index) unless block_given?

  @logger.info "loading documents from #{ogm_path}"
  Find.find(@ogm_path) do |path|
    filename = File.basename(path)

    # Only *.json entries other than layers.json are read as documents
    unless filename.end_with?('.json') && filename != 'layers.json'
      @logger.debug "skipping #{path}; not a geoblacklight JSON document"
      next
    end

    # A file may parse to a single record or to an array of records
    parsed = JSON.parse(File.read(path))
    [parsed].flatten.each do |record|
      record_schema = record['gbl_mdVersion_s'] || record['geoblacklight_version']
      record_id = record['layer_slug_s'] || record['dc_identifier_s']

      # Records written for a different schema version are not yielded
      if record_schema == @schema_version
        @logger.debug "found record #{record_id} at #{path}"
        yield record, path
      else
        @logger.debug "skipping #{record_id}; schema version #{record_schema} doesn't match #{@schema_version}"
      end
    end
  end
end

#pull(repo) ⇒ Object

Update a repository via git. If the repository doesn’t exist, clone it.



72
73
74
75
76
77
78
79
# File 'lib/geo_combine/harvester.rb', line 72

# Bring the local copy of one repository up to date with `git pull`,
# calling #clone first when no local directory exists yet.
#
# @param repo [String] repository name, also the directory name under @ogm_path
# @return [String] the repository name that was passed in
def pull(repo)
  local_dir = File.join(@ogm_path, repo)
  clone(repo) unless File.directory?(local_dir)

  working_copy = Git.open(local_dir)
  working_copy.pull
  @logger.info "updated #{repo}"
  repo
end

#pull_all ⇒ Object

Update all repositories. Return the names of repositories updated.



83
84
85
86
87
# File 'lib/geo_combine/harvester.rb', line 83

# Run #pull over every entry of `repositories` and report how many were
# updated.
#
# @return [Array] the non-nil results of #pull, in iteration order
def pull_all
  outcomes = repositories.map { |name| pull(name) }
  refreshed = outcomes.reject(&:nil?)

  @logger.info "updated #{refreshed.size} repositories"
  refreshed
end