Class: RStore::FileCrawler

Inherits:
Object show all
Defined in:
lib/rstore/file_crawler.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_or_folder, file_type, options = {}) ⇒ FileCrawler

Returns a new instance of FileCrawler.



21
22
23
24
25
26
27
28
29
30
# File 'lib/rstore/file_crawler.rb', line 21

# Builds a crawler for a single file or a folder of files.
#
# Stores the raw arguments, builds a Configuration from them and caches its
# file and parse options, then runs the three writers in sequence.
#
# @param file_or_folder the location handed to Configuration and to file_paths=
# @param file_type the file extension this crawler works with
# @param options [Hash] forwarded unchanged to Configuration.new
def initialize(file_or_folder, file_type, options = {})
  # Raw constructor arguments.
  @path      = file_or_folder
  @file_type = file_type

  # Configuration derived from the arguments, plus the two option sets it exposes.
  @config        = Configuration.new(file_or_folder, options)
  @file_options  = @config.file_options
  @parse_options = @config.parse_options

  # Order matters: each writer is fed the instance variable that the previous
  # writer is expected to have populated.
  self.file_paths        = @path
  self.file_options_hash = @file_paths
  self.data_hash         = @file_options_hash
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



17
18
19
# File 'lib/rstore/file_crawler.rb', line 17

# Reader for @config, the Configuration instance built in #initialize from
# the constructor's file_or_folder and options arguments.
def config
  @config
end

#data_hash ⇒ Object

Returns the value of attribute data_hash.



12
13
14
# File 'lib/rstore/file_crawler.rb', line 12

# Reader for @data_hash.
# NOTE(review): presumably populated by the data_hash= writer that
# #initialize calls last; that writer is not visible here - confirm.
def data_hash
  @data_hash
end

#file_options ⇒ Object (readonly)

Returns the value of attribute file_options.



14
15
16
# File 'lib/rstore/file_crawler.rb', line 14

# Reader for @file_options, which #initialize copies from
# @config.file_options.
def file_options
  @file_options
end

#file_paths ⇒ Object

Returns the value of attribute file_paths.



16
17
18
# File 'lib/rstore/file_crawler.rb', line 16

# Reader for @file_paths.
# NOTE(review): presumably populated by the file_paths= writer that
# #initialize calls with @path; that writer is not visible here - confirm.
def file_paths
  @file_paths
end

#file_type ⇒ Object (readonly)

Returns the value of attribute file_type.



16
17
18
# File 'lib/rstore/file_crawler.rb', line 16

# Reader for @file_type, the file_type argument given to #initialize.
# It is used as the file extension by #can_read? and #parse_directory.
def file_type
  @file_type
end

#parse_options ⇒ Object (readonly)

Returns the value of attribute parse_options.



14
15
16
# File 'lib/rstore/file_crawler.rb', line 14

# Reader for @parse_options, which #initialize copies from
# @config.parse_options.
def parse_options
  @parse_options
end

#path ⇒ Object (readonly)

Returns the value of attribute path.



15
16
17
# File 'lib/rstore/file_crawler.rb', line 15

# Reader for @path, the file_or_folder argument given to #initialize.
def path
  @path
end

Instance Method Details

#can_read?(path) ⇒ Boolean

Returns:

  • (Boolean)


108
109
110
# File 'lib/rstore/file_crawler.rb', line 108

# Reports whether +path+ ends with the extension stored in @file_type.
#
# The comparison is a literal, case-sensitive suffix check against
# ".<file_type>". Characters such as "." or "+" in the file type are taken
# literally rather than as regular-expression metacharacters, and the
# extension must sit at the very end of the string, not merely at the end
# of one of its lines.
#
# @param path [#to_s] file name or path to test; nil yields false
# @return [Boolean] true when +path+ ends with ".<file_type>"
def can_read?(path)
  path.to_s.end_with?(".#{@file_type}")
end

#file_options_hash=(file_paths) ⇒ Object



81
82
83
84
85
86
87
88
# File 'lib/rstore/file_crawler.rb', line 81

# Builds the per-file options table and stores it in @file_options_hash.
#
# Every path in +file_paths+ is mapped to a hash holding the crawler's
# current @file_options and @parse_options (the same objects, not copies).
# Both levels of the table auto-vivify: reading an unknown path stores an
# empty inner hash, and reading an unknown key of an inner hash stores nil.
#
# @param file_paths [#each] the paths to create entries for
def file_options_hash=(file_paths)
  options_by_path = Hash.new do |table, file|
    table[file] = Hash.new { |entry, key| entry[key] = nil }
  end

  file_paths.each do |path|
    entry = options_by_path[path]
    entry[:file_options]  = @file_options
    entry[:parse_options] = @parse_options
  end

  @file_options_hash = options_by_path
end

#parse_directory(option) ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/rstore/file_crawler.rb', line 91

# Lists the files of type @file_type below the current working directory.
#
# Directories whose names happen to match the glob pattern (for example a
# folder called "archive.csv") are left out of the result.
#
# @param option truthy to search sub-directories recursively, falsy to
#   look at the current directory only
# @return [Array<String>] matching paths, relative to the current working
#   directory
def parse_directory(option)
  pattern = option ? "**/*.{#{@file_type}}" : "*.{#{@file_type}}"
  # reject (not each) so that the directory filter actually shapes the result
  Dir.glob(pattern).reject { |entry| File.directory?(entry) }
end

#verify_and_format_url(url) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/rstore/file_crawler.rb', line 113

# Checks that +url+ can be opened and returns the form of it that worked.
#
# The address is tried as given. On failure it is rewritten and tried again:
# a leading "www" gets an "http://" prefix, and a leading "http:" scheme is
# upgraded to "https:". Each rewrite can apply only once, so at most three
# attempts are made.
#
# @param url [String] the address to verify
# @return [String] the address that was opened successfully
# @raise [ArgumentError] when no variant of +url+ can be opened; the message
#   quotes the original, unmodified +url+
def verify_and_format_url(url)
  address = url
  begin # separate 'begin' block so that the error message can quote the original, unchanged url
    # NOTE(review): Kernel#open runs a sub-process for strings starting with
    # "|", and only fetches URLs while open-uri patches it (no longer the
    # case from Ruby 3.0 on, where URI.open is the replacement) - confirm the
    # supported Ruby versions and switch if possible.
    open(address) {} # block form closes the handle again right away
    address
  rescue
    case address
    when /\Awww/ # open-uri does not recognize URLs starting with 'www'
      address = "http://#{address}"
      retry
    when /\Ahttp:/ # open-uri does not redirect from http to https on a valid https URL
      # Replace the scheme only; "http" inside the host, path or query must stay untouched.
      address = address.sub(/\Ahttp:/, 'https:')
      retry
    else
      raise ArgumentError, "Could not connect to #{url}. Please check if this URL is correct."
    end
  end
end