Class: ExtractDates

Inherits:
Object
  • Object
show all
Defined in:
lib/extractdates.rb

Instance Method Summary collapse

Constructor Details

#initialize(text) ⇒ ExtractDates

Returns a new instance of ExtractDates.



8
9
10
11
# File 'lib/extractdates.rb', line 8

# Stores the text to scan and starts with an empty result list.
#
# @param text [String] the raw text that #chunk will later scan
def initialize(text)
  @text = text
  @output = []
end

Instance Method Details

#addItem(date, append, title, description, blob, regex) ⇒ Object

Adds an item (a hash describing one extracted date) to the output, unless an equal item is already present



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/extractdates.rb', line 68

# Records one extracted date in @output (unless an equivalent entry already
# exists) and then continues scanning the rest of the blob.
#
# @param date [Object] the parsed date, stored under :parsed_date
# @param append [Hash, nil] extra key/value pairs merged into the entry;
#   nil or the sentinel { nil => nil } means "nothing to append"
# @param title [String] the text chunk, stored under :short_chunk
# @param description [String] only forwarded to dateExtract
# @param blob [String] the text the date was found in; it is NOT modified
# @param regex [String] the matched date text (despite the name), stored
#   under :raw_date and removed from the blob before scanning continues
# @return [Object] whatever the follow-up dateExtract call returns
def addItem(date, append, title, description, blob, regex)
  entry = {
    parsed_date: date,
    raw_date: regex,
    short_chunk: title
  }

  # Append fields specified. A nil append is treated like the empty sentinel
  # instead of raising NoMethodError on nil.each.
  unless append.nil? || append == { nil => nil }
    append.each { |key, value| entry[key] = value }
  end

  # An entry is a duplicate when both the parsed date and the chunk text match.
  duplicate = @output.any? do |existing|
    existing[:parsed_date] == entry[:parsed_date] &&
      existing[:short_chunk].to_s == entry[:short_chunk].to_s
  end
  @output.push(entry) unless duplicate

  # Remove the first occurrence of the matched text on a copy. Mutating blob
  # in place would also alter title when the caller passes the same String
  # object for both, and would raise on frozen strings.
  remaining = blob.sub(regex, "")
  dateExtract(remaining, append, title, description)
end

#chunk(append) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/extractdates.rb', line 13

# Splits the stored text into segments and runs date extraction on each one.
#
# Each segment is passed to dateExtract both as the blob to scan and as the
# title, with the full text as the description. The segmentation itself comes
# from paragraph(...).segment, which is defined outside this view.
#
# @param append [Hash] extra fields forwarded to dateExtract for every segment
# @return [Array] the accumulated @output list (untouched when the text is empty)
def chunk(append)
  return @output if @text.empty?

  full_text = @text
  paragraph(full_text).segment.each do |piece|
    dateExtract(piece, append, piece, full_text)
  end

  @output
end

#dateExtract(blob, append, title, description) ⇒ Object

Finds matches for date formats in the blob from chunk(append)



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/extractdates.rb', line 26

# Finds the first date-like substring in the blob and hands it to addItem.
#
# Patterns are tried in priority order; only the first pattern that matches
# is used for this call. addItem removes the matched text and calls back into
# this method, which is how further dates in the same blob are found.
#
# @param blob [#to_s] text to scan
# @param append [Hash] extra fields, forwarded unchanged to addItem
# @param title [String] forwarded unchanged to addItem
# @param description [String] forwarded unchanged to addItem
# @return [Object, nil] the result of addItem, or nil when nothing matched or
#   an error occurred while parsing
def dateExtract(blob, append, title, description)
  blobstring = blob.to_s

  # Full month name or abbreviation. October was previously missing, so
  # "5 October 2019" degraded to a bare-year match; "Sept" is accepted too.
  month = '(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|' \
          'Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
  # Day of month with an optional ordinal suffix.
  day = '(?:[0-3]?[0-9])(?:st|nd|rd|th)?'

  # yyyy-mm-dd or yyyy/mm/dd (year starts with 19 or 20, month starts with 0-1)
  if (found = blobstring.match(%r{(?:19|20)\d{2}[-/][0-1]?[0-9][-/][0-3]?[0-9]}))
    raw = found.to_s
    parsed = Date.parse(raw.tr('-', '/')).to_s # slashes needed for american_date gem

  # mm-dd-yyyy, mm/dd/yy, and similar. m and d may start with 0-3 (optional).
  # Year must start with 19 or 20 if it has four digits, or it may have two digits.
  elsif (found = blobstring.match(%r{[0-3]?[0-9][-/][0-3]?[0-9][-/](?:(?:19|20)\d{2}|\d{2})}))
    raw = found.to_s
    parsed = Date.parse(raw.tr('-', '/')).to_s

  # Day before month, two- or four-digit year.
  # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy, ddthmonthyyyy
  elsif (found = blobstring.match(/(?:#{day}) *#{month} *(?:19|20)?\d{2}/i))
    raw = found.to_s
    parsed = Date.parse(raw).to_s

  # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy
  # Optional dd with one optional st/nd/rd/th, yyyy starting with 19 or 20.
  # Case insensitive, optional/variable spaces.
  elsif (found = blobstring.match(/#{month} *(?:#{day})?,? *(?:19|20)\d{2}/i))
    raw = found.to_s
    parsed = Date.parse(raw).to_s

  # Matches: yyyy
  # Must start and end on a word boundary; year starts with 19 or 20 and has 4 digits.
  elsif (found = blobstring.match(/\b(?:19|20)\d{2}\b/))
    raw = found.to_s
    # January 1st of that year, as an ISO string like every other branch
    # (previously a Date object was passed here).
    parsed = Date.new(raw.to_i).to_s
  else
    return nil
  end

  addItem(parsed, append, title, description, blobstring, raw)
rescue StandardError
  # Best effort: a match that cannot be parsed as a real date (e.g. month 13)
  # ends extraction for this blob instead of aborting the whole run.
  nil
end