Class: ExtractDates
- Inherits:
-
Object
- Object
- ExtractDates
- Defined in:
- lib/extractdates.rb
Instance Method Summary collapse
-
#addItem(date, append, title, description, blob, regex) ⇒ Object
Adds an item to the hash.
- #chunk(append) ⇒ Object
-
#dateExtract(blob, append, title, description) ⇒ Object
Finds matches for date formats in the blob from chunk(append).
-
#initialize(text) ⇒ ExtractDates
constructor
A new instance of ExtractDates.
Constructor Details
#initialize(text) ⇒ ExtractDates
Returns a new instance of ExtractDates.
8 9 10 11 |
# File 'lib/extractdates.rb', line 8
# Creates an extractor for the given text.
#
# @param text [String] the raw text that #chunk will later scan for dates
# @return [ExtractDates] a new instance with an empty result list
def initialize(text)
  @text = text
  # Accumulates one hash per extracted date; returned by #chunk.
  @output = []
end
Instance Method Details
#addItem(date, append, title, description, blob, regex) ⇒ Object
Adds an item to the hash
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/extractdates.rb', line 68
# Records one extracted date in @output (unless an equivalent entry is
# already there), removes the matched text from +blob+, and continues
# scanning the remainder of +blob+ via dateExtract.
#
# @param date [Object] the parsed date value stored under :parsed_date
# @param append [Hash, nil] extra key/value pairs copied into the entry;
#   nil or the sentinel {nil=>nil} means "nothing to append"
# @param title [Object] stored under :short_chunk
# @param description [Object] passed through unchanged to dateExtract
# @param blob [String] text being scanned; MUTATED in place by slice!
# @param regex [String] the matched date text, stored under :raw_date and
#   removed from +blob+
# @return [Object] whatever the follow-up dateExtract call returns
def addItem(date, append, title, description, blob, regex)
  entry = {
    parsed_date: date,
    raw_date: regex,
    short_chunk: title
  }

  # Append fields specified. A nil +append+ is treated like the {nil=>nil}
  # sentinel instead of raising NoMethodError on nil.each.
  unless append.nil? || append == { nil => nil }
    append.each { |key, value| entry[key] = value }
  end

  # Two entries are duplicates when both the parsed date and the string form
  # of the chunk agree.
  duplicate = @output.any? do |existing|
    existing[:parsed_date] == entry[:parsed_date] &&
      existing[:short_chunk].to_s == entry[:short_chunk].to_s
  end
  @output.push(entry) unless duplicate

  # Remove the first occurrence of the matched text so the next pass finds
  # the following date. NOTE: this mutates +blob+ in place; if +title+ is the
  # same String object, the stored :short_chunk changes with it.
  blob.slice!(regex)
  dateExtract(blob, append, title, description)
end
#chunk(append) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/extractdates.rb', line 13
# Splits @text into segments and runs date extraction on each one.
#
# Segmentation is delegated to paragraph(...).segment, which is defined
# outside this listing (presumably by a text-segmentation library mixed into
# the class — confirm against the file's requires).
#
# @param append [Hash] extra fields forwarded to dateExtract for every match
# @return [Array<Hash>] the accumulated @output list
def chunk(append)
  # Nothing to scan: nil text is handled like empty text instead of raising.
  return @output if @text.nil? || @text.empty?

  full_text = @text
  paragraph(full_text).segment.each do |segment|
    # The segment is both the text to scan and the "title" stored with each
    # match; the full text is passed along as the description.
    dateExtract(segment, append, segment, full_text)
  end

  @output
end
#dateExtract(blob, append, title, description) ⇒ Object
Finds matches for date formats in the blob from chunk(append)
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/extractdates.rb', line 26
# Finds the first date-like substring in +blob+ and hands it to addItem,
# which records it, removes it from the text and calls back into this method
# so every date in the blob is eventually visited.
#
# Formats are tried in order of specificity; only the first matching branch
# runs per call.
#
# @param blob [#to_s] text to scan; it is copied, so the caller's object is
#   never modified
# @param append [Hash] extra fields forwarded to addItem
# @param title [Object] forwarded to addItem (stored with each match)
# @param description [Object] forwarded to addItem
# @return [nil] results are accumulated by addItem, not returned
def dateExtract(blob, append, title, description)
  # Work on a private copy: addItem removes each match with slice!, and
  # without the copy that would mutate the caller's string (and +title+ too
  # when it is the same object, as it is when called from chunk).
  blobstring = blob.to_s.dup

  # yyyy-mm-dd / yyyy/mm/dd (month can only start with 0-1).
  if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
    save = Regexp.last_match.to_s
    saveparse = save.gsub("-", "/") # Needed for american_date gem
    addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)

  # mm-dd-yyyy, mm/dd/yy, and similar. m or d may start with an optional 0-3.
  # Year can only start with 19 or 20 if it is four chars, or it can be 2 chars.
  elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
    save = Regexp.last_match.to_s
    saveparse = save.gsub("-", "/")
    addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)

  # Day before month, two- or four-digit year.
  # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy, ddthmonthyyyy.
  # (October was missing from the month list and has been added.)
  elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
    save = Regexp.last_match.to_s
    addItem(Date.parse(save).to_s, append, title, description, blobstring, save)

  # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy,
  # Month ddth, yyyy. Month can be full or abbreviated, the day is optional
  # with an optional th/st/nd/rd, and the year must start with 19 or 20.
  # Case insensitive, optional/variable spaces.
  elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
    save = Regexp.last_match.to_s
    addItem(Date.parse(save).to_s, append, title, description, blobstring, save)

  # Matches: yyyy on word boundaries, starting with 19 or 20.
  # NOTE(review): this branch passes a Date object while every other branch
  # passes a String — kept as-is for compatibility; confirm whether intended.
  elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
    save = Regexp.last_match.to_s
    addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
  end
rescue StandardError
  # Best-effort extraction: a date-like string that fails to parse (or any
  # error raised further down the addItem/dateExtract chain) ends scanning of
  # this blob quietly instead of aborting the whole run.
  nil
end