Class: Mindee::Input::Source::LocalInputSource

Inherits:
Object
  • Object
show all
Defined in:
lib/mindee/input/sources.rb

Overview

Base class for loading documents.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(io_stream, filename, fix_pdf: false) ⇒ LocalInputSource

Returns a new instance of LocalInputSource.

Parameters:

  • io_stream (StringIO)
  • filename (String)
  • fix_pdf (Boolean) (defaults to: false)

Raises:

  • (InvalidMimeTypeError)

57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/mindee/input/sources.rb', line 57

# Builds a local input source and validates the mimetype of its content.
#
# @param io_stream [StringIO] stream holding the document.
# @param filename [String] name of the document; passed to Marcel as a
#   detection hint unless fix_pdf is set.
# @param fix_pdf [Boolean] when true, the mimetype is detected from the stream
#   alone and, if it is rejected for a '.pdf' file, rescue_broken_pdf is tried
#   before giving up.
# @raise [InvalidMimeTypeError] if the final mimetype is not in ALLOWED_MIME_TYPES.
def initialize(io_stream, filename, fix_pdf: false)
  @io_stream = io_stream
  @filename = filename
  @file_mimetype =
    if fix_pdf
      Marcel::MimeType.for(@io_stream)
    else
      Marcel::MimeType.for(@io_stream, name: @filename)
    end

  accepted = ALLOWED_MIME_TYPES.include?(@file_mimetype)
  if !accepted && filename.end_with?('.pdf') && fix_pdf
    # Second chance: repair the stream, then detect the mimetype again.
    rescue_broken_pdf(@io_stream)
    @file_mimetype = Marcel::MimeType.for(@io_stream)
    accepted = ALLOWED_MIME_TYPES.include?(@file_mimetype)
  end

  raise InvalidMimeTypeError, @file_mimetype.to_s unless accepted
end

Instance Attribute Details

#file_mimetype ⇒ String (readonly)

Returns:

  • (String)


50
51
52
# File 'lib/mindee/input/sources.rb', line 50

# Reader for the mimetype assigned to @file_mimetype by the constructor.
#
# @return [String]
def file_mimetype
  @file_mimetype
end

#filename ⇒ String (readonly)

Returns:

  • (String)


48
49
50
# File 'lib/mindee/input/sources.rb', line 48

# Reader for the file name given to the constructor.
#
# @return [String]
def filename
  @filename
end

#io_stream ⇒ StringIO (readonly)

Returns:

  • (StringIO)


52
53
54
# File 'lib/mindee/input/sources.rb', line 52

# Reader for the stream currently held in @io_stream.
#
# @return [StringIO]
def io_stream
  @io_stream
end

Instance Method Details

#count_pdf_pages ⇒ Object



122
123
124
125
126
127
128
# File 'lib/mindee/input/sources.rb', line 122

# Counts the pages of the document.
#
# @return [Integer] 1 when the source is not a PDF; otherwise the size of the
#   page list reported by Mindee::PDF::PdfProcessor.open_pdf.
def count_pdf_pages
  if pdf?
    # Hand the stream over from its beginning.
    @io_stream.seek(0)
    Mindee::PDF::PdfProcessor.open_pdf(@io_stream).pages.size
  else
    1
  end
end

#pdf? ⇒ Boolean

Shorthand for pdf mimetype validation.

Returns:

  • (Boolean)


94
95
96
# File 'lib/mindee/input/sources.rb', line 94

# Tells whether the stored mimetype is the PDF mimetype.
#
# @return [Boolean] true when @file_mimetype, as a string, is 'application/pdf'.
def pdf?
  mimetype = @file_mimetype.to_s
  mimetype == 'application/pdf'
end

#process_pdf(options) ⇒ Object

Parses a PDF file according to provided options.

Parameters:

  • options (Hash, nil)

    Page cutting/merge options:

    • :page_indexes Zero-based list of page indexes.
    • :operation Operation to apply on the document, given the `page_indexes` specified:
      • :KEEP_ONLY - keep only the specified pages, and remove all others.
      • :REMOVE - remove the specified pages, and keep all others.
    • :on_min_pages Apply the operation only if document has at least this many pages.


106
107
108
109
# File 'lib/mindee/input/sources.rb', line 106

# Runs the stream through PdfProcessor.parse and keeps the result as the new
# @io_stream.
#
# @param options [Hash, nil] page cutting/merge options, forwarded unchanged.
# @return [Object] whatever PdfProcessor.parse returned.
def process_pdf(options)
  # The processor is given the stream from its beginning.
  @io_stream.seek(0)
  processed = PdfProcessor.parse(@io_stream, options)
  @io_stream = processed
end

#read_document(close: true) ⇒ Array<String, [String, aBinaryString ], [Hash, nil] >

Reads a document.

Parameters:

  • close (Boolean) (defaults to: true)

Returns:

  • (Array<String, [String, aBinaryString ], [Hash, nil] >)


114
115
116
117
118
119
120
# File 'lib/mindee/input/sources.rb', line 114

# Reads the whole stream from its beginning and packages it for sending.
#
# @param close [Boolean] close the stream once it has been read.
# @return [Array] a three-element array: the literal 'document', the data
#   read from the stream, and a hash whose :filename is the file name passed
#   through Mindee::Input::Source.convert_to_unicode_escape.
def read_document(close: true)
  @io_stream.seek(0)
  contents = @io_stream.read
  @io_stream.close if close

  escaped_name = Mindee::Input::Source.convert_to_unicode_escape(@filename)
  ['document', contents, { filename: escaped_name }]
end

#rescue_broken_pdf(stream) ⇒ Object

Attempts to fix PDF files if the mimetype is rejected. "Broken PDFs" are often the result of a third party injecting invalid headers. This attempts to strip those headers so that the file can still be sent.

Parameters:

  • stream (StringIO)

Raises:

  • (UnfixablePDFError)

81
82
83
84
85
86
87
88
89
90
91
# File 'lib/mindee/input/sources.rb', line 81

# Attempts to repair a PDF whose mimetype was rejected, by dropping every byte
# that precedes the '%PDF-' marker and replacing @io_stream with a fresh
# StringIO holding the remainder.
#
# The search always starts at the beginning of the stream, so the outcome does
# not depend on where an earlier reader left the cursor.
#
# @param stream [StringIO] stream holding the possibly broken PDF.
# @raise [UnfixablePDFError] if the marker is not found, if nothing follows
#   it, or if it ends more than 500 bytes into the stream.
def rescue_broken_pdf(stream)
  # Both the marker search and the 500-byte limit are relative to offset 0.
  stream.seek(0)
  stream.gets('%PDF-')
  raise UnfixablePDFError if stream.eof? || stream.pos > 500

  # Step back over the 5-byte marker so the copy starts with '%PDF-'.
  stream.pos -= 5
  data = stream.read
  @io_stream.close

  @io_stream = StringIO.new
  @io_stream << data
end