Class: P3

Inherits:
Object
  • Object
show all
Defined in:
lib/arxiv/references/P3.rb

Constant Summary collapse

# Root URL of the arXiv site.
BASE_URL =
'https://arxiv.org'
# Matches a "reference" / "references" heading in any letter case, optionally
# preceded by newlines and optionally followed by spaces or newlines, that
# ends a line. fetchReference uses it to find the pages carrying the heading.
REFERENCE_START_REGEXP =
Regexp.new('\n*[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]?( +|\n+)?$')
# Captures one bracketed citation marker: a one- or two-digit number such as
# "[7]" or "[42]", or any non-empty bracketed text (lazy match). fetchReference
# inserts a newline in front of every such marker.
REFERENCE_REGEXP =
Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')

Class Method Summary collapse

Class Method Details

.convertSingleColPdf(job_id, work_dir, file_name, use_dir) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/arxiv/references/P3.rb', line 54

# Runs the external `k2pdfopt` tool on +file_name+ inside a pseudo-terminal
# and returns the path where the converted PDF is expected.
#
# The tool's output is echoed to stdout line by line until a line containing
# "written" is seen (presumably k2pdfopt's completion message -- confirm
# against the installed version) or the terminal reaches end of file.
#
# @param job_id [String] job identifier, forwarded to getK2Pdf
# @param work_dir [String] working directory, forwarded to getK2Pdf
# @param file_name [String] path of the PDF to convert
# @param use_dir [Boolean] directory/flat layout switch, forwarded to getK2Pdf
# @return [String] the path computed by getK2Pdf
def self.convertSingleColPdf(job_id, work_dir, file_name, use_dir)
  # Argument-vector form: the path reaches k2pdfopt as exactly one argument
  # and never goes through a shell, so spaces or shell metacharacters in it
  # can neither split nor alter the command.
  PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name) do |reader, writer|
    writer.sync = true
    # Wait up to 10 seconds for the interactive menu prompt and answer it
    # with a bare newline.
    reader.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
      writer.puts "\n"
      writer.flush
    end
    until reader.eof?
      line = reader.gets
      print line
      break if line.include?('written')
    end
  end
  getK2Pdf(job_id, work_dir, use_dir)
end

.fetchFromPdfUrl(pdfUrl, work_dir = true, use_dir = true) ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/arxiv/references/P3.rb', line 104

# Downloads the PDF at +pdfUrl+, converts it with k2pdfopt, extracts its
# reference entries and removes the temporary artifacts again.
#
# Cleanup runs in an +ensure+ clause, so the work directory (or the two work
# files) are removed even when the download, the conversion or the parsing
# raises.
#
# NOTE(review): +work_dir+ defaults to +true+, which the path helpers
# interpolate as the literal directory name "true" -- confirm whether callers
# are expected to always pass a real directory.
#
# @param pdfUrl [String] location of the PDF to fetch
# @param work_dir [String] directory that receives the temporary files
# @param use_dir [Boolean] when truthy, work inside a per-job sub-directory;
#   otherwise use job-prefixed files directly inside +work_dir+
# @return [Object] whatever fetchReference returns for the converted PDF
def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
  job_id = makeId
  # Deliberately outside the protected region: if creating the directory
  # fails, the cleanup below must not delete a directory this call does not own.
  makeDir(job_id, work_dir) if use_dir

  begin
    file_name = makeFile(job_id, work_dir, use_dir)
    fetchPdfFile(pdfUrl, file_name)
    executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    fetchReference(executed_pdf)
  ensure
    if use_dir
      removeDir(job_id, work_dir)
    else
      begin
        removeFile(job_id, work_dir)
      rescue Errno::ENOENT
        # A run that failed part-way may not have produced every file; a
        # missing file here must not replace the error that is propagating.
      end
    end
  end
end

.fetchPdfFile(pdfUrl, file_name) ⇒ Object



46
47
48
49
50
51
52
# File 'lib/arxiv/references/P3.rb', line 46

# Downloads the document at +pdfUrl+ and stores it, in binary mode, at
# +file_name+.
#
# The URL is opened through open-uri's URI#open instead of Kernel#open:
# Kernel#open treats a string starting with "|" as a command to run, and on
# Ruby 3.0+ it no longer opens URLs at all. The remote side is opened first,
# so a failed download does not leave an empty output file behind.
#
# @param pdfUrl [String, URI] URL of the PDF (a scheme open-uri can open)
# @param file_name [String] destination path; created or truncated
# @return [Integer] number of bytes copied
# @raise [URI::InvalidURIError] when +pdfUrl+ is not a parsable URI
# @raise [NoMethodError] when the URI scheme cannot be opened by open-uri
def self.fetchPdfFile(pdfUrl, file_name)
  URI(pdfUrl).open do |remote|
    File.open(file_name, 'wb') do |out|
      IO.copy_stream(remote, out)
    end
  end
end

.fetchReference(file_name) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/arxiv/references/P3.rb', line 71

# Extracts the reference entries from the PDF at +file_name+.
#
# Steps:
# 1. Find the lowest-numbered page whose text matches REFERENCE_START_REGEXP.
# 2. Take the text of that page and every later page, collapsing blank-line
#    runs and space runs and removing "-\n " hyphenation breaks.
# 3. Drop the first of those pages, join the rest, put a newline in front of
#    every REFERENCE_REGEXP marker, split on blank lines and keep the pieces
#    longer than 15 characters.
#
# NOTE(review): step 3 discards the page that carries the heading itself, so
# entries printed on that same page are not returned -- confirm this is
# intended for k2pdfopt output.
#
# @param file_name [String] path of the PDF to read
# @return [Array<String>] one string per reference entry; empty when no page
#   contains a references heading
def self.fetchReference(file_name)
  pages = PDF::Reader.new(file_name).pages

  first_ref_page = pages.
    select { |page| page.text.index(REFERENCE_START_REGEXP) }.
    map(&:number).
    min
  # Without a heading there is nothing to extract; comparing page numbers
  # against nil below would raise ArgumentError.
  return [] unless first_ref_page

  texts = pages.
    select { |page| page.number >= first_ref_page }.
    map { |page| page.text.gsub(/\n\n+/, "\n").gsub(/ +/, ' ').gsub(/-\n +/, '') }
  texts.shift

  texts.
    join(' ').
    gsub(REFERENCE_REGEXP, "\n\\1").
    split(/\n *\n/).
    map { |entry| entry.gsub("\n", '') }.
    select { |entry| entry.length > 15 }
end

.getK2Pdf(id, work_dir, use_dir) ⇒ Object



33
34
35
36
37
38
39
# File 'lib/arxiv/references/P3.rb', line 33

# Builds the path of the k2pdfopt output PDF for a job.
#
# @param id [String] job identifier
# @param work_dir [String] working directory
# @param use_dir [Boolean] truthy: file inside the per-job sub-directory;
#   falsy: job-prefixed file directly inside +work_dir+
# @return [String] the output path
def self.getK2Pdf(id, work_dir, use_dir)
  use_dir ? "#{work_dir}/#{id}/output_k2opt.pdf" : "#{work_dir}/#{id}-output_k2opt.pdf"
end

.makeDir(id, work_dir) ⇒ Object



17
18
19
# File 'lib/arxiv/references/P3.rb', line 17

# Creates the per-job working directory "<work_dir>/<id>".
#
# @param id [String] job identifier
# @param work_dir [String] parent directory
# @return [Integer] the result of Dir.mkdir
def self.makeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  Dir.mkdir(job_dir)
end

.makeFile(id, work_dir, use_dir) ⇒ Object



25
26
27
28
29
30
31
# File 'lib/arxiv/references/P3.rb', line 25

# Builds the path under which the downloaded PDF of a job is stored. Only the
# path string is computed; nothing is created on disk.
#
# @param id [String] job identifier
# @param work_dir [String] working directory
# @param use_dir [Boolean] truthy: file inside the per-job sub-directory;
#   falsy: job-prefixed file directly inside +work_dir+
# @return [String] the download path
def self.makeFile(id, work_dir, use_dir)
  return "#{work_dir}/#{id}-output.pdf" unless use_dir

  "#{work_dir}/#{id}/output.pdf"
end

.makeId ⇒ Object



13
14
15
# File 'lib/arxiv/references/P3.rb', line 13

# Generates a job identifier: the SHA-256 hex digest of a seed string.
#
# The seed combines the current time down to nanoseconds, the process id and
# a random number. A timestamp with one-second resolution alone would give
# every job started within the same second the same identifier, and therefore
# the same work directory or work files.
#
# @return [String] 64 lowercase hexadecimal characters
def self.makeId
  seed = "#{Time.now.strftime('%F %H:%M:%S.%N')}-#{Process.pid}-#{rand}"
  Digest::SHA256.hexdigest(seed)
end

.removeDir(id, work_dir) ⇒ Object



21
22
23
# File 'lib/arxiv/references/P3.rb', line 21

# Recursively removes the per-job working directory "<work_dir>/<id>" and
# everything inside it, using FileUtils.rm_rf.
#
# @param id [String] job identifier
# @param work_dir [String] parent directory
# @return [Object] the result of FileUtils.rm_rf
def self.removeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  FileUtils.rm_rf(job_dir)
end

.removeFile(id, work_dir) ⇒ Object



41
42
43
44
# File 'lib/arxiv/references/P3.rb', line 41

# Deletes the two flat-layout work files of a job: the downloaded PDF and the
# k2pdfopt output. The files are deleted in that order, so a missing first
# file raises before the second one is touched.
#
# @param id [String] job identifier
# @param work_dir [String] directory holding the files
# @return [Integer] the result of the second File.delete call
# @raise [Errno::ENOENT] when either file does not exist
def self.removeFile(id, work_dir)
  prefix = "#{work_dir}/#{id}"
  File.delete("#{prefix}-output.pdf")
  File.delete("#{prefix}-output_k2opt.pdf")
end