Class: P3

Inherits:
Object
  • Object
show all
Defined in:
lib/arxiv/references/P3.rb

Constant Summary collapse

# Root URL of the arXiv site. Not referenced by the methods shown on this page.
BASE_URL =
"https://arxiv.org"
# Matches the word "reference" or "references" in any letter case, optionally
# preceded by newlines and followed by spaces or newlines, at the end of a
# line. fetchReference uses it to find the pages that contain such a heading.
REFERENCE_START_REGEXP =
Regexp.new('\n*[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]?( +|\n+)?$')
# Matches a bracketed marker such as "[7]" or "[12]", or any other shortest
# bracketed run of text. fetchReference inserts a newline before each match.
REFERENCE_REGEXP =
Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')

Class Method Summary collapse

Class Method Details

.convertSingleColPdf(job_id, work_dir, file_name, use_dir) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/arxiv/references/P3.rb', line 52

# Runs the external `k2pdfopt` tool on +file_name+ inside a pseudo-terminal,
# answers its interactive prompt with a newline, echoes the tool's output, and
# stops reading once a line containing "written" is seen.
#
# @param job_id [String] job identifier, forwarded to getK2Pdf
# @param work_dir [String] working directory, forwarded to getK2Pdf
# @param file_name [String] path of the PDF handed to k2pdfopt
# @param use_dir [Boolean] path layout flag, forwarded to getK2Pdf
# @return [String] the path computed by getK2Pdf for the converted PDF
def self.convertSingleColPdf(job_id, work_dir, file_name, use_dir)
  # Pass the arguments as a list so the path is handed to k2pdfopt verbatim
  # instead of being parsed by a shell (spaces or metacharacters in the path
  # would otherwise break or alter the command).
  PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name) do |reader, writer|
    writer.sync = true
    # Wait up to 10 seconds for the interactive menu, then accept the defaults.
    reader.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
      writer.puts "\n"
      writer.flush
    end

    begin
      # gets returns nil at end of stream, which ends the loop cleanly.
      while (line = reader.gets)
        print line
        break if line.include?('written')
      end
    rescue Errno::EIO
      # On some platforms (notably Linux) reading from a pty whose child has
      # exited raises EIO instead of reporting EOF; treat it as end of output.
    end
  end
  getK2Pdf(job_id, work_dir, use_dir)
end

.fetchFromPdfUrl(pdfUrl, work_dir = true, use_dir = true) ⇒ Object



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/arxiv/references/P3.rb', line 98

# Downloads the PDF at +pdfUrl+, converts it with convertSingleColPdf, runs
# fetchReference on the converted file, and removes the working files.
#
# NOTE(review): the default +work_dir+ is +true+, which the path helpers
# interpolate into paths as the literal text "true" -- callers presumably
# always pass a real directory; confirm.
#
# @param pdfUrl [String] location of the PDF to download
# @param work_dir [String] directory that holds the temporary files
# @param use_dir [Boolean] when truthy, work inside a per-job subdirectory;
#   otherwise use id-prefixed files directly in +work_dir+
# @return [Object] whatever fetchReference returns for the converted PDF
def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
  job_id = makeId
  makeDir(job_id, work_dir) if use_dir
  file_name = makeFile(job_id, work_dir, use_dir)

  begin
    fetchPdfFile(pdfUrl, file_name)
    executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    fetchReference(executed_pdf)
  ensure
    # Clean up even when download, conversion or parsing raised, so failed
    # jobs do not leave files behind in work_dir.
    if use_dir
      removeDir(job_id, work_dir)
    else
      begin
        removeFile(job_id, work_dir)
      rescue Errno::ENOENT
        # A file may not exist yet if an earlier step failed; do not let the
        # cleanup error replace the exception that is already propagating.
      end
    end
  end
end

.fetchPdfFile(pdfUrl, file_name) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/arxiv/references/P3.rb', line 44

# Copies the resource at +pdfUrl+ into the local file +file_name+ (binary mode).
#
# Uses URI.open from open-uri: since Ruby 3.0 a bare Kernel#open no longer
# opens URLs. The body is streamed to disk rather than read fully into memory.
#
# @param pdfUrl [String] URL (or local path) of the PDF to fetch
# @param file_name [String] destination path; created or truncated
# @return [Integer] number of bytes written
def self.fetchPdfFile(pdfUrl, file_name)
  File.open(file_name, 'wb') do |destination|
    URI.open(pdfUrl) do |source|
      IO.copy_stream(source, destination)
    end
  end
end

.fetchReference(file_name) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/arxiv/references/P3.rb', line 69

# Extracts reference entries from the PDF at +file_name+.
#
# Finds the lowest-numbered page whose text matches REFERENCE_START_REGEXP,
# normalises the text of the pages from there on, puts a newline in front of
# every REFERENCE_REGEXP match, splits the text on blank lines and keeps the
# pieces longer than 15 characters.
#
# Previously the result of that pipeline was computed and thrown away (the raw
# page texts were returned instead), and a PDF without a references heading
# crashed on a comparison with nil.
#
# @param file_name [String] path of the PDF to read
# @return [Array<String>] extracted reference strings; empty when no page
#   matches REFERENCE_START_REGEXP
def self.fetchReference(file_name)
  pages = PDF::Reader.new(file_name).pages

  first_ref_page = pages.
    select { |page| page.text.index(REFERENCE_START_REGEXP) }.
    map(&:number).
    min
  # No references heading anywhere: nothing to extract.
  return [] if first_ref_page.nil?

  page_texts = pages.
    select { |page| page.number >= first_ref_page }.
    map { |page| page.text.gsub(/\n\n+/, "\n").gsub(/ +/, ' ').gsub(/-\n +/, '') }

  # NOTE(review): this drops the text of the first matching page (the one
  # containing the heading) entirely, exactly as the original code did.
  # Confirm this is intended -- entries on that page are not returned.
  page_texts.shift

  page_texts.
    join(' ').
    gsub(REFERENCE_REGEXP, "\n\\1").
    split(/\n *\n/).
    map { |entry| entry.gsub("\n", '') }.
    select { |entry| entry.length > 15 }
end

.getK2Pdf(id, work_dir, use_dir) ⇒ Object



32
33
34
35
36
37
38
# File 'lib/arxiv/references/P3.rb', line 32

# Builds the path of the k2pdfopt output file for a job.
#
# @param id [String] job identifier
# @param work_dir [String] working directory
# @param use_dir [Boolean] truthy: file inside the job's own subdirectory;
#   falsy: id-prefixed file directly in +work_dir+
# @return [String] path of the converted PDF
def self.getK2Pdf(id, work_dir, use_dir)
  return "#{work_dir}/#{id}-output_k2opt.pdf" unless use_dir

  "#{work_dir}/#{id}/output_k2opt.pdf"
end

.makeDir(id, work_dir) ⇒ Object



16
17
18
# File 'lib/arxiv/references/P3.rb', line 16

# Creates the per-job subdirectory "<work_dir>/<id>".
#
# @param id [String] job identifier, used as the directory name
# @param work_dir [String] existing parent directory
# @return [Integer] the result of Dir.mkdir
def self.makeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  Dir.mkdir(job_dir)
end

.makeFile(id, work_dir, use_dir) ⇒ Object



24
25
26
27
28
29
30
# File 'lib/arxiv/references/P3.rb', line 24

# Builds the path where a job's downloaded PDF is stored.
#
# @param id [String] job identifier
# @param work_dir [String] working directory
# @param use_dir [Boolean] truthy: file inside the job's own subdirectory;
#   falsy: id-prefixed file directly in +work_dir+
# @return [String] path of the downloaded PDF
def self.makeFile(id, work_dir, use_dir)
  return "#{work_dir}/#{id}-output.pdf" unless use_dir

  "#{work_dir}/#{id}/output.pdf"
end

.makeId ⇒ Object



12
13
14
# File 'lib/arxiv/references/P3.rb', line 12

# Generates an identifier for one job, used to name its working files.
#
# The seed combines a nanosecond timestamp, the process id and a random
# number. Hashing only a one-second-resolution timestamp gave identical ids
# to jobs started within the same second, so they shared (and deleted) each
# other's files.
#
# @return [String] 64-character lowercase hexadecimal SHA-256 digest
def self.makeId
  seed = "#{Time.now.strftime('%F %H:%M:%S.%N')}-#{Process.pid}-#{rand(2**64)}"
  Digest::SHA256.hexdigest(seed)
end

.removeDir(id, work_dir) ⇒ Object



20
21
22
# File 'lib/arxiv/references/P3.rb', line 20

# Recursively deletes the per-job subdirectory "<work_dir>/<id>".
# Relies on FileUtils.rm_rf, so a directory that does not exist is not an error.
#
# @param id [String] job identifier, i.e. the directory name
# @param work_dir [String] parent directory
# @return [Object] the result of FileUtils.rm_rf
def self.removeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  FileUtils.rm_rf(job_dir)
end

.removeFile(id, work_dir) ⇒ Object



40
41
42
43
# File 'lib/arxiv/references/P3.rb', line 40

# Deletes the two id-prefixed files of a job kept directly in +work_dir+:
# the downloaded PDF and the k2pdfopt output.
#
# @param id [String] job identifier
# @param work_dir [String] directory holding the files
# @return [Integer] the result of the last File.delete call
# @raise [Errno::ENOENT] if either file does not exist
def self.removeFile(id, work_dir)
  downloaded_pdf = "#{work_dir}/#{id}-output.pdf"
  converted_pdf = "#{work_dir}/#{id}-output_k2opt.pdf"
  File.delete(downloaded_pdf)
  File.delete(converted_pdf)
end