Module: Metanorma::Utils

Defined in:
lib/utils/image.rb,
lib/utils/cjk.rb,
lib/utils/log.rb,
lib/utils/xml.rb,
lib/utils/main.rb,
lib/utils/anchor.rb,
lib/utils/version.rb,
lib/utils/namespace.rb,
lib/utils/linestatus.rb,
lib/utils/hash_transform_keys.rb,
lib/utils/hash_transform_keys.rb

Overview

Image methods were moved to the Vectory gem

Defined Under Namespace

Modules: Array, Hash Classes: LineStatus, Log, Namespace

Constant Summary collapse

HAN =

Basic CJK scripts

"\\p{Han}".freeze
BOPOMOFO =
"\\p{Bopomofo}".freeze
HANGUL =
"\\p{Hangul}".freeze
HIRAGANA =
"\\p{Hiragana}".freeze
KATAKANA =
"\\p{Katakana}".freeze
CJK_SYMBOLS =

CJK Symbols and Punctuation (U+3000–U+303F) Used across all CJK scripts

"[\\u3000-\\u303F]".freeze
CJK_PUNCTUATION =

CJK Punctuation (subset of CJK Symbols commonly used)

"[\\u3001-\\u3003\\u3008-\\u3011\\u3014-\\u301F]".freeze
CJK_HALFWIDTH_FULLWIDTH =

Halfwidth and Fullwidth Forms (U+FF00–U+FFEF) Used in all CJK contexts

"[\\uFF00-\\uFFEF]".freeze
CJK_COMPAT =

CJK Compatibility Forms (U+FE30–U+FE4F) Primarily used with Han but relevant for all CJK

"[\\uFE30-\\uFE4F]".freeze
CJK_VERTICAL =

Vertical Forms (U+FE10–U+FE1F) Used in vertical text layout for all CJK

"[\\uFE10-\\uFE1F]".freeze
CJK_SMALL_FORMS =

Small Form Variants (U+FE50–U+FE6F) Used in all CJK contexts

"[\\uFE50-\\uFE6F]".freeze
HAN_IDC =

Ideographic Description Characters (U+2FF0–U+2FFF) Used with Han script

"[\\u2FF0-\\u2FFF]".freeze
KANBUN =

Kanbun (U+3190–U+319F) Used with Han script for Japanese

"[\\u3190-\\u319F]".freeze
CJK_COMPAT_IDEOGRAPHS =

CJK Compatibility (U+3300–U+33FF) Used with Han script

"[\\u3300-\\u33FF]".freeze
HAN_COMPAT_IDEOGRAPHS =

CJK Compatibility Ideographs (U+F900–U+FAFF)

"[\\uF900-\\uFAFF]".freeze
HAN_EXTENSIONS =

Script extensions by primary script

[
  HAN,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_COMPAT,
  CJK_VERTICAL,
  CJK_SMALL_FORMS,
  HAN_IDC,
  KANBUN,
  CJK_COMPAT_IDEOGRAPHS,
  HAN_COMPAT_IDEOGRAPHS
].join("|").freeze
HANGUL_EXTENSIONS =
[
  HANGUL,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_VERTICAL,
  CJK_SMALL_FORMS
].join("|").freeze
HIRAGANA_EXTENSIONS =
[
  HIRAGANA,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_VERTICAL,
  CJK_SMALL_FORMS
].join("|").freeze
KATAKANA_EXTENSIONS =
[
  KATAKANA,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_VERTICAL,
  CJK_SMALL_FORMS
].join("|").freeze
BOPOMOFO_EXTENSIONS =
[
  BOPOMOFO,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH
].join("|").freeze
CJK =

Combined CJK pattern including all script extensions

[
  HAN_EXTENSIONS,
  HANGUL_EXTENSIONS,
  HIRAGANA_EXTENSIONS,
  KATAKANA_EXTENSIONS,
  BOPOMOFO_EXTENSIONS
].join("|").freeze
NOKOHEAD =
<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE
LONGSTR_THRESHOLD =
10
LONGSTR_NOPUNCT =
2
STR_BREAKUP_RE =
%r{
 (?<=[=_—–\u2009→?+;]) | # break after any of these
 (?<=[,.:])(?!\d) | # break on punct only if not preceding digit
 (?<=[>])(?![>]) | # > not >->
 (?<=[\]])(?![\]]) | # ] not ]-]
 (?<=//) | # //
 (?<=[/])(?![/]) | # / not /-/
 (?<![<])(?=[<]) | # < not <-<
 (?<=\p{L})(?=[(\{\[]\p{L}) # letter and bracket, followed by letter
}x.freeze
CAMEL_CASE_RE =
%r{
  (?<=\p{Ll}\p{Ll})(?=\p{Lu}\p{Ll}\p{Ll}) # 2 lowerc / upperc, 2 lowerc
}x.freeze
BASECHAR =

Following XML requirements for NCName: www.w3.org/TR/xml-names/#NT-NCName

"A-Za-z\u{C0}-\u{D6}\u{D8}-\u{F6}\u{F8}-\u{FF}\u{100}-\u{131}\u{134}-\u{13E}" \
"\u{141}-\u{148}\u{14A}-\u{17E}\u{180}-\u{1C3}\u{1CD}-\u{1F0}\u{1F4}-\u{1F5}" \
"\u{1FA}-\u{217}\u{250}-\u{2A8}\u{2BB}-\u{2C1}\u{386}\u{388}-\u{38A}\u{38C}" \
"\u{38E}-\u{3A1}\u{3A3}-\u{3CE}\u{3D0}-\u{3D6}\u{3DA}\u{3DC}\u{3DE}\u{3E0}" \
"\u{3E2}-\u{3F3}\u{401}-\u{40C}\u{40E}-\u{44F}\u{451}-\u{45C}\u{45E}-\u{481}" \
"\u{490}-\u{4C4}\u{4C7}-\u{4C8}\u{4CB}-\u{4CC}\u{4D0}-\u{4EB}\u{4EE}-\u{4F5}" \
"\u{4F8}-\u{4F9}\u{531}-\u{556}\u{559}\u{561}-\u{586}\u{5D0}-\u{5EA}" \
"\u{5F0}-\u{5F2}\u{621}-\u{63A}\u{641}-\u{64A}\u{671}-\u{6B7}\u{6BA}-\u{6BE}" \
"\u{6C0}-\u{6CE}\u{6D0}-\u{6D3}\u{6D5}\u{6E5}-\u{6E6}\u{905}-\u{939}\u{93D}" \
"\u{958}-\u{961}\u{985}-\u{98C}\u{98F}-\u{990}\u{993}-\u{9A8}\u{9AA}-\u{9B0}" \
"\u{9B2}\u{9B6}-\u{9B9}\u{9DC}-\u{9DD}\u{9DF}-\u{9E1}\u{9F0}-\u{9F1}" \
"\u{A05}-\u{A0A}\u{A0F}-\u{A10}\u{A13}-\u{A28}\u{A2A}-\u{A30}\u{A32}-\u{A33}" \
"\u{A35}-\u{A36}\u{A38}-\u{A39}\u{A59}-\u{A5C}\u{A5E}\u{A72}-\u{A74}" \
"\u{A85}-\u{A8B}\u{A8D}\u{A8F}-\u{A91}\u{A93}-\u{AA8}\u{AAA}-\u{AB0}" \
"\u{AB2}-\u{AB3}\u{AB5}-\u{AB9}\u{ABD}\u{AE0}\u{B05}-\u{B0C}\u{B0F}-\u{B10}" \
"\u{B13}-\u{B28}\u{B2A}-\u{B30}\u{B32}-\u{B33}\u{B36}-\u{B39}\u{B3D}" \
"\u{B5C}-\u{B5D}\u{B5F}-\u{B61}\u{B85}-\u{B8A}\u{B8E}-\u{B90}\u{B92}-\u{B95}" \
"\u{B99}-\u{B9A}\u{B9C}\u{B9E}-\u{B9F}\u{BA3}-\u{BA4}\u{BA8}-\u{BAA}" \
"\u{BAE}-\u{BB5}\u{BB7}-\u{BB9}\u{C05}-\u{C0C}\u{C0E}-\u{C10}\u{C12}-\u{C28}" \
"\u{C2A}-\u{C33}\u{C35}-\u{C39}\u{C60}-\u{C61}\u{C85}-\u{C8C}\u{C8E}-\u{C90}" \
"\u{C92}-\u{CA8}\u{CAA}-\u{CB3}\u{CB5}-\u{CB9}\u{CDE}\u{CE0}-\u{CE1}" \
"\u{D05}-\u{D0C}\u{D0E}-\u{D10}\u{D12}-\u{D28}\u{D2A}-\u{D39}\u{D60}-\u{D61}" \
"\u{E01}-\u{E2E}\u{E30}\u{E32}-\u{E33}\u{E40}-\u{E45}\u{E81}-\u{E82}\u{E84}" \
"\u{E87}-\u{E88}\u{E8A}\u{E8D}\u{E94}-\u{E97}\u{E99}-\u{E9F}\u{EA1}-\u{EA3}" \
"\u{EA5}\u{EA7}\u{EAA}-\u{EAB}\u{EAD}-\u{EAE}\u{EB0}\u{EB2}-\u{EB3}\u{EBD}" \
"\u{EC0}-\u{EC4}\u{F40}-\u{F47}\u{F49}-\u{F69}\u{10A0}-\u{10C5}\u{10D0}-\u{10F6}" \
"\u{1100}\u{1102}-\u{1103}\u{1105}-\u{1107}\u{1109}\u{110B}-\u{110C}" \
"\u{110E}-\u{1112}\u{113C}\u{113E}\u{1140}\u{114C}\u{114E}\u{1150}" \
"\u{1154}-\u{1155}\u{1159}\u{115F}-\u{1161}\u{1163}\u{1165}\u{1167}\u{1169}" \
"\u{116D}-\u{116E}\u{1172}-\u{1173}\u{1175}\u{119E}\u{11A8}\u{11AB}" \
"\u{11AE}-\u{11AF}\u{11B7}-\u{11B8}\u{11BA}\u{11BC}-\u{11C2}\u{11EB}\u{11F0}" \
"\u{11F9}\u{1E00}-\u{1E9B}\u{1EA0}-\u{1EF9}\u{1F00}-\u{1F15}\u{1F18}-\u{1F1D}" \
"\u{1F20}-\u{1F45}\u{1F48}-\u{1F4D}\u{1F50}-\u{1F57}\u{1F59}\u{1F5B}\u{1F5D}" \
"\u{1F5F}-\u{1F7D}\u{1F80}-\u{1FB4}\u{1FB6}-\u{1FBC}\u{1FBE}\u{1FC2}-\u{1FC4}" \
"\u{1FC6}-\u{1FCC}\u{1FD0}-\u{1FD3}\u{1FD6}-\u{1FDB}\u{1FE0}-\u{1FEC}" \
"\u{1FF2}-\u{1FF4}\u{1FF6}-\u{1FFC}\u{2126}\u{212A}-\u{212B}\u{212E}" \
"\u{2180}-\u{2182}\u{3041}-\u{3094}\u{30A1}-\u{30FA}\u{3105}-\u{312C}" \
"\u{AC00}-\u{D7A3}".freeze
IDEOGRAPHIC =
"\u{4E00}-\u{9FA5}\u{3007}\u{3021}-\u{3029}".freeze
LETTER =
"#{BASECHAR}#{IDEOGRAPHIC}".freeze
DIGIT =
"0-9\u{0660}-\u{0669}\u{06F0}-\u{06F9}\u{0966}-\u{096F}\u{09E6}-\u{09EF}" \
"\u{0A66}-\u{0A6F}\u{0AE6}-\u{0AEF}\u{0B66}-\u{0B6F}\u{0BE7}-\u{0BEF}" \
"\u{0C66}-\u{0C6F}\u{0CE6}-\u{0CEF}\u{0D66}-\u{0D6F}\u{0E50}-\u{0E59}" \
"\u{0ED0}-\u{0ED9}\u{0F20}-\u{0F29}".freeze
COMBINING_CHAR =
"\u{0300}-\u{0345}\u{0360}-\u{0361}\u{0483}-\u{0486}\u{0591}-\u{05A1}" \
"\u{05A3}-\u{05B9}\u{05BB}-\u{05BD}\u{05BF}\u{05C1}-\u{05C2}\u{05C4}" \
"\u{064B}-\u{0652}\u{0670}\u{06D6}-\u{06DC}\u{06DD}-\u{06DF}" \
"\u{06E0}-\u{06E4}\u{06E7}-\u{06E8}\u{06EA}-\u{06ED}\u{0901}-\u{0903}" \
"\u{093C}\u{093E}-\u{094C}\u{094D}\u{0951}-\u{0954}\u{0962}-\u{0963}" \
"\u{0981}-\u{0983}\u{09BC}\u{09BE}\u{09BF}\u{09C0}-\u{09C4}" \
"\u{09C7}-\u{09C8}\u{09CB}-\u{09CD}\u{09D7}\u{09E2}-\u{09E3}\u{0A02}" \
"\u{0A3C}\u{0A3E}\u{0A3F}\u{0A40}-\u{0A42}\u{0A47}-\u{0A48}" \
"\u{0A4B}-\u{0A4D}\u{0A70}-\u{0A71}\u{0A81}-\u{0A83}\u{0ABC}" \
"\u{0ABE}-\u{0AC5}\u{0AC7}-\u{0AC9}\u{0ACB}-\u{0ACD}\u{0B01}-\u{0B03}" \
"\u{0B3C}\u{0B3E}-\u{0B43}\u{0B47}-\u{0B48}\u{0B4B}-\u{0B4D}" \
"\u{0B56}-\u{0B57}\u{0B82}-\u{0B83}\u{0BBE}-\u{0BC2}\u{0BC6}-\u{0BC8}" \
"\u{0BCA}-\u{0BCD}\u{0BD7}\u{0C01}-\u{0C03}\u{0C3E}-\u{0C44}" \
"\u{0C46}-\u{0C48}\u{0C4A}-\u{0C4D}\u{0C55}-\u{0C56}\u{0C82}-\u{0C83}" \
"\u{0CBE}-\u{0CC4}\u{0CC6}-\u{0CC8}\u{0CCA}-\u{0CCD}\u{0CD5}-\u{0CD6}" \
"\u{0D02}-\u{0D03}\u{0D3E}-\u{0D43}\u{0D46}-\u{0D48}\u{0D4A}-\u{0D4D}" \
"\u{0D57}\u{0E31}\u{0E34}-\u{0E3A}\u{0E47}-\u{0E4E}\u{0EB1}" \
"\u{0EB4}-\u{0EB9}\u{0EBB}-\u{0EBC}\u{0EC8}-\u{0ECD}\u{0F18}-\u{0F19}" \
"\u{0F35}\u{0F37}\u{0F39}\u{0F3E}\u{0F3F}\u{0F71}-\u{0F84}" \
"\u{0F86}-\u{0F8B}\u{0F90}-\u{0F95}\u{0F97}\u{0F99}-\u{0FAD}" \
"\u{0FB1}-\u{0FB7}\u{0FB9}\u{20D0}-\u{20DC}\u{20E1}\u{302A}-\u{302F}" \
"\u{3099}\u{309A}".freeze
EXTENDER =
"\u{00B7}\u{02D0}\u{02D1}\u{0387}\u{0640}\u{0E46}\u{0EC6}\u{3005}" \
"\u{3031}-\u{3035}\u{309D}-\u{309E}\u{30FC}-\u{30FE}".freeze
NCNAME_START_CHAR =

NCName specific constants - NCName is “an XML Name, minus the :” NCName = (Letter | ‘_’) (NCNameChar)*

"#{LETTER}_".freeze
NCNAME_CHAR =
"#{LETTER}#{DIGIT}._\\-#{COMBINING_CHAR}#{EXTENDER}".freeze
INVALID_NCNAME_START_REGEXP =
/[^#{NCNAME_START_CHAR}]/.freeze
INVALID_NCNAME_CHAR_REGEXP =
/[^#{NCNAME_CHAR}]/.freeze
SAFE_NCNAME_REGEXP =
/\A[#{NCNAME_START_CHAR}][#{NCNAME_CHAR}]*\z/.freeze
NCNAME_INVALID =
"_".freeze
VERSION =
"1.11.7".freeze

Class Method Summary collapse

Class Method Details

.anchor_attributes(presxml: false) ⇒ Object

all element/attribute pairs that are ID anchors in Metanorma



119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/utils/anchor.rb', line 119

# All element/attribute pairs that are ID anchors in Metanorma.
#
# @param presxml [Boolean] when true, the second list of pairs (fn, semx
#   and the fmt-* elements) is appended to the base list
# @return [Array<Array<String>>] [element name, attribute name] pairs
def anchor_attributes(presxml: false)
  base = [
    %w(annotation from), %w(annotation to), %w(callout target),
    %w(xref to), %w(eref bibitemid), %w(citation bibitemid),
    %w(xref target), %w(label for), %w(location target),
    %w(index to), %w(termsource bibitemid), %w(admonition target)
  ]
  return base unless presxml

  base + [
    %w(fn target), %w(semx source), %w(fmt-title source),
    %w(fmt-xref to), %w(fmt-xref target), %w(fmt-eref bibitemid),
    %w(fmt-xref-label container), %w(fmt-fn-body target),
    %w(fmt-annotation-body from), %w(fmt-annotation-body to),
    %w(fmt-annotation-start source), %w(fmt-annotation-start end),
    %w(fmt-annotation-start target), %w(fmt-annotation-end source),
    %w(fmt-annotation-end start), %w(fmt-annotation-end target)
  ]
end

.anchor_or_uuid(node = nil) ⇒ Object



113
114
115
116
# File 'lib/utils/anchor.rb', line 113

# Return the id of +node+, or a fresh "_"-prefixed random UUID when +node+
# is nil or its id is nil or empty.
#
# The UUID is only generated when it is actually needed, so nodes that
# already carry an id do not pay for random UUID creation.
#
# @param node [#id, nil] object responding to +id+
# @return [String] the existing id, or "_<uuid>"
def anchor_or_uuid(node = nil)
  id = node&.id
  return id unless id.nil? || id.empty?

  "_#{UUIDTools::UUID.random_create}"
end

.asciidoc_sub(text, flavour = :standoc) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
# File 'lib/utils/main.rb', line 22

# Run Asciidoctor substitutions over +text+.
#
# The text is parsed as a stand-alone Asciidoctor document with the given
# backend, and the substitutions of its first block are applied to that
# block's source.
#
# @param text [String, nil] Asciidoc source
# @param flavour [Symbol] passed to Asciidoctor as the +backend+ option
# @return [String, nil] nil for nil input, "" for empty input, otherwise
#   the result of +apply_subs+ on the first parsed block
def asciidoc_sub(text, flavour = :standoc)
  text.nil? and return nil
  text.empty? and return ""

  options = { header_footer: false, backend: flavour }
  document = Asciidoctor::Document.new(text.lines.entries, options)
  first_block = document.parse.blocks.first
  first_block.apply_subs(first_block.source)
end

.attr_code(attributes) ⇒ Object



33
34
35
36
37
# File 'lib/utils/xml.rb', line 33

# Clean up an attribute hash: drop nil-valued entries and decode HTML
# entities in every String value. Non-String values are passed through.
#
# A single HTMLEntities decoder is created lazily and shared by all values,
# instead of allocating a new decoder per string.
#
# @param attributes [Hash] attribute name => value
# @return [Hash] new hash without nil values and with decoded strings
def attr_code(attributes)
  decoder = nil
  attributes.compact.transform_values do |v|
    v.is_a?(String) ? (decoder ||= HTMLEntities.new).decode(v) : v
  end
end

.break_up_long_str(text, threshold = LONGSTR_THRESHOLD, nopunct = LONGSTR_NOPUNCT) ⇒ Object

Break on punctuation every LONGSTR_THRESHOLD chars, with a zero-width space; if punctuation fails, try to break on camel case, with a soft hyphen; break regardless every LONGSTR_THRESHOLD * LONGSTR_NOPUNCT chars, with a soft hyphen.



140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/utils/main.rb', line 140

# Insert break opportunities into the long words of +text+.
#
# The text is split before every whitespace character or hyphen; each
# resulting word of at least +threshold+ characters is cut into chunks of
# up to +threshold+ characters, and every full-size chunk is handed to
# break_up_long_str1 together with its 1-based position in the word.
#
# @param text [String]
# @param threshold [Integer] chunk size
# @param nopunct [Integer] passed through to break_up_long_str1
# @return [String] the text with break characters inserted
def break_up_long_str(text, threshold = LONGSTR_THRESHOLD,
                      nopunct = LONGSTR_NOPUNCT)
  /^\s*$/.match?(text) and return text
  # Deliberately no /o flag: /o interpolates only once per process, which
  # would freeze the chunk size at the threshold of the very first call.
  chunk_re = /.{,#{threshold}}/
  text.split(/(?=(?:\s|-))/).map do |w|
    if /^\s*$/.match(w) || (w.size < threshold) then w
    else
      w.scan(chunk_re).map.with_index do |w1, i|
        w1.size < threshold ? w1 : break_up_long_str1(w1, i + 1, nopunct)
      end.join
    end
  end.join
end

.break_up_long_str1(text, iteration, nopunct) ⇒ Object



168
169
170
171
172
173
174
175
176
177
178
# File 'lib/utils/main.rb', line 168

# Break up one chunk of a long word.
#
# break_up_long_str2 supplies the pieces of +text+ and the separator to
# use. If the chunk came back as a single piece, a soft hyphen is appended
# whenever +iteration+ is a multiple of +nopunct+; otherwise the separator
# is inserted before the last piece.
#
# @param text [String] the chunk
# @param iteration [Integer] 1-based position of the chunk in its word
# @param nopunct [Integer] period at which a soft hyphen is forced
# @return [String]
def break_up_long_str1(text, iteration, nopunct)
  pieces, separator = break_up_long_str2(text)
  unless pieces.size == 1
    pieces[-1] = "#{separator}#{pieces.last}"
    return pieces.join
  end

  # could not break up: force a soft hyphen on every nopunct-th chunk
  (iteration % nopunct).zero? ? "#{text}\u00ad" : text
end

.break_up_long_str2(text) ⇒ Object



180
181
182
183
184
185
186
187
188
# File 'lib/utils/main.rb', line 180

# Split +text+ at its break points and report which separator goes with
# that kind of break.
#
# STR_BREAKUP_RE is tried first and pairs with a zero-width space
# (U+200B). Only when that split yields exactly one piece is CAMEL_CASE_RE
# tried, paired with a soft hyphen (U+00AD).
#
# @param text [String]
# @return [Array(Array<String>, String)] the pieces and the separator
def break_up_long_str2(text)
  by_punct = text.split(STR_BREAKUP_RE, -1)
  return [by_punct, "\u200b"] unless by_punct.size == 1

  [text.split(CAMEL_CASE_RE), "\u00ad"]
end

.case_transform_xml(xml, kase) ⇒ Object



143
144
145
146
147
148
149
150
# File 'lib/utils/xml.rb', line 143

# Apply a String case method to every text node of an XML fragment,
# leaving the markup untouched.
#
# The transformed string is assigned as the text node's content rather
# than passed to Node#replace: replace would parse the already-decoded
# text as markup, so text containing "&" or "<" (written as &amp; / &lt;
# in the source) would be corrupted.
#
# @param xml [String] XML fragment; it is wrapped in a temporary <root>
# @param kase [Symbol, String] name of the String method to apply
# @return [String] the serialised, transformed fragment
def case_transform_xml(xml, kase)
  x = Nokogiri::XML("<root>#{xml}</root>")
  x.traverse do |e|
    e.text? or next
    e.content = e.text.send(kase)
  end
  x.root.children.to_xml
end

.contenthash(elem) ⇒ Object



139
140
141
142
# File 'lib/utils/anchor.rb', line 139

# Derive a GUID-shaped identifier from an element's path and text.
#
# The MD5 hex digest of "<path>////<text>" is cut into groups of
# 8-4-4-4-12 characters, joined with hyphens and prefixed with "_".
#
# @param elem [#path, #text]
# @return [String] e.g. "_xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
def contenthash(elem)
  digest = Digest::MD5.hexdigest("#{elem.path}////#{elem.text}")
  groups = digest.unpack("a8a4a4a4a12")
  "_#{groups.join('-')}"
end

.create_namespace(xmldoc) ⇒ Object



21
22
23
# File 'lib/utils/namespace.rb', line 21

# Convenience constructor: wrap +xmldoc+ in a new Namespace instance.
#
# @param xmldoc the object handed unchanged to Namespace.new
# @return [Namespace]
def create_namespace(xmldoc)
  Namespace.new(xmldoc)
end

.csv_split(text, delim = ";") ⇒ Object

, " => ," : the CSV definition does not deal with a space followed by a quote at the start of a field



15
16
17
18
19
20
# File 'lib/utils/main.rb', line 15

# Split a delimiter-separated line into stripped fields.
#
# A delimiter followed by a space and an opening quote is first rewritten
# to delimiter + quote, because the CSV parser does not accept a space
# before the quote at the start of a field.
#
# The delimiter is escaped before it is used in the regexp, so delimiters
# that are regexp metacharacters (such as "|" or ".") are matched
# literally. The block form of gsub keeps the replacement literal too.
#
# @param text [String, nil] the line to split
# @param delim [String] column separator
# @return [Array<String>, nil] non-nil fields with surrounding whitespace
#   stripped; [] for nil or empty input; nil if the parser returns nil
def csv_split(text, delim = ";")
  text.nil? || text.empty? and return []
  normalised = text.gsub(/#{Regexp.escape(delim)} "(?!")/) { "#{delim}\"" }
  CSV.parse_line(normalised,
                 liberal_parsing: true,
                 col_sep: delim)&.compact&.map(&:strip)
end

.default_script(lang) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/utils/main.rb', line 113

# Look up the default script code for a language code.
#
# @param lang [String] language code such as "ar" or "zh"
# @return [String] four-letter script code; "Latn" for any language not
#   listed in the table
def default_script(lang)
  scripts = {
    "ar" => "Arab", "fa" => "Arab",
    "ur" => "Aran",
    "ru" => "Cyrl", "bg" => "Cyrl",
    "hi" => "Deva",
    "el" => "Grek",
    "zh" => "Hans",
    "ko" => "Kore",
    "he" => "Hebr",
    "ja" => "Jpan"
  }
  scripts.fetch(lang, "Latn")
end

.dl_to_attrs(elem, dlist, name) ⇒ Object

convert definition list term/value pair into Nokogiri XML attribute



119
120
121
122
123
# File 'lib/utils/xml.rb', line 119

# Convert a definition list term/value pair into an attribute on +elem+.
#
# Looks in +dlist+ for a <dt> whose text equals +name+, then takes the
# text of the first <p> under a following <dd>, falling back to the first
# following <dd> itself, and stores it as elem[name].
#
# @param elem [#[]=] element receiving the attribute
# @param dlist [#at] definition list node
# @param name [String] term text, also used as the attribute name
# @return [String, nil] the assigned text; nil when no matching term or
#   value was found (elem is then left untouched)
def dl_to_attrs(elem, dlist, name)
  term = dlist.at("./dt[text()='#{name}']")
  return unless term

  value = term.at("./following::dd/p") || term.at("./following::dd")
  return unless value

  elem[name] = value.text
end

.dl_to_elems(ins, elem, dlist, name) ⇒ Object

convert definition list term/value pairs into Nokogiri XML elements



126
127
128
129
130
131
132
133
# File 'lib/utils/xml.rb', line 126

# Convert definition list term/value pairs into XML elements.
#
# The insertion point starts at the last existing +name+ child of +elem+
# if there is one, otherwise at +ins+. Every <dt> of +dlist+ whose text
# equals +name+ is passed to dl_to_elems1, whose return value becomes the
# next insertion point.
#
# @param ins insertion point used when +elem+ has no +name+ child
# @param elem [#at] element searched for existing +name+ children
# @param dlist [#xpath] definition list node
# @param name [String] term text / element name
# @return the final insertion point
def dl_to_elems(ins, elem, dlist, name)
  start = elem.at("./#{name}[last()]") || ins
  dlist.xpath("./dt[text()='#{name}']").reduce(start) do |cursor, term|
    dl_to_elems1(term, name, cursor)
  end
end

.dl_to_elems1(term, name, ins) ⇒ Object



135
136
137
138
139
140
141
# File 'lib/utils/xml.rb', line 135

# Turn the <dd> following +term+ into an element called +name+ and place
# it right after +ins+.
#
# When the <dd> has exactly one child element and that child is a <p>,
# the <p> is used in place of the <dd>.
#
# @param term [#at] the <dt> node
# @param name [String] new element name
# @param ins [#next=, #next] node after which the element is inserted
# @return the node that follows +ins+ after the insertion
def dl_to_elems1(term, name, ins)
  value = term.at("./following::dd")
  children = value.elements
  if children && children.size == 1 && children.first.name == "p"
    value = children.first
  end
  value.name = name
  ins.next = value
  ins.next
end

.endash_date(elem) ⇒ Object



53
54
55
56
57
58
# File 'lib/utils/main.rb', line 53

# Replace hyphen sequences with an en-dash entity in every text node
# under +elem+.
#
# One or two hyphens surrounded by whitespace, and any remaining "--",
# each become "&#8211;"; the surrounding whitespace is removed. Each text
# node is replaced with the resulting string.
#
# @param elem [#traverse] root of the subtree to process
# @return whatever +elem.traverse+ returns
def endash_date(elem)
  elem.traverse do |node|
    next unless node.text?

    spaced = node.text.gsub(/\s+--?\s+/, "&#8211;")
    node.replace(spaced.gsub("--", "&#8211;"))
  end
end

.external_path(path) ⇒ Object



102
103
104
105
106
107
108
109
110
111
# File 'lib/utils/main.rb', line 102

# Convert +path+ to the form expected by external commands on the current
# platform.
#
# On Windows (RUBY_PLATFORM ending in win32/win64/w32/w64, or containing
# mswin or mingw) forward slashes become backslashes, and the result is
# wrapped in double quotes if it contains whitespace. On other platforms
# the path is returned as is.
#
# The argument is never modified: the conversion works on a copy, so
# frozen strings are accepted and the caller's string keeps its slashes.
#
# @param path [String]
# @return [String]
def external_path(path)
  windows = RUBY_PLATFORM.match?(/(win|w)(32|64)$/) ||
    RUBY_PLATFORM.match?(/mswin|mingw/)
  return path unless windows

  native = path.gsub("/", "\\")
  native.match?(/\s/) ? "\"#{native}\"" : native
end

.firstchar_xml(line) ⇒ Object

need to deal with both <em> and its reverse string, >me<



71
72
73
74
# File 'lib/utils/xml.rb', line 71

# Return the first character of +line+ that follows any leading run of
# tags.
#
# A "tag" is any stretch delimited by < or > on either side, so the
# method works on both "<em>..." and a reversed string such as ">me<...".
#
# @param line [String]
# @return [String] the character found, or "" when there is none
def firstchar_xml(line)
  found = /^([<>][^<>]+[<>])*(.)/.match(line)
  found ? found[2] : ""
end

.guid_anchor?(id) ⇒ Boolean

Returns:

  • (Boolean)


134
135
136
137
# File 'lib/utils/anchor.rb', line 134

# Whether +id+ is, in its entirety, an underscore followed by a GUID
# (8-4-4-4-12 hexadecimal digits, case-insensitive).
#
# The pattern is anchored with \A and \z rather than ^ and $, so a
# multi-line string that merely contains a GUID line is rejected.
#
# @param id [String, nil]
# @return [Boolean]
def guid_anchor?(id)
  /\A_[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}\z/i
    .match?(id)
end

.line_sanitise(ret) ⇒ Object

By default, a carriage return in the source translates to whitespace; but in CJK, it does not. (Non-CJK text \n CJK)



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/utils/xml.rb', line 54

# Decide, for each pair of consecutive lines, whether the line break
# between them stands for a space, and append " " to the first line of
# the pair when it does.
#
# The last visible character of a line (read through firstchar_xml on the
# reversed line) is compared with the first visible character of the next
# line. No space is added when
# * the line ends in CJK and the next line starts with CJK or with no
#   text character, or
# * the line ends with no text character and the next line starts with CJK.
#
# @param ret [Array<String>] lines; entries are reassigned in place
# @return [Array<String>] the same array
def line_sanitise(ret)
  return ret if ret.size == 1

  (ret.size - 1).times do |idx|
    tail = firstchar_xml(ret[idx].reverse)
    head = firstchar_xml(ret[idx + 1])
    tail_cjk = /#{CJK}/o.match?(tail)
    head_cjk = /#{CJK}/o.match?(head)
    tail_text = /[^\p{Z}\p{C}]/.match?(tail)
    head_text = /[^\p{Z}\p{C}]/.match?(head)
    next if tail_cjk && (head_cjk || !head_text)
    next if !tail_text && head_cjk

    ret[idx] += " "
  end
  ret
end

.localdir(node) ⇒ Object



34
35
36
37
# File 'lib/utils/main.rb', line 34

# Directory of the document file recorded on +node+, with a trailing
# slash.
#
# @param node [#attr] object whose "docfile" attribute holds the path
# @return [String] parent directory of the docfile followed by "/", or
#   "./" when the attribute is nil
def localdir(node)
  docfile = node.attr("docfile")
  return "./" if docfile.nil?

  parent = Pathname.new(docfile).parent
  "#{parent}/"
end

.noko(_script = "Latn", &block) ⇒ Object

Block for processing XML document fragments as XHTML, to allow for HTML entities. Unescapes special characters used in Asciidoctor substitution processing.



42
43
44
45
46
47
48
49
50
# File 'lib/utils/xml.rb', line 42

# Build an XML fragment with a Nokogiri builder block and serialise it.
#
# The fragment belongs to a document parsed from NOKOHEAD. After
# serialisation the numeric references &#150;, &#151;, &#x96; and &#x97;
# are turned back into the characters U+0096 and U+0097.
#
# @param _script [String] unused
# @yield builder block run by Nokogiri::XML::Builder.with on the fragment
# @return [String] the serialised fragment
def noko(_script = "Latn", &block)
  fragment = ::Nokogiri::XML.parse(NOKOHEAD).fragment("")
  ::Nokogiri::XML::Builder.with fragment, &block
  serialised = fragment.to_xml(
    encoding: "UTF-8", indent: 0,
    save_with: Nokogiri::XML::Node::SaveOptions::AS_XML
  )
  unescapes = { "&#150;" => "\u0096", "&#151;" => "\u0097",
                "&#x96;" => "\u0096", "&#x97;" => "\u0097" }
  unescapes.reduce(serialised) do |xml, (reference, char)|
    xml.gsub(reference, char)
  end
end

.noko_html(&block) ⇒ Object



76
77
78
79
80
81
82
83
84
85
# File 'lib/utils/xml.rb', line 76

# Build an XML fragment with a Nokogiri builder block and return its
# serialisation line by line.
#
# The fragment belongs to a document parsed from NOKOHEAD. Each serialised
# line has its trailing whitespace and newline removed.
#
# @yield builder block run by Nokogiri::XML::Builder.with on the fragment
# @return [Array<String>] the serialised lines
def noko_html(&block)
  fragment = ::Nokogiri::XML.parse(NOKOHEAD).fragment("")
  ::Nokogiri::XML::Builder.with fragment, &block
  serialised = fragment.to_xml(
    encoding: "UTF-8", indent: 0,
    save_with: Nokogiri::XML::Node::SaveOptions::AS_XML
  )
  serialised.lines.map { |line| line.gsub(/\s*\n/, "") }
end

.ns(xpath) ⇒ Object



92
93
94
95
96
97
# File 'lib/utils/xml.rb', line 92

# Prefix the element names of an XPath expression with "xmlns:".
#
# Four textual rewrites are applied in order: names after "/", names
# after "::", names opening a predicate that are followed by "=", and
# names opening a predicate that are followed by "/", "[" or "]".
#
# @param xpath [String] XPath without namespace prefixes
# @return [String] XPath with "xmlns:" prefixes inserted
def ns(xpath)
  rewrites = [
    [%r{/([a-zA-Z])}, "/xmlns:\\1"],
    [%r{::([a-zA-Z])}, "::xmlns:\\1"],
    [%r{\[([a-zA-Z][a-z0-9A-Z@/-]* ?=)}, "[xmlns:\\1"],
    [%r{\[([a-zA-Z][a-z0-9A-Z@/-]*[/\[\]])}, "[xmlns:\\1"]
  ]
  rewrites.reduce(xpath) do |expr, (pattern, replacement)|
    expr.gsub(pattern, replacement)
  end
end

.numeric_escapes(xml) ⇒ Object



99
100
101
102
103
104
105
106
107
# File 'lib/utils/xml.rb', line 99

# Rewrite the named entity references in +xml+ as hexadecimal numeric
# references.
#
# The string is split around "&name;" tokens (no whitespace, "#", "&" or
# ";" inside the name). Each piece that starts with such a token is
# decoded and re-encoded by HTMLEntities in :hexadecimal mode; all other
# pieces are copied through unchanged.
#
# @param xml [String]
# @return [String]
def numeric_escapes(xml)
  coder = HTMLEntities.new
  pieces = xml.split(/(&[^ \r\n\t#&;]+;)/)
  converted = pieces.map do |piece|
    next piece unless /^(&[^ \t\r\n#;]+;)/.match?(piece)

    coder.encode(coder.decode(piece), :hexadecimal)
  end
  converted.join
end

.rtl_script?(script) ⇒ Boolean

Returns:

  • (Boolean)


129
130
131
# File 'lib/utils/main.rb', line 129

# Whether +script+ is one of the script codes "Arab", "Aran" or "Hebr".
#
# @param script [String] script code
# @return [Boolean]
def rtl_script?(script)
  case script
  when "Arab", "Aran", "Hebr" then true
  else false
  end
end

.set_nested_value(hash, keys, new_val) ⇒ Object

Set hash value using keys path mod from stackoverflow.com/a/42425884



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/utils/main.rb', line 62

# Set hash value using keys path
# mod from https://stackoverflow.com/a/42425884
#
# Walks +keys+ from first to last, recursing into the value stored under
# each key, and stores +new_val+ under the last key. An existing value
# under the last key is not overwritten: the new value is appended to it
# (see the first branch).
#
# @param hash [Hash] hash to update in place
# @param keys [Array] path of keys, outermost first
# @param new_val value to store at the end of the path
# @return [Hash] +hash+
def set_nested_value(hash, keys, new_val)
  key = keys[0]
  if keys.length == 1
    # end of the path: append to an existing array, set an unset key,
    # or pair the existing value with the new one in a fresh array
    hash[key] = if hash[key].is_a?(::Array) then (hash[key] << new_val)
                else hash[key].nil? ? new_val : [hash[key], new_val]
                end
  elsif hash[key].is_a?(::Array)
    # descend into the last array element, which must be a Hash:
    # a nil last element is replaced by {}, and {} is appended when the
    # array is empty or its last element is not a Hash
    hash[key][-1] = {} if !hash[key].empty? && hash[key][-1].nil?
    hash[key] << {} if hash[key].empty? || !hash[key][-1].is_a?(::Hash)
    set_nested_value(hash[key][-1], keys[1..-1], new_val)
  elsif hash[key].nil? || hash[key].empty?
    # nothing (or something empty) stored here yet: start a new level
    hash[key] = {}
    set_nested_value(hash[key], keys[1..-1], new_val)
  elsif hash[key].is_a?(::Hash) && !hash[key][keys[1]]
    # existing Hash without a truthy entry for the next key: descend
    set_nested_value(hash[key], keys[1..-1], new_val)
  elsif !hash[key][keys[1]]
    # the existing value is neither nil, empty, an Array nor a Hash (those
    # are handled above) and indexing it with the next key gives nothing:
    # keep it, and put a new Hash next to it for the rest of the path
    hash[key] = [hash[key], {}]
    set_nested_value(hash[key][-1], keys[1..-1], new_val)
  else
    # the next key already has a truthy entry: descend into it
    set_nested_value(hash[key], keys[1..-1], new_val)
  end
  hash
end

.smartformat(text) ⇒ Object

TODO needs internationalisation of quote



40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/utils/main.rb', line 40

# Apply smart typography to +text+.
# TODO needs internationalisation of quote
#
# Steps, in order:
# 1. " - " and " -- " become thin space + em dash + thin space, and any
#    remaining "--" becomes an em dash (inserted as numeric references
#    and then decoded);
# 2. a hair space (U+200A) is inserted between a CJK character and an
#    adjacent straight quote;
# 3. String#smart_format is applied;
# 4. hair spaces next to CJK characters are removed again;
# 5. the result is entity-encoded in :basic mode.
#
# @param text [String]
# @return [String]
def smartformat(text)
  dashed = text.gsub(/ --? /, "&#8201;&#8212;&#8201;")
    .gsub("--", "&#8212;")
  decoded = HTMLEntities.new.decode(dashed)
  shielded = decoded.gsub(%r{(#{CJK})(["'])}o, "\\1\u200a\\2")
    .gsub(%r{(["'])(#{CJK})}o, "\\1\u200a\\2")
  formatted = shielded.smart_format
  unshielded = formatted.gsub(%r{(#{CJK})\u200a}o, "\\1")
    .gsub(%r{\u200a(#{CJK})}o, "\\1")
  HTMLEntities.new.encode(unshielded, :basic)
end

.strict_capitalize_first(str) ⇒ Object



94
95
96
97
98
99
100
# File 'lib/utils/main.rb', line 94

# Upcase the first character of the first space-separated word of +str+,
# leaving every other character as it is.
#
# An empty first word (the string starts with a space) is left alone
# instead of raising NoMethodError on a missing first character.
#
# @param str [String]
# @return [String] the words re-joined with single spaces
def strict_capitalize_first(str)
  words = str.split(/ /)
  first = words.first
  unless first.nil? || first.empty?
    words[0] = first[0].upcase + first[1..-1]
  end
  words.join(" ")
end

.strict_capitalize_phrase(str) ⇒ Object



86
87
88
89
90
91
92
# File 'lib/utils/main.rb', line 86

# Upcase the first character of every space-separated word of +str+,
# leaving every other character as it is.
#
# Empty words (produced by leading or consecutive spaces) are passed
# through instead of raising NoMethodError on a missing first character.
#
# @param str [String]
# @return [String] the words re-joined with single spaces
def strict_capitalize_phrase(str)
  str.split(/ /).map do |w|
    w.empty? ? w : w[0].upcase + w[1..-1]
  end.join(" ")
end

.to_ncname(name, asciionly: false) ⇒ Object

A utility method for escaping XML NCNames (XML Names without colons).

to_ncname('1 < 2 & 3')
# => "____2___3"

It follows the requirements of the specification for NCName: www.w3.org/TR/xml-names/#NT-NCName NCName is “an XML Name, minus the :”



93
94
95
96
97
98
99
100
101
102
103
# File 'lib/utils/anchor.rb', line 93

# A utility method for escaping XML NCNames (XML Names without colons).
#
# to_ncname_prep normalises +name+ and reports whether it can be used as
# is. Otherwise the first character is checked against
# INVALID_NCNAME_START_REGEXP and the remaining characters against
# INVALID_NCNAME_CHAR_REGEXP (plus ":"), and every offending character is
# replaced by NCNAME_INVALID.
#
# @param name [#to_s, nil] candidate name
# @param asciionly [Boolean] forwarded to to_ncname_prep
# @return [String] the escaped name
def to_ncname(name, asciionly: false)
  name, valid = to_ncname_prep(name, asciionly)
  return name if valid

  head = name[0].gsub(INVALID_NCNAME_START_REGEXP, NCNAME_INVALID)
  return head if name.size == 1

  tail = name[1..-1]
    .gsub(INVALID_NCNAME_CHAR_REGEXP, NCNAME_INVALID)
    .gsub(":", NCNAME_INVALID)
  head + tail
end

.to_ncname_prep(name, asciionly) ⇒ Object



105
106
107
108
109
110
111
# File 'lib/utils/anchor.rb', line 105

# Normalise a candidate NCName and report whether it needs escaping.
#
# +name+ is converted to a String (nil becomes ""). With +asciionly+ the
# string is additionally passed through HTMLEntities#encode with the
# :basic and :hexadecimal options.
#
# @param name [#to_s, nil]
# @param asciionly [Boolean]
# @return [Array(String, Boolean)] the normalised name, and true when it
#   is empty or matches SAFE_NCNAME_REGEXP
def to_ncname_prep(name, asciionly)
  name = name.to_s
  if asciionly
    name = HTMLEntities.new.encode(name, :basic, :hexadecimal)
  end
  usable = name.empty? || name.match?(SAFE_NCNAME_REGEXP)
  [name, usable]
end

.to_xhtml_fragment(xml) ⇒ Object



87
88
89
90
# File 'lib/utils/xml.rb', line 87

# Parse +xml+ as a fragment of a document built from NOKOHEAD.
#
# @param xml [String] markup to parse
# @return the fragment returned by Nokogiri's Document#fragment
def to_xhtml_fragment(xml)
  ::Nokogiri::XML.parse(NOKOHEAD).fragment(xml)
end

.wrap_in_para(node, out) ⇒ Object

if the contents of node are blocks, output them to out; else, wrap them in <p>



111
112
113
114
115
116
# File 'lib/utils/xml.rb', line 111

# Write the content of +node+ to +out+: directly if the node reports
# block content, otherwise wrapped in a <p> element.
#
# @param node [#blocks?, #content]
# @param out [#<<, #p] builder receiving the output
# @return whatever the call on +out+ returns
def wrap_in_para(node, out)
  return out << node.content if node.blocks?

  out.p { |para| para << node.content }
end