Module: Metanorma::Utils

Defined in:: lib/utils/image.rb,
lib/utils/cjk.rb,
lib/utils/log.rb,
lib/utils/xml.rb,
lib/utils/main.rb,
lib/utils/version.rb,
lib/utils/namespace.rb,
lib/utils/linestatus.rb,
lib/utils/hash_transform_keys.rb,
lib/utils/hash_transform_keys.rb

Overview

Image methods were moved to the Vectory gem

Defined Under Namespace

Modules: Array, Hash Classes: LineStatus, Log, Namespace

Constant Summary collapse

HAN = Basic CJK scripts

"\\p{Han}".freeze

BOPOMOFO =

"\\p{Bopomofo}".freeze

HANGUL =

"\\p{Hangul}".freeze

HIRAGANA =

"\\p{Hiragana}".freeze

KATAKANA =

"\\p{Katakana}".freeze

CJK_SYMBOLS = CJK Symbols and Punctuation (U+3000–U+303F) Used across all CJK scripts

"[\\u3000-\\u303F]".freeze

CJK_PUNCTUATION = CJK Punctuation (subset of CJK Symbols commonly used)

"[\\u3001-\\u3003\\u3008-\\u3011\\u3014-\\u301F]".freeze

CJK_HALFWIDTH_FULLWIDTH = Halfwidth and Fullwidth Forms (U+FF00–U+FFEF) Used in all CJK contexts

"[\\uFF00-\\uFFEF]".freeze

CJK_COMPAT = CJK Compatibility Forms (U+FE30–U+FE4F) Primarily used with Han but relevant for all CJK

"[\\uFE30-\\uFE4F]".freeze

CJK_VERTICAL = Vertical Forms (U+FE10–U+FE1F) Used in vertical text layout for all CJK

"[\\uFE10-\\uFE1F]".freeze

CJK_SMALL_FORMS = Small Form Variants (U+FE50–U+FE6F) Used in all CJK contexts

"[\\uFE50-\\uFE6F]".freeze

HAN_IDC = Ideographic Description Characters (U+2FF0–U+2FFF) Used with Han script

"[\\u2FF0-\\u2FFF]".freeze

KANBUN = Kanbun (U+3190–U+319F) Used with Han script for Japanese

"[\\u3190-\\u319F]".freeze

CJK_COMPAT_IDEOGRAPHS = CJK Compatibility (U+3300–U+33FF) Used with Han script

"[\\u3300-\\u33FF]".freeze

HAN_COMPAT_IDEOGRAPHS = CJK Compatibility Ideographs (U+F900–U+FAFF)

"[\\uF900-\\uFAFF]".freeze

HAN_EXTENSIONS = Script extensions by primary script

[
  HAN,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_COMPAT,
  CJK_VERTICAL,
  CJK_SMALL_FORMS,
  HAN_IDC,
  KANBUN,
  CJK_COMPAT_IDEOGRAPHS,
  HAN_COMPAT_IDEOGRAPHS
].join("|").freeze

HANGUL_EXTENSIONS =

[
  HANGUL,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_VERTICAL,
  CJK_SMALL_FORMS
].join("|").freeze

HIRAGANA_EXTENSIONS =

[
  HIRAGANA,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_VERTICAL,
  CJK_SMALL_FORMS
].join("|").freeze

KATAKANA_EXTENSIONS =

[
  KATAKANA,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH,
  CJK_VERTICAL,
  CJK_SMALL_FORMS
].join("|").freeze

BOPOMOFO_EXTENSIONS =

[
  BOPOMOFO,
  CJK_SYMBOLS,
  CJK_PUNCTUATION,
  CJK_HALFWIDTH_FULLWIDTH
].join("|").freeze

CJK = Combined CJK pattern including all script extensions

[
  HAN_EXTENSIONS,
  HANGUL_EXTENSIONS,
  HIRAGANA_EXTENSIONS,
  KATAKANA_EXTENSIONS,
  BOPOMOFO_EXTENSIONS
].join("|").freeze

NOKOHEAD =

<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE

BASECHAR = Following XML requirements for NCName: www.w3.org/TR/xml-names/#NT-NCName

"A-Za-z\u{C0}-\u{D6}\u{D8}-\u{F6}\u{F8}-\u{FF}\u{100}-\u{131}\u{134}-\u{13E}" \
"\u{141}-\u{148}\u{14A}-\u{17E}\u{180}-\u{1C3}\u{1CD}-\u{1F0}\u{1F4}-\u{1F5}" \
"\u{1FA}-\u{217}\u{250}-\u{2A8}\u{2BB}-\u{2C1}\u{386}\u{388}-\u{38A}\u{38C}" \
"\u{38E}-\u{3A1}\u{3A3}-\u{3CE}\u{3D0}-\u{3D6}\u{3DA}\u{3DC}\u{3DE}\u{3E0}" \
"\u{3E2}-\u{3F3}\u{401}-\u{40C}\u{40E}-\u{44F}\u{451}-\u{45C}\u{45E}-\u{481}" \
"\u{490}-\u{4C4}\u{4C7}-\u{4C8}\u{4CB}-\u{4CC}\u{4D0}-\u{4EB}\u{4EE}-\u{4F5}" \
"\u{4F8}-\u{4F9}\u{531}-\u{556}\u{559}\u{561}-\u{586}\u{5D0}-\u{5EA}" \
"\u{5F0}-\u{5F2}\u{621}-\u{63A}\u{641}-\u{64A}\u{671}-\u{6B7}\u{6BA}-\u{6BE}" \
"\u{6C0}-\u{6CE}\u{6D0}-\u{6D3}\u{6D5}\u{6E5}-\u{6E6}\u{905}-\u{939}\u{93D}" \
"\u{958}-\u{961}\u{985}-\u{98C}\u{98F}-\u{990}\u{993}-\u{9A8}\u{9AA}-\u{9B0}" \
"\u{9B2}\u{9B6}-\u{9B9}\u{9DC}-\u{9DD}\u{9DF}-\u{9E1}\u{9F0}-\u{9F1}" \
"\u{A05}-\u{A0A}\u{A0F}-\u{A10}\u{A13}-\u{A28}\u{A2A}-\u{A30}\u{A32}-\u{A33}" \
"\u{A35}-\u{A36}\u{A38}-\u{A39}\u{A59}-\u{A5C}\u{A5E}\u{A72}-\u{A74}" \
"\u{A85}-\u{A8B}\u{A8D}\u{A8F}-\u{A91}\u{A93}-\u{AA8}\u{AAA}-\u{AB0}" \
"\u{AB2}-\u{AB3}\u{AB5}-\u{AB9}\u{ABD}\u{AE0}\u{B05}-\u{B0C}\u{B0F}-\u{B10}" \
"\u{B13}-\u{B28}\u{B2A}-\u{B30}\u{B32}-\u{B33}\u{B36}-\u{B39}\u{B3D}" \
"\u{B5C}-\u{B5D}\u{B5F}-\u{B61}\u{B85}-\u{B8A}\u{B8E}-\u{B90}\u{B92}-\u{B95}" \
"\u{B99}-\u{B9A}\u{B9C}\u{B9E}-\u{B9F}\u{BA3}-\u{BA4}\u{BA8}-\u{BAA}" \
"\u{BAE}-\u{BB5}\u{BB7}-\u{BB9}\u{C05}-\u{C0C}\u{C0E}-\u{C10}\u{C12}-\u{C28}" \
"\u{C2A}-\u{C33}\u{C35}-\u{C39}\u{C60}-\u{C61}\u{C85}-\u{C8C}\u{C8E}-\u{C90}" \
"\u{C92}-\u{CA8}\u{CAA}-\u{CB3}\u{CB5}-\u{CB9}\u{CDE}\u{CE0}-\u{CE1}" \
"\u{D05}-\u{D0C}\u{D0E}-\u{D10}\u{D12}-\u{D28}\u{D2A}-\u{D39}\u{D60}-\u{D61}" \
"\u{E01}-\u{E2E}\u{E30}\u{E32}-\u{E33}\u{E40}-\u{E45}\u{E81}-\u{E82}\u{E84}" \
"\u{E87}-\u{E88}\u{E8A}\u{E8D}\u{E94}-\u{E97}\u{E99}-\u{E9F}\u{EA1}-\u{EA3}" \
"\u{EA5}\u{EA7}\u{EAA}-\u{EAB}\u{EAD}-\u{EAE}\u{EB0}\u{EB2}-\u{EB3}\u{EBD}" \
"\u{EC0}-\u{EC4}\u{F40}-\u{F47}\u{F49}-\u{F69}\u{10A0}-\u{10C5}\u{10D0}-\u{10F6}" \
"\u{1100}\u{1102}-\u{1103}\u{1105}-\u{1107}\u{1109}\u{110B}-\u{110C}" \
"\u{110E}-\u{1112}\u{113C}\u{113E}\u{1140}\u{114C}\u{114E}\u{1150}" \
"\u{1154}-\u{1155}\u{1159}\u{115F}-\u{1161}\u{1163}\u{1165}\u{1167}\u{1169}" \
"\u{116D}-\u{116E}\u{1172}-\u{1173}\u{1175}\u{119E}\u{11A8}\u{11AB}" \
"\u{11AE}-\u{11AF}\u{11B7}-\u{11B8}\u{11BA}\u{11BC}-\u{11C2}\u{11EB}\u{11F0}" \
"\u{11F9}\u{1E00}-\u{1E9B}\u{1EA0}-\u{1EF9}\u{1F00}-\u{1F15}\u{1F18}-\u{1F1D}" \
"\u{1F20}-\u{1F45}\u{1F48}-\u{1F4D}\u{1F50}-\u{1F57}\u{1F59}\u{1F5B}\u{1F5D}" \
"\u{1F5F}-\u{1F7D}\u{1F80}-\u{1FB4}\u{1FB6}-\u{1FBC}\u{1FBE}\u{1FC2}-\u{1FC4}" \
"\u{1FC6}-\u{1FCC}\u{1FD0}-\u{1FD3}\u{1FD6}-\u{1FDB}\u{1FE0}-\u{1FEC}" \
"\u{1FF2}-\u{1FF4}\u{1FF6}-\u{1FFC}\u{2126}\u{212A}-\u{212B}\u{212E}" \
"\u{2180}-\u{2182}\u{3041}-\u{3094}\u{30A1}-\u{30FA}\u{3105}-\u{312C}" \
"\u{AC00}-\u{D7A3}".freeze

IDEOGRAPHIC =

"\u{4E00}-\u{9FA5}\u{3007}\u{3021}-\u{3029}".freeze

LETTER =

"#{BASECHAR}#{IDEOGRAPHIC}".freeze

DIGIT =

"0-9\u{0660}-\u{0669}\u{06F0}-\u{06F9}\u{0966}-\u{096F}\u{09E6}-\u{09EF}" \
"\u{0A66}-\u{0A6F}\u{0AE6}-\u{0AEF}\u{0B66}-\u{0B6F}\u{0BE7}-\u{0BEF}" \
"\u{0C66}-\u{0C6F}\u{0CE6}-\u{0CEF}\u{0D66}-\u{0D6F}\u{0E50}-\u{0E59}" \
"\u{0ED0}-\u{0ED9}\u{0F20}-\u{0F29}".freeze

COMBINING_CHAR =

"\u{0300}-\u{0345}\u{0360}-\u{0361}\u{0483}-\u{0486}\u{0591}-\u{05A1}" \
"\u{05A3}-\u{05B9}\u{05BB}-\u{05BD}\u{05BF}\u{05C1}-\u{05C2}\u{05C4}" \
"\u{064B}-\u{0652}\u{0670}\u{06D6}-\u{06DC}\u{06DD}-\u{06DF}" \
"\u{06E0}-\u{06E4}\u{06E7}-\u{06E8}\u{06EA}-\u{06ED}\u{0901}-\u{0903}" \
"\u{093C}\u{093E}-\u{094C}\u{094D}\u{0951}-\u{0954}\u{0962}-\u{0963}" \
"\u{0981}-\u{0983}\u{09BC}\u{09BE}\u{09BF}\u{09C0}-\u{09C4}" \
"\u{09C7}-\u{09C8}\u{09CB}-\u{09CD}\u{09D7}\u{09E2}-\u{09E3}\u{0A02}" \
"\u{0A3C}\u{0A3E}\u{0A3F}\u{0A40}-\u{0A42}\u{0A47}-\u{0A48}" \
"\u{0A4B}-\u{0A4D}\u{0A70}-\u{0A71}\u{0A81}-\u{0A83}\u{0ABC}" \
"\u{0ABE}-\u{0AC5}\u{0AC7}-\u{0AC9}\u{0ACB}-\u{0ACD}\u{0B01}-\u{0B03}" \
"\u{0B3C}\u{0B3E}-\u{0B43}\u{0B47}-\u{0B48}\u{0B4B}-\u{0B4D}" \
"\u{0B56}-\u{0B57}\u{0B82}-\u{0B83}\u{0BBE}-\u{0BC2}\u{0BC6}-\u{0BC8}" \
"\u{0BCA}-\u{0BCD}\u{0BD7}\u{0C01}-\u{0C03}\u{0C3E}-\u{0C44}" \
"\u{0C46}-\u{0C48}\u{0C4A}-\u{0C4D}\u{0C55}-\u{0C56}\u{0C82}-\u{0C83}" \
"\u{0CBE}-\u{0CC4}\u{0CC6}-\u{0CC8}\u{0CCA}-\u{0CCD}\u{0CD5}-\u{0CD6}" \
"\u{0D02}-\u{0D03}\u{0D3E}-\u{0D43}\u{0D46}-\u{0D48}\u{0D4A}-\u{0D4D}" \
"\u{0D57}\u{0E31}\u{0E34}-\u{0E3A}\u{0E47}-\u{0E4E}\u{0EB1}" \
"\u{0EB4}-\u{0EB9}\u{0EBB}-\u{0EBC}\u{0EC8}-\u{0ECD}\u{0F18}-\u{0F19}" \
"\u{0F35}\u{0F37}\u{0F39}\u{0F3E}\u{0F3F}\u{0F71}-\u{0F84}" \
"\u{0F86}-\u{0F8B}\u{0F90}-\u{0F95}\u{0F97}\u{0F99}-\u{0FAD}" \
"\u{0FB1}-\u{0FB7}\u{0FB9}\u{20D0}-\u{20DC}\u{20E1}\u{302A}-\u{302F}" \
"\u{3099}\u{309A}".freeze

EXTENDER =

"\u{00B7}\u{02D0}\u{02D1}\u{0387}\u{0640}\u{0E46}\u{0EC6}\u{3005}" \
"\u{3031}-\u{3035}\u{309D}-\u{309E}\u{30FC}-\u{30FE}".freeze

NCNAME_START_CHAR = NCName specific constants - NCName is “an XML Name, minus the :” NCName = (Letter | ‘_’) (NCNameChar)*

"#{LETTER}_".freeze

NCNAME_CHAR =

"#{LETTER}#{DIGIT}._\\-#{COMBINING_CHAR}#{EXTENDER}".freeze

INVALID_NCNAME_START_REGEXP =

/[^#{NCNAME_START_CHAR}]/.freeze

INVALID_NCNAME_CHAR_REGEXP =

/[^#{NCNAME_CHAR}]/.freeze

SAFE_NCNAME_REGEXP =

/\A[#{NCNAME_START_CHAR}][#{NCNAME_CHAR}]*\z/.freeze

NCNAME_INVALID =

"_".freeze

LONGSTR_THRESHOLD =

LONGSTR_NOPUNCT =

STR_BREAKUP_RE =

%r{
 (?<=[=_—–\u2009→?+;]) | # break after any of these
 (?<=[,.:])(?!\d) | # break on punct only if not preceding digit
 (?<=[>])(?![>]) | # > not >->
 (?<=[\]])(?![\]]) | # ] not ]-]
 (?<=//) | # //
 (?<=[/])(?![/]) | # / not /-/
 (?<![<])(?=[<]) | # < not <-<
 (?<=\p{L})(?=[(\{\[]\p{L}) # letter and bracket, followed by letter
}x.freeze

CAMEL_CASE_RE =

%r{
  (?<=\p{Ll}\p{Ll})(?=\p{Lu}\p{Ll}\p{Ll}) # 2 lowerc / upperc, 2 lowerc
}x.freeze

VERSION =

"1.11.4".freeze

Class Method Summary collapse

.anchor_attributes(presxml: false) ⇒ Object

all element/attribute pairs that are ID anchors in Metanorma.
.anchor_or_uuid(node = nil) ⇒ Object
.asciidoc_sub(text, flavour = :standoc) ⇒ Object
.attr_code(attributes) ⇒ Object
.break_up_long_str(text, threshold = LONGSTR_THRESHOLD, nopunct = LONGSTR_NOPUNCT) ⇒ Object

Break on punctuation every LONGSTR_THRESHOLD chars, with a zero-width space; if punctuation fails, try to break on camel case, with a soft hyphen; break regardless every LONGSTR_THRESHOLD * LONGSTR_NOPUNCT chars, with a soft hyphen.
.break_up_long_str1(text, iteration, nopunct) ⇒ Object
.break_up_long_str2(text) ⇒ Object
.case_transform_xml(xml, kase) ⇒ Object
.create_namespace(xmldoc) ⇒ Object
.csv_split(text, delim = ";") ⇒ Object

Rewrites delimiter-space-quote (e.g. `; "`) as delimiter-quote (`;"`): the CSV definition does not deal with a space followed by a quote at the start of a field.
.default_script(lang) ⇒ Object
.dl_to_attrs(elem, dlist, name) ⇒ Object

convert definition list term/value pair into Nokogiri XML attribute.
.dl_to_elems(ins, elem, dlist, name) ⇒ Object

convert definition list term/value pairs into Nokogiri XML elements.
.dl_to_elems1(term, name, ins) ⇒ Object
.endash_date(elem) ⇒ Object
.external_path(path) ⇒ Object
.firstchar_xml(line) ⇒ Object

need to deal with both <em> and its reverse string, >me<.
.guid_anchor?(id) ⇒ Boolean
.line_sanitise(ret) ⇒ Object

By default, carriage return in source translates to whitespace; but in CJK, it does not.
.localdir(node) ⇒ Object
.noko(_script = "Latn", &block) ⇒ Object

Block for processing XML document fragments as XHTML, to allow for HTML entities. Unescapes special chars used in Asciidoctor substitution processing.
.noko_html(&block) ⇒ Object
.ns(xpath) ⇒ Object
.numeric_escapes(xml) ⇒ Object
.rtl_script?(script) ⇒ Boolean
.set_nested_value(hash, keys, new_val) ⇒ Object

Set hash value using keys path mod from stackoverflow.com/a/42425884.
.smartformat(text) ⇒ Object

TODO needs internationalisation of quote.
.strict_capitalize_first(str) ⇒ Object
.strict_capitalize_phrase(str) ⇒ Object
.to_ncname(name, asciionly: false) ⇒ Object

A utility method for escaping XML NCNames (XML Names without colons).
.to_ncname_prep(name, asciionly) ⇒ Object
.to_xhtml_fragment(xml) ⇒ Object
.wrap_in_para(node, out) ⇒ Object

if the contents of node are blocks, output them to out; else, wrap them in <p>.

Class Method Details

.anchor_attributes(presxml: false) ⇒ `Object`

all element/attribute pairs that are ID anchors in Metanorma

# File 'lib/utils/xml.rb', line 215

# Lists every element/attribute pair that acts as an ID anchor in Metanorma.
#
# @param presxml [Boolean] when truthy, the pairs that only occur in
#   Presentation XML (the fmt-* elements, fn and semx) are appended
# @return [Array<Array<String>>] two-element [element, attribute] arrays
def anchor_attributes(presxml: false)
  # Flat word list, regrouped into [element, attribute] pairs.
  semantic = %w(
    review from
    review to
    callout target
    xref to
    eref bibitemid
    citation bibitemid
    xref target
    label for
    location target
    index to
    termsource bibitemid
    admonition target
  ).each_slice(2).to_a
  return semantic unless presxml

  presentation = %w(
    fn target
    semx source
    fmt-title source
    fmt-xref to
    fmt-xref target
    fmt-eref bibitemid
    fmt-xref-label container
    fmt-fn-body target
    fmt-review-start source
    fmt-review-start end
    fmt-review-start target
    fmt-review-end source
    fmt-review-end start
    fmt-review-end target
  ).each_slice(2).to_a
  semantic + presentation
end

.anchor_or_uuid(node = nil) ⇒ `Object`

# File 'lib/utils/xml.rb', line 130

# Returns the node's own id when it has a non-empty one; otherwise a fresh
# anchor made of "_" followed by a random UUID.
#
# The UUID is only generated when it is actually needed, so nodes that
# already carry an id no longer pay for a random UUID on every call.
#
# @param node [#id, nil] object exposing an id; nil is allowed
# @return [String] the existing id or a generated "_<uuid>" anchor
def anchor_or_uuid(node = nil)
  id = node&.id
  return id unless id.nil? || id.empty?

  "_#{UUIDTools::UUID.random_create}"
end

.asciidoc_sub(text, flavour = :standoc) ⇒ `Object`

# File 'lib/utils/main.rb', line 22

# Runs Asciidoctor substitutions over a snippet of text.
#
# The text is parsed as a headerless Asciidoctor document with the given
# backend, and the substitutions of its first block are applied to that
# block's source.
#
# @param text [String, nil] text to process
# @param flavour [Symbol] passed to Asciidoctor as the :backend option
# @return [String, nil] nil for nil input, "" for empty input, otherwise
#   the result of apply_subs on the first parsed block
def asciidoc_sub(text, flavour = :standoc)
  return nil if text.nil?
  return "" if text.empty?

  options = { header_footer: false, backend: flavour }
  document = Asciidoctor::Document.new(text.lines.entries, options)
  first_block = document.parse.blocks.first
  first_block.apply_subs(first_block.source)
end

.attr_code(attributes) ⇒ `Object`

# File 'lib/utils/xml.rb', line 18

# Prepares an attribute hash: entries whose value is nil are dropped and
# every String value is HTML-entity-decoded; other values pass through.
#
# @param attributes [Hash] attribute name => value
# @return [Hash] a new hash; the input is not modified
def attr_code(attributes)
  attributes.compact.each_with_object({}) do |(key, value), decoded|
    decoded[key] =
      if value.is_a?(String)
        HTMLEntities.new.decode(value)
      else
        value
      end
  end
end

.break_up_long_str(text, threshold = LONGSTR_THRESHOLD, nopunct = LONGSTR_NOPUNCT) ⇒ `Object`

Break on punctuation every LONGSTR_THRESHOLD chars, with a zero-width space; if punctuation fails, try to break on camel case, with a soft hyphen; break regardless every LONGSTR_THRESHOLD * LONGSTR_NOPUNCT chars, with a soft hyphen.

# File 'lib/utils/main.rb', line 140

# Inserts break opportunities into over-long words of a text.
#
# The text is split in front of every whitespace character or hyphen; each
# resulting word of at least +threshold+ characters is cut into chunks of
# +threshold+ characters, and every full-size chunk is handed to
# break_up_long_str1 together with its 1-based position in the word.
#
# Fixes over the previous version:
# * the chunking regexp no longer uses the /o flag, which froze the
#   threshold of the very first call into the regexp for all later calls;
# * blank checks are anchored with \A/\z instead of ^/$, so a text
#   containing an empty line, or a word led by a newline, is no longer
#   mistaken for blank and returned unprocessed;
# * chunking uses /m so a leading newline stays inside its chunk instead of
#   being skipped by the scan.
#
# @param text [String] text to process
# @param threshold [Integer] maximum run of characters without a break
# @param nopunct [Integer] passed through to break_up_long_str1
# @return [String] text with break characters inserted
def break_up_long_str(text, threshold = LONGSTR_THRESHOLD,
nopunct = LONGSTR_NOPUNCT)
  blank = /\A\s*\z/
  return text if blank.match?(text)

  # Rebuilt on every call so the current threshold is always honoured.
  chunk = /.{,#{threshold}}/m
  text.split(/(?=(?:\s|-))/).map do |word|
    next word if blank.match?(word) || word.size < threshold

    word.scan(chunk).map.with_index do |piece, i|
      piece.size < threshold ? piece : break_up_long_str1(piece, i + 1, nopunct)
    end.join
  end.join
end

.break_up_long_str1(text, iteration, nopunct) ⇒ `Object`

# File 'lib/utils/main.rb', line 168

# Inserts one break character into a chunk of a long word.
#
# break_up_long_str2 proposes a split and a separator. When the chunk could
# be split, the separator is placed in front of its last piece. When it
# could not, a soft hyphen is appended only on every +nopunct+-th chunk.
#
# @param text [String] chunk to break
# @param iteration [Integer] 1-based position of the chunk within its word
# @param nopunct [Integer] period, in chunks, of the forced soft hyphen
# @return [String] the chunk, possibly with a break character inserted
def break_up_long_str1(text, iteration, nopunct)
  pieces, glue = break_up_long_str2(text)
  unless pieces.size == 1
    pieces[-1] = "#{glue}#{pieces[-1]}"
    return pieces.join
  end

  # No natural break found: force a soft hyphen on every nopunct-th chunk.
  (iteration % nopunct).zero? ? text + "\u00ad" : text
end

.break_up_long_str2(text) ⇒ `Object`

# File 'lib/utils/main.rb', line 180

# Proposes a way to split a chunk of text, together with the separator
# that should mark the break.
#
# Splitting on STR_BREAKUP_RE is tried first and pairs with a zero-width
# space (U+200B). If that leaves a single piece, the chunk is split on
# CAMEL_CASE_RE instead, paired with a soft hyphen (U+00AD).
#
# @param text [String] chunk to split
# @return [Array(Array<String>, String)] the pieces and the separator
def break_up_long_str2(text)
  by_punct = text.split(STR_BREAKUP_RE, -1)
  return [by_punct, "\u200b"] unless by_punct.size == 1

  [text.split(CAMEL_CASE_RE), "\u00ad"]
end

.case_transform_xml(xml, kase) ⇒ `Object`

# File 'lib/utils/xml.rb', line 254

# Applies a case-changing String method to the text nodes of an XML
# fragment, leaving the markup itself alone.
#
# The fragment is wrapped in a temporary <root> element for parsing; only
# the serialised children of that root are returned.
#
# @param xml [String] XML fragment
# @param kase [Symbol, String] name of the String method sent to each text
#   node's text (e.g. :upcase)
# @return [String] the transformed fragment
def case_transform_xml(xml, kase)
  doc = Nokogiri::XML("<root>#{xml}</root>")
  doc.traverse do |node|
    next unless node.text?

    node.replace(node.text.send(kase))
  end
  doc.root.children.to_xml
end

.create_namespace(xmldoc) ⇒ `Object`



21
22
23

# File 'lib/utils/namespace.rb', line 21

# Builds a Namespace helper around the given XML document.
#
# @param xmldoc the document handed unchanged to Namespace.new
#   (NOTE(review): presumably a Nokogiri XML document — confirm)
# @return [Namespace] a new Namespace instance
def create_namespace(xmldoc)
  Namespace.new(xmldoc)
end

.csv_split(text, delim = ";") ⇒ `Object`

Rewrites delimiter-space-quote (e.g. `; "`) as delimiter-quote (`;"`): the CSV definition does not deal with a space followed by a quote at the start of a field

# File 'lib/utils/main.rb', line 15

# Splits a delimited line into stripped fields, using CSV quoting rules.
#
# A delimiter followed by a space and an opening quote is first rewritten
# to delimiter + quote, because CSV does not accept a space in front of a
# quoted field.
#
# The delimiter is now escaped before being embedded in the regexp, so
# delimiters that are regexp metacharacters ("|", ".", "*", ...) are
# matched literally rather than corrupting the pattern.
#
# @param text [String, nil] line to split
# @param delim [String] field delimiter
# @return [Array<String>, nil] stripped non-nil fields; [] for nil or empty
#   input; nil if CSV.parse_line yields nil
def csv_split(text, delim = ";")
  return [] if text.nil? || text.empty?

  space_before_quote = /#{Regexp.escape(delim)} "(?!")/
  # Block form: the replacement is taken literally, even if the delimiter
  # contains a backslash.
  normalised = text.gsub(space_before_quote) { "#{delim}\"" }
  CSV.parse_line(normalised, liberal_parsing: true, col_sep: delim)
    &.compact&.map(&:strip)
end

.default_script(lang) ⇒ `Object`

# File 'lib/utils/main.rb', line 113

# Maps a language code to the script code used for it by default.
#
# @param lang [String] language code such as "zh" or "ru"
# @return [String] four-letter script code; "Latn" for any language that
#   is not listed
def default_script(lang)
  scripts = {
    "ar" => "Arab", "fa" => "Arab",
    "ur" => "Aran",
    "ru" => "Cyrl", "bg" => "Cyrl",
    "hi" => "Deva",
    "el" => "Grek",
    "zh" => "Hans",
    "ko" => "Kore",
    "he" => "Hebr",
    "ja" => "Jpan"
  }
  scripts.fetch(lang, "Latn")
end

.dl_to_attrs(elem, dlist, name) ⇒ `Object`

convert definition list term/value pair into Nokogiri XML attribute

# File 'lib/utils/xml.rb', line 230

# Copies one definition-list entry onto an element as an attribute.
#
# Looks in +dlist+ for a <dt> whose text equals +name+, takes the text of
# the following <dd> (its <p> when one is found) and stores it as
# elem[name]. Does nothing when the term or its value is missing.
#
# @param elem element receiving the attribute (anything supporting []=)
# @param dlist definition list node searched with XPath
# @param name [String] term to look up, also used as the attribute name
# @return [String, nil] the assigned text, or nil when nothing was set
def dl_to_attrs(elem, dlist, name)
  term = dlist.at("./dt[text()='#{name}']")
  return unless term

  value = term.at("./following::dd/p") || term.at("./following::dd")
  return unless value

  elem[name] = value.text
end

.dl_to_elems(ins, elem, dlist, name) ⇒ `Object`

convert definition list term/value pairs into Nokogiri XML elements

# File 'lib/utils/xml.rb', line 237

# Converts every definition-list entry for +name+ into an XML element.
#
# Insertion starts after the last existing +name+ child of +elem+ when
# there is one, otherwise after +ins+. Each matching <dt> is handed to
# dl_to_elems1, whose return value becomes the next insertion point.
#
# @param ins node after which new elements are inserted by default
# @param elem element whose existing +name+ children are checked
# @param dlist definition list node searched with XPath
# @param name [String] term to look up, also the name of the new elements
# @return the final insertion point
def dl_to_elems(ins, elem, dlist, name)
  start = elem.at("./#{name}[last()]") || ins
  dlist.xpath("./dt[text()='#{name}']").reduce(start) do |cursor, term|
    dl_to_elems1(term, name, cursor)
  end
end

.dl_to_elems1(term, name, ins) ⇒ `Object`

# File 'lib/utils/xml.rb', line 246

# Turns the <dd> following a definition-list term into an element called
# +name+ and inserts it right after +ins+.
#
# When the <dd> has exactly one child element and that child is a <p>, the
# <p> is used in place of the <dd>.
#
# @param term the <dt> node
# @param name [String] name given to the inserted element
# @param ins node after which the element is inserted
# @return the node that now follows +ins+
def dl_to_elems1(term, name, ins)
  value = term.at("./following::dd")
  kids = value.elements
  sole_para = kids && kids.size == 1 && kids.first.name == "p"
  value = kids.first if sole_para
  value.name = name
  ins.next = value
  ins.next
end

.endash_date(elem) ⇒ `Object`

# File 'lib/utils/main.rb', line 53

# Replaces hyphen pairs in the text nodes under an element with the
# en-dash entity (&#8211;).
#
# One or two hyphens surrounded by whitespace are replaced together with
# that whitespace; any remaining double hyphen is replaced on its own.
#
# @param elem node whose subtree is traversed
# @return whatever elem.traverse returns
def endash_date(elem)
  elem.traverse do |node|
    next unless node.text?

    spaced = node.text.gsub(/\s+--?\s+/, "&#8211;")
    node.replace(spaced.gsub("--", "&#8211;"))
  end
end

.external_path(path) ⇒ `Object`

# File 'lib/utils/main.rb', line 102

# Converts a path to the form expected by external programs on the
# current platform.
#
# On Windows (detected from RUBY_PLATFORM) forward slashes become
# backslashes and the result is wrapped in double quotes when it contains
# whitespace. On any other platform the path is returned as given.
#
# The conversion now works on a copy: the previous in-place gsub! changed
# the caller's string and raised FrozenError for frozen strings.
#
# @param path [String] path using forward slashes
# @return [String] platform-appropriate path
def external_path(path)
  win = !!((RUBY_PLATFORM =~ /(win|w)(32|64)$/) ||
           (RUBY_PLATFORM =~ /mswin|mingw/))
  return path unless win

  native = path.tr("/", "\\")
  native[/\s/] ? "\"#{native}\"" : native
end

.firstchar_xml(line) ⇒ `Object`

need to deal with both <em> and its reverse string, >me<

# File 'lib/utils/xml.rb', line 167

# Returns the first character of a line that comes after any leading
# tag-like runs. A run is "<" or ">", some non-bracket characters, then
# "<" or ">", so both <em> and its reversed form >me< are skipped.
#
# @param line [String] line of XML text, possibly reversed
# @return [String] the first such character, or "" when nothing matches
def firstchar_xml(line)
  found = /^([<>][^<>]+[<>])*(.)/.match(line)
  found ? found[2] : ""
end

.guid_anchor?(id) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/utils/xml.rb', line 263

# Tells whether an id has the shape of a generated GUID anchor: an
# underscore followed by a hexadecimal UUID in 8-4-4-4-12 grouping,
# case-insensitive.
#
# The pattern is anchored with \A and \z so that the WHOLE string must be
# the anchor. The previous ^/$ anchors accepted any multi-line string in
# which a single line looked like a GUID.
#
# @param id [String, nil] identifier to test
# @return [Boolean]
def guid_anchor?(id)
  /\A_[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}\z/i
    .match?(id)
end

.line_sanitise(ret) ⇒ `Object`

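By default, a carriage return (line break) in the source translates to whitespace; but between CJK characters it does not. (A line break between non-CJK text and CJK text, i.e. "Non-CJK text \n CJK", still becomes a space.)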

# File 'lib/utils/xml.rb', line 150

# Decides, for each pair of consecutive lines, whether the line break
# between them should become a space, and appends that space to the
# earlier line.
#
# No space is added when the earlier line ends in a CJK character and the
# later line starts with a CJK character or with no visible text, nor when
# the earlier line ends with no visible text and the later one starts with
# CJK. Characters are picked with firstchar_xml, so leading/trailing tags
# are skipped.
#
# @param ret [Array<String>] lines; elements are replaced in place
# @return [Array<String>] the same array
def line_sanitise(ret)
  return ret if ret.size == 1

  (ret.size - 1).times do |i|
    # Last visible character of this line, first of the next one.
    tail = firstchar_xml(ret[i].reverse)
    head = firstchar_xml(ret[i + 1])
    tail_cjk = /#{CJK}/o.match?(tail)
    head_cjk = /#{CJK}/o.match?(head)
    tail_text = /[^\p{Z}\p{C}]/.match?(tail)
    head_text = /[^\p{Z}\p{C}]/.match?(head)
    next if tail_cjk && (head_cjk || !head_text)
    next if !tail_text && head_cjk

    ret[i] += " "
  end
  ret
end

.localdir(node) ⇒ `Object`

# File 'lib/utils/main.rb', line 34

# Returns the directory of the document being processed, with a trailing
# slash.
#
# @param node object whose attr("docfile") gives the document path
# @return [String] parent directory of docfile followed by "/", or "./"
#   when docfile is not set
def localdir(node)
  source = node.attr("docfile")
  return "./" if source.nil?

  "#{Pathname.new(source).parent}/"
end

.noko(_script = "Latn", &block) ⇒ `Object`

Block for processing XML document fragments as XHTML, to allow for HTML entities. Unescapes special chars used in Asciidoctor substitution processing

# File 'lib/utils/xml.rb', line 138

# Builds an XML fragment with a Nokogiri builder block and serialises it.
#
# The fragment is created inside a document parsed from NOKOHEAD, filled by
# the given block, and written out as UTF-8 XML without indentation. The
# numeric references &#150;/&#x96; and &#151;/&#x97; are then turned into
# the characters U+0096 and U+0097.
#
# @param _script [String] not used by this method
# @yield the Nokogiri::XML::Builder block that populates the fragment
# @return [String] the serialised fragment
def noko(_script = "Latn", &block)
  frag = ::Nokogiri::XML.parse(NOKOHEAD).fragment("")
  ::Nokogiri::XML::Builder.with(frag, &block)
  xml = frag.to_xml(
    encoding: "UTF-8", indent: 0,
    save_with: Nokogiri::XML::Node::SaveOptions::AS_XML
  )
  unescapes = {
    "&#150;" => "\u0096", "&#151;" => "\u0097",
    "&#x96;" => "\u0096", "&#x97;" => "\u0097"
  }
  unescapes.reduce(xml) { |out, (entity, char)| out.gsub(entity, char) }
end

.noko_html(&block) ⇒ `Object`

# File 'lib/utils/xml.rb', line 172

# Builds an XML fragment with a Nokogiri builder block and returns its
# serialisation line by line.
#
# The fragment is created inside a document parsed from NOKOHEAD and
# written out as UTF-8 XML without indentation; each line then has its
# trailing whitespace and newline removed.
#
# @yield the Nokogiri::XML::Builder block that populates the fragment
# @return [Array<String>] the serialised lines, without line endings
def noko_html(&block)
  fragment = ::Nokogiri::XML.parse(NOKOHEAD).fragment("")
  ::Nokogiri::XML::Builder.with(fragment, &block)
  serialised = fragment.to_xml(
    encoding: "UTF-8", indent: 0,
    save_with: Nokogiri::XML::Node::SaveOptions::AS_XML
  )
  serialised.lines.map { |line| line.gsub(/\s*\n/, "") }
end

.ns(xpath) ⇒ `Object`

# File 'lib/utils/xml.rb', line 188

# Prefixes the element names of an XPath expression with "xmlns:".
#
# Four textual rewrites are applied in order: names after "/", names after
# an axis "::", names opening a predicate that is followed by "=", and
# names opening a predicate that is followed by "/", "[" or "]".
#
# @param xpath [String] XPath written without namespace prefixes
# @return [String] the namespaced XPath
def ns(xpath)
  rules = [
    [%r{/([a-zA-Z])}, "/xmlns:\\1"],
    [%r{::([a-zA-Z])}, "::xmlns:\\1"],
    [%r{\[([a-zA-Z][a-z0-9A-Z@/-]* ?=)}, "[xmlns:\\1"],
    [%r{\[([a-zA-Z][a-z0-9A-Z@/-]*[/\[\]])}, "[xmlns:\\1"]
  ]
  rules.reduce(xpath) do |expr, (pattern, replacement)|
    expr.gsub(pattern, replacement)
  end
end

.numeric_escapes(xml) ⇒ `Object`

# File 'lib/utils/xml.rb', line 195

# Rewrites named entity references in a string as hexadecimal numeric
# references.
#
# The string is split around &name; references (numeric ones containing
# "#" are not matched); each reference is decoded and re-encoded in
# hexadecimal form, and everything else is kept as is.
#
# @param xml [String] text that may contain named entities
# @return [String] text with those entities in hexadecimal form
def numeric_escapes(xml)
  coder = HTMLEntities.new
  pieces = xml.split(/(&[^ \r\n\t#&;]+;)/)
  converted = pieces.map do |piece|
    next piece unless /^(&[^ \t\r\n#;]+;)/.match?(piece)

    coder.encode(coder.decode(piece), :hexadecimal)
  end
  converted.join
end

.rtl_script?(script) ⇒ `Boolean`

Returns:

(Boolean)



129
130
131

# File 'lib/utils/main.rb', line 129

# Tells whether a script code is one of the right-to-left scripts known
# to this module (Arab, Aran, Hebr).
#
# @param script [String] four-letter script code
# @return [Boolean]
def rtl_script?(script)
  ["Arab", "Aran", "Hebr"].any? { |rtl| rtl == script }
end

.set_nested_value(hash, keys, new_val) ⇒ `Object`

Set hash value using keys path mod from stackoverflow.com/a/42425884

# File 'lib/utils/main.rb', line 62

# Sets a value in a nested hash along a path of keys, accumulating values
# into arrays instead of overwriting them. Works recursively, one key per
# call, and modifies +hash+ in place.
#
# @param hash [Hash] hash to update
# @param keys [Array] path of keys; the last key receives the value
# @param new_val value to store
# @return [Hash] the hash that was passed in
def set_nested_value(hash, keys, new_val)
  key = keys[0]
  if keys.length == 1
    # Leaf: append to an existing array, set when absent, otherwise turn
    # the existing value and the new one into a two-element array.
    hash[key] = if hash[key].is_a?(::Array) then (hash[key] << new_val)
                else hash[key].nil? ? new_val : [hash[key], new_val]
                end
  elsif hash[key].is_a?(::Array)
    # Descend into the last array element, making sure it is a Hash: a nil
    # last element is replaced by {}, and {} is appended when the array is
    # empty or ends in a non-Hash.
    hash[key][-1] = {} if !hash[key].empty? && hash[key][-1].nil?
    hash[key] << {} if hash[key].empty? || !hash[key][-1].is_a?(::Hash)
    set_nested_value(hash[key][-1], keys[1..-1], new_val)
  elsif hash[key].nil? || hash[key].empty?
    # Nothing usable stored yet: start a fresh nested hash.
    # NOTE(review): a stored value that does not respond to empty? (e.g. an
    # Integer) would raise here — confirm callers only store strings/hashes.
    hash[key] = {}
    set_nested_value(hash[key], keys[1..-1], new_val)
  elsif hash[key].is_a?(::Hash) && !hash[key][keys[1]]
    # Existing hash without the next key: descend into it.
    set_nested_value(hash[key], keys[1..-1], new_val)
  elsif !hash[key][keys[1]]
    # Existing non-Hash value: keep it and add a new hash beside it.
    hash[key] = [hash[key], {}]
    set_nested_value(hash[key][-1], keys[1..-1], new_val)
  else
    # The next key is already present: descend and let the deeper call
    # decide how to combine the values.
    set_nested_value(hash[key], keys[1..-1], new_val)
  end
  hash
end

.smartformat(text) ⇒ `Object`

TODO needs internationalisation of quote

# File 'lib/utils/main.rb', line 40

# Applies smart typography to a text and returns it entity-encoded.
#
# Spaced single or double hyphens become a thin-space-padded em dash and
# other double hyphens a bare em dash; entities are decoded; the text goes
# through String#smart_format; the result is encoded again with the :basic
# entity set. A hair space (U+200A) is temporarily placed between CJK
# characters and adjacent straight quotes before smart_format runs, and
# hair spaces next to CJK characters are removed afterwards.
#
# TODO needs internationalisation of quote
#
# @param text [String] source text
# @return [String] formatted, entity-encoded text
def smartformat(text)
  coder = HTMLEntities.new
  dashed = text.gsub(/ --? /, "&#8201;&#8212;&#8201;")
    .gsub("--", "&#8212;")
  padded = coder.decode(dashed)
    .gsub(%r{(#{CJK})(["'])}o, "\\1\u200a\\2")
    .gsub(%r{(["'])(#{CJK})}o, "\\1\u200a\\2")
  unpadded = padded.smart_format
    .gsub(%r{(#{CJK})\u200a}o, "\\1")
    .gsub(%r{\u200a(#{CJK})}o, "\\1")
  coder.encode(unpadded, :basic)
end

.strict_capitalize_first(str) ⇒ `Object`

# File 'lib/utils/main.rb', line 94

# Upcases the first character of the first space-separated word, leaving
# every other character untouched (unlike String#capitalize, which would
# downcase the rest).
#
# A string that starts with a space yields an empty first word; this used
# to raise NoMethodError on nil and is now left unchanged.
#
# @param str [String] phrase to capitalise
# @return [String] the phrase with its first character upcased
def strict_capitalize_first(str)
  str.split(/ /).each_with_index.map do |w, i|
    letters = w.chars
    # &. guards the empty word produced by a leading space.
    letters.first&.upcase! if i.zero?
    letters.join
  end.join(" ")
end

.strict_capitalize_phrase(str) ⇒ `Object`

# File 'lib/utils/main.rb', line 86

# Upcases the first character of every space-separated word, leaving all
# other characters untouched (unlike String#capitalize, which would
# downcase the rest).
#
# Leading or consecutive spaces yield empty words; these used to raise
# NoMethodError on nil and are now passed through, so the spacing between
# words is preserved.
#
# @param str [String] phrase to capitalise
# @return [String] the phrase with each word's first character upcased
def strict_capitalize_phrase(str)
  str.split(/ /).map do |w|
    letters = w.chars
    # &. guards the empty words produced by leading/consecutive spaces.
    letters.first&.upcase!
    letters.join
  end.join(" ")
end

.to_ncname(name, asciionly: false) ⇒ `Object`

A utility method for escaping XML NCNames (XML Names without colons).

to_ncname('1 < 2 & 3')
# => "____2___3" (the leading digit is replaced too: an NCName must start with a letter or "_")

It follows the requirements of the specification for NCName: www.w3.org/TR/xml-names/#NT-NCName NCName is “an XML Name, minus the :”

# File 'lib/utils/xml.rb', line 110

# Escapes a string so that it is a valid XML NCName (an XML Name without
# colons).
#
# The name is first normalised by to_ncname_prep; when that reports it as
# already valid it is returned as is. Otherwise the first character is
# checked against the NCName start characters and the remaining ones
# against the NCName characters, and every offending character, as well
# as any colon, is replaced by NCNAME_INVALID.
#
# @param name [#to_s, nil] name to escape
# @param asciionly [Boolean] passed on to to_ncname_prep
# @return [String] a valid NCName
def to_ncname(name, asciionly: false)
  name, valid = to_ncname_prep(name, asciionly)
  return name if valid

  head = name[0].gsub(INVALID_NCNAME_START_REGEXP, NCNAME_INVALID)
  return head if name.size == 1

  tail = name[1..-1]
    .gsub(INVALID_NCNAME_CHAR_REGEXP, NCNAME_INVALID)
    .gsub(":", NCNAME_INVALID)
  head + tail
end

.to_ncname_prep(name, asciionly) ⇒ `Object`

# File 'lib/utils/xml.rb', line 122

# Normalises a candidate NCName and reports whether it needs escaping.
#
# nil becomes "", anything else is converted with to_s. With +asciionly+
# the string is additionally entity-encoded (basic + hexadecimal), so
# non-ASCII characters turn into numeric references.
#
# @param name [#to_s, nil] candidate name
# @param asciionly [Boolean] whether to entity-encode the name
# @return [Array(String, Boolean)] the normalised name, and true when it
#   is empty or already matches SAFE_NCNAME_REGEXP
def to_ncname_prep(name, asciionly)
  name = name&.to_s || ""
  name = HTMLEntities.new.encode(name, :basic, :hexadecimal) if asciionly
  safe = name.nil? || name.empty? || SAFE_NCNAME_REGEXP.match?(name)
  [name, safe]
end

.to_xhtml_fragment(xml) ⇒ `Object`

# File 'lib/utils/xml.rb', line 183

# Parses a string of markup as a fragment belonging to a document built
# from NOKOHEAD.
#
# @param xml [String] markup to parse
# @return the fragment returned by Nokogiri's Document#fragment
def to_xhtml_fragment(xml)
  ::Nokogiri::XML.parse(NOKOHEAD).fragment(xml)
end

.wrap_in_para(node, out) ⇒ `Object`

if the contents of node are blocks, output them to out; else, wrap them in <p>

# File 'lib/utils/xml.rb', line 207

# Writes a node's content to a builder: directly when the node reports
# that it contains blocks, otherwise wrapped in a <p> element.
#
# @param node object responding to blocks? and content
# @param out builder receiving the output (responds to << and p)
# @return the result of out << content, or of out.p
def wrap_in_para(node, out)
  return out << node.content if node.blocks?

  out.p { |para| para << node.content }
end

Module: Metanorma::Utils

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.anchor_attributes(presxml: false) ⇒ Object

.anchor_or_uuid(node = nil) ⇒ Object

.asciidoc_sub(text, flavour = :standoc) ⇒ Object

.attr_code(attributes) ⇒ Object

.break_up_long_str(text, threshold = LONGSTR_THRESHOLD, nopunct = LONGSTR_NOPUNCT) ⇒ Object

.break_up_long_str1(text, iteration, nopunct) ⇒ Object

.break_up_long_str2(text) ⇒ Object

.case_transform_xml(xml, kase) ⇒ Object

.create_namespace(xmldoc) ⇒ Object

.csv_split(text, delim = ";") ⇒ Object

.default_script(lang) ⇒ Object

.dl_to_attrs(elem, dlist, name) ⇒ Object

.dl_to_elems(ins, elem, dlist, name) ⇒ Object

.dl_to_elems1(term, name, ins) ⇒ Object

.endash_date(elem) ⇒ Object

.external_path(path) ⇒ Object

.firstchar_xml(line) ⇒ Object

.guid_anchor?(id) ⇒ Boolean

.line_sanitise(ret) ⇒ Object

.localdir(node) ⇒ Object

.noko(_script = "Latn", &block) ⇒ Object

.noko_html(&block) ⇒ Object

.ns(xpath) ⇒ Object

.numeric_escapes(xml) ⇒ Object

.rtl_script?(script) ⇒ Boolean

.set_nested_value(hash, keys, new_val) ⇒ Object

.smartformat(text) ⇒ Object

.strict_capitalize_first(str) ⇒ Object

.strict_capitalize_phrase(str) ⇒ Object

.to_ncname(name, asciionly: false) ⇒ Object

.to_ncname_prep(name, asciionly) ⇒ Object

.to_xhtml_fragment(xml) ⇒ Object

.wrap_in_para(node, out) ⇒ Object