Class: REXML::Parsers::BaseParser

Inherits:
Object
  • Object
show all
Defined in:
lib/vendor/rexml/parsers.rb

Instance Method Summary collapse

Instance Method Details

#pull ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/vendor/rexml/parsers.rb', line 8

# Returns the next parse event read from @source.
#
# Every event is an Array whose first element is a Symbol naming the event;
# the remaining elements carry its data. Events produced by this method:
#
#   [ :xmldecl, version, encoding, standalone ]
#   [ :processing_instruction, ... ]  captures 1 and 2 of INSTRUCTION_PATTERN
#   [ :comment, text ]
#   [ :cdata, text ]
#   [ :start_doctype, name, pub_sys, long_name, uri ]
#   [ :end_doctype ]
#   [ :externalentity, text ]
#   [ :elementdecl, text ]
#   [ :entitydecl, ... ]
#   [ :attlistdecl, element, pairs, contents ]
#   [ :notationdecl, ... ]
#   [ :start_element, name, attributes ]
#   [ :end_element, name ]
#   [ :text, text ]
#   [ :end_document ]
#
# Raises REXML::ParseException when the input is malformed. Any other
# StandardError raised while parsing element content is wrapped in a
# REXML::ParseException; exceptions outside the StandardError hierarchy
# (Interrupt, SystemExit, ...) are not parse errors and propagate untouched.
def pull
  # @closed holds the name of an element whose :start_element was returned
  # by the previous call and which still needs its :end_element (it is set
  # below when TAG_MATCH's sixth capture is present -- presumably the "/" of
  # a self-closing tag; confirm against TAG_MATCH).
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  # Events queued by an earlier call (e.g. :end_doctype) are delivered first.
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2
  if @document_status == nil
    # No doctype decision has been made yet: peek (without consuming) at the
    # leading whitespace or the first markup construct to classify it.
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      # The declared encoding (possibly nil) is handed to the source.
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      @nsstack.unshift(Set.new)
      identity = md[1]
      close = md[2]
      # The $N globals read below all come from this single match.
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $4.nil? ? nil : $4.strip
      uri = $6.nil? ? nil : $6.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # Doctype without an internal subset: queue its end event so the
        # next call returns it.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: nothing to decide yet; fall through to the
      # general parsing below.
    else
      # Anything else means there is no doctype; skip whitespace and fall
      # through to the general parsing below.
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
    end
  end
  if @document_status == :in_doctype
    # Peek at the next declaration, up to and including its closing ">".
    md = @source.match(/\s*(.*?>)/um)
    # Without a ">" there is no complete declaration to classify; report a
    # parse error instead of failing on md[1] with NoMethodError.
    raise REXML::ParseException.new( "Malformed DOCTYPE: declaration is missing '>'", @source ) if md.nil?
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      # A leading '%' is moved to the end of the event.
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      # Collect attribute name => value for every definition that is not
      # #IMPLIED, and record prefixes declared through xmlns:* attributes.
      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # CDATA_END is the pattern used here to detect the end of the doctype.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
    # No declaration matched: fall through to the general parsing below.
  end
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # Closing tag: it must match the most recently opened element.
        @nsstack.shift
        last_tag = @tags.pop
        #md = @source.match_to_consume( '>', CLOSE_MATCH)
        md = @source.match( CLOSE_MATCH, true )
        # "</" that is not a well-formed end tag: report it explicitly rather
        # than failing on md[1] below.
        raise REXML::ParseException.new( "Missing end tag for '#{last_tag}'",
          @source) unless md
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        # "<!-" introduces a comment; any other "<!" is tried as CDATA.
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attributes = {}
        # prefixes is only consumed by the disabled verification further down.
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        if md[4].size > 0
          attrs = md[4].scan( ATTRIBUTE_PATTERN )
          # $' is the text left after the last attribute matched by scan;
          # anything but whitespace there is unparsable attribute content.
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          # Captures: full attribute name, prefix, local part, a fourth
          # capture that is not used here (presumably the quote delimiter --
          # confirm against ATTRIBUTE_PATTERN), and the attribute value.
          attrs.each do |attr_name, prefix, local_part, _quote, value|
            if prefix == "xmlns"
              if local_part == "xml"
                # The namespace URI is the attribute's value, so that is what
                # has to be compared against the reserved XML namespace.
                if value != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif local_part == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << local_part
            elsif prefix
              prefixes << prefix unless prefix == "xml"
            end
            attributes[attr_name] = value
          end
        end

        # Verify that all of the prefixes have been defined
        #for prefix in prefixes
        #  unless @nsstack.find{|k| k.member?(prefix)}
        #    raise UndefinedNamespaceException.new(prefix,@source,self)
        #  end
        #end

        if md[6]
          # Remember the name so the next call emits the matching
          # :end_element, and drop the namespace scope opened above.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      # unnormalized = Text::unnormalize( md[1], self )
      # return PullEvent.new( :text, md[1], unnormalized )
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue StandardError => error
    # Only ordinary errors become parse errors; Interrupt, SystemExit and
    # other non-StandardError exceptions must not be disguised as one.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  # Not expected to be reached: every branch above returns or raises.
  return [ :dummy ]
end