---
-- This is the NSE implementation of SLAXML.
-- SLAXML is a pure-Lua SAX-like streaming XML parser. It is more robust
-- than many (simpler) pattern-based parsers that exist, properly supporting
-- code like <code><expr test="5 > 7" /></code>, CDATA nodes, comments,
-- namespaces, and processing instructions.
-- It is currently not a truly valid XML parser, however, as it allows certain XML that is
-- syntactically-invalid (not well-formed) to be parsed without reporting an error.
-- The streaming parser does a simple pass through the input and reports what it sees along the way.
-- You can optionally ignore white-space only text nodes using the <code>stripWhitespace</code> option.
-- The library contains the parser class and the parseDOM function.
--
-- Basic Usage of the library:
--
-- local parser = parser:new()
-- parser:parseSAX(xmlbody, {stripWhitespace=true})
--
-- To specify custom call backs use :
--
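-- local call_backs = {
--   startElement = function(name,nsURI,nsPrefix)       end, -- When "<foo" or "<x:foo" is seen
--   attribute    = function(name,value,nsURI,nsPrefix) end, -- attribute found on current element
--   closeElement = function(name,nsURI)                end, -- When "</foo>" or "</x:foo>" or "/>" is seen
--   text         = function(text)                      end, -- text and CDATA nodes
--   comment      = function(content)                   end, -- comments
--   pi           = function(target,content)            end, -- processing instructions e.g. "<?yes mon?>"
-- }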
-- local parser = parser:new(call_backs)
-- parser:parseSAX(xmlbody)
--
-- The code also contains the <code>parseDOM</code> function.
-- To get the dom table use the <code>parseDOM</code> function as follows.
--
-- parseDOM(xmlbody, options)
--
--
-- DOM Table Features
--
-- Document - the root table returned from the parseDOM() method.
--
-- * <code>doc.type</code> : the string "document"
-- * <code>doc.name</code> : the string "#doc"
-- * <code>doc.kids</code> : an array table of child processing instructions, the root element, and comment nodes.
-- * <code>doc.root</code> : the root element for the document
--
-- Element
--
-- * <code>someEl.type</code> : the string "element"
-- * <code>someEl.name</code> : the string name of the element (without any namespace prefix)
-- * <code>someEl.nsURI</code> : the namespace URI for this element; nil if no namespace is applied
-- * <code>someEl.attr</code> : a table of attributes, indexed by name and index
--
-- <code>local value = someEl.attr['attribute-name']</code> : any namespace prefix of the attribute is not part of the name
--
-- <code>local someAttr = someEl.attr[1]</code> : a single attribute table (see below); useful for iterating all
-- attributes of an element, or for disambiguating attributes with the same name in different namespaces
--
-- * <code>someEl.kids</code> : an array table of child elements, text nodes, comment nodes, and processing instructions
-- * <code>someEl.el</code> : an array table of child elements only
-- * <code>someEl.parent</code> : reference to the parent element or document table
--
-- Attribute
--
-- * <code>someAttr.type</code> : the string "attribute"
-- * <code>someAttr.name</code> : the name of the attribute (without any namespace prefix)
-- * <code>someAttr.value</code> : the string value of the attribute (with XML and numeric entities unescaped)
-- * <code>someAttr.nsURI</code> : the namespace URI for the attribute; nil if no namespace is applied
-- * <code>someAttr.parent</code> : reference to the owning element table
--
-- Text - for both CDATA and normal text nodes
--
-- * <code>someText.type</code> : the string "text"
-- * <code>someText.name</code> : the string "#text"
-- * <code>someText.value</code> : the string content of the text node (with XML and numeric entities unescaped for non-CDATA elements)
-- * <code>someText.parent</code> : reference to the parent element table
--
-- Comment
--
-- * <code>someComment.type</code> : the string "comment"
-- * <code>someComment.name</code> : the string "#comment"
-- * <code>someComment.value</code> : the string content of the comment
-- * <code>someComment.parent</code> : reference to the parent element or document table
--
-- Processing Instruction
--
-- * <code>somePI.type</code> : the string "pi"
-- * <code>somePI.name</code> : the string name of the PI, e.g. <code><?foo bar?></code> has a name of "foo"
-- * <code>somePI.value</code> : the string content of the PI, i.e. everything but the name
-- * <code>somePI.parent</code> : reference to the parent element or document table
--
-- @author Gavin Kistner
-- @author Gyanendra Mishra
--[=====================================================================[
v0.7 Copyright © 2013-2014 Gavin Kistner ; MIT Licensed
See http://github.com/Phrogz/SLAXML for details.
--]=====================================================================]
local string = require "string"
local stdnse = require "stdnse"
local table = require "table"
local unicode = require "unicode"
_ENV = stdnse.module("slaxml", stdnse.seeall)
-- Debugging level required for the default call backs to print anything.
-- Taken from the 'slaxml.debug' script argument; defaults to 3.
local debugging_level = tonumber(stdnse.get_script_args('slaxml.debug')) or 3

-- A table containing the default call backs to be used.
-- This really floods the script output, you will mostly be
-- using custom call backs.
--
-- Every call back hands its values to stdnse.debug as format ARGUMENTS
-- rather than as a pre-built format string, so a '%' inside the XML content
-- is printed verbatim instead of being interpreted as a format directive.
local DEFAULT_CALLBACKS = {
  --- A call back for processing instructions.
  -- To use define <code>pi = function(target, content) end</code> in the parser._call table.
  -- Executes whenever a processing instruction is found.
  -- @param target the PI target
  -- @param content any value not containing the sequence '?>'
  pi = function(target,content)
    stdnse.debug(debugging_level, "<?%s %s?>", target, content)
  end,

  --- A call back for comments.
  -- To use define <code>comment = function(content) end</code> in the parser._call table.
  -- Executes whenever a comment is encountered.
  -- @param content The comment body itself.
  comment = function(content)
    stdnse.debug(debugging_level, "<!-- %s -->", content)
  end,

  --- A call back for the start of elements.
  -- To use define <code>startElement = function(name, nsURI, nsPrefix) end</code> in the parser._call table.
  -- Executes whenever an element starts.
  -- @param name The name of the element.
  -- @param nsURI The name space URI.
  -- @param nsPrefix The name space prefix.
  startElement = function(name,nsURI,nsPrefix)
    local output = "<"
    if nsPrefix then output = output .. nsPrefix .. ":" end
    output = output .. name
    if nsURI then output = output .. " (ns='" .. nsURI .. "')" end
    output = output .. ">"
    stdnse.debug(debugging_level, "%s", output)
  end,

  --- A call back for attributes.
  -- To use define <code>attribute = function(name, value, nsURI, nsPrefix) end</code> in the parser._call table.
  -- Executes whenever an attribute is found.
  -- @param name The name of the attribute.
  -- @param value The value of the attribute.
  -- @param nsURI The name space URI.
  -- @param nsPrefix The name space prefix.
  attribute = function(name,value,nsURI,nsPrefix)
    local output = ' '
    if nsPrefix then output = output .. nsPrefix .. ":" end
    output = output .. name .. '=' .. string.format('%q',value)
    if nsURI then output = output .. (" (ns='" .. nsURI .. "')") end
    stdnse.debug(debugging_level, "%s", output)
  end,

  --- A call back for text content.
  -- To use define <code>text = function(text) end</code> in the parser._call table.
  -- Executes whenever pure text is found.
  -- @param text The actual text.
  text = function(text)
    stdnse.debug(debugging_level, " text: %q", text)
  end,

  --- A call back for the end of elements.
  -- To use define <code>closeElement = function(name, nsURI, nsPrefix) end</code> in the parser._call table.
  -- Executes whenever an element closes.
  -- @param name The name of the element.
  -- @param nsURI The name space URI.
  -- @param nsPrefix The name space prefix.
  closeElement = function(name,nsURI,nsPrefix)
    stdnse.debug(debugging_level, "</%s>", name)
  end,
}
-- The five entities predefined by XML, keyed by name (without '&' and ';').
local entityMap = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }

-- Replacement function for the gsub call in parser.unescape.
-- @param orig The complete matched reference, e.g. "&amp;" or "&#x41;".
-- @param n "#" when the reference is numeric, "" otherwise.
-- @param s The text between '&' (or '&#') and ';'.
-- @return The replacement text; <code>orig</code> itself when the reference
--         is unknown or cannot be decoded, so the input is left untouched.
local entitySwap = function(orig,n,s)
  local named = entityMap[s]
  if named then return named end
  if n=="#" then
    -- Prefixing '0' turns "x41" into "0x41" (hex) and leaves decimals intact.
    -- tonumber yields nil for junk such as "&#zz;"; that must not be passed on
    -- to unicode.utf8_enc.
    local codepoint = tonumber('0'..s)
    if codepoint then return unicode.utf8_enc(codepoint) or orig end
  end
  return orig
end
-- The SAX-style streaming parser class. Create an instance with
-- parser:new(callbacks) and feed it a document with parseSAX.
parser = {

  --- Creates a new parser object.
  -- @param self The parser class.
  -- @param callbacks Optional table of call back functions (startElement,
  --        attribute, closeElement, text, comment, pi). A missing entry means
  --        that event is not reported. When the table itself is omitted the
  --        DEFAULT_CALLBACKS table is used.
  -- @return The new parser object.
  new = function(self, callbacks)
    local o = {
      _call = callbacks or DEFAULT_CALLBACKS
    }
    setmetatable(o, self)
    self.__index = self
    return o
  end,

  --- Replaces entity references of the form "&name;" and "&#number;" in a
  -- string, using entitySwap to pick each replacement.
  -- @param str The string to unescape.
  -- @return The unescaped string.
  -- @return The number of references matched (second result of string.gsub).
  unescape = function(str) return string.gsub( str, '(&(#?)([%d%a]+);)', entitySwap ) end,

  --- Parses the xml in sax like manner.
  -- Walks the input once, calling the functions in self._call as processing
  -- instructions, comments, CDATA sections, elements, attributes and text
  -- are recognised. Problems are reported with stdnse.debug1; nothing is
  -- returned.
  -- @param self The parser object.
  -- @param xml The xml body to be parsed.
  -- @param options Options if any specified. The only field read is
  --        <code>stripWhitespace</code>: when true, leading and trailing
  --        white space is removed from text nodes and text nodes that become
  --        empty are not reported.
  parseSAX = function(self, xml, options)
    if not options then options = { stripWhitespace=false } end

    -- Cache references for maximum speed
    local find, sub, gsub, push, pop = string.find, string.sub, string.gsub, table.insert, table.remove
    -- Scratch variables shared by all of the matcher closures below
    local first, last, match1, match2, pos2, nsURI
    local unpack = unpack or table.unpack
    local pos = 1
    local state = "text"
    local textStart = 1
    local currentElement={}    -- {name, nsURI, nsPrefix} of the open tag being read
    local currentAttributes={}
    local currentAttributeCt -- manually track length since the table is re-used
    local nsStack = {}         -- one table of prefix->URI per open element; '!' is the default namespace
    local anyElement = false

    -- Reports the text between textStart and the construct that was just
    -- matched (which starts at 'first').
    local function finishText()
      if first>textStart and self._call.text then
        local text = sub(xml,textStart,first-1)
        if options.stripWhitespace then
          text = gsub(text,'^%s+','')
          text = gsub(text,'%s+$','')
          if #text==0 then text=nil end
        end
        -- The extra parentheses drop gsub's substitution count so that the
        -- call back receives the text only.
        if text then self._call.text((parser.unescape(text))) end
      end
    end

    -- Matches a processing instruction "<?target content?>" at pos.
    local function findPI()
      first, last, match1, match2 = find( xml, '^<%?([:%a_][:%w_.-]*) ?(.-)%?>', pos )
      if first then
        finishText()
        if self._call.pi then self._call.pi(match1,match2) end
        pos = last+1
        textStart = pos
        return true
      end
    end

    -- Matches a comment "<!-- ... -->" at pos. The pattern has to require
    -- the delimiters: a bare '^' matches the empty string at every position
    -- and would keep the main loop from ever advancing.
    local function findComment()
      first, last, match1 = find( xml, '^<!%-%-(.-)%-%->', pos )
      if first then
        finishText()
        if self._call.comment then self._call.comment(match1) end
        pos = last+1
        textStart = pos
        return true
      end
    end

    -- Looks up the namespace URI bound to a prefix, innermost element first.
    local function nsForPrefix(prefix)
      if prefix=='xml' then return 'http://www.w3.org/XML/1998/namespace' end -- http://www.w3.org/TR/xml-names/#ns-decl
      for i=#nsStack,1,-1 do if nsStack[i][prefix] then return nsStack[i][prefix] end end
      stdnse.debug1(("Cannot find namespace for prefix %s"):format(prefix))
      return
    end

    -- Matches the start of an open tag, "<name" or "<prefix:name", at pos.
    local function startElement()
      first, last, match1 = find( xml, '^<([%a_][%w_.-]*)', pos )
      if first then
        -- Only count an element once one has really been matched; otherwise
        -- the "did not discover any elements" message could never be shown.
        anyElement = true
        currentElement[2] = nil -- reset the nsURI, since this table is re-used
        currentElement[3] = nil -- reset the nsPrefix, since this table is re-used
        finishText()
        pos = last+1
        first,last,match2 = find(xml, '^:([%a_][%w_.-]*)', pos )
        if first then
          currentElement[1] = match2
          currentElement[3] = match1 -- Save the prefix for later resolution
          match1 = match2
          pos = last+1
        else
          currentElement[1] = match1
          for i=#nsStack,1,-1 do if nsStack[i]['!'] then currentElement[2] = nsStack[i]['!']; break end end
        end
        currentAttributeCt = 0
        push(nsStack,{})
        return true
      end
    end

    -- Matches one attribute (name="value" or name='value') at pos and
    -- records it; xmlns declarations also update the namespace stack.
    local function findAttribute()
      first, last, match1 = find( xml, '^%s+([:%a_][:%w_.-]*)%s*=%s*', pos )
      if first then
        pos2 = last+1
        first, last, match2 = find( xml, '^"([^<"]*)"', pos2 ) -- FIXME: disallow non-entity ampersands
        if first then
          pos = last+1
          match2 = parser.unescape(match2)
        else
          first, last, match2 = find( xml, "^'([^<']*)'", pos2 ) -- FIXME: disallow non-entity ampersands
          if first then
            pos = last+1
            match2 = parser.unescape(match2)
          end
        end
      end
      if match1 and match2 then
        local currentAttribute = {match1,match2}
        local prefix,name = string.match(match1,'^([^:]+):([^:]+)$')
        if prefix then
          if prefix=='xmlns' then
            nsStack[#nsStack][name] = match2
          else
            currentAttribute[1] = name
            currentAttribute[4] = prefix
          end
        else
          if match1=='xmlns' then
            nsStack[#nsStack]['!'] = match2
            currentElement[2] = match2
          end
        end
        currentAttributeCt = currentAttributeCt + 1
        currentAttributes[currentAttributeCt] = currentAttribute
        return true
      end
    end

    -- Matches a CDATA section "<![CDATA[ ... ]]>" at pos and reports its
    -- content, without unescaping, as text.
    local function findCDATA()
      first, last, match1 = find( xml, '^<!%[CDATA%[(.-)%]%]>', pos )
      if first then
        finishText()
        if self._call.text then self._call.text(match1) end
        pos = last+1
        textStart = pos
        return true
      end
    end

    -- Matches the end of an open tag, ">" or "/>", and fires the element
    -- and attribute call backs collected for it.
    local function closeElement()
      first, last, match1 = find( xml, '^%s*(/?)>', pos )
      if first then
        state = "text"
        pos = last+1
        textStart = pos

        -- Resolve namespace prefixes AFTER all new/redefined prefixes have been parsed
        if currentElement[3] then currentElement[2] = nsForPrefix(currentElement[3]) end
        -- The tables below may contain nil holes (e.g. a prefix without a
        -- resolvable URI), so the unpack range is given explicitly instead of
        -- relying on the length operator.
        if self._call.startElement then self._call.startElement(unpack(currentElement,1,3)) end
        if self._call.attribute then
          for i=1,currentAttributeCt do
            if currentAttributes[i][4] then currentAttributes[i][3] = nsForPrefix(currentAttributes[i][4]) end
            self._call.attribute(unpack(currentAttributes[i],1,4))
          end
        end

        if match1=="/" then
          -- Self-closing tag: the element ends right here.
          pop(nsStack)
          if self._call.closeElement then self._call.closeElement(unpack(currentElement,1,3)) end
        end
        return true
      end
    end

    -- Matches a closing tag, "</name>" or "</prefix:name>", at pos.
    local function findElementClose()
      first, last, match1, match2 = find( xml, '^</([%a_][%w_.-]*)%s*>', pos )
      if first then
        nsURI = nil
        for i=#nsStack,1,-1 do if nsStack[i]['!'] then nsURI = nsStack[i]['!']; break end end
      else
        first, last, match2, match1 = find( xml, '^</([%a_][%w_.-]*):([%a_][%w_.-]*)%s*>', pos )
        if first then nsURI = nsForPrefix(match2) end
      end
      if first then
        finishText()
        if self._call.closeElement then self._call.closeElement(match1,nsURI) end
        pos = last+1
        textStart = pos
        pop(nsStack)
        return true
      end
    end

    while pos<#xml do
      if state=="text" then
        if not (findPI() or findComment() or findCDATA() or findElementClose()) then
          if startElement() then
            state = "attributes"
          else
            -- Plain text: skip to the next '<' (or one character when
            -- already sitting on an unrecognised '<').
            first, last = find( xml, '^[^<]+', pos )
            pos = (first and last or pos) + 1
          end
        end
      elseif state=="attributes" then
        if not findAttribute() then
          if not closeElement() then
            stdnse.debug1("Was in an element and couldn't find attributes or the close.")
            return
          end
        end
      end
    end

    if not anyElement then stdnse.debug1("Parsing did not discover any elements") end
    if #nsStack > 0 then stdnse.debug1("Parsing ended with unclosed elements") end
  end,

}
--- Parses xml and outputs a dom table.
-- @param xml the xml body to be parsed.
-- @param options Options table, if any. It is passed on unchanged to
--        parser:parseSAX (so <code>stripWhitespace</code> is supported).
--        Setting <code>simple</code> to a true value leaves out the
--        <code>parent</code> references, the <code>el</code> arrays and the
--        by-name entries of <code>attr</code>.
-- @return The document table (type "document", name "#doc") holding the
--         parsed nodes in <code>kids</code> and the root element in
--         <code>root</code>.
function parseDOM (xml, options)
  if not options then options={} end
  local rich = not options.simple
  local push, pop = table.insert, table.remove
  local stack = {}   -- the currently open elements, innermost last
  local doc = { type="document", name="#doc", kids={} }
  local current = doc
  local builder = parser:new{
    startElement = function(name,nsURI)
      local el = { type="element", name=name, kids={}, el=rich and {} or nil, attr={}, nsURI=nsURI, parent=rich and current or nil }
      if current==doc then
        if doc.root then
          stdnse.debug2(("Encountered element '%s' when the document already has a root '%s' element"):format(name,doc.root.name))
          return
        end
        doc.root = el
      end
      push(current.kids,el)
      if current.el then push(current.el,el) end
      current = el
      push(stack,el)
    end,
    attribute = function(name,value,nsURI)
      if not current or current.type~="element" then
        stdnse.debug2(("Encountered an attribute %s=%s but I wasn't inside an element"):format(name,value))
        return
      end
      local attr = {type='attribute',name=name,nsURI=nsURI,value=value,parent=rich and current or nil}
      if rich then current.attr[name] = value end
      push(current.attr,attr)
    end,
    closeElement = function(name)
      if current.name~=name or current.type~="element" then
        stdnse.debug2(("Received a close element notification for '%s' but was inside a '%s' %s"):format(name,current.name,current.type))
        return
      end
      pop(stack)
      -- Once the root element has been closed the stack is empty; fall back
      -- to the document so that comments, processing instructions or text
      -- following the root do not index a nil value.
      current = stack[#stack] or doc
    end,
    text = function(value)
      -- Text directly under the document is ignored.
      if current.type~='document' then
        if current.type~="element" then
          stdnse.debug2(("Received a text notification '%s' but was inside a %s"):format(value,current.type))
          return
        end
        push(current.kids,{type='text',name='#text',value=value,parent=rich and current or nil})
      end
    end,
    comment = function(value)
      push(current.kids,{type='comment',name='#comment',value=value,parent=rich and current or nil})
    end,
    pi = function(name,value)
      push(current.kids,{type='pi',name=name,value=value,parent=rich and current or nil})
    end
  }
  builder:parseSAX (xml,options)
  return doc
end
return _ENV;