module(...,package.seeall)

--- Count the closing tags that come before the first opening tag in `s`.
-- For example "</a></b><c>" yields 2, and "<a></a>" yields 0.
-- @param s string to scan (the not-yet-consumed part of the document)
-- @return number of closing tags seen before the first opening tag
local function count_close_before_open(s)
  local count = 0
  -- The capture is "/" for a closing tag and "" for an opening tag.
  for kind in string.gmatch(s, "< -(/?)%w") do
    if kind ~= "/" then
      break
    end
    count = count + 1
  end
  return count
end

--- Coroutine body that walks an HTML document and yields queried node text.
-- Must run inside a coroutine (see create_parser) because it calls
-- coroutine.yield.
--
-- While scanning, a path string is built up and never shortened: every
-- opening tag is appended to it, joined with "," when the tag is at the same
-- depth as the previous opening tag and with "/" otherwise. For the document
-- "<a>x</a><b>y</b>" the paths are "/a" and then "/a,b".
--
-- Known limitations:
-- * a closing tag whose capitalization differs from its opening tag
--   (<a></A>) is not found, so the node's data is nil
-- * queries and document must use the same capitalization
--   ('/foo' won't match <FOO>)
--
-- @param doc string containing the html document
-- @param queries table whose keys are the paths of interest; any truthy
--        value marks the path as wanted
local function parser(doc, queries)
  -- Remove HTML comments so tags inside them are not parsed. Only the first
  -- result of gsub (the new string) is kept.
  local rest = string.gsub(doc, "<!%-%-.-%-%->", "")
  local path = ""
  local depth = 0
  local last_depth = -1

  -- Iterating (instead of recursing through a shared global helper) keeps
  -- all state private to this call, so several parsers can be interleaved,
  -- and the call stack does not grow with the number of tags.
  while true do
    -- find the next opening tag and capture its name
    local tag_begin, tag_end, tag_name = string.find(rest, "< -(%w+).->")
    if tag_begin == nil then
      break
    end

    -- The depth drops by the number of closing tags that occur before this
    -- opening tag and rises by one for the tag itself, e.g. at
    -- "</a></b><c>" it is decreased by 2 and increased by 1.
    -- Note that at the end of the document the depth is not brought back
    -- to 0, but nothing reads it after that point.
    depth = depth - count_close_before_open(rest) + 1

    local separator = "/"
    if depth == last_depth then
      separator = ","
    end
    path = path .. separator .. tag_name

    -- drop everything up to and including the opening tag
    rest = string.sub(rest, tag_end + 1)

    -- The node's data is the raw text up to the first closing tag that
    -- starts with the same name; nil when no such closing tag exists.
    local data = nil
    local data_end = string.find(rest, "< -/ -" .. tag_name)
    if data_end ~= nil then
      data = string.sub(rest, 1, data_end - 1)
    end

    -- the caller asked for this node, so hand its content over
    if queries[path] then
      coroutine.yield(data)
    end

    last_depth = depth
    -- the next round continues with the text following the opening tag
  end
end

--- Create a new parser.
-- The first call of the returned function takes (doc, queries) and returns
-- the data of the first queried node; every further call (no arguments
-- needed) returns the data of the next queried node. A returned nil means
-- the node has no matching closing tag. Once the document is exhausted the
-- call returns no value; calling again after that raises an error because
-- the underlying coroutine is dead.
-- Deliberately not local: module() turns this assignment into the exported
-- module field.
-- @return function wrapping a fresh parser coroutine
create_parser = function()
  return coroutine.wrap(parser)
end