---
-- Implements an HTTP spider for web crawling.
--
-- The main function of this library is to provide a mechanism for web crawling.
--
-- The crawl function crawls a website and returns a list containing all the local links found,
-- along with page information such as:
-- * file extension - File extension, if available.
-- * file size - Size of the page in bytes.
-- * content type - The response's content type.
-- * checksum - Checksum of the content. Useful when comparing changes.
-- * cache filename - Temporary local filename where the page content is stored.
--
-- @args httpspider.useragent User-Agent field sent with all HTTP requests. Default value: "Mozilla 6.0/Nmap httpspider"
-- @args httpspider.allowremote Turn on to allow the spider to crawl outside the parent website. Default value: false
-- @args httpspider.cachecontent Turn on to write cache files containing all the crawled pages' content. Default value: false
--
--@author Paulino Calderon 05/2011
--@copyright Same as Nmap--See http://nmap.org/book/man-legal.html
--
-- TODO:
-- * Code the missing helper functions
-- * HTTPS support
-- * Add proxy support
--

local DEFAULT_USER_AGENT = "Mozilla 6.0/Nmap httpspider"

local nmap = require "nmap";
local stdnse = require "stdnse";
local http = require "http";
local string = require "string";
local table = require "table";

module(... or "httpspider",package.seeall)

--Set options for the library
local OPT_USER_AGENT = stdnse.get_script_args('httpspider.useragent') or DEFAULT_USER_AGENT
local OPT_ALLOW_REMOTE = stdnse.get_script_args('httpspider.allowremote') or false
local OPT_CACHE_CONTENT = stdnse.get_script_args('httpspider.cachecontent') or false

--Locals holding the crawled pages and the set of already visited links
local link_list, visited_links

--Checks if a URL is an absolute address
--@return True if the URL starts with "http://" or "https://"
local function is_url_absolute(url)
  if string.find(url, "^https?://") then
    return true
  else
    return false
  end
end

--Returns the base URL of an address
--@return Base URL ("http(s)://host[:port]") of an absolute URL, or nil
local function get_base_url(url)
  return string.match(url, "^(https?://[^/]+)")
end

--Finds redirects in the response
--@return True if a redirect is found
local function find_redirect(header)
  -- Assumes a redirect is signaled by a Location header.
  return header ~= nil and header["location"] ~= nil
end

--Returns the string between the given delimiters
--@return String found between the delimiters, or nil
local function return_between(str, start_del, end_del)
  -- The delimiters are matched as plain text, not as Lua patterns.
  local _, start_pos = string.find(str, start_del, 1, true)
  if not start_pos then return nil end
  local end_pos = string.find(str, end_del, start_pos + 1, true)
  if not end_pos then return nil end
  return string.sub(str, start_pos + 1, end_pos - 1)
end

--Returns an array of strings found between the given delimiters
--@return Array of strings found between the delimiters
local function return_array_between(str, start_del, end_del)
  local results = {}
  local init = 1
  while true do
    local _, start_pos = string.find(str, start_del, init, true)
    if not start_pos then break end
    local e_start, e_end = string.find(str, end_del, start_pos + 1, true)
    if not e_start then break end
    results[#results + 1] = string.sub(str, start_pos + 1, e_start - 1)
    init = e_end + 1
  end
  return results
end

--Parses the href attribute of the tags inside the given string
--@return List of href links
local function get_href_links(body)
  local href_links = {}
  -- Naive pattern match: collects single- or double-quoted href values.
  for link in string.gmatch(body, "[hH][rR][eE][fF]%s*=%s*[\"']([^\"'>]+)[\"']") do
    href_links[#href_links + 1] = link
  end
  return href_links
end

--Downloads a page and processes its information
--@return Table containing all the page information
local function download_page(host, port, url)
  local page_resp = http.get(host, port, url, {header = {["User-Agent"] = OPT_USER_AGENT}})
  if not page_resp then
    return nil
  end
  -- Process & store the page
  link_list[#link_list + 1] = {["uri"]=url, ["status"]=page_resp.status,
                               ["checksum"]="", ["ext"]="",
                               ["type"]=page_resp.header["content-type"],
                               ["content"]=page_resp.body}
  return link_list[#link_list]
end

--Crawls the given URI until it finds all local links
--@param uri URI to start crawling from
--@param options Table of options; this sketch assumes it carries the
--               host and port values accepted by http.get
--@return Table of crawled pages and their information
function crawl(uri, options)
  link_list = {}
  visited_links = {}
  local queue = {uri}
  -- Breadth-first crawl of the link queue
  while #queue > 0 do
    local url = table.remove(queue, 1)
    if not visited_links[url] then
      visited_links[url] = true
      local page = download_page(options.host, options.port, url)
      if page and page["content"] then
        for _, link in ipairs(get_href_links(page["content"])) do
          -- Only local links are followed; crawling remote sites
          -- (OPT_ALLOW_REMOTE) still needs host parsing and is a TODO.
          if not is_url_absolute(link) and not visited_links[link] then
            queue[#queue + 1] = link
          end
        end
      end
    end
  end
  return link_list
end

--Returns a list with all the crawled pages and their information
--@return Table of crawled pages
function get_page_list()
  return link_list
end

--Returns a list of all images found in the website
--@return List of image URIs found among the crawled pages
function get_image_files()
  -- Sketch: filters the crawled URIs by common image extensions.
  local images = {}
  for _, page in ipairs(link_list) do
    if string.find(page["uri"], "%.[jJ][pP][eE]?[gG]$") or
       string.find(page["uri"], "%.[pP][nN][gG]$") or
       string.find(page["uri"], "%.[gG][iI][fF]$") then
      images[#images + 1] = page["uri"]
    end
  end
  return images
end

--Returns a list of all javascript files found in the website
--@return List of js file URIs found among the crawled pages
function get_javascript_files()
  local js_files = {}
  for _, page in ipairs(link_list) do
    if string.find(page["uri"], "%.[jJ][sS]$") then
      js_files[#js_files + 1] = page["uri"]
    end
  end
  return js_files
end
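
-- A minimal usage sketch from an NSE script's action function, assuming the
-- script receives the usual host and port tables and that crawl is exported
-- as above; the page field names follow the table built by download_page:
--
--   local httpspider = require "httpspider"
--
--   action = function(host, port)
--     local pages = httpspider.crawl("/", { host = host, port = port })
--     for _, page in ipairs(pages) do
--       stdnse.print_debug(1, "%s (%s) %s", page["uri"],
--                          tostring(page["status"]), tostring(page["type"]))
--     end
--   end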