---
-- Implements an HTTP spider for web crawling.
--
-- The main function of this library is to provide a mechanism for web crawling.
--
-- The crawl function crawls a website and returns a list containing
-- all the local links found and page information such as:
-- * file extension - File extension, if available.
-- * file size - Size in bytes of the page.
-- * content type - Response's content type.
-- * checksum - Checksum of the content. Useful when comparing changes.
-- * cache filename - Temporary local filename where the page content is stored.
--
-- @args httpspider.useragent User-Agent field sent with all the HTTP requests. Default value: "Mozilla 6.0/Nmap httpspider"
-- @args httpspider.allowremote Turn on to allow the spider to crawl outside the parent website. Default value: false
-- @args httpspider.cachecontent Turn on to write cache files containing the content of all crawled pages. Default value: false
--
--@author Paulino Calderon 05/2011
--@copyright Same as Nmap--See http://nmap.org/book/man-legal.html
--
-- TODO
--- Code all missing helper functions
--- HTTPS support
--- Add proxy support
--
local DEFAULT_USER_AGENT = "Mozilla 6.0/Nmap httpspider"
local nmap = require "nmap"
local stdnse = require "stdnse"
local http = require "http"
local string = require "string"
module(... or "httpspider", package.seeall)
--Set options for library
local OPT_USER_AGENT = stdnse.get_script_args('httpspider.useragent') or DEFAULT_USER_AGENT
local OPT_ALLOW_REMOTE = stdnse.get_script_args('httpspider.allowremote') or false
local OPT_CACHE_CONTENT = stdnse.get_script_args('httpspider.cachecontent') or false
--Locals holding unvisited and visited links
local link_list, visited_links = {}, {}
--Crawls the given URL until it finds all local links
--@return Table of crawled pages and their information
local function crawl(uri, options)
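  -- Sketch only: one possible breadth-first crawl loop, not the final design.
  -- Assumptions: options carries host and port in the form accepted by
  -- http.get, uri and all queued links are paths on the target host, and the
  -- helpers used here are in scope (a finished version would define crawl
  -- after them).
  link_list, visited_links = {}, {}
  local queue = { uri }
  while #queue > 0 do
    local url = table.remove(queue, 1)
    if not visited_links[url] then
      visited_links[url] = true
      local page = download_page(options.host, options.port, url)
      for _, link in ipairs(get_href_links(page.content or "")) do
        -- Stay inside the parent website unless httpspider.allowremote is set
        if OPT_ALLOW_REMOTE or not is_url_absolute(link) then
          queue[#queue + 1] = link
        end
      end
    end
  end
  return link_list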
end
--Finds redirects in the response
--@return True if a redirect is found
local function find_redirect(header)
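  -- Sketch only: treats the presence of a Location header as a redirect.
  -- Assumes header is the response.header table returned by http.get.
  return header ~= nil and header["location"] ~= nil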
end
--Checks if a URL is an absolute address
--@return True if the URL starts with "http://" or "https://"
local function is_url_absolute(url)
  if string.find(url, "^https?://") then
    return true
  else
    return false
  end
end
--Returns base URL
--@return Base URL of address
local function get_base_url(url)
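  -- Sketch only: captures "scheme://host[:port]" with a string pattern;
  -- assumes an absolute URL and returns nil when the pattern does not match.
  return string.match(url, "^(https?://[^/]+)")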
end
--Parses the href attribute of the tags inside the given string
--@return list of href links
local function get_href_links(body)
  local href_links = {}
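  -- Sketch only: plain pattern matching over the document body; a real HTML
  -- parser would handle more markup variants than this does.
  for link in string.gmatch(body, "[hH][rR][eE][fF]%s*=%s*[\"']([^\"']+)[\"']") do
    href_links[#href_links + 1] = link
  end
  return href_links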
end
--Downloads a page and processes its information
--@return Table containing all the page information
local function download_page(host, port, url)
  local page_resp = http.get(host, port, url)
  -- Process & store the page
  link_list[#link_list + 1] = {["uri"]=url, ["status"]=page_resp.status, ["checksum"]="",
    ["ext"]="", ["type"]=page_resp.header["content-type"], ["content"]=page_resp.body}
  return link_list[#link_list]
end
--Returns a list with all the crawled pages and their information
--@return Table of crawled pages
local function get_page_list()
return link_list
end
--Returns the string between two delimiters
--@return String found between the delimiters
local function return_between(str, start_del, end_del)
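  -- Sketch only: plain-text (non-pattern) search for the first occurrence of
  -- each delimiter; returns nil if either delimiter is missing.
  local _, s = string.find(str, start_del, 1, true)
  if not s then return nil end
  local e = string.find(str, end_del, s + 1, true)
  if not e then return nil end
  return string.sub(str, s + 1, e - 1)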
end
--Returns an array of strings found between the given delimiters
--@return Array of strings found between the delimiters
local function return_array_between(str, start_del, end_del)
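  -- Sketch only: repeats the same plain-text search as return_between and
  -- collects every occurrence.
  local results = {}
  local init = 1
  while true do
    local _, s = string.find(str, start_del, init, true)
    if not s then break end
    local e1, e2 = string.find(str, end_del, s + 1, true)
    if not e1 then break end
    results[#results + 1] = string.sub(str, s + 1, e1 - 1)
    init = e2 + 1
  end
  return results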
end
--Returns a list of all images found in the website
--@return List of images found in document
local function get_image_files()
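  -- Sketch only: scans the content of every crawled page for <img> src
  -- attributes; assumes crawl has already populated link_list.
  local images = {}
  for _, page in ipairs(link_list) do
    for src in string.gmatch(page.content or "", "<[iI][mM][gG][^>]-[sS][rR][cC]%s*=%s*[\"']([^\"']+)[\"']") do
      images[#images + 1] = src
    end
  end
  return images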
end
--Returns a list of all javascript files found in the website
--@return List of js files in document
local function get_javascript_files()
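  -- Sketch only: scans the content of every crawled page for <script> src
  -- attributes; assumes crawl has already populated link_list.
  local scripts = {}
  for _, page in ipairs(link_list) do
    for src in string.gmatch(page.content or "", "<[sS][cC][rR][iI][pP][tT][^>]-[sS][rR][cC]%s*=%s*[\"']([^\"']+)[\"']") do
      scripts[#scripts + 1] = src
    end
  end
  return scripts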
end