---
-- Implements an HTTP spider for web crawling.
--
-- The main function of this library is to provide a mechanism for web crawling.
--
-- The crawl function crawls a website and returns a list containing
-- all the local links found and page information such as:
-- * file extension - File extension, if available.
-- * file size - Size in bytes of the page.
-- * content type - Response's content type.
-- * checksum - Checksum of the content. Useful when comparing changes.
-- * cache filename - Temporary local filename where the page content is stored.
--
-- @args httpspider.useragent User-Agent field sent with all the HTTP requests. Default value: "Mozilla 6.0/Nmap httpspider"
-- @args httpspider.allowremote Turn on to allow the spider to crawl outside the parent website. Default value: false
-- @args httpspider.cachecontent Turn on to write cache files containing the content of all crawled pages. Default value: false
--
--@author Paulino Calderon 05/2011
--@copyright Same as Nmap--See http://nmap.org/book/man-legal.html
--
-- TODO
--- Code all missing helper functions
--- HTTPS support
--- Add proxy support
--
local DEFAULT_USER_AGENT = "Mozilla 6.0/Nmap httpspider"
local nmap = require "nmap"
local stdnse = require "stdnse"
local http = require "http"
local string = require "string"
module(... or "httpspider", package.seeall)
--Set options for library
local OPT_USER_AGENT = stdnse.get_script_args('httpspider.useragent') or DEFAULT_USER_AGENT
local OPT_ALLOW_REMOTE = stdnse.get_script_args('httpspider.allowremote') or false
local OPT_CACHE_CONTENT = stdnse.get_script_args('httpspider.cachecontent') or false
--Locals holding unvisited and visited links
local link_list, visited_links = {}, {}
--Crawls the given URL until it finds all local links
--@return Table of crawled pages and their information
local function crawl(uri, options)
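  -- Sketch only: one possible breadth-first crawl loop, not the final design.
  -- Assumptions: options carries host and port in the form accepted by
  -- http.get, uri and all queued links are paths on the target host, and the
  -- helpers used here are in scope (a finished version would define crawl
  -- after them).
  link_list, visited_links = {}, {}
  local queue = { uri }
  while #queue > 0 do
    local url = table.remove(queue, 1)
    if not visited_links[url] then
      visited_links[url] = true
      local page = download_page(options.host, options.port, url)
      for _, link in ipairs(get_href_links(page.content or "")) do
        -- Stay inside the parent website unless httpspider.allowremote is set
        if OPT_ALLOW_REMOTE or not is_url_absolute(link) then
          queue[#queue + 1] = link
        end
      end
    end
  end
  return link_list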
end
--Finds redirects in the response
--@return True if a redirect is found
local function find_redirect(header)
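  -- Sketch only: treats the presence of a Location header as a redirect.
  -- Assumes header is the response.header table returned by http.get.
  return header ~= nil and header["location"] ~= nil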
end
--Checks if a URL is an absolute address
--@return True if the URL starts with "http://" or "https://"
local function is_url_absolute(url)
  if string.find(url, "^https?://") then
    return true
  else
    return false
  end
end
--Returns base URL
--@return Base URL of address
local function get_base_url(url)
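  -- Sketch only: captures "scheme://host[:port]" with a string pattern;
  -- assumes an absolute URL and returns nil when the pattern does not match.
  return string.match(url, "^(https?://[^/]+)")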
end
--Parses the href attribute of the tags inside the given string
--@return list of href links
local function get_href_links(body)
  local href_links = {}
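  -- Sketch only: plain pattern matching over the document body; a real HTML
  -- parser would handle more markup variants than this does.
  for link in string.gmatch(body, "[hH][rR][eE][fF]%s*=%s*[\"']([^\"']+)[\"']") do
    href_links[#href_links + 1] = link
  end
  return href_links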
end
--Downloads a page and processes its information
--@return Table containing all the page information
local function download_page(host, port, url)
  local page_resp = http.get(host, port, url)
  -- Process & store the page
  link_list[#link_list + 1] = {["uri"]=url, ["status"]=page_resp.status, ["checksum"]="",
    ["ext"]="", ["type"]=page_resp.header["content-type"], ["content"]=page_resp.body}
  return link_list[#link_list]
end
--Returns a list with all the crawled pages and their information
--@return Table of crawled pages
local function get_page_list()
return link_list
end
--Returns the string between two delimiters
--@return String found between the delimiters
local function return_between(str, start_del, end_del)
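  -- Sketch only: plain-text (non-pattern) search for the first occurrence of
  -- each delimiter; returns nil if either delimiter is missing.
  local _, s = string.find(str, start_del, 1, true)
  if not s then return nil end
  local e = string.find(str, end_del, s + 1, true)
  if not e then return nil end
  return string.sub(str, s + 1, e - 1)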
end
--Returns an array of strings found between the given delimiters
--@return Array of strings found between the delimiters
local function return_array_between(str, start_del, end_del)
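  -- Sketch only: repeats the same plain-text search as return_between and
  -- collects every occurrence.
  local results = {}
  local init = 1
  while true do
    local _, s = string.find(str, start_del, init, true)
    if not s then break end
    local e1, e2 = string.find(str, end_del, s + 1, true)
    if not e1 then break end
    results[#results + 1] = string.sub(str, s + 1, e1 - 1)
    init = e2 + 1
  end
  return results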
end
--Returns a list of all images found in the website
--@return List of images found in document
local function get_image_files()
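  -- Sketch only: scans the content of every crawled page for <img> src
  -- attributes; assumes crawl has already populated link_list.
  local images = {}
  for _, page in ipairs(link_list) do
    for src in string.gmatch(page.content or "", "<[iI][mM][gG][^>]-[sS][rR][cC]%s*=%s*[\"']([^\"']+)[\"']") do
      images[#images + 1] = src
    end
  end
  return images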
end
--Returns a list of all javascript files found in the website
--@return List of js files in document
local function get_javascript_files()
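  -- Sketch only: scans the content of every crawled page for <script> src
  -- attributes; assumes crawl has already populated link_list.
  local scripts = {}
  for _, page in ipairs(link_list) do
    for src in string.gmatch(page.content or "", "<[sS][cC][rR][iI][pP][tT][^>]-[sS][rR][cC]%s*=%s*[\"']([^\"']+)[\"']") do
      scripts[#scripts + 1] = src
    end
  end
  return scripts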
end