description = [[ Crawls through archive.org and extracts URLs from previous versions of the target website. It is useful for discovering hidden pages that were used in the past but still exist on the target. It also gives an overview of the website through time. The script crawls through one previous version of the website for each archived year. When it encounters a URL, it checks if it still exists on the target website and adds it to the list. It will return the archived version (along with its links) only if it contains newly discovered URLs. Note that this script is pretty intrusive for both archive.org and the target website. Use maxyears and singleyears options to limit the crawling operations. ]]

---
-- @usage nmap -p80 --script http-archive.nse
--
-- @args http-archive.maxyears The maximum number of archived years to search
-- through. The years start from the past. For example, if a webpage is being
-- archived since 2000 and we set the value of maxyears to 10, the script will
-- search through the archived versions (one per year) until 2010.
-- Default: 15
--
-- @output
-- PORT   STATE SERVICE REASON
-- 80/tcp open  http    syn-ack
-- | http-archive
-- | web.archive.org/web/20070108230550/http://www.example.com/
-- |
-- | Dead links:
-- |   example.com/index.cfm
-- |   example.com/contacts
-- |
-- | web.archive.org/web/20080102194330/http://www.example.com/
-- |
-- | Alive links:
-- |   example.com/info.html
-- |
-- | Dead links:
-- |_  example.com/im
---

author = {'George Chatzisofroniou'}
license = "Same as Nmap--See http://nmap.org/book/man-legal.html"
categories = {"intrusive", "external", "discovery"}

local nmap = require "nmap"
local http = require "http"
local httpspider = require "httpspider"
local shortport = require "shortport"
local stdnse = require "stdnse"
local table = require "table"
local string = require "string"

portrule = shortport.port_or_service( {80, 443}, {"http", "https"}, "tcp", "open")

local ARCHIVE_SITE = "web.archive.org"
-- Captures a 4-digit year preceded by whitespace in archive.org's response body.
local ARCHIVE_YEAR = "%s(%d%d%d%d)"

-- Strips a leading "www." from a hostname.
-- string.gsub returns (string, count); the extra parentheses truncate the
-- result to a single value so the helper is safe in multiple-value contexts.
local removewww = function(url)
  return (string.gsub(url, "^www%.", ""))
end

action = function(host, port)

  local maxyears = stdnse.get_script_args("http-archive.maxyears") or 15
  local singleyears = stdnse.get_script_args("http-archive.singleyears") or nil

  -- host.targetname is nil when the scan was given an IP address with no
  -- hostname; concatenating nil below would raise, so bail out early.
  if not host.targetname then
    return nil
  end

  local urls = {}
  local target = "/web/*/" .. host.targetname

  -- Only one instantiation of the script should ping archive.org at once.
  local mutex = nmap.mutex("http-archive")
  mutex "lock"

  -- Get the first archived year.
  local response = http.get(ARCHIVE_SITE, 80, target)

  -- Guard against a failed request: http.get can return a response with no
  -- body, and string.match(nil, ...) would raise.
  if not response or not response.body then
    mutex "done"
    return nil
  end

  local year = string.match(response.body, ARCHIVE_YEAR)

  -- If you can't find it, the target website is not archived.
  if not year then
    -- Release the mutex explicitly on this early return (NSE would release
    -- it when the thread finishes, but being explicit unblocks other
    -- instances immediately).
    mutex "done"
    return "archive.org doesn't contain any archived version of this website."
  end

  local yearscount = 0
  local curyear = tonumber(os.date("%Y", nmap.clock()))
  local oldurl = ""
  local checkedurls = {}
  local targetname = removewww(host.targetname)
  -- 'url' was an accidental global in the original; keep it local to the
  -- function so the loop below doesn't leak state between script instances.
  local index, k, url

  while true do

    if singleyears then

      -- NOTE(review): assumes singleyears is a table of years; verify how the
      -- script-arg is parsed. 'index' is never advanced in the visible code —
      -- presumably done in the truncated portion of this function; confirm.
      k, url = next(singleyears, index)
      if (k == nil) then
        break
      end

      -- If the table contains years.
--[[ NOTE(review): the line below is corrupted and truncated. The extraction
that produced this file appears to have stripped everything between '<' and
'>' (HTML-tag removal), swallowing the Lua pattern strings passed to
string.match (most likely '<a href = "..."' link-extraction patterns) and
fusing unrelated statements together — e.g.
'url = string.match(response.body, " curyear and yearscount > maxyears then break end'
is an unterminated string followed by the tail of a dropped 'if' statement.
The text also ends mid-statement at the end of the file. This section does
not parse and must be restored from the upstream http-archive.nse script; do
not attempt to repair it in place. ]]
if type(tonumber(url)) == "number" then target = "/web/" .. url .. "1000000000*/" .. host.targetname local response = http.get(ARCHIVE_SITE, 80, target) -- Get the first archived version for this year. url = string.match(response.body, " curyear and yearscount > maxyears then break end stdnse.print_debug(1, "Checking archived version in " .. year) target = "/web/" .. year .. "1000000000*/" .. host.targetname response = http.get(ARCHIVE_SITE, 80, target) if not response.body then break end -- Get the first archived version for this year. url = string.match(response.body, ".*