---
-- Formula functions for various calculations.
--
-- The library lets scripts to use common mathematical functions to compute percentages,
-- averages, entropy, randomness and other calculations. Scripts that generate statistics 
-- and metrics can also make use of this library.
--
-- Functions included:
--
-- <code>calcPwdEntropy</code> - Calculate the entropy of a password.  A random
-- password's information entropy, H, is given by the formula: H = L * (logN) / (log2), 
-- where N is the number of possible symbols and L is the number of symbols in the 
-- password. Based on https://en.wikipedia.org/wiki/Password_strength 
--
-- <code>looksRandom</code> - Returns true if the value looks random.
--
-- <code>EditDistance</code> - Finds the edit distance between two strings or tables. 
-- Edit distance is the minimum number of edits needed to transform one string or table 
-- into the other. The implementation was taken from Brett Smith: https://gist.github.com/Nayruden/427389
--
--  Parameters:
--        s - A *string* or *table*.
--        t - Another *string* or *table* to compare against s.
--        lim - An *optional number* to limit the function to a maximum edit distance. If specified
--            and the function detects that the edit distance is going to be larger than limit, limit
--            is returned immediately.
--            
--    Returns:
--    
--        A *number* specifying the minimum edits it takes to transform s into t or vice versa. Will
--            not return a higher number than lim, if specified.
--            
--    Example:
-- 
--        :editDistance( "Tuesday", "Teusday" ) -- One transposition.
--        :editDistance( "kitten", "sitting" ) -- Two substitutions and a deletion.
-- 
--        returns...
-- 
--        :1
--        :3
--            
--    Notes:
--    
--        * Complexity is O( (#t+1) * (#s+1) ) when lim isn't specified.
--        * This function can be used to compare array-like tables as easily as strings.
--        * The algorithm used is Damerau–Levenshtein distance, which calculates edit distance based
--            off number of subsitutions, additions, deletions, and transpositions.
--        * Source code for this function is based off the Wikipedia article for the algorithm
--            <http://en.wikipedia.org/w/index.php?title=Damerau%E2%80%93Levenshtein_distance&oldid=351641537>.
--        * This function is case sensitive when comparing strings.
--        * If this function is being used several times a second, you should be taking advantage of
--            the lim parameter.
--        * Using this function to compare against a dictionary of 250,000 words took about 0.6
--            seconds on my machine for the word "Teusday", around 10 seconds for very poorly 
--            spelled words. Both tests used lim. 
--
-- @copyright Same as Nmap--See http://nmap.org/book/man-legal.html
---

local stdnse = require "stdnse"
local table = require "table"

_ENV = stdnse.module("formulas", stdnse.seeall)

calcPwdEntropy = function(value)

    local total, hasdigit, haslower, hasupper, hasspaces = 0, 0, 0, 0, false

    if string.find(value, "%d") then
        hasdigit = 1
    end
    if string.find(value, "%l") then
        haslower = 1
    end
    if string.find(value, "%u") then
        hasupper = 1
    end
    if string.find(value, ' ') then
        hasspaces = true
    end

    -- The values 10, 26, 26 have been taken from Wikipedia's entropy table.
    local total = hasdigit * 10 + hasupper * 26 + haslower * 26
    local entropy = math.floor(math.log(total) * #value / math.log(2))

    return entropy
end

-- A chi-square test for the null hypothesis that the members of data are drawn
-- from a uniform distribution over num_cats categories.
local function chi2(data, num_cats)
    local bins = {}
    local x2, delta, expected

    for _, x in ipairs(data) do
        bins[x] = bins[x] or 0
        bins[x] = bins[x] + 1
    end

    expected = #data / num_cats
    x2 = 0.0
    for _, n in pairs(bins) do
        delta = n - expected
        x2 = x2 + delta * delta
    end
    x2 = x2 / expected

    return x2
end

-- Split a string into a sequence of bit strings of the given length.
-- splitbits("abc", 5) --> {"01100", "00101", "10001", "00110"}
-- Any short final group is omitted.
local function splitbits(s, n)
    local seq

    local _, bits = bin.unpack("B" .. #s, s)
    seq = {}
    for i = 1, #bits - n, n do
        seq[#seq + 1] = bits:sub(i, i + n - 1)
    end

    return seq
end

-- chi-square cdf table at 0.95 confidence for different degrees of freedom.
-- >>> import scipy.stats, scipy.optimize
-- >>> scipy.optimize.newton(lambda x: scipy.stats.chi2(dof).cdf(x) - 0.95, dof)
local CHI2_CDF = {
    [3] = 7.8147279032511738,
    [15] = 24.99579013972863,
    [255] = 293.2478350807001,
}

function looksRandom(data)
    local x2

    -- Because our sample is so small (only 16 bytes), do a chi-square
    -- goodness of fit test across groups of 2, 4, and 8 bits. If using only
    -- 8 bits, for example, any sample whose bytes are all different would
    -- pass the test. Using 2 bits will tend to catch things like pure
    -- ASCII, where one out of every four samples never has its high bit
    -- set.

    x2 = chi2(splitbits(data, 2), 4)
    if x2 > CHI2_CDF[3] then
        return false
    end

    x2 = chi2(splitbits(data, 4), 16)
    if x2 > CHI2_CDF[15] then
        return false
    end

    x2 = chi2({string.byte(data, 1, -1)}, 256)
    if x2 > CHI2_CDF[255] then
        return false
    end

    return true
end

--[[
    Function: editDistance
 
    Finds the edit distance between two strings or tables. Edit distance is the minimum number of
    edits needed to transform one string or table into the other.
    
           
    Revisions:
 
        v1.00 - Initial.
]]
function editDistance( s, t, lim )
    local s_len, t_len = #s, #t -- Calculate the sizes of the strings or arrays
    if lim and math.abs( s_len - t_len ) >= lim then -- If sizes differ by lim, we can stop here
        return lim
    end
    
    -- Convert string arguments to arrays of ints (ASCII values)
    if type( s ) == "string" then
        s = { string.byte( s, 1, s_len ) }
    end
    
    if type( t ) == "string" then
        t = { string.byte( t, 1, t_len ) }
    end
    
    local min = math.min -- Localize for performance
    local num_columns = t_len + 1 -- We use this a lot
    
    local d = {} -- (s_len+1) * (t_len+1) is going to be the size of this array
    -- This is technically a 2D array, but we're treating it as 1D. Remember that 2D access in the
    -- form my_2d_array[ i, j ] can be converted to my_1d_array[ i * num_columns + j ], where
    -- num_columns is the number of columns you had in the 2D array assuming row-major order and
    -- that row and column indices start at 0 (we're starting at 0).
    
    for i=0, s_len do
        d[ i * num_columns ] = i -- Initialize cost of deletion
    end
    for j=0, t_len do
        d[ j ] = j -- Initialize cost of insertion
    end
    
    for i=1, s_len do
        local i_pos = i * num_columns
        local best = lim -- Check to make sure something in this row will be below the limit
        for j=1, t_len do
            local add_cost = (s[ i ] ~= t[ j ] and 1 or 0)
            local val = min(
                d[ i_pos - num_columns + j ] + 1,                               -- Cost of deletion
                d[ i_pos + j - 1 ] + 1,                                         -- Cost of insertion
                d[ i_pos - num_columns + j - 1 ] + add_cost                     -- Cost of substitution, it might not cost anything if it's the same
            )
            d[ i_pos + j ] = val
            
            -- Is this eligible for tranposition?
            if i > 1 and j > 1 and s[ i ] == t[ j - 1 ] and s[ i - 1 ] == t[ j ] then
                d[ i_pos + j ] = min(
                    val,                                                        -- Current cost
                    d[ i_pos - num_columns - num_columns + j - 2 ] + add_cost   -- Cost of transposition
                )
            end
            
            if lim and val < best then
                best = val
            end
        end
        
        if lim and best >= lim then
            return lim
        end
    end
    
    return d[ #d ]
end

return _ENV