package scrub
|
|
|
|
import (
|
|
"math"
|
|
"strings"
|
|
)
|
|
|
|
// Default settings used by the entropy-based scrubbers.
const (
	// DefaultWhitespace lists the separator characters used to break the
	// input string into tokens; each token is scrubbed on its own.
	DefaultWhitespace = " \t\r\n="

	// DefaultEntropyThreshold is picked so that most UNIX shell commands stay
	// below it while passwords of sufficient complexity reach it; use with
	// care!
	DefaultEntropyThreshold = 3.75

	// IdealEntropyCorrection is the fraction of the ideal entropy a string
	// has to reach before it is considered for scrubbing.
	IdealEntropyCorrection = 0.75
)
|
|
|
|
// EntropyScrubber breaks its input into fields on the Whitespace runes and
// compares the entropy of every field against the configured threshold.
type EntropyScrubber struct {
	// Whitespace holds the runes that separate fields.
	Whitespace []rune

	// Threshold is the entropy limit used for scrubbing. When it is not set,
	// the ideal entropy for the length of the input string is calculated and
	// used instead.
	Threshold float64

	// Correction scales the ideal entropy when Threshold is not set. When
	// Correction is not set either, IdealEntropyCorrection is used.
	Correction float64
}
|
|
|
|
// Scrub string s by first splitting the input based on Whitespace and then analyzing the entropy of each field.
|
|
func (es EntropyScrubber) Scrub(s string) string {
|
|
var (
|
|
whitespace = es.Whitespace
|
|
threshold = es.Threshold
|
|
correction = es.Correction
|
|
)
|
|
if len(whitespace) == 0 {
|
|
whitespace = []rune(DefaultWhitespace)
|
|
}
|
|
if threshold == 0 {
|
|
if correction == 0 {
|
|
correction = IdealEntropyCorrection
|
|
}
|
|
threshold = idealEntropy(len(s)) * correction
|
|
}
|
|
|
|
f := splitAfter(s, []rune(whitespace))
|
|
for i, p := range f {
|
|
if e := entropy(p); e >= threshold {
|
|
f[i] = replaceNoWhitespace(f[i], Replacement, whitespace)
|
|
}
|
|
}
|
|
return strings.Join(f, "")
|
|
}
|
|
|
|
// Entropy scrubs all high-entropy strings from s based on the ideal entropy for a string of len(s).
|
|
func Entropy(s string) string {
|
|
return EntropyWithThreshold(s, idealEntropy(len(s))*IdealEntropyCorrection)
|
|
}
|
|
|
|
// idealEntropy returns the Shannon entropy, in bits, of a string of length n
// in which each of the n symbols has the same probability 1/n. A length of
// zero yields 0.
func idealEntropy(n int) float64 {
	if n == 0 {
		return 0
	}
	var (
		count = float64(n)
		p     = 1.0 / count
	)
	// Sum of n identical -p*ln(p) terms, still expressed in natural units.
	nats := -1.0 * count * p * math.Log(p)
	// Dividing by ln(2) converts the result to base 2.
	return nats / math.Log(2.0)
}
|
|
|
|
// EntropyWithThreshold is like Entropy with a custom threshold.
|
|
func EntropyWithThreshold(s string, threshold float64) string {
|
|
f := strings.Fields(s)
|
|
for i, p := range f {
|
|
if entropy(strings.TrimSpace(p)) >= threshold {
|
|
f[i] = replaceNoWhitespace(f[i], Replacement, []rune(DefaultWhitespace))
|
|
}
|
|
}
|
|
return strings.Join(f, " ")
|
|
}
|
|
|
|
// entropy returns the Shannon entropy of s in bits. The string is treated as
// a sequence of bytes, not runes; the empty string yields 0.
//
// With n = len(s) and c(b) the number of occurrences of byte value b, the
// result is log2(n) - (sum over b of c(b)*log2(c(b)))/n, which is the usual
// -sum(p*log2(p)) with p = c(b)/n.
func entropy(s string) float64 {
	if len(s) == 0 {
		return 0
	}

	// Count the occurrences of every byte value. A fixed-size array is used
	// instead of a map so that the summation below always runs in the same
	// order: floating-point addition is not associative, and Go randomizes
	// map iteration order, so a map could give results that differ in the
	// last bits between two calls with the same input.
	var counts [256]int
	for i := 0; i < len(s); i++ {
		counts[s[i]]++
	}

	var sum float64
	for _, n := range counts {
		// Absent bytes must be skipped (log2(0) is -Inf) and bytes that occur
		// once contribute 1*log2(1) == 0.
		if n > 1 {
			c := float64(n)
			sum += c * math.Log2(c)
		}
	}

	total := float64(len(s))
	return math.Log2(total) - sum/total
}
|
|
|
|
// splitAfter cuts s into consecutive pieces, each one ending after the run of
// separator runes that follows it. No bytes are dropped: concatenating the
// returned pieces yields s again. An empty s yields no pieces; an s without
// any separator yields s as the only piece.
//
// For example, with separators " =" the input "key=value next" is cut into
// "key=", "value " and "next".
func splitAfter(s string, separators []rune) (out []string) {
	if s == "" {
		return nil
	}

	var (
		seps  = string(separators) // converted once instead of once per rune
		start int                  // byte offset at which the current piece begins
		inSep bool                 // whether the previous rune was a separator
	)
	// Ranging over the string yields byte offsets, so the slicing below stays
	// correct for multi-byte UTF-8 input.
	for i, r := range s {
		isSep := strings.ContainsRune(seps, r)
		if inSep && !isSep {
			// A separator run has just ended: close the piece in front of i.
			out = append(out, s[start:i])
			start = i
		}
		inSep = isSep
	}

	// Whatever remains (a trailing token of any length, including a single
	// rune, or the whole separator-free input) forms the last piece.
	return append(out, s[start:])
}
|
|
|
|
// replaceNoWhitespace rewrites s rune by rune: runes listed in whitespace are
// copied through unchanged, every other rune is substituted by the next
// unused rune of r. Once r is used up, the remaining non-whitespace runes of
// s are dropped.
func replaceNoWhitespace(s, r string, whitespace []rune) string {
	var (
		keep        = string(whitespace)
		replacement = []rune(r)
		next        int
		b           strings.Builder
	)
	b.Grow(len(s))
	for _, c := range s {
		switch {
		case strings.ContainsRune(keep, c):
			b.WriteRune(c)
		case next < len(replacement):
			b.WriteRune(replacement[next])
			next++
		}
	}
	return b.String()
}
|