Data scrubbing options for protecting sensitive data https://godoc.org/maze.io/x/scrub
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

151 lines
3.7 KiB

package scrub
import (
"math"
"strings"
)
// Defaults for the Entropy scrubber.
const (
// DefaultWhitespace holds the separator characters used to split the given input string into
// tokens that get scrubbed individually: space, tab, carriage return, newline and '='.
DefaultWhitespace = " \t\r\n="
// DefaultEntropyThreshold is chosen to not match most UNIX shell commands, but it does match
// passwords with sufficient complexity; use with care!
// NOTE(review): nothing in the visible part of this file reads this constant; presumably callers
// assign it to EntropyScrubber.Threshold or pass it to EntropyWithThreshold — confirm.
DefaultEntropyThreshold = 3.75
// IdealEntropyCorrection is the fraction of the ideal entropy (see idealEntropy) that a token's
// entropy must reach to be considered for scrubbing when no explicit threshold is configured.
IdealEntropyCorrection = 0.75
)
// EntropyScrubber splits the input by Whitespace and compares each part's entropy with the configured
// threshold. The zero value is usable: every field falls back to a default inside Scrub.
type EntropyScrubber struct {
// Whitespace holds the runes that separate tokens. If empty, Scrub uses DefaultWhitespace.
Whitespace []rune
// Threshold is the entropy at or above which a token is scrubbed. If zero, Scrub derives the
// threshold from the ideal entropy for the length of the whole input string, scaled by Correction.
Threshold float64
// Correction is the factor applied to the ideal entropy when Threshold is zero; it is ignored
// otherwise. If zero, IdealEntropyCorrection is used.
Correction float64
}
// Scrub returns s with every high-entropy field masked.
//
// The input is cut into fields with splitAfter, so each field carries the
// separator run that follows it and joining the fields with "" restores the
// original layout. A field whose entropy is at or above the effective
// threshold has its non-separator characters replaced by Replacement.
//
// Defaults are resolved per call:
//   - no Whitespace configured: DefaultWhitespace is used;
//   - Threshold == 0: the threshold becomes idealEntropy(len(s)) scaled by
//     Correction, or by IdealEntropyCorrection when Correction is also 0.
func (es EntropyScrubber) Scrub(s string) string {
	separators := es.Whitespace
	if len(separators) == 0 {
		separators = []rune(DefaultWhitespace)
	}

	limit := es.Threshold
	if limit == 0 {
		factor := es.Correction
		if factor == 0 {
			factor = IdealEntropyCorrection
		}
		// The ideal entropy is based on the byte length of the whole input,
		// not on the length of the individual fields.
		limit = idealEntropy(len(s)) * factor
	}

	fields := splitAfter(s, separators)
	for i := range fields {
		// Entropy is measured on the field as returned by splitAfter,
		// i.e. including any separators attached to it.
		if entropy(fields[i]) >= limit {
			fields[i] = replaceNoWhitespace(fields[i], Replacement, separators)
		}
	}
	return strings.Join(fields, "")
}
// Entropy scrubs all high-entropy strings from s based on the ideal entropy for a string of len(s).
func Entropy(s string) string {
return EntropyWithThreshold(s, idealEntropy(len(s))*IdealEntropyCorrection)
}
// idealEntropy calculates the ideal Shannon entropy, in bits, of a string of
// length n: the entropy of n symbols that each occur with probability 1/n.
// It evaluates -n * p * ln(p) / ln(2) with p = 1/n and returns 0 for n == 0.
func idealEntropy(n int) float64 {
	if n == 0 {
		return 0
	}
	size := float64(n)
	p := 1.0 / size
	// Dividing the natural logarithm by ln(2) converts the result to bits.
	return -size * p * math.Log(p) / math.Log(2.0)
}
// EntropyWithThreshold is like Entropy with a custom threshold.
func EntropyWithThreshold(s string, threshold float64) string {
f := strings.Fields(s)
for i, p := range f {
if entropy(strings.TrimSpace(p)) >= threshold {
f[i] = replaceNoWhitespace(f[i], Replacement, []rune(DefaultWhitespace))
}
}
return strings.Join(f, " ")
}
// entropy calculates the Shannon entropy of string s in bits per byte.
//
// The string is treated as a sequence of bytes, not runes. With c_b the number
// of occurrences of byte value b and n = len(s), the result is
//
//	log2(n) - (sum over b of c_b * log2(c_b)) / n
//
// which equals -sum(p_b * log2(p_b)) for p_b = c_b / n. The empty string has
// entropy 0.
//
// Byte frequencies are kept in a fixed-size table that is walked in index
// order, so the floating-point sum is accumulated in the same order on every
// call and the result is reproducible (ranging over a map would visit the
// counts in random order and could change the last bits of the result).
func entropy(s string) float64 {
	size := len(s)
	if size == 0 {
		return 0
	}

	// Count how often each byte value occurs in s.
	var counts [256]int
	for i := 0; i < size; i++ {
		counts[s[i]]++
	}

	var sum float64
	for _, n := range counts {
		if n == 0 {
			// Absent byte values contribute nothing; skipping them also
			// avoids evaluating 0 * log2(0), which is NaN.
			continue
		}
		c := float64(n)
		sum += c * math.Log2(c)
	}

	total := float64(size)
	return math.Log2(total) - sum/total
}
// splitAfter cuts s into consecutive tokens, ending each token after a run of
// separator runes. Every token therefore consists of zero or more
// non-separator characters followed by the separators that trail them, and
// concatenating the returned tokens always reproduces s exactly.
//
// A leading run of separators forms a token of its own, and a string without
// any separators is returned as a single token. The result is nil for the
// empty string.
//
// Offsets are tracked in bytes while iterating over runes, so multi-byte UTF-8
// input is sliced correctly, and the final token is always emitted regardless
// of its length.
func splitAfter(s string, separators []rune) (out []string) {
	if s == "" {
		return nil
	}

	var (
		seps  = string(separators)
		start int  // byte offset where the current token begins
		inSep bool // whether the previous rune was a separator
	)
	for i, r := range s {
		isSep := strings.ContainsRune(seps, r)
		// A token ends where a separator run is followed by a
		// non-separator rune.
		if inSep && !isSep {
			out = append(out, s[start:i])
			start = i
		}
		inSep = isSep
	}
	// Whatever remains (with or without trailing separators) is the last token.
	return append(out, s[start:])
}
// replaceNoWhitespace masks s with the replacement r while preserving layout.
//
// Runes of s that appear in whitespace are copied to the result unchanged.
// Every other rune is substituted, in order, by the next rune of r; once r is
// used up the remaining non-whitespace runes of s are dropped. The masked part
// of the result is thus never longer than r.
func replaceNoWhitespace(s, r string, whitespace []rune) string {
	var (
		keep = string(whitespace)
		mask = []rune(r)
		used int // number of runes of mask consumed so far
		b    strings.Builder
	)
	for _, c := range s {
		switch {
		case strings.ContainsRune(keep, c):
			b.WriteRune(c)
		case used < len(mask):
			b.WriteRune(mask[used])
			used++
		}
	}
	return b.String()
}