Checkpoint

This commit is contained in:
2025-10-01 15:37:55 +02:00
parent 4a60059ff2
commit 03352e3312
31 changed files with 2611 additions and 384 deletions

99
dataset/domain.go Normal file
View File

@@ -0,0 +1,99 @@
package dataset
import (
"strings"
"github.com/miekg/dns"
)
// DomainTree stores domain names as a tree of DNS labels. Add inserts the
// labels of a name starting with the rightmost one, so Contains can test a
// name against every stored domain suffix in a single walk from the root.
type DomainTree struct {
// root is the entry node; its children are keyed by the rightmost label.
root *domainTreeNode
}
// domainTreeNode is a single label position inside a DomainTree.
type domainTreeNode struct {
// leaf maps the next label (one step to the left in the name) to its node.
// It is allocated lazily by Add and may be nil.
leaf map[string]*domainTreeNode
// isEnd reports that the labels on the path from the root to this node
// form a complete domain that was passed to Add.
isEnd bool
}
// NewDomainList returns a DomainTree populated with the given domains.
// Each entry is passed to Add, which ignores names that normalize to nothing
// (such as the empty string).
func NewDomainList(domains ...string) *DomainTree {
	list := &DomainTree{
		root: &domainTreeNode{leaf: map[string]*domainTreeNode{}},
	}
	for i := range domains {
		list.Add(domains[i])
	}
	return list
}
// Add inserts domain into the tree so that Contains reports true for the
// domain itself and for every name below it.
//
// The name is normalized first (trimmed, lower-cased, made fully qualified).
// Names that normalize to the empty string, or that yield no usable labels,
// are ignored. Add also works on a zero-value DomainTree.
func (tree *DomainTree) Add(domain string) {
	domain = normalizeDomain(domain)
	if domain == "" {
		return
	}
	labels := dns.SplitDomainName(domain)
	if len(labels) == 0 {
		return
	}

	// Allow a DomainTree that was not built through NewDomainList.
	if tree.root == nil {
		tree.root = &domainTreeNode{}
	}

	// Walk from the rightmost label (the TLD) towards the leftmost one,
	// creating missing nodes on the way.
	node := tree.root
	for i := len(labels) - 1; i >= 0; i-- {
		label := labels[i]
		if label == "" {
			continue
		}
		if node.leaf == nil {
			node.leaf = make(map[string]*domainTreeNode)
		}
		child := node.leaf[label]
		if child == nil {
			child = &domainTreeNode{}
			node.leaf[label] = child
		}
		node = child
	}

	// If every label was skipped as empty we are still at the root. Marking
	// the root as an end node would make Contains match every name, so a
	// name with no real labels is dropped instead.
	if node == tree.root {
		return
	}
	node.isEnd = true
}
// Contains reports whether domain equals a stored domain or is a subdomain
// of one. The name is normalized the same way Add normalizes its input, so
// matching ignores case, surrounding whitespace and a trailing dot.
//
// A nil or zero-value DomainTree contains nothing, so the result of a map
// lookup that missed can be queried without a nil check.
func (tree *DomainTree) Contains(domain string) bool {
	if tree == nil || tree.root == nil {
		return false
	}
	domain = normalizeDomain(domain)
	if domain == "" {
		return false
	}
	labels := dns.SplitDomainName(domain)
	if len(labels) == 0 {
		return false
	}

	node := tree.root
	for i := len(labels) - 1; i >= 0; i-- {
		// A stored domain ends here and labels remain, so the queried name
		// is a subdomain of it.
		if node.isEnd {
			return true
		}
		// Indexing a nil map yields nil, which covers nodes without children.
		node = node.leaf[labels[i]]
		if node == nil {
			return false
		}
	}
	// All labels consumed: match only if a stored domain ends exactly here.
	return node.isEnd
}
// normalizeDomain returns domain in the canonical form used as tree input:
// surrounding whitespace removed, lower-cased and fully qualified. It returns
// the empty string when nothing is left after trimming.
func normalizeDomain(domain string) string {
	name := strings.TrimSpace(domain)
	name = strings.ToLower(name)
	// Drop one trailing dot so that "example.com" and "example.com." take the
	// same path; dns.Fqdn appends the dot again below.
	name = strings.TrimSuffix(name, ".")
	if name == "" {
		return ""
	}
	return dns.Fqdn(name)
}

5
dataset/domain_data.go Normal file
View File

@@ -0,0 +1,5 @@
package dataset
// Domains contains predefined domain lists, keyed by list name.
var Domains = map[string]*DomainTree{
// "example" holds three example.* domains.
"example": NewDomainList("example.org", "example.net", "example.com"),
}

276
dataset/domain_test.go Normal file
View File

@@ -0,0 +1,276 @@
package dataset
import (
"testing"
)
// TestDomainList checks DomainTree.Contains against a table of exact,
// suffix, normalization and negative cases. Every case builds its own tree
// from the listed domains.
func TestDomainList(t *testing.T) {
	tests := []struct {
		name     string
		domains  []string
		hostname string
		expected bool
	}{
		// Basic exact matches
		{name: "exact match", domains: []string{"example.com"}, hostname: "example.com", expected: true},
		{name: "exact match with subdomain in list", domains: []string{"api.example.com"}, hostname: "api.example.com", expected: true},

		// Suffix matching - if domain is in list, all subdomains should match
		{name: "subdomain matches parent domain", domains: []string{"example.com"}, hostname: "sub.example.com", expected: true},
		{name: "multiple subdomain levels match", domains: []string{"example.com"}, hostname: "deep.nested.sub.example.com", expected: true},
		{name: "subdomain matches intermediate domain", domains: []string{"api.example.com", "example.com"}, hostname: "sub.api.example.com", expected: true},

		// Multi-level TLDs
		{name: "co.uk domain exact match", domains: []string{"domain.co.uk"}, hostname: "domain.co.uk", expected: true},
		{name: "subdomain of co.uk domain", domains: []string{"domain.co.uk"}, hostname: "sub.domain.co.uk", expected: true},

		// Case sensitivity
		{name: "case insensitive match", domains: []string{"Example.COM"}, hostname: "example.com", expected: true},
		{name: "case insensitive hostname", domains: []string{"example.com"}, hostname: "EXAMPLE.COM", expected: true},

		// Trailing dots
		{name: "domain with trailing dot", domains: []string{"example.com."}, hostname: "example.com", expected: true},
		{name: "hostname with trailing dot", domains: []string{"example.com"}, hostname: "example.com.", expected: true},

		// Non-matches
		{name: "different TLD", domains: []string{"example.com"}, hostname: "example.org", expected: false},
		{name: "different domain", domains: []string{"example.com"}, hostname: "test.com", expected: false},
		{name: "partial match but not suffix", domains: []string{"example.com"}, hostname: "com", expected: false},
		{name: "empty hostname", domains: []string{"example.com"}, hostname: "", expected: false},

		// Multiple domains in list. The hostnames follow the case names: the
		// first entry of the list is test.org, the second is example.com.
		{name: "matches first domain in list", domains: []string{"test.org", "example.com"}, hostname: "test.org", expected: true},
		{name: "matches second domain in list", domains: []string{"test.org", "example.com"}, hostname: "example.com", expected: true},
		{name: "subdomain matches any domain in list", domains: []string{"test.org", "example.com"}, hostname: "sub.example.com", expected: true},

		// Edge cases
		{name: "empty domain list", domains: []string{}, hostname: "example.com", expected: false},
		{name: "invalid domain in list", domains: []string{""}, hostname: "example.com", expected: false},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			list := NewDomainList(tt.domains...)
			result := list.Contains(tt.hostname)
			if result != tt.expected {
				t.Errorf("Contains(%q) = %v, expected %v (domains: %v)",
					tt.hostname, result, tt.expected, tt.domains)
			}
		})
	}
}
// TestDomainList_Performance fills a tree with many distinct domains and
// checks that lookups still return the right answers. It asserts results
// only; it does not measure time.
func TestDomainList_Performance(t *testing.T) {
	const count = 1000

	// Build `count` distinct three-letter labels ("aaa", "baa", ...). A
	// single letter would repeat after 26 entries and leave the tree with
	// only 26 domains.
	domains := make([]string, 0, count+1)
	for i := 0; i < count; i++ {
		label := string([]byte{
			byte('a' + i%26),
			byte('a' + (i/26)%26),
			byte('a' + (i/676)%26),
		})
		domains = append(domains, label+".com")
	}
	domains = append(domains, "example.com") // Add our test domain

	list := NewDomainList(domains...)

	// These should be fast even with many domains
	if !list.Contains("example.com") {
		t.Error("Should match exact domain")
	}
	if !list.Contains("sub.example.com") {
		t.Error("Should match subdomain")
	}
	if list.Contains("notfound.com") {
		t.Error("Should not match unrelated domain")
	}
}
// TestDomainList_ComplexDomains queries one shared tree holding long and
// multi-level domains, covering exact hits, subdomain hits and misses.
func TestDomainList_ComplexDomains(t *testing.T) {
	tree := NewDomainList(
		"very.long.domain.name.with.many.labels.com",
		"example.co.uk",
		"sub.domain.example.com",
		"a.b.c.d.e.f.com",
	)

	type lookup struct {
		hostname string
		expected bool
	}
	cases := []lookup{
		// Stored names and names below them.
		{hostname: "very.long.domain.name.with.many.labels.com", expected: true},
		{hostname: "sub.very.long.domain.name.with.many.labels.com", expected: true},
		{hostname: "example.co.uk", expected: true},
		{hostname: "www.example.co.uk", expected: true},
		{hostname: "sub.domain.example.com", expected: true},
		{hostname: "another.sub.domain.example.com", expected: true},
		{hostname: "a.b.c.d.e.f.com", expected: true},
		{hostname: "x.a.b.c.d.e.f.com", expected: true},
		// Unrelated names and bare TLDs.
		{hostname: "not.matching.com", expected: false},
		{hostname: "com", expected: false},
		{hostname: "uk", expected: false},
	}

	for _, tc := range cases {
		t.Run(tc.hostname, func(t *testing.T) {
			if got := tree.Contains(tc.hostname); got != tc.expected {
				t.Errorf("Contains(%q) = %v, expected %v", tc.hostname, got, tc.expected)
			}
		})
	}
}
// TestDomainList_SpecialCases covers labels that are easy to mishandle: a
// literal asterisk, and labels containing hyphens or leading digits.
func TestDomainList_SpecialCases(t *testing.T) {
	t.Run("domain with asterisk treated literally", func(t *testing.T) {
		tree := NewDomainList("*.example.com")

		// The asterisk should be treated as a literal label, not a wildcard
		literalHit := tree.Contains("*.example.com")
		if !literalHit {
			t.Error("Asterisk should be treated literally, not as wildcard")
		}
		wildcardHit := tree.Contains("test.example.com")
		if wildcardHit {
			t.Error("Should not match subdomain with literal asterisk domain")
		}
	})

	t.Run("domains with hyphens and numbers", func(t *testing.T) {
		tree := NewDomainList("test-123.example.com", "123abc.org")

		// Each hostname must be found; failure carries the paired message.
		mustMatch := []struct {
			hostname string
			failure  string
		}{
			{"test-123.example.com", "Should match domain with hyphens and numbers"},
			{"sub.test-123.example.com", "Should match subdomain of hyphenated domain"},
			{"123abc.org", "Should match domain starting with numbers"},
			{"www.123abc.org", "Should match subdomain of numeric domain"},
		}
		for _, want := range mustMatch {
			if !tree.Contains(want.hostname) {
				t.Error(want.failure)
			}
		}
	})
}
// BenchmarkDomainList measures Contains on a small, realistic tree. Each
// iteration performs five lookups mixing hits and misses.
func BenchmarkDomainList(b *testing.B) {
	tree := NewDomainList(
		"google.com",
		"github.com",
		"example.org",
		"sub.domain.com",
		"api.service.co.uk",
		"very.long.domain.name.example.com",
	)

	b.ResetTimer()
	for b.Loop() {
		tree.Contains("sub.example.org")            // subdomain of a stored name
		tree.Contains("api.github.com")             // subdomain of a stored name
		tree.Contains("nonexistent.com")            // miss
		tree.Contains("deep.nested.sub.domain.com") // deep subdomain
		tree.Contains("service.co.uk")              // parent of a stored name: miss
	}
}

52
dataset/network.go Normal file
View File

@@ -0,0 +1,52 @@
package dataset
import (
"net"
"github.com/yl2chen/cidranger"
)
// NetworkTree is a set of IP networks that can be tested for membership of a
// single address. It delegates storage and lookup to a cidranger.Ranger.
type NetworkTree struct {
// ranger holds the inserted networks; NewNetworkTree sets it to a
// path-compressed trie ranger.
ranger cidranger.Ranger
}
// MustNetworkTree is like NewNetworkTree but panics when any of the given
// CIDR strings cannot be added. It is meant for initializing package-level
// tables from literals that are known to be valid.
func MustNetworkTree(networks ...string) *NetworkTree {
	built, buildErr := NewNetworkTree(networks...)
	if buildErr != nil {
		panic(buildErr)
	}
	return built
}
// NewNetworkTree returns a NetworkTree containing the given networks, each
// written in CIDR notation. It stops at the first entry that AddCIDR rejects
// and returns that error together with a nil tree.
func NewNetworkTree(networks ...string) (*NetworkTree, error) {
	result := &NetworkTree{ranger: cidranger.NewPCTrieRanger()}
	for i := range networks {
		if err := result.AddCIDR(networks[i]); err != nil {
			return nil, err
		}
	}
	return result, nil
}
// Add inserts ipnet into the tree. A nil ipnet is ignored.
func (tree *NetworkTree) Add(ipnet *net.IPNet) {
	if ipnet == nil {
		return
	}
	entry := cidranger.NewBasicRangerEntry(*ipnet)
	// Add has no error return, so the result of Insert is discarded here.
	_ = tree.ranger.Insert(entry)
}
// AddCIDR parses cidr (for example "10.0.0.0/8" or "fc00::/7") and inserts
// the resulting network into the tree.
//
// It returns the parse error when cidr is not valid CIDR notation, or the
// error reported by the underlying ranger when the network cannot be
// inserted.
func (tree *NetworkTree) AddCIDR(cidr string) error {
	_, ipnet, err := net.ParseCIDR(cidr)
	if err != nil {
		return err
	}
	// Propagate the insert error instead of dropping it, so that callers
	// such as NewNetworkTree and MustNetworkTree notice a rejected network.
	return tree.ranger.Insert(cidranger.NewBasicRangerEntry(*ipnet))
}
// Contains reports whether ip lies inside any network stored in the tree.
// An error from the underlying ranger is discarded; only the boolean it
// returned alongside is passed on.
func (tree *NetworkTree) Contains(ip net.IP) bool {
	found, _ := tree.ranger.Contains(ip)
	return found
}

71
dataset/network_data.go Normal file
View File

@@ -0,0 +1,71 @@
package dataset
var (
	// bogonsIPv4 lists IPv4 prefixes that should not appear as source or
	// destination addresses on the public internet.
	bogonsIPv4 = []string{
		"0.0.0.0/8",          // "This" network
		"10.0.0.0/8",         // RFC1918 Private-use networks
		"100.64.0.0/10",      // Carrier-grade NAT
		"127.0.0.0/8",        // Loopback
		"169.254.0.0/16",     // Link local
		"172.16.0.0/12",      // RFC1918 Private-use networks
		"192.0.0.0/24",       // IETF protocol assignments
		"192.0.2.0/24",       // TEST-NET-1
		"192.168.0.0/16",     // RFC1918 Private-use networks
		"198.18.0.0/15",      // Network interconnect device benchmark testing
		"198.51.100.0/24",    // TEST-NET-2
		"203.0.113.0/24",     // TEST-NET-3
		"224.0.0.0/4",        // Multicast
		"240.0.0.0/4",        // Reserved for future use
		"255.255.255.255/32", // Limited broadcast
	}

	// bogonsIPv6 lists IPv6 bogon prefixes, followed by the 6to4 and Teredo
	// prefixes that embed the IPv4 bogons above.
	bogonsIPv6 = []string{
		"::/128",  // Node-scope unicast unspecified address
		"::1/128", // Node-scope unicast loopback address
		// NOTE(review): net.IP.To4 reports ::ffff:0:0 as an IPv4 address;
		// confirm that the ranger files this /96 prefix as IPv6 as intended.
		"::ffff:0:0/96", // IPv4-mapped addresses
		"::/96",         // IPv4-compatible addresses
		"100::/64",      // Remotely triggered black hole addresses
		"2001:10::/28",  // Overlay routable cryptographic hash identifiers (ORCHID)
		"2001:db8::/32", // Documentation prefix
		"3fff::/20",     // Documentation prefix
		"fc00::/7",      // Unique local addresses (ULA)
		"fe80::/10",     // Link-local unicast
		"fec0::/10",     // Site-local unicast (deprecated)
		"ff00::/8",      // Multicast (Note: ff0e:/16 is global scope and may appear on the global internet.)

		"2002::/24",           // 6to4 bogon (0.0.0.0/8)
		"2002:a00::/24",       // 6to4 bogon (10.0.0.0/8)
		"2002:7f00::/24",      // 6to4 bogon (127.0.0.0/8)
		"2002:a9fe::/32",      // 6to4 bogon (169.254.0.0/16)
		"2002:ac10::/28",      // 6to4 bogon (172.16.0.0/12)
		"2002:c000::/40",      // 6to4 bogon (192.0.0.0/24)
		"2002:c000:200::/40",  // 6to4 bogon (192.0.2.0/24)
		"2002:c0a8::/32",      // 6to4 bogon (192.168.0.0/16)
		"2002:c612::/31",      // 6to4 bogon (198.18.0.0/15)
		"2002:c633:6400::/40", // 6to4 bogon (198.51.100.0/24)
		"2002:cb00:7100::/40", // 6to4 bogon (203.0.113.0/24)
		"2002:e000::/20",      // 6to4 bogon (224.0.0.0/4)
		"2002:f000::/20",      // 6to4 bogon (240.0.0.0/4)
		"2002:ffff:ffff::/48", // 6to4 bogon (255.255.255.255/32)

		"2001::/40",             // Teredo bogon (0.0.0.0/8)
		"2001:0:a00::/40",       // Teredo bogon (10.0.0.0/8)
		"2001:0:7f00::/40",      // Teredo bogon (127.0.0.0/8)
		"2001:0:a9fe::/48",      // Teredo bogon (169.254.0.0/16)
		"2001:0:ac10::/44",      // Teredo bogon (172.16.0.0/12)
		"2001:0:c000::/56",      // Teredo bogon (192.0.0.0/24)
		"2001:0:c000:200::/56",  // Teredo bogon (192.0.2.0/24)
		"2001:0:c0a8::/48",      // Teredo bogon (192.168.0.0/16)
		"2001:0:c612::/47",      // Teredo bogon (198.18.0.0/15)
		"2001:0:c633:6400::/56", // Teredo bogon (198.51.100.0/24)
		"2001:0:cb00:7100::/56", // Teredo bogon (203.0.113.0/24)
		"2001:0:e000::/36",      // Teredo bogon (224.0.0.0/4)
		"2001:0:f000::/36",      // Teredo bogon (240.0.0.0/4)
		"2001:0:ffff:ffff::/64", // Teredo bogon (255.255.255.255/32)
	}

	// bogons is the IPv4 list followed by the IPv6 list. The capacity of the
	// first operand is capped at its length so append always copies into a
	// fresh array and can never write into bogonsIPv4's backing storage.
	bogons = append(bogonsIPv4[:len(bogonsIPv4):len(bogonsIPv4)], bogonsIPv6...)
)
// Networks contains predefined network lists, keyed by list name:
// "bogons" holds both address families, "bogons4" only the IPv4 prefixes and
// "bogons6" only the IPv6 prefixes. Building the map panics at package
// initialization if any predefined CIDR string is invalid.
var Networks = map[string]*NetworkTree{
	"bogons":  MustNetworkTree(bogons...),
	"bogons4": MustNetworkTree(bogonsIPv4...),
	"bogons6": MustNetworkTree(bogonsIPv6...),
}