Checkpoint

This commit is contained in:
2025-10-06 22:25:23 +02:00
parent a23259cfdc
commit a254b306f2
48 changed files with 3327 additions and 212 deletions

53
dataset/parser/adblock.go Normal file
View File

@@ -0,0 +1,53 @@
package parser
import (
"bufio"
"io"
"strings"
)
func init() {
RegisterDomainsParser(adblockDomainsParser{})
}
// adblockDomainsParser extracts domains from AdBlock-style filter lists.
type adblockDomainsParser struct{}

// CanHandle reports whether line looks like it belongs to an AdBlock filter
// list: a "[Adblock ...]" header (matched case-insensitively), an "@@"
// exception rule, a "||" domain-anchor rule, or a rule starting with "*".
//
// An empty line is never handled. The wildcard check uses a prefix test
// rather than indexing line[0], so empty input cannot panic.
func (adblockDomainsParser) CanHandle(line string) bool {
	return strings.HasPrefix(strings.ToLower(line), `[adblock`) ||
		strings.HasPrefix(line, "@@") || // exception rule
		strings.HasPrefix(line, "||") || // domain anchor
		strings.HasPrefix(line, "*") // leading wildcard
}
// ParseDomains reads AdBlock-style filter rules from r, one per line, and
// collects the host part of the rules it recognizes.
//
// Recognized rules:
//
//	||domain.com^         domain anchor: the text between "||" and the first "^"
//	|http://domain.com|   address anchor: the text between the two pipes
//	                      (returned verbatim, including any scheme)
//
// Blank lines, lines starting with "#" or "!", and "[...]" header lines are
// skipped without being counted. Every other line, including "||" rules that
// have no "^" or an empty host, is counted in ignored.
//
// On success the collected domains are de-duplicated with unique. If the
// scanner fails, the domains gathered so far are returned as-is together with
// the scanner error.
func (adblockDomainsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if isComment(line) {
			continue
		}
		switch {
		case strings.HasPrefix(line, `||`): // domain anchor
			// i > 2 rejects "||^", which would otherwise yield an empty domain.
			if i := strings.IndexByte(line, '^'); i > 2 {
				domains = append(domains, line[2:i])
				continue
			}
		case len(line) > 2 && strings.HasPrefix(line, `|`) && strings.HasSuffix(line, `|`):
			// Strip exactly one pipe from each end. The length guard keeps a
			// lone "|" from producing an out-of-range slice.
			domains = append(domains, line[1:len(line)-1])
			continue
		case strings.HasPrefix(line, `[`): // list header such as "[Adblock Plus 2.0]"
			continue
		}
		ignored++
	}
	if err = scanner.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}

View File

@@ -0,0 +1,41 @@
package parser
import (
"reflect"
"sort"
"strings"
"testing"
)
// TestAdBlockParser feeds a small AdBlock filter list through ParseDomains and
// checks both the extracted domains and the number of ignored lines.
func TestAdBlockParser(t *testing.T) {
	const input = `[Adblock Plus 2.0]
! Title: AdRules DNS List
! Homepage: https://github.com/Cats-Team/AdRules
! Powerd by Cats-Team
! Expires: 1 (update frequency)
! Description: The DNS Filters
! Total count: 145270
! Update: 2025-10-07 02:05:08(GMT+8)
/^.+stat\.kugou\.com/
/^admarvel\./
||*-ad-sign.byteimg.com^
||*-ad.a.yximgs.com^
||*-applog.fqnovel.com^
||*-datareceiver.aki-game.net^
||*.exaapi.com^`
	expected := []string{"*-ad-sign.byteimg.com", "*-ad.a.yximgs.com", "*-applog.fqnovel.com", "*-datareceiver.aki-game.net", "*.exaapi.com"}

	got, skipped, err := ParseDomains(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// The parser returns domains in no particular order; sort before comparing.
	sort.Strings(got)
	if !reflect.DeepEqual(got, expected) {
		t.Errorf("expected ParseDomains(domains) to return %v, got %v", expected, got)
	}
	if skipped != 2 {
		t.Errorf("expected 2 ignored, got %d", skipped)
	}
}

139
dataset/parser/dns.go Normal file
View File

@@ -0,0 +1,139 @@
package parser
import (
"bufio"
"io"
"strings"
"github.com/miekg/dns"
)
func init() {
RegisterDomainsParser(dnsmasqDomainsParser{})
RegisterDomainsParser(mosDNSDomainsParser{})
RegisterDomainsParser(smartDNSDomainsParser{})
RegisterDomainsParser(unboundDomainsParser{})
}
// dnsmasqDomainsParser extracts domains from "address=/domain/target" lines.
type dnsmasqDomainsParser struct{}

// CanHandle reports whether line begins with the "address=/" directive.
func (dnsmasqDomainsParser) CanHandle(line string) bool {
	const directive = "address=/"
	if len(line) < len(directive) {
		return false
	}
	return line[:len(directive)] == directive
}
// ParseDomains scans r line by line for "address=/domain/target" directives
// and collects the domain component.
//
// A directive is accepted when splitting it on "/" (empty pieces dropped)
// yields at least three pieces and the second one passes isDomainName.
// Comment and blank lines are skipped; every other line is counted in ignored.
// On success the domains are de-duplicated with unique.
func (dnsmasqDomainsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	isSlash := func(c rune) bool { return c == '/' }

	lines := bufio.NewScanner(r)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		if isComment(entry) {
			continue
		}
		if strings.HasPrefix(entry, "address=/") {
			fields := strings.FieldsFunc(entry, isSlash)
			if len(fields) >= 3 && isDomainName(fields[1]) {
				domains = append(domains, fields[1])
				continue
			}
		}
		ignored++
	}
	if err = lines.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}
// mosDNSDomainsParser extracts domains from "domain:<name>" lines.
type mosDNSDomainsParser struct{}

// CanHandle reports whether line is "domain:" followed by something that
// passes isDomainName.
func (mosDNSDomainsParser) CanHandle(line string) bool {
	const prefix = "domain:"
	return strings.HasPrefix(line, prefix) && isDomainName(line[len(prefix):])
}
// ParseDomains scans r line by line and collects whatever follows the
// "domain:" prefix. The remainder is taken verbatim and is not validated here.
//
// Comment and blank lines are skipped; lines without the prefix are counted in
// ignored. On success the domains are de-duplicated with unique.
func (mosDNSDomainsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	lines := bufio.NewScanner(r)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		if isComment(entry) {
			continue
		}
		// TrimPrefix shortens the string only when the prefix was present.
		if name := strings.TrimPrefix(entry, "domain:"); len(name) != len(entry) {
			domains = append(domains, name)
			continue
		}
		ignored++
	}
	if err = lines.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}
// smartDNSDomainsParser extracts domains from "address /domain/target" lines.
type smartDNSDomainsParser struct{}

// CanHandle reports whether line begins with "address /".
func (smartDNSDomainsParser) CanHandle(line string) bool {
	const directive = "address /"
	// TrimPrefix returns a shorter string only if the directive was present.
	return len(strings.TrimPrefix(line, directive)) < len(line)
}
// ParseDomains scans r line by line for "address /domain/target" directives
// and collects the text between the first two slashes.
//
// Comment and blank lines are skipped. Lines that lack the directive, or have
// no second "/", are counted in ignored. On success the domains are
// de-duplicated with unique.
func (smartDNSDomainsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	const directive = "address /"

	lines := bufio.NewScanner(r)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		if isComment(entry) {
			continue
		}
		if strings.HasPrefix(entry, directive) {
			rest := entry[len(directive):]
			if end := strings.IndexByte(rest, '/'); end >= 0 {
				domains = append(domains, rest[:end])
				continue
			}
		}
		ignored++
	}
	if err = lines.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}
// unboundDomainsParser extracts domains from "local-data:" and "local-zone:"
// lines.
type unboundDomainsParser struct{}

// CanHandle reports whether line begins with "local-data:" or "local-zone:".
func (unboundDomainsParser) CanHandle(line string) bool {
	for _, directive := range [...]string{"local-data:", "local-zone:"} {
		if strings.HasPrefix(line, directive) {
			return true
		}
	}
	return false
}
// ParseDomains scans r line by line for two kinds of directives:
//
//	local-data: "name TYPE rdata"   the record is parsed with dns.NewRR and the
//	                                owner name is collected (surrounding dots
//	                                trimmed) when the type is A, AAAA or CNAME
//	local-zone: "name" reject       the text before the closing quote is collected
//
// Comment and blank lines are skipped. Every other line is counted in
// ignored, including records that fail to parse, records of other types, and
// zones that are not "reject". On success the domains are de-duplicated with
// unique.
func (unboundDomainsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if isComment(line) {
			continue
		}
		switch {
		case strings.HasPrefix(line, "local-data:"):
			record := strings.Trim(strings.TrimSpace(line[11:]), `"`)
			// dns.NewRR returns (nil, nil) when the input holds no record
			// (e.g. `local-data: ""`), so rr must be nil-checked before use.
			rr, rrErr := dns.NewRR(record)
			if rrErr != nil || rr == nil {
				break // leave the switch; the line is counted as ignored
			}
			switch rr.Header().Rrtype {
			case dns.TypeA, dns.TypeAAAA, dns.TypeCNAME:
				domains = append(domains, strings.Trim(rr.Header().Name, `.`))
				continue
			}
		case strings.HasPrefix(line, "local-zone:") && strings.HasSuffix(line, " reject"):
			// The leading quote is trimmed off, so the first quote still
			// present is the one that closes the zone name.
			zone := strings.Trim(strings.TrimSpace(line[11:]), `"`)
			if i := strings.IndexByte(zone, '"'); i > -1 {
				domains = append(domains, zone[:i])
				continue
			}
		}
		ignored++
	}
	if err = scanner.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}

106
dataset/parser/dns_test.go Normal file
View File

@@ -0,0 +1,106 @@
package parser
import (
"reflect"
"sort"
"strings"
"testing"
)
// TestDNSMasqParser runs table-driven cases through ParseDomains covering
// "local-data:" records, "local-zone:" entries and "address=/" directives.
//
// Each case runs as a subtest and reports through the subtest's own
// *testing.T, so failures are attributed to the right case and Fatal is
// called from the goroutine that runs that subtest.
func TestDNSMasqParser(t *testing.T) {
	tests := []struct {
		Name        string
		Test        string
		Want        []string
		WantIgnored int
	}{
		{
			"data",
			`
local-data: "junk1.doubleclick.net A 127.0.0.1"
local-data: "junk2.doubleclick.net A 127.0.0.1"
local-data: "junk2.doubleclick.net CNAME doubleclick.net."
local-data: "junk6.doubleclick.net AAAA ::1"
local-data: "doubleclick.net A 127.0.0.1"
local-data: "ad.junk1.doubleclick.net A 127.0.0.1"
local-data: "adjunk.google.com A 127.0.0.1"`,
			[]string{"ad.junk1.doubleclick.net", "adjunk.google.com", "doubleclick.net", "junk1.doubleclick.net", "junk2.doubleclick.net", "junk6.doubleclick.net"},
			0,
		},
		{
			"zone",
			`
local-zone: "doubleclick.net" reject
local-zone: "adjunk.google.com" reject`,
			[]string{"adjunk.google.com", "doubleclick.net"},
			0,
		},
		{
			"address",
			`
address=/ziyu.net/0.0.0.0
address=/zlp6s.pw/0.0.0.0
address=/zm232.com/0.0.0.0
`,
			[]string{"ziyu.net", "zlp6s.pw", "zm232.com"},
			0,
		},
	}
	for _, test := range tests {
		// The closure parameter deliberately shadows the parent t.
		t.Run(test.Name, func(t *testing.T) {
			parsed, ignored, err := ParseDomains(strings.NewReader(test.Test))
			if err != nil {
				t.Fatal(err)
			}
			// Parsers return domains in no particular order.
			sort.Strings(parsed)
			if !reflect.DeepEqual(parsed, test.Want) {
				t.Errorf("expected ParseDomains(dnsmasq) to return\n\t%v, got\n\t%v", test.Want, parsed)
			}
			if ignored != test.WantIgnored {
				t.Errorf("expected %d ignored, got %d", test.WantIgnored, ignored)
			}
		})
	}
}
// TestMOSDNSParser checks that "domain:<name>" lines are parsed into the bare
// domain names.
func TestMOSDNSParser(t *testing.T) {
	const input = `domain:0019x.com
domain:002777.xyz
domain:003store.com
domain:00404850.xyz`
	expected := []string{"0019x.com", "002777.xyz", "003store.com", "00404850.xyz"}

	got, _, err := ParseDomains(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Order of the result is unspecified; sort before comparing.
	sort.Strings(got)
	if !reflect.DeepEqual(got, expected) {
		t.Errorf("expected ParseDomains(domains) to return %v, got %v", expected, got)
	}
}
// TestSmartDNSParser checks that "address /<name>/#" lines, preceded by "#"
// comment lines, are parsed into the bare domain names.
func TestSmartDNSParser(t *testing.T) {
	const input = `# Title:AdRules SmartDNS List
# Update: 2025-10-07 02:05:08(GMT+8)
address /0.myikas.com/#
address /0.net.easyjet.com/#
address /0.nextyourcontent.com/#
address /0019x.com/#`
	expected := []string{"0.myikas.com", "0.net.easyjet.com", "0.nextyourcontent.com", "0019x.com"}

	got, _, err := ParseDomains(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Order of the result is unspecified; sort before comparing.
	sort.Strings(got)
	if !reflect.DeepEqual(got, expected) {
		t.Errorf("expected ParseDomains(domains) to return %v, got %v", expected, got)
	}
}

40
dataset/parser/domains.go Normal file
View File

@@ -0,0 +1,40 @@
package parser
import (
"bufio"
"io"
"net"
"strings"
)
func init() {
domainsParsers = append(domainsParsers, domainsParser{})
}
// domainsParser handles plain lists that carry one domain name per line.
type domainsParser struct{}

// CanHandle reports whether line is a bare domain name: it must pass
// isDomainName, contain neither a space nor a colon, and must not parse as an
// IP address.
func (domainsParser) CanHandle(line string) bool {
	if !isDomainName(line) {
		return false
	}
	if strings.ContainsAny(line, " :") {
		return false
	}
	return net.ParseIP(line) == nil
}
// ParseDomains scans r line by line and collects every line that is a bare
// domain name.
//
// A line is accepted under the same rule CanHandle uses for format detection:
// it must pass isDomainName, contain no space or colon, and not be an IP
// address. IP literals and lines such as "example.com # note" therefore count
// as ignored instead of being returned as domains. Comment and blank lines are
// skipped without being counted. On success the domains are de-duplicated with
// unique.
func (p domainsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if isComment(line) {
			continue
		}
		if p.CanHandle(line) {
			domains = append(domains, line)
			continue
		}
		ignored++
	}
	if err = scanner.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}

View File

@@ -0,0 +1,31 @@
package parser
import (
"reflect"
"sort"
"strings"
"testing"
)
// TestParseDomains checks a plain one-domain-per-line list: the comment line
// is skipped, the three domains are returned, and one line is counted as
// ignored.
func TestParseDomains(t *testing.T) {
	const input = `# This is a comment
facebook.com
tiktok.com
bogus ignored
youtube.com`
	expected := []string{"facebook.com", "tiktok.com", "youtube.com"}

	got, skipped, err := ParseDomains(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Order of the result is unspecified; sort before comparing.
	sort.Strings(got)
	if !reflect.DeepEqual(got, expected) {
		t.Errorf("expected ParseDomains(domains) to return %v, got %v", expected, got)
	}
	if skipped != 1 {
		t.Errorf("expected 1 ignored, got %d", skipped)
	}
}

41
dataset/parser/hosts.go Normal file
View File

@@ -0,0 +1,41 @@
package parser
import (
"bufio"
"io"
"net"
"strings"
)
func init() {
RegisterDomainsParser(hostsParser{})
}
// hostsParser extracts host names from hosts-file style lines
// ("<ip> <name> [<name>...]").
type hostsParser struct{}

// CanHandle reports whether line has at least two whitespace-separated fields
// and the first one parses as an IP address.
func (hostsParser) CanHandle(line string) bool {
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return false
	}
	return net.ParseIP(fields[0]) != nil
}
// ParseDomains scans r line by line for hosts-file entries of the form
// "<ip> <name> [<name>...]" and collects every name after the address.
//
// Text from a "#" to the end of the line is dropped before splitting, so the
// words of a trailing comment are not returned as host names. Comment and
// blank lines are skipped without being counted. Lines that do not start with
// an IP address, or carry no name after it, are counted in ignored. On success
// the names are de-duplicated with unique.
func (hostsParser) ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if isComment(line) {
			continue
		}
		// Strip an inline comment ("127.0.0.1 host # note").
		if i := strings.IndexByte(line, '#'); i >= 0 {
			line = line[:i]
		}
		part := strings.Fields(line)
		if len(part) >= 2 && net.ParseIP(part[0]) != nil {
			domains = append(domains, part[1:]...)
			continue
		}
		ignored++
	}
	if err = scanner.Err(); err != nil {
		return domains, ignored, err
	}
	return unique(domains), ignored, nil
}

View File

@@ -0,0 +1,38 @@
package parser
import (
"reflect"
"sort"
"strings"
"testing"
)
// TestParseHosts checks a hosts-file sample: every name listed after an
// address is returned once, and the address-only line is counted as ignored.
func TestParseHosts(t *testing.T) {
	const input = `##
# Host Database
#
# localhost is used to configure the loopback interface
# when the system is booting. Do not change this entry.
##
127.0.0.1 localhost dragon dragon.local dragon.maze.network
255.255.255.255 broadcasthost
::1 localhost
ff00::1 multicast
1.2.3.4
`
	expected := []string{"broadcasthost", "dragon", "dragon.local", "dragon.maze.network", "localhost", "multicast"}

	got, skipped, err := ParseDomains(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Order of the result is unspecified; sort before comparing.
	sort.Strings(got)
	if !reflect.DeepEqual(got, expected) {
		t.Errorf("expected ParseDomains(hosts) to return %v, got %v", expected, got)
	}
	if skipped != 1 {
		t.Errorf("expected 1 ignored, got %d", skipped)
	}
}

76
dataset/parser/parser.go Normal file
View File

@@ -0,0 +1,76 @@
package parser
import (
"bufio"
"bytes"
"errors"
"io"
"log"
"strings"
"github.com/miekg/dns"
)
// ErrNoParser is returned by ParseDomains when no registered parser accepts
// the first non-comment line of the input, or when the input has no such line.
var ErrNoParser = errors.New("no suitable parser could be found")
// Parser is the format-detection half of a list parser.
type Parser interface {
// CanHandle reports whether line looks like the format this parser
// understands. ParseDomains calls it with the first non-comment line of the
// input, already trimmed of surrounding whitespace.
CanHandle(line string) bool
}
// DomainsParser is a Parser that can extract domain names from an input
// stream in its format.
type DomainsParser interface {
Parser
// ParseDomains reads the whole input and returns the extracted domains, the
// number of lines it could not use, and any read error.
ParseDomains(io.Reader) (domains []string, ignored int, err error)
}
// domainsParsers is the package-wide registry of domain-list parsers.
// ParseDomains probes them in slice order and uses the first one whose
// CanHandle accepts the input.
var domainsParsers []DomainsParser
// RegisterDomainsParser appends parser to the registry consulted by
// ParseDomains. It performs no locking and no nil or duplicate check.
func RegisterDomainsParser(parser DomainsParser) {
domainsParsers = append(domainsParsers, parser)
}
func ParseDomains(r io.Reader) (domains []string, ignored int, err error) {
var (
buffer = new(bytes.Buffer)
scanner = bufio.NewScanner(io.TeeReader(r, buffer))
line string
parser DomainsParser
)
for scanner.Scan() {
line = strings.TrimSpace(scanner.Text())
if isComment(line) {
continue
}
for _, parser = range domainsParsers {
if parser.CanHandle(line) {
log.Printf("using parser %T", parser)
return parser.ParseDomains(io.MultiReader(buffer, r))
}
}
break
}
return nil, 0, ErrNoParser
}
// isComment reports whether line carries no data: it is empty or its first
// byte is '#' or '!'.
func isComment(line string) bool {
	if len(line) == 0 {
		return true
	}
	switch line[0] {
	case '#', '!':
		return true
	default:
		return false
	}
}
// isDomainName reports whether dns.IsDomainName accepts name and counts at
// least two labels in it.
func isDomainName(name string) bool {
	labels, valid := dns.IsDomainName(name)
	if !valid {
		return false
	}
	return labels >= 2
}
// unique returns the distinct elements of values in order of first
// occurrence.
//
// A nil input yields nil; an empty non-nil input yields an empty non-nil
// slice. The input slice is not modified.
func unique(values []string) []string {
	if values == nil {
		return nil
	}
	seen := make(map[string]struct{}, len(values))
	out := make([]string, 0, len(values))
	for _, s := range values {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		out = append(out, s)
	}
	return out
}

View File

@@ -0,0 +1,31 @@
package parser
import (
"reflect"
"sort"
"testing"
)
// TestUnique checks unique against nil, single-element and duplicate-laden
// inputs. Results are sorted before comparison, so the test does not depend on
// the order in which unique returns its elements.
func TestUnique(t *testing.T) {
	type testCase struct {
		Name string
		Test []string
		Want []string
	}
	cases := []testCase{
		{"nil", nil, nil},
		{"single", []string{"test"}, []string{"test"}},
		{"duplicate", []string{"test", "test"}, []string{"test"}},
		{"multiple", []string{"a", "a", "b", "b", "b", "c"}, []string{"a", "b", "c"}},
	}
	for _, tc := range cases {
		t.Run(tc.Name, func(it *testing.T) {
			got := unique(tc.Test)
			// A nil result is left untouched so it compares equal to a nil Want.
			if got != nil {
				sort.Strings(got)
			}
			if !reflect.DeepEqual(got, tc.Want) {
				it.Errorf("expected unique(%v) to return %v, got %v", tc.Test, tc.Want, got)
			}
		})
	}
}