Skip to content

Commit

Permalink
Switch the lib to A-labels by default (#40)
Browse files Browse the repository at this point in the history
* Add test cases for GH-31

* Switch the list internal representation to A-labels by default

* Pre-process the list to A-labels

* Enforce official PSL test cases to ASCII

* Add note about A-labels in the README
  • Loading branch information
weppos authored Nov 21, 2016
1 parent d5b7003 commit a0d04ff
Show file tree
Hide file tree
Showing 10 changed files with 721 additions and 489 deletions.
14 changes: 8 additions & 6 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
language: go

go:
- 1.2
- 1.3
- 1.4
- 1.5
- tip
- 1.2
- 1.3
- 1.4
- 1.5
- tip

sudo: false
install:
- make get-deps

# before_install:
# - go get -t -v ./...
Expand Down
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
test:
go test ./... -v

gen:
go run cmd/gen/gen.go > publicsuffix/rules.txt && mv publicsuffix/rules.txt publicsuffix/rules.go

clean:
rm publicsuffix/rules.*

test:
go test ./... -v
get-deps:
go get golang.org/x/net/idna
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,19 @@ publicsuffix.DomainFromListWithOptions(list, "google.blogspot.com", nil)
// blogspot.com
```

## IDN domains, A-labels and U-labels

[A-label and U-label](https://tools.ietf.org/html/rfc5890#section-2.3.2.1) are two different ways to represent IDN domain names. These two encodings are also known as ASCII or Punycode (A-label) vs Unicode (U-label). Conversions between U-labels and A-labels are performed according to the ["Punycode" specification](https://tools.ietf.org/html/rfc3492), adding or removing the ACE prefix as needed.

IDNA-aware applications generally use the A-label form for storing and manipulating data, whereas the U-labels can appear in presentation and user interface forms.

Although the PSL list has been traditionally U-label encoded, this library follows the common industry standards and stores the rules in their A-label form. Therefore, unless explicitly mentioned, any method call, comparison or internal representation is expected to be ASCII-compatible encoded (ACE).

Passing Unicode names to the library may result either in an error or in unexpected behavior.

If you are interested in the details of this decision, you can read the full discussion [here](https://github.com/weppos/publicsuffix-go/issues/31).


## Differences with `golang.org/x/net/publicsuffix`

[`golang.org/x/net/publicsuffix`](https://godoc.org/golang.org/x/net/publicsuffix) is a package in Go's `x/net` repository that provides a public suffix list implementation.
Expand Down
7 changes: 3 additions & 4 deletions cmd/gen/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@ import (
"os"
"regexp"
"strings"
"time"
"text/template"
"time"

"github.com/weppos/publicsuffix-go/publicsuffix"
)


const (
goSrc = `// This file is automatically generated
// Run "go run cmd/gen/gen.go" to update the list.
Expand Down Expand Up @@ -63,9 +62,9 @@ func main() {
}

data := struct {
VersionSHA string
VersionSHA string
VersionDate string
Rules []publicsuffix.Rule
Rules []publicsuffix.Rule
}{
sha,
datetime.Format(time.ANSIC),
Expand Down
2 changes: 1 addition & 1 deletion cmd/load/main.rb.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package main

import (
"time"
"fmt"
"time"

"github.com/weppos/publicsuffix-go/publicsuffix"
)
Expand Down
100 changes: 87 additions & 13 deletions publicsuffix/acceptance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@ type validTestCase struct {

func TestValid(t *testing.T) {
testCases := []validTestCase{
validTestCase{"example.com", "example.com", &DomainName{"com", "example", "", NewRule("com")}},
validTestCase{"foo.example.com", "example.com", &DomainName{"com", "example", "foo", NewRule("com")}},
validTestCase{"example.com", "example.com", &DomainName{"com", "example", "", MustNewRule("com")}},
validTestCase{"foo.example.com", "example.com", &DomainName{"com", "example", "foo", MustNewRule("com")}},

validTestCase{"verybritish.co.uk", "verybritish.co.uk", &DomainName{"co.uk", "verybritish", "", NewRule("*.uk")}},
validTestCase{"foo.verybritish.co.uk", "verybritish.co.uk", &DomainName{"co.uk", "verybritish", "foo", NewRule("*.uk")}},
validTestCase{"verybritish.co.uk", "verybritish.co.uk", &DomainName{"co.uk", "verybritish", "", MustNewRule("*.uk")}},
validTestCase{"foo.verybritish.co.uk", "verybritish.co.uk", &DomainName{"co.uk", "verybritish", "foo", MustNewRule("*.uk")}},

validTestCase{"parliament.uk", "parliament.uk", &DomainName{"uk", "parliament", "", NewRule("!parliament.uk")}},
validTestCase{"foo.parliament.uk", "parliament.uk", &DomainName{"uk", "parliament", "foo", NewRule("!parliament.uk")}},
validTestCase{"parliament.uk", "parliament.uk", &DomainName{"uk", "parliament", "", MustNewRule("!parliament.uk")}},
validTestCase{"foo.parliament.uk", "parliament.uk", &DomainName{"uk", "parliament", "foo", MustNewRule("!parliament.uk")}},

validTestCase{"foo.blogspot.com", "foo.blogspot.com", &DomainName{"blogspot.com", "foo", "", NewRule("blogspot.com")}},
validTestCase{"bar.foo.blogspot.com", "foo.blogspot.com", &DomainName{"blogspot.com", "foo", "bar", NewRule("blogspot.com")}},
validTestCase{"foo.blogspot.com", "foo.blogspot.com", &DomainName{"blogspot.com", "foo", "", MustNewRule("blogspot.com")}},
validTestCase{"bar.foo.blogspot.com", "foo.blogspot.com", &DomainName{"blogspot.com", "foo", "bar", MustNewRule("blogspot.com")}},
}

for _, testCase := range testCases {
Expand All @@ -46,18 +46,18 @@ func TestValid(t *testing.T) {

type privateTestCase struct {
input string
domain string
ignore bool
error bool
domain string
}

func TestIncludePrivate(t *testing.T) {
testCases := []privateTestCase{
privateTestCase{"blogspot.com", false, true, ""},
privateTestCase{"blogspot.com", true, false, "blogspot.com"},
privateTestCase{"blogspot.com", "", false, true},
privateTestCase{"blogspot.com", "blogspot.com", true, false},

privateTestCase{"foo.blogspot.com", false, false, "foo.blogspot.com"},
privateTestCase{"foo.blogspot.com", true, false, "blogspot.com"},
privateTestCase{"foo.blogspot.com", "foo.blogspot.com", false, false},
privateTestCase{"foo.blogspot.com", "blogspot.com", true, false},
}

for _, testCase := range testCases {
Expand All @@ -76,5 +76,79 @@ func TestIncludePrivate(t *testing.T) {
t.Errorf("Domain(%v) = %v, want %v", testCase.input, got, want)
}
}
}

// idnaTestCase is one row of the IDNA acceptance table: the name handed to
// the lookup, the domain expected back, and whether an error is expected.
type idnaTestCase struct {
	// input is the name passed to the lookup; domain is the expected result.
	input, domain string
	// error is true when the lookup is expected to fail.
	error bool
}

// TestIDNA runs lookups against DefaultList for IDN names written as
// A-labels (ASCII) and as U-labels (Unicode) and compares the outcome with
// the expectations recorded in each table.
func TestIDNA(t *testing.T) {
	// A-labels are supported.
	aLabelCases := []idnaTestCase{
		// Check single IDN part
		{"xn--p1ai", "", true},
		{"example.xn--p1ai", "example.xn--p1ai", false},
		{"subdomain.example.xn--p1ai", "example.xn--p1ai", false},
		// Check multiple IDN parts
		{"xn--example--3bhk5a.xn--p1ai", "xn--example--3bhk5a.xn--p1ai", false},
		{"subdomain.xn--example--3bhk5a.xn--p1ai", "xn--example--3bhk5a.xn--p1ai", false},
		// Check multiple IDN rules
		{"example.xn--o1ach.xn--90a3ac", "example.xn--o1ach.xn--90a3ac", false},
		{"sudbomain.example.xn--o1ach.xn--90a3ac", "example.xn--o1ach.xn--90a3ac", false},
	}

	for _, tc := range aLabelCases {
		got, err := DomainFromListWithOptions(DefaultList, tc.input, nil)

		// At most one failure is reported per case, in this order.
		switch {
		case tc.error && err == nil:
			t.Errorf("A-label %v should have returned error, got: %v", tc.input, got)
		case !tc.error && err != nil:
			t.Errorf("A-label %v returned error: %v", tc.input, err)
		case tc.domain != got:
			t.Errorf("A-label Domain(%v) = %v, want %v", tc.input, got, tc.domain)
		}
	}

	// These tests validate the non-acceptance of U-labels.
	//
	// TODO(weppos): some tests are passing because of the default rule *
	// Consider adding some tests that override the default rule with nil.
	// Right now, setting the default rule to nil will cause a panic if the
	// lookup results in a nil.
	uLabelCases := []idnaTestCase{
		// U-labels are NOT supported
		// Check single IDN part
		{"рф", "", true},
		{"example.рф", "example.рф", false},           // passes because of *
		{"subdomain.example.рф", "example.рф", false}, // passes because of *
		// Check multiple IDN parts
		{"example-упр.рф", "example-упр.рф", false},           // passes because of *
		{"subdomain.example-упр.рф", "example-упр.рф", false}, // passes because of *
		// Check multiple IDN rules
		{"example.упр.срб", "упр.срб", false},
		{"sudbomain.example.упр.срб", "упр.срб", false},
	}

	for _, tc := range uLabelCases {
		got, err := DomainFromListWithOptions(DefaultList, tc.input, nil)

		switch {
		case tc.error && err == nil:
			t.Errorf("U-label %v should have returned error, got: %v", tc.input, got)
		case !tc.error && err != nil:
			t.Errorf("U-label %v returned error: %v", tc.input, err)
		case tc.domain != got:
			t.Errorf("U-label Domain(%v) = %v, want %v", tc.input, got, tc.domain)
		}
	}
}
16 changes: 9 additions & 7 deletions publicsuffix/psl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,18 @@ func TestPsl(t *testing.T) {
}

for _, testCase := range testCases {
input := testCase.input
if strings.Contains(testCase.input, "xn--") {
input, _ = idna.ToUnicode(input)
input, err := idna.ToASCII(testCase.input)
if err != nil {
t.Fatalf("failed to convert input %v to ASCII", testCase.input)
}

got, err := Domain(input)
if strings.Contains(testCase.input, "xn--") {
got, _ = idna.ToASCII(got)
output, err := idna.ToASCII(testCase.output)
if err != nil {
t.Fatalf("failed to convert output %v to ASCII", testCase.output)
}

got, err := Domain(input)

if testCase.error && err == nil {
t.Errorf("PSL(%v) should have returned error, got: %v", testCase.input, got)
continue
Expand All @@ -66,7 +68,7 @@ func TestPsl(t *testing.T) {
t.Errorf("PSL(%v) returned error: %v", testCase.input, err)
continue
}
if testCase.output != got {
if got != output {
t.Errorf("PSL(%v) = %v, want %v", testCase.input, got, testCase.output)
continue
}
Expand Down
53 changes: 49 additions & 4 deletions publicsuffix/publicsuffix.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"os"
"regexp"
"strings"

"golang.org/x/net/idna"
)

const (
Expand All @@ -31,10 +33,10 @@ const (
var DefaultList = NewList()

// DefaultRule is the default Rule that represents "*".
var DefaultRule = NewRule("*")
var DefaultRule = MustNewRule("*")

// DefaultParserOptions are the default options used to parse a Public Suffix list.
var DefaultParserOptions = &ParserOption{PrivateDomains: true}
var DefaultParserOptions = &ParserOption{PrivateDomains: true, ASCIIEncoded: false}

// DefaultFindOptions are the default options used to perform the lookup of rules in the list.
var DefaultFindOptions = &FindOptions{IgnorePrivate: false, DefaultRule: DefaultRule}
Expand All @@ -50,7 +52,15 @@ type Rule struct {
// ParserOption holds the settings that control how a List is parsed
// from a file or a string.
type ParserOption struct {
	// PrivateDomains selects whether private domains are kept while parsing.
	// Set it to false to skip them. DefaultParserOptions sets it to true,
	// which means the private domains are included.
	PrivateDomains bool

	// ASCIIEncoded tells the parser how the input list is encoded.
	// Leave it false when the input is written in U-labels (Unicode),
	// as opposed to A-labels (ASCII).
	// It defaults to false, meaning the list is expected to contain Unicode
	// domains, because the original PSL currently contains Unicode.
	ASCIIEncoded bool
}

// FindOptions are the options you can use to customize the way a Rule
Expand Down Expand Up @@ -159,7 +169,18 @@ Scanning:
break

default:
rule := NewRule(line)
var rule *Rule
var err error

if options.ASCIIEncoded {
rule, err = NewRule(line)
} else {
rule, err = NewRuleUnicode(line)
}
if err != nil {
return []Rule{}, err
}

rule.Private = (section == 2)
l.AddRule(rule)
rules = append(rules, *rule)
Expand Down Expand Up @@ -212,7 +233,9 @@ func (l *List) selectRules(name string, options *FindOptions) []Rule {
}

// NewRule parses the rule content, creates and returns a Rule.
func NewRule(content string) *Rule {
//
// The content of the rule MUST be encoded in ASCII (A-labels).
func NewRule(content string) (*Rule, error) {
var rule *Rule
var value string

Expand All @@ -231,6 +254,28 @@ func NewRule(content string) *Rule {
value = content
rule = &Rule{Type: NormalType, Value: value, Length: len(Labels(value))}
}

return rule, nil
}

// NewRuleUnicode behaves like NewRule, except that content is expected to be
// encoded in Unicode (U-labels).
//
// The content is first converted with idna.ToASCII and the result is handed
// to NewRule. If the conversion fails, a nil Rule and the conversion error
// are returned.
func NewRuleUnicode(content string) (*Rule, error) {
	ascii, err := idna.ToASCII(content)
	if err != nil {
		return nil, err
	}

	return NewRule(ascii)
}

// MustNewRule wraps NewRule and returns the parsed Rule.
// It panics with the error reported by NewRule when the content cannot be parsed.
func MustNewRule(content string) *Rule {
	parsed, parseErr := NewRule(content)
	if parseErr == nil {
		return parsed
	}
	panic(parseErr)
}

Expand Down
Loading

0 comments on commit a0d04ff

Please sign in to comment.