Skip to content

Commit

Permalink
fix: use url.URL to encode and decode PURLs
Browse files Browse the repository at this point in the history
This commit refactors the `ToString` and `FromString` functions /
methods to use the `url.URL` type, instead of trying to parse URLs
directly.

The [PURL Spec](https://github.com/package-url/purl-spec/blob/master/PURL-SPECIFICATION.rst#a-purl-is-a-url)
explicitly says:

> A purl is a URL:
> A purl is a valid URL and URI that conforms to the URL definitions or
> specifications ...

So why not actually use Go's URL type to do operations on it?

This commit does exactly that, and removes a lot of the previously dense
parsing-logic with simpler checks based on the URL's fields. Especially
`ToString()` has gotten a lot simpler through that.

Additionaly, by switching to the URL package, this commit fixes a couple
of outstanding bugs:
- When a qualifier contained `+` signs, which are valid in URL paths,
  but not in URL queries, the sign did not get escaped. The previous
  code relied on `url.PathUnescape` to unescape keys and values, which
  should have used `url.QueryUnescape` instead. Further, encoding
  Qualifiers used `url.PathEscape` instead of `url.QueryEscape`. This
  could have led to pURLs losing `+` signs if they were properly
  encoded.
- Fixes #51, where spaces in names have not gotten encoded correctly.
- Fixes most test-cases from #22 that are round-trip-save (e.g. all
  cases where input == output), except the "pre-encoded qualifier value
  is unchanged" testcase that is wrong - a qualifier shouldn't be
  encoded with `%20` for a space, but with a plus-sign (query encoding).
- Fixes most cases from #41 as well, except:
  -  where the query encoding in the test-cases are wrong (" " -> "+",
     "+" -> "%20") (test-cases `pre-encoded_qualifier_value_is_unchanged`
     and `unencoded_qualifier_value_is_encoded`),
  - `characters_are_unencoded_where_allowed`: `<` should be encoded, as
    far as I can tell. The Go stdlib also encodes it, so this should be
    fine.
  - `explicit_characters_are_encoded`: `@` does not need to be encoded.

When reading through that list and all the nuances, it makes it clear
that we shouldn't do this ourselves! And this commit doesn't, it simply
relies on the Go standard library to handle all of these cases
correctly.

All of the aforementioned issues should have test-cases, presumably
added to the test-suite-data.
  • Loading branch information
tommyknows authored and shibumi committed Jul 1, 2023
1 parent d729287 commit 564b6fc
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 147 deletions.
266 changes: 120 additions & 146 deletions packageurl.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"errors"
"fmt"
"net/url"
"path"
"regexp"
"sort"
"strings"
Expand Down Expand Up @@ -120,6 +121,15 @@ func (q Qualifier) String() string {
// in the package URL.
type Qualifiers []Qualifier

// urlQuery returns a raw URL query with all the qualifiers as keys + values.
func (q Qualifiers) urlQuery() (rawQuery string) {
v := make(url.Values)
for _, qq := range q {
v.Add(qq.Key, qq.Value)
}
return v.Encode()
}

// QualifiersFromMap constructs a Qualifiers slice from a string map. To get a
// deterministic qualifier order (despite maps not providing any iteration order
// guarantees) the returned Qualifiers are sorted in increasing order of key.
Expand Down Expand Up @@ -184,39 +194,20 @@ func NewPackageURL(purlType, namespace, name, version string,
// ToString returns the human-readable instance of the PackageURL structure.
// This is the literal purl as defined by the spec.
func (p *PackageURL) ToString() string {
// Start with the type and a colon
purl := fmt.Sprintf("pkg:%s/", p.Type)
// Add namespaces if provided
if p.Namespace != "" {
var ns []string
for _, item := range strings.Split(p.Namespace, "/") {
ns = append(ns, url.QueryEscape(item))
}
purl = purl + strings.Join(ns, "/") + "/"
}
// The name is always required and must be a percent-encoded string
// Use url.QueryEscape instead of PathEscape, as it handles @ signs
purl = purl + url.QueryEscape(p.Name)
// If a version is provided, add it after the at symbol
if p.Version != "" {
// A name must be a percent-encoded string
purl = purl + "@" + url.PathEscape(p.Version)
u := &url.URL{
Scheme: "pkg",
RawQuery: p.Qualifiers.urlQuery(),
Fragment: p.Subpath,
}

// Iterate over qualifiers and make groups of key=value
var qualifiers []string
for _, q := range p.Qualifiers {
qualifiers = append(qualifiers, q.String())
}
// If there are one or more key=value pairs, append on the package url
if len(qualifiers) != 0 {
purl = purl + "?" + strings.Join(qualifiers, "&")
}
// Add a subpath if available
if p.Subpath != "" {
purl = purl + "#" + p.Subpath
}
return purl
// we use JoinPath and EscapePath as the behavior for "/" is only correct with that.
// We don't want to escape "/", but want to escape all other characters that are necessary.
u = u.JoinPath(p.Type, p.Namespace, strings.Join([]string{p.Name, p.Version}, "@"))
// write the actual path into the "Opaque" block, so that the generated string at the end is
// pkg:<path> and not pkg://<path>.
u.Opaque, u.Path = u.EscapedPath(), ""

return u.String()
}

func (p PackageURL) String() string {
Expand All @@ -225,134 +216,117 @@ func (p PackageURL) String() string {

// FromString parses a valid package url string into a PackageURL structure
func FromString(purl string) (PackageURL, error) {
initialIndex := strings.Index(purl, "#")
// Start with purl being stored in the remainder
remainder := purl
substring := ""
if initialIndex != -1 {
initialSplit := strings.SplitN(purl, "#", 2)
remainder = initialSplit[0]
rightSide := initialSplit[1]
rightSide = strings.TrimLeft(rightSide, "/")
rightSide = strings.TrimRight(rightSide, "/")
var rightSides []string

for _, item := range strings.Split(rightSide, "/") {
item = strings.Replace(item, ".", "", -1)
item = strings.Replace(item, "..", "", -1)
if item != "" {
i, err := url.PathUnescape(item)
if err != nil {
return PackageURL{}, fmt.Errorf("failed to unescape path: %s", err)
}
rightSides = append(rightSides, i)
}
}
substring = strings.Join(rightSides, "/")
u, err := url.Parse(purl)
if err != nil {
return PackageURL{}, fmt.Errorf("failed to parse as URL: %w", err)
}
qualifiers := Qualifiers{}
index := strings.LastIndex(remainder, "?")
// If we don't have anything to split then return an empty result
if index != -1 {
qualifier := remainder[index+1:]
for _, item := range strings.Split(qualifier, "&") {
kv := strings.Split(item, "=")
if len(kv) != 2 {
return PackageURL{}, fmt.Errorf("wanted 2 kv segments, got %d", len(kv))
}
key := strings.ToLower(kv[0])
key, err := url.PathUnescape(key)
if err != nil {
return PackageURL{}, fmt.Errorf("failed to unescape qualifier key: %s", err)
}
if !validQualifierKey(key) {
return PackageURL{}, fmt.Errorf("invalid qualifier key: '%s'", key)
}
// TODO
// - If the `key` is `checksums`, split the `value` on ',' to create
// a list of `checksums`
if kv[1] == "" {
continue
}
value, err := url.PathUnescape(kv[1])
if err != nil {
return PackageURL{}, fmt.Errorf("failed to unescape qualifier value: %s", err)
}
qualifiers = append(qualifiers, Qualifier{key, value})
}
remainder = remainder[:index]

if u.Scheme != "pkg" {
return PackageURL{}, fmt.Errorf("purl scheme is not \"pkg\": %q", u.Scheme)
}

nextSplit := strings.SplitN(remainder, ":", 2)
if len(nextSplit) != 2 || nextSplit[0] != "pkg" {
return PackageURL{}, errors.New("scheme is missing")
p := u.Opaque
// if a purl starts with pkg:/ or even pkg://, we need to fall back to host + path.
if p == "" {
p = strings.TrimPrefix(path.Join(u.Host, u.Path), "/")
}
// leading slashes after pkg: are to be ignored (pkg://maven is
// equivalent to pkg:maven)
remainder = strings.TrimLeft(nextSplit[1], "/")

nextSplit = strings.SplitN(remainder, "/", 2)
if len(nextSplit) != 2 {
return PackageURL{}, errors.New("type is missing")
typ, p, ok := strings.Cut(p, "/")
if !ok {
return PackageURL{}, fmt.Errorf("purl is missing type or name")
}
// purl type is case-insensitive, canonical form is lower-case
purlType := strings.ToLower(nextSplit[0])
remainder = nextSplit[1]
typ = strings.ToLower(typ)

index = strings.LastIndex(remainder, "/")
name := typeAdjustName(purlType, remainder[index+1:], qualifiers)
version := ""
qualifiers, err := parseQualifiers(u.RawQuery)
if err != nil {
return PackageURL{}, fmt.Errorf("invalid qualifiers: %w", err)
}
namespace, name, version, err := separateNamespaceNameVersion(p)
if err != nil {
return PackageURL{}, err
}

atIndex := strings.Index(name, "@")
if atIndex != -1 {
v, err := url.PathUnescape(name[atIndex+1:])
if err != nil {
return PackageURL{}, fmt.Errorf("failed to unescape purl version: %s", err)
}
version = typeAdjustVersion(purlType, v)
pURL := PackageURL{
Qualifiers: qualifiers,
Type: typ,
Namespace: typeAdjustNamespace(typ, namespace),
Name: typeAdjustName(typ, name, qualifiers),
Version: typeAdjustVersion(typ, version),
Subpath: strings.Trim(u.Fragment, "/"),
}

return pURL, validCustomRules(pURL)
}

unecapeName, err := url.PathUnescape(name[:atIndex])
func separateNamespaceNameVersion(path string) (ns, name, version string, err error) {
namespaceSep := strings.LastIndex(path, "/")
if namespaceSep != -1 {
ns, err = url.PathUnescape(path[:namespaceSep])
if err != nil {
return PackageURL{}, fmt.Errorf("failed to unescape purl name: %s", err)
return "", "", "", fmt.Errorf("error unescaping namespace: %w", err)
}
name = unecapeName
}
var namespaces []string

if index != -1 {
remainder = remainder[:index]
path = path[namespaceSep+1:]
}

for _, item := range strings.Split(remainder, "/") {
if item != "" {
unescaped, err := url.PathUnescape(item)
if err != nil {
return PackageURL{}, fmt.Errorf("failed to unescape path: %s", err)
}
namespaces = append(namespaces, unescaped)
}
}
v := strings.Split(path, "@")
name, err = url.PathUnescape(v[0])
if err != nil {
return "", "", "", fmt.Errorf("error unescaping name: %w", err)
}
namespace := strings.Join(namespaces, "/")
namespace = typeAdjustNamespace(purlType, namespace)

// Fail if name is empty at this point
if name == "" {
return PackageURL{}, errors.New("name is required")
return "", "", "", fmt.Errorf("purl is missing name")
}

err := validCustomRules(purlType, name, namespace, version, qualifiers)
if err != nil {
return PackageURL{}, err
if len(v) > 1 {
version, err = url.PathUnescape(v[1])
if err != nil {
return "", "", "", fmt.Errorf("error unescaping version: %w", err)
}
}

return PackageURL{
Type: purlType,
Namespace: namespace,
Name: name,
Version: version,
Qualifiers: qualifiers,
Subpath: substring,
}, nil
return ns, name, version, nil
}

func parseQualifiers(rawQuery string) (Qualifiers, error) {
// we need to parse the qualifiers ourselves and cannot rely on the `url.Query` type because
// that uses a map, meaning it's unordered. We want to keep the order of the qualifiers, so this
// function re-implements the `url.parseQuery` function based on our `Qualifier` type. Most of
// the code here is taken from `url.parseQuery`.
q := Qualifiers{}
for rawQuery != "" {
var key string
key, rawQuery, _ = strings.Cut(rawQuery, "&")
if strings.Contains(key, ";") {
return nil, fmt.Errorf("invalid semicolon separator in query")
}
if key == "" {
continue
}
key, value, _ := strings.Cut(key, "=")
key, err := url.QueryUnescape(key)
if err != nil {
return nil, fmt.Errorf("error unescaping qualifier key %q", key)
}

if !validQualifierKey(key) {
return nil, fmt.Errorf("invalid qualifier key: '%s'", key)
}

value, err = url.QueryUnescape(value)
if err != nil {
return nil, fmt.Errorf("error unescaping qualifier value %q", value)
}

q = append(q, Qualifier{
Key: strings.ToLower(key),
// only the first character needs to be lowercase. Note that pURL is always UTF8, so we
// don't need to care about unicode here.
Value: strings.ToLower(value[:1]) + value[1:],
})
}
return q, nil
}

// Make any purl type-specific adjustments to the parsed namespace.
Expand Down Expand Up @@ -432,11 +406,11 @@ func validQualifierKey(key string) bool {

// validCustomRules evaluates additional rules for each package url type, as specified in the package-url specification.
// On success, it returns nil. On failure, a descriptive error will be returned.
func validCustomRules(purlType, name, ns, version string, qualifiers Qualifiers) error {
q := qualifiers.Map()
switch purlType {
func validCustomRules(p PackageURL) error {
q := p.Qualifiers.Map()
switch p.Type {
case TypeConan:
if ns != "" {
if p.Namespace != "" {
if val, ok := q["channel"]; ok {
if val == "" {
return errors.New("the qualifier channel must be not empty if namespace is present")
Expand All @@ -452,14 +426,14 @@ func validCustomRules(purlType, name, ns, version string, qualifiers Qualifiers)
}
}
case TypeSwift:
if ns == "" {
if p.Namespace == "" {
return errors.New("namespace is required")
}
if version == "" {
if p.Version == "" {
return errors.New("version is required")
}
case TypeCran:
if version == "" {
if p.Version == "" {
return errors.New("version is required")
}
}
Expand Down
2 changes: 1 addition & 1 deletion packageurl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ type OrderedMap struct {

// qualifiersMapPattern is used to parse the TestFixture "qualifiers" field to
// ensure that it's a json object.
var qualifiersMapPattern = regexp.MustCompile(`^\{.*\}$`)
var qualifiersMapPattern = regexp.MustCompile(`(?ms)^\{.*\}$`)

// UnmarshalJSON unmarshals the qualifiers field for a TestFixture. The
// qualifiers field is given as a json object such as:
Expand Down

0 comments on commit 564b6fc

Please sign in to comment.