Skip to content

Commit

Permalink
Add module source address parsing (#7)
Browse files Browse the repository at this point in the history
* add module source addr file

This enabled parsing of module source addresses for
registry modules.

* add module source example to readme
* remove ParseRawModuleSource and local source parsing
  • Loading branch information
dbanck authored Apr 22, 2022
1 parent df91c12 commit eb7bcc2
Show file tree
Hide file tree
Showing 4 changed files with 594 additions and 4 deletions.
31 changes: 27 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ The most common source of these addresses outside of Terraform Core
is JSON representation of state, plan, or schemas as obtained
via [`hashicorp/terraform-exec`](https://github.com/hashicorp/terraform-exec).

## Example
## Parsing Provider Addresses

### Example

```go
p, err := ParseRawProviderSourceString("hashicorp/aws")
Expand All @@ -23,7 +25,7 @@ if err != nil {
// }
```

## Legacy address
### Legacy address

A legacy address is by itself (without more context) ambiguous.
For example `aws` may represent either the official `hashicorp/aws`
Expand All @@ -36,7 +38,7 @@ the address was produced by an affected version.
If you do not have that context you should parse the string via
`ParseRawProviderSourceString` and then check `addr.IsLegacy()`.

### What to do with a legacy address?
#### What to do with a legacy address?

Ask the Registry API whether and where the provider was moved to

Expand Down Expand Up @@ -70,7 +72,7 @@ If you cache results (which you should), ensure you have an invalidation
mechanism in place because the target (migrated) namespace may change.
Hard-coding migrations anywhere in code is strongly discouraged.

### `terraform` provider
#### `terraform` provider

Like any other legacy address, `terraform` is also ambiguous. Such an address may
(though it is most unlikely) represent a custom-built provider called `terraform`,
Expand All @@ -86,3 +88,24 @@ i.e. assume all of its logic including schema is contained within
Terraform Core.

In such case you should just use `NewBuiltInProvider("terraform")`.

## Parsing Module Addresses

### Example

```go
registry, err := ParseRawModuleSourceRegistry("hashicorp/subnets/cidr")
if err != nil {
// deal with error
}

// registry == ModuleSourceRegistry{
// PackageAddr: ModuleRegistryPackage{
// Host: svchost.Hostname("registry.terraform.io"),
// Namespace: "hashicorp",
// Name: "subnets",
// TargetSystem: "cidr",
// },
// Subdir: "",
// }
```
241 changes: 241 additions & 0 deletions module.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
package tfaddr

import (
"fmt"
"path"
"regexp"
"strings"

svchost "github.com/hashicorp/terraform-svchost"
)

// ModuleSourceRegistry represents a module listed in a Terraform module
// registry.
//
// Values are normally produced by ParseRawModuleSourceRegistry, which fills
// in DefaultModuleRegistryHost when the address has no explicit hostname.
type ModuleSourceRegistry struct {
// PackageAddr is the registry package that the target module belongs to.
// The module installer must translate this into a ModuleSourceRemote
// using the registry API and then take that underlying address's
// PackageAddr in order to find the actual package location.
//
// NOTE(review): ModuleSourceRemote is not declared in the visible part of
// this package; confirm where that type lives before relying on this
// description.
PackageAddr ModuleRegistryPackage

// If Subdir is non-empty then it represents a sub-directory within the
// remote package that the registry address eventually resolves to.
// This will ultimately become the suffix of the Subdir of the
// ModuleSourceRemote that the registry address translates to.
//
// Subdir uses a normalized forward-slash-based path syntax within the
// virtual filesystem represented by the final package. When set by
// ParseRawModuleSourceRegistry it has already been passed through
// path.Clean, and it is intended never to include `../` or `./` sequences.
Subdir string
}

// DefaultModuleRegistryHost is the registry hostname assumed for module
// source addresses that are written without an explicit hostname component.
const DefaultModuleRegistryHost svchost.Hostname = "registry.terraform.io"

// Validation patterns for the individual components of a module registry
// source address. They are compiled once at package initialization.
var (
	// moduleRegistryNamePattern matches a "namespace" or "name" component:
	// one to 64 ASCII letters, digits, dashes, or underscores, where the
	// first and last characters must be a letter or digit.
	moduleRegistryNamePattern = regexp.MustCompile(`^[0-9A-Za-z](?:[0-9A-Za-z-_]{0,62}[0-9A-Za-z])?$`)

	// moduleRegistryTargetSystemPattern matches a "target system" component:
	// one to 64 lowercase ASCII letters or digits.
	moduleRegistryTargetSystemPattern = regexp.MustCompile(`^[0-9a-z]{1,64}$`)
)

// ParseRawModuleSourceRegistry only accepts module registry addresses, and
// will reject any other address type.
//
// The accepted form is
//
//	[hostname/]namespace/name/target-system[//subdir]
//
// When the hostname is omitted, DefaultModuleRegistryHost is used. An
// explicit hostname must contain at least one dot, and "github.com" and
// "bitbucket.org" are rejected because they are reserved for installing
// directly from version control repositories.
//
// The optional subdirectory is normalized with path.Clean; a subdirectory
// that would resolve outside of the module package is rejected.
//
// On failure the returned error describes the problem. The accompanying
// ModuleSourceRegistry may be partially populated in that case and must not
// be used by the caller.
func ParseRawModuleSourceRegistry(raw string) (ModuleSourceRegistry, error) {
	var err error

	var subDir string
	raw, subDir = splitPackageSubdir(raw)
	// splitPackageSubdir returns the subdirectory in path.Clean form, so a
	// path that climbs out of the package is either exactly ".." (for
	// example "pkg//.." or "pkg//a/../..") or begins with "../". Checking
	// only the "../" prefix would let the bare ".." case through.
	if subDir == ".." || strings.HasPrefix(subDir, "../") {
		return ModuleSourceRegistry{}, fmt.Errorf("subdirectory path %q leads outside of the module package", subDir)
	}

	parts := strings.Split(raw, "/")
	// A valid registry address has either three or four parts, because the
	// leading hostname part is optional.
	if len(parts) != 3 && len(parts) != 4 {
		return ModuleSourceRegistry{}, fmt.Errorf("a module registry source address must have either three or four slash-separated components")
	}

	host := DefaultModuleRegistryHost
	if len(parts) == 4 {
		host, err = svchost.ForComparison(parts[0])
		if err != nil {
			// The svchost library doesn't produce very good error messages to
			// return to an end-user, so we'll use some custom ones here.
			switch {
			case strings.Contains(parts[0], "--"):
				// Looks like possibly punycode, which we don't allow here
				// to ensure that source addresses are written readably.
				return ModuleSourceRegistry{}, fmt.Errorf("invalid module registry hostname %q; internationalized domain names must be given as direct unicode characters, not in punycode", parts[0])
			default:
				return ModuleSourceRegistry{}, fmt.Errorf("invalid module registry hostname %q", parts[0])
			}
		}
		if !strings.Contains(host.String(), ".") {
			return ModuleSourceRegistry{}, fmt.Errorf("invalid module registry hostname: must contain at least one dot")
		}
		// Discard the hostname prefix now that we've processed it, so that
		// parts is always [namespace, name, target system] from here on.
		parts = parts[1:]
	}

	ret := ModuleSourceRegistry{
		PackageAddr: ModuleRegistryPackage{
			Host: host,
		},

		Subdir: subDir,
	}

	if host == svchost.Hostname("github.com") || host == svchost.Hostname("bitbucket.org") {
		return ret, fmt.Errorf("can't use %q as a module registry host, because it's reserved for installing directly from version control repositories", host)
	}

	if ret.PackageAddr.Namespace, err = parseModuleRegistryName(parts[0]); err != nil {
		if strings.Contains(parts[0], ".") {
			// Seems like the user omitted one of the latter components in
			// an address with an explicit hostname, which makes the hostname
			// land in the namespace position of a three-part address.
			return ret, fmt.Errorf("source address must have three more components after the hostname: the namespace, the name, and the target system")
		}
		return ret, fmt.Errorf("invalid namespace %q: %s", parts[0], err)
	}
	if ret.PackageAddr.Name, err = parseModuleRegistryName(parts[1]); err != nil {
		return ret, fmt.Errorf("invalid module name %q: %s", parts[1], err)
	}
	if ret.PackageAddr.TargetSystem, err = parseModuleRegistryTargetSystem(parts[2]); err != nil {
		if strings.Contains(parts[2], "?") {
			// The user was trying to include a query string, probably?
			return ret, fmt.Errorf("module registry addresses may not include a query string portion")
		}
		return ret, fmt.Errorf("invalid target system %q: %s", parts[2], err)
	}

	return ret, nil
}

// parseModuleRegistryName checks a string given in the "namespace" or "name"
// position of a module registry source address and returns it unchanged when
// it is acceptable.
//
// The rules here intentionally differ from those for provider source
// addresses: provider names may only use dashes as punctuation, whereas
// module names may also use underscores. For that reason validation is done
// with the regular expression inherited from the original module source
// implementation rather than with the IDNA rules used by ParseProviderPart.
func parseModuleRegistryName(given string) (string, error) {
	if moduleRegistryNamePattern.MatchString(given) {
		// The name is deliberately not lowercased: that was never done
		// historically, so existing module registries might be matching
		// names case-sensitively.
		return given, nil
	}

	return "", fmt.Errorf("must be between one and 64 characters, including ASCII letters, digits, dashes, and underscores, where dashes and underscores may not be the prefix or suffix")
}

// parseModuleRegistryTargetSystem checks a string given in the "target
// system" position of a module registry source address and returns it
// unchanged when it is acceptable.
//
// This component was historically called "provider", but it was never
// enforced as being a provider address, and it now cannot be one because
// provider addresses have three slash-separated components of their own.
//
// Unlike provider names, the rule here allows neither dashes nor
// underscores, so validation uses the regular expression inherited from the
// original module source implementation rather than the IDNA rules used by
// ParseProviderPart.
func parseModuleRegistryTargetSystem(given string) (string, error) {
	if moduleRegistryTargetSystemPattern.MatchString(given) {
		// No case normalization is applied to the accepted value; note that
		// moduleRegistryTargetSystemPattern itself only admits lowercase
		// letters and digits.
		return given, nil
	}

	return "", fmt.Errorf("must be between one and 64 ASCII letters or digits")
}

// String renders the address in its fully-qualified form, spelling out
// components that users normally leave implied when writing addresses.
//
// This longer form is typically used in error messages, where showing the
// normally-omitted components can help when debugging unexpected behavior.
func (s ModuleSourceRegistry) String() string {
	pkg := s.PackageAddr.String()
	if s.Subdir == "" {
		return pkg
	}
	// A subdirectory is separated from the package address by "//".
	return pkg + "//" + s.Subdir
}

// ForDisplay renders the address the way it would idiomatically be written
// in configuration, leaving out the components that users commonly omit.
// Compare String, which always includes them.
//
// This shorter form is typically used in informational messages, such as a
// note that a package is about to be downloaded.
func (s ModuleSourceRegistry) ForDisplay() string {
	pkg := s.PackageAddr.ForDisplay()
	if s.Subdir == "" {
		return pkg
	}
	// A subdirectory is separated from the package address by "//".
	return pkg + "//" + s.Subdir
}

// splitPackageSubdir separates an address string into its package address
// and its subdirectory portion, if any.
//
// When a subdirectory is present, packageAddr is the address with that
// portion trimmed off and subDir is the subdirectory normalized with
// path.Clean. When there is none, the given string is returned verbatim as
// packageAddr and subDir is empty.
func splitPackageSubdir(given string) (packageAddr, subDir string) {
	packageAddr, subDir = sourceDirSubdir(given)
	if subDir == "" {
		// Nothing to normalize; path.Clean would turn "" into ".".
		return packageAddr, ""
	}
	return packageAddr, path.Clean(subDir)
}

// sourceDirSubdir splits a source URL at its first "//" subdirectory
// separator and returns the URL without the subdirectory, followed by the
// subdirectory itself. Any query string that trails the subdirectory is
// moved back onto the returned URL.
//
// ex:
//
//	dom.com/path/?q=p               => dom.com/path/?q=p, ""
//	proto://dom.com/path//*?q=p     => proto://dom.com/path?q=p, "*"
//	proto://dom.com/path//path2?q=p => proto://dom.com/path?q=p, "path2"
func sourceDirSubdir(src string) (string, string) {
	// Only the portion before the first "?" is searched, because a query
	// parameter may itself contain another URL.
	pathEnd := len(src)
	if q := strings.Index(src, "?"); q > -1 {
		pathEnd = q
	}

	// Begin the search after any "scheme://" prefix so that the scheme's own
	// double slash is not mistaken for the subdirectory separator.
	searchFrom := 0
	if scheme := strings.Index(src[:pathEnd], "://"); scheme > -1 {
		searchFrom = scheme + 3
	}

	// No separator means there is no explicit subdirectory at all.
	rel := strings.Index(src[searchFrom:pathEnd], "//")
	if rel == -1 {
		return src, ""
	}
	sep := searchFrom + rel

	pkg, rest := src[:sep], src[sep+2:]

	// Whatever follows the separator may still carry the query string; if it
	// does, hand the query back to the package URL.
	q := strings.Index(rest, "?")
	if q == -1 {
		return pkg, rest
	}
	return pkg + rest[q:], rest[:q]
}
87 changes: 87 additions & 0 deletions module_package.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package tfaddr

import (
"strings"

svchost "github.com/hashicorp/terraform-svchost"
)

// A ModulePackage identifies a physical location from which Terraform can
// retrieve a module package: an archive, repository, or similar container
// that delivers the source code for one or more Terraform modules.
//
// The value is a string in go-getter's address syntax. By convention a
// ModulePackage holds only the result of successfully running the go-getter
// "detectors", i.e. an address carrying an explicit installation method
// prefix followed by an address in the format that method expects.
//
// Equality is only a partial test of identity. Two equal ModulePackage
// values refer to the same package, but two unequal values may refer to the
// same package as well: the detector phase performs only some simple
// normalization, and there is no reliable way to decide equivalence beyond
// that.
//
// Do not convert a user-provided string directly to ModulePackage. Parse a
// remote module address instead and take the ModulePackage from the result,
// remembering to handle any selected subdirectory too. Direct conversion is
// appropriate only for strings hard-coded into the program (e.g. in a unit
// test) that are already known to be in the expected syntax.
//
// NOTE(review): earlier documentation named ParseModuleSource as the parser
// to use; no such function is visible in this package, so confirm which
// entry point callers are expected to use.
type ModulePackage string

// String returns the package address as a plain string.
func (pkg ModulePackage) String() string {
	return string(pkg)
}

// A ModuleRegistryPackage is an extra indirection over a ModulePackage where
// we use a module registry to translate a more symbolic address (and
// associated version constraint given out of band) into a physical source
// location.
//
// ModuleRegistryPackage is distinct from ModulePackage because they have
// disjoint use-cases: registry package addresses are only used to query a
// registry in order to find a real module package address. These being
// distinct is intended to help future maintainers more easily follow the
// series of steps in the module installer, with the help of the type checker.
type ModuleRegistryPackage struct {
// Host is the hostname of the module registry. ParseRawModuleSourceRegistry
// sets it to DefaultModuleRegistryHost when the address has no explicit
// hostname.
Host svchost.Hostname
// Namespace is the first component of the address after the optional
// hostname.
Namespace string
// Name is the module name, the component following the namespace.
Name string
// TargetSystem is the final component of the address (historically called
// "provider").
TargetSystem string
}

// String returns the fully-qualified package address, always including the
// registry hostname: "host/namespace/name/target-system".
func (s ModuleRegistryPackage) String() string {
	// The hostname is rendered in its "display" form. For service hostnames
	// that means writing non-ASCII characters directly as Unicode instead of
	// the "punycode" representation used for internal processing, which is
	// also what users would actually write in their configurations.
	host := s.Host.ForDisplay()
	return host + "/" + s.ForRegistryProtocol()
}

// ForDisplay returns the package address with the hostname left out when it
// is DefaultModuleRegistryHost; for any other host the hostname is included
// in its display form.
func (s ModuleRegistryPackage) ForDisplay() string {
	addr := s.ForRegistryProtocol()
	if s.Host != DefaultModuleRegistryHost {
		return s.Host.ForDisplay() + "/" + addr
	}
	return addr
}

// ForRegistryProtocol renders only the namespace, name, and target system
// portions of the address, joined by slashes. The registry hostname and any
// subdirectory portion are always left out.
//
// The result is primarily intended for building addresses to send to the
// registry itself via the registry protocol, since the protocol skips
// sending the registry its own hostname as part of identifiers.
func (s ModuleRegistryPackage) ForRegistryProtocol() string {
	return strings.Join([]string{s.Namespace, s.Name, s.TargetSystem}, "/")
}
Loading

0 comments on commit eb7bcc2

Please sign in to comment.