Skip to content

Commit

Permalink
Add ACME DNS validator (#20293)
Browse files Browse the repository at this point in the history
* Add DNS challenge validation to ACME

This allows us to validate against wildcard domain names, as the HTTP-01
challenge does not support wildcard validation.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Properly delay challenge retry attempts

Previously we'd essentially semi-busy wait for a challenge to become
retry-able, as the queue itself had no knowledge of the retry after
value of the validation attempt. Now, we plumb through this value into
the queue itself, to aid the selection of validations to attempt.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

---------

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
  • Loading branch information
cipherboy committed Apr 21, 2023
1 parent 816e182 commit 924129c
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 23 deletions.
125 changes: 102 additions & 23 deletions builtin/logical/pki/acme_challenge_engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ type ChallengeValidation struct {
RetryAfter time.Time `json:"retry_after,omitempty"`
}

// ChallengeQueueEntry is a single item in the in-memory queue of pending
// challenge validations. It pairs the challenge to validate with the earliest
// time at which that validation should next be attempted, so the queue runner
// can skip entries that are not yet ready.
type ChallengeQueueEntry struct {
// Identifier is the id of the challenge validation; it is the value handed
// to VerifyChallenge when the entry is picked up for work.
Identifier string
// RetryAfter is the time before which this entry should not be attempted.
// The zero value means no delay is known and the entry is eligible
// immediately (entries that are newly accepted or loaded from storage are
// queued this way).
RetryAfter time.Time
}

type ACMEChallengeEngine struct {
NumWorkers int

Expand Down Expand Up @@ -72,7 +77,9 @@ func (ace *ACMEChallengeEngine) LoadFromStorage(b *backend, sc *storageContext)

// Add them to our queue of validations to work through later.
for _, item := range items {
ace.Validations.PushBack(item)
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: item,
})
}

return nil
Expand Down Expand Up @@ -125,22 +132,76 @@ func (ace *ACMEChallengeEngine) _run(b *backend) error {
finishedWorkersChannels = newFinishedWorkersChannels

// If we have space to take on another work item, do so.
if len(finishedWorkersChannels) < ace.NumWorkers {
firstIdentifier := ""
startedWork := false
now := time.Now()
for len(finishedWorkersChannels) < ace.NumWorkers {
var task *ChallengeQueueEntry

// Find our next work item. We do all of these operations
// while holding the queue lock, hence some repeated checks
// afterwards. Out of this, we get a candidate task, using
// element == nil as a sentinel for breaking our parent
// loop.
ace.ValidationLock.Lock()
element := ace.Validations.Front()
if element != nil {
ace.Validations.Remove(element)
task = element.Value.(*ChallengeQueueEntry)
if !task.RetryAfter.IsZero() && now.Before(task.RetryAfter) {
// We cannot work on this element yet; remove it to
// the back of the queue. This allows us to potentially
// select the next item in the next iteration.
ace.Validations.PushBack(task)
}

if firstIdentifier != "" && task.Identifier == firstIdentifier {
// We found and rejected this element before; exit the
// loop by "claiming" we didn't find any work.
element = nil
} else if firstIdentifier == "" {
firstIdentifier = task.Identifier
}
}
ace.ValidationLock.Unlock()
if element == nil {
// There was no more work to do to fill up the queue; exit
// this loop.
break
}
if now.Before(task.RetryAfter) {
// Here, while we found an element, we didn't want to
// completely exit the loop (perhaps it was our first time
// finding a work order), so retry without modifying
// firstIdentifier.
continue
}

task := element.Value.(string)
// Since this work item was valid, we won't expect to see it in
// the validation queue again until it is executed. Here, we
// want to avoid infinite looping above (if we removed the one
// valid item and the remainder are all not immediately
// actionable). At the worst, we'll spend a little more time
// looping through the queue until we hit a repeat.
firstIdentifier = ""

// Here, we got a piece of work that is ready to check; create a
// channel and a new go routine and run it. Note that this still
// could have a RetryAfter date we're not aware of (e.g., if the
// cluster restarted as we do not read the entries there).
channel := make(chan bool, 1)
go ace.VerifyChallenge(runnerSC, task, channel)
go ace.VerifyChallenge(runnerSC, task.Identifier, channel)
finishedWorkersChannels = append(finishedWorkersChannels, channel)
startedWork = true
}

// If we have no more work.
if len(finishedWorkersChannels) == ace.NumWorkers {
time.Sleep(50 * time.Millisecond)
// If we have no more capacity for work, we should pause a little to
// let the system catch up. Additionally, if we only had
// non-actionable work items, we should pause until some time has
// elapsed: not too much that we potentially starve any new incoming
// items from validation, but not too short that we cause a busy loop.
if len(finishedWorkersChannels) == ace.NumWorkers || !startedWork {
time.Sleep(100 * time.Millisecond)
}

// Lastly, if we have more work to do, re-trigger ourselves.
Expand Down Expand Up @@ -206,7 +267,9 @@ func (ace *ACMEChallengeEngine) AcceptChallenge(sc *storageContext, account stri

ace.ValidationLock.Lock()
defer ace.ValidationLock.Unlock()
ace.Validations.PushBack(name)
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: name,
})

select {
case ace.NewValidation <- name:
Expand All @@ -220,7 +283,7 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
sc, _ /* cancel func */ := runnerSc.WithFreshTimeout(MaxChallengeTimeout)
runnerSc.Backend.Logger().Debug("Starting verification of challenge: %v", id)

if retry, err := ace._verifyChallenge(sc, id); err != nil {
if retry, retryAfter, err := ace._verifyChallenge(sc, id); err != nil {
// Because verification of this challenge failed, we need to retry
// it in the future. Log the error and re-add the item to the queue
// to try again later.
Expand All @@ -229,7 +292,10 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
if retry {
ace.ValidationLock.Lock()
defer ace.ValidationLock.Unlock()
ace.Validations.PushBack(id)
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: id,
RetryAfter: retryAfter,
})

// Let the validator know there's a pending challenge.
select {
Expand All @@ -249,12 +315,12 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
finished <- false
}

func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string) (bool, error) {
func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string) (bool, time.Time, error) {
now := time.Now()
path := acmeValidationPrefix + id
challengeEntry, err := sc.Storage.Get(sc.Context, path)
if err != nil {
return true, fmt.Errorf("error loading challenge %v: %w", id, err)
return true, now, fmt.Errorf("error loading challenge %v: %w", id, err)
}

if challengeEntry == nil {
Expand All @@ -267,18 +333,17 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string)

var cv *ChallengeValidation
if err := challengeEntry.DecodeJSON(&cv); err != nil {
return true, fmt.Errorf("error decoding challenge %v: %w", id, err)
return true, now, fmt.Errorf("error decoding challenge %v: %w", id, err)
}

if now.Before(cv.RetryAfter) {
time.Sleep(50 * time.Millisecond)
return true, fmt.Errorf("retrying challenge %v too soon", id)
return true, cv.RetryAfter, fmt.Errorf("retrying challenge %v too soon", id)
}

authzPath := getAuthorizationPath(cv.Account, cv.Authorization)
authz, err := loadAuthorizationAtPath(sc, authzPath)
if err != nil {
return true, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
return true, now, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
}

if authz.Status != ACMEAuthorizationPending {
Expand Down Expand Up @@ -324,12 +389,24 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string)
err = fmt.Errorf("error validating http-01 challenge %v: %w", id, err)
return ace._verifyChallengeRetry(sc, cv, authz, err, id)
}
case ACMEDNSChallenge:
if authz.Identifier.Type != ACMEDNSIdentifier {
err = fmt.Errorf("unsupported identifier type for authorization %v/%v in challenge %v: %v", cv.Account, cv.Authorization, id, authz.Identifier.Type)
return ace._verifyChallengeCleanup(sc, err, id)
}

valid, err = ValidateDNS01Challenge(authz.Identifier.Value, cv.Token, cv.Thumbprint)
if err != nil {
err = fmt.Errorf("error validating dns-01 challenge %v: %w", id, err)
return ace._verifyChallengeRetry(sc, cv, authz, err, id)
}
default:
err = fmt.Errorf("unsupported ACME challenge type %v for challenge %v", cv.ChallengeType, id)
return ace._verifyChallengeCleanup(sc, err, id)
}

if !valid {
err = fmt.Errorf("challenge failed with no additional information")
return ace._verifyChallengeRetry(sc, cv, authz, err, id)
}

Expand All @@ -349,7 +426,7 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string)
return ace._verifyChallengeCleanup(sc, nil, id)
}

func (ace *ACMEChallengeEngine) _verifyChallengeRetry(sc *storageContext, cv *ChallengeValidation, authz *ACMEAuthorization, err error, id string) (bool, error) {
func (ace *ACMEChallengeEngine) _verifyChallengeRetry(sc *storageContext, cv *ChallengeValidation, authz *ACMEAuthorization, err error, id string) (bool, time.Time, error) {
now := time.Now()
path := acmeValidationPrefix + id

Expand All @@ -367,29 +444,31 @@ func (ace *ACMEChallengeEngine) _verifyChallengeRetry(sc *storageContext, cv *Ch

json, jsonErr := logical.StorageEntryJSON(path, cv)
if jsonErr != nil {
return true, fmt.Errorf("error persisting updated challenge validation queue entry (error prior to retry, if any: %v): %w", err, jsonErr)
return true, now, fmt.Errorf("error persisting updated challenge validation queue entry (error prior to retry, if any: %v): %w", err, jsonErr)
}

if putErr := sc.Storage.Put(sc.Context, json); putErr != nil {
return true, fmt.Errorf("error writing updated challenge validation entry (error prior to retry, if any: %v): %w", err, putErr)
return true, now, fmt.Errorf("error writing updated challenge validation entry (error prior to retry, if any: %v): %w", err, putErr)
}

if err != nil {
err = fmt.Errorf("retrying validation: %w", err)
}

return true, err
return true, cv.RetryAfter, err
}

func (ace *ACMEChallengeEngine) _verifyChallengeCleanup(sc *storageContext, err error, id string) (bool, error) {
func (ace *ACMEChallengeEngine) _verifyChallengeCleanup(sc *storageContext, err error, id string) (bool, time.Time, error) {
now := time.Now()

// Remove our ChallengeValidation entry only.
if deleteErr := sc.Storage.Delete(sc.Context, acmeValidationPrefix+id); deleteErr != nil {
return true, fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
return true, now.Add(-1 * time.Second), fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
}

if err != nil {
err = fmt.Errorf("removing challenge validation attempt and not retrying %v; previous error: %w", id, err)
}

return false, err
return false, now, err
}
46 changes: 46 additions & 0 deletions builtin/logical/pki/acme_challenges.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package pki

import (
"crypto/sha256"
"encoding/base64"
"fmt"
"io"
"net"
Expand Down Expand Up @@ -28,6 +30,23 @@ func ValidateKeyAuthorization(keyAuthz string, token string, thumbprint string)
return true, nil
}

// ValidateSHA256KeyAuthorization reports whether keyAuthz equals the
// unpadded, URL-safe base64 encoding of the SHA-256 digest of
// "<token>.<thumbprint>". It returns (true, nil) on a match and
// (false, err) otherwise.
//
// This is for use with DNS challenges, whose TXT record carries the digest
// of the key authorization rather than the key authorization itself.
func ValidateSHA256KeyAuthorization(keyAuthz string, token string, thumbprint string) (bool, error) {
	// Hash the key authorization ("token.thumbprint") and encode the digest
	// without padding, which is the form we expect to have been presented.
	digest := sha256.Sum256([]byte(token + "." + thumbprint))

	if want := base64.RawURLEncoding.EncodeToString(digest[:]); keyAuthz == want {
		return true, nil
	}

	return false, fmt.Errorf("sha256 key authorization was invalid")
}

// Validates a given ACME http-01 challenge against the specified domain,
// per RFC 8555.
//
Expand Down Expand Up @@ -109,3 +128,30 @@ func ValidateHTTP01Challenge(domain string, token string, thumbprint string) (bo
// the token because we're bounded by a reasonable amount of length.
return ValidateKeyAuthorization(keyAuthz, token, thumbprint)
}

// ValidateDNS01Challenge validates an ACME dns-01 challenge for the given
// domain by looking up the TXT records at "_acme-challenge.<domain>" and
// checking whether any of them is the expected SHA-256 key authorization
// for token and thumbprint.
//
// It returns (true, nil) as soon as one record matches. If the lookup fails,
// or no record matches, it returns (false, err).
func ValidateDNS01Challenge(domain string, token string, thumbprint string) (bool, error) {
	// The domain we are handed is the identifier value after any wildcard
	// processing has been applied. Per RFC 8555, validation is the same
	// whether or not the original request was for a wildcard entry.
	//
	// XXX: The DNS server here is operator controlled and assumed to be less
	// malicious, so we rely on the default resolver. In the future we'll want
	// to switch to net.Resolver for two reasons:
	//
	// 1. To control the actual resolver via ACME configuration,
	// 2. To use a context to set stricter timeout limits.
	recordName := "_acme-challenge." + domain

	txtRecords, lookupErr := net.LookupTXT(recordName)
	if lookupErr != nil {
		return false, fmt.Errorf("dns-01: failed to lookup TXT records for domain (%v): %w", recordName, lookupErr)
	}

	// Any single matching record is sufficient. Per-record mismatch errors
	// are intentionally dropped, as unrelated TXT values may be present; we
	// only report an aggregate failure below.
	for i := range txtRecords {
		if matched, _ := ValidateSHA256KeyAuthorization(txtRecords[i], token, thumbprint); matched {
			return true, nil
		}
	}

	return false, fmt.Errorf("dns-01: challenge failed against %v records", len(txtRecords))
}

0 comments on commit 924129c

Please sign in to comment.