*: improve remote cluster loader results download, add private DNS mapping

Signed-off-by: Gyuho Lee <leegyuho@amazon.com>
gyuho committed Jun 19, 2020
1 parent 1ba0c33 commit a3c9d7d
Showing 8 changed files with 188 additions and 32 deletions.
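In short: result aggregation in eks/cluster-loader/remote now skips copies when the merged artifacts already exist, checkClusterLoader resolves the node running the cluster-loader pod and downloads its report and log over SSH, and the node-group waiters record a new Status.PrivateDNSToSSHConfig map so a Kubernetes node name (which in EKS is the EC2 instance's private DNS) can be resolved to an SSH endpoint. A minimal sketch of that lookup, using only the types added in this commit (the resolveSSH helper itself is hypothetical):

```go
package main

import (
	"fmt"

	"github.com/aws/aws-k8s-tester/eksconfig"
)

// resolveSSH sketches the lookup this commit enables: an EKS node name is
// the EC2 instance's private DNS name, so a pod's Spec.NodeName keys
// directly into Status.PrivateDNSToSSHConfig.
func resolveSSH(cfg *eksconfig.Config, nodeName string) (eksconfig.SSHConfig, error) {
	sc, ok := cfg.Status.PrivateDNSToSSHConfig[nodeName]
	if !ok {
		return eksconfig.SSHConfig{}, fmt.Errorf("no SSH config for private DNS %q", nodeName)
	}
	return sc, nil
}
```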
134 changes: 120 additions & 14 deletions eks/cluster-loader/remote/cluster-loader.go
@@ -8,6 +8,7 @@ import (
"errors"
"fmt"
"io/ioutil"
"os"
"reflect"
"strings"
"time"
@@ -18,6 +19,7 @@ import (
"github.com/aws/aws-k8s-tester/pkg/fileutil"
k8s_client "github.com/aws/aws-k8s-tester/pkg/k8s-client"
"github.com/aws/aws-k8s-tester/pkg/timeutil"
"github.com/aws/aws-k8s-tester/ssh"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ecr/ecriface"
"go.uber.org/zap"
@@ -740,20 +742,27 @@ func (ts *tester) AggregateResults() (err error) {
for _, fpaths := range v.Logs {
for _, fpath := range fpaths {
if strings.HasSuffix(fpath, "cluster-loader-remote.tar.gz") {
if cerr := fileutil.Copy(fpath, ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath); cerr != nil {
ts.cfg.Logger.Warn("found AddOnClusterLoaderRemote cluster loader report dir .tar.gz file but failed to copy", zap.String("original-file-path", fpath), zap.Error(cerr))
if !fileutil.Exist(ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath) {
if cerr := fileutil.Copy(fpath, ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath); cerr != nil {
ts.cfg.Logger.Warn("found AddOnClusterLoaderRemote cluster loader report dir .tar.gz file but failed to copy", zap.String("original-file-path", fpath), zap.Error(cerr))
} else {
ts.cfg.Logger.Info("successfully copied AddOnClusterLoaderRemote cluster loader report dir .tar.gz file", zap.String("original-file-path", fpath), zap.String("copy-file-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath))
}
} else {
ts.cfg.Logger.Info("successfully copied AddOnClusterLoaderRemote cluster loader report dir .tar.gz file", zap.String("original-file-path", fpath), zap.String("copy-file-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath))
ts.cfg.Logger.Info("AddOnClusterLoaderRemote cluster loader report dir .tar.gz file already exists; skipping copy", zap.String("original-file-path", fpath), zap.String("copy-file-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath))
}
}
if strings.HasSuffix(fpath, "cluster-loader-remote.log") {
if cerr := fileutil.CopyAppend(fpath, ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath); cerr != nil {
ts.cfg.Logger.Warn("found AddOnClusterLoaderRemote cluster loader logs file but failed to copy", zap.String("original-file-path", fpath), zap.Error(cerr))
if !fileutil.Exist(ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath) {
if cerr := fileutil.CopyAppend(fpath, ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath); cerr != nil {
ts.cfg.Logger.Warn("found AddOnClusterLoaderRemote cluster loader logs file but failed to copy", zap.String("original-file-path", fpath), zap.Error(cerr))
} else {
ts.cfg.Logger.Info("successfully copied AddOnClusterLoaderRemote cluster loader logs file", zap.String("original-file-path", fpath), zap.String("copy-file-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath))
}
} else {
ts.cfg.Logger.Info("successfully copied AddOnClusterLoaderRemote cluster loader logs file", zap.String("original-file-path", fpath), zap.String("copy-file-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath))
ts.cfg.Logger.Info("AddOnClusterLoaderRemote cluster loader report logs file already exists; skipping copy", zap.String("original-file-path", fpath), zap.String("copy-file-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath))
}
}

}
}
}
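The fileutil.Exist guards added above make aggregation idempotent: logs are swept from every node, so only the first cluster-loader-remote.tar.gz / .log found is copied, and later matches are skipped rather than overwriting (or re-appending to) the merged artifacts. A minimal sketch of the guard pattern, with hypothetical paths:

```go
package main

import (
	"github.com/aws/aws-k8s-tester/pkg/fileutil"
	"go.uber.org/zap"
)

// copyOnce sketches the copy-once guard used above: copy src to dst only
// if dst does not exist yet, so a later sweep cannot clobber an artifact
// that was already collected.
func copyOnce(lg *zap.Logger, src, dst string) {
	if fileutil.Exist(dst) {
		lg.Info("destination already exists; skipping copy", zap.String("path", dst))
		return
	}
	if err := fileutil.Copy(src, dst); err != nil {
		lg.Warn("copy failed", zap.String("src", src), zap.Error(err))
	}
}
```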
@@ -825,7 +834,60 @@ func (ts *tester) checkClusterLoader() (err error) {
if clusterloaderPod == "" {
return fmt.Errorf("failed to find pod/cluster-loader-remote-deployment in %q", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.Namespace)
}
ts.cfg.Logger.Info("found pod/cluster-loader-remote-deployment", zap.String("name", clusterloaderPod))
ts.cfg.Logger.Info("found pod/cluster-loader-remote-deployment", zap.String("pod-name", clusterloaderPod))

ts.cfg.Logger.Info("checking node name for pod/cluster-loader-remote-deployment")
nodeName, podPhase := "", v1.PodPending
retryStart = time.Now()
for time.Now().Sub(retryStart) < 10*time.Minute {
select {
case <-ts.cfg.Stopc:
ts.cfg.Logger.Warn("cluster loader check stopped")
return nil
case <-time.After(10 * time.Second):
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
var pod *v1.Pod
pod, err = ts.cfg.K8SClient.
KubernetesClientSet().
CoreV1().
Pods(ts.cfg.EKSConfig.AddOnClusterLoaderRemote.Namespace).
Get(ctx, clusterloaderPod, metav1.GetOptions{})
cancel()
if err != nil {
ts.cfg.Logger.Warn("failed to get pod", zap.Error(err))
continue
}
podPhase = pod.Status.Phase
nodeName = pod.Spec.NodeName
ts.cfg.Logger.Info("pod status",
zap.String("pod-name", pod.Name),
zap.String("pod-phase", fmt.Sprintf("%v", podPhase)),
zap.String("node-name", nodeName),
)
if podPhase == v1.PodRunning && nodeName != "" {
break
}
}
if podPhase != v1.PodRunning || nodeName == "" {
return fmt.Errorf("failed to find running pod and assigned node for %q", clusterloaderPod)
}

sshConfig, ok := ts.cfg.EKSConfig.Status.PrivateDNSToSSHConfig[nodeName]
if !ok {
ts.cfg.Logger.Warn("got pod/cluster-loader-remote-deployment, but no SSH config found for private DNS",
zap.String("pod-name", clusterloaderPod),
zap.String("pod-phase", fmt.Sprintf("%v", podPhase)),
zap.String("node-name", nodeName),
)
return fmt.Errorf("no SSH config found for node name (private DNS) %q", nodeName)
}
ts.cfg.Logger.Info("found node name for running pod/cluster-loader-remote-deployment",
zap.String("pod-name", clusterloaderPod),
zap.String("pod-phase", fmt.Sprintf("%v", podPhase)),
zap.String("node-name", nodeName),
zap.String("ssh-config", sshConfig.ToString()),
)

argsLogsPod := []string{
ts.cfg.EKSConfig.KubectlPath,
@@ -837,20 +899,17 @@ func (ts *tester) checkClusterLoader() (err error) {
}
cmdLogsPod := strings.Join(argsLogsPod, " ")

ts.cfg.Logger.Info("running cluster loader", zap.String("logs-command-pod", cmdLogsPod))

interval := 15 * time.Second

ts.cfg.Logger.Info("checking cluster loader logs", zap.String("logs-command-pod", cmdLogsPod))
waitDur := 20 * time.Minute * time.Duration(ts.cfg.EKSConfig.AddOnClusterLoaderRemote.Runs)
start := time.Now()
ready := false
for time.Now().Sub(start) < waitDur {
ts.cfg.Logger.Info("waiting for cluster loader run", zap.Duration("interval", interval))
ts.cfg.Logger.Info("waiting for cluster loader run")
select {
case <-ts.cfg.Stopc:
ts.cfg.Logger.Warn("cluster loader check stopped")
return nil
case <-time.After(interval):
case <-time.After(15 * time.Second):
}

ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
@@ -876,5 +935,52 @@ func (ts *tester) checkClusterLoader() (err error) {
return errors.New("cluster loader remote failed to complete")
}

ts.cfg.Logger.Info("checking cluster loader report .tar.gz")
sh, err := ssh.New(ssh.Config{
Logger: ts.cfg.Logger,
KeyPath: ts.cfg.EKSConfig.RemoteAccessPrivateKeyPath,
PublicIP: sshConfig.PublicIP,
PublicDNSName: sshConfig.PublicDNSName,
UserName: sshConfig.UserName,
})
if err != nil {
return fmt.Errorf("failed to create SSH config for %q (%v)", nodeName, err)
}
defer sh.Close()
if err = sh.Connect(); err != nil {
return fmt.Errorf("failed to SSH connect to %q (%v)", nodeName, err)
}
var out []byte
out, err = sh.Download(
"/var/log/cluster-loader-remote.tar.gz",
ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath,
ssh.WithVerbose(ts.cfg.EKSConfig.LogLevel == "debug"),
ssh.WithRetry(3, 10*time.Second),
)
if err != nil {
os.RemoveAll(ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath)
return fmt.Errorf("failed to download '/var/log/cluster-loader-remote.tar.gz' from %q (%v)", nodeName, err)
}
fmt.Printf("\nDownloaded '/var/log/cluster-loader-remote.tar.gz' output:\n%s\n", string(out))

out, err = sh.Download(
"/var/log/cluster-loader-remote.log",
ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath,
ssh.WithVerbose(ts.cfg.EKSConfig.LogLevel == "debug"),
ssh.WithRetry(3, 10*time.Second),
)
if err != nil {
os.RemoveAll(ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath)
return fmt.Errorf("failed to download '/var/log/cluster-loader-remote.log' from %q (%v)", nodeName, err)
}
fmt.Printf("\nDownloaded '/var/log/cluster-loader-remote.tar.gz' output:\n%s\n", string(out))

ts.cfg.Logger.Info("downloaded cluster loader results from remote node",
zap.String("tar-gz-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.ReportTarGzPath),
zap.String("log-path", ts.cfg.EKSConfig.AddOnClusterLoaderRemote.LogPath),
)

return ts.cfg.EKSConfig.Sync()
}

// /var/log/cluster-loader-remote.tar.gz
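One design note on the error paths above: a failed sh.Download may leave a truncated local file, which the fileutil.Exist guard in AggregateResults would later treat as a complete artifact, so the partial file is removed before returning. A condensed sketch, assuming the handle returned by ssh.New exposes the ssh.SSH interface used above:

```go
package main

import (
	"fmt"
	"os"
	"time"

	"github.com/aws/aws-k8s-tester/ssh"
)

// downloadOrClean sketches the cleanup-on-failure pattern above: if the
// SSH download fails partway, remove the partial local file so later
// existence checks do not mistake it for a complete artifact.
func downloadOrClean(sh ssh.SSH, remote, local string) ([]byte, error) {
	out, err := sh.Download(remote, local, ssh.WithRetry(3, 10*time.Second))
	if err != nil {
		os.RemoveAll(local) // drop any partial download
		return nil, fmt.Errorf("failed to download %q (%v)", remote, err)
	}
	return out, nil
}
```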
12 changes: 6 additions & 6 deletions eks/mng/logs.go
@@ -91,10 +91,10 @@ func (ts *tester) fetchLogs(qps float32, burst int) error {
)
waits += len(nodeGroup.Instances)

for instID, iv := range nodeGroup.Instances {
for instID, cur := range nodeGroup.Instances {
pfx := instID + "-"

go func(instID, logsDir, pfx string, iv ec2config.Instance) {
go func(instID, logsDir, pfx string, cur ec2config.Instance) {
select {
case <-ts.cfg.Stopc:
ts.cfg.Logger.Warn("exiting fetch logger", zap.String("prefix", pfx))
@@ -119,9 +121,9 @@ func (ts *tester) fetchLogs(qps float32, burst int) error {
sh, err := ssh.New(ssh.Config{
Logger: ts.cfg.Logger,
KeyPath: ts.cfg.EKSConfig.RemoteAccessPrivateKeyPath,
PublicIP: iv.PublicIP,
PublicDNSName: iv.PublicDNSName,
UserName: iv.RemoteAccessUserName,
PublicIP: cur.PublicIP,
PublicDNSName: cur.PublicDNSName,
UserName: cur.RemoteAccessUserName,
})
if err != nil {
rch <- instanceLogs{mngName: name, errs: []string{err.Error()}}
@@ -382,7 +382,7 @@ func (ts *tester) fetchLogs(qps float32, burst int) error {
}
}
rch <- data
}(instID, logsDir, pfx, iv)
}(instID, logsDir, pfx, cur)
}
}

7 changes: 7 additions & 0 deletions eks/mng/wait/wait.go
@@ -147,6 +147,13 @@ func (ts *tester) waitForNodes(mngName string, retriesLeft int) error {
ivv.RemoteAccessUserName = cur.RemoteAccessUserName
cur.Instances[id] = ivv
}
for _, inst := range cur.Instances {
ts.cfg.EKSConfig.Status.PrivateDNSToSSHConfig[inst.PrivateDNSName] = eksconfig.SSHConfig{
PublicIP: inst.PublicIP,
PublicDNSName: inst.PublicDNSName,
UserName: cur.RemoteAccessUserName,
}
}
ts.cfg.EKSConfig.AddOnManagedNodeGroups.MNGs[mngName] = cur
ts.cfg.EKSConfig.Sync()
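The same population loop is added to both node-group waiters (here and in eks/ng/wait/wait.go below), so by the time any add-on runs, every ready instance is addressable by its private DNS. An illustrative entry, with example values:

```go
// Hypothetical illustration of one entry recorded above: the key is the
// EC2 private DNS name (also the Kubernetes node name), the value its
// SSH endpoint.
cfg.Status.PrivateDNSToSSHConfig["ip-192-168-1-23.us-west-2.compute.internal"] = eksconfig.SSHConfig{
	PublicIP:      "203.0.113.10", // example value
	PublicDNSName: "ec2-203-0-113-10.us-west-2.compute.amazonaws.com",
	UserName:      "ec2-user", // e.g. for Amazon Linux 2 node AMIs
}
```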

12 changes: 6 additions & 6 deletions eks/ng/logs.go
@@ -93,10 +93,10 @@ func (ts *tester) fetchLogs(qps float32, burst int) error {
)
waits += len(nodeGroup.Instances)

for instID, iv := range nodeGroup.Instances {
for instID, cur := range nodeGroup.Instances {
pfx := instID + "-"

go func(instID, logsDir, pfx string, iv ec2config.Instance) {
go func(instID, logsDir, pfx string, cur ec2config.Instance) {
select {
case <-ts.cfg.Stopc:
ts.cfg.Logger.Warn("exiting fetch logger", zap.String("prefix", pfx))
@@ -121,9 +121,9 @@ func (ts *tester) fetchLogs(qps float32, burst int) error {
sh, err := ssh.New(ssh.Config{
Logger: ts.cfg.Logger,
KeyPath: ts.cfg.EKSConfig.RemoteAccessPrivateKeyPath,
PublicIP: iv.PublicIP,
PublicDNSName: iv.PublicDNSName,
UserName: iv.RemoteAccessUserName,
PublicIP: cur.PublicIP,
PublicDNSName: cur.PublicDNSName,
UserName: cur.RemoteAccessUserName,
})
if err != nil {
rch <- instanceLogs{asgName: name, errs: []string{err.Error()}}
@@ -384,7 +384,7 @@ func (ts *tester) fetchLogs(qps float32, burst int) error {
}
}
rch <- data
}(instID, logsDir, pfx, iv)
}(instID, logsDir, pfx, cur)
}
}

7 changes: 7 additions & 0 deletions eks/ng/wait/wait.go
@@ -109,6 +109,13 @@ func (ts *tester) waitForNodes(asgName string, retriesLeft int) error {
ivv.RemoteAccessUserName = cur.RemoteAccessUserName
cur.Instances[id] = ivv
}
for _, inst := range cur.Instances {
ts.cfg.EKSConfig.Status.PrivateDNSToSSHConfig[inst.PrivateDNSName] = eksconfig.SSHConfig{
PublicIP: inst.PublicIP,
PublicDNSName: inst.PublicDNSName,
UserName: cur.RemoteAccessUserName,
}
}
ts.cfg.EKSConfig.AddOnNodeGroups.ASGs[asgName] = cur
ts.cfg.EKSConfig.Sync()

10 changes: 8 additions & 2 deletions eksconfig/config.go
@@ -796,7 +796,10 @@ func NewDefault() *Config {
AddOnClusterVersionUpgrade: getDefaultAddOnClusterVersionUpgrade(),

// read-only
Status: &Status{Up: false},
Status: &Status{
Up: false,
PrivateDNSToSSHConfig: make(map[string]SSHConfig),
},
}

// https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html
@@ -1128,7 +1131,10 @@ func (cfg *Config) validateConfig() error {
}

if cfg.Status == nil {
cfg.Status = &Status{Up: false}
cfg.Status = &Status{
Up: false,
PrivateDNSToSSHConfig: make(map[string]SSHConfig),
}
}
if cfg.Status.ClusterCFNStackYAMLFilePath == "" {
cfg.Status.ClusterCFNStackYAMLFilePath = strings.ReplaceAll(cfg.ConfigPath, ".yaml", "") + ".cluster.cfn.yaml"
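Both NewDefault and validateConfig now initialize PrivateDNSToSSHConfig because assignment into a nil Go map panics at runtime; a Status loaded from an older config file would otherwise crash the node-group waiters on their first write. A one-line illustration:

```go
var m map[string]eksconfig.SSHConfig
m["node"] = eksconfig.SSHConfig{} // panic: assignment to entry in nil map

m = make(map[string]eksconfig.SSHConfig) // what NewDefault/validateConfig now do
m["node"] = eksconfig.SSHConfig{}        // ok
```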
9 changes: 5 additions & 4 deletions eksconfig/default.yaml
@@ -23,7 +23,7 @@ log-level: info
log-outputs:
- stderr
- /home/ANT.AMAZON.COM/leegyuho/go/src/github.com/aws/aws-k8s-tester/eksconfig/default.log
name: eks-2020061817-morningaua72
name: eks-2020061900-bellevuewzm4
on-failure-delete: true
on-failure-delete-wait-seconds: 120
parameters:
@@ -41,7 +41,7 @@ parameters:
role-cfn-stack-yaml-file-path: /home/ANT.AMAZON.COM/leegyuho/go/src/github.com/aws/aws-k8s-tester/eksconfig/default.role.cfn.yaml
role-create: true
role-managed-policy-arns: null
role-name: eks-2020061817-morningaua72-role
role-name: eks-2020061900-bellevuewzm4-role
role-service-principals: null
signing-name: eks
tags: null
@@ -55,8 +55,8 @@ partition: aws
region: us-west-2
remote-access-commands-output-path: /home/ANT.AMAZON.COM/leegyuho/go/src/github.com/aws/aws-k8s-tester/eksconfig/default.ssh.sh
remote-access-key-create: true
remote-access-key-name: eks-2020061817-morningaua72-remote-access-key
remote-access-private-key-path: /tmp/cherrybgcpbx847.insecure.key
remote-access-key-name: eks-2020061900-bellevuewzm4-remote-access-key
remote-access-private-key-path: /tmp/livelygq2d81y9a.insecure.key
s3-bucket-create: false
s3-bucket-create-keep: false
s3-bucket-lifecycle-expiration-days: 0
@@ -79,6 +79,7 @@ status:
cluster-oidc-issuer-url: ""
cluster-status: null
cluster-status-current: ""
private-dns-to-ssh-config: {}
server-version-info:
buildDate: ""
compiler: ""
29 changes: 29 additions & 0 deletions eksconfig/status.go
@@ -1,6 +1,8 @@
package eksconfig

import (
"encoding/json"
"fmt"
"time"

k8s_client "github.com/aws/aws-k8s-tester/pkg/k8s-client"
@@ -60,8 +62,35 @@ type Status struct {
ClusterStatusCurrent string `json:"cluster-status-current"`
// ClusterStatus represents the status of the cluster.
ClusterStatus []ClusterStatus `json:"cluster-status"`

// PrivateDNSToSSHConfig maps each worker node's private DNS name to its
// public IP, public DNS name, and SSH access user name.
// Worker node name in AWS is the node's EC2 instance private DNS.
// This is used for SSH access.
PrivateDNSToSSHConfig map[string]SSHConfig `json:"private-dns-to-ssh-config"`
}

// SSHConfig represents basic SSH access configuration for worker nodes.
type SSHConfig struct {
PublicIP string `json:"public-ip"`
PublicDNSName string `json:"public-dns-name"`
UserName string `json:"user-name"`
}

func (sc SSHConfig) ToString() string {
b, err := json.Marshal(sc)
if err != nil {
return fmt.Sprintf("%+v", sc)
}
return string(b)
}
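ToString renders the struct through its JSON tags, falling back to %+v formatting if marshaling ever fails. Example output with hypothetical values:

```go
sc := eksconfig.SSHConfig{
	PublicIP:      "203.0.113.10",
	PublicDNSName: "ec2-203-0-113-10.us-west-2.compute.amazonaws.com",
	UserName:      "ec2-user",
}
fmt.Println(sc.ToString())
// {"public-ip":"203.0.113.10","public-dns-name":"ec2-203-0-113-10.us-west-2.compute.amazonaws.com","user-name":"ec2-user"}
```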

/*
TODO:
map all private IPs to public IP + public DNS
map node name to internal IP, private IP
pod node name to internal IP ->
*/

// ClusterStatus represents the cluster status.
type ClusterStatus struct {
Time time.Time `json:"time"`
