Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Metrics Collector error in case of non-existing Process #1614

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions pkg/metricscollector/v1beta1/common/pns.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"time"

psutil "github.com/shirou/gopsutil/process"
"k8s.io/klog"
)

// WaitPidsOpts is the input options for metrics collector
Expand All @@ -40,7 +41,7 @@ type WaitPidsOpts struct {
func WaitMainProcesses(opts WaitPidsOpts) error {

if runtime.GOOS != "linux" {
return fmt.Errorf("Platform '%s' unsupported", runtime.GOOS)
return fmt.Errorf("platform '%s' unsupported", runtime.GOOS)
}

pids, mainPid, err := GetMainProcesses(opts.CompletedMarkedDirPath)
Expand All @@ -59,21 +60,23 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error)
mainPid := 0

if err != nil {
return nil, 0, fmt.Errorf("Failed to list processes: %v", err)
return nil, 0, fmt.Errorf("failed to list processes: %v", err)
}

thisPID := os.Getpid()
for _, pid := range allPids {
// Create process object from pid
proc, err := psutil.NewProcess(pid)
if err != nil {
return nil, 0, fmt.Errorf("Failed to create new Process from pid %v, error: %v", pid, err)
klog.Infof("Unable to create new process from pid: %v, error: %v. Continue to next pid", pid, err)
continue
}

// Get parent process
ppid, err := proc.Ppid()
if err != nil {
return nil, 0, fmt.Errorf("Unable to get parent pid for pid: %v, error: %v", ppid, err)
klog.Infof("Unable to get parent process for pid: %v, error: %v. Continue to next pid", pid, err)
continue
}

// Ignore the pause container, our own pid, and non-root processes (parent pid != 0)
Expand All @@ -84,11 +87,12 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error)
// Read the process command line
cmdline, err := proc.Cmdline()
if err != nil {
return nil, 0, fmt.Errorf("Unable to get cmdline from pid %v, error: %v", pid, err)
klog.Infof("Unable to get cmdline from pid: %v, error: %v. Continue to next pid", pid, err)
continue
}

// By default mainPid is the first process.
// Command line contains completed marker for the main pid
// In addition to that, command line contains completed marker for the main pid
// For example: echo completed > /var/log/katib/$$$$.pid
// completedMarkedDirPath is the directory for completed marker, e.g. /var/log/katib
if mainPid == 0 ||
Expand All @@ -99,6 +103,11 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error)
pids[int(pid)] = true
}

// If mainPid has not been found, return an error.
if mainPid == 0 {
return nil, 0, fmt.Errorf("unable to find main pid from the process list %v", allPids)
}

return pids, mainPid, nil
}

Expand Down Expand Up @@ -132,7 +141,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error {
// Read file with "completed" marker
contents, err := ioutil.ReadFile(markFile)
if err != nil {
return fmt.Errorf("Training container is failed. Unable to read file %v for pid %v, error: %v", markFile, pid, err)
return fmt.Errorf("training container is failed. Unable to read file %v for pid %v, error: %v", markFile, pid, err)
}
// Check if file contains "early-stopped" marker
// In that case process is not completed
Expand All @@ -141,7 +150,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error {
}
// Check if file contains "completed" marker
if strings.TrimSpace(string(contents)) != TrainingCompleted {
return fmt.Errorf("Unable to find marker: %v in file: %v with contents: %v for pid: %v",
return fmt.Errorf("unable to find marker: %v in file: %v with contents: %v for pid: %v",
TrainingCompleted, markFile, string(contents), pid)
}
}
Expand All @@ -157,7 +166,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error {
}
// We should receive only not exist error when we check /proc/<pid> dir
} else {
return fmt.Errorf("Fail to check process info: %v", err)
return fmt.Errorf("fail to check process info: %v", err)
}
}
}
Expand All @@ -167,7 +176,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error {

// After main loop notFinishedPids map should be empty
if len(notFinishedPids) != 0 {
return fmt.Errorf("Timed out waiting for pids to complete")
return fmt.Errorf("timed out waiting for pids to complete")
}
return nil
}
6 changes: 5 additions & 1 deletion pkg/metricscollector/v1beta1/common/pns.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,18 @@ def GetMainProcesses(completed_marked_dir):
cmd_lind = " ".join(proc.cmdline())

# By default main_pid is the first process.
# Command line contains completed marker for the main pid
# In addition to that, command line contains completed marker for the main pid.
# For example: echo completed > /var/log/katib/$$$$.pid
# completed_marked_dir is the directory for completed marker, e.g. /var/log/katib
if main_pid == 0 or ("echo {} > {}".format(const.TRAINING_COMPLETED, completed_marked_dir) in cmd_lind):
main_pid = pid

pids.add(pid)

# If mainPid has not been found, return an error.
if main_pid == 0:
raise Exception("Unable to find main pid from the process list {}".format(pids))

return pids, main_pid


Expand Down