From 5d2095f8398b33792b116472563c568742a10ad8 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Sat, 19 Aug 2023 14:33:30 -0700 Subject: [PATCH 1/5] job-list: cleanup test fd cleanup Problem: In job-list/test/job_data.c, the function read_file() returns a file descriptor that no one uses. Have read_file() return void instead of the file descriptor. Simply close the file descriptor before returning. --- src/modules/job-list/test/job_data.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/modules/job-list/test/job_data.c b/src/modules/job-list/test/job_data.c index 3029782e6ea2..c0dd23fe40e3 100644 --- a/src/modules/job-list/test/job_data.c +++ b/src/modules/job-list/test/job_data.c @@ -277,7 +277,7 @@ struct test_ncores { { NULL, NULL, 0, 0 }, }; -static int read_file (const char *filename, void **datap) +static void read_file (const char *filename, void **datap) { int fd; ssize_t size; @@ -287,34 +287,32 @@ static int read_file (const char *filename, void **datap) /* N.B. read_all() NUL terminates buffer */ if ((size = read_all (fd, datap)) < 0) BAIL_OUT ("failed to read data %s", filename); - return fd; + close (fd); } static int parse_jobspec (struct job *job, const char *filename) { char *data; - int fd, ret; + int ret; - fd = read_file (filename, (void **)&data); + read_file (filename, (void **)&data); ret = job_parse_jobspec_fatal (job, data); free (data); - close (fd); return ret; } static int parse_R (struct job *job, const char *filename) { char *data; - int fd, ret; + int ret; - fd = read_file (filename, (void **)&data); + read_file (filename, (void **)&data); ret = job_parse_R_fatal (job, data); free (data); - close (fd); return ret; } From 3af776e13753464175e186c61fc77d4f3a545444 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 18 Aug 2023 12:40:18 -0700 Subject: [PATCH 2/5] job-list: update data comments for fields parsed Problem: In the header file job_data.h, the comments for what fields are parsed in specific parsing functions is out of date. Update the headers appropriately for all fields parsed by those functions. --- src/modules/job-list/job_data.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/modules/job-list/job_data.h b/src/modules/job-list/job_data.h index b6d0b08e83c9..39c53d783dcb 100644 --- a/src/modules/job-list/job_data.h +++ b/src/modules/job-list/job_data.h @@ -95,8 +95,11 @@ struct job *job_create (flux_t *h, flux_jobid_t id); /* Parse and internally cache jobspec. Set values for: * - job name + * - queue * - ntasks * - nnodes (if available) + * - ncores (if possible) + * - duration */ int job_parse_jobspec (struct job *job, const char *s); @@ -109,6 +112,8 @@ int job_parse_jobspec_fatal (struct job *job, const char *s); * - expiration * - nnodes * - nodelist + * - ncores + * - ntasks (if necessary) */ int job_parse_R (struct job *job, const char *s); From 8b4531e88129e6d469eac03c8a66da76d1903e10 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 18 Aug 2023 14:03:22 -0700 Subject: [PATCH 3/5] job-list: do not return error when nonfatal set Problem: The internal function parse_jobspec() returns an error for an illegal json jobspec when the 'allow_nonfatal' flag is set. Do not return on error when 'allow_nonfatal' is set to true. --- src/modules/job-list/job_data.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/modules/job-list/job_data.c b/src/modules/job-list/job_data.c index c7b4df40663f..5b425c0cc43e 100644 --- a/src/modules/job-list/job_data.c +++ b/src/modules/job-list/job_data.c @@ -266,13 +266,12 @@ static int parse_jobspec (struct job *job, const char *s, bool allow_nonfatal) json_error_t error; json_t *jobspec_job = NULL; json_t *tasks; - int rc = -1; if (!(job->jobspec = json_loads (s, 0, &error))) { flux_log (job->h, LOG_ERR, "%s: job %s invalid jobspec: %s", __FUNCTION__, idf58 (job->id), error.text); - goto error; + goto nonfatal_error; } if (json_unpack_ex (job->jobspec, &error, 0, @@ -344,9 +343,7 @@ static int parse_jobspec (struct job *job, const char *s, bool allow_nonfatal) /* nonfatal error - jobspec illegal, but we'll continue on. job * listing will return whatever data is available */ nonfatal_error: - rc = allow_nonfatal ? 0 : -1; -error: - return rc; + return allow_nonfatal ? 0 : -1; } int job_parse_jobspec (struct job *job, const char *s) From 4cc7e6b2b65e4ead3fe7f3bc2cb3ba194c3ebfc7 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 18 Aug 2023 16:38:55 -0700 Subject: [PATCH 4/5] testsuite: add extra job-list illegal R tests Problem: There are several tests in t2260-job-list.t that test that job-list works if R is illegal. However, they only test when R is an illegal formatted json object. They do not test if R is missing expected fields. Add additional tests that set R to an empty json object. --- t/t2260-job-list.t | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index d4e722b5aff7..d06cd8b42b08 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -2533,7 +2533,7 @@ test_expect_success 'reload job-ingest with defaults' ' # we make R invalid by overwriting it in the KVS before job-list will # look it up -test_expect_success 'flux job list works on job with illegal R' ' +test_expect_success 'flux job list works on job with illegal R/json' ' ${RPC} job-list.job-state-pause 0 /dev/null \ + && [ $i -lt 5 ] + do + sleep 1 + i=$((i + 1)) + done && + test "$i" -lt "5" && + flux job list --states=inactive | grep $jobid > list_illegal_R.out && + cat list_illegal_R.out | $jq -e ".ranks == null" && + cat list_illegal_R.out | $jq -e ".nnodes == null" && + cat list_illegal_R.out | $jq -e ".nodelist == null" && + cat list_illegal_R.out | $jq -e ".expiration == null" +' + +test_expect_success NO_CHAIN_LINT 'flux job list-ids works on job with illegal R/json' ' ${RPC} job-list.job-state-pause 0 list_id_illegal_R.out & + pid=$! + wait_idsync 1 && + ${RPC} job-list.job-state-unpause 0 Date: Sun, 20 Aug 2023 21:00:42 -0700 Subject: [PATCH 5/5] job-list: refactor state transition flags Problem: In the internal state transition struct, the flag "processed" is a little ambiguous. In addition, multiple locations manually remove a state transition from the next_states list. Solution: Split the "processed" flag into two flags, "processing" and "finished". The "processing" flag more clearly indicates that the state transition is being processed. The "finished" flag indicates the transition is done and we can remove the struct from the next_states list. With these new flags, we no longer need to remove the struct from the next_states list in so many locations. Simply set the "finished" flag to true and remove from the list in process_next_state(). --- src/modules/job-list/job_state.c | 33 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/modules/job-list/job_state.c b/src/modules/job-list/job_state.c index 3b58c3ccad63..71137d97c416 100644 --- a/src/modules/job-list/job_state.c +++ b/src/modules/job-list/job_state.c @@ -47,7 +47,10 @@ struct state_transition { flux_job_state_t state; - bool processed; + bool processing; /* indicates we are waiting for + * current update to complete */ + bool finished; /* indicates we are done, can remove + * from list */ double timestamp; int flags; flux_job_state_t expected_state; @@ -294,7 +297,7 @@ static void state_depend_lookup_continuation (flux_future_t *f, void *arg) st = zlist_head (job->next_states); assert (st); update_job_state_and_list (jsctx, job, st->state, st->timestamp); - zlist_remove (job->next_states, st); + st->finished = true; process_next_state (jsctx, job); out: @@ -344,7 +347,7 @@ static void state_run_lookup_continuation (flux_future_t *f, void *arg) st = zlist_head (job->next_states); assert (st); update_job_state_and_list (jsctx, job, st->state, st->timestamp); - zlist_remove (job->next_states, st); + st->finished = true; process_next_state (jsctx, job); out: @@ -413,7 +416,8 @@ static int add_state_transition (struct job *job, if (!(st = calloc (1, sizeof (*st)))) return -1; st->state = newstate; - st->processed = false; + st->processing = false; + st->finished = false; st->timestamp = timestamp; st->flags = flags; st->expected_state = expected_state; @@ -437,7 +441,10 @@ static void process_next_state (struct job_state_ctx *jsctx, struct job *job) struct state_transition *st; while ((st = zlist_head (job->next_states)) - && !st->processed) { + && (!st->processing || st->finished)) { + + if (st->finished) + goto next; if ((st->flags & STATE_TRANSITION_FLAG_REVERT)) { /* only revert if the current state is what is expected */ @@ -447,15 +454,15 @@ static void process_next_state (struct job_state_ctx *jsctx, struct job *job) update_job_state_and_list (jsctx, job, st->state, st->timestamp); } else { - zlist_remove (job->next_states, st); - continue; + st->finished = true; + goto next; } } else if ((st->flags & STATE_TRANSITION_FLAG_CONDITIONAL)) { /* if current state isn't what we expected, move on */ if (job->state != st->expected_state) { - zlist_remove (job->next_states, st); - continue; + st->finished = true; + goto next; } } @@ -489,7 +496,7 @@ static void process_next_state (struct job_state_ctx *jsctx, struct job *job) return; } - st->processed = true; + st->processing = true; break; } else { @@ -502,8 +509,12 @@ static void process_next_state (struct job_state_ctx *jsctx, struct job *job) eventlog_inactive_complete (job); update_job_state_and_list (jsctx, job, st->state, st->timestamp); - zlist_remove (job->next_states, st); + st->finished = true; } + + next: + if (st->finished) + zlist_remove (job->next_states, st); } }