Skip to content

Commit

Permalink
Draft for htcondor group submission.
Browse files Browse the repository at this point in the history
  • Loading branch information
riga committed Jun 5, 2024
1 parent 1848c57 commit 682c983
Show file tree
Hide file tree
Showing 10 changed files with 503 additions and 103 deletions.
15 changes: 12 additions & 3 deletions law.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -844,9 +844,17 @@
; values above are used. The only exception is "htcondor_job_file_dir_cleanup" whose default value
; is False.

; htcondor_job_grouping_submit
; Description: Whether to use job grouping (cluster submission in HTCondor nomenclature) or not. If
; not, the standard batched submission is used and settings such as "htcondor_chunk_size_submit" and
; "htcondor_merge_job_files" are considered.
; Type: boolean
; Default: True

; htcondor_chunk_size_submit
; Description: Number of jobs that can be submitted in parallel inside a single call to
; "law.htcondor.HTCondorJobManager.submit", i.e., in a single "condor_submit" command.
; "law.htcondor.HTCondorJobManager.submit", i.e., in a single "condor_submit" command. Ignored when
; job grouping is enabled in "htcondor_job_grouping_submit".
; Type: integer
; Default: 25

Expand All @@ -864,8 +872,9 @@

; htcondor_merge_job_files
; Description: A boolean flag that decides whether multiple job description files should be merged
; into a single file before submission. When "False", the "htcondor_chunk_size_submit" option is
; not considered either.
; into a single file before submission. Ignored when job grouping is enabled in
; "htcondor_job_grouping_submit". When "False", the "htcondor_chunk_size_submit" option is not
; considered either.
; Type: boolean
; Default: True

Expand Down
5 changes: 4 additions & 1 deletion law/contrib/cms/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ class CrabJobManager(BaseJobManager):

log_file_pattern = "https://cmsweb.cern.ch:8443/scheddmon/{scheduler_id}/{user}/{task_name}/job_out.{crab_num}.{attempt}.txt" # noqa

job_grouping = True
job_grouping_submit = True
job_grouping_query = True
job_grouping_cancel = True
job_grouping_cleanup = True

JobId = namedtuple("JobId", ["crab_num", "task_name", "proj_dir"])

Expand Down
2 changes: 1 addition & 1 deletion law/contrib/cms/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def create_job_file(self, submit_jobs):
c.custom_log_file = log_file

# task hook
c = task.crab_job_config(c, submit_jobs)
c = task.crab_job_config(c, list(submit_jobs.keys()), list(submit_jobs.values()))

# build the job file and get the sanitized config
job_file, c = self.job_file_factory(**c.__dict__)
Expand Down
1 change: 1 addition & 0 deletions law/contrib/htcondor/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
def config_defaults(default_config):
return {
"job": {
"htcondor_job_grouping_submit": True,
"htcondor_job_file_dir": None,
"htcondor_job_file_dir_mkdtemp": None,
"htcondor_job_file_dir_cleanup": False,
Expand Down
155 changes: 155 additions & 0 deletions law/contrib/htcondor/htcondor_wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env bash

# Wrapper script that is to be configured as htcondor's main executable file

htcondor_wrapper() {
    # Runs a single law job inside an htcondor job slot. Steps, in order:
    #   1. derive the law job number from the htcondor process number,
    #   2. look up the arguments configured for that job number,
    #   3. render all configured input files (replacing "{{key}}" placeholders) into the current
    #      directory,
    #   4. execute the job file with the selected arguments.
    #
    # Placeholders in double curly braces within this function are presumably substituted before
    # the script is executed (NOTE(review): confirm against the code creating the job files).
    #
    # Variables read:
    #   LAW_HTCONDOR_JOB_PROCESS - htcondor process number of this job (required)
    #   file_postfix, log_file   - not defined here, picked up from the calling scope and, when
    #                              non-empty, preferred over the values in the render variables
    # Variables written:
    #   LAW_HTCONDOR_JOB_NUMBER  - exported, process number incremented by one
    # Returns:
    #   1 - job number could not be determined
    #   3 - no arguments found for the job number
    #   4 - empty render variables
    #   5 - no input files to render
    #   6 - rendering of an input file failed
    #   7 - job file does not exist
    #   otherwise the exit code of the job file itself

    # helper to select the correct python executable: "python" when available, "python3" otherwise;
    # an explicit if/else is used so that a failing "python" call is not re-run with "python3",
    # which would repeat side effects and could hide the original exit code
    _law_python() {
        if command -v python &> /dev/null; then
            python "$@"
        else
            python3 "$@"
        fi
    }

    #
    # detect variables
    #

    local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )"
    local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )"
    local this_file_base="$( basename "${this_file}" )"

    # get the job number
    export LAW_HTCONDOR_JOB_NUMBER="${LAW_HTCONDOR_JOB_PROCESS}"
    if [ -z "${LAW_HTCONDOR_JOB_NUMBER}" ]; then
        >&2 echo "could not determine htcondor job number"
        return "1"
    fi
    # htcondor process numbers start at 0, law job numbers at 1, so increment
    # (assignment form on purpose: "((n++))" yields a non-zero status when n is 0)
    LAW_HTCONDOR_JOB_NUMBER="$(( LAW_HTCONDOR_JOB_NUMBER + 1 ))"
    echo "running ${this_file_base} for job number ${LAW_HTCONDOR_JOB_NUMBER}"


    #
    # job argument definitions, depending on LAW_HTCONDOR_JOB_NUMBER
    #

    # definition
    local htcondor_job_arguments_map
    declare -A htcondor_job_arguments_map
    htcondor_job_arguments_map=(
        {{htcondor_job_arguments_map}}
    )

    # pick
    local htcondor_job_arguments="${htcondor_job_arguments_map[${LAW_HTCONDOR_JOB_NUMBER}]}"
    if [ -z "${htcondor_job_arguments}" ]; then
        >&2 echo "empty htcondor job arguments for LAW_HTCONDOR_JOB_NUMBER ${LAW_HTCONDOR_JOB_NUMBER}"
        return "3"
    fi


    #
    # variable rendering
    #

    # check variables
    local render_variables="{{render_variables}}"
    if [ -z "${render_variables}" ]; then
        >&2 echo "empty render variables"
        return "4"
    fi

    # decode, the result is inserted verbatim as a python expression in the render code below
    render_variables="$( echo "${render_variables}" | base64 --decode )"

    # check files to render
    local input_files_render=( {{input_files_render}} )
    if [ "${#input_files_render[@]}" = "0" ]; then
        >&2 echo "received empty input files for rendering for LAW_HTCONDOR_JOB_NUMBER ${LAW_HTCONDOR_JOB_NUMBER}"
        return "5"
    fi

    # render files, quoted expansion so that entries are neither word-split nor globbed
    local input_file_render
    local input_file_render_base
    local render_ret
    for input_file_render in "${input_files_render[@]}"; do
        # skip if the file refers to _this_ one
        input_file_render_base="$( basename "${input_file_render}" )"
        [ "${input_file_render_base}" = "${this_file_base}" ] && continue
        # render, the result is written to the current directory under the base name of the input
        # (the python lines must start at column 0 as they are joined into a single line of code)
        echo "render ${input_file_render}"
        _law_python -c "\
import re;\
repl = ${render_variables};\
repl['input_files_render'] = '';\
repl['file_postfix'] = '${file_postfix}' or repl.get('file_postfix', '');\
repl['log_file'] = '${log_file}' or repl.get('log_file', '');\
content = open('${input_file_render}', 'r').read();\
content = re.sub(r'\{\{(\w+)\}\}', lambda m: repl.get(m.group(1), ''), content);\
open('${input_file_render_base}', 'w').write(content);\
"
        render_ret="$?"
        # handle rendering errors
        if [ "${render_ret}" != "0" ]; then
            >&2 echo "input file rendering failed with code ${render_ret}"
            return "6"
        fi
    done


    #
    # run the actual job file
    #

    # check the job file
    local job_file="{{job_file}}"
    if [ ! -f "${job_file}" ]; then
        >&2 echo "job file '${job_file}' does not exist"
        return "7"
    fi

    # helper to print a banner
    banner() {
        local msg="$1"

        echo
        echo "================================================================================"
        echo "=== ${msg}"
        echo "================================================================================"
        echo
    }

    # debugging: print its contents
    # echo "=== content of job file '${job_file}'"
    # echo
    # cat "${job_file}"
    # echo
    # echo "=== end of job file content"

    # run it
    banner "Start of law job"

    # the arguments are deliberately left unquoted so that they are split into separate words
    local job_ret
    bash "${job_file}" ${htcondor_job_arguments}
    job_ret="$?"

    banner "End of law job"

    return "${job_ret}"
}

action() {
    # Entry point that invokes htcondor_wrapper and optionally records its output in a log file.
    #
    # Arguments:
    #   $1 - file_postfix
    #   $2 - log_file, optional; when given, a "---" separator line is appended to it, followed by
    #        the combined stdout and stderr of the wrapper
    # Returns:
    #   the status of the last command run: the wrapper itself, or the "wrapper | tee" pipeline
    #   with pipefail enabled
    #
    # Both locals must keep their names, htcondor_wrapper refers to "file_postfix" and "log_file"
    # without defining them itself.
    local file_postfix="$1"
    local log_file="$2"

    # without a log file there is nothing to capture, just run the wrapper
    if [ -z "${log_file}" ]; then
        htcondor_wrapper "$@"
        return "$?"
    fi

    # mark the start of a new run in the log
    echo "---" >> "${log_file}"

    if command -v tee &> /dev/null; then
        # show the output and append it to the log, pipefail makes the pipeline fail when the
        # wrapper does
        set -o pipefail
        htcondor_wrapper "$@" 2>&1 | tee -a "${log_file}"
    else
        # no tee available, the output only goes to the log
        htcondor_wrapper "$@" &>> "${log_file}"
    fi
}

# entry point: forward all script arguments (file_postfix, log_file) to action
action "$@"
Loading

0 comments on commit 682c983

Please sign in to comment.