Skip to content

Commit

Permalink
Avoid Perl dependency in test-setup.sh
Browse files Browse the repository at this point in the history
Fixes bazelbuild#4691.

PiperOrigin-RevId: 230308181
  • Loading branch information
ulfjack authored and weixiao-huang committed Jan 31, 2019
1 parent 7d34d9b commit 92fffc1
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 34 deletions.
10 changes: 10 additions & 0 deletions src/test/shell/bazel/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -800,3 +800,13 @@ sh_test(
],
tags = ["no_windows"],
)

# Shell test driven by generate_xml_test.sh. Its runtime data are the bash
# unit test library and the test XML generator target; the bash runfiles
# library is a dependency so the script can locate those files.
sh_test(
name = "generate_xml_test",
srcs = ["generate_xml_test.sh"],
data = [
"//src/test/shell:bashunit",
"//tools/test:test_xml_generator",
],
deps = ["@bazel_tools//tools/bash/runfiles"],
)
38 changes: 20 additions & 18 deletions src/test/shell/bazel/bazel_test_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,14 @@ function write_test_xml_timeout_files() {
mkdir -p dir

cat <<'EOF' > dir/test.sh
#!/bin/sh
#!/bin/bash
echo "xmltest"
echo -n "before "
# Invalid XML character
perl -e 'print "\x1b"'
# Invalid UTF-8 characters
perl -e 'print "\xc0\x00\xa0\xa1"'
echo " after"
# ]]> needs escaping
echo "<!CDATA[]]>"
sleep 10
Expand All @@ -337,19 +339,19 @@ EOF
function test_xml_is_present_when_timingout() {
write_test_xml_timeout_files
bazel test -s --test_timeout=1 --nocache_test_results \
--noexperimental_split_xml_generation \
//dir:test &> $TEST_log && fail "should have failed" || true

xml_log=bazel-testlogs/dir/test/test.xml
[[ -s "${xml_log}" ]] || fail "${xml_log} was not present after test"
cat "${xml_log}" > $TEST_log
expect_log '"Timed out"'
expect_log '<system-out><!\[CDATA\[xmltest'
# We should check that the invalid characters are correctly encoded, except
# they're not! At least as of perl v5.24.1, perl -CSDA, which we're using in
# test-setup.sh returns an error message and stops processing, so the output
# only contains 'bleh', but not the question marks.
# TODO(ulfjack): Reinstate this check when the new implementation has landed.
# expect_log '\?\?\?\?\?<!CDATA\[\]\]>\]\]<!\[CDATA\[>\]\]></system-out>'
expect_log '<system-out>'
# "xmltest" is the first line of output from the test.sh script.
expect_log '<!\[CDATA\[xmltest'
expect_log 'before ????? after'
expect_log '<!CDATA\[\]\]>\]\]<!\[CDATA\[>\]\]>'
expect_log '</system-out>'
}

function test_xml_is_present_when_timingout_split_xml() {
Expand All @@ -363,16 +365,16 @@ function test_xml_is_present_when_timingout_split_xml() {
cat "${xml_log}" > $TEST_log
# The new script does not convert exit codes to signals.
expect_log '"exited with error code 142"'
# The old code only inlines the output of the subprocess into the xml file,
# while the new code inlines the entire test log into the xml file, which
# includes a header generated by test-setup.sh; we check for the header here.
expect_log '<system-out><!\[CDATA\[exec ${PAGER:-/usr/bin/less}'
# We should check that the invalid characters are correctly encoded, except
# they're not! At least as of perl v5.24.1, perl -CSDA, which we're using in
# test-setup.sh returns an error message and stops processing, so the output
# only contains 'bleh', but not the question marks.
# TODO(ulfjack): Reinstate this check when the new implementation has landed.
# expect_log '\?\?\?\?\?<!CDATA\[\]\]>\]\]<!\[CDATA\[>\]\]></system-out>'
expect_log '<system-out>'
# When using --noexperimental_split_xml_generation, the output of the
# subprocesses goes into the xml file, while
# --experimental_split_xml_generation inlines the entire test log into
# the xml file, which includes a header generated by test-setup.sh;
# the header starts with "exec ${PAGER:-/usr/bin/less}".
expect_log '<!\[CDATA\[exec ${PAGER:-/usr/bin/less}'
expect_log 'before ????? after'
expect_log '<!CDATA\[\]\]>\]\]<!\[CDATA\[>\]\]>'
expect_log '</system-out>'
}

# Tests that the test.xml and test.log are correct and the test does not
Expand Down
83 changes: 83 additions & 0 deletions src/test/shell/bazel/generate_xml_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/bin/bash
#
# Copyright 2019 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tests the UTF-8 fixing script in test-setup.sh / generate-xml.sh.
#

# Bootstrap runfiles lookup. We can't use a central script for that because we'd
# need to be able to lookup runfiles in order to use it.

# --- begin runfiles.bash initialization ---
# Copy-pasted from Bazel's Bash runfiles library (tools/bash/runfiles/runfiles.bash).
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail
# If the environment provides neither a runfiles directory nor a runfiles
# manifest file, look for one next to this script ($0) and export it.
if [[ ! -d "${RUNFILES_DIR:-/dev/null}" && ! -f "${RUNFILES_MANIFEST_FILE:-/dev/null}" ]]; then
if [[ -f "$0.runfiles_manifest" ]]; then
export RUNFILES_MANIFEST_FILE="$0.runfiles_manifest"
elif [[ -f "$0.runfiles/MANIFEST" ]]; then
export RUNFILES_MANIFEST_FILE="$0.runfiles/MANIFEST"
elif [[ -f "$0.runfiles/bazel_tools/tools/bash/runfiles/runfiles.bash" ]]; then
export RUNFILES_DIR="$0.runfiles"
fi
fi
# Source runfiles.bash, preferring the copy inside the runfiles directory and
# otherwise resolving its real path through the manifest (first matching line,
# everything after the first space).
if [[ -f "${RUNFILES_DIR:-/dev/null}/bazel_tools/tools/bash/runfiles/runfiles.bash" ]]; then
source "${RUNFILES_DIR}/bazel_tools/tools/bash/runfiles/runfiles.bash"
elif [[ -f "${RUNFILES_MANIFEST_FILE:-/dev/null}" ]]; then
source "$(grep -m1 "^bazel_tools/tools/bash/runfiles/runfiles.bash " \
"$RUNFILES_MANIFEST_FILE" | cut -d ' ' -f 2-)"
else
# Neither lookup strategy worked: report on stderr and give up.
echo >&2 "ERROR: cannot find @bazel_tools//tools/bash/runfiles:runfiles.bash"
exit 1
fi
# --- end runfiles.bash initialization ---

# Load the unit test framework
# Source the unit test library located through the runfiles lookup. On failure,
# report on stderr and exit from the current shell: a "( ... exit 1)" subshell
# would only terminate the subshell and depend on `set -e` to stop the script.
source "$(rlocation io_bazel/src/test/shell/unittest.bash)" \
  || { echo >&2 "unittest.bash not found!"; exit 1; }
# Path of the script under test, used by the encode helper.
GENERATE_XML="$(rlocation io_bazel/tools/test/generate-xml.sh)"

# Runs the arguments through the stream encoder of generate-xml.sh.
#
# The arguments are expanded with `echo -e` (so "\xHH" escapes become raw
# bytes) and piped into the script, which is invoked with "-" for all four
# positional parameters to request its encode-only mode.
# Outputs: the encoded text on stdout.
encode() {
  local -r placeholder="-"
  echo -e "$@" \
    | "${GENERATE_XML}" \
      "${placeholder}" "${placeholder}" "${placeholder}" "${placeholder}"
}

# Plain printable ASCII is expected to come back from the encoder unchanged.
test_simple_ascii() {
  local -r text='Simple ascii'
  assert_equals "${text}" "$(encode "${text}")"
}

# Control bytes 0x01-0x08 and 0x0b are expected to become '?', while the tab
# (0x09) in between is expected to survive.
test_low_control_chars() {
  local -r input='\x1\x2\x3\x4\x5\x6\x7\x8\x9\xb'
  # printf turns the \t in the expectation into a literal tab.
  assert_equals "$(printf '????????\t?')" "$(encode "${input}")"
}

# Thirteen control bytes (0x0c, 0x0e, 0x0f and 0x10-0x19) are each expected to
# be replaced by a single '?'.
test_high_control_chars() {
  local -r input='\xc\xe\xf\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
  assert_equals '?????????????' "$(encode "${input}")"
}

# A two-byte sequence (lead byte 0xc0, continuation byte 0x80) is expected to
# pass through the encoder untouched.
# NOTE(review): 0xc0 0x80 is an overlong form in strict UTF-8; this test pins
# that the encoder lets it through - confirm that this is intended.
test_valid_two_byte_seq() {
  local -r input='\xc0\x80'
  assert_equals "$(printf '\xc0\x80')" "$(encode "${input}")"
}

# A three-byte sequence (0xea 0xa0 0xb0) is expected to pass through the
# encoder untouched.
test_valid_three_byte_seq() {
  local -r input='\xea\xa0\xb0'
  assert_equals "$(printf '\xea\xa0\xb0')" "$(encode "${input}")"
}

# A lead byte followed by another lead byte (0xc0 0xc0) is not a legal
# sequence; each byte is expected to be replaced by '?'.
test_invalid_two_byte_seq() {
  local -r input='\xc0\xc0'
  assert_equals '??' "$(encode "${input}")"
}

# Hand over to the test runner with the suite's display name.
# NOTE(review): run_suite is presumably provided by the sourced unittest.bash
# and runs the test_* functions defined above - confirm.
run_suite "generate-xml.sh tests"
93 changes: 82 additions & 11 deletions tools/test/generate-xml.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,83 @@ DURATION_IN_SECONDS="$3"
EXIT_CODE="$4"

# Keep this in sync with test-setup.sh!
function encode_stream {
  # Reads text from stdin and writes a copy to stdout that is safe to embed
  # in an XML CDATA section: every byte that is not part of a legal character
  # is replaced by '?', and ']]>' is split so it cannot end the CDATA block.
  #
  # We do this in four steps:
  #
  # 1. Add a single whitespace character to the end of every line.
  #
  # 2. Replace every sequence of legal characters followed by an illegal
  #    byte *or* followed by a legal character at the end of the line with
  #    the same sequence of legal characters followed by a question mark
  #    (replacing the illegal or last character). Since this will always
  #    replace the last character in a line with a question mark, we make
  #    sure to append a whitespace in step #1.
  #
  #    A character is legal if it is a UTF-8 encoded code point that is
  #    allowed in an XML file (this excludes a few control codes, but
  #    otherwise allows most code points).
  #
  #    We can't use sed in UTF-8 mode, because it would fail on the first
  #    illegal character. Instead, we match legal characters by their 8-bit
  #    byte sequences, and switch sed to an 8-bit mode (LC_ALL=C).
  #
  #    The legal code point ranges are 9,a,d,20-d7ff,e000-fffd,10000-10ffff,
  #    which results in the following byte matchers:
  #      [\x9\x20-\x7f]                               <--- (9,20-7F)
  #      [\xc0-\xdf][\x80-\xbf]                       <--- (0080-07FF)
  #      [\xe0-\xec][\x80-\xbf][\x80-\xbf]            <--- (0800-CFFF)
  #      [\xed][\x80-\x9f][\x80-\xbf]                 <--- (D000-D7FF)
  #      [\xee][\x80-\xbf][\x80-\xbf]                 <--- (E000-EFFF)
  #      [\xef][\x80-\xbe][\x80-\xbf]                 <--- (F000-FFBF)
  #      [\xef][\xbf][\x80-\xbd]                      <--- (FFC0-FFFD)
  #      [\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf]      <--- (010000-03FFFF)
  #      [\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf] <--- (040000-0FFFFF)
  #      [\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf]      <--- (100000-10FFFF)
  #
  #    \xa and \xd are omitted since sed already splits the input into lines.
  #
  #    NOTE(review): the two- and three-byte matchers also accept overlong
  #    forms (for example \xc0\x80); that leniency is kept as it was.
  #
  # 3. Remove the last character in the line, which we expect to be a
  #    question mark (that was originally added as a whitespace in step #1).
  #
  # 4. Replace the string ']]>' with ']]>]]<![CDATA[>' to prevent escaping
  #    the surrounding CDATA block.
  #
  # Sed supports the necessary operations as of version 4.4, but not in all
  # earlier versions. Specifically, we have found that sed 4.1.5 is not 8-bit
  # safe even when set to an 8-bit locale.
  #
  # OSX sed does not support escape sequences (\xhh), so the raw bytes are
  # produced with echo and spliced into the sed script.
  #
  # Alternatives considered:
  # Perl - We originally used Perl, but wanted to avoid the dependency.
  #        Recent versions of Perl now error on invalid utf-8 characters.
  # tr - tr only replaces single-byte sequences, so cannot handle utf-8.
  #
  # Keep this in sync with test-setup.sh!
  local legal_char
  legal_char="$(echo -e '[\x9\x20-\x7f]')"
  legal_char+="|$(echo -e '[\xc0-\xdf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xe0-\xec][\x80-\xbf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xed][\x80-\x9f][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xee][\x80-\xbf][\x80-\xbf]')"
  # U+FFFE and U+FFFF (ef bf be / ef bf bf) are not legal XML characters.
  legal_char+="|$(echo -e '[\xef][\x80-\xbe][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xef][\xbf][\x80-\xbd]')"
  # Four-byte sequences: f0 must be followed by 90-bf (80-8f would be an
  # overlong form), and f4 by 80-8f (anything above exceeds U+10FFFF).
  legal_char+="|$(echo -e '[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf]')"

  LC_ALL=C sed -E \
    -e 's/.*/& /g' \
    -e "s/((${legal_char})*)./\\1?/g" \
    -e 's/(.*)\?/\1/g' \
    -e 's|]]>|]]>]]<![CDATA[>|g'
}

function encode_as_xml {
if [[ -f "$1" ]]; then
# Replace invalid XML characters and invalid sequence in CDATA
# cf. https://stackoverflow.com/a/7774512/4717701
perl -CSDA -pe's/[^\x9\xA\xD\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/?/g;' "$1" \
| sed 's|]]>|]]>]]<![CDATA[>|g'
if [ -f "$1" ]; then
cat "$1" | encode_stream
fi
}

# Test hook: when the log argument is "-", act as a pure filter that encodes
# stdin to stdout and stop there. The remaining parameters are deliberately
# not looked at in this mode.
case "${TEST_LOG}" in
  -)
    encode_stream
    exit 0
    ;;
esac

test_name="${TEST_BINARY#./}"
errors=0
error_msg=""
Expand All @@ -46,14 +114,17 @@ if [[ -n "${TEST_TOTAL_SHARDS+x}" ]] && ((TEST_TOTAL_SHARDS != 0)); then
fi

FAILED=0
ENCODED_LOG="$(encode_as_xml "${TEST_LOG}")" || FAILED=1
cat <<EOF >${XML_OUTPUT_FILE}
ENCODED_LOG="$(encode_as_xml "${TEST_LOG}")" || FAILED=$?
cat >"${XML_OUTPUT_FILE}" <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<testsuites>
<testsuite name="${test_name}" tests="1" failures="0" errors="${errors}">
<testcase name="${test_name}" status="run" duration="${DURATION_IN_SECONDS}" time="${DURATION_IN_SECONDS}">${error_msg}</testcase>
<system-out><![CDATA[${ENCODED_LOG}]]></system-out>
</testsuite>
<testsuite name="${test_name}" tests="1" failures="0" errors="${errors}">
<testcase name="${test_name}" status="run" duration="${DURATION_IN_SECONDS}" time="${DURATION_IN_SECONDS}">${error_msg}</testcase>
<system-out>
Generated test.log (if the file is not UTF-8, then this may be unreadable):
<![CDATA[${ENCODED_LOG}]]>
</system-out>
</testsuite>
</testsuites>
EOF
exit "$FAILED"
25 changes: 20 additions & 5 deletions tools/test/test-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,25 @@ if [[ -z "$no_echo" ]]; then
fi

# Unused if EXPERIMENTAL_SPLIT_XML_GENERATION is set.
function encode_stream {
  # Reads text from stdin and writes a copy to stdout that is safe to embed
  # in an XML CDATA section. Keep in sync with encode_stream in
  # generate-xml.sh, which documents the approach in detail.
  #
  # Steps: append a space to every line; replace each byte that does not
  # belong to a legal character with '?' (this also turns the appended space
  # into '?'); drop that trailing '?'; split ']]>' so it cannot close the
  # surrounding CDATA block.
  #
  # Legal characters are matched bytewise (LC_ALL=C). The raw bytes are
  # produced with echo and spliced into the sed script.
  local legal_char
  legal_char="$(echo -e '[\x9\x20-\x7f]')"
  legal_char+="|$(echo -e '[\xc0-\xdf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xe0-\xec][\x80-\xbf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xed][\x80-\x9f][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xee][\x80-\xbf][\x80-\xbf]')"
  # U+FFFE and U+FFFF (ef bf be / ef bf bf) are not legal XML characters.
  legal_char+="|$(echo -e '[\xef][\x80-\xbe][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xef][\xbf][\x80-\xbd]')"
  # Four-byte sequences: f0 must be followed by 90-bf (80-8f would be an
  # overlong form), and f4 by 80-8f (anything above exceeds U+10FFFF).
  legal_char+="|$(echo -e '[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]')"
  legal_char+="|$(echo -e '[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf]')"

  LC_ALL=C sed -E \
    -e 's/.*/& /g' \
    -e "s/((${legal_char})*)./\\1?/g" \
    -e 's/(.*)\?/\1/g' \
    -e 's|]]>|]]>]]<![CDATA[>|g'
}

function encode_output_file {
if [ -f "$1" ]; then
# Replace invalid XML characters and invalid sequence in CDATA
# cf. https://stackoverflow.com/a/7774512/4717701
perl -CSDA -pe's/[^\x9\xA\xD\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/?/g;' "$1" \
| sed 's|]]>|]]>]]<![CDATA[>|g'
cat "$1" | encode_stream
fi
}

Expand Down Expand Up @@ -194,7 +207,9 @@ function write_xml_output_file {
<testsuites>
<testsuite name="$test_name" tests="1" failures="0" errors="${errors}">
<testcase name="$test_name" status="run" duration="${duration}" time="${duration}">${error_msg}</testcase>
<system-out><![CDATA[$(encode_output_file "${XML_OUTPUT_FILE}.log")]]></system-out>
<system-out>Generated test.log (if the file is not UTF-8, then this may be unreadable):
<![CDATA[$(encode_output_file "${XML_OUTPUT_FILE}.log")]]>
</system-out>
</testsuite>
</testsuites>
EOF
Expand Down

0 comments on commit 92fffc1

Please sign in to comment.