Skip to content

Commit

Permalink
Provide private implementations for str.maketrans and str.translate
Browse files Browse the repository at this point in the history
This pair of utilities from Python's core are helpful when encoding or escaping strings.
Unlike the common alternative — repeated application of `str.replace` —
a `str.translate` implementation performs its work in a single pass.

This isn't principally about efficiency — although a single-pass
implementation may be more efficient — but rather about correctness.
Doing all the translation in a single pass
sidesteps the issue of double-encoding errors which are possible under
repeated-processing schemes when substitution input/output aliasing
is present (i.e. some substitutions produce output that other
substitutions recognize as to-be-replaced input).
See chainguard-dev/rules_apko#30 for a concrete
example of a double-encoding issue resulting from a repeated-processing
translation implementation.
  • Loading branch information
plobsing committed Nov 22, 2024
1 parent 84e72b7 commit 032b7e7
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 1 deletion.
3 changes: 3 additions & 0 deletions lib/private/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,9 @@ bzl_library(
name = "strings",
srcs = ["strings.bzl"],
visibility = ["//lib:__subpackages__"],
deps = [
"@bazel_skylib//lib:types",
],
)

bzl_library(
Expand Down
103 changes: 103 additions & 0 deletions lib/private/strings.bzl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"String utilities"

load("@bazel_skylib//lib:types.bzl", "types")

CHAR_TO_INT = {
"\0": 0,
"\1": 1,
Expand Down Expand Up @@ -653,3 +655,104 @@ def split_args(s):
if arg != "":
args.append(arg)
return args

def maketrans(x):
    """Return a translation table usable with translate().

    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans)
    of the same name.

    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is
    currently not possible. Entries for characters outside this range will trigger a failure.

    Args:
        x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings)
            to Unicode ordinals, strings, or None. Character keys will be converted to ordinals.

    Returns:
        dict. The translation table, keyed by integer ordinal. Values are carried over
        unchanged from `x`.
    """

    if not types.is_dict(x):
        fail("if you give only one argument to maketrans it must be a dict")

    table = {}

    for (k, v) in x.items():
        if types.is_int(k):
            # A negative ordinal lies outside U+0000..U+00FF just as much as one above
            # 0xFF does; reject it up front instead of storing an entry that can never match.
            if k < 0:
                fail("integer keys in translate table must not be negative")
            if k > 0xFF:
                fail("most Unicode is unsupported")
            table[k] = v
        elif types.is_string(k):
            if len(k) != 1:
                fail("string keys in translate table must be of length 1")
            codepoint = ord(k)

            # A None result from ord() is treated as "character not supported".
            if codepoint == None:
                fail("could not compute ord('{}'), most Unicode is unsupported".format(k))
            table[codepoint] = v
        else:
            fail("keys in translate table must be strings or integers")

    return table

def translate(s, table):
    """Replace characters in a string according to a translation table.

    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate)
    of the same name.

    Each character whose ordinal has an entry in the table is replaced by that entry:
    an integer entry is emitted as `chr(entry)`, a string entry is emitted verbatim, and
    a None entry deletes the character. Characters without an entry are copied through
    unchanged.

    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is
    currently not possible. Characters outside this range will be silently mirrored to the
    output without consulting the translation table.

    Args:
        s: str. Input string upon which to perform replacements.
        table: dict. Translation table. Maps from Unicode ordinal (int) keys to other
            Unicode ordinals, strings, or None.

    Returns:
        str. Output string derived from the input string with the table's substitutions
        and deletions applied.
    """

    if not types.is_string(s):
        fail("first argument to translate must be a string")
    if not types.is_dict(table):
        fail("second argument to translate must be a dict")

    chunks = []

    # Start index of the pending run of untranslated characters; -1 means no run is open.
    # Untranslated text is copied with one slice per run rather than one append per character.
    run_start = -1

    for (pos, char) in enumerate(s.elems()):
        code = ord(char)

        if code == None or code not in table:
            # Pass-through character: open a run unless one is already in progress.
            if run_start < 0:
                run_start = pos
            continue

        # A translated character closes whatever pass-through run preceded it.
        if run_start >= 0:
            chunks.append(s[run_start:pos])
            run_start = -1

        mapped = table[code]
        if types.is_string(mapped):
            chunks.append(mapped)
        elif types.is_int(mapped):
            chunks.append(chr(mapped))
        elif mapped != None:
            fail("character mapping must return integer, None or str")

        # A None mapping appends nothing, which deletes the character.

    # Emit the trailing pass-through run, if the string ended inside one.
    if run_start >= 0:
        chunks.append(s[run_start:])

    # With a single chunk there is nothing to join.
    if len(chunks) == 1:
        return chunks[0]
    return "".join(chunks)
26 changes: 25 additions & 1 deletion lib/tests/strings_tests.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

load("@bazel_skylib//lib:partial.bzl", "partial")
load("@bazel_skylib//lib:unittest.bzl", "asserts", "unittest")
load("//lib/private:strings.bzl", "chr", "hex", "ord", "split_args")
load("//lib/private:strings.bzl", "chr", "hex", "maketrans", "ord", "split_args", "translate")

def _ord_test_impl(ctx):
env = unittest.begin(ctx)
Expand Down Expand Up @@ -83,11 +83,35 @@ def _split_args_test_impl(ctx):

split_args_test = unittest.make(_split_args_test_impl)

def _translate_test_impl(ctx):
    """Exercise translate() with a table that replaces "<" by ">" and deletes "!"."""
    env = unittest.begin(ctx)

    table = maketrans({"<": ">", "!": None})

    # (expected output, input) pairs. The translated character is placed at the start,
    # middle and end of the string, then repeated, then mixed with a deletion.
    cases = [
        ("...", "..."),
        (">..", "<.."),
        (".>.", ".<."),
        ("..>", "..<"),
        ("..", "!.."),
        ("..", ".!."),
        ("..", "..!"),
        (">>>", "<<<"),
        ("", "!!!"),
        (".>", ".<!"),
    ]
    for (want, given) in cases:
        asserts.equals(env, want, translate(given, table))

    return unittest.end(env)

translate_test = unittest.make(_translate_test_impl)

def strings_test_suite():
    """Register the string-utility unit tests as the `strings_tests` suite."""

    # Every test rule in the suite is wrapped with the same "short" timeout.
    test_rules = [
        ord_test,
        chr_test,
        hex_test,
        split_args_test,
        translate_test,
    ]
    unittest.suite(
        "strings_tests",
        *[partial.make(rule, timeout = "short") for rule in test_rules]
    )

0 comments on commit 032b7e7

Please sign in to comment.