From 7e50d1e8c167d8f3fb0ae01e05826da78acdccf9 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Mon, 17 Jan 2022 18:59:54 +0100 Subject: [PATCH 01/10] Using faster Python script to amalgamate --- build/single_file_libs/combine.py | 156 ++++++++++++++++++ .../create_single_file_decoder.sh | 7 +- .../create_single_file_library.sh | 7 +- 3 files changed, 168 insertions(+), 2 deletions(-) create mode 100755 build/single_file_libs/combine.py diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py new file mode 100755 index 0000000000..994b36b1dd --- /dev/null +++ b/build/single_file_libs/combine.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +# Tool to bundle multiple C/C++ source files, inlining any includes. +# +# Author: Carl Woffenden, Numfum GmbH (this script is released under a CC0 license/Public Domain) + +import argparse, os, re, sys + +from pathlib import Path + +# File roots when searching (equivalent to -I paths for the compiler). +roots = set() + +# File Path objects previously inlined. +found = set() + +# Destination file object (or stdout if no output file was supplied). +destn = None + +# Regex to handle the following type of file includes: +# +# #include "file" +# #include "file" +# # include "file" +# #include "file" +# #include "file" // comment +# #include "file" // comment with quote " +# +# And all combinations of, as well as ignoring the following: +# +# #include +# //#include "file" +# /*#include "file"*/ +# +# We don't try to catch errors since the compiler will do this (and the code is +# expected to be valid before processing) and we don't care what follows the +# file (whether it's a valid comment or not, since anything after the quoted +# string is ignored) +# +include_regex = re.compile(r'^\s*#\s*include\s*"(.+?)"') + +# Simple tests to prove include_regex's cases. 
+# +def test_match_include(): + if (include_regex.match('#include "file"') and + include_regex.match(' #include "file"') and + include_regex.match('# include "file"') and + include_regex.match('#include "file"') and + include_regex.match('#include "file" // comment')): + if (not include_regex.match('#include ') and + not include_regex.match('//#include "file"') and + not include_regex.match('/*#include "file"*/')): + found = include_regex.match('#include "file" // "') + if (found and found.group(1) == 'file'): + print('#include match valid') + return True + return False + +# Regex to handle "#pragma once" in various formats: +# +# #pragma once +# #pragma once +# # pragma once +# #pragma once +# #pragma once // comment +# +# Ignoring commented versions, same as include_regex. +# +pragma_regex = re.compile(r'^\s*#\s*pragma\s*once\s*') + +# Simple tests to prove pragma_regex's cases. +# +def text_match_pragma(): + if (pragma_regex.match('#pragma once') and + pragma_regex.match(' #pragma once') and + pragma_regex.match('# pragma once') and + pragma_regex.match('#pragma once') and + pragma_regex.match('#pragma once // comment')): + if (not pragma_regex.match('//#pragma once') and + not pragma_regex.match('/*#pragma once*/')): + print('#pragma once match valid') + return True + return False + +# Finds 'file'. First the currently processing file's 'parent' path is looked at +# for a match, followed by the list of 'root', returning a valid Path in +# canonical form. If no match is found None is returned. +# +def resolve_include(parent: Path, file: str): + found = parent.joinpath(file).resolve(); + if (found.is_file()): + return found + for root in roots: + found = root.joinpath(file).resolve() + if (found.is_file()): + return found + return None + +# Writes 'line' to the open file 'destn' (or stdout). +# +def write_line(line): + print(line, file=destn) + +# Logs 'line' to stderr. 
+# +def log_line(line): + print(line, file=sys.stderr) + +def add_file(file): + if (isinstance(file, Path) and file.is_file()): + log_line(f'Processing: {file}') + with file.open('r') as opened: + for line in opened: + line = line.rstrip('\n') + match_include = include_regex.match(line); + if (match_include): + inc_name = match_include.group(1) + resolved = resolve_include(file.parent, inc_name) + if (resolved not in found): + # The file was not previously encountered + found.add(resolved) + write_line(f'/**** start inlining {inc_name} ****/') + add_file(resolved) + write_line(f'/**** ended inlining {inc_name} ****/') + else: + write_line(f'/**** skipping file: {inc_name} ****/') + else: + if (not pragma_regex.match(line)): + write_line(line) + else: + log_line(f'Error: Unable to find: {file}') + + +parser = argparse.ArgumentParser(description='Amalgamate Tool', epilog=f'example: {sys.argv[0]} -r ../my/path -r ../other/path -o out.c in.c') +parser.add_argument('-r', '--root', action='append', type=Path, help='file root search path') +parser.add_argument('-x', '--exclude', action='append', help='file to completely exclude from inlining') +parser.add_argument('-k', '--keep', action='append', help='file to exclude from inlining but keep the include directive') +parser.add_argument('-p', '--pragma', action='store_true', default=False, help='keep any "#pragma once" directives (removed by default)') +parser.add_argument('-o', '--output', type=argparse.FileType('w'), help='output file (otherwise stdout)') +parser.add_argument('input', type=Path, help='input file') +args = parser.parse_args() + +# Resolve all of the root paths upfront (we'll halt here on invalid roots) +if (args.root is not None): + for path in args.root: + roots.add(path.resolve(strict=True)) + +try: + if (args.output is None): + destn = sys.stdout + else: + destn = args.output + add_file(args.input) +finally: + if (destn is not None): + destn.close() diff --git 
a/build/single_file_libs/create_single_file_decoder.sh b/build/single_file_libs/create_single_file_decoder.sh index b5f5613ae2..1c8841d187 100755 --- a/build/single_file_libs/create_single_file_decoder.sh +++ b/build/single_file_libs/create_single_file_decoder.sh @@ -5,7 +5,12 @@ ZSTD_SRC_ROOT="../../lib" # Amalgamate the sources echo "Amalgamating files... this can take a while" -./combine.sh -r "$ZSTD_SRC_ROOT" -o zstddeclib.c zstddeclib-in.c +# Using the faster Python script if we have 3.8 or higher +if python3 -c 'import sys; assert sys.version_info >= (3,8)' 2>/dev/null; then + ./combine.py -r "$ZSTD_SRC_ROOT" -o zstddeclib.c zstddeclib-in.c +else + ./combine.sh -r "$ZSTD_SRC_ROOT" -o zstddeclib.c zstddeclib-in.c +fi # Did combining work? if [ $? -ne 0 ]; then echo "Combine script: FAILED" diff --git a/build/single_file_libs/create_single_file_library.sh b/build/single_file_libs/create_single_file_library.sh index 6f38526d5b..9b2f22a9d9 100755 --- a/build/single_file_libs/create_single_file_library.sh +++ b/build/single_file_libs/create_single_file_library.sh @@ -5,7 +5,12 @@ ZSTD_SRC_ROOT="../../lib" # Amalgamate the sources echo "Amalgamating files... this can take a while" -./combine.sh -r "$ZSTD_SRC_ROOT" -o zstd.c zstd-in.c +# Using the faster Python script if we have 3.8 or higher +if python3 -c 'import sys; assert sys.version_info >= (3,8)' 2>/dev/null; then + ./combine.py -r "$ZSTD_SRC_ROOT" -o zstd.c zstd-in.c +else + ./combine.sh -r "$ZSTD_SRC_ROOT" -o zstd.c zstd-in.c +fi # Did combining work? if [ $? 
-ne 0 ]; then echo "Combine script: FAILED" From 829ac2e9cedb0b360cfee9d7df88ed44e677ef20 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Tue, 18 Jan 2022 11:43:01 +0100 Subject: [PATCH 02/10] Work-in-progress; annotated types, added docs, parsed and resolved excluded files --- build/single_file_libs/combine.py | 66 +++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 994b36b1dd..4dd3a07205 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -4,20 +4,27 @@ # # Author: Carl Woffenden, Numfum GmbH (this script is released under a CC0 license/Public Domain) -import argparse, os, re, sys +import argparse, io, os, re, sys from pathlib import Path -# File roots when searching (equivalent to -I paths for the compiler). +# Set of file roots when searching (equivalent to -I paths for the compiler). roots = set() -# File Path objects previously inlined. +# Set of (canonical) file Path objects to exclude from inlining (and not only +# exclude but to add a compiler error directive when they're encountered). +excludes = set() + +# Set of (canonical) file Path objects to keep as include directives. +keeps = set() + +# Set of file Path objects previously inlined. found = set() -# Destination file object (or stdout if no output file was supplied). +# Destination file TextIOBase object (or stdout if no output file was supplied). destn = None -# Regex to handle the following type of file includes: +# Compiled regex Patern to handle the following type of file includes: # # #include "file" # #include "file" @@ -41,7 +48,7 @@ # Simple tests to prove include_regex's cases. 
# -def test_match_include(): +def test_match_include() -> bool: if (include_regex.match('#include "file"') and include_regex.match(' #include "file"') and include_regex.match('# include "file"') and @@ -56,7 +63,7 @@ def test_match_include(): return True return False -# Regex to handle "#pragma once" in various formats: +# Compiled regex Patern to handle "#pragma once" in various formats: # # #pragma once # #pragma once @@ -70,7 +77,7 @@ def test_match_include(): # Simple tests to prove pragma_regex's cases. # -def text_match_pragma(): +def text_match_pragma() -> bool: if (pragma_regex.match('#pragma once') and pragma_regex.match(' #pragma once') and pragma_regex.match('# pragma once') and @@ -83,10 +90,10 @@ def text_match_pragma(): return False # Finds 'file'. First the currently processing file's 'parent' path is looked at -# for a match, followed by the list of 'root', returning a valid Path in +# for a match, followed by the list of 'root' paths, returning a valid Path in # canonical form. If no match is found None is returned. # -def resolve_include(parent: Path, file: str): +def resolve_include(parent: Path, file: str) -> Path: found = parent.joinpath(file).resolve(); if (found.is_file()): return found @@ -98,15 +105,17 @@ def resolve_include(parent: Path, file: str): # Writes 'line' to the open file 'destn' (or stdout). # -def write_line(line): +def write_line(line: str) -> None: print(line, file=destn) # Logs 'line' to stderr. # -def log_line(line): +def log_line(line: str) -> None: print(line, file=sys.stderr) -def add_file(file): +# Adds the contents of 'file' with any of its includes inlined. 
+# +def add_file(file: Path) -> None: if (isinstance(file, Path) and file.is_file()): log_line(f'Processing: {file}') with file.open('r') as opened: @@ -129,8 +138,8 @@ def add_file(file): write_line(line) else: log_line(f'Error: Unable to find: {file}') - +# Start here parser = argparse.ArgumentParser(description='Amalgamate Tool', epilog=f'example: {sys.argv[0]} -r ../my/path -r ../other/path -o out.c in.c') parser.add_argument('-r', '--root', action='append', type=Path, help='file root search path') parser.add_argument('-x', '--exclude', action='append', help='file to completely exclude from inlining') @@ -140,17 +149,40 @@ def add_file(file): parser.add_argument('input', type=Path, help='input file') args = parser.parse_args() +# Fail early on an invalid input (and store it so we don't recurse) +args.input = args.input.resolve(strict=True) +found.add(args.input) + # Resolve all of the root paths upfront (we'll halt here on invalid roots) -if (args.root is not None): +if (args.root): for path in args.root: roots.add(path.resolve(strict=True)) +# Resolve the excluded files +if (args.exclude): + for filename in args.exclude: + resolved = resolve_include(args.input.parent, filename) + if (resolved): + excludes.add(resolved) + else: + log_line(f'Warning: excluded file not found: {filename}') + +# And the files to keep +if (args.keep): + for filename in args.keep: + resolved = resolve_include(args.input.parent, filename) + if (resolved): + keeps.add(resolved) + else: + log_line(f'Warning: kept #include not found: {filename}') + +# Then recursively process the input file try: - if (args.output is None): + if (not args.output): destn = sys.stdout else: destn = args.output add_file(args.input) finally: - if (destn is not None): + if (not destn): destn.close() From 8f1e51f99fa6ca6d132e53e58265b4d79d2fbb2b Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Tue, 18 Jan 2022 19:07:18 +0100 Subject: [PATCH 03/10] Feature parity with original shell script; needs further 
testing --- build/single_file_libs/combine.py | 123 ++++++++++++++++++------------ 1 file changed, 75 insertions(+), 48 deletions(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 4dd3a07205..00a6f11f7e 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -4,25 +4,30 @@ # # Author: Carl Woffenden, Numfum GmbH (this script is released under a CC0 license/Public Domain) -import argparse, io, os, re, sys +import argparse, re, sys from pathlib import Path +from typing import Any, List, Optional, Pattern, Set, TextIO # Set of file roots when searching (equivalent to -I paths for the compiler). -roots = set() +roots: Set[Path] = set() # Set of (canonical) file Path objects to exclude from inlining (and not only # exclude but to add a compiler error directive when they're encountered). -excludes = set() +excludes: Set[Path] = set() # Set of (canonical) file Path objects to keep as include directives. -keeps = set() +keeps: Set[Path] = set() -# Set of file Path objects previously inlined. -found = set() +# Whether to keep the #pragma once directives (unlikely, since this will result +# in a warning, but the option is there). +keep_pragma: bool = False -# Destination file TextIOBase object (or stdout if no output file was supplied). -destn = None +# Destination file object (or stdout if no output file was supplied). +destn:TextIO = sys.stdout + +# Set of file Path objects previously inlined (and to ignore if reencountering). +found: Set[Path] = set() # Compiled regex Patern to handle the following type of file includes: # @@ -44,7 +49,7 @@ # file (whether it's a valid comment or not, since anything after the quoted # string is ignored) # -include_regex = re.compile(r'^\s*#\s*include\s*"(.+?)"') +include_regex: Pattern = re.compile(r'^\s*#\s*include\s*"(.+?)"') # Simple tests to prove include_regex's cases. 
# @@ -73,7 +78,7 @@ def test_match_include() -> bool: # # Ignoring commented versions, same as include_regex. # -pragma_regex = re.compile(r'^\s*#\s*pragma\s*once\s*') +pragma_regex: Pattern = re.compile(r'^\s*#\s*pragma\s*once\s*') # Simple tests to prove pragma_regex's cases. # @@ -93,8 +98,11 @@ def text_match_pragma() -> bool: # for a match, followed by the list of 'root' paths, returning a valid Path in # canonical form. If no match is found None is returned. # -def resolve_include(parent: Path, file: str) -> Path: - found = parent.joinpath(file).resolve(); +def resolve_include(file: str, parent: Optional[Path] = None) -> Optional[Path]: + if (parent): + found = parent.joinpath(file).resolve(); + else: + found = Path(file) if (found.is_file()): return found for root in roots: @@ -103,41 +111,75 @@ def resolve_include(parent: Path, file: str) -> Path: return found return None -# Writes 'line' to the open file 'destn' (or stdout). +# Helper to resolve lists of files. 'file_list' is passed in from the arguments +# and each entry resolved to its canonical path (like any include entry, either +# from the list of root paths or the owning file's 'parent', which in this case +# is case is the input file). The results are stored in 'resolved'. +# +def resolve_files(file_list: Optional[List[str]], resolved: Set[Path], parent: Optional[Path] = None) -> None: + if (file_list): + for filename in file_list: + found = resolve_include(filename, parent) + if (found): + resolved.add(found) + else: + error_line(f'Warning: excluded file not found: {filename}') + +# Writes 'line' to the open 'destn' (or stdout). # def write_line(line: str) -> None: print(line, file=destn) -# Logs 'line' to stderr. +# Logs 'line' to stderr. This is also used for general notifications that we +# don't want to go to stdout (so the source can be piped). 
# -def log_line(line: str) -> None: +def error_line(line: Any) -> None: print(line, file=sys.stderr) -# Adds the contents of 'file' with any of its includes inlined. +# Inline the contents of 'file' (with any of its includes also inlined, etc.). # def add_file(file: Path) -> None: - if (isinstance(file, Path) and file.is_file()): - log_line(f'Processing: {file}') + if (file.is_file()): + error_line(f'Processing: {file}') with file.open('r') as opened: for line in opened: line = line.rstrip('\n') match_include = include_regex.match(line); if (match_include): + # We have a quoted include directive so grab the file inc_name = match_include.group(1) - resolved = resolve_include(file.parent, inc_name) - if (resolved not in found): - # The file was not previously encountered - found.add(resolved) - write_line(f'/**** start inlining {inc_name} ****/') - add_file(resolved) - write_line(f'/**** ended inlining {inc_name} ****/') + resolved = resolve_include(inc_name, file.parent) + if (resolved): + if (resolved in excludes): + # The file was excluded so error if the source attempts to use it + write_line(f'#error Using excluded file: {inc_name}') + error_line(f'Excluding: {inc_name}') + else: + if (resolved not in found): + # The file was not previously encountered + found.add(resolved) + if (resolved in keeps): + # But the include was flagged to keep as included + write_line(f'/**** *NOT* inlining {inc_name} ****/') + write_line(line) + error_line('Not Inlining: {inc_name}') + else: + # The file was neither excluded nor seen before so inline it + write_line(f'/**** start inlining {inc_name} ****/') + add_file(resolved) + write_line(f'/**** ended inlining {inc_name} ****/') + else: + write_line(f'/**** skipping file: {inc_name} ****/') else: - write_line(f'/**** skipping file: {inc_name} ****/') + # The include file didn't resolve to a file + write_line(f'#error Unable to find: {inc_name}') + error_line(f'Error: Unable to find: {inc_name}') else: - if (not 
pragma_regex.match(line)): + # Skip any 'pragma once' directives, otherwise write the source line + if (keep_pragma or not pragma_regex.match(line)): write_line(line) else: - log_line(f'Error: Unable to find: {file}') + error_line(f'Error: Invalid file: {file}') # Start here parser = argparse.ArgumentParser(description='Amalgamate Tool', epilog=f'example: {sys.argv[0]} -r ../my/path -r ../other/path -o out.c in.c') @@ -158,29 +200,14 @@ def add_file(file: Path) -> None: for path in args.root: roots.add(path.resolve(strict=True)) -# Resolve the excluded files -if (args.exclude): - for filename in args.exclude: - resolved = resolve_include(args.input.parent, filename) - if (resolved): - excludes.add(resolved) - else: - log_line(f'Warning: excluded file not found: {filename}') - -# And the files to keep -if (args.keep): - for filename in args.keep: - resolved = resolve_include(args.input.parent, filename) - if (resolved): - keeps.add(resolved) - else: - log_line(f'Warning: kept #include not found: {filename}') +# The remaining params: so resolve the excluded files and #pragma once directive +resolve_files(args.exclude, excludes, args.input.parent) +resolve_files(args.keep, keeps, args.input.parent) +keep_pragma = args.pragma; # Then recursively process the input file try: - if (not args.output): - destn = sys.stdout - else: + if (args.output): destn = args.output add_file(args.input) finally: From 7d90f0b520fda6cd9def6cd248c54b075717e948 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Wed, 19 Jan 2022 11:32:53 +0100 Subject: [PATCH 04/10] Test and tidy Made the Python more Python-like. Added notes and general tidy. Tested exclusions and building with various options. Tested all scripts. 
--- build/single_file_libs/build_library_test.sh | 2 +- build/single_file_libs/combine.py | 232 ++++++++++-------- build/single_file_libs/combine.sh | 1 + .../create_single_file_decoder.sh | 6 +- .../create_single_file_library.sh | 6 +- build/single_file_libs/zstd-in.c | 2 +- build/single_file_libs/zstddeclib-in.c | 2 +- 7 files changed, 133 insertions(+), 118 deletions(-) diff --git a/build/single_file_libs/build_library_test.sh b/build/single_file_libs/build_library_test.sh index 7fb99656bc..31545fc3fe 100755 --- a/build/single_file_libs/build_library_test.sh +++ b/build/single_file_libs/build_library_test.sh @@ -69,7 +69,7 @@ fi echo "Single file library creation script: PASSED" # Copy the header to here (for the tests) -cp "$ZSTD_SRC_ROOT/zstd.h" zstd.h +cp "$ZSTD_SRC_ROOT/zstd.h" examples/zstd.h # Compile the generated output cc -Wall -Wextra -Werror -Wshadow -pthread -I. -Os -g0 -o $OUT_FILE zstd.c examples/roundtrip.c diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 00a6f11f7e..0538ccb699 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -2,6 +2,18 @@ # Tool to bundle multiple C/C++ source files, inlining any includes. # +# Note: there are two types of exclusion options: the '-x' flag, which besides +# excluding a file also adds an #error directive in place of the #include, and +# the '-k' flag, which keeps the #include and doesn't inline the file. The +# intended use cases are: '-x' for files that would normally be #if'd out, so +# features that 100% won't be used in the amalgamated file, for which every +# occurrence adds the error, and '-k' for headers that we wish to manually +# include, such as a project's public API, for which occurrences after the first +# are removed. +# +# Todo: the error handling could be better, which currently throws and halts +# (which is functional just not very friendly). 
+# # Author: Carl Woffenden, Numfum GmbH (this script is released under a CC0 license/Public Domain) import argparse, re, sys @@ -31,18 +43,18 @@ # Compiled regex Patern to handle the following type of file includes: # -# #include "file" -# #include "file" -# # include "file" -# #include "file" -# #include "file" // comment -# #include "file" // comment with quote " +# #include "file" +# #include "file" +# # include "file" +# #include "file" +# #include "file" // comment +# #include "file" // comment with quote " # # And all combinations of, as well as ignoring the following: # -# #include -# //#include "file" -# /*#include "file"*/ +# #include +# //#include "file" +# /*#include "file"*/ # # We don't try to catch errors since the compiler will do this (and the code is # expected to be valid before processing) and we don't care what follows the @@ -54,27 +66,27 @@ # Simple tests to prove include_regex's cases. # def test_match_include() -> bool: - if (include_regex.match('#include "file"') and - include_regex.match(' #include "file"') and - include_regex.match('# include "file"') and - include_regex.match('#include "file"') and - include_regex.match('#include "file" // comment')): - if (not include_regex.match('#include ') and - not include_regex.match('//#include "file"') and - not include_regex.match('/*#include "file"*/')): - found = include_regex.match('#include "file" // "') - if (found and found.group(1) == 'file'): - print('#include match valid') - return True - return False + if (include_regex.match('#include "file"') and + include_regex.match(' #include "file"') and + include_regex.match('# include "file"') and + include_regex.match('#include "file"') and + include_regex.match('#include "file" // comment')): + if (not include_regex.match('#include ') and + not include_regex.match('//#include "file"') and + not include_regex.match('/*#include "file"*/')): + found = include_regex.match('#include "file" // "') + if (found and found.group(1) == 'file'): + 
print('#include match valid') + return True + return False # Compiled regex Patern to handle "#pragma once" in various formats: # -# #pragma once -# #pragma once -# # pragma once -# #pragma once -# #pragma once // comment +# #pragma once +# #pragma once +# # pragma once +# #pragma once +# #pragma once // comment # # Ignoring commented versions, same as include_regex. # @@ -83,103 +95,105 @@ def test_match_include() -> bool: # Simple tests to prove pragma_regex's cases. # def text_match_pragma() -> bool: - if (pragma_regex.match('#pragma once') and - pragma_regex.match(' #pragma once') and - pragma_regex.match('# pragma once') and - pragma_regex.match('#pragma once') and - pragma_regex.match('#pragma once // comment')): - if (not pragma_regex.match('//#pragma once') and - not pragma_regex.match('/*#pragma once*/')): - print('#pragma once match valid') - return True - return False + if (pragma_regex.match('#pragma once') and + pragma_regex.match(' #pragma once') and + pragma_regex.match('# pragma once') and + pragma_regex.match('#pragma once') and + pragma_regex.match('#pragma once // comment')): + if (not pragma_regex.match('//#pragma once') and + not pragma_regex.match('/*#pragma once*/')): + print('#pragma once match valid') + return True + return False # Finds 'file'. First the currently processing file's 'parent' path is looked at # for a match, followed by the list of 'root' paths, returning a valid Path in # canonical form. If no match is found None is returned. 
# def resolve_include(file: str, parent: Optional[Path] = None) -> Optional[Path]: - if (parent): - found = parent.joinpath(file).resolve(); - else: - found = Path(file) - if (found.is_file()): - return found - for root in roots: - found = root.joinpath(file).resolve() - if (found.is_file()): - return found - return None + if (parent): + found = parent.joinpath(file).resolve(); + else: + found = Path(file) + if (found.is_file()): + return found + for root in roots: + found = root.joinpath(file).resolve() + if (found.is_file()): + return found + return None # Helper to resolve lists of files. 'file_list' is passed in from the arguments # and each entry resolved to its canonical path (like any include entry, either # from the list of root paths or the owning file's 'parent', which in this case # is case is the input file). The results are stored in 'resolved'. # -def resolve_files(file_list: Optional[List[str]], resolved: Set[Path], parent: Optional[Path] = None) -> None: - if (file_list): - for filename in file_list: - found = resolve_include(filename, parent) - if (found): - resolved.add(found) - else: - error_line(f'Warning: excluded file not found: {filename}') +def resolve_excluded_files(file_list: Optional[List[str]], resolved: Set[Path], parent: Optional[Path] = None) -> None: + if (file_list): + for filename in file_list: + found = resolve_include(filename, parent) + if (found): + resolved.add(found) + else: + error_line(f'Warning: excluded file not found: {filename}') # Writes 'line' to the open 'destn' (or stdout). # def write_line(line: str) -> None: - print(line, file=destn) + print(line, file=destn) # Logs 'line' to stderr. This is also used for general notifications that we # don't want to go to stdout (so the source can be piped). # def error_line(line: Any) -> None: - print(line, file=sys.stderr) + print(line, file=sys.stderr) # Inline the contents of 'file' (with any of its includes also inlined, etc.). 
# -def add_file(file: Path) -> None: - if (file.is_file()): - error_line(f'Processing: {file}') - with file.open('r') as opened: - for line in opened: - line = line.rstrip('\n') - match_include = include_regex.match(line); - if (match_include): - # We have a quoted include directive so grab the file - inc_name = match_include.group(1) - resolved = resolve_include(inc_name, file.parent) - if (resolved): - if (resolved in excludes): - # The file was excluded so error if the source attempts to use it - write_line(f'#error Using excluded file: {inc_name}') - error_line(f'Excluding: {inc_name}') - else: - if (resolved not in found): - # The file was not previously encountered - found.add(resolved) - if (resolved in keeps): - # But the include was flagged to keep as included - write_line(f'/**** *NOT* inlining {inc_name} ****/') - write_line(line) - error_line('Not Inlining: {inc_name}') - else: - # The file was neither excluded nor seen before so inline it - write_line(f'/**** start inlining {inc_name} ****/') - add_file(resolved) - write_line(f'/**** ended inlining {inc_name} ****/') - else: - write_line(f'/**** skipping file: {inc_name} ****/') - else: - # The include file didn't resolve to a file - write_line(f'#error Unable to find: {inc_name}') - error_line(f'Error: Unable to find: {inc_name}') - else: - # Skip any 'pragma once' directives, otherwise write the source line - if (keep_pragma or not pragma_regex.match(line)): - write_line(line) - else: - error_line(f'Error: Invalid file: {file}') +def add_file(file: Path, file_name: str = None) -> None: + if (file.is_file()): + if (not file_name): + file_name = file.name + error_line(f'Processing: {file_name}') + with file.open('r') as opened: + for line in opened: + line = line.rstrip('\n') + match_include = include_regex.match(line); + if (match_include): + # We have a quoted include directive so grab the file + inc_name = match_include.group(1) + resolved = resolve_include(inc_name, file.parent) + if (resolved): + 
if (resolved in excludes): + # The file was excluded so error if the compiler uses it + write_line(f'#error Using excluded file: {inc_name}') + error_line(f'Excluding: {inc_name}') + else: + if (resolved not in found): + # The file was not previously encountered + found.add(resolved) + if (resolved in keeps): + # But the include was flagged to keep as included + write_line(f'/**** *NOT* inlining {inc_name} ****/') + write_line(line) + error_line(f'Not inlining: {inc_name}') + else: + # The file was neither excluded nor seen before so inline it + write_line(f'/**** start inlining {inc_name} ****/') + add_file(resolved, inc_name) + write_line(f'/**** ended inlining {inc_name} ****/') + else: + write_line(f'/**** skipping file: {inc_name} ****/') + else: + # The include file didn't resolve to a file + write_line(f'#error Unable to find: {inc_name}') + error_line(f'Error: Unable to find: {inc_name}') + else: + # Skip any 'pragma once' directives, otherwise write the source line + if (keep_pragma or not pragma_regex.match(line)): + write_line(line) + else: + error_line(f'Error: Invalid file: {file}') # Start here parser = argparse.ArgumentParser(description='Amalgamate Tool', epilog=f'example: {sys.argv[0]} -r ../my/path -r ../other/path -o out.c in.c') @@ -197,19 +211,19 @@ def add_file(file: Path) -> None: # Resolve all of the root paths upfront (we'll halt here on invalid roots) if (args.root): - for path in args.root: - roots.add(path.resolve(strict=True)) + for path in args.root: + roots.add(path.resolve(strict=True)) # The remaining params: so resolve the excluded files and #pragma once directive -resolve_files(args.exclude, excludes, args.input.parent) -resolve_files(args.keep, keeps, args.input.parent) +resolve_excluded_files(args.exclude, excludes, args.input.parent) +resolve_excluded_files(args.keep, keeps, args.input.parent) keep_pragma = args.pragma; # Then recursively process the input file try: - if (args.output): - destn = args.output - 
add_file(args.input) + if (args.output): + destn = args.output + add_file(args.input) finally: - if (not destn): - destn.close() + if (not destn): + destn.close() diff --git a/build/single_file_libs/combine.sh b/build/single_file_libs/combine.sh index 8eac4f9eb1..674a5a275c 100755 --- a/build/single_file_libs/combine.sh +++ b/build/single_file_libs/combine.sh @@ -200,6 +200,7 @@ if [ -n "$1" ]; then printf "" > "$DESTN" fi test_deps + log_line "Processing using the slower shell script; this might take a while" add_file "$1" else echo "Input file not found: \"$1\"" diff --git a/build/single_file_libs/create_single_file_decoder.sh b/build/single_file_libs/create_single_file_decoder.sh index 1c8841d187..3c0c577df5 100755 --- a/build/single_file_libs/create_single_file_decoder.sh +++ b/build/single_file_libs/create_single_file_decoder.sh @@ -4,12 +4,12 @@ ZSTD_SRC_ROOT="../../lib" # Amalgamate the sources -echo "Amalgamating files... this can take a while" +echo "Amalgamating files..." # Using the faster Python script if we have 3.8 or higher if python3 -c 'import sys; assert sys.version_info >= (3,8)' 2>/dev/null; then - ./combine.py -r "$ZSTD_SRC_ROOT" -o zstddeclib.c zstddeclib-in.c + ./combine.py -r "$ZSTD_SRC_ROOT" -x legacy/zstd_legacy.h -o zstddeclib.c zstddeclib-in.c else - ./combine.sh -r "$ZSTD_SRC_ROOT" -o zstddeclib.c zstddeclib-in.c + ./combine.sh -r "$ZSTD_SRC_ROOT" -x legacy/zstd_legacy.h -o zstddeclib.c zstddeclib-in.c fi # Did combining work? if [ $? -ne 0 ]; then diff --git a/build/single_file_libs/create_single_file_library.sh b/build/single_file_libs/create_single_file_library.sh index 9b2f22a9d9..a6f71f0f08 100755 --- a/build/single_file_libs/create_single_file_library.sh +++ b/build/single_file_libs/create_single_file_library.sh @@ -4,12 +4,12 @@ ZSTD_SRC_ROOT="../../lib" # Amalgamate the sources -echo "Amalgamating files... this can take a while" +echo "Amalgamating files..." 
# Using the faster Python script if we have 3.8 or higher if python3 -c 'import sys; assert sys.version_info >= (3,8)' 2>/dev/null; then - ./combine.py -r "$ZSTD_SRC_ROOT" -o zstd.c zstd-in.c + ./combine.py -r "$ZSTD_SRC_ROOT" -x legacy/zstd_legacy.h -o zstd.c zstd-in.c else - ./combine.sh -r "$ZSTD_SRC_ROOT" -o zstd.c zstd-in.c + ./combine.sh -r "$ZSTD_SRC_ROOT" -x legacy/zstd_legacy.h -o zstd.c zstd-in.c fi # Did combining work? if [ $? -ne 0 ]; then diff --git a/build/single_file_libs/zstd-in.c b/build/single_file_libs/zstd-in.c index b694681396..c22e4cd971 100644 --- a/build/single_file_libs/zstd-in.c +++ b/build/single_file_libs/zstd-in.c @@ -4,7 +4,7 @@ * * Generate using: * \code - * combine.sh -r ../../lib -o zstd.c zstd-in.c + * combine.sh -r ../../lib -x legacy/zstd_legacy.h -o zstd.c zstd-in.c * \endcode */ /* diff --git a/build/single_file_libs/zstddeclib-in.c b/build/single_file_libs/zstddeclib-in.c index 72abe61343..42c510736b 100644 --- a/build/single_file_libs/zstddeclib-in.c +++ b/build/single_file_libs/zstddeclib-in.c @@ -4,7 +4,7 @@ * * Generate using: * \code - * combine.sh -r ../../lib -o zstddeclib.c zstddeclib-in.c + * combine.sh -r ../../lib -x legacy/zstd_legacy.h -o zstddeclib.c zstddeclib-in.c * \endcode */ /* From dd7d29a19c7ac00f02f524c0f8822c691b7ed64c Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Wed, 19 Jan 2022 15:57:33 +0100 Subject: [PATCH 05/10] Updated README --- build/single_file_libs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/single_file_libs/README.md b/build/single_file_libs/README.md index 1705b769b8..64c973a68d 100644 --- a/build/single_file_libs/README.md +++ b/build/single_file_libs/README.md @@ -12,7 +12,7 @@ This is the most common use case. 
The decompression library is small, adding, fo Create `zstddeclib.c` from the Zstd source using: ``` cd zstd/build/single_file_libs -./combine.sh -r ../../lib -o zstddeclib.c zstddeclib-in.c +python3 combine.py -r ../../lib -x legacy/zstd_legacy.h -o zstddeclib.c zstddeclib-in.c ``` Then add the resulting file to your project (see the [example files](examples)). @@ -26,7 +26,7 @@ The same tool can amalgamate the entire Zstd library for ease of adding both com Create `zstd.c` from the Zstd source using: ``` cd zstd/build/single_file_libs -./combine.sh -r ../../lib -o zstd.c zstd-in.c +python3 combine.py -r ../../lib -x legacy/zstd_legacy.h -k zstd.h -o zstd.c zstd-in.c ``` It's possible to create a compressor-only library but since the decompressor is so small in comparison this doesn't bring much of a gain (but for the curious, simply remove the files in the _decompress_ section at the end of `zstd-in.c`). From 5fd6ddaf8b633ea5999cd593b0671cb2bbe8f8d5 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Wed, 19 Jan 2022 16:56:03 +0100 Subject: [PATCH 06/10] Fixed bugs found in other projects When testing amalgamating other projects it was found: invalid Unicode errors were tripping Python's text IO, and the header search order appears to differ from the shell version. --- build/single_file_libs/combine.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 0538ccb699..3d1018d5b3 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -106,21 +106,21 @@ def text_match_pragma() -> bool: return True return False -# Finds 'file'. First the currently processing file's 'parent' path is looked at -# for a match, followed by the list of 'root' paths, returning a valid Path in +# Finds 'file'.
First the list of 'root' paths are searched, followed by the +# the currently processing file's 'parent' path, returning a valid Path in # canonical form. If no match is found None is returned. # def resolve_include(file: str, parent: Optional[Path] = None) -> Optional[Path]: + for root in roots: + found = root.joinpath(file).resolve() + if (found.is_file()): + return found if (parent): found = parent.joinpath(file).resolve(); else: found = Path(file) if (found.is_file()): return found - for root in roots: - found = root.joinpath(file).resolve() - if (found.is_file()): - return found return None # Helper to resolve lists of files. 'file_list' is passed in from the arguments @@ -150,12 +150,17 @@ def error_line(line: Any) -> None: # Inline the contents of 'file' (with any of its includes also inlined, etc.). # +# Note: text encoding errors are ignored and replaced with ? when reading the +# input files. This isn't ideal, but it's more than likely in the comments than +# code and a) the text editor has probably also failed to read the same content, +# and b) the compiler probably did too. 
+# def add_file(file: Path, file_name: str = None) -> None: if (file.is_file()): if (not file_name): file_name = file.name error_line(f'Processing: {file_name}') - with file.open('r') as opened: + with file.open('r', errors='replace') as opened: for line in opened: line = line.rstrip('\n') match_include = include_regex.match(line); From 566ebce347aef4f7ea48e4c87fa58934a5944bd8 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Wed, 19 Jan 2022 17:33:20 +0100 Subject: [PATCH 07/10] Python style change Co-authored-by: Alexandre Bury --- build/single_file_libs/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 3d1018d5b3..125e7d427a 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -36,7 +36,7 @@ keep_pragma: bool = False # Destination file object (or stdout if no output file was supplied). -destn:TextIO = sys.stdout +destn: TextIO = sys.stdout # Set of file Path objects previously inlined (and to ignore if reencountering). found: Set[Path] = set() From 786263ea85c9325fc03eca3a8d78bfe1c7a19547 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Wed, 19 Jan 2022 17:48:10 +0100 Subject: [PATCH 08/10] Suggestion from code review --- build/single_file_libs/combine.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 125e7d427a..55cf32c5ea 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -41,6 +41,18 @@ # Set of file Path objects previously inlined (and to ignore if reencountering). found: Set[Path] = set() +# Compiled regex Patern to handle "#pragma once" in various formats: +# +# #pragma once +# #pragma once +# # pragma once +# #pragma once +# #pragma once // comment +# +# Ignoring commented versions, same as include_regex. 
+# +pragma_regex: Pattern = re.compile(r'^\s*#\s*pragma\s*once\s*') + # Compiled regex Patern to handle the following type of file includes: # # #include "file" @@ -80,18 +92,6 @@ def test_match_include() -> bool: return True return False -# Compiled regex Patern to handle "#pragma once" in various formats: -# -# #pragma once -# #pragma once -# # pragma once -# #pragma once -# #pragma once // comment -# -# Ignoring commented versions, same as include_regex. -# -pragma_regex: Pattern = re.compile(r'^\s*#\s*pragma\s*once\s*') - # Simple tests to prove pragma_regex's cases. # def text_match_pragma() -> bool: From dc983e7d68aa2a78ceace5a73f3f29ba86b1383b Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Wed, 19 Jan 2022 18:05:35 +0100 Subject: [PATCH 09/10] Typo (and missing commit) --- build/single_file_libs/combine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 55cf32c5ea..6443edad3b 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -94,7 +94,7 @@ def test_match_include() -> bool: # Simple tests to prove pragma_regex's cases. 
# -def text_match_pragma() -> bool: +def test_match_pragma() -> bool: if (pragma_regex.match('#pragma once') and pragma_regex.match(' #pragma once') and pragma_regex.match('# pragma once') and @@ -230,5 +230,5 @@ def add_file(file: Path, file_name: str = None) -> None: destn = args.output add_file(args.input) finally: - if (not destn): + if (destn): destn.close() From 3f181b61927b0f517b11ddec2ca43490b6de693f Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Thu, 20 Jan 2022 14:50:31 +0100 Subject: [PATCH 10/10] More descriptive exclusion error; updated docs and copyright --- build/single_file_libs/combine.py | 2 +- build/single_file_libs/combine.sh | 2 +- build/single_file_libs/zstd-in.c | 8 ++++++-- build/single_file_libs/zstddeclib-in.c | 8 ++++++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/build/single_file_libs/combine.py b/build/single_file_libs/combine.py index 6443edad3b..badd68a91a 100755 --- a/build/single_file_libs/combine.py +++ b/build/single_file_libs/combine.py @@ -171,7 +171,7 @@ def add_file(file: Path, file_name: str = None) -> None: if (resolved): if (resolved in excludes): # The file was excluded so error if the compiler uses it - write_line(f'#error Using excluded file: {inc_name}') + write_line(f'#error Using excluded file: {inc_name} (re-amalgamate source to fix)') error_line(f'Excluding: {inc_name}') else: if (resolved not in found): diff --git a/build/single_file_libs/combine.sh b/build/single_file_libs/combine.sh index 674a5a275c..a4933bf21d 100755 --- a/build/single_file_libs/combine.sh +++ b/build/single_file_libs/combine.sh @@ -130,7 +130,7 @@ add_file() { local res_inc="$(resolve_include "$srcdir" "$inc")" if list_has_item "$XINCS" "$inc"; then # The file was excluded so error if the source attempts to use it - write_line "#error Using excluded file: $inc" + write_line "#error Using excluded file: $inc (re-amalgamate source to fix)" log_line "Excluding: $inc" else if ! 
list_has_item "$FOUND" "$res_inc"; then diff --git a/build/single_file_libs/zstd-in.c b/build/single_file_libs/zstd-in.c index c22e4cd971..eecd9a688e 100644 --- a/build/single_file_libs/zstd-in.c +++ b/build/single_file_libs/zstd-in.c @@ -4,11 +4,11 @@ * * Generate using: * \code - * combine.sh -r ../../lib -x legacy/zstd_legacy.h -o zstd.c zstd-in.c + * python combine.py -r ../../lib -x legacy/zstd_legacy.h -o zstd.c zstd-in.c * \endcode */ /* - * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -28,6 +28,10 @@ * Note: the undefs for xxHash allow Zstd's implementation to coincide with with * standalone xxHash usage (with global defines). * + * Note: if you enable ZSTD_LEGACY_SUPPORT the combine.py script will need + * re-running without the "-x legacy/zstd_legacy.h" option (it excludes the + * legacy support at the source level). + * * Note: multithreading is enabled for all platforms apart from Emscripten. */ #define DEBUGLEVEL 0 diff --git a/build/single_file_libs/zstddeclib-in.c b/build/single_file_libs/zstddeclib-in.c index 42c510736b..d0343c54a4 100644 --- a/build/single_file_libs/zstddeclib-in.c +++ b/build/single_file_libs/zstddeclib-in.c @@ -4,11 +4,11 @@ * * Generate using: * \code - * combine.sh -r ../../lib -x legacy/zstd_legacy.h -o zstddeclib.c zstddeclib-in.c + * python combine.py -r ../../lib -x legacy/zstd_legacy.h -o zstddeclib.c zstddeclib-in.c * \endcode */ /* - * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -27,6 +27,10 @@ * * Note: the undefs for xxHash allow Zstd's implementation to coincide with with * standalone xxHash usage (with global defines). 
+ * + * Note: if you enable ZSTD_LEGACY_SUPPORT the combine.py script will need + * re-running without the "-x legacy/zstd_legacy.h" option (it excludes the + * legacy support at the source level). */ #define DEBUGLEVEL 0 #define MEM_MODULE