diff --git a/.github/workflows/selftest.yaml b/.github/workflows/selftest.yaml index 140cf35..d373ef6 100644 --- a/.github/workflows/selftest.yaml +++ b/.github/workflows/selftest.yaml @@ -11,41 +11,77 @@ on: pull_request: jobs: - # Generate SBOM using cdxgen, but with NPMJS package, not Docker container - sbom-gen: + # Generate SBOM using syft + sbom-gen-syft: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - run: mkdir -p ~/.local/bin + - name: Install syft + run: curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b ~/.local/bin + - name: Install compliance-assistant + uses: ./.github/actions/poetrybuild + - name: Generate SBOM with syft + run: poetry run compliance-assistant sbom generate -v -g syft -d . -o ${{ runner.temp }}/sbom-syft.json + - name: Store raw SBOM as artifact + uses: actions/upload-artifact@v4 + with: + name: sbom-syft + path: ${{ runner.temp }}/sbom-syft.json + + # Generate SBOM using cdxgen (npm package) + sbom-gen-cdxgen: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - name: Install cdxgen run: npm install -g @cyclonedx/cdxgen - - name: Generate CycloneDX SBOM with cdxgen - run: cdxgen -r . -o ${{ runner.temp }}/sbom-raw.json + - name: Install compliance-assistant + uses: ./.github/actions/poetrybuild + - name: Generate SBOM with cdxgen + run: poetry run compliance-assistant sbom generate -v -g cdxgen -d . 
-o ${{ runner.temp }}/sbom-cdxgen.json - name: Store raw SBOM as artifact uses: actions/upload-artifact@v4 with: - name: sbom-raw - path: ${{ runner.temp }}/sbom-raw.json + name: sbom-cdxgen + path: ${{ runner.temp }}/sbom-cdxgen.json # Enrich the generated SBOM sbom-enrich: runs-on: ubuntu-22.04 - needs: sbom-gen + needs: [sbom-gen-syft, sbom-gen-cdxgen] steps: - uses: actions/checkout@v4 - uses: ./.github/actions/poetrybuild - # Download raw SBOM - - uses: actions/download-artifact@v4 + # Download raw SBOMs + - name: Download Syft SBOM artifact + uses: actions/download-artifact@v4 + with: + name: sbom-syft + path: ${{ runner.temp }} + - name: Download cdxgen SBOM artifact + uses: actions/download-artifact@v4 with: - name: sbom-raw + name: sbom-cdxgen path: ${{ runner.temp }} # Run compliance-assistant sbom-enrich - - name: Enrich SBOM - run: poetry run compliance-assistant sbom enrich -v -f ${{ runner.temp }}/sbom-raw.json -o ${{ runner.temp }}/sbom-enriched.json - # Show and upload enriched SBOM - - name: Print SBOM content - run: cat ${{ runner.temp }}/sbom-enriched.json - - name: Store enriched SBOM as artifact + - name: Enrich Syft SBOM + run: poetry run compliance-assistant sbom enrich -v -f ${{ runner.temp }}/sbom-syft.json -o ${{ runner.temp }}/sbom-syft-enriched.json + - name: Enrich cdxgen SBOM + run: poetry run compliance-assistant sbom enrich -v -f ${{ runner.temp }}/sbom-cdxgen.json -o ${{ runner.temp }}/sbom-cdxgen-enriched.json + # Show enriched SBOMs + - name: Print enriched Syft SBOM content + run: cat ${{ runner.temp }}/sbom-syft-enriched.json + - name: Print enriched cdxgen SBOM content + run: cat ${{ runner.temp }}/sbom-cdxgen-enriched.json + # Compare licensing + - name: Print licenses as found in Syft SBOM + run: poetry run compliance-assistant licensing list -f ${{ runner.temp }}/sbom-syft-enriched.json + - name: Print licenses as found in cdxgen SBOM + run: poetry run compliance-assistant licensing list -f ${{ runner.temp 
}}/sbom-cdxgen-enriched.json + # Store SBOMs as artifacts + - name: Store enriched SBOMs as artifact uses: actions/upload-artifact@v4 with: - name: sbom-enriched - path: ${{ runner.temp }}/sbom-enriched.json + name: sboms-enriched + path: ${{ runner.temp }}/sbom-*-enriched.json diff --git a/README.md b/README.md index e805ce5..c7ccdbf 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,16 @@ SPDX-License-Identifier: Apache-2.0 - **License and Copyright Information Retrieval**: Fetch licensing and copyright details for a single package from ClearlyDefined. - **License compliance support**: Extract and unify licenses from SBOM, suggest possible license outbound candidates -Some of these features are made possible by excellent programs such as [flict](https://github.com/vinland-technology/flict) and [cdxgen](https://github.com/CycloneDX/cdxgen). +Some of these features are made possible by excellent programs such as [flict](https://github.com/vinland-technology/flict), [cdxgen](https://github.com/CycloneDX/cdxgen) and [syft](https://github.com/anchore/syft/). ## Requirements - Python 3.10+ - Internet connection for accessing ClearlyDefined services -- [Docker](https://www.docker.com/) for generating SBOMs +- At least one SBOM generator: + - [syft](https://github.com/anchore/syft/) + - [cdxgen](https://github.com/CycloneDX/cdxgen) + - [Docker](https://www.docker.com/) for generating SBOMs with the dockerized cdxgen ## Installation @@ -108,10 +111,11 @@ For each command, you can get detailed options, e.g., `compliance-assistant sbom ### Examples -* Create an SBOM for the current directory: `compliance-assistant sbom generate -d .` +* Create an SBOM for the current directory using [syft](https://github.com/anchore/syft/): `compliance-assistant sbom generate -g syft -d . 
-o /tmp/my-sbom.json` * Enrich an SBOM with ClearlyDefined data: `compliance-assistant sbom enrich -f /tmp/my-sbom.json -o /tmp/my-enriched-sbom.json` * Extract certain data from an SBOM: `compliance-assistant sbom parse -f /tmp/my-enriched-sbom.json -e purl,copyright,name` * Gather ClearlyDefined licensing/copyright information for one package: `compliance-assistant clearlydefined fetch -p pkg:pypi/inwx-dns-recordmaster@0.3.1` +* Get all licenses found in the enriched SBOM: `compliance-assistant licensing list -f /tmp/my-enriched-sbom.json -o plain` * Get license outbound candidate based on licenses from SBOM: `compliance-assistant licensing outbound -f /tmp/my-enriched-sbom.json` ### Run as GitHub workflow @@ -126,23 +130,8 @@ on: types: [published] jobs: - # Generate raw SBOM using cdxgen, but with NPMJS package, not Docker container - sbom-gen: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - name: Install cdxgen - run: npm install -g @cyclonedx/cdxgen - - name: Generate CycloneDX SBOM with cdxgen - run: cdxgen -r . 
-o ${{ runner.temp }}/sbom-raw.json - name: Store raw SBOM as artifact - uses: actions/upload-artifact@v4 - with: - name: sbom-raw - path: ${{ runner.temp }}/sbom-raw.json - - # Enrich the generated SBOM - sbom-enrich: + # Generate the SBOM with syft and enrich the generated SBOM + sbom-generate-and-enrich: runs-on: ubuntu-22.04 needs: sbom-gen steps: @@ -154,12 +143,14 @@ jobs: cache: "pip" - name: Install compliance-assistant run: pip install compliance-assistant - # Download raw SBOM - - uses: actions/download-artifact@v4 - with: - name: sbom-raw - path: ${{ runner.temp }} - # Run compliance-assistant sbom-enrich + # Install syft + - run: mkdir -p ~/.local/bin + - name: Install syft + run: curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b ~/.local/bin + # Generate SBOM with syft via compliance-assistant + - name: Generate SBOM with syft + run: compliance-assistant sbom generate -g syft -d . -o ${{ runner.temp }}/sbom-raw.json + # Enrich SBOM with compliance-assistant - name: Enrich SBOM run: compliance-assistant sbom enrich -f ${{ runner.temp }}/sbom-raw.json -o ${{ runner.temp }}/sbom-enriched.json # Upload enriched SBOM as artifact diff --git a/complassist/_clearlydefined.py b/complassist/_clearlydefined.py index f171ef4..9963069 100644 --- a/complassist/_clearlydefined.py +++ b/complassist/_clearlydefined.py @@ -64,7 +64,11 @@ def purl_to_cd_coordinates(purl: str) -> str: } coordinates["provider"] = replacer(coordinates["type"], type_to_provider) - return "/".join([v for _, v in coordinates.items()]) + coordinates_string = "/".join([v for _, v in coordinates.items()]) + + logging.debug("Converted '%s' to '%s'", purl, coordinates_string) + + return coordinates_string def _cdapi_call( @@ -74,7 +78,7 @@ def _cdapi_call( basepath: str = "definitions", json_dict: dict | list | None = None, **params: str, -) -> dict: +) -> dict | None: """ Makes a request to the ClearlyDefined API. 
@@ -111,12 +115,19 @@ def _cdapi_call( # Return JSON response if possible try: return result.json() - except JSONDecodeError: + except (JSONDecodeError, AttributeError): logging.debug("JSON return is no valid JSON") - return {"result": result.text} - except AttributeError: - logging.warning("API call did not return a valid response. No ClearlyDefined returned") - return {"result": "error"} + if basepath != "harvest": + try: + error_msg = result.content.decode("UTF-8") + except: # pylint: disable=bare-except + error_msg = result.content + logging.warning( + "Unexpected JSON decoding error as result from %s: %s", + url, + error_msg, + ) + return None def _extract_license_copyright(cd_api_response: dict) -> tuple[str, str]: @@ -204,53 +215,74 @@ def get_clearlydefined_license_and_copyright(coordinates: str) -> tuple[str, str """ api_return = _cdapi_call(coordinates, expand="-files") - declared_license, copyrights = _extract_license_copyright(api_return) + if api_return: + declared_license, copyrights = _extract_license_copyright(api_return) - # Declared license couldn't be extracted. Add to harvest - if not declared_license: - _handle_missing_license_and_request_harvest(coordinates) + # Declared license couldn't be extracted. Add to harvest + if not declared_license: + _handle_missing_license_and_request_harvest(coordinates) - return declared_license, copyrights + return declared_license, copyrights + + # If no valid API result, return empty license and copyright + return "", "" def get_clearlydefined_license_and_copyright_in_batches( purls: list[str], ) -> dict[str, tuple[str, str]]: """ - Retrieves the declared license for multiple purls from ClearlyDefined. + Retrieves the declared license and detected copyright for multiple Package + URLs from ClearlyDefined. - Queries the ClearlyDefined API to get the declared license for the provided - packages via Package URLs. If no license is found, it initiates a - harvest request. 
+ Queries the ClearlyDefined API to retrieve both the declared license and the + detected copyright attributions for multiple packages specified via Package + URLs. If no declared license is found for a package, a harvest request is + initiated. Args: - coordinates (str): The ClearlyDefined coordinates or Package URL for - which to retrieve the license. + purls (list[str]): A list of Package URLs (purls) for which to retrieve + the license and copyright information. Returns: tuple[str, str]: A tuple containing: - The declared license as a string, or an empty string if not found. - The detected copyright attributions as a single string, with each - attribution separated by a newline, or an empty string if not + attribution separated by a newline, or an empty string if none are found. + + Returns a dict of the provided purls and empty tuples if the + ClearlyDefined API did not return valid data. """ + # Create connections between coordinates <-> purl coordinates_purls = {purl_to_cd_coordinates(purl): purl for purl in purls} + # Request the CD API for the coordinates api_return = _cdapi_call( path="", method="POST", json_dict=list(coordinates_purls.keys()), expand="-files" ) - result: dict[str, tuple[str, str]] = {} - for pkg_coordinates, cd_data in api_return.items(): - pkg_purl = coordinates_purls[pkg_coordinates] - declared_license, copyrights = _extract_license_copyright(cd_data) + if api_return: + result: dict[str, tuple[str, str]] = {} + for pkg_coordinates, cd_data in api_return.items(): + # Fetch the corresponding PURL for the coordinates + pkg_purl = coordinates_purls[pkg_coordinates] - # Declared license couldn't be extracted. Add to harvest - if not declared_license: - _handle_missing_license_and_request_harvest(pkg_coordinates) + # Extract license and copyright data from the CD API return + declared_license, copyrights = _extract_license_copyright(cd_data) + + # Declared license couldn't be extracted. 
Add to harvest + if not declared_license: + _handle_missing_license_and_request_harvest(pkg_coordinates) - result[pkg_purl] = (declared_license, copyrights) + result[pkg_purl] = (declared_license, copyrights) - return result + return result + + logging.warning( + "No valid data from ClearlyDefined received for the following packages: %s", + ", ".join(purls), + ) + return {purl: ("", "") for purl in purls} def print_clearlydefined_result(results: tuple[str, str]) -> None: diff --git a/complassist/_helpers.py b/complassist/_helpers.py index 2f6d1a0..cd51e4c 100644 --- a/complassist/_helpers.py +++ b/complassist/_helpers.py @@ -36,7 +36,6 @@ def replacer(string: str, replacement_dict: dict) -> str: """ if string in replacement_dict: replacement = replacement_dict.get(string, "") - logging.debug("Replace '%s' by '%s'", string, replacement) return replacement return string diff --git a/complassist/_licensing.py b/complassist/_licensing.py index 4c93d11..09642ae 100644 --- a/complassist/_licensing.py +++ b/complassist/_licensing.py @@ -19,7 +19,7 @@ def _extract_license_expression_and_names_from_sbom( sbom_path: str, flict_simplify: bool = False ) -> tuple[list[str], list[str]]: - """Exract all SPDX expressions and license names from an SBOM""" + """Extract all SPDX expressions and license names from an SBOM""" lic_expressions = [] lic_names = [] @@ -32,11 +32,14 @@ def _extract_license_expression_and_names_from_sbom( if lic_expression := entry.get("expression", ""): lic_expressions.append(lic_expression) # Use license name instead - else: - lic_dict: dict = entry.get("license", {}) + elif lic_dict := entry.get("license", {}): if lic_name := lic_dict.get("name", ""): lic_names.append(lic_name) + # No license found. 
Warn user + if not licenses_short: + logging.info("No licensing data found for %s (%s)", item.get("name"), item.get("purl")) + # Make expressions and names unique, and sort them expressions = sorted(list(set(lic_expressions))) # If using flict, simplify these found licenses. Will reduce possible diff --git a/complassist/_logging.py b/complassist/_logging.py index aa610c1..674a5af 100644 --- a/complassist/_logging.py +++ b/complassist/_logging.py @@ -16,7 +16,7 @@ def configure_logger(args) -> logging.Logger: level=logging.INFO, ) # Adapt logging level - if args.verbose: + if getattr(args, "verbose", False): log.setLevel("DEBUG") # Activate extreme logging for requests to also get POST data if hasattr(args, "http_debug") and args.http_debug: diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index a74a270..36d5ee6 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -148,15 +148,21 @@ def _enrich_component_with_cd_data( """ # Get purl, original licenses, and short/simplified licenses data from component raw_data = extract_items_from_component( - component, ["purl", "licenses", "licenses-short", "copyright"], flict_simplify=True + component, ["name", "purl", "licenses", "licenses-short", "copyright"], flict_simplify=True ) # Put raw data into separate variables, slightly adapted + name = raw_data["name"] purl = raw_data["purl"] sbom_licenses_item: list[dict] = raw_data["licenses"] sbom_licenses_short_item: list[dict] = raw_data["licenses-short"] sbom_license = licenses_short_to_string(sbom_licenses_short_item) sbom_copyright = raw_data["copyright"] + # If no purl in component, there is no CD data we can enter. 
Abort here + if not purl: + logging.debug("No purl for component '%s', therefore no enrichment of the SBOM", name) + return + # Get fetched licensing/copyright data from ClearlyDefined cd_license = clearlydefined_data[purl].get("license") cd_copyright = clearlydefined_data[purl].get("copyright") @@ -260,9 +266,15 @@ def enrich_sbom_with_clearlydefined( # Loop all contained components, and collect ClearlyDefined data clearlydefined_data: dict[str, dict[str, str]] = {} - all_purls: list[str] = [ - c["purl"] for c in extract_items_from_cdx_sbom(sbom_file, information=["purl"]) - ] + all_purls: list[str] = [] + + # Filter components without PURLs + for pkg in extract_items_from_cdx_sbom(sbom_file, information=["name", "purl"]): + if purl := pkg.get("purl"): + all_purls.append(purl) + else: + logging.info("No purl available for component: %s", pkg.get("name", "")) + if in_batches: # Split all purls in batches of `batch_size` size purls_batches: list[list[str]] = [ diff --git a/complassist/_sbom_generate.py b/complassist/_sbom_generate.py index 5c71224..29bf904 100644 --- a/complassist/_sbom_generate.py +++ b/complassist/_sbom_generate.py @@ -6,10 +6,12 @@ import logging import re +import subprocess import sys from os.path import abspath, basename, dirname from shutil import copy2 from tempfile import NamedTemporaryFile, gettempdir +from typing import Literal from uuid import uuid4 import docker @@ -38,7 +40,7 @@ def _sanitize_container_name(name: str) -> str: return re.sub("^[^a-zA-Z0-9]+", "0", name) -def _run_cdxgen( +def _run_cdxgen_docker( dclient: docker.DockerClient, directory: str, cont_name: str, @@ -95,7 +97,7 @@ def _run_cdxgen( sys.exit(1) -def generate_cdx_sbom(directory: str, output: str = "") -> str: +def sbom_gen_cdxgen_docker(directory: str, output: str = "") -> str: """ Generates a CycloneDX Software Bill of Materials (SBOM) for the project located in the specified directory. 
@@ -143,7 +145,7 @@ def generate_cdx_sbom(directory: str, output: str = "") -> str: # generated by cdxgen under root ownership logging.info("Generating SBOM for %s using cdxgen", directory) with NamedTemporaryFile() as tmpfile: - _run_cdxgen(dclient, directory, cont_name, tmpfile.name) + _run_cdxgen_docker(dclient, directory, cont_name, tmpfile.name) # Copy to final destination with user permissions, or print file if requested if output == "-": @@ -151,6 +153,111 @@ def generate_cdx_sbom(directory: str, output: str = "") -> str: else: copy2(tmpfile.name, output) + logging.info("SBOM has been saved to %s", output) + + return output + + +def _run_program( + program: str, *arguments, working_directory: str | None = None +) -> tuple[int, str, str]: + cmd = [program, *arguments] + logging.debug("Running %s", cmd) + try: + ret = subprocess.run(cmd, cwd=working_directory, capture_output=True, check=False) + except FileNotFoundError as exc: + logging.critical( + "There was an error executing '%s'. 
The file does not seem to exist: %s", program, exc + ) + sys.exit(1) + code = ret.returncode + stderr = ret.stderr.decode("UTF-8").strip() + stdout = ret.stdout.decode("UTF-8").strip() + + return code, stdout, stderr + + +def _run_syft(directory: str, tmpfile: str) -> tuple[int, str, str]: + """Run syft scan to generate SBOM""" + _, syft_version, _ = _run_program("syft", "--version") + logging.info("Running %s to generate SBOM", syft_version) + return _run_program("syft", "scan", f"dir:{directory}", "-o", f"cyclonedx-json={tmpfile}") + + +def _run_cdxgen(directory: str, tmpfile: str) -> tuple[int, str, str]: + """Run cdxgen to generate SBOM""" + _, cdxgen_version, _ = _run_program("cdxgen", "--version") + logging.info("Running cdxgen %s to generate SBOM", cdxgen_version) + return _run_program("cdxgen", "-r", "-o", tmpfile, working_directory=directory) + + +def sbom_gen_system_program( + program: Literal["syft", "cdxgen"], directory: str, output: str = "" +) -> str: + """ + Generates a CycloneDX Software Bill of Materials (SBOM) for the project + located in the specified directory. + + This function can use multiple applications, e.g. syft and cdxgen, as + installed on the system. The resulting SBOM is saved as a JSON file and its + path is returned. + + Args: + program (str): The program which shall be used for SBOM generation. + Supported choices are provided in the type hinting. + + directory (str): The path to the directory containing the project for + which the SBOM is to be generated. The path can be either relative or + absolute. + + output (str): The path to the SBOM that is to be generated. If left + empty, it will be created in a temporary directory. + + Returns: + str: The absolute path to the generated SBOM JSON file. 
+ """ + + with NamedTemporaryFile() as tmpfile: + if program == "syft": + code, stdout, stderr = _run_syft(directory=directory, tmpfile=tmpfile.name) + elif program == "cdxgen": + code, stdout, stderr = _run_cdxgen(directory=directory, tmpfile=tmpfile.name) + else: + logging.critical("Unsupported program provided for SBOM generation") + sys.exit(1) + + if code != 0: + logging.critical("There was an error during SBOM generation: %s\n%s", stdout, stderr) + sys.exit(1) + + # Print file and exit if output is set to `-` + if output == "-": + print_json_file(tmpfile.name) + return "-" + + # Set an output file in a temp location, if none given + if not output: + output = f"{gettempdir()}/sbom-{basename(tmpfile.name)}.json" + + # Copy temporary SBOM file to final destination + try: + copy2(tmpfile.name, output) + except FileNotFoundError: + logging.critical( + "Could not copy the temporary SBOM from '%s' to '%s'. " + "Path does not seem to exist or be accessible.", + tmpfile.name, + output, + ) + sys.exit(1) + except PermissionError: + logging.critical( + "Could not copy the temporary SBOM from '%s' to '%s'. 
User has no permission.", + tmpfile.name, + output, + ) + sys.exit(1) + logging.info("SBOM has been saved to %s", output) return output diff --git a/complassist/main.py b/complassist/main.py index 9b1be79..d3809da 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -22,7 +22,7 @@ from ._licensing import get_outbound_candidate, list_all_licenses from ._logging import configure_logger from ._sbom_enrich import enrich_sbom_with_clearlydefined -from ._sbom_generate import generate_cdx_sbom +from ._sbom_generate import sbom_gen_cdxgen_docker, sbom_gen_system_program from ._sbom_parse import extract_items_from_cdx_sbom # Main parser with root-level flags @@ -52,6 +52,13 @@ help="Generate a CycloneDX SBOM using the cdxgen Docker image", parents=[common_flags], ) +parser_sbom_gen.add_argument( + "-g", + "--generator", + help="SBOM Generator to use", + choices=["syft", "cdxgen", "cdxgen-docker"], + required=True, +) parser_sbom_gen.add_argument( "-d", "--directory", @@ -261,7 +268,12 @@ def main(): # pylint: disable=too-many-branches, too-many-statements # SBOM commands if args.command == "sbom": if args.sbom_command == "generate": - generate_cdx_sbom(directory=args.directory, output=args.output) + if args.generator == "cdxgen-docker": + sbom_gen_cdxgen_docker(directory=args.directory, output=args.output) + else: + sbom_gen_system_program( + program=args.generator, directory=args.directory, output=args.output + ) # Enrich SBOM by ClearlyDefined data elif args.sbom_command == "enrich":