| """Tool for generating Software Bill of Materials (SBOM) for Python's dependencies""" |
| import os |
| import re |
| import hashlib |
| import json |
| import glob |
| import pathlib |
| import subprocess |
| import sys |
| import typing |
| from urllib.request import urlopen |
| |
# Root of the CPython checkout: three levels up from this file
# (this tool lives under Tools/<subdir>/<this file>).
CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent

# Before adding a new entry to this list, double check that
# the license expression is a valid SPDX license expression:
# See: https://spdx.org/licenses
ALLOWED_LICENSE_EXPRESSIONS = {
    "MIT",
    "CC0-1.0",
    "Apache-2.0",
    "BSD-2-Clause",
}

# Properties which are required for our purposes.
# main() errors out if any package in the SBOM file has a key set
# that differs from exactly this set.
REQUIRED_PROPERTIES_PACKAGE = frozenset([
    "SPDXID",
    "name",
    "versionInfo",
    "downloadLocation",
    "checksums",
    "licenseConcluded",
    "externalRefs",
    "originator",
    "primaryPackagePurpose",
])
| |
| |
class PackageFiles(typing.NamedTuple):
    """Structure for describing the files of a package"""
    # Glob patterns (relative to CPYTHON_ROOT_DIR, '**' allowed) naming
    # the files that this package contributes to the tree.
    include: list[str]
    # Exact relative paths matched by 'include' that should be skipped
    # because they are CPython-authored, not part of the upstream package.
    exclude: list[str] | None = None
| |
| |
# SBOMS don't have a method to specify the sources of files
# so we need to do that external to the SBOM itself. Add new
# values to 'exclude' if we create new files within tracked
# directories that aren't sourced from third-party packages.
PACKAGE_TO_FILES = {
    # NOTE: pip's entry in this structure is automatically generated in
    # the 'discover_pip_sbom_package()' function below.
    "mpdecimal": PackageFiles(
        include=["Modules/_decimal/libmpdec/**"]
    ),
    "expat": PackageFiles(
        include=["Modules/expat/**"]
    ),
    "macholib": PackageFiles(
        include=["Lib/ctypes/macholib/**"],
        # CPython-authored helper files living inside the vendored tree.
        exclude=[
            "Lib/ctypes/macholib/README.ctypes",
            "Lib/ctypes/macholib/fetch_macholib",
            "Lib/ctypes/macholib/fetch_macholib.bat",
        ],
    ),
    "libb2": PackageFiles(
        include=["Modules/_blake2/impl/**"]
    ),
    "hacl-star": PackageFiles(
        include=["Modules/_hacl/**"],
        # CPython-authored glue/refresh files, not upstream HACL* sources.
        exclude=[
            "Modules/_hacl/refresh.sh",
            "Modules/_hacl/README.md",
            "Modules/_hacl/python_hacl_namespace.h",
        ]
    ),
}
| |
| |
def spdx_id(value: str) -> str:
    """Encode a value into characters that are valid in an SPDX ID.

    Every run of characters outside [a-zA-Z0-9.-] collapses into a
    single '-' so the result is safe to use as an SPDXID.
    """
    disallowed_chars = re.compile(r"[^a-zA-Z0-9.\-]+")
    return disallowed_chars.sub("-", value)
| |
| |
def error_if(value: bool, error_message: str) -> None:
    """Prints an error if a comparison fails along with a link to the devguide.

    When 'value' is true, prints 'error_message' plus a pointer to the
    devguide SBOM docs and exits with status 1; otherwise does nothing.
    """
    if not value:
        return
    print(error_message)
    print("See 'https://devguide.python.org/developer-workflow/sbom' for more information.")
    sys.exit(1)
| |
| |
def filter_gitignored_paths(paths: list[str]) -> list[str]:
    """
    Filter out paths excluded by the gitignore file.
    The output of 'git check-ignore --non-matching --verbose' looks
    like this for non-matching (included) files:

        '::<whitespace><path>'

    And looks like this for matching (excluded) files:

        '.gitignore:9:*.a    Tools/lib.a'

    Returns the included paths sorted.
    """
    # Filter out files in gitignore.
    # Non-matching files show up as '::<whitespace><path>'
    git_check_ignore_proc = subprocess.run(
        ["git", "check-ignore", "--verbose", "--non-matching", *paths],
        cwd=CPYTHON_ROOT_DIR,
        check=False,
        stdout=subprocess.PIPE,
    )
    # 1 means matches, 0 means no matches.
    assert git_check_ignore_proc.returncode in (0, 1)

    # Return the list of paths sorted. In '--verbose' output git separates
    # the (empty, for non-matching lines) pattern field from the pathname
    # with a TAB, so split on the first TAB rather than on arbitrary
    # whitespace — 'line.split()[-1]' would truncate paths containing spaces.
    git_check_ignore_lines = git_check_ignore_proc.stdout.decode().splitlines()
    return sorted(
        line.split("\t", maxsplit=1)[-1]
        for line in git_check_ignore_lines
        if line.startswith("::")
    )
| |
| |
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA.

    Mutates 'sbom_data' in-place: replaces any existing 'pip' entry in
    'packages' with one generated from the wheel bundled in
    'Lib/ensurepip/_bundled', and registers the wheel in PACKAGE_TO_FILES so
    the file/relationship generator picks it up. Exits the process if the
    wheel is missing/ambiguous, PyPI can't be reached, or checksums mismatch.
    """
    # NOTE: no 'global' statement needed — PACKAGE_TO_FILES is only
    # item-assigned below, never rebound.
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"
    pip_wheels = []

    # Find the hopefully one pip wheel in the bundled directory.
    for wheel_filename in os.listdir(ensurepip_bundled_dir):
        if wheel_filename.startswith("pip-"):
            pip_wheels.append(wheel_filename)
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        sys.exit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        raw_text = urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json").read()
        pip_release_metadata = json.loads(raw_text)
        url: dict[str, typing.Any]

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        for url in pip_release_metadata["urls"]:
            if url["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if url["digests"]["sha256"] != pip_checksum_sha256:
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = url["url"]

    except (OSError, ValueError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        sys.exit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
| |
| |
def main() -> None:
    """Validate the SBOM file against this tool's metadata and regenerate
    its 'files' and 'relationships' sections from the working tree."""
    sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
    sbom_data = json.loads(sbom_path.read_bytes())

    # Insert pip's SBOM metadata from the wheel.
    discover_pip_sbom_package(sbom_data)

    # Ensure all packages in this tool are represented also in the SBOM file.
    error_if(
        {package["name"] for package in sbom_data["packages"]} != set(PACKAGE_TO_FILES),
        "Packages defined in SBOM tool don't match those defined in SBOM file.",
    )

    # Make a bunch of assertions about the SBOM data to ensure it's consistent.
    for package in sbom_data["packages"]:
        # Properties and ID must be properly formed.
        error_if(
            "name" not in package,
            "Package is missing the 'name' field"
        )
        error_if(
            set(package.keys()) != REQUIRED_PROPERTIES_PACKAGE,
            f"Package '{package['name']}' is missing required fields",
        )
        error_if(
            package["SPDXID"] != spdx_id(f"SPDXRef-PACKAGE-{package['name']}"),
            f"Package '{package['name']}' has a malformed SPDXID",
        )

        # Version must be in the download and external references.
        version = package["versionInfo"]
        error_if(
            version not in package["downloadLocation"],
            f"Version '{version}' for package '{package['name']}' not in 'downloadLocation' field",
        )
        error_if(
            any(version not in ref["referenceLocator"] for ref in package["externalRefs"]),
            (
                f"Version '{version}' for package '{package['name']}' not in "
                f"all 'externalRefs[].referenceLocator' fields"
            ),
        )

        # License must be on the approved list for SPDX.
        license_concluded = package["licenseConcluded"]
        error_if(
            license_concluded not in ALLOWED_LICENSE_EXPRESSIONS,
            f"License identifier '{license_concluded}' not in SBOM tool allowlist"
        )

    # Regenerate file information from current data.
    sbom_files = []
    sbom_relationships = []

    # We call 'sorted()' here a lot to avoid filesystem scan order issues.
    for name, files in sorted(PACKAGE_TO_FILES.items()):
        package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}")
        exclude = files.exclude or ()
        for include in sorted(files.include):
            # Find all the paths and then filter them through .gitignore.
            paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
            paths = filter_gitignored_paths(paths)
            error_if(
                len(paths) == 0,
                f"No valid paths found at path '{include}' for package '{name}'",
            )

            for path in paths:
                # Skip directories and excluded files
                if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
                    continue

                # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
                data = (CPYTHON_ROOT_DIR / path).read_bytes()
                checksum_sha1 = hashlib.sha1(data).hexdigest()
                checksum_sha256 = hashlib.sha256(data).hexdigest()

                file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}")
                sbom_files.append({
                    "SPDXID": file_spdx_id,
                    "fileName": path,
                    "checksums": [
                        {"algorithm": "SHA1", "checksumValue": checksum_sha1},
                        {"algorithm": "SHA256", "checksumValue": checksum_sha256},
                    ],
                })

                # Tie each file back to its respective package.
                sbom_relationships.append({
                    "spdxElementId": package_spdx_id,
                    "relatedSpdxElement": file_spdx_id,
                    "relationshipType": "CONTAINS",
                })

    # Update the SBOM on disk
    sbom_data["files"] = sbom_files
    sbom_data["relationships"] = sbom_relationships
    sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True))
| |
| |
| if __name__ == "__main__": |
| main() |