Source code for pynydus.common.scan_paths

"""File classification for credential and PII scanning.

``classify`` returns **ignored** for known binary/non-text extensions and
**plain** for everything else. ``partition_files`` splits on that: ignored
entries are skipped. The rest are scanned as UTF-8 text.
"""

from __future__ import annotations

from typing import Literal

# Closed set of category labels a file can be assigned for scanning purposes.
# NOTE(review): ``classify`` in this module only ever returns "ignored" or
# "plain"; "structured" and "markdown" are not produced here — confirm whether
# other callers rely on them before removing.
FileCategory = Literal["ignored", "structured", "markdown", "plain"]

# Extensions (lowercase, without the leading dot) of files that are skipped
# rather than scanned. Entries are grouped by kind purely for readability;
# the resulting set is flat.
IGNORED_EXTENSIONS: frozenset[str] = frozenset(
    (
        "png jpg jpeg gif webp ico svg pdf "  # images and documents
        "zip egg gz tar bz2 xz 7z "  # archives and compressed files
        "woff woff2 ttf otf eot "  # fonts
        "mp3 mp4 wav ogg webm avi "  # audio and video
        "bin exe dll so dylib pyc pyo class"  # executables and compiled code
    ).split()
)


def classify(name: str) -> FileCategory:
    """Map a filename to the scanning category it belongs to.

    The extension is the text after the final ``.`` in *name*, compared
    case-insensitively against ``IGNORED_EXTENSIONS``. A name containing no
    ``.`` is treated as having an empty extension.

    Args:
        name: File path or basename.

    Returns:
        ``"ignored"`` when the extension is listed in ``IGNORED_EXTENSIONS``,
        otherwise ``"plain"``.
    """
    # rpartition yields an empty separator when *name* contains no dot.
    _, dot, suffix = name.rpartition(".")
    extension = suffix.lower() if dot else ""
    if extension in IGNORED_EXTENSIONS:
        return "ignored"
    return "plain"
def partition_files(
    files: dict[str, str],
) -> tuple[dict[str, str], dict[str, str]]:
    """Separate *files* into entries to scan and entries to skip.

    Each key is passed to ``classify``; entries categorised as ``"ignored"``
    go into the second dict and every other entry goes into the first. Both
    result dicts keep the relative iteration order of *files*.

    Args:
        files: Map of relative path to file body.

    Returns:
        A pair ``(scannable, ignored)`` of new dicts; *files* itself is not
        modified.
    """
    skipped = {
        path: body for path, body in files.items() if classify(path) == "ignored"
    }
    # Whatever was not skipped is scannable; keys are unique, so a membership
    # test against ``skipped`` avoids classifying each path a second time.
    to_scan = {path: body for path, body in files.items() if path not in skipped}
    return to_scan, skipped