wild-cloud/tools/collect_files.py

#!/usr/bin/env python3
"""
Collect Files Utility

Recursively scans the specified file/directory patterns and outputs a single Markdown
document containing each file's relative path and its content.

This tool helps aggregate source code files for analysis or documentation purposes.

Usage examples:

  # Collect all Python files in the current directory:
  python collect_files.py *.py > my_python_files.md

  # Collect all files in the 'output' directory:
  python collect_files.py output > my_output_dir_files.md

  # Collect specific files, excluding 'utils' and 'logs', but including Markdown files from 'utils':
  python collect_files.py *.py --exclude "utils,logs,__pycache__,*.pyc" --include "utils/*.md" > my_output.md
"""
import argparse
import datetime
import fnmatch
import glob
import os
import pathlib
from typing import List, Optional, Set, Tuple

# Default exclude patterns: common directories and binary files to ignore.
DEFAULT_EXCLUDE = [".venv", "node_modules", "*.lock", ".git", "__pycache__", "*.pyc", "*.ruff_cache", "logs", "output"]


def parse_patterns(pattern_str: str) -> List[str]:
    """Splits a comma-separated string into a list of stripped patterns."""
    return [p.strip() for p in pattern_str.split(",") if p.strip()]
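
# Example (illustrative): parse_patterns splits on commas, trims whitespace, and
# drops empty entries.
#
#     parse_patterns("utils, logs, *.pyc")  # -> ["utils", "logs", "*.pyc"]
#     parse_patterns("")                    # -> []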


def resolve_pattern(pattern: str) -> str:
    """
    Resolves a pattern that might contain relative path navigation.
    Returns the absolute path of the pattern.
    """
    # Convert the pattern to a Path object
    pattern_path = pathlib.Path(pattern)

    # Check if the pattern is absolute or contains relative navigation
    if os.path.isabs(pattern) or ".." in pattern:
        # Resolve to absolute path
        return str(pattern_path.resolve())

    # For simple patterns without navigation, return as is
    return pattern
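
# Example (illustrative, assuming the script runs from /home/user/project):
#
#     resolve_pattern("../shared/*.md")  # -> "/home/user/shared/*.md"
#     resolve_pattern("*.py")            # -> "*.py" (returned unchanged)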


def match_pattern(path: str, pattern: str, component_matching=False) -> bool:
    """
    Centralized pattern matching logic.

    Args:
        path: File path to match against
        pattern: Pattern to match
        component_matching: If True, matches individual path components
                            (used primarily for exclude patterns)

    Returns:
        True if path matches the pattern
    """
    # For simple exclude-style component matching
    if component_matching:
        parts = os.path.normpath(path).split(os.sep)
        for part in parts:
            if fnmatch.fnmatch(part, pattern):
                return True
        return False

    # Convert paths to absolute for consistent comparison
    abs_path = os.path.abspath(path)

    # Handle relative path navigation in the pattern
    if ".." in pattern or "/" in pattern or "\\" in pattern:
        # If pattern contains path navigation, resolve it to an absolute path
        # (made absolute here so it can be compared against abs_path below).
        resolved_pattern = os.path.abspath(resolve_pattern(pattern))

        # Check if this is a directory pattern with a wildcard
        if "*" in resolved_pattern:
            # Get the directory part of the pattern
            pattern_dir = os.path.dirname(resolved_pattern)
            # Get the filename pattern
            pattern_file = os.path.basename(resolved_pattern)

            # Check if the file is in or under the pattern directory
            # (compare on a separator boundary so e.g. "utils2" does not match "utils")
            file_dir = os.path.dirname(abs_path)
            if file_dir == pattern_dir or file_dir.startswith(pattern_dir + os.sep):
                # Match the filename against the pattern
                return fnmatch.fnmatch(os.path.basename(abs_path), pattern_file)
            return False  # Not under the pattern directory
        else:
            # Direct file match
            return abs_path == resolved_pattern or fnmatch.fnmatch(abs_path, resolved_pattern)
    else:
        # Regular pattern without navigation, use relative path matching
        return fnmatch.fnmatch(path, pattern)
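
# Example (illustrative): component matching checks each path segment against the
# pattern, while the default mode matches the path as a whole with fnmatch.
#
#     match_pattern("src/__pycache__/mod.pyc", "__pycache__", component_matching=True)  # -> True
#     match_pattern("src/module.py", "*.py")                                            # -> True
#     match_pattern("src/module.py", "*.md")                                            # -> False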


def should_exclude(path: str, exclude_patterns: List[str]) -> bool:
    """
    Returns True if any component of the path matches an exclude pattern.
    """
    for pattern in exclude_patterns:
        if match_pattern(path, pattern, component_matching=True):
            return True
    return False


def should_include(path: str, include_patterns: List[str]) -> bool:
    """
    Returns True if the path matches any of the include patterns.
    Handles relative path navigation in include patterns.
    """
    for pattern in include_patterns:
        if match_pattern(path, pattern):
            return True
    return False
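
# Example (illustrative): exclusion is component-based, and a matching include
# pattern can override it later in process_file.
#
#     should_exclude("node_modules/react/index.js", DEFAULT_EXCLUDE)  # -> True
#     should_include("docs/readme.md", ["*.md"])                      # -> True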


def collect_files(patterns: List[str], exclude_patterns: List[str], include_patterns: List[str]) -> List[str]:
    """
    Collects file paths matching the given patterns, applying exclusion first.
    Files that match an include pattern are added back in.
    Returns a sorted list of absolute file paths.
    """
    collected = set()

    # Process included files with simple filenames or relative paths
    for pattern in include_patterns:
        # Check for files in the current directory first
        direct_matches = glob.glob(pattern, recursive=True)
        for match in direct_matches:
            if os.path.isfile(match):
                collected.add(os.path.abspath(match))

        # Then check for relative paths
        if ".." in pattern or os.path.isabs(pattern):
            resolved_pattern = resolve_pattern(pattern)

            # Direct file inclusion
            if "*" not in resolved_pattern and os.path.isfile(resolved_pattern):
                collected.add(resolved_pattern)
            else:
                # Pattern with wildcards
                directory = os.path.dirname(resolved_pattern)
                if os.path.exists(directory):
                    filename_pattern = os.path.basename(resolved_pattern)
                    for root, _, files in os.walk(directory):
                        for file in files:
                            if fnmatch.fnmatch(file, filename_pattern):
                                full_path = os.path.join(root, file)
                                collected.add(os.path.abspath(full_path))

    # Process the main patterns
    for pattern in patterns:
        matches = glob.glob(pattern, recursive=True)
        for path in matches:
            if os.path.isfile(path):
                process_file(path, collected, exclude_patterns, include_patterns)
            elif os.path.isdir(path):
                process_directory(path, collected, exclude_patterns, include_patterns)

    return sorted(collected)
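
# Example (illustrative) of calling the collector programmatically; the "src"
# directory name is an assumption, any existing path works.
#
#     paths = collect_files(["src"], DEFAULT_EXCLUDE, ["*.md"])
#     # paths is a sorted list of absolute file paths, with .git, __pycache__, etc. pruned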


def process_file(file_path: str, collected: Set[str], exclude_patterns: List[str], include_patterns: List[str]) -> None:
    """Process a single file"""
    abs_path = os.path.abspath(file_path)
    rel_path = os.path.relpath(file_path)

    # Skip if excluded and not specifically included
    if should_exclude(rel_path, exclude_patterns) and not should_include(rel_path, include_patterns):
        return

    collected.add(abs_path)


def process_directory(
    dir_path: str, collected: Set[str], exclude_patterns: List[str], include_patterns: List[str]
) -> None:
    """Process a directory recursively"""
    for root, dirs, files in os.walk(dir_path):
        # Filter directories based on exclude patterns, but respect include patterns
        dirs[:] = [
            d
            for d in dirs
            if not should_exclude(os.path.join(root, d), exclude_patterns)
            or should_include(os.path.join(root, d), include_patterns)
        ]

        # Process each file in the directory
        for file in files:
            full_path = os.path.join(root, file)
            process_file(full_path, collected, exclude_patterns, include_patterns)
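
# Note: assigning to dirs[:] (in place) is what tells os.walk to skip the pruned
# directories entirely, so excluded trees such as .git or node_modules are never
# descended into.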


def read_file(file_path: str) -> Tuple[str, Optional[str]]:
    """
    Read a file and return its content.

    Returns:
        Tuple of (content, error_message)
    """
    # Check if file is likely binary
    try:
        with open(file_path, "rb") as f:
            chunk = f.read(1024)
            if b"\0" in chunk:  # Simple binary check
                return "[Binary file not displayed]", None

        # If not binary, read as text
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read(), None
    except UnicodeDecodeError:
        # Handle encoding issues
        return "[File contains non-UTF-8 characters]", None
    except Exception as e:
        return "", f"[ERROR reading file: {e}]"


def format_output(
    file_paths: List[str],
    format_type: str,
    exclude_patterns: List[str],
    include_patterns: List[str],
    patterns: List[str],
) -> str:
    """
    Format the collected files according to the output format.

    Args:
        file_paths: List of absolute file paths to format
        format_type: Output format type ("markdown" or "plain")
        exclude_patterns: List of exclusion patterns (for info)
        include_patterns: List of inclusion patterns (for info)
        patterns: Original input patterns (for info)

    Returns:
        Formatted output string
    """
    output_lines = []

    # Add metadata header
    now = datetime.datetime.now()
    # Note: the "-" flag in %-m, %-d, %-I is a glibc/BSD strftime extension and is
    # not supported by the Windows C runtime.
    date_str = now.strftime("%-m/%-d/%Y, %-I:%M:%S %p")
    output_lines.append(f"# {patterns}")
    output_lines.append("")
    output_lines.append("[collect-files]")
    output_lines.append("")
    output_lines.append(f"**Search:** {patterns}")
    output_lines.append(f"**Exclude:** {exclude_patterns}")
    output_lines.append(f"**Include:** {include_patterns}")
    output_lines.append(f"**Date:** {date_str}")
    output_lines.append(f"**Files:** {len(file_paths)}\n\n")

    # Process each file
    for file_path in file_paths:
        rel_path = os.path.relpath(file_path)

        # Add file header based on format
        if format_type == "markdown":
            output_lines.append(f"### File: {rel_path}")
            output_lines.append("```")
        else:
            output_lines.append(f"=== File: {rel_path} ===")

        # Read and add file content
        content, error = read_file(file_path)
        if error:
            output_lines.append(error)
        else:
            output_lines.append(content)

        # Add file footer based on format
        if format_type == "markdown":
            output_lines.append("```")

        # Add separator between files
        output_lines.append("\n")

    return "\n".join(output_lines)


def main() -> None:
    """Main function"""
    parser = argparse.ArgumentParser(
        description="Recursively collect files matching the given patterns and output a document with file names and content."
    )
    parser.add_argument("patterns", nargs="+", help="File and/or directory patterns to collect (e.g. *.py or output)")
    parser.add_argument(
        "--exclude",
        type=str,
        default="",
        help="Comma-separated patterns to exclude (will be combined with default excludes: "
        + ",".join(DEFAULT_EXCLUDE)
        + ")",
    )
    parser.add_argument(
        "--include", type=str, default="", help="Comma-separated patterns to include (overrides excludes if matched)"
    )
    parser.add_argument(
        "--format", type=str, choices=["markdown", "plain"], default="plain", help="Output format (default: plain)"
    )
    args = parser.parse_args()

    # Parse pattern arguments and combine with default excludes
    user_exclude_patterns = parse_patterns(args.exclude)
    exclude_patterns = DEFAULT_EXCLUDE + user_exclude_patterns
    include_patterns = parse_patterns(args.include) if args.include else []

    # Collect files
    patterns = args.patterns
    files = collect_files(patterns, exclude_patterns, include_patterns)

    # Format and print output
    output = format_output(files, args.format, exclude_patterns, include_patterns, patterns)
    print(output)


if __name__ == "__main__":
    main()
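
# Example invocation (illustrative; the paths are hypothetical). Quoting the glob
# keeps the shell from expanding it, so glob.glob(..., recursive=True) handles "**":
#
#     python collect_files.py "src/**/*.py" --format markdown \
#         --exclude "tests,fixtures" --include "tests/conftest.py" > source_dump.md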