330 lines
11 KiB
Python
330 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Collect Files Utility
|
|
|
|
Recursively scans the specified file/directory patterns and outputs a single document
(plain text by default, Markdown with --format markdown) containing each file's
relative path and its content.
|
|
|
|
This tool helps aggregate source code files for analysis or documentation purposes.
|
|
|
|
Usage examples:
|
|
# Collect all Python files in the current directory:
|
|
python collect_files.py *.py > my_python_files.md
|
|
|
|
# Collect all files in the 'output' directory:
|
|
python collect_files.py output > my_output_dir_files.md
|
|
|
|
# Collect specific files, excluding 'utils' and 'logs', but including Markdown files from 'utils':
|
|
python collect_files.py *.py --exclude "utils,logs,__pycache__,*.pyc" --include "utils/*.md" > my_output.md
|
|
"""
|
|
|
|
import argparse
|
|
import datetime
|
|
import fnmatch
|
|
import glob
|
|
import os
|
|
import pathlib
|
|
from typing import List, Optional, Set, Tuple
|
|
|
|
# Default exclude patterns: common directories and binary files to ignore.
|
|
# These entries are always applied; patterns passed via --exclude are appended to them.
DEFAULT_EXCLUDE = [
    ".venv",
    "node_modules",
    "*.lock",
    ".git",
    "__pycache__",
    "*.pyc",
    "*.ruff_cache",
    "logs",
    "output",
]
|
|
|
|
|
|
def parse_patterns(pattern_str: str) -> List[str]:
    """Break a comma-delimited string into its trimmed, non-empty pieces."""
    trimmed = (piece.strip() for piece in pattern_str.split(","))
    # Empty pieces (e.g. from "a,,b" or a trailing comma) are dropped.
    return [piece for piece in trimmed if piece]
|
|
|
|
|
|
def resolve_pattern(pattern: str) -> str:
    """
    Turn a pattern that uses absolute or ".." navigation into a resolved absolute path.

    Patterns that are neither absolute nor contain ".." are handed back untouched.

    Args:
        pattern: A file pattern, possibly containing wildcards.

    Returns:
        The resolved absolute path string, or the original pattern when no
        resolution is needed.
    """
    needs_resolution = os.path.isabs(pattern) or ".." in pattern
    if not needs_resolution:
        # Plain patterns such as "*.py" stay relative.
        return pattern

    return str(pathlib.Path(pattern).resolve())
|
|
|
|
|
|
def match_pattern(path: str, pattern: str, component_matching: bool = False) -> bool:
    """
    Centralized pattern matching logic.

    Three matching modes are supported:

    * Component matching (``component_matching=True``): the path is normalized
      and split on ``os.sep``; the pattern matches if any single component
      matches it via ``fnmatch``.
    * Path-style patterns (containing ``..``, ``/`` or ``\\``): both the path
      and the pattern are made absolute before comparison. A wildcard pattern
      matches files located in, or anywhere under, the pattern's directory
      whose base name matches the pattern's base name.
    * Plain patterns: ``fnmatch`` is applied to the path exactly as given.

    Args:
        path: File path to match against
        pattern: Pattern to match
        component_matching: If True, matches individual path components
            (used primarily for exclude patterns)

    Returns:
        True if path matches the pattern
    """
    # For simple exclude-style component matching
    if component_matching:
        parts = os.path.normpath(path).split(os.sep)
        return any(fnmatch.fnmatch(part, pattern) for part in parts)

    # Regular pattern without navigation, use the path exactly as supplied
    if not (".." in pattern or "/" in pattern or "\\" in pattern):
        return fnmatch.fnmatch(path, pattern)

    # Convert both sides to absolute form for a like-for-like comparison
    abs_path = os.path.abspath(path)
    resolved_pattern = resolve_pattern(pattern)
    if not os.path.isabs(resolved_pattern):
        # resolve_pattern leaves relative patterns such as "utils/*.md"
        # untouched; anchor them so they can match the absolute file path.
        resolved_pattern = os.path.abspath(resolved_pattern)

    if "*" not in resolved_pattern:
        # Direct file match
        return abs_path == resolved_pattern or fnmatch.fnmatch(abs_path, resolved_pattern)

    pattern_dir = os.path.dirname(resolved_pattern)
    pattern_file = os.path.basename(resolved_pattern)
    file_dir = os.path.dirname(abs_path)

    # Require a real directory boundary: a bare startswith() would wrongly
    # treat "/a/utils2" as being under "/a/utils".
    dir_prefix = pattern_dir.rstrip(os.sep) + os.sep
    if file_dir != pattern_dir and not file_dir.startswith(dir_prefix):
        return False  # Not in or under the pattern directory

    # Match the filename against the pattern
    return fnmatch.fnmatch(os.path.basename(abs_path), pattern_file)
|
|
|
|
|
|
def should_exclude(path: str, exclude_patterns: List[str]) -> bool:
    """
    Report whether the path is hit by at least one exclude pattern.

    Each pattern is tested against the individual components of the path
    (component matching), so a pattern can match a directory name anywhere
    in the path as well as the file name.
    """
    return any(
        match_pattern(path, candidate, component_matching=True)
        for candidate in exclude_patterns
    )
|
|
|
|
|
|
def should_include(path: str, include_patterns: List[str]) -> bool:
    """
    Report whether the path is hit by at least one include pattern.

    Patterns are evaluated with match_pattern in its default (non-component)
    mode, which is where path-style patterns with navigation are handled.
    """
    return any(match_pattern(path, candidate) for candidate in include_patterns)
|
|
|
|
|
|
def collect_files(patterns: List[str], exclude_patterns: List[str], include_patterns: List[str]) -> List[str]:
    """
    Gather every file selected by the search patterns and the include patterns.

    Include patterns are expanded first and their matches are added
    unconditionally. The main patterns are then expanded; matched files and
    the contents of matched directories go through the exclude/include
    filtering performed by process_file and process_directory.

    Args:
        patterns: Glob patterns (files and/or directories) to search.
        exclude_patterns: Patterns whose matches are skipped.
        include_patterns: Patterns whose matches are kept even if excluded.

    Returns:
        A sorted list of absolute file paths.
    """
    found: Set[str] = set()

    # Pass 1: files named directly by the include patterns.
    for inc in include_patterns:
        found.update(
            os.path.abspath(hit)
            for hit in glob.glob(inc, recursive=True)
            if os.path.isfile(hit)
        )

        # Only absolute patterns or ones using ".." get the extra resolution step.
        if not (".." in inc or os.path.isabs(inc)):
            continue

        target = resolve_pattern(inc)

        # A concrete, existing file is taken as is.
        if "*" not in target and os.path.isfile(target):
            found.add(target)
            continue

        # Otherwise walk the pattern's directory and match on file name.
        base_dir, name_glob = os.path.split(target)
        if not os.path.exists(base_dir):
            continue
        for current, _subdirs, names in os.walk(base_dir):
            found.update(
                os.path.abspath(os.path.join(current, name))
                for name in fnmatch.filter(names, name_glob)
            )

    # Pass 2: the main search patterns, subject to exclude/include filtering.
    for spec in patterns:
        for hit in glob.glob(spec, recursive=True):
            if os.path.isfile(hit):
                process_file(hit, found, exclude_patterns, include_patterns)
            elif os.path.isdir(hit):
                process_directory(hit, found, exclude_patterns, include_patterns)

    return sorted(found)
|
|
|
|
|
|
def process_file(file_path: str, collected: Set[str], exclude_patterns: List[str], include_patterns: List[str]) -> None:
    """
    Add one file to the collected set unless it is filtered out.

    The file is judged by its path relative to the current working directory.
    It is kept when no exclude pattern matches, or when an include pattern
    matches despite the exclusion. The absolute path is what gets stored.

    Args:
        file_path: Path of the candidate file.
        collected: Set of absolute paths, updated in place.
        exclude_patterns: Patterns whose matches are skipped.
        include_patterns: Patterns that override an exclusion.
    """
    relative = os.path.relpath(file_path)

    # Includes are only consulted for files that an exclude pattern rejected.
    if not should_exclude(relative, exclude_patterns) or should_include(relative, include_patterns):
        collected.add(os.path.abspath(file_path))
|
|
|
|
|
|
def process_directory(
    dir_path: str, collected: Set[str], exclude_patterns: List[str], include_patterns: List[str]
) -> None:
    """
    Walk a directory tree and feed every file found to process_file.

    Sub-directories that match an exclude pattern are pruned from the walk
    unless an include pattern matches them.

    Args:
        dir_path: Root directory of the walk.
        collected: Set of absolute paths, updated in place.
        exclude_patterns: Patterns whose matches are skipped.
        include_patterns: Patterns that override an exclusion.
    """
    for current, subdirs, filenames in os.walk(dir_path):
        survivors = []
        for name in subdirs:
            candidate = os.path.join(current, name)
            if not should_exclude(candidate, exclude_patterns):
                survivors.append(name)
            elif should_include(candidate, include_patterns):
                survivors.append(name)
        # In-place assignment is what tells os.walk not to descend into pruned dirs.
        subdirs[:] = survivors

        for name in filenames:
            process_file(os.path.join(current, name), collected, exclude_patterns, include_patterns)
|
|
|
|
|
|
def read_file(file_path: str) -> Tuple[str, Optional[str]]:
    """
    Load a file's text, substituting a placeholder for undisplayable content.

    The first 1024 bytes are inspected for a NUL byte; if one is present the
    file is treated as binary. Otherwise the file is read as UTF-8 text.

    Returns:
        Tuple of (content, error_message). error_message is None unless the
        file could not be read, in which case content is an empty string.
        Binary and non-UTF-8 files yield a placeholder string as content.
    """
    try:
        # Sniff the head of the file for a NUL byte as a cheap binary test.
        with open(file_path, "rb") as raw:
            head = raw.read(1024)
        if b"\0" in head:
            return "[Binary file not displayed]", None

        with open(file_path, "r", encoding="utf-8") as text:
            body = text.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: report it as content rather than as an error.
        return "[File contains non-UTF-8 characters]", None
    except Exception as exc:
        return "", f"[ERROR reading file: {exc}]"

    return body, None
|
|
|
|
|
|
def format_output(
    file_paths: List[str],
    format_type: str,
    exclude_patterns: List[str],
    include_patterns: List[str],
    patterns: List[str],
) -> str:
    """
    Format the collected files according to the output format.

    The output starts with a metadata header (search/exclude/include patterns,
    a local timestamp and the file count), followed by one section per file
    containing its path relative to the current working directory and its
    content as returned by read_file.

    Args:
        file_paths: List of absolute file paths to format
        format_type: Output format type ("markdown" or "plain")
        exclude_patterns: List of exclusion patterns (for info)
        include_patterns: List of inclusion patterns (for info)
        patterns: Original input patterns (for info)

    Returns:
        Formatted output string
    """
    output_lines = []

    # Build the un-padded month/day/hour from the datetime fields instead of
    # the "%-m"-style strftime flags, which are a platform-specific extension
    # and are rejected on platforms such as Windows.
    now = datetime.datetime.now()
    hour_12 = now.hour % 12 or 12
    date_str = f"{now.month}/{now.day}/{now.year}, {hour_12}:{now.strftime('%M:%S %p')}"

    # Add metadata header
    output_lines.append(f"# {patterns}")
    output_lines.append("")
    output_lines.append("[collect-files]")
    output_lines.append("")
    output_lines.append(f"**Search:** {patterns}")
    output_lines.append(f"**Exclude:** {exclude_patterns}")
    output_lines.append(f"**Include:** {include_patterns}")
    output_lines.append(f"**Date:** {date_str}")
    output_lines.append(f"**Files:** {len(file_paths)}\n\n")

    is_markdown = format_type == "markdown"

    # Process each file
    for file_path in file_paths:
        rel_path = os.path.relpath(file_path)

        # Add file header based on format
        if is_markdown:
            output_lines.append(f"### File: {rel_path}")
            output_lines.append("```")
        else:
            output_lines.append(f"=== File: {rel_path} ===")

        # Read and add file content; a read error replaces the content
        content, error = read_file(file_path)
        output_lines.append(error if error else content)

        # Add file footer based on format
        if is_markdown:
            output_lines.append("```")

        # Add separator between files
        output_lines.append("\n")

    return "\n".join(output_lines)
|
|
|
|
|
|
def main() -> None:
    """
    Command-line entry point.

    Parses the search patterns and the --exclude / --include / --format
    options, collects the matching files and prints the formatted document
    to standard output. User excludes are appended to DEFAULT_EXCLUDE.
    """
    default_excludes = ",".join(DEFAULT_EXCLUDE)

    parser = argparse.ArgumentParser(
        description="Recursively collect files matching the given patterns and output a document with file names and content."
    )
    parser.add_argument(
        "patterns",
        nargs="+",
        help="File and/or directory patterns to collect (e.g. *.py or output)",
    )
    parser.add_argument(
        "--exclude",
        type=str,
        default="",
        help=f"Comma-separated patterns to exclude (will be combined with default excludes: {default_excludes})",
    )
    parser.add_argument(
        "--include",
        type=str,
        default="",
        help="Comma-separated patterns to include (overrides excludes if matched)",
    )
    parser.add_argument(
        "--format",
        type=str,
        choices=["markdown", "plain"],
        default="plain",
        help="Output format (default: plain)",
    )
    args = parser.parse_args()

    # Defaults come first, then whatever the user supplied.
    exclude_patterns = DEFAULT_EXCLUDE + parse_patterns(args.exclude)

    include_patterns = []
    if args.include:
        include_patterns = parse_patterns(args.include)

    files = collect_files(args.patterns, exclude_patterns, include_patterns)
    print(format_output(files, args.format, exclude_patterns, include_patterns, args.patterns))
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|