#!/usr/bin/env python3
"""
USC Git Blame Commit Plan Generator
Analyzes migrated data to create intelligent incremental git commit plans:
1. Compares USC releases to identify section-level changes
2. Maps changes to specific public laws and sponsors
3. Generates optimized commit sequences for proper git blame
4. Creates comprehensive commit plans with rich attribution
5. Validates chronological ordering and flags file conflicts
Architecture: Download → Cache → Migrate → **Plan** → Build
This script handles the fourth step: intelligent git commit planning.
"""
import json
from pathlib import Path
from datetime import datetime, date
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import logging
import difflib
from collections import defaultdict
import hashlib
# Import our models and datastore
from models import Sponsor
from datastore import USCodeDataStore
# Configure logging
logs_dir = Path('logs')
logs_dir.mkdir(exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(logs_dir / 'generate_git_plan.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
@dataclass
class SectionChange:
"""Represents a change to a USC section between releases"""
section_id: str # "42-6A-280g-15"
file_path: str # "Title-42/Chapter-06A/Section-280g-15.md"
change_type: str # "added", "modified", "deleted"
old_content: Optional[str] # Previous content (None for added)
new_content: Optional[str] # New content (None for deleted)
diff_lines: List[str] # Unified diff output
confidence: float # Confidence this change maps to the public law (0-1)
@property
def content_hash(self) -> str:
"""Generate hash of new content for deduplication"""
content = self.new_content or ""
return hashlib.sha256(content.encode()).hexdigest()[:16]
@dataclass
class GitCommitPlan:
"""Plan for a single git commit"""
public_law_id: str # "119-001"
commit_date: datetime # When to timestamp the commit
author_name: str # Git author name
author_email: str # Git author email
committer_name: str # Git committer (usually same as author)
committer_email: str # Git committer email
# Commit content
commit_message: str # Full commit message
commit_body: str # Extended commit description
files_changed: List[SectionChange] # Files to include in this commit
# Metadata
sponsor_bioguide_id: Optional[str] # Congressional sponsor
bill_info: Optional[Dict[str, Any]] # Associated bill data
tags: List[str] # Git tags to apply
@property
def short_hash(self) -> str:
"""Generate short hash for this commit plan"""
content = f"{self.public_law_id}-{self.commit_date}-{len(self.files_changed)}"
return hashlib.sha256(content.encode()).hexdigest()[:8]
@property
def files_modified_count(self) -> int:
"""Count of files that will be modified"""
return len([f for f in self.files_changed if f.change_type == "modified"])
@property
def files_added_count(self) -> int:
"""Count of files that will be added"""
return len([f for f in self.files_changed if f.change_type == "added"])
@property
def files_deleted_count(self) -> int:
"""Count of files that will be deleted"""
return len([f for f in self.files_changed if f.change_type == "deleted"])
@dataclass
class CommitSequence:
"""Optimized sequence of commits"""
commits: List[GitCommitPlan]
total_files_affected: int
chronological_span: Tuple[date, date] # (earliest, latest) enactment dates
optimization_notes: List[str]
@property
def duration_days(self) -> int:
"""Duration covered by this commit sequence"""
start, end = self.chronological_span
return (end - start).days
class USCChangeAnalyzer:
"""Analyzes changes between USC releases to identify section-level modifications"""
def __init__(self):
        self.section_cache = {}  # Reserved for caching parsed sections (not yet used)
def compare_releases(self, old_law_id: str, new_law_id: str,
usc_sections: Dict[str, List[Dict[str, Any]]]) -> List[SectionChange]:
"""
Compare two USC releases to find section-level changes
Args:
old_law_id: Previous public law ID (e.g., "119-001")
new_law_id: Current public law ID (e.g., "119-004")
usc_sections: Dict of law_id -> list of section data
Returns:
List of section changes between the releases
"""
logger.info(f"📊 Comparing USC releases: {old_law_id}{new_law_id}")
old_sections = self._index_sections_by_id(usc_sections.get(old_law_id, []))
new_sections = self._index_sections_by_id(usc_sections.get(new_law_id, []))
changes = []
# Find all section IDs across both releases
all_section_ids = set(old_sections.keys()) | set(new_sections.keys())
for section_id in all_section_ids:
old_section = old_sections.get(section_id)
new_section = new_sections.get(section_id)
change = self._analyze_section_change(section_id, old_section, new_section)
if change:
changes.append(change)
logger.info(f"📊 Found {len(changes)} section changes between releases")
logger.info(f" • Added: {len([c for c in changes if c.change_type == 'added'])}")
logger.info(f" • Modified: {len([c for c in changes if c.change_type == 'modified'])}")
logger.info(f" • Deleted: {len([c for c in changes if c.change_type == 'deleted'])}")
return changes
def _index_sections_by_id(self, sections_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
"""Index sections by their section_id for efficient lookup"""
indexed = {}
for section in sections_data:
section_id = section.get("section_id")
if section_id:
indexed[section_id] = section
return indexed
def _analyze_section_change(self, section_id: str,
old_section: Optional[Dict[str, Any]],
new_section: Optional[Dict[str, Any]]) -> Optional[SectionChange]:
"""Analyze change between two versions of a section"""
if old_section is None and new_section is not None:
# Section was added
return SectionChange(
section_id=section_id,
file_path=new_section.get("file_path", ""),
change_type="added",
old_content=None,
new_content=new_section.get("statutory_text", ""),
diff_lines=[f"+ {line}" for line in new_section.get("statutory_text", "").split('\n')],
confidence=1.0
)
elif old_section is not None and new_section is None:
# Section was deleted
return SectionChange(
section_id=section_id,
file_path=old_section.get("file_path", ""),
change_type="deleted",
old_content=old_section.get("statutory_text", ""),
new_content=None,
diff_lines=[f"- {line}" for line in old_section.get("statutory_text", "").split('\n')],
confidence=1.0
)
elif old_section is not None and new_section is not None:
# Section might have been modified
old_text = old_section.get("statutory_text", "").strip()
new_text = new_section.get("statutory_text", "").strip()
if old_text != new_text:
                # Generate a unified diff; plain splitlines() keeps every line
                # newline-free, consistent with lineterm=""
                diff_lines = list(difflib.unified_diff(
                    old_text.splitlines(),
                    new_text.splitlines(),
                    fromfile=f"old/{section_id}",
                    tofile=f"new/{section_id}",
                    lineterm=""
                ))
# Calculate confidence based on amount of change
confidence = self._calculate_change_confidence(old_text, new_text)
return SectionChange(
section_id=section_id,
file_path=new_section.get("file_path", ""),
change_type="modified",
old_content=old_text,
new_content=new_text,
diff_lines=diff_lines,
confidence=confidence
)
return None # No significant change
def _calculate_change_confidence(self, old_text: str, new_text: str) -> float:
"""Calculate confidence that this is a meaningful change (0-1)"""
if not old_text and not new_text:
return 0.0
# Use sequence matcher to calculate similarity
matcher = difflib.SequenceMatcher(None, old_text, new_text)
similarity = matcher.ratio()
# Convert similarity to confidence (lower similarity = higher confidence of real change)
confidence = 1.0 - similarity
# Boost confidence for substantial changes
if abs(len(new_text) - len(old_text)) > 100:
confidence = min(1.0, confidence + 0.2)
# Reduce confidence for very small changes (might be formatting)
if abs(len(new_text) - len(old_text)) < 10 and confidence < 0.1:
confidence *= 0.5
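        # Worked example: SequenceMatcher(None, "The tax is 5%.", "The tax is 7%.")
        # yields ratio ≈ 0.93, so confidence ≈ 0.07; the length delta is under 10
        # and confidence < 0.1, so it is halved to ≈ 0.04 (likely formatting noise)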
return confidence
class GitCommitPlanner:
"""Creates optimized git commit plans from USC changes and legislative data"""
def __init__(self):
self.datastore = USCodeDataStore()
self.change_analyzer = USCChangeAnalyzer()
# Planning statistics
self.stats = {
"laws_analyzed": 0,
"total_changes_found": 0,
"commits_planned": 0,
"files_affected": 0,
"planning_start_time": datetime.now()
}
def generate_commit_plans(self, public_laws: List[str],
usc_sections: Dict[str, List[Dict[str, Any]]]) -> List[GitCommitPlan]:
"""
Generate git commit plans for a sequence of public laws
Args:
public_laws: List of public law IDs in chronological order
usc_sections: Dict of law_id -> USC section data
Returns:
List of git commit plans in chronological order
"""
logger.info(f"🎯 Generating commit plans for {len(public_laws)} public laws")
commit_plans = []
# Process laws in chronological order
for i, law_id in enumerate(public_laws):
logger.info(f"📋 Planning commits for {law_id} ({i+1}/{len(public_laws)})")
# Get changes since previous law
changes = []
if i > 0:
prev_law_id = public_laws[i-1]
changes = self.change_analyzer.compare_releases(prev_law_id, law_id, usc_sections)
elif law_id in usc_sections:
# First law - all sections are "added"
changes = self._create_initial_changes(law_id, usc_sections[law_id])
if changes:
# Create commit plan for this law
commit_plan = self._create_commit_plan(law_id, changes)
if commit_plan:
commit_plans.append(commit_plan)
self.stats["commits_planned"] += 1
self.stats["files_affected"] += len(changes)
self.stats["laws_analyzed"] += 1
self.stats["total_changes_found"] += len(changes)
logger.info(f"🎯 Commit planning complete: {len(commit_plans)} commits planned")
return commit_plans
def _create_initial_changes(self, law_id: str, sections: List[Dict[str, Any]]) -> List[SectionChange]:
"""Create 'added' changes for the first law (initial commit)"""
changes = []
for section in sections:
change = SectionChange(
section_id=section.get("section_id", ""),
file_path=section.get("file_path", ""),
change_type="added",
old_content=None,
new_content=section.get("statutory_text", ""),
diff_lines=[f"+ {line}" for line in section.get("statutory_text", "").split('\n')],
confidence=1.0
)
changes.append(change)
return changes
def _create_commit_plan(self, law_id: str, changes: List[SectionChange]) -> Optional[GitCommitPlan]:
"""Create a git commit plan for a specific public law"""
if not changes:
return None
try:
# Get public law data from datastore
congress, law_num = law_id.split("-")
public_law = self.datastore.get_public_law(int(congress), int(law_num))
if not public_law:
logger.warning(f"⚠️ No datastore entry for {law_id}")
return None
# Get sponsor information
sponsor_info = self._get_sponsor_info(law_id)
# Generate commit metadata
commit_date = datetime.combine(public_law.enacted_date, datetime.min.time())
author_name = "Unknown Sponsor"
author_email = "unknown@congress.gov"
if sponsor_info:
author_name = sponsor_info.full_name
author_email = sponsor_info.email
# Generate commit message
commit_message = self._generate_commit_message(law_id, public_law, changes)
commit_body = self._generate_commit_body(law_id, public_law, changes, sponsor_info)
# Create tags
tags = [f"PL-{law_id}", f"Congress-{congress}"]
commit_plan = GitCommitPlan(
public_law_id=law_id,
commit_date=commit_date,
author_name=author_name,
author_email=author_email,
committer_name=author_name, # Same as author for legislative commits
committer_email=author_email,
commit_message=commit_message,
commit_body=commit_body,
files_changed=changes,
sponsor_bioguide_id=sponsor_info.bioguide_id if sponsor_info else None,
bill_info=None, # Could be populated from API data if available
tags=tags
)
return commit_plan
except Exception as e:
logger.error(f"❌ Error creating commit plan for {law_id}: {e}")
return None
def _get_sponsor_info(self, law_id: str) -> Optional[Sponsor]:
"""Get sponsor information for a public law"""
# Try to find sponsor from datastore
try:
sponsors = self.datastore.sponsors.list_all()
# For now, return first available sponsor as placeholder
# In production, this would use proper bill->sponsor mapping
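            #
            # A hypothetical version of that mapping (these helper names are
            # assumptions, not part of the current datastore API):
            #   bill = self.datastore.get_bill_for_law(law_id)
            #   return self.datastore.sponsors.get(bill.sponsor_bioguide_id)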
if sponsors:
return list(sponsors.values())[0]
except Exception as e:
logger.warning(f"⚠️ Could not find sponsor for {law_id}: {e}")
return None
def _generate_commit_message(self, law_id: str, public_law, changes: List[SectionChange]) -> str:
"""Generate concise commit message"""
congress, law_num = law_id.split("-")
# Count change types
added = len([c for c in changes if c.change_type == "added"])
modified = len([c for c in changes if c.change_type == "modified"])
deleted = len([c for c in changes if c.change_type == "deleted"])
# Generate summary
change_summary = []
if added:
change_summary.append(f"{added} sections added")
if modified:
change_summary.append(f"{modified} sections modified")
if deleted:
change_summary.append(f"{deleted} sections deleted")
summary = ", ".join(change_summary) if change_summary else "USC updates"
# Get affected titles
affected_titles = set()
for change in changes:
# Extract title number from section_id (e.g., "42-6A-280g-15" -> "42")
parts = change.section_id.split("-")
if parts:
try:
title_num = int(parts[0])
affected_titles.add(title_num)
except ValueError:
pass
titles_str = ""
if affected_titles:
sorted_titles = sorted(affected_titles)
if len(sorted_titles) == 1:
titles_str = f" (Title {sorted_titles[0]})"
elif len(sorted_titles) <= 3:
titles_str = f" (Titles {', '.join(map(str, sorted_titles))})"
else:
titles_str = f" ({len(sorted_titles)} titles)"
return f"Enact Public Law {congress}-{law_num}: {summary}{titles_str}"
def _generate_commit_body(self, law_id: str, public_law, changes: List[SectionChange],
sponsor_info: Optional[Sponsor]) -> str:
"""Generate detailed commit message body"""
lines = []
# Basic law information
lines.append(f"Public Law: {law_id}")
lines.append(f"Enacted: {public_law.enacted_date}")
if sponsor_info:
lines.append(f"Sponsor: {sponsor_info.full_name}")
lines.append(f"Chamber: {sponsor_info.chamber}")
lines.append(f"Party: {sponsor_info.party}")
lines.append("")
# Change summary
lines.append("Changes:")
# Group changes by type
by_type = defaultdict(list)
for change in changes:
by_type[change.change_type].append(change)
for change_type, type_changes in by_type.items():
lines.append(f" {change_type.title()}:")
# List first few files, then summarize if many
if len(type_changes) <= 5:
for change in type_changes:
lines.append(f" - {change.file_path}")
else:
for change in type_changes[:3]:
lines.append(f" - {change.file_path}")
lines.append(f" ... and {len(type_changes) - 3} more files")
lines.append("")
lines.append("📊 Generated with USC Git Blame System")
lines.append("🏛️ Data source: House Office of Law Revision Counsel")
return "\n".join(lines)
def optimize_commit_sequence(self, commit_plans: List[GitCommitPlan]) -> CommitSequence:
"""Optimize the sequence of commits for better git blame and performance"""
logger.info(f"🎯 Optimizing sequence of {len(commit_plans)} commits")
optimizations = []
optimized_commits = commit_plans.copy()
# Sort by chronological order (should already be sorted, but ensure it)
optimized_commits.sort(key=lambda c: c.commit_date)
optimizations.append("Sorted commits chronologically")
        # Detect conflicts (currently counted and logged rather than rewritten --
        # see _resolve_file_conflicts)
        conflict_count = self._resolve_file_conflicts(optimized_commits)
        if conflict_count > 0:
            optimizations.append(f"Detected {conflict_count} file conflicts")
# Calculate statistics
all_files = set()
for commit in optimized_commits:
for change in commit.files_changed:
all_files.add(change.file_path)
# Determine chronological span
dates = [c.commit_date.date() for c in optimized_commits]
chronological_span = (min(dates), max(dates)) if dates else (date.today(), date.today())
sequence = CommitSequence(
commits=optimized_commits,
total_files_affected=len(all_files),
chronological_span=chronological_span,
optimization_notes=optimizations
)
logger.info("🎯 Optimization complete:")
logger.info(f"{len(optimized_commits)} commits over {sequence.duration_days} days")
logger.info(f"{sequence.total_files_affected} unique files affected")
logger.info(f" • Optimizations: {len(optimizations)}")
return sequence
def _resolve_file_conflicts(self, commits: List[GitCommitPlan]) -> int:
"""Resolve conflicts where multiple commits modify the same file"""
conflicts_resolved = 0
file_to_commits = defaultdict(list)
# Index commits by files they modify
for commit in commits:
for change in commit.files_changed:
file_to_commits[change.file_path].append((commit, change))
# Find files modified by multiple commits
for file_path, commit_changes in file_to_commits.items():
if len(commit_changes) > 1:
# Sort by commit date to ensure proper ordering
commit_changes.sort(key=lambda x: x[0].commit_date)
                # Later commits should build on earlier ones; real content-level
                # resolution would require more sophisticated analysis, so for now
                # conflicts are counted and logged rather than rewritten
                conflicts_resolved += 1
                logger.debug(f"📝 File conflict detected: {file_path} ({len(commit_changes)} commits)")
return conflicts_resolved
def save_commit_plans(self, sequence: CommitSequence, output_path: Path) -> None:
"""Save commit plans to JSON file for use by build script"""
logger.info(f"💾 Saving {len(sequence.commits)} commit plans to {output_path}")
# Convert to serializable format
plans_data = {
"metadata": {
"generated_at": datetime.now().isoformat(),
"total_commits": len(sequence.commits),
"total_files_affected": sequence.total_files_affected,
"chronological_span": {
"start": sequence.chronological_span[0].isoformat(),
"end": sequence.chronological_span[1].isoformat()
},
"optimization_notes": sequence.optimization_notes,
"generation_statistics": self.get_planning_statistics()
},
"commits": []
}
for commit in sequence.commits:
commit_data = {
"public_law_id": commit.public_law_id,
"commit_date": commit.commit_date.isoformat(),
"author": {
"name": commit.author_name,
"email": commit.author_email
},
"committer": {
"name": commit.committer_name,
"email": commit.committer_email
},
"message": {
"title": commit.commit_message,
"body": commit.commit_body
},
"files_changed": [
{
"section_id": change.section_id,
"file_path": change.file_path,
"change_type": change.change_type,
"confidence": change.confidence,
"content_hash": change.content_hash,
"diff_stats": {
"lines_added": len([line for line in change.diff_lines if line.startswith('+')]),
"lines_deleted": len([line for line in change.diff_lines if line.startswith('-')])
}
}
for change in commit.files_changed
],
"metadata": {
"sponsor_bioguide_id": commit.sponsor_bioguide_id,
"tags": commit.tags,
"short_hash": commit.short_hash,
"files_stats": {
"added": commit.files_added_count,
"modified": commit.files_modified_count,
"deleted": commit.files_deleted_count
}
}
}
plans_data["commits"].append(commit_data)
# Save to file
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w') as f:
json.dump(plans_data, f, indent=2, default=str)
logger.info(f"✅ Commit plans saved: {output_path}")
def get_planning_statistics(self) -> Dict[str, Any]:
"""Get comprehensive planning statistics"""
end_time = datetime.now()
duration = end_time - self.stats["planning_start_time"]
return {
"planning_duration_seconds": duration.total_seconds(),
"planning_duration_formatted": str(duration),
**self.stats,
"planning_completed_at": end_time.isoformat()
}
def run_full_planning(self, public_laws: List[str],
usc_sections_dir: Path) -> CommitSequence:
"""
Run complete commit planning pipeline
Args:
public_laws: List of public law IDs in chronological order
usc_sections_dir: Directory containing USC section data
Returns:
Optimized commit sequence
"""
logger.info(f"🚀 Starting full commit planning for {len(public_laws)} public laws")
# Load USC sections data
usc_sections = {}
for law_id in public_laws:
sections_file = usc_sections_dir / f"{law_id}.json"
if sections_file.exists():
try:
with open(sections_file, 'r') as f:
data = json.load(f)
usc_sections[law_id] = data.get("sections", [])
except Exception as e:
logger.warning(f"⚠️ Could not load sections for {law_id}: {e}")
logger.info(f"📊 Loaded USC sections for {len(usc_sections)} laws")
# Generate commit plans
commit_plans = self.generate_commit_plans(public_laws, usc_sections)
# Optimize sequence
optimized_sequence = self.optimize_commit_sequence(commit_plans)
logger.info("🎉 Full planning complete!")
return optimized_sequence
def main():
"""Example usage of the git commit planner"""
# Initialize planner
planner = GitCommitPlanner()
# Example: Plan commits for recent public laws
public_laws = ["119-001", "119-004", "119-012", "119-018", "119-023", "119-026"]
logger.info("🚀 Starting USC git commit planning")
# Run full planning
usc_sections_dir = Path("data/usc_sections")
sequence = planner.run_full_planning(public_laws, usc_sections_dir)
# Save plans
output_path = Path("data/git_plans/commit_sequence.json")
planner.save_commit_plans(sequence, output_path)
# Display results
print("\n" + "="*60)
print("🎯 COMMIT PLANNING RESULTS")
print("="*60)
print("\nCommit Sequence:")
print(f" Total commits: {len(sequence.commits)}")
print(f" Files affected: {sequence.total_files_affected}")
print(f" Time span: {sequence.chronological_span[0]} to {sequence.chronological_span[1]}")
print(f" Duration: {sequence.duration_days} days")
print("\nOptimizations Applied:")
for note in sequence.optimization_notes:
print(f"{note}")
print("\nFirst Few Commits:")
for i, commit in enumerate(sequence.commits[:3]):
print(f" {i+1}. {commit.public_law_id}: {commit.commit_message}")
print(f" Date: {commit.commit_date.date()}")
print(f" Files: {len(commit.files_changed)} changed")
print(f" Author: {commit.author_name}")
if len(sequence.commits) > 3:
print(f" ... and {len(sequence.commits) - 3} more commits")
stats = planner.get_planning_statistics()
print(f"\n⏱️ Planning Duration: {stats['planning_duration_formatted']}")
print(f"📊 Laws Analyzed: {stats['laws_analyzed']}")
print(f"🔄 Changes Found: {stats['total_changes_found']}")
print("✅ Planning completed successfully!")
if __name__ == "__main__":
main()