#!/usr/bin/env python3
"""
USC Git Blame Data Migrator

Processes cached raw data from multiple sources into a normalized JSON datastore:

1. Parses House US Code HTML releases using semantic field extraction
2. Normalizes Congress.gov API data with Pydantic validation
3. Cross-references bills to public laws to USC sections
4. Validates data integrity and builds comprehensive indexes
5. Migrates to production-ready normalized datastore

Architecture: Download → Cache → **Migrate** → Plan → Build

This script handles the Migrate step: raw data normalization and validation.
"""

import json
import zipfile
import re
from pathlib import Path
from datetime import datetime, date
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import logging
from html.parser import HTMLParser
import html

# Import our existing models and datastore
from models import Sponsor
from datastore import USCodeDataStore
from download_cache import CacheManager

# Configure logging
logs_dir = Path('logs')
logs_dir.mkdir(exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(logs_dir / 'migrate_to_datastore.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


@dataclass
class USCSection:
    """Represents an individual USC section extracted from HTML"""
    title_num: int                  # 42 (Public Health and Welfare)
    chapter_num: Optional[str]      # "6A" (can have letters)
    section_num: str                # "280g-15" (handles subsection numbering)
    heading: str                    # Clean section title
    statutory_text: str             # Normalized legal text
    source_credit: str              # Original enactment attribution
    amendment_history: Optional[List[str]] = None   # Amendment notes
    cross_references: Optional[List[str]] = None    # References to other sections
    enacted_through: str = ""       # Which public law this version reflects

    def __post_init__(self):
        if self.amendment_history is None:
            self.amendment_history = []
        if self.cross_references is None:
            self.cross_references = []

    @property
    def section_id(self) -> str:
        """Unique identifier for this section"""
        chapter_part = f"-{self.chapter_num}" if self.chapter_num else ""
        return f"{self.title_num}{chapter_part}-{self.section_num}"

    @property
    def file_path(self) -> str:
        """File path for hierarchical git structure"""
        title_name = f"Title-{self.title_num:02d}"
        if self.chapter_num:
            chapter_name = f"Chapter-{self.chapter_num}"
            return f"{title_name}/{chapter_name}/Section-{self.section_num.replace('.', '-')}.md"
        else:
            return f"{title_name}/Section-{self.section_num.replace('.', '-')}.md"


@dataclass
class ParsedBillData:
    """Normalized bill data from Congress.gov API"""
    congress: int
    bill_type: str                      # "hr", "s", etc.
    bill_number: int
    title: str
    sponsor: Optional[Dict[str, Any]]
    cosponsors: List[Dict[str, Any]]
    committees: List[Dict[str, Any]]
    amendments: List[Dict[str, Any]]
    related_bills: List[Dict[str, Any]]
    public_law: Optional[str]           # "119-001" if this bill became a public law
    enacted_date: Optional[date]


class USCHTMLParser(HTMLParser):
    """Parse USC HTML files using semantic field markers"""

    def __init__(self):
        super().__init__()
        self.reset_parser_state()

    def reset_parser_state(self):
        """Reset parser state for new document"""
        self.current_section = None
        self.sections = []
        self.in_statute_field = False
        self.in_sourcecredit_field = False
        self.in_notes_field = False
        self.current_text = ""
        self.current_tag = None
        self.section_data = {}

    def handle_comment(self, data: str):
        """Handle HTML comments that contain semantic information"""
        data = data.strip()

        # Parse itempath comments for section structure
        if data.startswith("itempath:/"):
            self._parse_itempath(data)
        elif data.startswith("expcite:"):
            self._parse_expcite(data)
        elif data.startswith("field-start:"):
            self._handle_field_start(data)
        elif data.startswith("field-end:"):
            self._handle_field_end(data)
        elif data.startswith("AUTHORITIES-LAWS-ENACTED-THROUGH:"):
            self._parse_enacted_through(data)

    def _parse_itempath(self, data: str):
        """Parse itempath to extract section structure"""
        # Examples:
        #   itempath:/010/CHAPTER 1/Sec. 1
        #   itempath:/042/CHAPTER 6A/SUBCHAPTER II/Part A/Sec. 280g-15
        # The greedy middle group skips intermediate levels (SUBCHAPTER,
        # Part, etc.) so the trailing "Sec." segment is always captured;
        # a lazy middle with an optional Sec group would silently drop
        # sections nested below the chapter level.
        path_match = re.search(r"itempath:/(\d+)(?:/CHAPTER\s+([^/]+))?(?:/[^/]*)*/Sec\.\s+(.+)", data)
        if path_match:
            title_num = int(path_match.group(1))
            chapter_num = path_match.group(2)
            section_num = path_match.group(3)

            if section_num:  # This is a section
                self.section_data = {
                    "title_num": title_num,
                    "chapter_num": chapter_num,
                    "section_num": section_num.strip(),
                    "heading": "",
                    "statutory_text": "",
                    "source_credit": "",
                    "amendment_history": [],
                    "cross_references": [],
                    "enacted_through": ""
                }
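
    # Worked example for the regex above (second sample path):
    #   "itempath:/042/CHAPTER 6A/SUBCHAPTER II/Part A/Sec. 280g-15"
    #   -> group(1)="042" (title 42), group(2)="6A", group(3)="280g-15"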

    def _parse_expcite(self, data: str):
        """Parse expcite for additional context"""
        # Example: expcite:TITLE 42-PUBLIC HEALTH AND WELFARE!@!CHAPTER 6A-PUBLIC HEALTH SERVICE!@!Sec. 280g-15
        pass  # Additional parsing if needed

    def _parse_enacted_through(self, data: str):
        """Parse enacted-through info"""
        # Example: AUTHORITIES-LAWS-ENACTED-THROUGH:119-1 (01/29/2025)
        match = re.search(r"AUTHORITIES-LAWS-ENACTED-THROUGH:(\d+-\d+)", data)
        if match and self.section_data:
            self.section_data["enacted_through"] = match.group(1)

    def _handle_field_start(self, data: str):
        """Handle field start markers"""
        if "statute" in data:
            self.in_statute_field = True
            self.current_text = ""
        elif "sourcecredit" in data:
            self.in_sourcecredit_field = True
            self.current_text = ""
        elif "notes" in data or "amendment-note" in data:
            self.in_notes_field = True
            self.current_text = ""

    def _handle_field_end(self, data: str):
        """Handle field end markers"""
        if "statute" in data and self.in_statute_field:
            if self.section_data:
                self.section_data["statutory_text"] = self._clean_text(self.current_text)
            self.in_statute_field = False
        elif "sourcecredit" in data and self.in_sourcecredit_field:
            if self.section_data:
                self.section_data["source_credit"] = self._clean_text(self.current_text)
            self.in_sourcecredit_field = False
        elif ("notes" in data or "amendment-note" in data) and self.in_notes_field:
            if self.section_data and self.current_text.strip():
                self.section_data["amendment_history"].append(self._clean_text(self.current_text))
            self.in_notes_field = False

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Handle HTML start tags"""
        self.current_tag = tag

        # Extract section headings from section-head class
        if tag == "h3":
            for attr_name, attr_value in attrs:
                if attr_name == "class" and "section-head" in attr_value:
                    self.current_text = ""

    def handle_endtag(self, tag: str):
        """Handle HTML end tags"""
        if tag == "h3" and self.section_data and self.current_text.strip():
            # Extract section heading
            heading_text = self._clean_text(self.current_text)
            # Remove section number prefix (e.g., "§1. " -> "")
            heading_clean = re.sub(r"^§\s*[\d\w\-\.]+\.\s*", "", heading_text)
            self.section_data["heading"] = heading_clean

        # Finalize the current section once all required fields are present.
        # This runs on every end tag, not just </h3>: the statute field
        # closes after the heading, so the check must fire on later tags too.
        if (self.section_data.get("title_num") and
                self.section_data.get("section_num") and
                self.section_data.get("statutory_text")):
            section = USCSection(**self.section_data)
            self.sections.append(section)
            self.section_data = {}

        self.current_tag = None

    def handle_data(self, data: str):
        """Handle text content"""
        if (self.in_statute_field or self.in_sourcecredit_field or
                self.in_notes_field or self.current_tag == "h3"):
            self.current_text += data

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text content"""
        # Decode HTML entities
        text = html.unescape(text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text.strip())

        # Convert common typographic entities to proper unicode (covers
        # double-escaped input that the first unescape pass leaves behind)
        text = text.replace("&mdash;", "\u2014")   # em dash
        text = text.replace("&ldquo;", "\u201c")   # left double quote
        text = text.replace("&rdquo;", "\u201d")   # right double quote
        text = text.replace("&nbsp;", " ")         # non-breaking space

        return text
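
    # Example (assumed input): '  §1.&nbsp;&nbsp;Definitions ' cleans to
    # '§1. Definitions' (entities decoded, whitespace collapsed).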

    def parse_html_file(self, html_content: str) -> List[USCSection]:
        """Parse complete HTML file and return extracted sections"""
        self.reset_parser_state()
        self.feed(html_content)
        return self.sections
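
# Minimal usage sketch for the parser (hypothetical file name; any .htm file
# extracted from a House release ZIP would do):
#   parser = USCHTMLParser()
#   sections = parser.parse_html_file(Path("usc42.htm").read_text(encoding="utf-8"))
#   for s in sections[:3]:
#       print(s.section_id, "->", s.heading)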


class DataMigrator:
    """
    Migrates raw cached data into normalized JSON datastore

    Processes:
    - House USC HTML releases -> USCSection objects
    - Congress.gov API data -> Normalized bill data
    - Cross-references and validation
    - Integration with existing datastore
    """

    def __init__(self, cache_dir: Path = Path("download_cache"), force: bool = False):
        self.cache_manager = CacheManager(cache_dir)
        self.datastore = USCodeDataStore()
        self.html_parser = USCHTMLParser()
        self.force = force  # Force re-migration even if output exists

        # Migration statistics
        self.stats = {
            "html_files_processed": 0,
            "usc_sections_extracted": 0,
            "api_bills_processed": 0,
            "cross_references_resolved": 0,
            "validation_errors": 0,
            "files_skipped": 0,
            "migration_start_time": datetime.now()
        }

    def migrate_house_html_data(self, public_laws: List[str]) -> Dict[str, List[USCSection]]:
        """
        Migrate House USC HTML releases to structured section data

        Args:
            public_laws: List of public law IDs (e.g., ["119-001", "119-004"])

        Returns:
            Dict mapping public law -> list of USC sections
        """
        logger.info(f"🔄 Migrating House HTML data for {len(public_laws)} public laws")

        all_sections = {}
        metadata_path = Path("data/usc_sections")

        for law_id in public_laws:
            # Check if output already exists (idempotency)
            sections_file = metadata_path / f"{law_id}.json"
            if sections_file.exists() and not self.force:
                logger.info(f"✅ Skipping HTML migration for {law_id} - output exists")
                self.stats["files_skipped"] += 1
                # Load existing sections for the return value. Note that
                # section_id and file_path are derived properties, so they
                # are not passed to the constructor even though they appear
                # in the saved JSON.
                try:
                    with open(sections_file, 'r') as f:
                        existing_data = json.load(f)
                    # Convert back to USCSection objects for consistency
                    sections = []
                    for section_data in existing_data.get('sections', []):
                        section = USCSection(
                            title_num=section_data['title_num'],
                            chapter_num=section_data['chapter_num'],
                            section_num=section_data['section_num'],
                            heading=section_data['heading'],
                            statutory_text=section_data['statutory_text'],
                            source_credit=section_data['source_credit'],
                            amendment_history=section_data['amendment_history'],
                            cross_references=section_data['cross_references'],
                            enacted_through=section_data['enacted_through']
                        )
                        sections.append(section)
                    all_sections[law_id] = sections
                except Exception as e:
                    logger.warning(f"⚠️ Error loading existing sections for {law_id}: {e}")
                # Skip re-processing regardless of whether the reload worked
                continue

            congress, law_num = law_id.split("-")
            cache_key = f"house_usc_{congress}_{law_num}"

            if not self.cache_manager.is_cached(cache_key):
                logger.warning(f"⚠️ No cached HTML data for {law_id}")
                continue

            zip_path = self.cache_manager.get_cached_path(cache_key)
            sections = self._extract_sections_from_zip(zip_path, law_id)

            if sections:
                all_sections[law_id] = sections
                self.stats["usc_sections_extracted"] += len(sections)
                logger.info(f"✅ Extracted {len(sections)} sections from {law_id}")
            else:
                logger.warning(f"⚠️ No sections extracted from {law_id}")

        self.stats["html_files_processed"] = len(all_sections)
        logger.info(f"📊 HTML migration complete: {self.stats['usc_sections_extracted']} total sections")

        return all_sections

    def _extract_sections_from_zip(self, zip_path: Path, law_id: str) -> List[USCSection]:
        """Extract USC sections from downloaded ZIP file"""
        sections = []

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_file:
                html_files = [name for name in zip_file.namelist() if name.endswith('.htm')]

                for html_file in html_files:
                    html_content = zip_file.read(html_file).decode('utf-8', errors='ignore')
                    file_sections = self.html_parser.parse_html_file(html_content)

                    # Set enacted_through for all sections from this release
                    for section in file_sections:
                        if not section.enacted_through:
                            section.enacted_through = law_id

                    sections.extend(file_sections)

        except Exception as e:
            logger.error(f"❌ Error extracting sections from {zip_path}: {e}")

        return sections

    def migrate_congress_api_data(self, public_laws: List[str]) -> Dict[str, ParsedBillData]:
        """
        Migrate Congress.gov API data to normalized bill structures

        Args:
            public_laws: List of public law IDs

        Returns:
            Dict mapping public law -> normalized bill data
        """
        logger.info(f"🔄 Migrating Congress.gov API data for {len(public_laws)} public laws")

        normalized_bills = {}

        for law_id in public_laws:
            congress, law_num = law_id.split("-")

            # Look for cached bill details
            search_cache_key = f"bill_search_{congress}_{law_num.zfill(3)}"

            if not self.cache_manager.is_cached(search_cache_key):
                logger.warning(f"⚠️ No cached API data for {law_id}")
                continue

            # Load bill search results
            search_path = self.cache_manager.get_cached_path(search_cache_key)
            try:
                with open(search_path, 'r') as f:
                    bill_info = json.load(f)

                # Load full bill details if available
                bill_type = bill_info.get('bill_type', '').lower()
                bill_number = bill_info.get('bill_number')

                if bill_type and bill_number:
                    details_cache_key = f"bill_details_{congress}_{bill_type}_{bill_number}"

                    if self.cache_manager.is_cached(details_cache_key):
                        details_path = self.cache_manager.get_cached_path(details_cache_key)
                        with open(details_path, 'r') as f:
                            bill_details = json.load(f)

                        # Normalize the bill data
                        normalized_bill = self._normalize_bill_data(bill_details, law_id)
                        normalized_bills[law_id] = normalized_bill

                        self.stats["api_bills_processed"] += 1
                        logger.info(f"✅ Normalized API data for {law_id}")

            except Exception as e:
                logger.error(f"❌ Error processing API data for {law_id}: {e}")
                self.stats["validation_errors"] += 1

        logger.info(f"📊 API migration complete: {len(normalized_bills)} bills normalized")
        return normalized_bills

    def _normalize_bill_data(self, bill_details: Dict[str, Any], law_id: str) -> ParsedBillData:
        """Normalize raw bill data from Congress.gov API"""

        basic_info = bill_details.get('details', {})

        # Extract basic bill information
        congress = basic_info.get('congress', 0)
        bill_type = basic_info.get('type', '').lower()
        bill_number = basic_info.get('number', 0)
        title = basic_info.get('title', '')

        # Extract sponsor information (first entry is the primary sponsor)
        sponsor = None
        sponsor_data = basic_info.get('sponsors', [])
        if sponsor_data:
            sponsor = sponsor_data[0]

        # Extract cosponsors, committees, amendments, and related bills
        cosponsors = bill_details.get('cosponsors', [])
        committees = bill_details.get('committees', [])
        amendments = bill_details.get('amendments', [])
        related_bills = bill_details.get('related_bills', [])

        # Extract enactment information
        enacted_date = None
        public_law = law_id

        # Try to parse enacted date from basic info
        if 'becamelaw' in basic_info:
            became_law = basic_info['becamelaw']
            if isinstance(became_law, str):
                try:
                    enacted_date = datetime.strptime(became_law, '%Y-%m-%d').date()
                except ValueError:
                    # Fall back to the remaining date formats
                    for date_format in ['%m/%d/%Y', '%B %d, %Y']:
                        try:
                            enacted_date = datetime.strptime(became_law, date_format).date()
                            break
                        except ValueError:
                            continue

        return ParsedBillData(
            congress=congress,
            bill_type=bill_type,
            bill_number=bill_number,
            title=title,
            sponsor=sponsor,
            cosponsors=cosponsors,
            committees=committees,
            amendments=amendments,
            related_bills=related_bills,
            public_law=public_law,
            enacted_date=enacted_date
        )
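
    # Example of the date fallback (hypothetical payload): a "becamelaw"
    # value of "January 29, 2025" fails '%Y-%m-%d' and '%m/%d/%Y' but
    # parses with '%B %d, %Y' -> date(2025, 1, 29).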

    def cross_reference_and_validate(self,
                                     usc_sections: Dict[str, List[USCSection]],
                                     bill_data: Dict[str, ParsedBillData]) -> Dict[str, Any]:
        """
        Cross-reference USC sections with bill data and validate relationships

        Args:
            usc_sections: Dict of public law -> USC sections
            bill_data: Dict of public law -> normalized bill data

        Returns:
            Dict with validation results and cross-reference mappings
        """
        logger.info("🔄 Cross-referencing and validating data relationships")

        all_laws = set(usc_sections) | set(bill_data)

        validation_results = {
            "total_laws_processed": len(all_laws),
            "laws_with_both_html_and_api": 0,
            "laws_missing_html": [],
            "laws_missing_api": [],
            "section_title_distribution": {},
            "sponsor_attribution_success": 0,
            "validation_errors": []
        }

        for law_id in all_laws:
            has_html = law_id in usc_sections
            has_api = law_id in bill_data

            if has_html and has_api:
                validation_results["laws_with_both_html_and_api"] += 1

                # Cross-reference sponsor data
                sections = usc_sections[law_id]
                bill = bill_data[law_id]

                if bill.sponsor:
                    validation_results["sponsor_attribution_success"] += 1

                # Track section title distribution
                for section in sections:
                    title_key = f"Title-{section.title_num}"
                    validation_results["section_title_distribution"][title_key] = \
                        validation_results["section_title_distribution"].get(title_key, 0) + 1

            elif not has_html:
                validation_results["laws_missing_html"].append(law_id)
            elif not has_api:
                validation_results["laws_missing_api"].append(law_id)

        # Validate USC section data quality
        total_sections = sum(len(sections) for sections in usc_sections.values())
        sections_with_text = sum(1 for sections in usc_sections.values()
                                 for section in sections if section.statutory_text.strip())

        validation_results.update({
            "total_sections_extracted": total_sections,
            "sections_with_statutory_text": sections_with_text,
            "text_extraction_rate": sections_with_text / total_sections if total_sections > 0 else 0
        })

        self.stats["cross_references_resolved"] = validation_results["laws_with_both_html_and_api"]

        logger.info("📊 Cross-reference complete:")
        logger.info(f"   • {validation_results['laws_with_both_html_and_api']} laws with complete data")
        logger.info(f"   • {len(validation_results['laws_missing_html'])} laws missing HTML")
        logger.info(f"   • {len(validation_results['laws_missing_api'])} laws missing API data")
        logger.info(f"   • {validation_results['text_extraction_rate']:.2%} text extraction success rate")

        return validation_results

    def integrate_with_datastore(self,
                                 usc_sections: Dict[str, List[USCSection]],
                                 bill_data: Dict[str, ParsedBillData],
                                 validation_results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Integrate migrated data with existing datastore

        Args:
            usc_sections: Extracted USC sections
            bill_data: Normalized bill data
            validation_results: Cross-reference validation results

        Returns:
            Integration statistics
        """
        logger.info("🔄 Integrating migrated data with existing datastore")

        integration_stats = {
            "existing_laws_in_datastore": self.datastore.public_laws.count(),
            "new_sections_added": 0,
            "enhanced_laws_with_api_data": 0,
            "sponsor_profiles_created": 0,
            "integration_errors": []
        }

        # Create sponsor profiles from bill data
        unique_sponsors = set()
        for bill in bill_data.values():
            if bill.sponsor and 'bioguideId' in bill.sponsor:
                bioguide_id = bill.sponsor['bioguideId']
                if bioguide_id not in unique_sponsors:
                    try:
                        sponsor = self._create_sponsor_from_api_data(bill.sponsor)
                        if sponsor:
                            self.datastore.sponsors.save(bioguide_id, sponsor)
                            unique_sponsors.add(bioguide_id)
                            integration_stats["sponsor_profiles_created"] += 1
                    except Exception as e:
                        integration_stats["integration_errors"].append(f"Sponsor creation error: {e}")

        # Save USC sections as metadata for future git processing
        sections_metadata = {}
        for law_id, sections in usc_sections.items():
            sections_data = []
            for section in sections:
                sections_data.append({
                    "section_id": section.section_id,
                    "file_path": section.file_path,
                    "title_num": section.title_num,
                    "chapter_num": section.chapter_num,
                    "section_num": section.section_num,
                    "heading": section.heading,
                    "statutory_text": section.statutory_text,
                    "source_credit": section.source_credit,
                    "amendment_history": section.amendment_history,
                    "cross_references": section.cross_references,
                    "enacted_through": section.enacted_through
                })

            sections_metadata[law_id] = {
                "public_law": law_id,
                "sections": sections_data,
                "extracted_at": datetime.now().isoformat(),
                "section_count": len(sections_data)
            }

            integration_stats["new_sections_added"] += len(sections_data)

        # Save sections metadata to datastore
        try:
            metadata_path = Path("data/usc_sections")
            metadata_path.mkdir(parents=True, exist_ok=True)

            for law_id, metadata in sections_metadata.items():
                sections_file = metadata_path / f"{law_id}.json"

                # Skip if file already exists and not forcing re-migration
                if sections_file.exists() and not self.force:
                    logger.info(f"✅ Skipping {law_id} - sections file already exists")
                    self.stats["files_skipped"] += 1
                    continue

                # Record existence before the write so the log message below
                # can distinguish a forced overwrite from a new file
                already_existed = sections_file.exists()
                with open(sections_file, 'w') as f:
                    json.dump(metadata, f, indent=2, default=str)

                if already_existed:
                    logger.info(f"🔄 Force-updated sections for {law_id}")
                else:
                    logger.info(f"💾 Created sections file for {law_id}")

        except Exception as e:
            integration_stats["integration_errors"].append(f"Sections metadata save error: {e}")

        # Update existing public law records with enhanced API data
        for law_id, bill in bill_data.items():
            congress, law_num = law_id.split("-")

            try:
                # Try to get existing public law record
                existing_law = self.datastore.get_public_law(int(congress), int(law_num))

                if existing_law and bill.enacted_date:
                    # Update with more accurate enacted date if available
                    if existing_law.enacted_date != bill.enacted_date:
                        existing_law.enacted_date = bill.enacted_date
                        self.datastore.public_laws.save(f"{congress}-{law_num.zfill(3)}", existing_law)
                        integration_stats["enhanced_laws_with_api_data"] += 1

            except Exception as e:
                integration_stats["integration_errors"].append(f"Law update error for {law_id}: {e}")

        logger.info("📊 Integration complete:")
        logger.info(f"   • {integration_stats['new_sections_added']} USC sections saved")
        logger.info(f"   • {integration_stats['sponsor_profiles_created']} sponsor profiles created")
        logger.info(f"   • {integration_stats['enhanced_laws_with_api_data']} laws enhanced with API data")

        return integration_stats
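
    # On-disk shape written above (abridged, hypothetical values):
    #   data/usc_sections/119-001.json
    #   {
    #     "public_law": "119-001",
    #     "sections": [{"section_id": "42-6A-280g-15", "file_path": "...", ...}],
    #     "extracted_at": "2025-01-29T12:00:00",
    #     "section_count": 1
    #   }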

    def _create_sponsor_from_api_data(self, sponsor_data: Dict[str, Any]) -> Optional[Sponsor]:
        """Create Sponsor object from Congress.gov API data"""

        try:
            bioguide_id = sponsor_data.get('bioguideId', '')
            if not bioguide_id:
                return None

            # Extract basic information
            first_name = sponsor_data.get('firstName', '')
            last_name = sponsor_data.get('lastName', '')
            party = sponsor_data.get('party', '')
            state = sponsor_data.get('state', '')

            # Determine chamber and title
            chamber = "house"  # Default
            title = "Representative"

            if 'chamber' in sponsor_data:
                chamber_name = sponsor_data['chamber'].lower()
                if 'senate' in chamber_name:
                    chamber = "senate"
                    title = "Senator"

            # Map party to enum value
            from models import PoliticalParty, CongressionalChamber

            party_enum = PoliticalParty.UNKNOWN
            if party == "D":
                party_enum = PoliticalParty.DEMOCRATIC
            elif party == "R":
                party_enum = PoliticalParty.REPUBLICAN
            elif party == "I":
                party_enum = PoliticalParty.INDEPENDENT

            chamber_enum = CongressionalChamber.HOUSE
            if chamber == "senate":
                chamber_enum = CongressionalChamber.SENATE

            # Parse district number (coerce to str first, since the API may
            # return it as an int or a string)
            district_num = None
            district_str = str(sponsor_data.get('district', '') or '')
            if district_str.isdigit():
                district_num = int(district_str)

            # Create sponsor object
            sponsor = Sponsor(
                bioguide_id=bioguide_id,
                title=title,
                first_name=first_name,
                last_name=last_name,
                full_name=f"{first_name} {last_name}".strip(),
                party=party_enum,
                state=state,
                district=district_num,
                chamber=chamber_enum
            )

            return sponsor

        except Exception as e:
            logger.error(f"❌ Error creating sponsor from API data: {e}")
            return None
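
    # Example mapping (hypothetical payload):
    #   {"bioguideId": "A000000", "firstName": "Jane", "lastName": "Doe",
    #    "party": "D", "state": "CA", "district": "12", "chamber": "House"}
    #   -> Sponsor(title="Representative", party=PoliticalParty.DEMOCRATIC,
    #              chamber=CongressionalChamber.HOUSE, district=12, ...)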

    def get_migration_statistics(self) -> Dict[str, Any]:
        """Get comprehensive migration statistics"""

        end_time = datetime.now()
        duration = end_time - self.stats["migration_start_time"]

        return {
            "migration_duration_seconds": duration.total_seconds(),
            "migration_duration_formatted": str(duration),
            **self.stats,
            "migration_completed_at": end_time.isoformat()
        }

    def run_full_migration(self, public_laws: List[str]) -> Dict[str, Any]:
        """
        Run complete migration pipeline

        Args:
            public_laws: List of public law IDs to migrate

        Returns:
            Complete migration results with statistics
        """
        logger.info(f"🚀 Starting full migration for {len(public_laws)} public laws")

        results = {
            "public_laws_requested": public_laws,
            "migration_phases": {}
        }

        # Phase 1: Migrate House HTML data
        logger.info("📋 Phase 1: House HTML Data Migration")
        usc_sections = self.migrate_house_html_data(public_laws)
        results["migration_phases"]["html_migration"] = {
            "laws_processed": len(usc_sections),
            "sections_extracted": sum(len(sections) for sections in usc_sections.values())
        }

        # Phase 2: Migrate Congress.gov API data
        logger.info("📋 Phase 2: Congress.gov API Data Migration")
        bill_data = self.migrate_congress_api_data(public_laws)
        results["migration_phases"]["api_migration"] = {
            "bills_processed": len(bill_data)
        }

        # Phase 3: Cross-reference and validate
        logger.info("📋 Phase 3: Cross-Reference and Validation")
        validation_results = self.cross_reference_and_validate(usc_sections, bill_data)
        results["migration_phases"]["validation"] = validation_results

        # Phase 4: Integrate with datastore
        logger.info("📋 Phase 4: Datastore Integration")
        integration_results = self.integrate_with_datastore(usc_sections, bill_data, validation_results)
        results["migration_phases"]["integration"] = integration_results

        # Final statistics
        migration_stats = self.get_migration_statistics()
        results["migration_statistics"] = migration_stats

        logger.info("🎉 Full migration complete!")
        logger.info("📊 Summary:")
        logger.info(f"   • Duration: {migration_stats['migration_duration_formatted']}")
        logger.info(f"   • HTML files: {migration_stats['html_files_processed']}")
        logger.info(f"   • USC sections: {migration_stats['usc_sections_extracted']}")
        logger.info(f"   • API bills: {migration_stats['api_bills_processed']}")
        logger.info(f"   • Cross-references: {migration_stats['cross_references_resolved']}")

        return results


def main():
    """Example usage of the data migrator"""

    # Initialize migrator
    migrator = DataMigrator()

    # Example: Migrate recent public laws
    public_laws = ["119-001", "119-004", "119-012", "119-018", "119-023", "119-026"]

    logger.info("🚀 Starting USC data migration process")

    # Run full migration
    results = migrator.run_full_migration(public_laws)

    # Display results
    print("\n" + "=" * 60)
    print("📊 MIGRATION RESULTS")
    print("=" * 60)

    for phase_name, phase_results in results["migration_phases"].items():
        print(f"\n{phase_name.upper()}:")
        for key, value in phase_results.items():
            if isinstance(value, list) and len(value) > 10:
                print(f"  {key}: {len(value)} items")
            elif isinstance(value, float):
                print(f"  {key}: {value:.2%}" if "rate" in key else f"  {key}: {value:.2f}")
            else:
                print(f"  {key}: {value}")

    stats = results["migration_statistics"]
    print(f"\n⏱️ Total Duration: {stats['migration_duration_formatted']}")
    print("✅ Migration completed successfully!")


if __name__ == "__main__":
    main()