"""Scrape US federal legislative bulk data from govinfo.gov into a local cache."""

import json
import logging
import sys
import zipfile
from pathlib import Path
from typing import Any

import requests

logger = logging.getLogger(__name__)

GOV_BULK_SITE = "https://www.govinfo.gov/bulkdata"
CACHE_DIR = "cache"

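
# Each bulkdata listing is a JSON document whose 'files' array holds entries
# with 'link', 'justFileName', 'folder', and 'mimeType' keys; these are the
# only fields the scraper relies on.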
def scrape(page: str, cache_dir: Path) -> None:
    """
    Recursively scrape a govinfo bulk data listing.

    Fetches the JSON listing at `page` (caching it as page.json), downloads
    every file it describes into `cache_dir`, and recurses into subfolders.
    """
    # Get the directory listing, from the cache when possible.
    cached_page = cache_dir / "page.json"
    if cached_page.exists():
        with open(cached_page, 'r', encoding='utf-8') as f:
            body = json.load(f)
    else:
        cache_dir.mkdir(parents=True, exist_ok=True)
        try:
            response = requests.get(
                page,
                headers={"User-Agent": "GitLawScraper/1.0", "Accept": "application/json"},
                timeout=30,
            )
            response.raise_for_status()
            if 'application/json' in response.headers.get('Content-Type', ''):
                body = response.json()
                with open(cached_page, 'w', encoding='utf-8') as f:
                    json.dump(body, f)
                print(f"Cached resource: {page}")
            else:
                print(f"Non-JSON response from {page}")
                return
        except requests.RequestException as e:
            print(f"❌ Failed to fetch resource {page}: {e}")
            return

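    # page.json doubles as a resume marker: on re-runs, listings already on
    # disk are read back instead of re-fetched, so an interrupted scrape picks
    # up where it left off.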
    files: list[dict[str, Any]] = body.get('files', [])

    # A fresh cache directory contains only the page.json just written; in
    # that case, prefer a single zip archive over many individual downloads.
    in_new_dir = len(list(cache_dir.glob('*'))) == 1
    if in_new_dir:
        zip_file = next((f for f in files if f.get("mimeType") == "application/zip"), None)
        if zip_file:
            zip_url = zip_file.get('link')
            if zip_url:
                print(f"📦 Downloading zip file: {zip_url}")
                try:
                    # Download the zip archive.
                    response = requests.get(zip_url, headers={"User-Agent": "GitLawScraper/1.0"}, timeout=30)
                    response.raise_for_status()
                    zip_path = cache_dir / zip_file['justFileName']
                    with open(zip_path, 'wb') as f:
                        f.write(response.content)
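                    # Note: response.content buffers the whole archive in
                    # memory; for very large archives, stream=True plus
                    # response.iter_content() would download in chunks.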
                    print(f"✅ Downloaded zip file: {zip_url}")

                    # Unzip the archive alongside the listing.
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(cache_dir)
                    print(f"✅ Unzipped files to {cache_dir}")
                except (requests.RequestException, zipfile.BadZipFile) as e:
                    print(f"❌ Failed to download zip file {zip_file['justFileName']}: {e}")
        else:
            print("No zip file found, continuing with individual files.")

    for file in files:
        # Download non-folder files directly.
        if not file.get("folder", False):
            url = file.get('link')
            if url:
                file_path = cache_dir / file['justFileName']
                if file_path.exists():
                    print(f"✅ File already exists: {file['justFileName']}")
                    continue

                print(f"📥 Downloading file: {file['justFileName']} from {url}")
                try:
                    response = requests.get(url, headers={"User-Agent": "GitLawScraper/1.0"}, timeout=30)
                    response.raise_for_status()
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    print(f"✅ Downloaded file: {file['justFileName']}")
                except requests.RequestException as e:
                    print(f"❌ Failed to download file {file['justFileName']}: {e}")
            continue

        # Recursively scrape folders; their 'link' points at the sub-listing.
        scrape(file['link'], cache_dir / file['justFileName'])


def main():
    print("🚀 Starting scraping process for US Code data...")
    cache_dir = Path(CACHE_DIR)
    cache_dir.mkdir(parents=True, exist_ok=True)

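    # Collections mirrored below (govinfo bulk data names): bill texts
    # (BILLS), bill status records (BILLSTATUS), bill summaries (BILLSUM),
    # public and private laws (PLAW), and the Statutes at Large (STATUTES).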
    try:
        scrape(f"{GOV_BULK_SITE}/json/BILLS", cache_dir / "BILLS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSTATUS", cache_dir / "BILLSTATUS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSUM", cache_dir / "BILLSUM")
        scrape(f"{GOV_BULK_SITE}/json/PLAW", cache_dir / "PLAW")
        scrape(f"{GOV_BULK_SITE}/json/STATUTES", cache_dir / "STATUTES")
    except Exception as e:
        logger.error(f"❌ An error occurred during scraping: {e}")
        sys.exit(1)

    print("🎉 Scraping completed without errors")


if __name__ == "__main__":
    main()
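
# Usage (assuming this file is saved as scrape.py): `python scrape.py` builds
# a ./cache tree mirroring the remote layout; re-running is safe because
# cached listings and already-downloaded files are skipped.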