import json
import logging
import sys
import zipfile
from pathlib import Path
from typing import Any

import requests

logger = logging.getLogger(__name__)

GOV_BULK_SITE = "https://www.govinfo.gov/bulkdata"
CACHE_DIR = "cache"
USER_AGENT = "GitLawScraper/1.0"


def scrape(page: str, cache_dir: Path) -> None:
    """Recursively scrape a govinfo bulk data JSON listing into cache_dir.

    Fetches the directory listing at `page` (cached as page.json). For a fresh
    directory it prefers a single zip archive when one is offered; otherwise it
    downloads individual files and recurses into subfolders.
    """
    # Fetch the directory listing, reusing the cached copy if present.
    cached_page = cache_dir / "page.json"
    if cached_page.exists():
        with open(cached_page, 'r', encoding='utf-8') as f:
            body = json.load(f)
    else:
        cache_dir.mkdir(parents=True, exist_ok=True)
        try:
            response = requests.get(
                page,
                headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
                timeout=30,
            )
            response.raise_for_status()
            if 'application/json' in response.headers.get('Content-Type', ''):
                body = response.json()
                with open(cached_page, 'w', encoding='utf-8') as f:
                    json.dump(body, f)
            else:
                print(f"Non-JSON response from {page}")
                return
            print(f"Cached resource: {page}")
        except requests.RequestException as e:
            print(f"❌ Failed to fetch resource {page}: {e}")
            return

    files: list[dict[str, Any]] = body.get('files', [])

    # If this directory holds only the listing we just cached, look for a zip
    # archive so we can fetch everything in one request.
    in_new_dir = len(list(cache_dir.glob('*'))) == 1
    if in_new_dir:
        zip_file = next((f for f in files if f.get("mimeType") == "application/zip"), None)
        if zip_file:
            zip_url = zip_file.get('link')
            if zip_url:
                print(f"📦 Downloading zip file: {zip_url}")
                try:
                    # Download the zip archive.
                    response = requests.get(zip_url, headers={"User-Agent": USER_AGENT}, timeout=30)
                    response.raise_for_status()
                    zip_path = cache_dir / zip_file['justFileName']
                    with open(zip_path, 'wb') as f:
                        f.write(response.content)
                    print(f"✅ Downloaded zip file: {zip_url}")
                    # Extract the archive alongside the listing.
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(cache_dir)
                    print(f"✅ Unzipped files to {cache_dir}")
                except requests.RequestException as e:
                    print(f"❌ Failed to download zip file {zip_file['justFileName']}: {e}")
        else:
            print("No zip file found, continuing with individual files.")

    for file in files:
        # Download non-folder files directly; never recurse into them.
        if not file.get("folder", False):
            url = file.get('link')
            if url:
                file_path = cache_dir / file['justFileName']
                if file_path.exists():
                    print(f"✅ File already exists: {file['justFileName']}")
                    continue
                print(f"📥 Downloading file: {file['justFileName']} from {url}")
                try:
                    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=30)
                    response.raise_for_status()
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    print(f"✅ Downloaded file: {file['justFileName']}")
                except requests.RequestException as e:
                    print(f"❌ Failed to download file {file['justFileName']}: {e}")
            continue
        # Recursively scrape subfolders.
        scrape(file['link'], cache_dir / file['justFileName'])


def main():
    logging.basicConfig(level=logging.INFO)
    print("🚀 Starting scraping process for govinfo bulk data...")
    cache_dir = Path(CACHE_DIR)
    cache_dir.mkdir(parents=True, exist_ok=True)
    try:
        scrape(f"{GOV_BULK_SITE}/json/BILLS", cache_dir / "BILLS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSTATUS", cache_dir / "BILLSTATUS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSUM", cache_dir / "BILLSUM")
        scrape(f"{GOV_BULK_SITE}/json/PLAW", cache_dir / "PLAW")
        scrape(f"{GOV_BULK_SITE}/json/STATUTES", cache_dir / "STATUTES")
    except Exception as e:
        logger.error(f"❌ An error occurred during scraping: {e}")
        sys.exit(1)
    print("🎉 Scraping completed without errors")


if __name__ == "__main__":
    main()
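
# Usage note (a sketch; the filename scrape_govinfo.py is an assumption, and
# the `requests` package must be installed):
#
#     pip install requests
#     python scrape_govinfo.py
#
# Each collection is mirrored under cache/<COLLECTION>/, and cached page.json
# listings plus the file-exists check mean an interrupted run can be resumed
# by simply re-running the script. To mirror a single collection instead of
# all five, call scrape() directly, e.g.:
#
#     scrape(f"{GOV_BULK_SITE}/json/PLAW", Path(CACHE_DIR) / "PLAW")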