git-law/download_cache.py

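"""Download and cache govinfo bulk data collections (BILLS, BILLSTATUS,
BILLSUM, PLAW, STATUTES) into a local cache/ directory tree."""
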
import json
import logging
import sys
import zipfile
from pathlib import Path
from typing import Any

import requests

logger = logging.getLogger(__name__)

# Base URL of govinfo's bulk data repository.
GOV_BULK_SITE = "https://www.govinfo.gov/bulkdata"
CACHE_DIR = "cache"


def scrape(page: str, cache_dir: Path):
    """
    Recursively mirror a govinfo bulk-data JSON listing into cache_dir.

    Downloads every file referenced by the listing at `page` and recurses
    into any subfolders, caching each directory listing as page.json.
    """
    # Load the directory listing, preferring the cached copy.
    cached_page = cache_dir / "page.json"
    if cached_page.exists():
        with open(cached_page, 'r', encoding='utf-8') as f:
            body = json.load(f)
    else:
        cache_dir.mkdir(parents=True, exist_ok=True)
        try:
            response = requests.get(
                page,
                headers={"User-Agent": "GitLawScraper/1.0", "Accept": "application/json"},
                timeout=30,
            )
            response.raise_for_status()
            if 'application/json' in response.headers.get('Content-Type', ''):
                body = response.json()
                with open(cached_page, 'w', encoding='utf-8') as f:
                    json.dump(body, f)
                print(f"Cached resource: {page}")
            else:
                print(f"Non-JSON response from {page}")
                return
        except requests.RequestException as e:
            print(f"❌ Failed to fetch resource {page}: {e}")
            return
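    # Each listing entry is expected to carry 'link', 'justFileName',
    # 'folder', and 'mimeType' fields (govinfo bulk-data listing format).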
    files: list[dict[str, Any]] = body.get('files', [])

    # If page.json is the only entry, this directory was freshly created;
    # prefer a single zip archive over fetching files one at a time.
    in_new_dir = len(list(cache_dir.glob('*'))) == 1
    if in_new_dir:
        zip_file = next((f for f in files if f.get("mimeType") == "application/zip"), None)
        if zip_file:
            zip_url = zip_file.get('link')
            if zip_url:
                print(f"📦 Downloading zip file: {zip_url}")
                try:
                    # Stream the archive to disk; bulk-data zips can be large.
                    response = requests.get(
                        zip_url,
                        headers={"User-Agent": "GitLawScraper/1.0"},
                        timeout=30,
                        stream=True,
                    )
                    response.raise_for_status()
                    zip_path = cache_dir / zip_file['justFileName']
                    with open(zip_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=1 << 20):
                            f.write(chunk)
                    print(f"✅ Downloaded zip file: {zip_url}")
                    # Unpack the archive alongside it.
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(cache_dir)
                    print(f"✅ Unzipped files to {cache_dir}")
                except requests.RequestException as e:
                    print(f"❌ Failed to download zip file {zip_file['justFileName']}: {e}")
        else:
            print("No zip file found, continuing with individual files.")
    for file in files:
        # Download non-folder files directly.
        if not file.get("folder", False):
            url = file.get('link')
            if url:
                file_path = cache_dir / file['justFileName']
                if file_path.exists():
                    print(f"✅ File already exists: {file['justFileName']}")
                    continue
                print(f"📥 Downloading file: {file['justFileName']} from {url}")
                try:
                    response = requests.get(
                        url,
                        headers={"User-Agent": "GitLawScraper/1.0"},
                        timeout=30,
                    )
                    response.raise_for_status()
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    print(f"✅ Downloaded file: {file['justFileName']}")
                except requests.RequestException as e:
                    print(f"❌ Failed to download file {file['justFileName']}: {e}")
            continue
        # Recursively scrape folders.
        scrape(file['link'], cache_dir / file['justFileName'])

def main():
    logging.basicConfig(level=logging.INFO)
    print("🚀 Starting scraping process for govinfo bulk data...")
    cache_dir = Path(CACHE_DIR)
    cache_dir.mkdir(parents=True, exist_ok=True)
    try:
        scrape(f"{GOV_BULK_SITE}/json/BILLS", cache_dir / "BILLS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSTATUS", cache_dir / "BILLSTATUS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSUM", cache_dir / "BILLSUM")
        scrape(f"{GOV_BULK_SITE}/json/PLAW", cache_dir / "PLAW")
        scrape(f"{GOV_BULK_SITE}/json/STATUTES", cache_dir / "STATUTES")
    except Exception as e:
        logger.error(f"❌ An error occurred during scraping: {e}")
        sys.exit(1)
    print("🎉 Scraping completed without errors")


if __name__ == "__main__":
    main()
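
# Example run (requires network access to govinfo.gov):
#   $ python download_cache.py
# Re-runs are incremental: cached page.json listings are reused and files
# already on disk are skipped.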