"""Scrape US federal legislative bulk data from govinfo.gov into a local cache."""

import json
import logging
import sys
import zipfile
from pathlib import Path
from typing import Any

import requests

logger = logging.getLogger(__name__)

GOV_BULK_SITE = "https://www.govinfo.gov/bulkdata"
CACHE_DIR = "cache"

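
# Each bulkdata listing is a JSON document whose 'files' array holds entries
# with 'link', 'justFileName', 'folder', and 'mimeType' keys; these are the
# only fields the scraper relies on.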
def scrape(page: str, cache_dir: Path) -> None:
    """
    Recursively scrape a govinfo bulk data listing.

    Fetches the JSON listing at `page` (caching it as page.json), downloads
    every file it describes into `cache_dir`, and recurses into subfolders.
    """
    # Get the directory listing, from the cache when possible.
    cached_page = cache_dir / "page.json"
    if cached_page.exists():
        with open(cached_page, 'r', encoding='utf-8') as f:
            body = json.load(f)
    else:
        cache_dir.mkdir(parents=True, exist_ok=True)
        try:
            response = requests.get(
                page,
                headers={"User-Agent": "GitLawScraper/1.0", "Accept": "application/json"},
                timeout=30,
            )
            response.raise_for_status()
            if 'application/json' in response.headers.get('Content-Type', ''):
                body = response.json()
                with open(cached_page, 'w', encoding='utf-8') as f:
                    json.dump(body, f)
                print(f"Cached resource: {page}")
            else:
                print(f"Non-JSON response from {page}")
                return
        except requests.RequestException as e:
            print(f"❌ Failed to fetch resource {page}: {e}")
            return

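    # page.json doubles as a resume marker: on re-runs, listings already on
    # disk are read back instead of re-fetched, so an interrupted scrape picks
    # up where it left off.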
    files: list[dict[str, Any]] = body.get('files', [])

    # A fresh cache directory contains only the page.json just written; in
    # that case, prefer a single zip archive over many individual downloads.
    in_new_dir = len(list(cache_dir.glob('*'))) == 1
    if in_new_dir:
        zip_file = next((f for f in files if f.get("mimeType") == "application/zip"), None)
        if zip_file:
            zip_url = zip_file.get('link')
            if zip_url:
                print(f"📦 Downloading zip file: {zip_url}")
                try:
                    # Download the zip archive.
                    response = requests.get(zip_url, headers={"User-Agent": "GitLawScraper/1.0"}, timeout=30)
                    response.raise_for_status()
                    zip_path = cache_dir / zip_file['justFileName']
                    with open(zip_path, 'wb') as f:
                        f.write(response.content)
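                    # Note: response.content buffers the whole archive in
                    # memory; for very large archives, stream=True plus
                    # response.iter_content() would download in chunks.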
                    print(f"✅ Downloaded zip file: {zip_url}")

                    # Unzip the archive alongside the listing.
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(cache_dir)
                    print(f"✅ Unzipped files to {cache_dir}")
                except (requests.RequestException, zipfile.BadZipFile) as e:
                    print(f"❌ Failed to download zip file {zip_file['justFileName']}: {e}")
        else:
            print("No zip file found, continuing with individual files.")

    for file in files:
        # Download non-folder files directly.
        if not file.get("folder", False):
            url = file.get('link')
            if url:
                file_path = cache_dir / file['justFileName']
                if file_path.exists():
                    print(f"✅ File already exists: {file['justFileName']}")
                    continue

                print(f"📥 Downloading file: {file['justFileName']} from {url}")
                try:
                    response = requests.get(url, headers={"User-Agent": "GitLawScraper/1.0"}, timeout=30)
                    response.raise_for_status()
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    print(f"✅ Downloaded file: {file['justFileName']}")
                except requests.RequestException as e:
                    print(f"❌ Failed to download file {file['justFileName']}: {e}")
            continue

        # Recursively scrape folders; their 'link' points at the sub-listing.
        scrape(file['link'], cache_dir / file['justFileName'])


def main():
    print("🚀 Starting scraping process for US Code data...")
    cache_dir = Path(CACHE_DIR)
    cache_dir.mkdir(parents=True, exist_ok=True)

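    # Collections mirrored below (govinfo bulk data names): bill texts
    # (BILLS), bill status records (BILLSTATUS), bill summaries (BILLSUM),
    # public and private laws (PLAW), and the Statutes at Large (STATUTES).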
    try:
        scrape(f"{GOV_BULK_SITE}/json/BILLS", cache_dir / "BILLS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSTATUS", cache_dir / "BILLSTATUS")
        scrape(f"{GOV_BULK_SITE}/json/BILLSUM", cache_dir / "BILLSUM")
        scrape(f"{GOV_BULK_SITE}/json/PLAW", cache_dir / "PLAW")
        scrape(f"{GOV_BULK_SITE}/json/STATUTES", cache_dir / "STATUTES")
    except Exception as e:
        logger.error(f"❌ An error occurred during scraping: {e}")
        sys.exit(1)

    print("🎉 Scraping completed without errors")


if __name__ == "__main__":
    main()
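
# Usage (assuming this file is saved as scrape.py): `python scrape.py` builds
# a ./cache tree mirroring the remote layout; re-running is safe because
# cached listings and already-downloaded files are skipped.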