"""
Filesystem-based JSON datastore for US Code git repository system.

Provides persistent storage with validation and caching.
"""
|
|
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import List, Optional, Dict, Any, Type, TypeVar, Generic
|
|
from pydantic import BaseModel
|
|
|
|
from models import (
|
|
PublicLaw, Sponsor, Bill, USCodeRelease, CongressionalSession,
|
|
GitCommitMetadata, APICache, RepositoryMetadata
|
|
)
|
|
|
|
T = TypeVar('T', bound=BaseModel)
|
|
|
|
|
|
class DataStore(Generic[T]):
    """Generic filesystem-based datastore for Pydantic models.

    Each item is stored as ``<collection>/<key>.json`` containing a
    ``data`` payload (the serialized model) plus a ``metadata`` envelope.
    A single ``_index.json`` per collection mirrors each key's metadata so
    lookups and filters do not need to open every item file.
    """

    def __init__(self, model_class: Type[T], base_path: Path, collection_name: str):
        """Create (or reopen) a collection rooted at base_path/collection_name.

        Args:
            model_class: Pydantic model class used to validate loaded items.
            base_path: Root directory shared by all collections.
            collection_name: Subdirectory name for this collection.
        """
        self.model_class = model_class
        self.base_path = Path(base_path)
        self.collection_name = collection_name
        self.collection_path = self.base_path / collection_name

        # Ensure directory exists
        self.collection_path.mkdir(parents=True, exist_ok=True)

        # Index file for quick lookups
        self.index_file = self.collection_path / "_index.json"
        self._index = self._load_index()

    def _load_index(self) -> Dict[str, Dict[str, Any]]:
        """Load the index file; an unreadable index degrades to empty.

        A corrupted or unreadable ``_index.json`` must not make the whole
        collection unconstructable — warn, start empty, and let subsequent
        saves rebuild the index.
        """
        if self.index_file.exists():
            try:
                with open(self.index_file, 'r') as f:
                    return json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                print(f"[!] Error reading index {self.index_file}: {e}")
        return {}

    def _save_index(self):
        """Persist the in-memory index to disk."""
        with open(self.index_file, 'w') as f:
            # default=str stringifies datetimes and other non-JSON values.
            json.dump(self._index, f, indent=2, default=str)

    def _get_file_path(self, key: str) -> Path:
        """Return the on-disk path for a given key."""
        return self.collection_path / f"{key}.json"

    def save(self, key: str, item: T, metadata: Optional[Dict[str, Any]] = None) -> bool:
        """Save an item to the datastore.

        Args:
            key: Collection-unique identifier; becomes the file stem.
            item: Model instance to serialize.
            metadata: Optional extra fields merged into both the file's
                metadata envelope and the index entry.

        Returns:
            True on success, False if anything went wrong (error printed).
        """
        try:
            # Serialize the item (pydantic v1 export API).
            item_data = item.dict()

            # One timestamp, reused below, so the item file and the index
            # entry can never disagree about when the save happened.
            saved_at = datetime.now().isoformat()
            extra = metadata or {}

            file_data = {
                "data": item_data,
                "metadata": {
                    "saved_at": saved_at,
                    "model_class": self.model_class.__name__,
                    **extra,
                },
            }

            # Save to file
            file_path = self._get_file_path(key)
            with open(file_path, 'w') as f:
                json.dump(file_data, f, indent=2, default=str)

            # Update index
            self._index[key] = {
                "file_path": str(file_path.relative_to(self.base_path)),
                "model_class": self.model_class.__name__,
                "saved_at": saved_at,
                **extra,
            }
            self._save_index()

            return True

        except Exception as e:
            # Best-effort persistence: report and signal failure to caller.
            print(f"[!] Error saving {key}: {e}")
            return False

    def load(self, key: str) -> Optional[T]:
        """Load an item, or None if it is missing or unreadable.

        The payload is re-validated through ``model_class`` on the way out,
        so a stale/incompatible file yields None rather than a bad object.
        """
        try:
            file_path = self._get_file_path(key)
            if not file_path.exists():
                return None

            with open(file_path, 'r') as f:
                file_data = json.load(f)

            # Validate and create model instance
            item_data = file_data.get("data", {})
            return self.model_class(**item_data)

        except Exception as e:
            print(f"[!] Error loading {key}: {e}")
            return None

    def exists(self, key: str) -> bool:
        """Check membership via the index (does not stat the item file)."""
        return key in self._index

    def list_keys(self) -> List[str]:
        """List all keys currently in the index."""
        return list(self._index.keys())

    def delete(self, key: str) -> bool:
        """Delete an item and its index entry.

        Returns True even when the key was absent (idempotent delete);
        False only when an actual filesystem error occurred.
        """
        try:
            file_path = self._get_file_path(key)
            if file_path.exists():
                file_path.unlink()

            if key in self._index:
                del self._index[key]
                self._save_index()

            return True

        except Exception as e:
            print(f"[!] Error deleting {key}: {e}")
            return False

    def count(self) -> int:
        """Number of indexed items."""
        return len(self._index)

    def find_by_metadata(self, **filters) -> List[str]:
        """Return keys whose index entry matches every filter exactly.

        Only consults the index, so this never touches the item files.
        """
        return [
            key
            for key, entry in self._index.items()
            if all(entry.get(f_key) == f_value for f_key, f_value in filters.items())
        ]
|
|
|
|
|
|
class USCodeDataStore:
    """Main datastore for US Code repository data.

    Aggregates one ``DataStore`` per record type under a common base
    directory and exposes typed convenience operations on top of them.
    """

    def __init__(self, base_path: str = "data"):
        """Create the base directory and one collection per record type."""
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)

        # Initialize individual datastores
        self.public_laws = DataStore[PublicLaw](PublicLaw, self.base_path, "public_laws")
        self.sponsors = DataStore[Sponsor](Sponsor, self.base_path, "sponsors")
        self.bills = DataStore[Bill](Bill, self.base_path, "bills")
        self.releases = DataStore[USCodeRelease](USCodeRelease, self.base_path, "releases")
        self.sessions = DataStore[CongressionalSession](CongressionalSession, self.base_path, "sessions")
        self.commits = DataStore[GitCommitMetadata](GitCommitMetadata, self.base_path, "commits")
        self.api_cache = DataStore[APICache](APICache, self.base_path, "api_cache")
        self.metadata = DataStore[RepositoryMetadata](RepositoryMetadata, self.base_path, "metadata")

    # Public Law operations
    def save_public_law(self, law: PublicLaw) -> bool:
        """Save a public law keyed as ``<congress>-<law_number:03d>``."""
        key = f"{law.congress}-{law.law_number:03d}"
        metadata = {
            "congress": law.congress,
            "law_number": law.law_number,
            "enacted_date": law.enacted_date.isoformat()
        }
        return self.public_laws.save(key, law, metadata)

    def get_public_law(self, congress: int, law_number: int) -> Optional[PublicLaw]:
        """Get a specific public law, or None if not stored."""
        key = f"{congress}-{law_number:03d}"
        return self.public_laws.load(key)

    def get_public_laws_by_congress(self, congress: int) -> List[PublicLaw]:
        """Get all public laws for a congress, sorted by law number."""
        keys = self.public_laws.find_by_metadata(congress=congress)
        laws = []
        for key in keys:
            law = self.public_laws.load(key)
            if law:
                laws.append(law)
        return sorted(laws, key=lambda x: x.law_number)

    # Sponsor operations
    def save_sponsor(self, sponsor: Sponsor) -> bool:
        """Save a sponsor keyed by chamber/state/name.

        Records ``full_name`` in the index metadata so lookups by name
        (see find_sponsor_by_name) can filter without loading files.
        """
        # chamber/party may arrive as plain strings or as enum members.
        chamber_val = sponsor.chamber if isinstance(sponsor.chamber, str) else sponsor.chamber.value
        party_val = sponsor.party if isinstance(sponsor.party, str) else sponsor.party.value

        key = f"{chamber_val.lower()}_{sponsor.state}_{sponsor.last_name.lower()}_{sponsor.first_name.lower()}"
        metadata = {
            "chamber": chamber_val,
            "state": sponsor.state,
            "party": party_val,
            "full_name": sponsor.full_name
        }
        return self.sponsors.save(key, sponsor, metadata)

    def find_sponsor_by_name(self, full_name: str) -> Optional[Sponsor]:
        """Find a sponsor by full name.

        Filters on the index metadata written by save_sponsor, so only
        matching item files are read from disk (instead of loading every
        sponsor as a linear scan would).
        """
        for key in self.sponsors.find_by_metadata(full_name=full_name):
            sponsor = self.sponsors.load(key)
            if sponsor is not None:
                return sponsor
        return None

    # API Cache operations
    def save_api_cache(self, congress: int, law_number: int, response_data: Dict[str, Any], sponsor: Optional[Sponsor] = None) -> bool:
        """Save an API cache entry for a (congress, law_number) pair."""
        # NOTE: cache keys are unpadded ("118-5"), unlike the zero-padded
        # public-law keys; get_api_cache uses the same format, so the two
        # stay consistent with each other.
        cache_key = f"{congress}-{law_number}"
        cache_entry = APICache(
            cache_key=cache_key,
            congress=congress,
            law_number=law_number,
            cached_date=datetime.now(),
            api_response=response_data,
            sponsor_found=sponsor is not None,
            sponsor=sponsor
        )
        return self.api_cache.save(cache_key, cache_entry)

    def get_api_cache(self, congress: int, law_number: int) -> Optional[APICache]:
        """Get a cached API response, or None if not cached."""
        cache_key = f"{congress}-{law_number}"
        return self.api_cache.load(cache_key)

    # US Code Release operations
    def save_release(self, release: USCodeRelease) -> bool:
        """Save a US Code release keyed by its public law."""
        key = f"{release.public_law.congress}-{release.public_law.law_number:03d}"
        metadata = {
            "congress": release.public_law.congress,
            "law_number": release.public_law.law_number,
            "release_filename": release.release_filename
        }
        return self.releases.save(key, release, metadata)

    def get_release(self, congress: int, law_number: int) -> Optional[USCodeRelease]:
        """Get a US Code release, or None if not stored."""
        key = f"{congress}-{law_number:03d}"
        return self.releases.load(key)

    # Git commit operations
    def save_commit_metadata(self, commit: GitCommitMetadata) -> bool:
        """Save git commit metadata keyed by the short (8-char) hash."""
        key = commit.commit_hash[:8]  # Use short hash as key
        metadata = {
            "congress": commit.public_law.congress,
            "law_number": commit.public_law.law_number,
            "commit_date": commit.commit_date.isoformat()
        }
        return self.commits.save(key, commit, metadata)

    def get_commits_by_congress(self, congress: int) -> List[GitCommitMetadata]:
        """Get all commits for a congress, sorted by commit date."""
        keys = self.commits.find_by_metadata(congress=congress)
        commits = []
        for key in keys:
            commit = self.commits.load(key)
            if commit:
                commits.append(commit)
        return sorted(commits, key=lambda x: x.commit_date)

    # Bulk operations
    def import_house_data(self, house_data_file: Path) -> int:
        """Import public laws from a House JSON data file.

        Returns the number of laws successfully imported; per-law failures
        are reported and skipped.
        """
        # Function-scope import (presumably to avoid a circular import at
        # module load — confirm), hoisted out of the loop so it runs once.
        from models import create_public_law_from_house_data

        with open(house_data_file, 'r') as f:
            data = json.load(f)

        imported_count = 0
        for law_data in data['public_laws']:
            try:
                law = create_public_law_from_house_data(law_data)
                if self.save_public_law(law):
                    imported_count += 1
            except Exception as e:
                print(f"[!] Error importing law {law_data}: {e}")

        return imported_count

    # Statistics and reporting
    def get_statistics(self) -> Dict[str, Any]:
        """Get per-collection item counts plus a grand total.

        The ``metadata`` collection is intentionally excluded, matching
        the set of collections reported individually.
        """
        counts = {
            "public_laws": self.public_laws.count(),
            "sponsors": self.sponsors.count(),
            "bills": self.bills.count(),
            "releases": self.releases.count(),
            "sessions": self.sessions.count(),
            "commits": self.commits.count(),
            "api_cache_entries": self.api_cache.count(),
        }
        # Count each store once and total from the same numbers, instead
        # of calling count() a second time per collection.
        return {**counts, "total_files": sum(counts.values())}

    def validate_integrity(self) -> Dict[str, List[str]]:
        """Validate datastore integrity.

        NOTE(review): DataStore.load catches its own exceptions and returns
        None, so unreadable/corrupted item files are reported under
        "missing_files"; the "corrupted_files" bucket only collects errors
        raised outside load itself.
        """
        issues = {
            "missing_files": [],
            "corrupted_files": [],
            "orphaned_entries": []
        }

        # Check each datastore
        for name, datastore in [
            ("public_laws", self.public_laws),
            ("sponsors", self.sponsors),
            ("bills", self.bills),
            ("releases", self.releases),
            ("sessions", self.sessions),
            ("commits", self.commits),
            ("api_cache", self.api_cache)
        ]:
            for key in datastore.list_keys():
                try:
                    item = datastore.load(key)
                    if item is None:
                        issues["missing_files"].append(f"{name}/{key}")
                except Exception:
                    issues["corrupted_files"].append(f"{name}/{key}")

        return issues