Source code for sim_panel.benchmarks.config

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Mapping, Optional

import yaml


@dataclass(frozen=True)
class BenchmarkSubsetConfig:
    """
    Immutable settings for exporting a benchmark-ready subset of real data.

    Parameters
    ----------
    import_dir
        Directory holding the imported source artifacts. It is expected to
        contain at least ``events.jsonl`` and usually ``products.jsonl``.
    output_dir
        Directory the frozen benchmark subset is written to.
    seed
        Random seed that makes product sampling reproducible.
    min_reviews_per_product
        Minimum number of rating-bearing events a product needs in order to
        be eligible for the subset.
    max_products
        Upper bound on the number of products kept. ``None`` keeps every
        eligible product.
    require_product_record
        When True, only products that also appear in ``products.jsonl`` are
        kept.
    """

    # Required locations (no defaults).
    import_dir: str
    output_dir: str

    # Sampling and eligibility knobs.
    seed: int = 0
    min_reviews_per_product: int = 25
    max_products: Optional[int] = 100
    require_product_record: bool = True
def load_benchmark_subset_config(path: str | Path) -> BenchmarkSubsetConfig:
    """
    Load a benchmark subset config from YAML.

    Accepts either:

    - a top-level mapping containing ``benchmark_subset: {...}``
    - or the benchmark_subset fields directly at the top level

    Parameters
    ----------
    path
        Location of the YAML file to read.

    Returns
    -------
    BenchmarkSubsetConfig
        The parsed configuration. Optional fields that are absent from the
        file fall back to ``seed=0``, ``min_reviews_per_product=25``,
        ``max_products=100`` and ``require_product_record=True``.

    Raises
    ------
    ValueError
        If the ``benchmark_subset`` entry is present but is not a mapping,
        if a required field is missing, or if a field has an invalid value.
    """
    raw = _read_yaml(path)
    section = raw.get("benchmark_subset", raw)

    # A ``benchmark_subset:`` key with an empty value parses to None, and a
    # scalar or list is equally unusable. Fail with a clear message rather
    # than an AttributeError on the first ``section.get`` call below.
    if not isinstance(section, Mapping):
        raise ValueError(
            "Expected benchmark_subset to be a mapping, "
            f"got: {type(section).__name__}"
        )

    return BenchmarkSubsetConfig(
        import_dir=_required_str(section, "import_dir"),
        output_dir=_required_str(section, "output_dir"),
        seed=_coerce_int(section.get("seed", 0), field_name="seed"),
        min_reviews_per_product=_coerce_int(
            section.get("min_reviews_per_product", 25),
            field_name="min_reviews_per_product",
        ),
        max_products=_coerce_optional_int(
            section.get("max_products", 100),
            field_name="max_products",
        ),
        require_product_record=_coerce_bool(
            section.get("require_product_record", True),
            field_name="require_product_record",
        ),
    )
def _read_yaml(path: str | Path) -> Mapping[str, Any]:
    """
    Read a YAML file and return its top-level mapping.

    An empty document (for which ``yaml.safe_load`` yields ``None``) is
    treated as an empty mapping. Any other non-mapping document is rejected.

    Raises
    ------
    ValueError
        If the top-level YAML value is not a mapping.
    """
    yaml_path = Path(path)
    with yaml_path.open("r", encoding="utf-8") as fp:
        data = yaml.safe_load(fp)

    # Only "no document" becomes {}. Falsy non-mappings such as [] or 0 must
    # still reach the type check below instead of being silently accepted.
    if data is None:
        data = {}
    if not isinstance(data, Mapping):
        raise ValueError(f"Expected YAML mapping at {yaml_path}, got: {type(data).__name__}")
    return data


def _required_str(section: Mapping[str, Any], key: str) -> str:
    """
    Return ``section[key]`` as a non-empty string.

    The value is returned as-is (not stripped).

    Raises
    ------
    ValueError
        If the key is missing or null, or the value is not a string, or the
        string is empty or whitespace-only.
    """
    value = section.get(key)
    if value is None:
        raise ValueError(f"Missing required benchmark subset config field: {key}")
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"Field {key} must be a non-empty string.")
    return value


def _parse_int(value: Any, *, message: str) -> int:
    """Convert ``value`` to int, raising ``ValueError(message)`` on failure."""
    # bool is a subclass of int; accepting it would turn ``true`` into 1.
    if isinstance(value, bool):
        raise ValueError(message)
    # Refuse to silently truncate 2.7 -> 2. This also rejects inf and nan,
    # for which ``is_integer()`` is False.
    if isinstance(value, float) and not value.is_integer():
        raise ValueError(message)
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError) as exc:
        raise ValueError(message) from exc


def _coerce_int(value: Any, *, field_name: str) -> int:
    """
    Coerce a config value to ``int``.

    Integers, integral floats (``3.0``) and numeric strings (``"12"``) are
    accepted. Booleans, non-integral or non-finite floats, and anything
    ``int()`` cannot convert are rejected.

    Raises
    ------
    ValueError
        If the value cannot be interpreted as an integer.
    """
    return _parse_int(value, message=f"Field {field_name} must be an integer.")


def _coerce_optional_int(value: Any, *, field_name: str) -> Optional[int]:
    """
    Coerce a config value to ``int``, passing ``None`` through unchanged.

    Accepts the same values as ``_coerce_int`` plus ``None``.

    Raises
    ------
    ValueError
        If the value is neither ``None`` nor interpretable as an integer.
    """
    if value is None:
        return None
    return _parse_int(value, message=f"Field {field_name} must be an integer or null.")


def _coerce_bool(value: Any, *, field_name: str) -> bool:
    """
    Return ``value`` if it is a real ``bool``.

    No truthiness coercion is performed: strings such as ``"true"`` and
    integers such as ``1`` are rejected.

    Raises
    ------
    ValueError
        If the value is not a ``bool`` instance.
    """
    if isinstance(value, bool):
        return value
    raise ValueError(f"Field {field_name} must be a boolean.")