Source code for sim_panel.sources.amazon_reviews_2023.config

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Literal, Optional

from sim_panel.sources.types import SourceConfig


@dataclass(slots=True)
class AmazonReviews2023Config(SourceConfig):
    """
    Settings for importing the Amazon Reviews'23 dataset.

    Design choices for v0
    ---------------------
    - products.jsonl is produced from the item metadata file
    - product_id is the parent/family identifier, i.e. parent_asin
    - events.jsonl may still keep the child asin as source provenance
    - every row of the supplied metadata file is exported as a product
    - textual review content goes into the single event-level `traces` dict
    - `t` is derived from timestamps and defaults to the within-panelist
      sequence order

    Time index modes
    ----------------
    - panelist_sequence: default and recommended; assigns t = 0, 1, ...
      within each panelist after sorting that panelist's reviews
      chronologically
    - raw_timestamp: uses the source timestamp directly
    - global_sequence: assigns a corpus-wide chronological sequence;
      supported in in-memory mode but not yet in streaming mode
    """

    # Input files and optional category label.
    reviews_path: Path = Path()
    metadata_path: Path = Path()
    category: Optional[str] = None

    # Import behaviour switches.
    import_mode: Literal["in_memory", "streaming"] = "in_memory"
    require_metadata_match_for_events: bool = False

    # Each instance gets its own dict via the factory, so the default
    # mapping is never shared between configs.
    trace_field_map: Dict[str, str] = field(
        default_factory=lambda: dict(title="review_title", text="review_text")
    )

    # See "Time index modes" in the class docstring.
    time_index_mode: Literal["panelist_sequence", "global_sequence", "raw_timestamp"] = (
        "panelist_sequence"
    )

    product_description_fallback_to_features: bool = True
    include_raw_product_meta: bool = True
    include_raw_review_meta: bool = True

    # Filtering / truncation limits; None means no limit is configured here.
    min_reviews_per_persona: int = 1
    max_reviews: Optional[int] = None
    max_metadata_rows: Optional[int] = None

    def __post_init__(self) -> None:
        """Fill in the default source name when `name` is empty or falsy.

        NOTE(review): `name` is not declared in this class; presumably it is
        inherited from SourceConfig -- confirm.
        """
        if self.name:
            return
        self.name = "amazon_reviews_2023"

    @property
    def product_id_field(self) -> str:
        """Name of the metadata field used as the product identifier."""
        return "parent_asin"

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AmazonReviews2023Config":
        """Build a config from a plain mapping of constructor arguments.

        The input mapping is copied first and is never mutated.

        - `reviews_path` and `metadata_path` are mandatory here (a missing
          key raises KeyError) and are wrapped in `Path`.
        - `output_dir` is wrapped in `Path` when present and not None.
        - `trace_field_map` is copied into a fresh dict when present and
          not None.
        - All other keys are forwarded to the constructor unchanged.
        """
        kwargs = dict(data)

        # Mandatory locations: indexing (not .get) keeps the KeyError on
        # missing keys, checked in the same order as the fields.
        for path_key in ("reviews_path", "metadata_path"):
            kwargs[path_key] = Path(kwargs[path_key])

        if kwargs.get("output_dir") is not None:
            kwargs["output_dir"] = Path(kwargs["output_dir"])

        field_map = kwargs.get("trace_field_map")
        if field_map is not None:
            kwargs["trace_field_map"] = dict(field_map)

        return cls(**kwargs)