"""Read and write file manifests as tab-separated or JSON files.

A manifest is a flat list of :class:`ManifestRecord` entries.  The TSV form
has one header row (``REQUIRED_FIELDS``) followed by one row per record; the
JSON form is a list of objects keyed by the same field names.
"""

from __future__ import annotations

import csv
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable, List

# Column order of the TSV header; mirrors the fields of ManifestRecord.
REQUIRED_FIELDS = [
    "file_id",
    "file_name",
    "data_type",
    "data_format",
    "size",
    "md5",
]

# Manifests are written as UTF-8 regardless of the platform locale so that a
# file produced on one machine can be read on another.
_WRITE_ENCODING = "utf-8"
# "utf-8-sig" decodes plain UTF-8 identically but also strips a leading BOM
# (as added by some spreadsheet exports), which would otherwise corrupt the
# first header name or make the JSON parser reject the document.
_READ_ENCODING = "utf-8-sig"


@dataclass(frozen=True)
class ManifestRecord:
    """One immutable manifest entry describing a single file."""

    file_id: str
    file_name: str
    data_type: str
    data_format: str
    size: int
    md5: str


def _validate_record(rec: ManifestRecord) -> None:
    """Check the invariants a record must satisfy before it is written.

    Args:
        rec: The record to check.

    Raises:
        ValueError: If ``file_id`` or ``file_name`` is empty, or if ``size``
            is negative.
    """
    if not rec.file_id or not rec.file_name:
        raise ValueError("file_id and file_name are required")
    if rec.size < 0:
        raise ValueError("size must be non-negative")


def write_manifest(
    records: Iterable[ManifestRecord], path: Path, fmt: str = "tsv"
) -> None:
    """Validate *records* and write them to *path* as TSV or JSON.

    All records are validated before the output file is opened, so an
    invalid record never leaves a partially written manifest behind.

    Args:
        records: Records to write; consumed exactly once.
        path: Destination file (anything accepted by ``pathlib.Path``).
        fmt: Output format, either ``"tsv"`` (default) or ``"json"``.

    Raises:
        ValueError: If *fmt* is not supported or a record fails validation.
    """
    path = Path(path)
    if fmt not in {"tsv", "json"}:
        raise ValueError("fmt must be 'tsv' or 'json'")

    # Materialise first: *records* may be a one-shot iterator, and every
    # record has to pass validation before anything touches the disk.
    records = list(records)
    for rec in records:
        _validate_record(rec)
    rows = [asdict(rec) for rec in records]

    if fmt == "json":
        path.write_text(json.dumps(rows, indent=2), encoding=_WRITE_ENCODING)
        return

    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding=_WRITE_ENCODING) as f:
        writer = csv.DictWriter(f, fieldnames=REQUIRED_FIELDS, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


def load_manifest(path: Path) -> List[ManifestRecord]:
    """Load a manifest from *path*.

    The format is chosen from the file suffix: ``.json`` (case-insensitive)
    is parsed as JSON, anything else as tab-separated text with a header row.
    In the TSV case the ``size`` column is converted to ``int``; JSON values
    are passed to ``ManifestRecord`` as parsed.

    Args:
        path: Manifest file to read (anything accepted by ``pathlib.Path``).

    Returns:
        The records in file order.

    Raises:
        KeyError: If a TSV manifest has no ``size`` column.
        ValueError: If a TSV ``size`` value is not an integer.
        TypeError: If a row's keys do not match the ``ManifestRecord`` fields.
    """
    path = Path(path)
    if path.suffix.lower() == ".json":
        data = json.loads(path.read_text(encoding=_READ_ENCODING))
        return [ManifestRecord(**row) for row in data]

    records: List[ManifestRecord] = []
    with path.open("r", newline="", encoding=_READ_ENCODING) as f:
        for row in csv.DictReader(f, delimiter="\t"):
            # csv yields every cell as text; size is the only numeric field.
            row["size"] = int(row["size"])
            records.append(ManifestRecord(**row))
    return records