tcga-downloader/tcga_downloader/manifest.py
2026-01-16 14:47:13 +08:00

58 lines
1.6 KiB
Python

from __future__ import annotations
import csv
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List
# Column names, in order, used for the header of a TSV manifest.
REQUIRED_FIELDS = [
    "file_id",
    "file_name",
    "data_type",
    "data_format",
    "size",
    "md5",
]
@dataclass(frozen=True)
class ManifestRecord:
    """Immutable description of a single file entry in a manifest.

    Instances are frozen, so attributes cannot be reassigned after
    construction. No type conversion or validation happens here.
    """

    file_id: str  # identifier of the file
    file_name: str  # name of the file
    data_type: str
    data_format: str
    size: int  # presumably the file size in bytes -- confirm with the data source
    md5: str  # presumably the hex MD5 digest of the file -- confirm
def _validate_record(rec: ManifestRecord) -> None:
    """Reject a record that lacks an id or name, or has a negative size.

    Args:
        rec: The record to check.

    Raises:
        ValueError: If ``file_id`` or ``file_name`` is empty (falsy), or if
            ``size`` is below zero.
    """
    has_identity = rec.file_id and rec.file_name
    if not has_identity:
        raise ValueError("file_id and file_name are required")

    size_is_negative = rec.size < 0
    if size_is_negative:
        raise ValueError("size must be non-negative")
def write_manifest(records: Iterable[ManifestRecord], path: Path, fmt: str = "tsv") -> None:
    """Validate *records* and serialize them to *path*.

    Args:
        records: Records to write; the iterable is consumed exactly once.
        path: Destination file; converted with ``Path(path)``.
        fmt: ``"tsv"`` for tab-separated text with a header row, or
            ``"json"`` for a list of objects indented by two spaces.

    Raises:
        ValueError: If *fmt* is not supported, or if any record fails
            validation. All records are validated before the file is
            opened, so nothing is written in either case.
    """
    target = Path(path)
    supported_formats = {"tsv", "json"}
    if fmt not in supported_formats:
        raise ValueError("fmt must be 'tsv' or 'json'")

    # Materialize first so the records can be both validated and written.
    rows = list(records)
    for row in rows:
        _validate_record(row)

    if fmt == "json":
        payload = json.dumps([vars(row) for row in rows], indent=2)
        target.write_text(payload)
    else:
        with target.open("w", newline="") as handle:
            tsv_writer = csv.DictWriter(handle, fieldnames=REQUIRED_FIELDS, delimiter="\t")
            tsv_writer.writeheader()
            tsv_writer.writerows(vars(row) for row in rows)
def load_manifest(path: Path) -> List[ManifestRecord]:
    """Read a manifest file and return its rows as ``ManifestRecord`` objects.

    A path whose suffix is ``.json`` (case-insensitive) is parsed as a JSON
    list of objects. Any other path is parsed as tab-separated text whose
    first line is the header.

    Args:
        path: Location of the manifest; converted with ``Path(path)``.

    Returns:
        One record per row, in file order.

    Raises:
        ValueError: If a TSV row's ``size`` cell cannot be parsed as an
            integer (including a row too short to contain it).
        TypeError: If a row's keys do not match the ``ManifestRecord``
            fields (raised by the record constructor).
    """
    path = Path(path)
    if path.suffix.lower() == ".json":
        # JSON keeps numeric types, so rows are passed through unchanged.
        data = json.loads(path.read_text())
        return [ManifestRecord(**row) for row in data]

    rows = []
    with path.open("r", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            # csv yields every cell as str; convert ``size`` back to int so
            # loaded records honor the ``size: int`` field and compare equal
            # to the records that were originally written.
            if "size" in row:
                raw_size = row["size"]
                try:
                    row["size"] = int(raw_size)
                except (TypeError, ValueError) as exc:
                    raise ValueError(
                        f"{path}: line {reader.line_num}: invalid size {raw_size!r}"
                    ) from exc
            rows.append(row)
    return [ManifestRecord(**row) for row in rows]