# Reconstructed from git patch e6b2a174c511b8bb95cac174841f35e7fd8ca828
#   From:    "yunpeng.zhang"
#   Date:    Fri, 16 Jan 2026 14:47:13 +0800
#   Subject: [PATCH] feat: add manifest read/write
# The patch creates two new files, tcga_downloader/manifest.py and
# tests/test_manifest.py; both are reproduced below under their own banner.

# ---------------------------------------------------------------------------
# tcga_downloader/manifest.py
# ---------------------------------------------------------------------------
"""Read and write file manifests in TSV or JSON form."""

import csv
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Iterable, List, Mapping

# Column order of the TSV header; also the complete set of record fields.
REQUIRED_FIELDS = ["file_id", "file_name", "data_type", "data_format", "size", "md5"]


@dataclass(frozen=True)
class ManifestRecord:
    """One manifest row describing a single file."""

    file_id: str
    file_name: str
    data_type: str
    data_format: str
    size: int
    md5: str


def _validate_record(rec: ManifestRecord) -> None:
    """Raise ``ValueError`` if *rec* has an empty id/name or a negative size."""
    if not rec.file_id or not rec.file_name:
        raise ValueError("file_id and file_name are required")
    if rec.size < 0:
        raise ValueError("size must be non-negative")


def _record_from_row(row: Mapping[str, Any]) -> ManifestRecord:
    """Build a record from one parsed row, restoring ``size`` to ``int``.

    ``csv`` hands back every cell as ``str``; without this conversion a
    record read from TSV would not compare equal to the one that was written.

    Raises:
        ValueError: if ``size`` is a string that is not a valid integer.
        TypeError: if the row has missing or unexpected field names (raised
            by the ``ManifestRecord`` constructor).
    """
    fields = dict(row)
    size = fields.get("size")
    if isinstance(size, str):
        fields["size"] = int(size)
    return ManifestRecord(**fields)


def write_manifest(records: Iterable[ManifestRecord], path: Path, fmt: str = "tsv") -> None:
    """Validate *records* and write them to *path* as TSV or JSON.

    Every record is validated before the file is opened, so an invalid
    record never leaves a partially written manifest behind.

    Note that ``load_manifest`` picks its parser from the file suffix, not
    from *fmt*: give JSON output a ``.json`` suffix if it is to be read back.

    Args:
        records: records to write; consumed once.
        path: destination file (overwritten if it exists).
        fmt: ``"tsv"`` (header row plus tab-separated values) or ``"json"``
            (a list of objects, indented by two spaces).

    Raises:
        ValueError: if *fmt* is not supported or a record fails validation.
    """
    path = Path(path)
    if fmt not in {"tsv", "json"}:
        raise ValueError("fmt must be 'tsv' or 'json'")

    rows = []
    for rec in records:
        _validate_record(rec)
        rows.append(asdict(rec))

    if fmt == "json":
        path.write_text(json.dumps(rows, indent=2), encoding="utf-8")
        return

    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=REQUIRED_FIELDS, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


def load_manifest(path: Path) -> List[ManifestRecord]:
    """Load a manifest from *path*.

    A ``.json`` suffix (case-insensitive) selects the JSON parser; any other
    suffix is read as tab-separated values with a header row.

    Args:
        path: manifest file to read.

    Returns:
        The records in file order, with ``size`` as ``int``.

    Raises:
        ValueError: if a ``size`` cell is not a valid integer.
        TypeError: if a row's field names do not match ``ManifestRecord``.
    """
    path = Path(path)
    if path.suffix.lower() == ".json":
        data = json.loads(path.read_text(encoding="utf-8"))
        return [_record_from_row(row) for row in data]

    with path.open("r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        return [_record_from_row(row) for row in reader]


# ---------------------------------------------------------------------------
# tests/test_manifest.py
# ---------------------------------------------------------------------------
# As a separate file this test starts with:
#   from tcga_downloader.manifest import ManifestRecord, write_manifest, load_manifest


def test_manifest_roundtrip_tsv(tmp_path):
    """A record written as TSV loads back equal to the original."""
    records = [
        ManifestRecord(
            file_id="f1",
            file_name="a.tsv",
            data_type="Gene Expression",
            data_format="TSV",
            size=123,
            md5="abc",
        )
    ]
    path = tmp_path / "m.tsv"
    write_manifest(records, path, fmt="tsv")
    loaded = load_manifest(path)
    assert loaded == records