diff --git a/docs/plans/2026-01-16-tcga-downloader-implementation.md b/docs/plans/2026-01-16-tcga-downloader-implementation.md new file mode 100644 index 0000000..92cdae6 --- /dev/null +++ b/docs/plans/2026-01-16-tcga-downloader-implementation.md @@ -0,0 +1,558 @@ +# TCGA Downloader Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Build a Python package + CLI to query public TCGA files, generate manifests, and download via `gdc-client` with retries, resume, and checksums. + +**Architecture:** A small Python package with three core modules (`query`, `manifest`, `download`) and an `argparse` CLI. Query uses the GDC REST API via `requests` to avoid hard-coupling to a specific SDK, while keeping the API surface thin for later swapping to `GenomicDataCommons` if desired. Downloads are delegated to `gdc-client`. + +**Tech Stack:** Python 3.11+, `requests`, `pytest`, `gdc-client` (external binary). + +### Task 1: Package Skeleton + Version + +**Files:** +- Create: `pyproject.toml` +- Create: `tcga_downloader/__init__.py` +- Create: `tests/test_version.py` + +**Step 1: Write the failing test** + +```python +# tests/test_version.py +from tcga_downloader import __version__ + +def test_version_present(): + assert __version__ +``` + +**Step 2: Run test to verify it fails** + +Run: `pytest tests/test_version.py -q` +Expected: FAIL with `ModuleNotFoundError` or missing `__version__`. 
+
+**Step 3: Write minimal implementation**
+
+```toml
+# pyproject.toml
+[project]
+name = "tcga-downloader"
+version = "0.1.0"
+description = "TCGA public data downloader"
+requires-python = ">=3.11"
+dependencies = ["requests>=2.31"]
+
+[project.optional-dependencies]
+dev = ["pytest>=7.4"]
+
+[project.scripts]
+tcga-downloader = "tcga_downloader.cli:main"
+```
+
+```python
+# tcga_downloader/__init__.py
+__version__ = "0.1.0"
+```
+
+**Step 4: Run test to verify it passes**
+
+Run: `pip install -e ".[dev]"` (once, so that `tcga_downloader` is importable when tests run from `tests/`), then `pytest tests/test_version.py -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add pyproject.toml tcga_downloader/__init__.py tests/test_version.py
+git commit -m "feat: add package skeleton"
+```
+
+### Task 2: Manifest Read/Write + Validation
+
+**Files:**
+- Create: `tcga_downloader/manifest.py`
+- Create: `tests/test_manifest.py`
+
+**Step 1: Write the failing test**
+
+```python
+# tests/test_manifest.py
+from tcga_downloader.manifest import ManifestRecord, write_manifest, load_manifest
+
+def test_manifest_roundtrip_tsv(tmp_path):
+    records = [
+        ManifestRecord(
+            file_id="f1",
+            file_name="a.tsv",
+            data_type="Gene Expression",
+            data_format="TSV",
+            size=123,
+            md5="abc",
+        )
+    ]
+    path = tmp_path / "m.tsv"
+    write_manifest(records, path, fmt="tsv")
+    loaded = load_manifest(path)
+    assert loaded == records
+```
+
+**Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_manifest.py -q`
+Expected: FAIL with `ModuleNotFoundError`.
+
+**Step 3: Write minimal implementation**
+
+```python
+# tcga_downloader/manifest.py
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterable, List
+
+REQUIRED_FIELDS = ["file_id", "file_name", "data_type", "data_format", "size", "md5"]
+
+@dataclass(frozen=True)
+class ManifestRecord:
+    """One file entry of a manifest; field names match REQUIRED_FIELDS."""
+
+    file_id: str
+    file_name: str
+    data_type: str
+    data_format: str
+    size: int
+    md5: str
+
+
+def _validate_record(rec: ManifestRecord) -> None:
+    """Raise ValueError if the record has no id/name or a negative size."""
+    if not rec.file_id or not rec.file_name:
+        raise ValueError("file_id and file_name are required")
+    if rec.size < 0:
+        raise ValueError("size must be non-negative")
+
+
+def _record_from_row(row: dict) -> ManifestRecord:
+    """Build a validated record from one parsed TSV/JSON row.
+
+    Raises ValueError when a required field is missing or ``size`` is not
+    an integer.
+    """
+    missing = [name for name in REQUIRED_FIELDS if row.get(name) is None]
+    if missing:
+        raise ValueError(f"manifest row is missing fields: {', '.join(missing)}")
+    values = {name: row[name] for name in REQUIRED_FIELDS}
+    # csv.DictReader yields every value as str; without this conversion a TSV
+    # round-trip returns size="123" and no longer equals the written record.
+    values["size"] = int(values["size"])
+    rec = ManifestRecord(**values)
+    _validate_record(rec)
+    return rec
+
+
+def write_manifest(records: Iterable[ManifestRecord], path: Path, fmt: str = "tsv") -> None:
+    """Validate ``records`` and write them to ``path`` as TSV or JSON.
+
+    Raises ValueError for an unknown ``fmt`` or an invalid record; nothing is
+    written in either case.
+    """
+    path = Path(path)
+    if fmt not in {"tsv", "json"}:
+        raise ValueError("fmt must be 'tsv' or 'json'")
+    records = list(records)
+    for rec in records:
+        _validate_record(rec)
+
+    if fmt == "json":
+        data = [asdict(rec) for rec in records]
+        path.write_text(json.dumps(data, indent=2))
+        return
+
+    with path.open("w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=REQUIRED_FIELDS, delimiter="\t")
+        writer.writeheader()
+        for rec in records:
+            writer.writerow(asdict(rec))
+
+
+def load_manifest(path: Path) -> List[ManifestRecord]:
+    """Read a manifest written by ``write_manifest``.
+
+    A ``.json`` suffix selects JSON; any other suffix is parsed as
+    tab-separated values with a header row.
+    """
+    path = Path(path)
+    if path.suffix.lower() == ".json":
+        rows = json.loads(path.read_text())
+    else:
+        with path.open("r", newline="") as f:
+            rows = list(csv.DictReader(f, delimiter="\t"))
+    return [_record_from_row(row) for row in rows]
+```
+
+**Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_manifest.py -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add tcga_downloader/manifest.py tests/test_manifest.py
+git commit -m "feat: add manifest read/write"
+```
+
+### Task 3: GDC Query (REST API)
+
+**Files:**
+- Create: `tcga_downloader/query.py`
+- Create: `tests/test_query.py`
+
+**Step 1: Write the failing test**
+
+```python
+# tests/test_query.py
+from tcga_downloader.query import build_filters
+
+def test_build_filters_project_and_type():
+    filters = build_filters(project="TCGA-BRCA", data_type="Gene Expression")
+    assert filters["op"] == "and"
+    assert filters["content"][0]["content"]["field"] == "cases.project.project_id"
+```
+
+**Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_query.py -q`
+Expected: FAIL with `ModuleNotFoundError`.
+
+**Step 3: Write minimal implementation**
+
+```python
+# tcga_downloader/query.py
+from __future__ import annotations
+
+import requests
+
+GDC_FILES_URL = "https://api.gdc.cancer.gov/files"
+
+
+def build_filters(project: str, data_type: str) -> dict:
+    """Return a GDC filter matching one project id AND one data type."""
+    return {
+        "op": "and",
+        "content": [
+            {
+                "op": "in",
+                "content": {
+                    "field": "cases.project.project_id",
+                    "value": [project],
+                },
+            },
+            {
+                "op": "in",
+                "content": {
+                    "field": "data_type",
+                    "value": [data_type],
+                },
+            },
+        ],
+    }
+
+
+def query_files(project: str, data_type: str, fields: list[str] | None = None, size: int = 1000) -> list[dict]:
+    """Return every file hit for ``project``/``data_type`` from the GDC files endpoint.
+
+    ``size`` is the page size of each request. Pages are fetched until the
+    total reported by the API has been collected, so result sets larger than
+    one page are not silently truncated. Raises ``requests.HTTPError`` on a
+    non-2xx response.
+    """
+    if fields is None:
+        fields = ["file_id", "file_name", "data_type", "data_format", "file_size", "md5sum"]
+    payload = {
+        "filters": build_filters(project, data_type),
+        "fields": ",".join(fields),
+        "format": "JSON",
+        "size": size,
+        "from": 0,
+    }
+    hits: list[dict] = []
+    while True:
+        resp = requests.post(GDC_FILES_URL, json=payload, timeout=30)
+        resp.raise_for_status()
+        data = resp.json().get("data", {})
+        page = data.get("hits", [])
+        hits.extend(page)
+        # NOTE(review): relies on the documented `pagination.total` field and
+        # 0-based `from` offset; if `pagination` is absent we stop after the
+        # first page, which is the previous single-request behaviour.
+        total = data.get("pagination", {}).get("total", len(hits))
+        if not page or len(hits) >= total:
+            return hits
+        payload["from"] = len(hits)
+```
+
+**Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_query.py -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add tcga_downloader/query.py tests/test_query.py
+git commit -m "feat: add GDC query via REST"
+```
+
+### Task 4: Download Runner (gdc-client)
+
+**Files:**
+- Create: `tcga_downloader/download.py`
+- Create: `tests/test_download.py`
+
+**Step 1: Write the failing test**
+
+```python
+# tests/test_download.py
+from pathlib import Path
+from unittest.mock import patch
+
+from tcga_downloader.download import build_gdc_command
+
+
+def test_build_gdc_command():
+    cmd = build_gdc_command(Path("/tmp/m.tsv"), Path("/data"), processes=4, retries=3)
+    assert "gdc-client" in cmd[0]
+    assert "-m" in cmd
+    assert cmd[cmd.index("--retry-amount") + 1] == "3"
+```
+
+**Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_download.py -q`
+Expected: FAIL with `ModuleNotFoundError`.
+
+**Step 3: Write minimal implementation**
+
+```python
+# tcga_downloader/download.py
+from __future__ import annotations
+
+import shutil
+import subprocess
+from pathlib import Path
+
+
+def build_gdc_command(manifest_path: Path, out_dir: Path, processes: int, retries: int) -> list[str]:
+    """Return the ``gdc-client download`` argument list for a manifest download."""
+    # NOTE(review): flag names follow the documented `gdc-client download`
+    # options (`--retry-amount`; MD5 verification is on by default and is only
+    # turned off with `--no-file-md5sum`). The previous `--retry-count` and
+    # `--checksum` are not documented options - confirm against the installed
+    # gdc-client version with `gdc-client download --help`.
+    return [
+        "gdc-client",
+        "download",
+        "-m",
+        str(manifest_path),
+        "-d",
+        str(out_dir),
+        "--n-processes",
+        str(processes),
+        "--retry-amount",
+        str(retries),
+    ]
+
+
+def run_gdc_download(manifest_path: Path, out_dir: Path, processes: int = 4, retries: int = 3) -> None:
+    """Run ``gdc-client download`` for ``manifest_path`` into ``out_dir``.
+
+    Raises RuntimeError when ``gdc-client`` is not on PATH and
+    ``subprocess.CalledProcessError`` when the tool exits non-zero.
+    """
+    if not shutil.which("gdc-client"):
+        raise RuntimeError("gdc-client not found in PATH")
+    # NOTE(review): `-m` presumably expects a GDC-format manifest (TSV with
+    # `id`, `filename`, `md5`, `size`, `state` columns); verify that the
+    # manifest written in Task 2 (`file_id`, `file_name`, ... or JSON) is
+    # accepted, or convert it before calling gdc-client.
+    cmd = build_gdc_command(manifest_path, out_dir, processes, retries)
+    subprocess.run(cmd, check=True)
+```
+
+**Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_download.py -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add tcga_downloader/download.py tests/test_download.py
+git commit -m "feat: add gdc-client downloader"
+```
+
+### Task 5: CLI Wiring
+
+**Files:**
+- Create: `tcga_downloader/cli.py`
+- Create: `tests/test_cli.py`
+
+**Step 1: Write the failing test**
+
+```python
+# tests/test_cli.py
+from tcga_downloader.cli import build_parser
+
+
+def test_cli_has_subcommands():
+    parser = build_parser()
+    subparsers = parser._subparsers
+    assert subparsers is not None
+```
+
+**Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_cli.py -q`
+Expected: FAIL with `ModuleNotFoundError`.
+
+**Step 3: Write minimal implementation**
+
+```python
+# tcga_downloader/cli.py
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from tcga_downloader.download import run_gdc_download
+from tcga_downloader.manifest import ManifestRecord, write_manifest
+from tcga_downloader.query import query_files
+
+
+def _add_query_args(p: argparse.ArgumentParser) -> None:
+    """Add the options shared by the ``query`` and ``run`` subcommands."""
+    p.add_argument("--project", required=True)
+    p.add_argument("--data-type", required=True)
+    p.add_argument("--out", required=True)
+    p.add_argument("--format", choices=["tsv", "json"], default="tsv")
+
+
+def _add_download_args(p: argparse.ArgumentParser) -> None:
+    """Add the options shared by the ``download`` and ``run`` subcommands."""
+    p.add_argument("--out-dir", required=True)
+    p.add_argument("--processes", type=int, default=4)
+    p.add_argument("--retries", type=int, default=3)
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Return the CLI parser with ``query``, ``download`` and ``run`` subcommands."""
+    parser = argparse.ArgumentParser(prog="tcga-downloader")
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    _add_query_args(sub.add_parser("query"))
+
+    d = sub.add_parser("download")
+    d.add_argument("--manifest", required=True)
+    _add_download_args(d)
+
+    r = sub.add_parser("run")
+    _add_query_args(r)
+    _add_download_args(r)
+
+    return parser
+
+
+def _records_from_hits(hits: list[dict]) -> list[ManifestRecord]:
+    """Map query hits (GDC field names) onto manifest records."""
+    return [
+        ManifestRecord(
+            file_id=h["file_id"],
+            file_name=h["file_name"],
+            data_type=h["data_type"],
+            data_format=h["data_format"],
+            size=int(h["file_size"]),
+            md5=h["md5sum"],
+        )
+        for h in hits
+    ]
+
+
+def main(argv: list[str] | None = None) -> None:
+    """CLI entry point.
+
+    ``argv`` defaults to ``sys.argv[1:]`` (argparse behaviour for ``None``);
+    passing a list lets tests call ``main([...])`` without patching ``sys.argv``.
+    """
+    parser = build_parser()
+    args = parser.parse_args(argv)
+
+    if args.command in {"query", "run"}:
+        hits = query_files(args.project, args.data_type)
+        records = _records_from_hits(hits)
+        write_manifest(records, Path(args.out), fmt=args.format)
+
+    if args.command in {"download", "run"}:
+        # `run` downloads from the manifest it has just written to --out.
+        manifest = args.manifest if args.command == "download" else args.out
+        run_gdc_download(Path(manifest), Path(args.out_dir), args.processes, args.retries)
+```
+
+**Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_cli.py -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add tcga_downloader/cli.py tests/test_cli.py
+git commit -m "feat: add CLI entry points"
+```
+
+### Task 6: End-to-End Sanity Tests (Mocked)
+
+**Files:**
+- Modify: `tests/test_cli.py`
+
+**Step 1: Write the failing test**
+
+```python
+# tests/test_cli.py
+from unittest.mock import patch
+
+from tcga_downloader.cli import main
+
+
+def test_cli_query_writes_manifest(tmp_path, monkeypatch):
+    args = [
+        "tcga-downloader",
+        "query",
+        "--project",
+        "TCGA-BRCA",
+        "--data-type",
+        "Gene Expression",
+        "--out",
+        str(tmp_path / "m.tsv"),
+    ]
+    monkeypatch.setattr("sys.argv", args)
+    with patch("tcga_downloader.cli.query_files") as q:
+        q.return_value = [
+            {
+                "file_id": "f1",
+                "file_name": "a.tsv",
+                "data_type": "Gene Expression",
+                "data_format": "TSV",
+                "file_size": 123,
+                "md5sum": "abc",
+            }
+        ]
+        main()
+    assert (tmp_path / "m.tsv").exists()
+```
+
+**Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_cli.py::test_cli_query_writes_manifest -q`
+Expected: FAIL if behavior not implemented or module missing.
+
+**Step 3: Write minimal implementation**
+
+```python
+# tcga_downloader/cli.py
+# (No code changes expected if Task 5 is complete. If failing, fix args handling.)
+```
+
+**Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_cli.py::test_cli_query_writes_manifest -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add tests/test_cli.py
+git commit -m "test: add CLI query smoke test"
+```
+
+### Task 7: Documentation
+
+**Files:**
+- Create: `README.md`
+- Create: `tests/test_readme.py`
+
+**Step 1: Write the failing test**
+
+```python
+# tests/test_readme.py
+from pathlib import Path
+
+def test_readme_present():
+    assert Path("README.md").exists()
+```
+
+**Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_readme.py -q`
+Expected: FAIL with missing README.
+
+**Step 3: Write minimal implementation**
+
+````markdown
+# README.md
+
+## TCGA Downloader
+
+Python package + CLI to query public TCGA files and download via gdc-client.
+
+### Install
+
+```bash
+pip install -e .
+```
+
+### Example
+
+```bash
+tcga-downloader query --project TCGA-BRCA --data-type "Gene Expression" --out manifest.tsv
+
+tcga-downloader download --manifest manifest.tsv --out-dir ./data
+```
+````
+
+**Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_readme.py -q`
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+git add README.md tests/test_readme.py
+git commit -m "docs: add README"
+```