docs: add implementation plan
This commit is contained in:
parent
9217a3f20b
commit
cd5cf41661
558
docs/plans/2026-01-16-tcga-downloader-implementation.md
Normal file
558
docs/plans/2026-01-16-tcga-downloader-implementation.md
Normal file
@ -0,0 +1,558 @@
|
|||||||
|
# TCGA Downloader Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Build a Python package + CLI to query public TCGA files, generate manifests, and download via `gdc-client` with retries, resume, and checksums.
|
||||||
|
|
||||||
|
**Architecture:** A small Python package with three core modules (`query`, `manifest`, `download`) and an `argparse` CLI. Query uses the GDC REST API via `requests` to avoid hard-coupling to a specific SDK, while keeping the API surface thin for later swapping to `GenomicDataCommons` if desired. Downloads are delegated to `gdc-client`.
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.11+, `requests`, `pytest`, `gdc-client` (external binary).
|
||||||
|
|
||||||
|
### Task 1: Package Skeleton + Version
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `pyproject.toml`
|
||||||
|
- Create: `tcga_downloader/__init__.py`
|
||||||
|
- Create: `tests/test_version.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_version.py
|
||||||
|
from tcga_downloader import __version__
|
||||||
|
|
||||||
|
def test_version_present():
|
||||||
|
assert __version__
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_version.py -q`
|
||||||
|
Expected: FAIL with `ModuleNotFoundError` or missing `__version__`.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
```toml
|
||||||
|
# pyproject.toml
|
||||||
|
[project]
|
||||||
|
name = "tcga-downloader"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "TCGA public data downloader"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = ["requests>=2.31"]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = ["pytest>=7.4"]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
tcga-downloader = "tcga_downloader.cli:main"
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tcga_downloader/__init__.py
|
||||||
|
__version__ = "0.1.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_version.py -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add pyproject.toml tcga_downloader/__init__.py tests/test_version.py
|
||||||
|
git commit -m "feat: add package skeleton"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 2: Manifest Read/Write + Validation
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `tcga_downloader/manifest.py`
|
||||||
|
- Create: `tests/test_manifest.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_manifest.py
|
||||||
|
from tcga_downloader.manifest import ManifestRecord, write_manifest, load_manifest
|
||||||
|
|
||||||
|
def test_manifest_roundtrip_tsv(tmp_path):
|
||||||
|
records = [
|
||||||
|
ManifestRecord(
|
||||||
|
file_id="f1",
|
||||||
|
file_name="a.tsv",
|
||||||
|
data_type="Gene Expression",
|
||||||
|
data_format="TSV",
|
||||||
|
size=123,
|
||||||
|
md5="abc",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
path = tmp_path / "m.tsv"
|
||||||
|
write_manifest(records, path, fmt="tsv")
|
||||||
|
loaded = load_manifest(path)
|
||||||
|
assert loaded == records
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_manifest.py -q`
|
||||||
|
Expected: FAIL with `ModuleNotFoundError`.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tcga_downloader/manifest.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable, List

# Column order of the TSV manifest; also the keys of every JSON entry.
REQUIRED_FIELDS = ["file_id", "file_name", "data_type", "data_format", "size", "md5"]


@dataclass(frozen=True)
class ManifestRecord:
    """Immutable description of one file listed in a manifest."""

    file_id: str
    file_name: str
    data_type: str
    data_format: str
    size: int
    md5: str


def _validate_record(rec: ManifestRecord) -> None:
    """Raise ``ValueError`` if ``rec`` has no id/name or a negative size."""
    if not rec.file_id or not rec.file_name:
        raise ValueError("file_id and file_name are required")
    if rec.size < 0:
        raise ValueError("size must be non-negative")


def _record_from_row(row: dict) -> ManifestRecord:
    """Build a record from one parsed manifest row.

    ``csv.DictReader`` yields every value as ``str``, so ``size`` is coerced
    to ``int`` here; without that a TSV round trip does not compare equal to
    the records that were written.

    Raises:
        ValueError: if a required field is absent or ``size`` is not an
            integer.
    """
    missing = [name for name in REQUIRED_FIELDS if row.get(name) is None]
    if missing:
        raise ValueError(f"manifest row is missing fields: {', '.join(missing)}")
    values = {name: row[name] for name in REQUIRED_FIELDS}
    values["size"] = int(values["size"])
    return ManifestRecord(**values)


def write_manifest(records: Iterable[ManifestRecord], path: Path, fmt: str = "tsv") -> None:
    """Validate ``records`` and write them to ``path`` as TSV or JSON.

    Args:
        records: Records to serialise; consumed once.
        path: Destination file, overwritten if it exists.
        fmt: ``"tsv"`` (header row plus one row per record) or ``"json"``
            (a list of objects).

    Raises:
        ValueError: if ``fmt`` is unsupported or any record is invalid.
    """
    path = Path(path)
    if fmt not in {"tsv", "json"}:
        raise ValueError("fmt must be 'tsv' or 'json'")

    # Materialise first so nothing is written if any record is invalid.
    rows = list(records)
    for rec in rows:
        _validate_record(rec)

    if fmt == "json":
        payload = [asdict(rec) for rec in rows]
        path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return

    with path.open("w", newline="", encoding="utf-8") as f:
        # The csv default terminator is "\r\n"; use "\n" so line-oriented
        # tools reading the manifest do not see a stray "\r" in the last column.
        writer = csv.DictWriter(
            f, fieldnames=REQUIRED_FIELDS, delimiter="\t", lineterminator="\n"
        )
        writer.writeheader()
        writer.writerows(asdict(rec) for rec in rows)


def load_manifest(path: Path) -> List[ManifestRecord]:
    """Read a manifest written by ``write_manifest``.

    The format is chosen from the file suffix: ``.json`` is parsed as JSON,
    anything else as tab-separated values with a header row.

    Raises:
        ValueError: if a row lacks a required field or has a non-integer size.
    """
    path = Path(path)
    if path.suffix.lower() == ".json":
        entries = json.loads(path.read_text(encoding="utf-8"))
        return [_record_from_row(entry) for entry in entries]

    with path.open("r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        return [_record_from_row(row) for row in reader]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_manifest.py -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tcga_downloader/manifest.py tests/test_manifest.py
|
||||||
|
git commit -m "feat: add manifest read/write"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 3: GDC Query (REST API)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `tcga_downloader/query.py`
|
||||||
|
- Create: `tests/test_query.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_query.py
|
||||||
|
from tcga_downloader.query import build_filters
|
||||||
|
|
||||||
|
def test_build_filters_project_and_type():
|
||||||
|
filters = build_filters(project="TCGA-BRCA", data_type="Gene Expression")
|
||||||
|
assert filters["op"] == "and"
|
||||||
|
assert filters["content"][0]["content"]["field"] == "cases.project.project_id"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_query.py -q`
|
||||||
|
Expected: FAIL with `ModuleNotFoundError`.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tcga_downloader/query.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
GDC_FILES_URL = "https://api.gdc.cancer.gov/files"
|
||||||
|
|
||||||
|
|
||||||
|
def build_filters(project: str, data_type: str) -> dict:
    """Return the filter expression selecting one project and one data type.

    The result is an ``and`` node whose content is two ``in`` clauses: the
    project id first, then the data type. Each clause carries its value as a
    single-element list.
    """
    criteria = (
        ("cases.project.project_id", project),
        ("data_type", data_type),
    )
    clauses = [
        {"op": "in", "content": {"field": field_name, "value": [wanted]}}
        for field_name, wanted in criteria
    ]
    return {"op": "and", "content": clauses}
|
||||||
|
|
||||||
|
|
||||||
|
def query_files(project: str, data_type: str, fields: list[str] | None = None, size: int = 1000) -> list[dict]:
    """Return every GDC file hit for ``project`` and ``data_type``.

    A single request returns at most ``size`` hits, so larger result sets
    were previously truncated without warning. This pages through the
    results using the ``from`` offset until a short or empty page arrives,
    or until the reported total has been collected.

    Args:
        project: GDC project id to match.
        data_type: GDC data type to match.
        fields: Fields to request per hit; defaults to the ones needed to
            build a manifest record.
        size: Page size for each request (not a cap on the overall result).

    Returns:
        The concatenated ``hits`` of all pages, in the order received.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    if fields is None:
        fields = ["file_id", "file_name", "data_type", "data_format", "file_size", "md5sum"]

    hits: list[dict] = []
    while True:
        payload = {
            "filters": build_filters(project, data_type),
            "fields": ",".join(fields),
            "format": "JSON",
            "size": size,
            # Offset of the first record of this page (0 for the first page).
            "from": len(hits),
        }
        resp = requests.post(GDC_FILES_URL, json=payload, timeout=30)
        resp.raise_for_status()
        data = resp.json().get("data", {})
        page = data.get("hits", [])
        hits.extend(page)

        # A short or empty page means there is nothing further to fetch.
        if not page or len(page) < size:
            break
        # NOTE(review): relies on the response carrying data.pagination.total;
        # when absent, the short-page check above is the only stop condition.
        total = data.get("pagination", {}).get("total")
        if total is not None and len(hits) >= total:
            break
    return hits
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_query.py -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tcga_downloader/query.py tests/test_query.py
|
||||||
|
git commit -m "feat: add GDC query via REST"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 4: Download Runner (gdc-client)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `tcga_downloader/download.py`
|
||||||
|
- Create: `tests/test_download.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_download.py
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from tcga_downloader.download import build_gdc_command
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_gdc_command():
|
||||||
|
cmd = build_gdc_command(Path("/tmp/m.tsv"), Path("/data"), processes=4, retries=3)
|
||||||
|
assert "gdc-client" in cmd[0]
|
||||||
|
assert "-m" in cmd
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_download.py -q`
|
||||||
|
Expected: FAIL with `ModuleNotFoundError`.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tcga_downloader/download.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
import subprocess
from pathlib import Path


def build_gdc_command(manifest_path: Path, out_dir: Path, processes: int, retries: int) -> list[str]:
    """Return the ``gdc-client download`` argument vector.

    Uses ``--retry-amount`` for the retry budget; ``--retry-count`` is not a
    gdc-client option. No checksum flag is passed: md5 verification is the
    tool's default and is only ever switched off (``--no-file-md5sum``), so
    the former ``--checksum`` argument would have been rejected.

    NOTE(review): gdc-client reads its own manifest layout (an ``id`` column
    among others) — confirm the manifest passed here uses those column names.

    Args:
        manifest_path: Manifest file handed to ``-m``.
        out_dir: Download directory handed to ``-d``.
        processes: Number of parallel download processes.
        retries: Number of retries per failed download.
    """
    return [
        "gdc-client",
        "download",
        "-m",
        str(manifest_path),
        "-d",
        str(out_dir),
        "--n-processes",
        str(processes),
        "--retry-amount",
        str(retries),
    ]


def run_gdc_download(manifest_path: Path, out_dir: Path, processes: int = 4, retries: int = 3) -> None:
    """Run ``gdc-client download`` for ``manifest_path`` into ``out_dir``.

    Raises:
        RuntimeError: if ``gdc-client`` is not on ``PATH``.
        subprocess.CalledProcessError: if gdc-client exits non-zero.
    """
    if not shutil.which("gdc-client"):
        raise RuntimeError("gdc-client not found in PATH")
    # Create the destination up front so the download does not depend on
    # gdc-client creating it.
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    cmd = build_gdc_command(manifest_path, out_dir, processes, retries)
    subprocess.run(cmd, check=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_download.py -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tcga_downloader/download.py tests/test_download.py
|
||||||
|
git commit -m "feat: add gdc-client downloader"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 5: CLI Wiring
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `tcga_downloader/cli.py`
|
||||||
|
- Create: `tests/test_cli.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_cli.py
|
||||||
|
from tcga_downloader.cli import build_parser


def test_cli_has_subcommands():
    # Go through the public parse_args API rather than argparse's private
    # ``_subparsers`` attribute, and check that each subcommand is accepted
    # and reported under ``args.command``.
    parser = build_parser()
    query_argv = ["--project", "TCGA-BRCA", "--data-type", "Gene Expression", "--out", "m.tsv"]
    argv_by_command = {
        "query": query_argv,
        "download": ["--manifest", "m.tsv", "--out-dir", "data"],
        "run": [*query_argv, "--out-dir", "data"],
    }
    for command, argv in argv_by_command.items():
        assert parser.parse_args([command, *argv]).command == command
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_cli.py -q`
|
||||||
|
Expected: FAIL with `ModuleNotFoundError`.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tcga_downloader/cli.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from tcga_downloader.download import run_gdc_download
|
||||||
|
from tcga_downloader.manifest import ManifestRecord, write_manifest
|
||||||
|
from tcga_downloader.query import query_files
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser() -> argparse.ArgumentParser:
    """Build the ``tcga-downloader`` parser with its three subcommands.

    ``query`` takes the query options, ``download`` takes ``--manifest`` plus
    the transfer options, and ``run`` takes the query options followed by the
    transfer options. The chosen subcommand is stored in ``args.command``.
    """

    def add_query_options(target: argparse.ArgumentParser) -> None:
        # Options describing what to look up and where to write the manifest.
        target.add_argument("--project", required=True)
        target.add_argument("--data-type", required=True)
        target.add_argument("--out", required=True)
        target.add_argument("--format", choices=["tsv", "json"], default="tsv")

    def add_transfer_options(target: argparse.ArgumentParser) -> None:
        # Options controlling the download itself.
        target.add_argument("--out-dir", required=True)
        target.add_argument("--processes", type=int, default=4)
        target.add_argument("--retries", type=int, default=3)

    root = argparse.ArgumentParser(prog="tcga-downloader")
    commands = root.add_subparsers(dest="command", required=True)

    add_query_options(commands.add_parser("query"))

    download_cmd = commands.add_parser("download")
    download_cmd.add_argument("--manifest", required=True)
    add_transfer_options(download_cmd)

    run_cmd = commands.add_parser("run")
    add_query_options(run_cmd)
    add_transfer_options(run_cmd)

    return root
|
||||||
|
|
||||||
|
|
||||||
|
def _records_from_hits(hits: list[dict]) -> list[ManifestRecord]:
    """Convert query hits into manifest records, preserving order.

    Each hit must provide ``file_id``, ``file_name``, ``data_type``,
    ``data_format``, ``file_size`` and ``md5sum``; ``file_size`` is converted
    to ``int`` and stored as ``size``, ``md5sum`` is stored as ``md5``.
    """
    return [
        ManifestRecord(
            file_id=hit["file_id"],
            file_name=hit["file_name"],
            data_type=hit["data_type"],
            data_format=hit["data_format"],
            size=int(hit["file_size"]),
            md5=hit["md5sum"],
        )
        for hit in hits
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Entry point: parse ``sys.argv`` and run the selected subcommand.

    ``query`` and ``run`` query for files and write the manifest to
    ``--out``; ``download`` and ``run`` then start the download, reading
    ``--manifest`` for ``download`` and the freshly written ``--out`` for
    ``run``.
    """
    args = build_parser().parse_args()
    command = args.command

    if command in {"query", "run"}:
        found = query_files(args.project, args.data_type)
        write_manifest(_records_from_hits(found), Path(args.out), fmt=args.format)

    if command in {"download", "run"}:
        if command == "download":
            manifest_arg = args.manifest
        else:
            manifest_arg = args.out
        run_gdc_download(
            Path(manifest_arg),
            Path(args.out_dir),
            args.processes,
            args.retries,
        )
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_cli.py -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tcga_downloader/cli.py tests/test_cli.py
|
||||||
|
git commit -m "feat: add CLI entry points"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 6: End-to-End Sanity Tests (Mocked)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `tests/test_cli.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_cli.py
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from tcga_downloader.cli import main
|
||||||
|
|
||||||
|
|
||||||
|
def test_cli_query_writes_manifest(tmp_path, monkeypatch):
|
||||||
|
args = [
|
||||||
|
"tcga-downloader",
|
||||||
|
"query",
|
||||||
|
"--project",
|
||||||
|
"TCGA-BRCA",
|
||||||
|
"--data-type",
|
||||||
|
"Gene Expression",
|
||||||
|
"--out",
|
||||||
|
str(tmp_path / "m.tsv"),
|
||||||
|
]
|
||||||
|
monkeypatch.setattr("sys.argv", args)
|
||||||
|
with patch("tcga_downloader.cli.query_files") as q:
|
||||||
|
q.return_value = [
|
||||||
|
{
|
||||||
|
"file_id": "f1",
|
||||||
|
"file_name": "a.tsv",
|
||||||
|
"data_type": "Gene Expression",
|
||||||
|
"data_format": "TSV",
|
||||||
|
"file_size": 123,
|
||||||
|
"md5sum": "abc",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
main()
|
||||||
|
assert (tmp_path / "m.tsv").exists()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_cli.py::test_cli_query_writes_manifest -q`
|
||||||
|
Expected: PASS if Task 5 is complete (this is a smoke test of existing behavior); FAIL only if the CLI module is missing or its argument handling is broken.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tcga_downloader/cli.py
|
||||||
|
# (No code changes expected if Task 5 is complete. If failing, fix args handling.)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_cli.py::test_cli_query_writes_manifest -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tests/test_cli.py
|
||||||
|
git commit -m "test: add CLI query smoke test"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 7: Documentation
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `README.md`
- Create: `tests/test_readme.py`
|
||||||
|
|
||||||
|
**Step 1: Write the failing test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_readme.py
|
||||||
|
from pathlib import Path


def test_readme_present():
    # Resolve from this file (tests/test_readme.py -> repository root) so the
    # check does not depend on the directory pytest is launched from.
    repo_root = Path(__file__).resolve().parents[1]
    assert (repo_root / "README.md").exists()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_readme.py -q`
|
||||||
|
Expected: FAIL with missing README.
|
||||||
|
|
||||||
|
**Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
````markdown
# README.md

## TCGA Downloader

Python package + CLI to query public TCGA files and download via gdc-client.

### Install

```bash
pip install -e .
```

### Example

```bash
tcga-downloader query --project TCGA-BRCA --data-type "Gene Expression Quantification" --out manifest.tsv

tcga-downloader download --manifest manifest.tsv --out-dir ./data
```
````
|
||||||
|
|
||||||
|
**Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `pytest tests/test_readme.py -q`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add README.md tests/test_readme.py
|
||||||
|
git commit -m "docs: add README"
|
||||||
|
```
|
||||||
Loading…
Reference in New Issue
Block a user