tcga-downloader/tcga_downloader/cli.py
yunpeng.zhang a01a59b371
Some checks failed
CI / Lint (push) Failing after 9m32s
CI / Test (3.11) (push) Successful in 6m41s
CI / Test (3.12) (push) Successful in 4m21s
feat: add interactive cli
2026-02-09 13:13:39 +08:00

210 lines
8.0 KiB
Python

from __future__ import annotations
import argparse
from pathlib import Path
from tcga_downloader.config import load_config
from tcga_downloader.download import run_gdc_download
from tcga_downloader.interactive import interactive_select
from tcga_downloader.logger import get_logger, setup_logging
from tcga_downloader.manifest import (
ManifestRecord,
format_manifest_stats,
get_manifest_stats,
load_manifest,
validate_files_against_manifest,
validate_manifest,
write_manifest,
)
from tcga_downloader.query import query_files, query_multiple_projects
# Module-level logger for the CLI, requested from the project's get_logger
# helper under the name "cli"; main() routes all status and error output
# through it.
logger = get_logger("cli")
def _add_query_options(sub_parser: argparse.ArgumentParser) -> None:
    """Register the query/manifest options shared by ``query`` and ``run``."""
    sub_parser.add_argument("--project", help="TCGA project ID (e.g., TCGA-BRCA)")
    sub_parser.add_argument("--projects", nargs="+", help="Multiple TCGA project IDs")
    sub_parser.add_argument("--data-type", required=True)
    sub_parser.add_argument("--sample-type", help="Filter by sample type (e.g., 'Primary Tumor')")
    sub_parser.add_argument("--platform", help="Filter by platform (e.g., 'Illumina HiSeq')")
    sub_parser.add_argument("--out", required=True)
    sub_parser.add_argument("--format", choices=["tsv", "json"], default="tsv")


def _add_download_options(sub_parser: argparse.ArgumentParser) -> None:
    """Register the download options shared by ``download`` and ``run``."""
    sub_parser.add_argument("--out-dir", required=True)
    sub_parser.add_argument("--processes", type=int, default=4)
    sub_parser.add_argument("--retries", type=int, default=3)
    sub_parser.add_argument("--token", help="Authentication token for controlled-access data")


def build_parser() -> argparse.ArgumentParser:
    """Build the ``tcga-downloader`` argument parser.

    The parser has two global options (``--verbose``/``-v`` and ``--log-file``)
    and a required subcommand stored in ``args.command``: ``query``,
    ``download``, ``run``, ``config``, ``validate`` or ``interactive``.

    Returns:
        The fully configured top-level parser.
    """
    parser = argparse.ArgumentParser(prog="tcga-downloader")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")
    parser.add_argument("--log-file", type=str, help="Log file path")
    sub = parser.add_subparsers(dest="command", required=True)

    # query: build a manifest only.
    _add_query_options(sub.add_parser("query"))

    # download: fetch the files listed in an existing manifest.
    download = sub.add_parser("download")
    download.add_argument("--manifest", required=True)
    _add_download_options(download)

    # run: query followed by download, so it takes both option groups.
    run = sub.add_parser("run")
    _add_query_options(run)
    _add_download_options(run)

    config = sub.add_parser("config")
    config.add_argument("--init", type=str, help="Generate default config file at path")
    config.add_argument("--file", type=str, help="Use config file for other commands")

    validate = sub.add_parser("validate")
    validate.add_argument("--manifest", required=True, help="Manifest file to validate")
    validate.add_argument("--data-dir", help="Validate downloaded files against manifest")

    # interactive: spelled out separately because its --out/--out-dir carry
    # their own help text and it has no --processes/--retries options.
    interactive = sub.add_parser("interactive")
    interactive.add_argument("--out", required=True, help="Output manifest file path")
    interactive.add_argument("--format", choices=["tsv", "json"], default="tsv")
    interactive.add_argument("--out-dir", required=True, help="Download directory")
    interactive.add_argument("--token", help="Authentication token for controlled-access data")

    return parser
def _records_from_hits(hits: list[dict]) -> list[ManifestRecord]:
    """Convert query hits into manifest records, preserving their order.

    Each hit must provide the keys ``file_id``, ``file_name``, ``data_type``,
    ``data_format``, ``file_size`` and ``md5sum``; a missing key raises
    ``KeyError``. ``file_size`` is coerced with ``int()``.
    """
    return [
        ManifestRecord(
            file_id=hit["file_id"],
            file_name=hit["file_name"],
            data_type=hit["data_type"],
            data_format=hit["data_format"],
            size=int(hit["file_size"]),
            md5=hit["md5sum"],
        )
        for hit in hits
    ]
def _write_manifest_and_log_stats(hits: list[dict], out: Path, fmt: str) -> None:
    """Convert query hits to records, write them to *out*, and log the manifest statistics."""
    records = _records_from_hits(hits)
    write_manifest(records, out, fmt=fmt)
    stats = get_manifest_stats(records)
    logger.info("Manifest statistics:\n%s", format_manifest_stats(stats))


def _run_validate(args: argparse.Namespace) -> None:
    """Validate the manifest named by ``args.manifest`` and, optionally, the files in ``args.data_dir``.

    Manifest errors are logged and end the check early; the downloaded files
    are only checked when the manifest itself is valid and a data directory
    was supplied.
    """
    records = load_manifest(Path(args.manifest))
    errors = validate_manifest(records)
    if errors:
        logger.error("Manifest validation errors:")
        for error in errors:
            logger.error(" - %s", error)
        return
    logger.info("Manifest is valid (%d records)", len(records))

    data_dir = getattr(args, "data_dir", None)
    if not data_dir:
        return

    missing, checksum_errors = validate_files_against_manifest(records, Path(data_dir))
    if missing:
        logger.warning("Missing files (%d):", len(missing))
        for filename in missing:
            logger.warning(" - %s", filename)
    if checksum_errors:
        logger.error("Checksum errors (%d):", len(checksum_errors))
        for error in checksum_errors:
            logger.error(" - %s", error)
    if not missing and not checksum_errors:
        logger.info("All files validated successfully!")


def main() -> None:
    """Entry point: parse the command line and dispatch on ``args.command``.

    * ``config``      - write a default config file when ``--init`` is given,
      otherwise print the help text.
    * ``validate``    - check a manifest and, optionally, downloaded files.
    * ``query``       - query for files and write a manifest.
    * ``download``    - download the files listed in a manifest.
    * ``run``         - ``query`` followed by a download of the manifest just written.
    * ``interactive`` - choose project and data type interactively, write the
      manifest, then download it.

    Manifest-validation and config-loading failures are logged and the
    function returns normally; it never returns a value.
    """
    parser = build_parser()
    args = parser.parse_args()

    if args.command == "config":
        init_path = getattr(args, "init", None)
        if init_path:
            # Imported lazily: only this branch needs it.
            from tcga_downloader.config import save_default_config

            save_default_config(Path(init_path))
        else:
            parser.print_help()
        return

    if args.command == "validate":
        setup_logging(verbose=args.verbose, log_file=args.log_file)
        _run_validate(args)
        return

    # NOTE(review): build_parser registers --file only on the "config"
    # subcommand, which has already returned above, so this config-loading
    # branch looks unreachable at present - confirm whether --file was meant
    # to be a global option.
    config = None
    config_file = getattr(args, "file", None)
    if config_file:
        try:
            config = load_config(Path(config_file))
        except Exception as e:
            logger.error("Failed to load config: %s", e)
            return
        setup_logging(
            verbose=config.verbose or args.verbose, log_file=config.log_file or args.log_file
        )
    else:
        setup_logging(verbose=args.verbose, log_file=args.log_file)

    if args.command in {"query", "run"}:
        project = args.project
        projects = getattr(args, "projects", None)
        data_type = args.data_type
        sample_type = getattr(args, "sample_type", None)
        platform = getattr(args, "platform", None)
        # Config values only fill in what the command line left unset, and
        # only for the plain "query" command.
        if config and args.command == "query":
            project = project or config.query.project
            data_type = data_type or config.query.data_type
            sample_type = sample_type or config.query.sample_type
            platform = platform or config.query.platform
        # --projects takes precedence over --project when both are given.
        if projects:
            hits = query_multiple_projects(
                projects, data_type, sample_type=sample_type, platform=platform
            )
        else:
            hits = query_files(project, data_type, sample_type=sample_type, platform=platform)
        _write_manifest_and_log_stats(hits, Path(args.out), getattr(args, "format", "tsv"))

    if args.command == "interactive":
        # Logging is already configured by the if/else above; the redundant
        # second setup_logging() call that used to sit here was removed.
        project_id, data_type = interactive_select()
        hits = query_files(project_id, data_type)
        _write_manifest_and_log_stats(hits, Path(args.out), args.format)
        run_gdc_download(Path(args.out), Path(args.out_dir), token=getattr(args, "token", None))

    if args.command in {"download", "run"}:
        # "run" has no --manifest option; it downloads the manifest that the
        # query step above wrote to --out.
        manifest = getattr(args, "manifest", None)
        out_dir = args.out_dir
        processes = args.processes
        retries = args.retries
        token = getattr(args, "token", None)
        if config:
            if args.command == "download":
                out_dir = out_dir or config.download.out_dir
            else:
                # NOTE(review): for "run" the config value replaces the
                # command-line --out-dir outright - confirm this is intended.
                out_dir = config.download.out_dir
            processes = processes or config.download.processes
            retries = retries or config.download.retries
            token = token or config.download.token
        manifest_path = Path(manifest if args.command == "download" else args.out)
        run_gdc_download(manifest_path, Path(out_dir), processes, retries, token)