from __future__ import annotations import argparse from pathlib import Path from tcga_downloader.config import load_config from tcga_downloader.download import run_gdc_download from tcga_downloader.interactive import interactive_select from tcga_downloader.logger import get_logger, setup_logging from tcga_downloader.manifest import ( ManifestRecord, format_manifest_stats, get_manifest_stats, load_manifest, validate_files_against_manifest, validate_manifest, write_manifest, ) from tcga_downloader.query import query_files, query_multiple_projects logger = get_logger("cli") def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="tcga-downloader") parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") parser.add_argument("--log-file", type=str, help="Log file path") sub = parser.add_subparsers(dest="command", required=True) q = sub.add_parser("query") q.add_argument("--project", help="TCGA project ID (e.g., TCGA-BRCA)") q.add_argument("--projects", nargs="+", help="Multiple TCGA project IDs") q.add_argument("--data-type", required=True) q.add_argument("--sample-type", help="Filter by sample type (e.g., 'Primary Tumor')") q.add_argument("--platform", help="Filter by platform (e.g., 'Illumina HiSeq')") q.add_argument("--out", required=True) q.add_argument("--format", choices=["tsv", "json"], default="tsv") d = sub.add_parser("download") d.add_argument("--manifest", required=True) d.add_argument("--out-dir", required=True) d.add_argument("--processes", type=int, default=4) d.add_argument("--retries", type=int, default=3) d.add_argument("--token", help="Authentication token for controlled-access data") r = sub.add_parser("run") r.add_argument("--project", help="TCGA project ID (e.g., TCGA-BRCA)") r.add_argument("--projects", nargs="+", help="Multiple TCGA project IDs") r.add_argument("--data-type", required=True) r.add_argument("--sample-type", help="Filter by sample type (e.g., 'Primary Tumor')") r.add_argument("--platform", help="Filter by platform (e.g., 'Illumina HiSeq')") r.add_argument("--out", required=True) r.add_argument("--format", choices=["tsv", "json"], default="tsv") r.add_argument("--out-dir", required=True) r.add_argument("--processes", type=int, default=4) r.add_argument("--retries", type=int, default=3) r.add_argument("--token", help="Authentication token for controlled-access data") c = sub.add_parser("config") c.add_argument("--init", type=str, help="Generate default config file at path") c.add_argument("--file", type=str, help="Use config file for other commands") v = sub.add_parser("validate") v.add_argument("--manifest", required=True, help="Manifest file to validate") v.add_argument("--data-dir", help="Validate downloaded files against manifest") i = sub.add_parser("interactive") i.add_argument("--out", required=True, help="Output manifest file path") i.add_argument("--format", choices=["tsv", "json"], default="tsv") i.add_argument("--out-dir", required=True, help="Download directory") i.add_argument("--token", help="Authentication token for controlled-access data") return parser def _records_from_hits(hits: list[dict]) -> list[ManifestRecord]: records = [] for h in hits: records.append( ManifestRecord( file_id=h["file_id"], file_name=h["file_name"], data_type=h["data_type"], data_format=h["data_format"], size=int(h["file_size"]), md5=h["md5sum"], ) ) return records def main() -> None: parser = build_parser() args = parser.parse_args() config = None if args.command == "config": if getattr(args, "init", None): from tcga_downloader.config import save_default_config save_default_config(Path(args.init)) return else: parser.print_help() return if args.command == "validate": setup_logging(verbose=args.verbose, log_file=args.log_file) records = load_manifest(Path(args.manifest)) errors = validate_manifest(records) if errors: logger.error("Manifest validation errors:") for error in errors: logger.error(" - %s", error) return logger.info("Manifest is valid (%d records)", len(records)) if hasattr(args, "data_dir") and args.data_dir: data_dir = Path(args.data_dir) missing, checksum_errors = validate_files_against_manifest(records, data_dir) if missing: logger.warning("Missing files (%d):", len(missing)) for filename in missing: logger.warning(" - %s", filename) if checksum_errors: logger.error("Checksum errors (%d):", len(checksum_errors)) for error in checksum_errors: logger.error(" - %s", error) if not missing and not checksum_errors: logger.info("All files validated successfully!") return if args.command != "config" and hasattr(args, "file") and args.file: try: config = load_config(Path(args.file)) except Exception as e: logger.error("Failed to load config: %s", e) return setup_logging( verbose=config.verbose or args.verbose, log_file=config.log_file or args.log_file ) else: setup_logging(verbose=args.verbose, log_file=args.log_file) if args.command in {"query", "run"}: project = args.project projects = getattr(args, "projects", None) data_type = args.data_type sample_type = getattr(args, "sample_type", None) platform = getattr(args, "platform", None) out = args.out fmt = getattr(args, "format", "tsv") if config and args.command == "query": project = project or config.query.project data_type = data_type or config.query.data_type sample_type = sample_type or config.query.sample_type platform = platform or config.query.platform if projects: hits = query_multiple_projects( projects, data_type, sample_type=sample_type, platform=platform ) else: hits = query_files(project, data_type, sample_type=sample_type, platform=platform) records = _records_from_hits(hits) write_manifest(records, Path(out), fmt=fmt) stats = get_manifest_stats(records) logger.info("Manifest statistics:\n%s", format_manifest_stats(stats)) if args.command == "interactive": setup_logging(verbose=args.verbose, log_file=args.log_file) project_id, data_type = interactive_select() hits = query_files(project_id, data_type) records = _records_from_hits(hits) write_manifest(records, Path(args.out), fmt=args.format) stats = get_manifest_stats(records) logger.info("Manifest statistics:\n%s", format_manifest_stats(stats)) run_gdc_download(Path(args.out), Path(args.out_dir), token=getattr(args, "token", None)) if args.command in {"download", "run"}: manifest = getattr(args, "manifest", None) out_dir = args.out_dir processes = args.processes retries = args.retries token = getattr(args, "token", None) if config: if args.command == "download": out_dir = out_dir or config.download.out_dir else: out_dir = config.download.out_dir processes = processes or config.download.processes retries = retries or config.download.retries token = token or config.download.token manifest_path = Path(manifest if args.command == "download" else args.out) run_gdc_download(manifest_path, Path(out_dir), processes, retries, token)