# tcga-downloader/tcga_downloader/cli.py

from __future__ import annotations

import argparse
from pathlib import Path

from tcga_downloader.config import load_config
from tcga_downloader.download import run_gdc_download
from tcga_downloader.interactive import interactive_select
from tcga_downloader.logger import get_logger, setup_logging
from tcga_downloader.manifest import (
    ManifestRecord,
    format_manifest_stats,
    get_manifest_stats,
    load_manifest,
    validate_files_against_manifest,
    validate_manifest,
    write_manifest,
)
from tcga_downloader.query import query_files, query_multiple_projects

logger = get_logger("cli")


def _add_query_arguments(cmd: argparse.ArgumentParser) -> None:
    """Register the manifest-query options shared by ``query`` and ``run``."""
    cmd.add_argument("--project", help="TCGA project ID (e.g., TCGA-BRCA)")
    cmd.add_argument("--projects", nargs="+", help="Multiple TCGA project IDs")
    cmd.add_argument("--data-type", required=True)
    cmd.add_argument("--sample-type", help="Filter by sample type (e.g., 'Primary Tumor')")
    cmd.add_argument("--platform", help="Filter by platform (e.g., 'Illumina HiSeq')")
    cmd.add_argument("--out", required=True)
    cmd.add_argument("--format", choices=["tsv", "json"], default="tsv")


def _add_download_arguments(cmd: argparse.ArgumentParser) -> None:
    """Register the download options shared by ``download`` and ``run``."""
    cmd.add_argument("--out-dir", required=True)
    cmd.add_argument("--processes", type=int, default=4)
    cmd.add_argument("--retries", type=int, default=3)
    cmd.add_argument("--token", help="Authentication token for controlled-access data")


def build_parser() -> argparse.ArgumentParser:
    """Build the ``tcga-downloader`` command-line parser.

    Top-level options (``--verbose``, ``--log-file``, ``--config-file``) must be
    given before the subcommand name. The subcommands are ``query``,
    ``download``, ``run``, ``config``, ``validate`` and ``interactive``; the
    chosen one is stored in ``args.command``.

    Returns:
        The fully configured parser.
    """
    parser = argparse.ArgumentParser(prog="tcga-downloader")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")
    parser.add_argument("--log-file", type=str, help="Log file path")
    # Stored as ``args.file`` because that is the attribute ``main`` inspects
    # when deciding whether to load a config. Previously only the ``config``
    # subcommand defined ``--file``, so no other command could ever carry it.
    parser.add_argument(
        "--config-file",
        dest="file",
        type=str,
        help="Config file providing defaults for the query/download/run commands",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    _add_query_arguments(sub.add_parser("query"))

    download_cmd = sub.add_parser("download")
    download_cmd.add_argument("--manifest", required=True)
    _add_download_arguments(download_cmd)

    # ``run`` is query + download, so it accepts both option sets
    # (query options first, matching the original help ordering).
    run_cmd = sub.add_parser("run")
    _add_query_arguments(run_cmd)
    _add_download_arguments(run_cmd)

    config_cmd = sub.add_parser("config")
    config_cmd.add_argument("--init", type=str, help="Generate default config file at path")
    config_cmd.add_argument("--file", type=str, help="Use config file for other commands")

    validate_cmd = sub.add_parser("validate")
    validate_cmd.add_argument("--manifest", required=True, help="Manifest file to validate")
    validate_cmd.add_argument("--data-dir", help="Validate downloaded files against manifest")

    interactive_cmd = sub.add_parser("interactive")
    interactive_cmd.add_argument("--out", required=True, help="Output manifest file path")
    interactive_cmd.add_argument("--format", choices=["tsv", "json"], default="tsv")
    interactive_cmd.add_argument("--out-dir", required=True, help="Download directory")
    interactive_cmd.add_argument(
        "--token", help="Authentication token for controlled-access data"
    )

    return parser


def _records_from_hits(hits: list[dict]) -> list[ManifestRecord]:
    """Convert query hits into manifest records, preserving their order.

    Every hit must contain the keys ``file_id``, ``file_name``, ``data_type``,
    ``data_format``, ``file_size`` and ``md5sum``; a missing key raises
    ``KeyError``. ``file_size`` is converted with ``int()`` before being
    stored as the record's ``size``.
    """
    return [
        ManifestRecord(
            file_id=hit["file_id"],
            file_name=hit["file_name"],
            data_type=hit["data_type"],
            data_format=hit["data_format"],
            size=int(hit["file_size"]),
            md5=hit["md5sum"],
        )
        for hit in hits
    ]


def _handle_config_command(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
    """Serve the ``config`` subcommand.

    Calls ``save_default_config`` with the ``--init`` path when one was given;
    otherwise prints the top-level help.
    """
    if args.init:
        # Imported here rather than at module level, as the original code did.
        from tcga_downloader.config import save_default_config

        save_default_config(Path(args.init))
    else:
        parser.print_help()


def _handle_validate_command(args: argparse.Namespace) -> None:
    """Serve the ``validate`` subcommand.

    Loads the manifest and logs any validation errors. When the manifest is
    valid and ``--data-dir`` was given, the files in that directory are also
    checked against the manifest and missing files / checksum errors are logged.
    Problems are only logged; the function always returns normally.
    """
    records = load_manifest(Path(args.manifest))

    errors = validate_manifest(records)
    if errors:
        logger.error("Manifest validation errors:")
        for error in errors:
            logger.error("  - %s", error)
        return

    logger.info("Manifest is valid (%d records)", len(records))

    if not args.data_dir:
        return

    missing, checksum_errors = validate_files_against_manifest(records, Path(args.data_dir))

    if missing:
        logger.warning("Missing files (%d):", len(missing))
        for filename in missing:
            logger.warning("  - %s", filename)

    if checksum_errors:
        logger.error("Checksum errors (%d):", len(checksum_errors))
        for error in checksum_errors:
            logger.error("  - %s", error)

    if not missing and not checksum_errors:
        logger.info("All files validated successfully!")


def _write_manifest_from_hits(hits: list[dict], out_path: Path, fmt: str) -> None:
    """Turn query hits into records, write the manifest and log its statistics."""
    records = _records_from_hits(hits)
    write_manifest(records, out_path, fmt=fmt)
    stats = get_manifest_stats(records)
    logger.info("Manifest statistics:\n%s", format_manifest_stats(stats))


def _handle_query(args: argparse.Namespace, config, parser: argparse.ArgumentParser) -> None:
    """Run the query step of ``query`` / ``run`` and write the manifest to ``--out``.

    For the ``query`` command only, values missing on the command line are
    filled in from ``config.query`` when a config was loaded. ``--projects``
    takes precedence over ``--project`` when both are present.
    """
    project = args.project
    projects = args.projects
    data_type = args.data_type
    sample_type = args.sample_type
    platform = args.platform

    if config and args.command == "query":
        project = project or config.query.project
        data_type = data_type or config.query.data_type
        sample_type = sample_type or config.query.sample_type
        platform = platform or config.query.platform

    if not projects and not project:
        # Neither option is required by the parser, so without this check the
        # query would be issued with project=None.
        parser.error("a project is required: use --project or --projects")

    if projects:
        hits = query_multiple_projects(
            projects, data_type, sample_type=sample_type, platform=platform
        )
    else:
        hits = query_files(project, data_type, sample_type=sample_type, platform=platform)

    _write_manifest_from_hits(hits, Path(args.out), args.format)


def _handle_interactive(args: argparse.Namespace) -> None:
    """Serve the ``interactive`` subcommand: select, query, write manifest, download."""
    project_id, data_type = interactive_select()

    hits = query_files(project_id, data_type)
    _write_manifest_from_hits(hits, Path(args.out), args.format)

    run_gdc_download(Path(args.out), Path(args.out_dir), token=args.token)


def _handle_download(args: argparse.Namespace, config) -> None:
    """Run the download step of ``download`` / ``run``.

    ``download`` reads the manifest from ``--manifest``; ``run`` reuses the
    manifest that the query step wrote to ``--out``. When a config was loaded
    it supplies fallbacks for unset/falsy command-line values.
    """
    out_dir = args.out_dir
    processes = args.processes
    retries = args.retries
    token = args.token

    if config:
        if args.command == "download":
            out_dir = out_dir or config.download.out_dir
        else:
            # For ``run`` the config's directory wins (original behavior), but
            # fall back to --out-dir so an unset config value cannot reach Path().
            out_dir = config.download.out_dir or out_dir
        processes = processes or config.download.processes
        retries = retries or config.download.retries
        token = token or config.download.token

    manifest = args.manifest if args.command == "download" else args.out
    run_gdc_download(Path(manifest), Path(out_dir), processes, retries, token)


def main() -> None:
    """Entry point for the ``tcga-downloader`` command line.

    Parses ``sys.argv`` and dispatches on the selected subcommand:

    * ``config`` and ``validate`` are handled on their own and return early.
    * Otherwise an optional config is loaded (when the parsed arguments carry
      a non-empty ``file`` attribute), logging is configured once, and then the
      query step (``query``/``run``), the interactive flow (``interactive``)
      and/or the download step (``download``/``run``) are executed.

    A config file that fails to load is logged and the function returns
    without doing any work. Invalid command-line usage terminates through
    argparse (``SystemExit``).
    """
    parser = build_parser()
    args = parser.parse_args()

    if args.command == "config":
        _handle_config_command(args, parser)
        return

    if args.command == "validate":
        setup_logging(verbose=args.verbose, log_file=args.log_file)
        _handle_validate_command(args)
        return

    config = None
    # ``file`` may be absent from the namespace depending on which options the
    # parser defines for this command, hence getattr with a default.
    config_path = getattr(args, "file", None)
    if config_path:
        try:
            config = load_config(Path(config_path))
        except Exception as e:
            logger.error("Failed to load config: %s", e)
            return

        setup_logging(
            verbose=config.verbose or args.verbose, log_file=config.log_file or args.log_file
        )
    else:
        setup_logging(verbose=args.verbose, log_file=args.log_file)

    if args.command in {"query", "run"}:
        _handle_query(args, config, parser)

    if args.command == "interactive":
        # Logging is already configured above; the original configured it a
        # second time here with the same arguments.
        _handle_interactive(args)

    if args.command in {"download", "run"}:
        _handle_download(args, config)