210 lines
8.0 KiB
Python
210 lines
8.0 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
from tcga_downloader.config import load_config
|
|
from tcga_downloader.download import run_gdc_download
|
|
from tcga_downloader.interactive import interactive_select
|
|
from tcga_downloader.logger import get_logger, setup_logging
|
|
from tcga_downloader.manifest import (
|
|
ManifestRecord,
|
|
format_manifest_stats,
|
|
get_manifest_stats,
|
|
load_manifest,
|
|
validate_files_against_manifest,
|
|
validate_manifest,
|
|
write_manifest,
|
|
)
|
|
from tcga_downloader.query import query_files, query_multiple_projects
|
|
|
|
logger = get_logger("cli")
|
|
|
|
|
|
def _add_query_filters(cmd: argparse.ArgumentParser) -> None:
    """Add the project/data-type filter options shared by ``query`` and ``run``."""
    cmd.add_argument("--project", help="TCGA project ID (e.g., TCGA-BRCA)")
    cmd.add_argument("--projects", nargs="+", help="Multiple TCGA project IDs")
    cmd.add_argument("--data-type", required=True)
    cmd.add_argument("--sample-type", help="Filter by sample type (e.g., 'Primary Tumor')")
    cmd.add_argument("--platform", help="Filter by platform (e.g., 'Illumina HiSeq')")


def _add_manifest_output(cmd: argparse.ArgumentParser) -> None:
    """Add the ``--out`` / ``--format`` pair used by every manifest-writing command."""
    cmd.add_argument("--out", required=True, help="Output manifest file path")
    cmd.add_argument("--format", choices=["tsv", "json"], default="tsv")


def _add_download_options(cmd: argparse.ArgumentParser, *, tuning: bool = True) -> None:
    """Add the download options; ``tuning`` controls ``--processes`` / ``--retries``."""
    cmd.add_argument("--out-dir", required=True, help="Download directory")
    if tuning:
        cmd.add_argument("--processes", type=int, default=4)
        cmd.add_argument("--retries", type=int, default=3)
    cmd.add_argument("--token", help="Authentication token for controlled-access data")


def build_parser() -> argparse.ArgumentParser:
    """Build the ``tcga-downloader`` argument parser.

    The parser has two global options (``--verbose``/``-v`` and ``--log-file``)
    and a required sub-command stored in ``args.command``:

    * ``query``       -- filter options plus ``--out`` / ``--format``
    * ``download``    -- ``--manifest`` plus the download options
    * ``run``         -- the union of the ``query`` and download options
    * ``config``      -- ``--init`` and ``--file``
    * ``validate``    -- ``--manifest`` and optional ``--data-dir``
    * ``interactive`` -- ``--out`` / ``--format`` / ``--out-dir`` / ``--token``

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(prog="tcga-downloader")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")
    parser.add_argument("--log-file", type=str, help="Log file path")
    sub = parser.add_subparsers(dest="command", required=True)

    query_cmd = sub.add_parser("query")
    _add_query_filters(query_cmd)
    _add_manifest_output(query_cmd)

    download_cmd = sub.add_parser("download")
    download_cmd.add_argument("--manifest", required=True)
    _add_download_options(download_cmd)

    # "run" is query followed by download, so it accepts both option sets.
    run_cmd = sub.add_parser("run")
    _add_query_filters(run_cmd)
    _add_manifest_output(run_cmd)
    _add_download_options(run_cmd)

    config_cmd = sub.add_parser("config")
    config_cmd.add_argument("--init", type=str, help="Generate default config file at path")
    config_cmd.add_argument("--file", type=str, help="Use config file for other commands")

    validate_cmd = sub.add_parser("validate")
    validate_cmd.add_argument("--manifest", required=True, help="Manifest file to validate")
    validate_cmd.add_argument("--data-dir", help="Validate downloaded files against manifest")

    # The interactive command exposes no --processes / --retries options.
    interactive_cmd = sub.add_parser("interactive")
    _add_manifest_output(interactive_cmd)
    _add_download_options(interactive_cmd, tuning=False)

    return parser
|
|
|
|
|
|
def _records_from_hits(hits: list[dict]) -> list[ManifestRecord]:
    """Build one ``ManifestRecord`` per hit dictionary, preserving order.

    Every hit must provide the keys ``file_id``, ``file_name``, ``data_type``,
    ``data_format``, ``file_size`` and ``md5sum``. ``file_size`` is passed
    through ``int()`` and stored as ``size``; ``md5sum`` is stored as ``md5``.
    """
    return [
        ManifestRecord(
            file_id=hit["file_id"],
            file_name=hit["file_name"],
            data_type=hit["data_type"],
            data_format=hit["data_format"],
            size=int(hit["file_size"]),
            md5=hit["md5sum"],
        )
        for hit in hits
    ]
|
|
|
|
|
|
def _emit_manifest(hits: list[dict], out: str, fmt: str) -> None:
    """Convert query hits to records, write the manifest to *out*, log its stats."""
    records = _records_from_hits(hits)
    write_manifest(records, Path(out), fmt=fmt)
    stats = get_manifest_stats(records)
    logger.info("Manifest statistics:\n%s", format_manifest_stats(stats))


def _validate_command(args: argparse.Namespace) -> None:
    """Validate the manifest and, when ``--data-dir`` is given, the files in it."""
    records = load_manifest(Path(args.manifest))

    errors = validate_manifest(records)
    if errors:
        logger.error("Manifest validation errors:")
        for error in errors:
            logger.error(" - %s", error)
        return

    logger.info("Manifest is valid (%d records)", len(records))

    data_dir = getattr(args, "data_dir", None)
    if not data_dir:
        return

    missing, checksum_errors = validate_files_against_manifest(records, Path(data_dir))

    if missing:
        logger.warning("Missing files (%d):", len(missing))
        for filename in missing:
            logger.warning(" - %s", filename)

    if checksum_errors:
        logger.error("Checksum errors (%d):", len(checksum_errors))
        for error in checksum_errors:
            logger.error(" - %s", error)

    if not missing and not checksum_errors:
        logger.info("All files validated successfully!")


def _query_command(parser: argparse.ArgumentParser, args: argparse.Namespace, config) -> None:
    """Run the query step of ``query`` / ``run`` and write the manifest.

    ``config`` is the loaded configuration object or ``None``. Its ``query``
    section only fills in values missing from the command line, and only for
    the ``query`` command.
    """
    project = args.project
    projects = getattr(args, "projects", None)
    data_type = args.data_type
    sample_type = getattr(args, "sample_type", None)
    platform = getattr(args, "platform", None)

    if config and args.command == "query":
        project = project or config.query.project
        data_type = data_type or config.query.data_type
        sample_type = sample_type or config.query.sample_type
        platform = platform or config.query.platform

    if projects:
        hits = query_multiple_projects(
            projects, data_type, sample_type=sample_type, platform=platform
        )
    elif project:
        hits = query_files(project, data_type, sample_type=sample_type, platform=platform)
    else:
        # Both options are optional in the parser; fail with a usage error
        # instead of querying with project=None. parser.error() exits (status 2).
        parser.error("one of --project or --projects is required")

    _emit_manifest(hits, args.out, getattr(args, "format", "tsv"))


def _download_command(args: argparse.Namespace, config) -> None:
    """Run the download step of ``download`` / ``run``.

    ``download`` reads the manifest named by ``--manifest``; ``run`` downloads
    the manifest it has just written to ``--out``. When a config is loaded,
    its ``download`` section is used only for values the command line left
    falsy, so an explicit ``--out-dir`` is never overridden.
    """
    out_dir = args.out_dir
    processes = args.processes
    retries = args.retries
    token = getattr(args, "token", None)

    if config:
        out_dir = out_dir or config.download.out_dir
        processes = processes or config.download.processes
        retries = retries or config.download.retries
        token = token or config.download.token

    manifest = args.manifest if args.command == "download" else args.out
    run_gdc_download(Path(manifest), Path(out_dir), processes, retries, token)


def main() -> None:
    """Command-line entry point: parse arguments and dispatch on the sub-command.

    * ``config``      -- write a default config when ``--init`` is given,
      otherwise print the top-level help.
    * ``validate``    -- check a manifest (and optionally a data directory).
    * ``query``       -- query files and write a manifest.
    * ``download``    -- download the files listed in a manifest.
    * ``run``         -- ``query`` followed by ``download``.
    * ``interactive`` -- prompt for project/data type, write a manifest and
      download it.

    Logging is configured exactly once per invocation, except for ``config``,
    which does not configure logging.
    """
    parser = build_parser()
    args = parser.parse_args()

    if args.command == "config":
        init_path = getattr(args, "init", None)
        if init_path:
            # Imported lazily, only on the path that needs it.
            from tcga_downloader.config import save_default_config

            save_default_config(Path(init_path))
        else:
            parser.print_help()
        return

    if args.command == "validate":
        setup_logging(verbose=args.verbose, log_file=args.log_file)
        _validate_command(args)
        return

    # NOTE(review): as build_parser is currently written, only the "config"
    # sub-command defines --file, and that command has already returned above,
    # so this lookup yields None for every command that reaches this point.
    config = None
    config_file = getattr(args, "file", None)
    if config_file:
        try:
            config = load_config(Path(config_file))
        except Exception as e:
            logger.error("Failed to load config: %s", e)
            return
        setup_logging(
            verbose=config.verbose or args.verbose, log_file=config.log_file or args.log_file
        )
    else:
        setup_logging(verbose=args.verbose, log_file=args.log_file)

    if args.command in {"query", "run"}:
        _query_command(parser, args, config)

    if args.command == "interactive":
        # Logging was already configured above; it is not set up a second time.
        project_id, data_type = interactive_select()
        _emit_manifest(query_files(project_id, data_type), args.out, args.format)
        run_gdc_download(Path(args.out), Path(args.out_dir), token=getattr(args, "token", None))

    if args.command in {"download", "run"}:
        _download_command(args, config)
|