tcga-downloader/tcga_downloader/interactive.py
yunpeng.zhang a01a59b371
Some checks failed
CI / Lint (push) Failing after 9m32s
CI / Test (3.11) (push) Successful in 6m41s
CI / Test (3.12) (push) Successful in 4m21s
feat: add interactive cli
2026-02-09 13:13:39 +08:00

166 lines
4.4 KiB
Python

from __future__ import annotations
from typing import TYPE_CHECKING
import questionary
import requests
if TYPE_CHECKING:
pass
from tcga_downloader.logger import get_logger
# Module-level logger, obtained from the package's get_logger helper under the
# name "interactive".
logger = get_logger("interactive")
# GDC API endpoints used by this module: one is queried with GET to list
# projects, the other with POST to aggregate file data types for a project.
GDC_PROJECTS_URL = "https://api.gdc.cancer.gov/projects"
GDC_FILES_URL = "https://api.gdc.cancer.gov/files"
def fetch_projects(size: int = 1000) -> list[dict]:
    """
    Retrieve project records from the GDC projects endpoint.

    Args:
        size: Value sent as the "size" query parameter of the request
    Returns:
        The list found under "data" -> "hits" in the JSON response, or an
        empty list when either key is absent.
    Raises:
        requests.exceptions.RequestException: Logged and re-raised when the
            request, the HTTP status check, or the JSON decoding fails.
    """
    logger.info("Fetching TCGA projects...")
    try:
        response = requests.get(GDC_PROJECTS_URL, params={"size": size}, timeout=30)
        response.raise_for_status()
        body = response.json()
    except requests.exceptions.RequestException as e:
        logger.error("Failed to fetch TCGA projects: %s", e)
        raise

    # Missing keys degrade to an empty result instead of raising.
    hits = body.get("data", {}).get("hits", [])
    logger.info("Fetched %d projects", len(hits))
    return hits
def fetch_data_types(project_id: str, size: int = 1000) -> list[str]:
    """
    List the data types present for one project via a facet query on the
    GDC files endpoint.

    Args:
        project_id: Project ID to filter on (e.g., TCGA-BRCA)
        size: Upper bound on the number of names returned (applied as a
            slice on the collected list, not sent to the API)
    Returns:
        The non-empty "key" values of the "data_type" aggregation buckets,
        in response order, truncated to ``size`` entries.
    Raises:
        requests.exceptions.RequestException: Logged and re-raised when the
            request, the HTTP status check, or the JSON decoding fails.
    """
    logger.info("Fetching data types for project %s...", project_id)

    project_clause = {
        "op": "in",
        "content": {"field": "cases.project.project_id", "value": [project_id]},
    }
    request_body = {
        "filters": {"op": "and", "content": [project_clause]},
        # Only the aggregation is needed, so no file hits are requested.
        "size": 0,
        "facets": "data_type",
        "format": "JSON",
    }

    try:
        response = requests.post(GDC_FILES_URL, json=request_body, timeout=30)
        response.raise_for_status()
        body = response.json()
    except requests.exceptions.RequestException as e:
        logger.error("Failed to fetch data types for project %s: %s", project_id, e)
        raise

    facet = body.get("data", {}).get("aggregations", {}).get("data_type", {})
    names = []
    for bucket in facet.get("buckets", []):
        key = bucket.get("key")
        if key:  # skip buckets with a missing or empty key
            names.append(key)

    logger.info("Found %d data types", len(names))
    return names[:size]
def select_project(projects: list[dict]) -> dict:
    """
    Interactive selection of project from list.

    Each project is offered as "<project_id> - <name>". The chosen label is
    mapped back to its project by exact match, so a project whose ID happens
    to be a prefix of another project's ID can never be confused with it.

    Args:
        projects: List of available projects; every entry must contain a
            "project_id" key ("name" is optional and shown as "Unknown"
            when absent)
    Returns:
        Selected project dictionary
    Raises:
        SystemExit: With code 0 when the prompt returns no answer; with
            code 1 when there are no projects to choose from or the answer
            does not correspond to any offered label.
    """
    # Mirror select_data_type: there is nothing to prompt for with an empty list.
    if not projects:
        logger.warning("No projects available")
        raise SystemExit(1)

    # Label -> project lookup. setdefault keeps the first project for a
    # repeated label, and dict insertion order preserves the menu order.
    by_label: dict[str, dict] = {}
    for project in projects:
        label = f"{project['project_id']} - {project.get('name', 'Unknown')}"
        by_label.setdefault(label, project)

    answer = questionary.select(
        message="Select a TCGA project:",
        choices=list(by_label),
    ).ask()

    if answer is None:
        logger.info("No project selected")
        raise SystemExit(0)

    selected = by_label.get(answer)
    if selected is None:
        logger.warning("Could not parse selection: %s", answer)
        raise SystemExit(1)
    return selected
def select_data_type(data_types: list[str]) -> str:
    """
    Prompt the user to pick one entry from the given data types.

    Args:
        data_types: Data type names to offer as choices
    Returns:
        The entry the user picked
    Raises:
        SystemExit: With code 1 when ``data_types`` is empty; with code 0
            when the prompt returns no answer.
    """
    if not data_types:
        logger.warning("No data types available")
        raise SystemExit(1)

    prompt = questionary.select(
        message="Select a data type:",
        choices=data_types,
    )
    chosen = prompt.ask()

    if chosen is not None:
        return chosen

    # The prompt returned no answer.
    logger.info("No data type selected")
    raise SystemExit(0)
def interactive_select() -> tuple[str, str]:
    """
    Drive the two-step interactive flow: choose a project, then choose one
    of that project's data types.

    Prints a banner before prompting and a summary of both choices after.

    Returns:
        Tuple of (project_id, data_type)
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TCGA Downloader - Interactive Mode")
    print(rule + "\n")

    project = select_project(fetch_projects())
    project_id = project["project_id"]
    data_type = select_data_type(fetch_data_types(project_id))

    # Fall back to the ID when the project record carries no name.
    print(f"\nSelected project: {project.get('name', project_id)}")
    print(f"Selected data type: {data_type}\n")
    return project_id, data_type