166 lines
4.4 KiB
Python
166 lines
4.4 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
import questionary
|
|
import requests
|
|
|
|
if TYPE_CHECKING:
|
|
pass
|
|
|
|
from tcga_downloader.logger import get_logger
|
|
|
|
# Module-level logger, obtained from the project's get_logger helper under
# the name "interactive"; used by every function in this module.
logger = get_logger("interactive")
|
|
|
|
# Both endpoints live under the same GDC API host; compose them from one base
# so the host is spelled out only once.
_GDC_API_BASE = "https://api.gdc.cancer.gov"
GDC_PROJECTS_URL = f"{_GDC_API_BASE}/projects"
GDC_FILES_URL = f"{_GDC_API_BASE}/files"
|
|
|
|
|
|
def fetch_projects(size: int = 1000) -> list[dict]:
    """
    Retrieve project records from the GDC projects endpoint.

    Args:
        size: Value sent as the ``size`` query parameter of the request.

    Returns:
        The list stored under ``data -> hits`` in the JSON response, or an
        empty list when either key is absent.

    Raises:
        requests.exceptions.RequestException: If the request, the status
            check, or the JSON decoding fails with a requests error. The
            failure is logged before being re-raised unchanged.
    """
    logger.info("Fetching TCGA projects...")

    try:
        response = requests.get(GDC_PROJECTS_URL, params={"size": size}, timeout=30)
        response.raise_for_status()
        body = response.json()
    except requests.exceptions.RequestException as exc:
        logger.error("Failed to fetch TCGA projects: %s", exc)
        raise

    hits = body.get("data", {}).get("hits", [])
    logger.info("Fetched %d projects", len(hits))
    return hits
|
|
|
|
|
|
def fetch_data_types(project_id: str, size: int = 1000) -> list[str]:
    """
    Look up the data types recorded for one project via a facet query
    against the GDC files endpoint.

    Args:
        project_id: TCGA project ID (e.g., TCGA-BRCA); used as the value of
            the ``cases.project.project_id`` filter.
        size: Maximum number of data type names to return. It is applied as
            a slice on the collected names; the request itself always asks
            for ``"size": 0`` and relies on the ``data_type`` facet.

    Returns:
        The non-empty ``key`` values of the ``data_type`` facet buckets,
        truncated to at most ``size`` entries.

    Raises:
        requests.exceptions.RequestException: If the request, the status
            check, or the JSON decoding fails with a requests error. The
            failure is logged before being re-raised unchanged.
    """
    logger.info("Fetching data types for project %s...", project_id)

    project_filter = {
        "op": "in",
        "content": {"field": "cases.project.project_id", "value": [project_id]},
    }
    request_body = {
        "filters": {"op": "and", "content": [project_filter]},
        "size": 0,
        "facets": "data_type",
        "format": "JSON",
    }

    try:
        response = requests.post(GDC_FILES_URL, json=request_body, timeout=30)
        response.raise_for_status()
        body = response.json()
    except requests.exceptions.RequestException as exc:
        logger.error("Failed to fetch data types for project %s: %s", project_id, exc)
        raise

    facet = body.get("data", {}).get("aggregations", {}).get("data_type", {})

    # Keep only buckets that carry a truthy key.
    names = []
    for bucket in facet.get("buckets", []):
        key = bucket.get("key")
        if key:
            names.append(key)

    logger.info("Found %d data types", len(names))
    return names[:size]
|
|
|
|
|
|
def select_project(projects: list[dict]) -> dict:
    """
    Interactive selection of project from list.

    Each project is offered as the label ``"<project_id> - <name>"`` (with
    ``"Unknown"`` when the project has no ``name``), and the chosen label is
    mapped back to its project dictionary.

    Args:
        projects: List of available projects. Every entry must contain a
            ``"project_id"`` key.

    Returns:
        Selected project dictionary

    Raises:
        SystemExit: With code 1 when ``projects`` is empty or the answer
            does not correspond to any offered label; with code 0 when the
            prompt returns ``None`` (nothing was selected).
        KeyError: If a project entry lacks ``"project_id"``.
    """
    # Mirror select_data_type: an empty list cannot be prompted for, so stop
    # with a clear message instead of handing it to the prompt.
    if not projects:
        logger.warning("No projects available")
        raise SystemExit(1)

    # Resolve the answer by exact label rather than by project-id prefix:
    # a prefix test would pick the wrong project whenever one id is a prefix
    # of another. setdefault keeps the first project if two labels collide.
    labels = []
    project_by_label: dict[str, dict] = {}
    for project in projects:
        label = f"{project['project_id']} - {project.get('name', 'Unknown')}"
        labels.append(label)
        project_by_label.setdefault(label, project)

    answer = questionary.select(
        message="Select a TCGA project:",
        choices=labels,
    ).ask()

    if answer is None:
        logger.info("No project selected")
        raise SystemExit(0)

    selected = project_by_label.get(answer)
    if selected is None:
        logger.warning("Could not parse selection: %s", answer)
        raise SystemExit(1)

    return selected
|
|
|
|
|
|
def select_data_type(data_types: list[str]) -> str:
    """
    Prompt the user to pick one data type.

    Args:
        data_types: Data type names offered as the prompt's choices.

    Returns:
        The data type the prompt returned.

    Raises:
        SystemExit: With code 1 when ``data_types`` is empty; with code 0
            when the prompt returns ``None`` (nothing was selected).
    """
    if not data_types:
        logger.warning("No data types available")
        raise SystemExit(1)

    prompt = questionary.select(message="Select a data type:", choices=data_types)
    chosen = prompt.ask()
    if chosen is not None:
        return chosen

    logger.info("No data type selected")
    raise SystemExit(0)
|
|
|
|
|
|
def interactive_select() -> tuple[str, str]:
    """
    Drive the interactive flow: choose a project, then one of its data types.

    Prints a banner, fetches and prompts for a project, fetches and prompts
    for a data type of that project, then prints a summary of both choices.

    Returns:
        Tuple of (project_id, data_type)
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TCGA Downloader - Interactive Mode")
    print(rule + "\n")

    project = select_project(fetch_projects())
    project_id = project["project_id"]
    data_type = select_data_type(fetch_data_types(project_id))

    # Fall back to the id when the project record carries no name.
    print(f"\nSelected project: {project.get('name', project_id)}")
    print(f"Selected data type: {data_type}\n")

    return project_id, data_type
|