"""Helpers for building filters and querying the GDC ``/files`` endpoint."""

from __future__ import annotations

import requests

GDC_FILES_URL = "https://api.gdc.cancer.gov/files"

# Fields requested when the caller does not supply an explicit list.
_DEFAULT_FIELDS = (
    "file_id",
    "file_name",
    "data_type",
    "data_format",
    "file_size",
    "md5sum",
)


def _in_clause(field: str, value: str) -> dict:
    """Return one ``in`` filter node matching *field* against a single *value*."""
    return {"op": "in", "content": {"field": field, "value": [value]}}


def build_filters(project: str, data_type: str) -> dict:
    """Build the filter tree selecting files by project and data type.

    The result is an ``and`` node with two ``in`` children: the first
    matches ``cases.project.project_id`` against *project*, the second
    matches ``data_type`` against *data_type*. Each value is wrapped in a
    one-element list.

    Args:
        project: Project identifier to match.
        data_type: Data type name to match.

    Returns:
        A nested dict ready to be sent as the ``filters`` entry of a request.
    """
    clauses = [
        _in_clause("cases.project.project_id", project),
        _in_clause("data_type", data_type),
    ]
    return {"op": "and", "content": clauses}


def query_files(
    project: str,
    data_type: str,
    fields: list[str] | None = None,
    size: int = 1000,
) -> list[dict]:
    """POST a file query to ``GDC_FILES_URL`` and return the matching hits.

    Exactly one request is issued, with a 30-second timeout.

    NOTE(review): no paging is performed here; ``size`` is forwarded as-is,
    so matches beyond it are presumably not returned -- confirm callers
    expect that.

    Args:
        project: Project identifier passed to :func:`build_filters`.
        data_type: Data type name passed to :func:`build_filters`.
        fields: Names of the fields to request. When ``None``, a default set
            (file id, name, data type, data format, size and md5sum) is used.
        size: Value sent as the ``size`` entry of the request payload.

    Returns:
        The list found under ``data.hits`` in the JSON response, or an empty
        list when either key is absent.

    Raises:
        requests.HTTPError: If the response status indicates an error.
    """
    selected = _DEFAULT_FIELDS if fields is None else fields

    request_body = {
        "filters": build_filters(project, data_type),
        # The field names are sent as one comma-separated string.
        "fields": ",".join(selected),
        "format": "JSON",
        "size": size,
    }

    response = requests.post(GDC_FILES_URL, json=request_body, timeout=30)
    response.raise_for_status()

    body = response.json()
    return body.get("data", {}).get("hits", [])