feat: add GDC query via REST

This commit is contained in:
yunpeng.zhang 2026-01-16 14:48:12 +08:00
parent e6b2a174c5
commit 7e8497decb
2 changed files with 49 additions and 0 deletions

42
tcga_downloader/query.py Normal file
View File

@ -0,0 +1,42 @@
from __future__ import annotations
import requests
# Endpoint that query_files() POSTs its JSON search payload to.
GDC_FILES_URL = "https://api.gdc.cancer.gov/files"
def build_filters(project: str, data_type: str) -> dict:
    """Build the nested filter expression for a project / data-type query.

    The result is an ``"and"`` node whose ``content`` holds two ``"in"``
    clauses, in this order: one matching ``cases.project.project_id``
    against ``[project]`` and one matching ``data_type`` against
    ``[data_type]``.

    Args:
        project: Project identifier to match (wrapped in a one-element list).
        data_type: Data type to match (wrapped in a one-element list).

    Returns:
        A plain ``dict`` describing the combined filter.
    """
    # (field name, single accepted value) pairs, in the order they are emitted.
    criteria = (
        ("cases.project.project_id", project),
        ("data_type", data_type),
    )
    clauses = [
        {"op": "in", "content": {"field": field_name, "value": [wanted]}}
        for field_name, wanted in criteria
    ]
    return {"op": "and", "content": clauses}
def query_files(project: str, data_type: str, fields: list[str] | None = None, size: int = 1000) -> list[dict]:
    """POST a file search to ``GDC_FILES_URL`` and return the matching hits.

    A single request is sent; no pagination is performed here, so the
    result is whatever the service returns for the requested ``size``.

    Args:
        project: Project identifier passed to ``build_filters``.
        data_type: Data type passed to ``build_filters``.
        fields: Field names to request. When ``None``, a default set of
            file id, name, data type, data format, size and md5sum is used.
        size: Value sent as the ``size`` entry of the request payload.

    Returns:
        The list found under ``data`` -> ``hits`` in the JSON response, or
        an empty list when either key is absent.

    Raises:
        requests.HTTPError: If the response status indicates an error
            (via ``raise_for_status``).
    """
    selected = (
        ["file_id", "file_name", "data_type", "data_format", "file_size", "md5sum"]
        if fields is None
        else fields
    )
    request_body = {
        "filters": build_filters(project, data_type),
        # The field list is sent as one comma-separated string.
        "fields": ",".join(selected),
        "format": "JSON",
        "size": size,
    }

    response = requests.post(GDC_FILES_URL, json=request_body, timeout=30)
    response.raise_for_status()

    envelope = response.json().get("data", {})
    return envelope.get("hits", [])

7
tests/test_query.py Normal file
View File

@ -0,0 +1,7 @@
from tcga_downloader.query import build_filters
def test_build_filters_project_and_type():
    """Check the top-level operator and the field of the first clause."""
    result = build_filters(project="TCGA-BRCA", data_type="Gene Expression")

    # The two criteria are combined under a single "and" node.
    assert result["op"] == "and"

    # The first clause is the project constraint.
    project_clause = result["content"][0]
    assert project_clause["content"]["field"] == "cases.project.project_id"