diff --git a/tcga_downloader/query.py b/tcga_downloader/query.py
new file mode 100644
index 0000000..1c83ead
--- /dev/null
+++ b/tcga_downloader/query.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import requests
+
+GDC_FILES_URL = "https://api.gdc.cancer.gov/files"
+
+
+def build_filters(project: str, data_type: str) -> dict:
+    """Build the filter expression selecting one project and one data type.
+
+    Args:
+        project: Value matched against ``cases.project.project_id``.
+        data_type: Value matched against ``data_type``.
+
+    Returns:
+        A nested dict joining both ``in`` clauses under a single ``and``.
+    """
+    return {
+        "op": "and",
+        "content": [
+            {
+                "op": "in",
+                "content": {
+                    "field": "cases.project.project_id",
+                    "value": [project],
+                },
+            },
+            {
+                "op": "in",
+                "content": {
+                    "field": "data_type",
+                    "value": [data_type],
+                },
+            },
+        ],
+    }
+
+
+def query_files(project: str, data_type: str, fields: list[str] | None = None, size: int = 1000) -> list[dict]:
+    """POST one file query to ``GDC_FILES_URL`` and return the reported hits.
+
+    Only a single request is sent; no further pages are requested.
+
+    Args:
+        project: Project ID passed to ``build_filters``.
+        data_type: Data type passed to ``build_filters``.
+        fields: Field names to request, sent comma-joined; defaults to a
+            fixed set of file metadata fields when ``None``.
+        size: Sent as the ``size`` entry of the request payload.
+
+    Returns:
+        The list at ``data.hits`` in the JSON response, or an empty list
+        when that path is missing or null.
+
+    Raises:
+        requests.HTTPError: If ``raise_for_status`` rejects the response.
+    """
+    if fields is None:
+        fields = ["file_id", "file_name", "data_type", "data_format", "file_size", "md5sum"]
+    payload = {
+        "filters": build_filters(project, data_type),
+        "fields": ",".join(fields),
+        "format": "JSON",
+        "size": size,
+    }
+    resp = requests.post(GDC_FILES_URL, json=payload, timeout=30)
+    resp.raise_for_status()
+    # ``or`` also covers keys that are present but null, which the
+    # ``dict.get`` defaults alone would not.
+    data = resp.json() or {}
+    return (data.get("data") or {}).get("hits") or []
diff --git a/tests/test_query.py b/tests/test_query.py
new file mode 100644
index 0000000..a5a1cdc
--- /dev/null
+++ b/tests/test_query.py
@@ -0,0 +1,10 @@
+from tcga_downloader.query import build_filters
+
+
+def test_build_filters_project_and_type():
+    filters = build_filters(project="TCGA-BRCA", data_type="Gene Expression")
+    assert filters["op"] == "and"
+    assert filters["content"][0]["content"]["field"] == "cases.project.project_id"
+    assert filters["content"][0]["content"]["value"] == ["TCGA-BRCA"]
+    assert filters["content"][1]["content"]["field"] == "data_type"
+    assert filters["content"][1]["content"]["value"] == ["Gene Expression"]