tcga-downloader/tcga_downloader/manifest.py
yunpeng.zhang a01a59b371
Some checks failed
CI / Lint (push) Failing after 9m32s
CI / Test (3.11) (push) Successful in 6m41s
CI / Test (3.12) (push) Successful in 4m21s
feat: add interactive cli
2026-02-09 13:13:39 +08:00

193 lines
6.1 KiB
Python

from __future__ import annotations
import csv
import json
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
pass
# Names of the ManifestRecord fields, listed in the same order the dataclass
# declares them.
# NOTE(review): no function visible in this file reads this constant —
# presumably it is consumed by other modules; confirm before changing it.
REQUIRED_FIELDS = ["file_id", "file_name", "data_type", "data_format", "size", "md5"]
@dataclass(frozen=True)
class ManifestRecord:
    """Immutable description of a single file listed in a manifest.

    Instances are frozen: assigning to a field after construction raises.
    The class intentionally keeps a per-instance ``__dict__`` (no slots),
    because the JSON writer in this module serializes records through it.
    """

    # Identifier of the file; stored in the ``id`` column of gdc-client
    # style TSV manifests and in ``file_id`` in the older layout.
    file_id: str
    # Name of the file; also used as the path relative to the download
    # directory when files are checked against the manifest.
    file_name: str
    # Free-form category label; loaders fall back to "Unknown" when the
    # manifest has no such column.
    data_type: str
    # Free-form format label; same "Unknown" fallback as ``data_type``.
    data_format: str
    # File size in bytes; validation requires it to be non-negative.
    size: int
    # MD5 checksum as text; validation expects exactly 32 characters.
    md5: str
def _validate_record(rec: ManifestRecord) -> None:
    """Reject a record that cannot be written to a manifest.

    Args:
        rec: The record to check.

    Raises:
        ValueError: If ``file_id`` or ``file_name`` is empty, or if ``size``
            is negative.
    """
    has_identity = rec.file_id and rec.file_name
    if not has_identity:
        raise ValueError("file_id and file_name are required")

    size_is_valid = rec.size >= 0
    if not size_is_valid:
        raise ValueError("size must be non-negative")
def write_manifest(records: Iterable[ManifestRecord], path: Path, fmt: str = "tsv") -> None:
    """Serialize manifest records to ``path`` as TSV or JSON.

    Every record is validated before the output file is opened, so an
    invalid record never leaves a partially written manifest behind.

    Args:
        records: Records to write; consumed once and materialized in memory.
        path: Destination file; anything accepted by ``Path`` works.
        fmt: ``"tsv"`` writes the five gdc-client columns (``id``,
            ``filename``, ``md5``, ``size``, ``state``); ``"json"`` writes a
            list with one object per record holding all record fields.

    Raises:
        ValueError: If ``fmt`` is not ``"tsv"`` or ``"json"``, or if a record
            fails validation.
    """
    path = Path(path)
    if fmt not in {"tsv", "json"}:
        raise ValueError("fmt must be 'tsv' or 'json'")

    validated = list(records)
    for item in validated:
        _validate_record(item)

    if fmt == "json":
        payload = json.dumps([vars(item) for item in validated], indent=2)
        path.write_text(payload)
        return

    columns = ["id", "filename", "md5", "size", "state"]
    with path.open("w", newline="") as handle:
        tsv = csv.DictWriter(handle, fieldnames=columns, delimiter="\t")
        tsv.writeheader()
        tsv.writerows(
            {
                "id": item.file_id,
                "filename": item.file_name,
                "md5": item.md5,
                "size": item.size,
                # Default state for gdc-client manifest
                "state": "live",
            }
            for item in validated
        )
def load_manifest(path: Path) -> list[ManifestRecord]:
    """Load manifest records from a JSON or tab-separated file.

    A path whose suffix is ``.json`` (case-insensitive) is parsed as a JSON
    list of objects, each passed unchanged to ``ManifestRecord``. Any other
    path is parsed as TSV with a header row. Two TSV layouts are accepted:
    the gdc-client layout (``id`` / ``filename`` columns) and the older
    layout (``file_id`` / ``file_name`` columns); when both pairs are
    present the gdc-client pair wins. ``data_type`` and ``data_format`` fall
    back to ``"Unknown"`` when the manifest has no such column.

    Args:
        path: Manifest location; anything accepted by ``Path`` works.

    Returns:
        The records in file order. A TSV file with no header line yields an
        empty list.

    Raises:
        ValueError: If the TSV header lacks the identifier columns, ``size``
            or ``md5``, or if a row's ``size`` is not an integer.
    """
    path = Path(path)
    if path.suffix.lower() == ".json":
        data = json.loads(path.read_text())
        return [ManifestRecord(**row) for row in data]

    with path.open("r", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        header = reader.fieldnames
        if not header:
            # No header line at all: nothing to validate and nothing to load.
            return []

        # The layout is a property of the header, so resolve it once instead
        # of re-detecting it on every row. This also rejects a bad header
        # even when the file contains no data rows.
        columns = set(header)
        if {"id", "filename"} <= columns:
            id_key, name_key = "id", "filename"
        elif {"file_id", "file_name"} <= columns:
            id_key, name_key = "file_id", "file_name"
        else:
            raise ValueError(
                "Invalid manifest: missing required columns "
                "(need 'id'/'filename' or 'file_id'/'file_name')"
            )

        absent = [name for name in ("size", "md5") if name not in columns]
        if absent:
            # Without this check the row lookup below would leak a bare
            # KeyError instead of the manifest-level ValueError.
            raise ValueError(
                f"Invalid manifest: missing required columns: {', '.join(absent)}"
            )

        records: list[ManifestRecord] = []
        for row in reader:
            raw_size = row["size"]
            try:
                size = int(raw_size)
            except (TypeError, ValueError) as err:
                # TypeError covers short rows, where DictReader fills the
                # missing trailing fields with None.
                raise ValueError(
                    f"Invalid manifest: line {reader.line_num}: "
                    f"invalid size {raw_size!r}"
                ) from err
            records.append(
                ManifestRecord(
                    file_id=row[id_key],
                    file_name=row[name_key],
                    data_type=row.get("data_type", "Unknown"),
                    data_format=row.get("data_format", "Unknown"),
                    size=size,
                    md5=row["md5"],
                )
            )
    return records
def get_manifest_stats(
    records: Iterable[ManifestRecord],
) -> dict[str, int | float | dict[str, int]]:
    """Summarize a collection of manifest records.

    Args:
        records: Records to summarize; consumed once and materialized.

    Returns:
        A dict with the keys ``file_count``, ``total_size_bytes``,
        ``total_size_mb`` and ``total_size_gb`` (bytes divided by 1024**2
        and 1024**3 respectively), plus ``data_types`` and ``data_formats``,
        each mapping a label to the number of records carrying it.
    """
    items = list(records)
    bytes_per_mb = 1024 * 1024
    bytes_per_gb = 1024 * 1024 * 1024

    byte_total = sum(item.size for item in items)

    type_counts: dict[str, int] = {}
    format_counts: dict[str, int] = {}
    for item in items:
        for tally, label in (
            (type_counts, item.data_type),
            (format_counts, item.data_format),
        ):
            tally[label] = tally.get(label, 0) + 1

    summary: dict[str, int | float | dict[str, int]] = {}
    summary["file_count"] = len(items)
    summary["total_size_bytes"] = byte_total
    summary["total_size_mb"] = byte_total / bytes_per_mb
    summary["total_size_gb"] = byte_total / bytes_per_gb
    summary["data_types"] = type_counts
    summary["data_formats"] = format_counts
    return summary
def format_manifest_stats(stats: dict[str, int | float | dict[str, int]]) -> str:
    """Render manifest statistics as a multi-line, human-readable string.

    Args:
        stats: Mapping that must contain ``file_count``, ``total_size_gb``
            and ``total_size_mb``. The optional ``data_types`` and
            ``data_formats`` entries are rendered only when they are
            non-empty dicts.

    Returns:
        The summary lines joined with newlines. Each breakdown section is
        preceded by a blank line and lists its entries sorted by label.
    """
    report = [
        f"File count: {stats['file_count']}",
        f"Total size: {stats['total_size_gb']:.2f} GB ({stats['total_size_mb']:.2f} MB)",
    ]

    sections = (
        ("data_types", "\nData types:"),
        ("data_formats", "\nData formats:"),
    )
    for key, heading in sections:
        breakdown = stats.get(key)
        if not (breakdown and isinstance(breakdown, dict)):
            continue
        report.append(heading)
        report.extend(f" - {label}: {count}" for label, count in sorted(breakdown.items()))

    return "\n".join(report)
def validate_manifest(records: list[ManifestRecord]) -> list[str]:
    """Check every record and collect human-readable problems.

    Unlike a raising validator, this walks the whole list and reports every
    problem it finds, so one pass shows all issues in a manifest.

    Args:
        records: Records to check, reported with 1-based positions.

    Returns:
        One message per problem, in record order; empty when all records
        pass. A record may contribute several messages.
    """
    required_text_fields = ("file_id", "file_name", "data_type", "data_format")
    problems = []
    for position, record in enumerate(records, start=1):
        for field_name in required_text_fields:
            if not getattr(record, field_name):
                problems.append(f"Record {position}: Missing {field_name}")

        if record.size < 0:
            problems.append(f"Record {position}: Invalid size (must be non-negative)")

        md5_looks_valid = record.md5 and len(record.md5) == 32
        if not md5_looks_valid:
            problems.append(f"Record {position}: Invalid MD5 checksum (must be 32 characters)")
    return problems
def validate_files_against_manifest(
    records: list[ManifestRecord], data_dir: Path
) -> tuple[list[str], list[str]]:
    """Validate that downloaded files match manifest records.

    Each record is looked up as ``data_dir / record.file_name``. Records
    with an empty ``md5`` are only checked for presence.

    Args:
        records: Manifest records describing the expected files.
        data_dir: Directory holding the downloaded files; anything accepted
            by ``Path`` works.

    Returns:
        Tuple of (missing_files, checksum_errors). ``missing_files`` holds
        the file names with no regular file at the expected path;
        ``checksum_errors`` holds one message per file whose MD5 differs
        from the manifest.
    """
    # Coerce like the other path-taking functions in this module do, so a
    # plain string directory does not fail on the "/" operator below.
    data_dir = Path(data_dir)
    missing_files: list[str] = []
    checksum_errors: list[str] = []
    for rec in records:
        file_path = data_dir / rec.file_name
        # is_file() rather than exists(): a directory at the expected path
        # is not the downloaded file and cannot be opened for hashing.
        if not file_path.is_file():
            missing_files.append(rec.file_name)
            continue
        if not rec.md5:
            continue
        actual_md5 = _calculate_md5(file_path)
        # Hex digests are case-insensitive; normalize both sides so an
        # upper-case or padded manifest value is not a false mismatch.
        if actual_md5.lower() != rec.md5.strip().lower():
            checksum_errors.append(f"{rec.file_name}: Expected {rec.md5}, got {actual_md5}")
    return missing_files, checksum_errors
def _calculate_md5(file_path: Path) -> str:
    """Return the lowercase hexadecimal MD5 digest of a file's contents.

    The file is streamed in fixed-size chunks so large files are never held
    in memory at once.

    Args:
        file_path: File to hash; opened in binary mode.

    Returns:
        The 32-character hex digest.
    """
    import hashlib

    # 1 MiB reads need far fewer read/update calls than 4 KiB reads on
    # large files, while memory use stays small and bounded.
    chunk_size = 1024 * 1024
    # MD5 is used here purely as an integrity checksum; flagging that lets
    # the call succeed on builds that block MD5 for security purposes.
    digest = hashlib.md5(usedforsecurity=False)
    with file_path.open("rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()