tcga-downloader/tests/test_manifest.py
yunpeng.zhang a01a59b371
Some checks failed
CI / Lint (push) Failing after 9m32s
CI / Test (3.11) (push) Successful in 6m41s
CI / Test (3.12) (push) Successful in 4m21s
feat: add interactive cli
2026-02-09 13:13:39 +08:00

240 lines
5.9 KiB
Python

from tcga_downloader.manifest import (
ManifestRecord,
format_manifest_stats,
get_manifest_stats,
load_manifest,
validate_files_against_manifest,
validate_manifest,
write_manifest,
)
def test_manifest_roundtrip_tsv(tmp_path):
    """Write a single record as a TSV manifest and load it back.

    The reloaded record is expected to keep ``file_id``, ``file_name``,
    ``size`` and ``md5`` as written, while ``data_type`` and ``data_format``
    are expected to come back as ``"Unknown"``.
    """
    written = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="abc",
    )
    manifest_file = tmp_path / "m.tsv"
    write_manifest([written], manifest_file, fmt="tsv")

    # NOTE(review): presumably the TSV layout does not carry the two
    # descriptive fields, hence "Unknown" -- confirm against write_manifest.
    reloaded = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Unknown",
        data_format="Unknown",
        size=123,
        md5="abc",
    )
    assert load_manifest(manifest_file) == [reloaded]
def test_manifest_roundtrip_json(tmp_path):
    """Write a single record as a JSON manifest and expect it back unchanged."""
    manifest_file = tmp_path / "m.json"
    written = [
        ManifestRecord(
            file_id="f1",
            file_name="a.tsv",
            data_type="Gene Expression",
            data_format="TSV",
            size=123,
            md5="abc",
        )
    ]

    write_manifest(written, manifest_file, fmt="json")

    # Unlike the TSV test, every field is expected to survive the round trip.
    assert load_manifest(manifest_file) == written
def test_get_manifest_stats():
    """Check the aggregate statistics computed for three TSV records.

    Two records are "Gene Expression" and one is "Copy Number"; their sizes
    (1000, 2000, 3000) add up to 6000 bytes.
    """
    # (file_id, file_name, data_type, size, md5) -- all share data_format "TSV".
    rows = [
        ("f1", "a.tsv", "Gene Expression", 1000, "abc"),
        ("f2", "b.tsv", "Gene Expression", 2000, "def"),
        ("f3", "c.tsv", "Copy Number", 3000, "ghi"),
    ]
    records = [
        ManifestRecord(
            file_id=fid,
            file_name=name,
            data_type=kind,
            data_format="TSV",
            size=size,
            md5=digest,
        )
        for fid, name, kind, size, digest in rows
    ]

    stats = get_manifest_stats(records)

    total_bytes = 6000
    bytes_per_mb = 1024 * 1024
    bytes_per_gb = 1024 * 1024 * 1024

    assert stats["file_count"] == 3
    assert stats["total_size_bytes"] == total_bytes
    assert stats["total_size_mb"] == total_bytes / bytes_per_mb
    assert stats["total_size_gb"] == total_bytes / bytes_per_gb
    assert stats["data_types"] == {"Gene Expression": 2, "Copy Number": 1}
    assert stats["data_formats"] == {"TSV": 3}
def test_format_manifest_stats():
    """Check that the formatted statistics text contains the expected labels."""
    single_record = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=1000,
        md5="abc",
    )
    formatted = format_manifest_stats(get_manifest_stats([single_record]))

    # Each fragment must appear somewhere in the rendered text; their
    # relative position is not checked.
    expected_fragments = (
        "File count: 1",
        "Total size:",
        "Data types:",
        "Gene Expression: 1",
        "Data formats:",
        "TSV: 1",
    )
    for fragment in expected_fragments:
        assert fragment in formatted
def test_validate_manifest_valid():
    """A fully populated record with a 32-character MD5 yields no errors."""
    well_formed = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="a" * 32,
    )

    problems = validate_manifest([well_formed])

    assert len(problems) == 0
def test_validate_manifest_missing_fields():
    """Four records, each with one empty string field, yield four errors.

    The blanked field differs per record: ``file_id``, ``file_name``,
    ``data_type`` and ``data_format`` respectively.
    """
    # Size and checksum are identical for every record, so that only the
    # blanked text field differs between them.
    shared = {"size": 123, "md5": "a" * 32}
    records = [
        ManifestRecord(
            file_id="",
            file_name="a.tsv",
            data_type="Gene Expression",
            data_format="TSV",
            **shared,
        ),
        ManifestRecord(
            file_id="f2",
            file_name="",
            data_type="Gene Expression",
            data_format="TSV",
            **shared,
        ),
        ManifestRecord(
            file_id="f3",
            file_name="c.tsv",
            data_type="",
            data_format="TSV",
            **shared,
        ),
        ManifestRecord(
            file_id="f4",
            file_name="d.tsv",
            data_type="Gene Expression",
            data_format="",
            **shared,
        ),
    ]

    problems = validate_manifest(records)

    assert len(problems) == 4
def test_validate_manifest_invalid_md5():
    """A record whose md5 is the string "invalid" yields one MD5 error."""
    bad_checksum = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="invalid",
    )

    problems = validate_manifest([bad_checksum])

    assert len(problems) == 1
    # The message must mention "MD5" in exactly this capitalisation.
    message = problems[0]
    assert "MD5" in message
def test_validate_manifest_invalid_size():
    """A record with a negative size yields one error mentioning "size"."""
    negative_size = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=-1,
        md5="a" * 32,
    )

    problems = validate_manifest([negative_size])

    assert len(problems) == 1
    # The match is case-insensitive: the message is lower-cased first.
    message = problems[0].lower()
    assert "size" in message
def test_validate_files_against_manifest(tmp_path):
    """A present file whose content does not match the manifest is reported.

    The file exists, so nothing is reported missing, but exactly one checksum
    error naming the file is expected.
    """
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    (data_dir / "file1.txt").write_text("Hello")

    # The recorded digest is the MD5 of empty input, so it cannot match the
    # "Hello" content written above.
    listed = ManifestRecord(
        file_id="f1",
        file_name="file1.txt",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="d41d8cd98f00b204e9800998ecf8427e",
    )

    missing, checksum_errors = validate_files_against_manifest([listed], data_dir)

    assert len(missing) == 0
    assert len(checksum_errors) == 1
    first_error = checksum_errors[0]
    assert "file1.txt" in first_error
def test_validate_files_missing(tmp_path):
    """A manifest entry with no file on disk is reported as missing only.

    The data directory is created but left empty, so one missing entry naming
    the file is expected and no checksum errors.
    """
    data_dir = tmp_path / "data"
    data_dir.mkdir()

    absent = ManifestRecord(
        file_id="f1",
        file_name="file1.txt",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="a" * 32,
    )

    missing, checksum_errors = validate_files_against_manifest([absent], data_dir)

    assert len(missing) == 1
    first_missing = missing[0]
    assert "file1.txt" in first_missing
    assert len(checksum_errors) == 0