# 240 lines, 5.9 KiB, Python
from tcga_downloader.manifest import (
|
|
ManifestRecord,
|
|
format_manifest_stats,
|
|
get_manifest_stats,
|
|
load_manifest,
|
|
validate_files_against_manifest,
|
|
validate_manifest,
|
|
write_manifest,
|
|
)
|
|
|
|
|
|
def test_manifest_roundtrip_tsv(tmp_path):
    """Write a single record as a TSV manifest and load it back.

    The reloaded record is expected to match the written one on file_id,
    file_name, size and md5, while data_type and data_format are expected
    to come back as "Unknown".
    """
    written = [
        ManifestRecord(
            file_id="f1",
            file_name="a.tsv",
            data_type="Gene Expression",
            data_format="TSV",
            size=123,
            md5="abc",
        )
    ]
    manifest_path = tmp_path / "m.tsv"
    write_manifest(written, manifest_path, fmt="tsv")
    reloaded = load_manifest(manifest_path)

    # Same identity/size/checksum as the input, but with "Unknown" in the
    # two descriptive fields.
    after_roundtrip = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Unknown",
        data_format="Unknown",
        size=123,
        md5="abc",
    )
    assert reloaded == [after_roundtrip]
|
|
|
|
|
|
def test_manifest_roundtrip_json(tmp_path):
    """Write a single record as a JSON manifest and expect it back unchanged."""
    record = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="abc",
    )
    written = [record]
    manifest_path = tmp_path / "m.json"

    write_manifest(written, manifest_path, fmt="json")
    reloaded = load_manifest(manifest_path)

    # Every field is expected to be equal after the JSON round trip.
    assert reloaded == written
|
|
|
|
|
|
def test_get_manifest_stats():
    """Check the counts, size totals and per-category tallies for three records."""
    # (file_id, file_name, data_type, size, md5) -- every row uses the TSV format.
    rows = [
        ("f1", "a.tsv", "Gene Expression", 1000, "abc"),
        ("f2", "b.tsv", "Gene Expression", 2000, "def"),
        ("f3", "c.tsv", "Copy Number", 3000, "ghi"),
    ]
    records = [
        ManifestRecord(
            file_id=file_id,
            file_name=file_name,
            data_type=data_type,
            data_format="TSV",
            size=size,
            md5=md5,
        )
        for file_id, file_name, data_type, size, md5 in rows
    ]

    stats = get_manifest_stats(records)

    assert stats["file_count"] == 3
    # 1000 + 2000 + 3000 bytes in total.
    assert stats["total_size_bytes"] == 6000
    assert stats["total_size_mb"] == 6000 / (1024 * 1024)
    assert stats["total_size_gb"] == 6000 / (1024 * 1024 * 1024)
    assert stats["data_types"] == {"Gene Expression": 2, "Copy Number": 1}
    assert stats["data_formats"] == {"TSV": 3}
|
|
|
|
|
|
def test_format_manifest_stats():
    """Check that the formatted statistics text contains each expected fragment."""
    only_record = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=1000,
        md5="abc",
    )

    formatted = format_manifest_stats(get_manifest_stats([only_record]))

    # Checked in the same order as listed; the first missing fragment fails the test.
    expected_fragments = (
        "File count: 1",
        "Total size:",
        "Data types:",
        "Gene Expression: 1",
        "Data formats:",
        "TSV: 1",
    )
    for fragment in expected_fragments:
        assert fragment in formatted
|
|
|
|
|
|
def test_validate_manifest_valid():
    """A fully populated record with a 32-character md5 should yield no errors."""
    well_formed = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="a" * 32,
    )

    errors = validate_manifest([well_formed])

    error_count = len(errors)
    assert error_count == 0
|
|
|
|
|
|
def test_validate_manifest_missing_fields():
    """Four records, each with a different empty string field, should yield four errors."""
    # size and md5 are identical for every record in this test.
    shared = {"size": 123, "md5": "a" * 32}
    records = [
        # Empty file_id.
        ManifestRecord(
            file_id="",
            file_name="a.tsv",
            data_type="Gene Expression",
            data_format="TSV",
            **shared,
        ),
        # Empty file_name.
        ManifestRecord(
            file_id="f2",
            file_name="",
            data_type="Gene Expression",
            data_format="TSV",
            **shared,
        ),
        # Empty data_type.
        ManifestRecord(
            file_id="f3",
            file_name="c.tsv",
            data_type="",
            data_format="TSV",
            **shared,
        ),
        # Empty data_format.
        ManifestRecord(
            file_id="f4",
            file_name="d.tsv",
            data_type="Gene Expression",
            data_format="",
            **shared,
        ),
    ]

    errors = validate_manifest(records)

    error_count = len(errors)
    assert error_count == 4
|
|
|
|
|
|
def test_validate_manifest_invalid_md5():
    """A record whose md5 is the string "invalid" should yield one error mentioning MD5."""
    bad_checksum = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="invalid",
    )

    errors = validate_manifest([bad_checksum])

    error_count = len(errors)
    assert error_count == 1
    first_error = errors[0]
    assert "MD5" in first_error
|
|
|
|
|
|
def test_validate_manifest_invalid_size():
    """A record with size -1 should yield one error whose text mentions "size"."""
    negative_size = ManifestRecord(
        file_id="f1",
        file_name="a.tsv",
        data_type="Gene Expression",
        data_format="TSV",
        size=-1,
        md5="a" * 32,
    )

    errors = validate_manifest([negative_size])

    error_count = len(errors)
    assert error_count == 1
    # Case-insensitive match on the message text.
    lowered = errors[0].lower()
    assert "size" in lowered
|
|
|
|
|
|
def test_validate_files_against_manifest(tmp_path):
    """A file that exists but whose content differs from the manifest md5.

    Expects no missing files and exactly one checksum error that names
    file1.txt.
    """
    listed = ManifestRecord(
        file_id="f1",
        file_name="file1.txt",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        # MD5 digest of empty input; the file written below is not empty.
        md5="d41d8cd98f00b204e9800998ecf8427e",
    )
    records = [listed]

    data_dir = tmp_path / "data"
    data_dir.mkdir()

    on_disk = data_dir / "file1.txt"
    on_disk.write_text("Hello")

    missing, checksum_errors = validate_files_against_manifest(records, data_dir)

    assert len(missing) == 0
    assert len(checksum_errors) == 1
    first_checksum_error = checksum_errors[0]
    assert "file1.txt" in first_checksum_error
|
|
|
|
|
|
def test_validate_files_missing(tmp_path):
    """A manifest entry with no file in the data directory.

    Expects one missing entry that names file1.txt and no checksum errors.
    """
    listed = ManifestRecord(
        file_id="f1",
        file_name="file1.txt",
        data_type="Gene Expression",
        data_format="TSV",
        size=123,
        md5="a" * 32,
    )
    records = [listed]

    # The directory is created but deliberately left empty.
    data_dir = tmp_path / "data"
    data_dir.mkdir()

    missing, checksum_errors = validate_files_against_manifest(records, data_dir)

    assert len(missing) == 1
    first_missing = missing[0]
    assert "file1.txt" in first_missing
    assert len(checksum_errors) == 0
|