"""Tests for the manifest helpers exposed by ``tcga_downloader.manifest``."""

from tcga_downloader.manifest import (
    ManifestRecord,
    format_manifest_stats,
    get_manifest_stats,
    load_manifest,
    validate_files_against_manifest,
    validate_manifest,
    write_manifest,
)

# A 32-character placeholder digest used by the validation tests.
_VALID_MD5 = "a" * 32


def _record(**overrides):
    """Build a ManifestRecord from baseline field values plus *overrides*."""
    fields = {
        "file_id": "f1",
        "file_name": "a.tsv",
        "data_type": "Gene Expression",
        "data_format": "TSV",
        "size": 123,
        "md5": "abc",
    }
    fields.update(overrides)
    return ManifestRecord(**fields)


def test_manifest_roundtrip_tsv(tmp_path):
    """Write a TSV manifest and read it back.

    The expectation encoded here is that ``data_type`` and ``data_format``
    come back as ``"Unknown"`` after a TSV round trip, while the remaining
    fields are preserved.
    """
    manifest_path = tmp_path / "m.tsv"
    write_manifest([_record()], manifest_path, fmt="tsv")

    reloaded = load_manifest(manifest_path)

    assert reloaded == [_record(data_type="Unknown", data_format="Unknown")]


def test_manifest_roundtrip_json(tmp_path):
    """Write a JSON manifest and check the reloaded records equal the input."""
    originals = [_record()]
    manifest_path = tmp_path / "m.json"
    write_manifest(originals, manifest_path, fmt="json")

    reloaded = load_manifest(manifest_path)

    assert reloaded == originals


def test_get_manifest_stats():
    """Check counts, size totals and per-type/per-format tallies."""
    entries = [
        _record(size=1000),
        _record(file_id="f2", file_name="b.tsv", size=2000, md5="def"),
        _record(
            file_id="f3",
            file_name="c.tsv",
            data_type="Copy Number",
            size=3000,
            md5="ghi",
        ),
    ]

    summary = get_manifest_stats(entries)

    expected_scalars = {
        "file_count": 3,
        "total_size_bytes": 6000,
        "total_size_mb": 6000 / (1024 * 1024),
        "total_size_gb": 6000 / (1024 * 1024 * 1024),
    }
    for key, expected in expected_scalars.items():
        assert summary[key] == expected
    assert summary["data_types"] == {"Gene Expression": 2, "Copy Number": 1}
    assert summary["data_formats"] == {"TSV": 3}


def test_format_manifest_stats():
    """Check that the formatted stats text contains each expected fragment."""
    summary = get_manifest_stats([_record(size=1000)])

    text = format_manifest_stats(summary)

    expected_fragments = (
        "File count: 1",
        "Total size:",
        "Data types:",
        "Gene Expression: 1",
        "Data formats:",
        "TSV: 1",
    )
    for fragment in expected_fragments:
        assert fragment in text


def test_validate_manifest_valid():
    """A fully populated record with a 32-character MD5 yields no errors."""
    problems = validate_manifest([_record(md5=_VALID_MD5)])

    assert len(problems) == 0


def test_validate_manifest_missing_fields():
    """Four records, each with a different empty string field, give four errors."""
    entries = [
        _record(file_id="", md5=_VALID_MD5),
        _record(file_id="f2", file_name="", md5=_VALID_MD5),
        _record(file_id="f3", file_name="c.tsv", data_type="", md5=_VALID_MD5),
        _record(file_id="f4", file_name="d.tsv", data_format="", md5=_VALID_MD5),
    ]

    problems = validate_manifest(entries)

    assert len(problems) == 4


def test_validate_manifest_invalid_md5():
    """A malformed MD5 string yields exactly one error mentioning ``MD5``."""
    problems = validate_manifest([_record(md5="invalid")])

    assert len(problems) == 1
    assert "MD5" in problems[0]


def test_validate_manifest_invalid_size():
    """A negative size yields exactly one error mentioning ``size``."""
    problems = validate_manifest([_record(size=-1, md5=_VALID_MD5)])

    assert len(problems) == 1
    assert "size" in problems[0].lower()


def test_validate_files_against_manifest(tmp_path):
    """A file that is present but has the wrong digest is a checksum error.

    The manifest records the MD5 of empty input, whereas the file on disk
    contains ``"Hello"``, so the file is expected to be found (not missing)
    and reported once as a checksum mismatch.
    """
    entries = [
        _record(
            file_name="file1.txt",
            md5="d41d8cd98f00b204e9800998ecf8427e",
        ),
    ]
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    (data_dir / "file1.txt").write_text("Hello")

    missing, checksum_errors = validate_files_against_manifest(entries, data_dir)

    assert len(missing) == 0
    assert len(checksum_errors) == 1
    assert "file1.txt" in checksum_errors[0]


def test_validate_files_missing(tmp_path):
    """A manifest entry with no file on disk is reported as missing only."""
    entries = [_record(file_name="file1.txt", md5=_VALID_MD5)]
    data_dir = tmp_path / "data"
    data_dir.mkdir()

    missing, checksum_errors = validate_files_against_manifest(entries, data_dir)

    assert len(missing) == 1
    assert "file1.txt" in missing[0]
    assert len(checksum_errors) == 0