Skip to content

Commit afe8975

Browse files
authored
ENH: download clinical data files if clinical_index is requested (#132)
Also address #107 by installing both indices and clinical_data into user directory instead of the python package location.
1 parent a054695 commit afe8975

File tree

3 files changed

+70
-13
lines changed

3 files changed

+70
-13
lines changed

idc_index/index.py

+62-13
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88
import subprocess
99
import tempfile
1010
import time
11-
from importlib.metadata import distribution
11+
from importlib.metadata import distribution, version
1212
from pathlib import Path
1313

1414
import duckdb
1515
import idc_index_data
1616
import pandas as pd
17+
import platformdirs
1718
import psutil
1819
import requests
1920
from packaging.version import Version
@@ -86,33 +87,48 @@ def __init__(self):
8687
{"Modality": pd.Series.unique, "series_size_MB": "sum"}
8788
)
8889

89-
idc_version = f"v{Version(idc_index_data.__version__).major}"
90+
self.idc_version = f"v{Version(idc_index_data.__version__).major}"
91+
92+
# since indices can change between versions, we need to store them in a versioned directory
93+
self.indices_data_dir = platformdirs.user_data_dir(
94+
"idc_index_data", "IDC", version=version("idc-index-data")
95+
)
96+
# these are the items that are fetched from IDC release assets (e.g., clinical data files)
97+
self.idc_data_dir = platformdirs.user_data_dir(
98+
"IDC", "IDC", version=self.idc_version
99+
)
100+
self.clinical_data_dir = None
90101

91102
self.indices_overview = {
92103
"index": {
93104
"description": "Main index containing one row per DICOM series.",
94105
"installed": True,
95106
"url": None,
107+
"file_path": idc_index_data.IDC_INDEX_PARQUET_FILEPATH,
96108
},
97109
"previous_versions_index": {
98110
"description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.",
99111
"installed": True,
100112
"url": None,
113+
"file_path": idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH,
101114
},
102115
"sm_index": {
103116
"description": "DICOM Slide Microscopy series-level index.",
104117
"installed": False,
105118
"url": f"{asset_endpoint_url}/sm_index.parquet",
119+
"file_path": None,
106120
},
107121
"sm_instance_index": {
108122
"description": "DICOM Slide Microscopy instance-level index.",
109123
"installed": False,
110124
"url": f"{asset_endpoint_url}/sm_instance_index.parquet",
125+
"file_path": None,
111126
},
112127
"clinical_index": {
113128
"description": "Index of clinical data accompanying the available images.",
114129
"installed": False,
115130
"url": f"{asset_endpoint_url}/clinical_index.parquet",
131+
"file_path": None,
116132
},
117133
}
118134

@@ -275,28 +291,32 @@ def _check_create_directory(download_dir):
275291

276292
return str(download_dir.resolve())
277293

278-
def fetch_index(self, index) -> None:
294+
def fetch_index(self, index_name) -> None:
279295
"""
280296
Downloads requested index and adds this index joined with the main index as respective class attribute.
281297
282298
Args:
283299
index_name (str): Name of the index to be downloaded.
284300
"""
285301

286-
if index not in self.indices_overview:
287-
logger.error(f"Index {index} is not available and can not be fetched.")
288-
elif self.indices_overview[index]["installed"]:
302+
if index_name not in self.indices_overview:
303+
logger.error(f"Index {index_name} is not available and can not be fetched.")
304+
elif self.indices_overview[index_name]["installed"]:
289305
logger.warning(
290-
f"Index {index} already installed and will not be fetched again."
306+
f"Index {index_name} already installed and will not be fetched again."
291307
)
292308
else:
293-
response = requests.get(self.indices_overview[index]["url"], timeout=30)
309+
logger.info("Fetching index %s", index_name)
310+
response = requests.get(
311+
self.indices_overview[index_name]["url"], timeout=30
312+
)
294313
if response.status_code == 200:
295314
filepath = os.path.join(
296-
idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0],
297-
f"{index}.parquet",
315+
self.indices_data_dir,
316+
f"{index_name}.parquet",
298317
)
299318

319+
os.makedirs(os.path.dirname(filepath), exist_ok=True)
300320
with open(filepath, mode="wb") as file:
301321
file.write(response.content)
302322

@@ -305,12 +325,41 @@ def fetch_index(self, index) -> None:
305325
# self.index[["series_aws_url", "SeriesInstanceUID"]],
306326
# on="SeriesInstanceUID", how="left"
307327
# )
308-
setattr(self.__class__, index, index_table)
309-
self.indices_overview[index]["installed"] = True
328+
setattr(self.__class__, index_name, index_table)
329+
self.indices_overview[index_name]["installed"] = True
330+
self.indices_overview[index_name]["file_path"] = filepath
310331

311332
else:
312333
logger.error(
313-
f"Failed to fetch index from URL {self.indices_overview[index]['url']}: {response.status_code}"
334+
f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}"
335+
)
336+
# if clinical_index is requested, the user will likely also need the clinical data tables;
337+
# download it here, given that the size is small (<2MB as of IDC v19)
338+
if index_name == "clinical_index":
339+
logger.info(
340+
"Since clinical_index was fetched, also installing corresponding tables."
341+
)
342+
# create clinical_data folder under self.idc_data_dir, if it does not exist
343+
self.clinical_data_dir = os.path.join(self.idc_data_dir, "clinical_data")
344+
idc_clinical_data_release_url = f"s3://idc-open-metadata/bigquery_export/idc_{self.idc_version}_clinical/*"
345+
result = subprocess.run(
346+
[
347+
self.s5cmdPath,
348+
"--no-sign-request",
349+
"cp",
350+
idc_clinical_data_release_url,
351+
self.clinical_data_dir,
352+
],
353+
capture_output=True,
354+
text=True,
355+
check=True,
356+
)
357+
if result.stderr and result.stdout.startswith("ERROR"):
358+
logger.error("Failed to download IDC clinical data.")
359+
else:
360+
logger.info(
361+
"IDC clinical data downloaded successfully to %s",
362+
self.clinical_data_dir,
314363
)
315364

316365
def get_collections(self):

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ dependencies = [
3737
"idc-index-data==19.0.2",
3838
"packaging",
3939
"pandas<2.2",
40+
"platformdirs",
4041
"psutil",
4142
"pyarrow",
4243
"requests",

tests/idcindex.py

+7
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,13 @@ def test_indices_urls(self):
556556
if i.indices_overview[index]["url"] is not None:
557557
assert remote_file_exists(i.indices_overview[index]["url"])
558558

559+
def test_clinical_index_install(self):
560+
i = IDCClient()
561+
assert i.indices_overview["clinical_index"]["installed"] is False
562+
i.fetch_index("clinical_index")
563+
assert i.indices_overview["clinical_index"]["installed"] is True
564+
assert len(os.listdir(i.clinical_data_dir)) > 0
565+
559566

560567
if __name__ == "__main__":
561568
unittest.main()

0 commit comments

Comments
 (0)