Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: download clinical data files if clinical_index is requested #132

Merged
merged 1 commit into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 62 additions & 13 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import subprocess
import tempfile
import time
from importlib.metadata import distribution
from importlib.metadata import distribution, version
from pathlib import Path

import duckdb
import idc_index_data
import pandas as pd
import platformdirs
import psutil
import requests
from packaging.version import Version
Expand Down Expand Up @@ -86,33 +87,48 @@ def __init__(self):
{"Modality": pd.Series.unique, "series_size_MB": "sum"}
)

idc_version = f"v{Version(idc_index_data.__version__).major}"
self.idc_version = f"v{Version(idc_index_data.__version__).major}"

# since indices can change between versions, we need to store them in a versioned directory
self.indices_data_dir = platformdirs.user_data_dir(
"idc_index_data", "IDC", version=version("idc-index-data")
)
# these are the items that are fetched from IDC release assets (e.g., clinical data files)
self.idc_data_dir = platformdirs.user_data_dir(
"IDC", "IDC", version=self.idc_version
)
self.clinical_data_dir = None

self.indices_overview = {
"index": {
"description": "Main index containing one row per DICOM series.",
"installed": True,
"url": None,
"file_path": idc_index_data.IDC_INDEX_PARQUET_FILEPATH,
},
"previous_versions_index": {
"description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.",
"installed": True,
"url": None,
"file_path": idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH,
},
"sm_index": {
"description": "DICOM Slide Microscopy series-level index.",
"installed": False,
"url": f"{asset_endpoint_url}/sm_index.parquet",
"file_path": None,
},
"sm_instance_index": {
"description": "DICOM Slide Microscopy instance-level index.",
"installed": False,
"url": f"{asset_endpoint_url}/sm_instance_index.parquet",
"file_path": None,
},
"clinical_index": {
"description": "Index of clinical data accompanying the available images.",
"installed": False,
"url": f"{asset_endpoint_url}/clinical_index.parquet",
"file_path": None,
},
}

Expand Down Expand Up @@ -275,28 +291,32 @@ def _check_create_directory(download_dir):

return str(download_dir.resolve())

def fetch_index(self, index) -> None:
def fetch_index(self, index_name) -> None:
"""
Downloads requested index and adds this index joined with the main index as respective class attribute.

Args:
index_name (str): Name of the index to be downloaded.
"""

if index not in self.indices_overview:
logger.error(f"Index {index} is not available and can not be fetched.")
elif self.indices_overview[index]["installed"]:
if index_name not in self.indices_overview:
logger.error(f"Index {index_name} is not available and can not be fetched.")
elif self.indices_overview[index_name]["installed"]:
logger.warning(
f"Index {index} already installed and will not be fetched again."
f"Index {index_name} already installed and will not be fetched again."
)
else:
response = requests.get(self.indices_overview[index]["url"], timeout=30)
logger.info("Fetching index %s", index_name)
response = requests.get(
self.indices_overview[index_name]["url"], timeout=30
)
if response.status_code == 200:
filepath = os.path.join(
idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0],
f"{index}.parquet",
self.indices_data_dir,
f"{index_name}.parquet",
)

os.makedirs(os.path.dirname(filepath), exist_ok=True)
with open(filepath, mode="wb") as file:
file.write(response.content)

Expand All @@ -305,12 +325,41 @@ def fetch_index(self, index) -> None:
# self.index[["series_aws_url", "SeriesInstanceUID"]],
# on="SeriesInstanceUID", how="left"
# )
setattr(self.__class__, index, index_table)
self.indices_overview[index]["installed"] = True
setattr(self.__class__, index_name, index_table)
self.indices_overview[index_name]["installed"] = True
self.indices_overview[index_name]["file_path"] = filepath

else:
logger.error(
f"Failed to fetch index from URL {self.indices_overview[index]['url']}: {response.status_code}"
f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}"
)
# If clinical_index is requested, the user will most likely need the clinical data as well.
# Download it here, given that its size is small (<2MB as of IDC v19).
if index_name == "clinical_index":
logger.info(
"Since clinical_index was fetched, also installing corresponding tables."
)
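# clinical data goes into the clinical_data folder under self.idc_data_dir
# NOTE(review): the folder is not created explicitly here; presumably the s5cmd copy below creates it - confirm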
self.clinical_data_dir = os.path.join(self.idc_data_dir, "clinical_data")
idc_clinical_data_release_url = f"s3://idc-open-metadata/bigquery_export/idc_{self.idc_version}_clinical/*"
result = subprocess.run(
[
self.s5cmdPath,
"--no-sign-request",
"cp",
idc_clinical_data_release_url,
self.clinical_data_dir,
],
capture_output=True,
text=True,
check=True,
)
if result.stderr and result.stdout.startswith("ERROR"):
logger.error("Failed to download IDC clinical data.")
else:
logger.info(
"IDC clinical data downloaded successfully to %s",
self.clinical_data_dir,
)

def get_collections(self):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies = [
"idc-index-data==19.0.2",
"packaging",
"pandas<2.2",
"platformdirs",
"psutil",
"pyarrow",
"requests",
Expand Down
7 changes: 7 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,13 @@ def test_indices_urls(self):
if i.indices_overview[index]["url"] is not None:
assert remote_file_exists(i.indices_overview[index]["url"])

def test_clinical_index_install(self):
    """Check that fetching ``clinical_index`` marks it installed and fills the clinical data directory."""
    client = IDCClient()

    # A freshly constructed client must report the clinical index as not installed.
    installed_before = client.indices_overview["clinical_index"]["installed"]
    assert installed_before is False

    client.fetch_index("clinical_index")

    # After the fetch the flag flips and the clinical data directory is non-empty.
    installed_after = client.indices_overview["clinical_index"]["installed"]
    assert installed_after is True
    downloaded_entries = os.listdir(client.clinical_data_dir)
    assert len(downloaded_entries) > 0


if __name__ == "__main__":
unittest.main()
Loading