8
8
import subprocess
9
9
import tempfile
10
10
import time
11
- from importlib .metadata import distribution
11
+ from importlib .metadata import distribution , version
12
12
from pathlib import Path
13
13
14
14
import duckdb
15
15
import idc_index_data
16
16
import pandas as pd
17
+ import platformdirs
17
18
import psutil
18
19
import requests
19
20
from packaging .version import Version
@@ -86,33 +87,48 @@ def __init__(self):
86
87
{"Modality" : pd .Series .unique , "series_size_MB" : "sum" }
87
88
)
88
89
89
- idc_version = f"v{ Version (idc_index_data .__version__ ).major } "
90
+ self .idc_version = f"v{ Version (idc_index_data .__version__ ).major } "
91
+
92
+ # since indices can change between versions, we need to store them in a versioned directory
93
+ self .indices_data_dir = platformdirs .user_data_dir (
94
+ "idc_index_data" , "IDC" , version = version ("idc-index-data" )
95
+ )
96
+ # these are the items that are fetched from IDC release assets (e.g., clinical data files)
97
+ self .idc_data_dir = platformdirs .user_data_dir (
98
+ "IDC" , "IDC" , version = self .idc_version
99
+ )
100
+ self .clinical_data_dir = None
90
101
91
102
self .indices_overview = {
92
103
"index" : {
93
104
"description" : "Main index containing one row per DICOM series." ,
94
105
"installed" : True ,
95
106
"url" : None ,
107
+ "file_path" : idc_index_data .IDC_INDEX_PARQUET_FILEPATH ,
96
108
},
97
109
"previous_versions_index" : {
98
110
"description" : "index containing one row per DICOM series from all previous IDC versions that are not in current version." ,
99
111
"installed" : True ,
100
112
"url" : None ,
113
+ "file_path" : idc_index_data .PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH ,
101
114
},
102
115
"sm_index" : {
103
116
"description" : "DICOM Slide Microscopy series-level index." ,
104
117
"installed" : False ,
105
118
"url" : f"{ asset_endpoint_url } /sm_index.parquet" ,
119
+ "file_path" : None ,
106
120
},
107
121
"sm_instance_index" : {
108
122
"description" : "DICOM Slide Microscopy instance-level index." ,
109
123
"installed" : False ,
110
124
"url" : f"{ asset_endpoint_url } /sm_instance_index.parquet" ,
125
+ "file_path" : None ,
111
126
},
112
127
"clinical_index" : {
113
128
"description" : "Index of clinical data accompanying the available images." ,
114
129
"installed" : False ,
115
130
"url" : f"{ asset_endpoint_url } /clinical_index.parquet" ,
131
+ "file_path" : None ,
116
132
},
117
133
}
118
134
@@ -275,28 +291,32 @@ def _check_create_directory(download_dir):
275
291
276
292
return str (download_dir .resolve ())
277
293
278
def fetch_index(self, index_name) -> None:
    """
    Downloads requested index and adds this index joined with the main index as respective class attribute.

    The parquet file is stored under ``self.indices_data_dir`` (a versioned,
    per-user data directory) and loaded into a ``pandas.DataFrame`` that is
    attached to the class under the index's name. When ``clinical_index`` is
    successfully fetched, the accompanying clinical data tables are also
    downloaded from the IDC release bucket into ``self.clinical_data_dir``.

    Args:
        index_name (str): Name of the index to be downloaded. Must be one of
            the keys of ``self.indices_overview``.
    """

    if index_name not in self.indices_overview:
        logger.error(f"Index {index_name} is not available and can not be fetched.")
    elif self.indices_overview[index_name]["installed"]:
        logger.warning(
            f"Index {index_name} already installed and will not be fetched again."
        )
    else:
        logger.info("Fetching index %s", index_name)
        response = requests.get(
            self.indices_overview[index_name]["url"], timeout=30
        )
        if response.status_code == 200:
            filepath = os.path.join(
                self.indices_data_dir,
                f"{index_name}.parquet",
            )

            # the versioned data dir may not exist yet on a fresh install
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, mode="wb") as file:
                file.write(response.content)

            index_table = pd.read_parquet(filepath)
            # index_table = index_table.merge(
            #     self.index[["series_aws_url", "SeriesInstanceUID"]],
            #     on="SeriesInstanceUID", how="left"
            # )
            # attach on the class so all instances see the fetched index
            setattr(self.__class__, index_name, index_table)
            self.indices_overview[index_name]["installed"] = True
            self.indices_overview[index_name]["file_path"] = filepath

            # if clinical_index is requested, likely the user will need clinical data
            # download it here, given that the size is small (<2MB as of IDC v19).
            # This is done only after a successful index fetch so that the
            # clinical tables are never installed for a failed/missing index.
            if index_name == "clinical_index":
                logger.info(
                    "Since clinical_index was fetched, also installing corresponding tables."
                )
                # create clinical_data folder under self.idc_data_dir, if it does not exist
                self.clinical_data_dir = os.path.join(
                    self.idc_data_dir, "clinical_data"
                )
                idc_clinical_data_release_url = f"s3://idc-open-metadata/bigquery_export/idc_{self.idc_version}_clinical/*"
                try:
                    result = subprocess.run(
                        [
                            self.s5cmdPath,
                            "--no-sign-request",
                            "cp",
                            idc_clinical_data_release_url,
                            self.clinical_data_dir,
                        ],
                        capture_output=True,
                        text=True,
                        check=True,
                    )
                except subprocess.CalledProcessError as exc:
                    # check=True raises on non-zero s5cmd exit; log instead of
                    # letting the exception escape to the caller.
                    logger.error(
                        "Failed to download IDC clinical data: %s", exc.stderr
                    )
                else:
                    # NOTE(review): condition mixes stderr presence with stdout
                    # content — mirrors s5cmd checks elsewhere in this package;
                    # confirm which stream s5cmd reports errors on.
                    if result.stderr and result.stdout.startswith("ERROR"):
                        logger.error("Failed to download IDC clinical data.")
                    else:
                        logger.info(
                            "IDC clinical data downloaded successfully to %s",
                            self.clinical_data_dir,
                        )
        else:
            logger.error(
                f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}"
            )
315
364
316
365
def get_collections (self ):
0 commit comments