Skip to content

Commit 90ac74a

Browse files
authored
Merge pull request #72 from vkt1414/feat-download-cli
feat: provide a command line option to download from cli
2 parents 13a9bb7 + 60ed607 commit 90ac74a

File tree

4 files changed

+311
-5
lines changed

4 files changed

+311
-5
lines changed

idc_index/cli.py

+257
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
"""CLI module for the IDC client.
2+
3+
This module provides command-line interface (CLI) commands to interact with the Imaging Data Commons (IDC) data.
4+
"""
5+
from __future__ import annotations
6+
7+
import logging
8+
9+
import click
10+
11+
from . import index
12+
from .index import IDCClient
13+
14+
# Set up logging for the CLI module.
# NOTE(review): `index` is imported above and also calls logging.basicConfig at
# import time; basicConfig does nothing once the root logger has handlers, so
# this call presumably has no effect on format or level - confirm.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
logger_cli = logging.getLogger("cli")
# This module's own logger starts at WARNING, so its debug/info records are
# dropped unless the level is lowered afterwards.
logger_cli.setLevel("WARNING")
18+
19+
20+
# Root click command group, exposed as the `idc` console script; the
# subcommands in this module register themselves via @main.command().
# The docstring below is what click prints as the top-level help text.
@click.group()
def main():
    """Idc is a command line client to help download data from Imaging Data Commons."""
23+
24+
25+
def set_log_level(log_level):
    """Apply the requested verbosity to the CLI logger and the ``index`` logger.

    Args:
        log_level (str): Case-insensitive level name; one of "debug", "info",
            "warning", "error" or "critical". Any other name falls back to
            WARNING.
    """
    log_levels = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }
    logging_level = log_levels.get(log_level.lower(), logging.WARNING)
    # The CLI logger is created at WARNING; unless it follows the requested
    # level too, none of this module's debug messages can ever be emitted,
    # even with --log-level debug. Set it before logging the message below.
    logger_cli.setLevel(logging_level)
    logger_cli.debug(f"Setting the log level of index.py to {logging_level}")
    index.logger.setLevel(logging_level)
42+
43+
def _split_option_values(values):
    """Flatten a repeatable, comma-separated option into a list of values.

    Args:
        values (tuple[str, ...]): Raw values collected by click for an option
            declared with ``multiple=True``; each item may itself contain
            several comma-separated entries.

    Returns:
        list[str] | None: Every entry with surrounding whitespace stripped,
        or None when the option was not supplied at all.
    """
    if not values:
        return None
    return [item.strip() for item in ",".join(values).split(",")]


@main.command()
@click.option(
    "--download-dir",
    required=True,
    type=click.Path(),
    help="Path to the directory to download the files to.",
)
@click.option(
    "--dry-run",
    type=bool,
    default=False,
    help="If set, calculates the size of the cohort but download does not start.",
)
@click.option(
    "--collection-id",
    type=str,
    multiple=True,
    default=None,
    help="Collection ID(s) to filter by.",
)
@click.option(
    "--patient-id",
    type=str,
    multiple=True,
    default=None,
    help="Patient ID(s) to filter by.",
)
@click.option(
    "--study-instance-uid",
    type=str,
    multiple=True,
    default=None,
    help="DICOM StudyInstanceUID(s) to filter by.",
)
@click.option(
    "--series-instance-uid",
    type=str,
    multiple=True,
    default=None,
    help="DICOM SeriesInstanceUID(s) to filter by.",
)
@click.option(
    "--quiet",
    type=bool,
    default=True,
    help="If set, suppresses the output of the subprocess.",
)
@click.option(
    "--show-progress-bar",
    type=bool,
    default=True,
    help="If set, tracks the progress of download.",
)
@click.option(
    "--use-s5cmd-sync",
    type=bool,
    default=False,
    help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
)
@click.option(
    "--log-level",
    type=click.Choice(
        ["debug", "info", "warning", "error", "critical"], case_sensitive=False
    ),
    default="info",
    help="Set the logging level for the CLI module.",
)
def download_from_selection(
    download_dir,
    dry_run,
    collection_id,
    patient_id,
    study_instance_uid,
    series_instance_uid,
    quiet,
    show_progress_bar,
    use_s5cmd_sync,
    log_level,
):
    """Download from a selection of collection(s), patient(s), study(studies) and series.

    The filtering will be applied in sequence by first selecting the collection(s), followed by
    patient(s), study(studies) and series. If no filtering is applied, all the files will be downloaded.
    """
    # NOTE: the docstring above is rendered by click as this command's help
    # text, so it is kept user-facing; implementation notes live in comments.
    set_log_level(log_level)
    client = IDCClient()

    # Each filter may be repeated on the command line and/or given as a
    # comma-separated list; normalise all four the same way. None means
    # "do not filter on this attribute".
    collection_id = _split_option_values(collection_id)
    patient_id = _split_option_values(patient_id)
    study_instance_uid = _split_option_values(study_instance_uid)
    series_instance_uid = _split_option_values(series_instance_uid)

    logger_cli.debug("Inputs received from cli download:")
    logger_cli.debug("download_dir: %s", download_dir)
    logger_cli.debug("collection_id: %s", collection_id)
    logger_cli.debug("patient_id: %s", patient_id)
    logger_cli.debug("study_instance_uid: %s", study_instance_uid)
    logger_cli.debug("series_instance_uid: %s", series_instance_uid)
    logger_cli.debug("dry_run: %s", dry_run)
    logger_cli.debug("quiet: %s", quiet)
    logger_cli.debug("show_progress_bar: %s", show_progress_bar)
    logger_cli.debug("use_s5cmd_sync: %s", use_s5cmd_sync)

    # Hand the normalised inputs over to the client, which performs the
    # actual selection and download.
    client.download_from_selection(
        download_dir,
        dry_run=dry_run,
        collection_id=collection_id,
        patientId=patient_id,
        studyInstanceUID=study_instance_uid,
        seriesInstanceUID=series_instance_uid,
        quiet=quiet,
        show_progress_bar=show_progress_bar,
        use_s5cmd_sync=use_s5cmd_sync,
    )
173+
174+
175+
@main.command()
@click.option(
    "--manifest-file",
    required=True,
    # Reject a missing path or a directory up front, so a typo produces a
    # clear usage error instead of a failure deep inside the client.
    type=click.Path(exists=True, dir_okay=False),
    help="The path to the manifest file.",
)
@click.option(
    "--download-dir",
    required=True,
    type=click.Path(),
    help="Path to the directory to download the files to.",
)
@click.option(
    "--quiet",
    type=bool,
    default=True,
    help="If set, suppresses the output of the subprocess.",
)
@click.option(
    "--validate-manifest",
    type=bool,
    default=True,
    help="If True, validates the manifest for any errors. Defaults to True.",
)
@click.option(
    "--show-progress-bar",
    type=bool,
    default=True,
    help="If set, tracks the progress of download.",
)
@click.option(
    "--use-s5cmd-sync",
    type=bool,
    default=False,
    help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
)
@click.option(
    "--log-level",
    type=click.Choice(
        ["debug", "info", "warning", "error", "critical"], case_sensitive=False
    ),
    default="info",
    help="Set the logging level for the CLI module.",
)
def download_from_manifest(
    manifest_file,
    download_dir,
    quiet,
    validate_manifest,
    show_progress_bar,
    use_s5cmd_sync,
    log_level,
):
    """Download the files listed in a manifest file.

    In a series of steps, the manifest file is first validated to ensure every line contains a valid URL.
    It then gets the total size to be downloaded and runs the download process on one
    process and download progress on another process.
    """
    # NOTE: the docstring above is rendered by click as this command's help
    # text, so it is kept user-facing; implementation notes live in comments.
    set_log_level(log_level)
    client = IDCClient()

    logger_cli.debug("Inputs received from cli manifest download:")
    logger_cli.debug("manifest_file_path: %s", manifest_file)
    logger_cli.debug("download_dir: %s", download_dir)
    logger_cli.debug("quiet: %s", quiet)
    logger_cli.debug("validate_manifest: %s", validate_manifest)
    logger_cli.debug("show_progress_bar: %s", show_progress_bar)
    logger_cli.debug("use_s5cmd_sync: %s", use_s5cmd_sync)

    # Delegate to the client; the CLI only parses and forwards the options.
    client.download_from_manifest(
        manifestFile=manifest_file,
        downloadDir=download_dir,
        quiet=quiet,
        validate_manifest=validate_manifest,
        show_progress_bar=show_progress_bar,
        use_s5cmd_sync=use_s5cmd_sync,
    )
254+
255+
256+
# Allow running this module directly as a script, in addition to the
# installed `idc` console script that points at main().
if __name__ == "__main__":
    main()

idc_index/index.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
from packaging.version import Version
2020
from tqdm import tqdm
2121

22-
logger = logging.getLogger(__name__)
23-
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
24-
2522
aws_endpoint_url = "https://s3.amazonaws.com"
2623
gcp_endpoint_url = "https://storage.googleapis.com"
2724

25+
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
26+
logger = logging.getLogger(__name__)
27+
2828

2929
class IDCClient:
3030
# Default download hierarchy template

pyproject.toml

+4
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ classifiers = [
3232
]
3333
dynamic = ["version"]
3434
dependencies = [
35+
"click",
3536
'duckdb>=0.10.0',
3637
"idc-index-data==18.0.1",
3738
"packaging",
@@ -60,6 +61,9 @@ docs = [
6061
"furo>=2023.08.17",
6162
]
6263

64+
[project.scripts]
65+
idc = 'idc_index.cli:main'
66+
6367
[project.urls]
6468
Homepage = "https://github.com/ImagingDataCommons/idc-index"
6569
"Bug Tracker" = "https://github.com/ImagingDataCommons/idc-index/issues"

tests/idcindex.py

+47-2
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88

99
import pandas as pd
1010
import pytest
11-
from idc_index import index
11+
from click.testing import CliRunner
12+
from idc_index import cli, index
1213

1314
# Run tests using the following command from the root of the repository:
1415
# python -m unittest -vv tests/idcindex.py
1516

16-
logging.basicConfig(level=logging.INFO)
17+
logging.basicConfig(level=logging.DEBUG)
1718

1819

1920
@pytest.fixture(autouse=True)
@@ -24,6 +25,8 @@ def _change_test_dir(request, monkeypatch):
2425
class TestIDCClient(unittest.TestCase):
2526
def setUp(self):
2627
self.client = index.IDCClient()
28+
self.download_from_manifest = cli.download_from_manifest
29+
self.download_from_selection = cli.download_from_selection
2730

2831
logger = logging.getLogger("idc_index")
2932
logger.setLevel(logging.DEBUG)
@@ -376,6 +379,48 @@ def test_citations(self):
376379
self.assertIsNotNone(citations)
377380
"""
378381

382+
def test_cli_download_from_selection(self):
    """Download one study through the CLI and check that files were written."""
    runner = CliRunner()
    with tempfile.TemporaryDirectory() as temp_dir:
        # Command-line arguments are strings; click converts "True"/"False"
        # to booleans for the type=bool options.
        result = runner.invoke(
            self.download_from_selection,
            [
                "--download-dir",
                temp_dir,
                "--dry-run",
                "False",
                "--quiet",
                "True",
                "--show-progress-bar",
                "True",
                "--use-s5cmd-sync",
                "False",
                "--study-instance-uid",
                "1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462",
            ],
        )
        # CliRunner captures exceptions instead of raising them, so a failed
        # command has to be detected through the result object.
        assert result.exit_code == 0, result.output
        assert len(os.listdir(temp_dir)) != 0
403+
404+
def test_cli_download_from_manifest(self):
    """Download a manifest through the CLI and check that files were written."""
    runner = CliRunner()
    with tempfile.TemporaryDirectory() as temp_dir:
        # Command-line arguments are strings; click converts "True"/"False"
        # to booleans for the type=bool options.
        result = runner.invoke(
            self.download_from_manifest,
            [
                "--manifest-file",
                "./study_manifest_aws.s5cmd",
                "--download-dir",
                temp_dir,
                "--quiet",
                "True",
                "--show-progress-bar",
                "True",
                "--use-s5cmd-sync",
                "False",
            ],
        )
        # CliRunner captures exceptions instead of raising them, so a failed
        # command has to be detected through the result object.
        assert result.exit_code == 0, result.output
        assert len(os.listdir(temp_dir)) != 0
423+
379424

380425
if __name__ == "__main__":
381426
unittest.main()

0 commit comments

Comments
 (0)