Skip to content

Commit 19b42fe

Browse files
authored
BUG: catch another situation where invalid manifest lines were not handled (#140)
* BUG: fix errors handling invalid manifest file lines
1 parent 8b9d192 commit 19b42fe

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

idc_index/index.py

+8 -1
Original file line number | Diff line number | Diff line change
@@ -779,6 +779,11 @@ def _validate_update_manifest_and_get_download_size(
779779
# Rename the column
780780
manifest_df.columns = ["manifest_cp_cmd"]
781781

782+
# remove all rows that do not contain an S3 URL
783+
manifest_df = manifest_df[
784+
manifest_df["manifest_cp_cmd"].str.contains(r"s3://", na=False)
785+
]
786+
782787
# create a copy of the index
783788
index_df_copy = self.index[
784789
[
@@ -916,7 +921,9 @@ def _validate_update_manifest_and_get_download_size(
916921
REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid,
917922
REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url,
918923
FROM
919-
manifest_df )
924+
manifest_df
925+
WHERE
926+
REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') IS NOT NULL)
920927
SELECT
921928
seriesInstanceuid,
922929
index_crdc_series_uuid,

tests/study_manifest_aws.s5cmd

+1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# To download the files in this manifest, first install s5cmd (https://github.com/peak/s5cmd),
22
# then run the following command:
33
# s5cmd --no-sign-request --endpoint-url https://s3.amazonaws.com run study_manifest_aws.s5cmd
4+
study_manifest_cp_command
45
cp s3://idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* .
56
cp s3://idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* .
67
cp s3://idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* .

0 commit comments

Comments
 (0)