-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess_attached_files.R
252 lines (182 loc) · 11.8 KB
/
process_attached_files.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
### Developed by Luis Castillo and Alicia Mastretta-Yanes
# Define a function to process files
process_attached_files <- function(file_path, kobo_output, delim, skip = 0) {
  ## What this function does
  # If information of more than 25 populations was used to collect data for the
  # Ne > 500 indicator (Section 5 of the Kobo form), it is possible to use a
  # template to upload data instead of using the Kobo form.
  # This function processes the data stored in one of those files, and formats
  # it as the output of running the function get_indicator1_data.R
  ## Arguments:
  # file_path = path to the file that needs to be processed.
  #             Normally this file lives in the "original_files_dir", the result
  #             of get_attached_files.R creating the directory and moving the
  #             files there.
  #             The file name MUST include the Xuuid between two "__" marks
  #             (as resulting from running get_attached_files.R).
  # kobo_output = a data frame with the raw (hundreds of columns) Kobo output
  #               as downloaded from Kobo
  # delim = delimiter of the file (",", ";", "\t", etc.) passed to read_delim()
  # skip = number of lines to skip before reading data, passed to read_delim().
  #        Defaults to 0.
  ## Value:
  # A data frame with the population data of the file (one row per row of the
  # file when exactly one kobo_output row matches the Xuuid), the matching
  # Kobo metadata, and the columns arranged as listed in `desired_order` below.
  ## Needed libraries (expected to be attached by the caller):
  # library(dplyr)
  # library(readr)
  # library(stringr)
  # library(tools)
  # utile.tools is also needed when kobo_output has no "taxon" column

  ### Read file
  temp_df <- read_delim(file_path, delim = delim, col_names = TRUE,
                        show_col_types = FALSE, skip = skip)

  ### Get Xuuid from file name
  # The Xuuid is the text between the first "__" and the second "__".
  # basename() is used so that a "__" in a directory name cannot be mistaken
  # for the marker. [1, 1] is used because we want the character result.
  Xuuid_filename <- str_match(basename(file_path), "(?<=__).*?(?=__)")[1, 1]

  ####### Separate data in kobo_output as in get_indicator1 function
  # Create a variable with the full taxon name if it doesn't exist already
  # (raw kobo output doesn't include it, but it may exist in a "clean" version
  # of the output if it ran through the quality check pipeline).
  if (!("taxon" %in% colnames(kobo_output))) {
    kobo_output <- kobo_output %>%
      mutate(taxon = (utile.tools::paste(genus, species, subspecies_variety, na.rm = TRUE))) %>%
      # remove white space at the end of the name
      mutate(taxon = str_trim(taxon, "right"))
  }

  ## Add a variable stating if the taxon was assessed multiple times or once
  # duplicated() is run twice, the second time with fromLast = TRUE, so that
  # the first occurrence is also flagged, i.e. all records sharing the same
  # taxon within a given country are captured.
  taxon_by_country <- kobo_output[c("taxon", "country_assessment")]
  is_repeated <- duplicated(taxon_by_country) |
    duplicated(taxon_by_country, fromLast = TRUE)
  kobo_output_duplicates <- kobo_output[which(is_repeated), ]

  # Tag by X_uuid membership: "multiassessment" if duplicated within the
  # country, otherwise "single_assessment"
  kobo_output <- kobo_output %>%
    mutate(multiassessment = if_else(
      X_uuid %in% kobo_output_duplicates$X_uuid, "multiassessment", "single_assessment"))

  ## Process data already including taxon column and multiassessment
  kobo_output <- kobo_output %>%
    # year in which the assessment was done (first 4 characters of `end`)
    mutate(year_assesment = substr(end, 1, 4)) %>%
    # make sure some variables that seem numbers are actually character,
    # because there may be character and integer values depending on how the
    # data was written
    mutate(across(starts_with("IntroductionYear"), as.character)) %>%
    mutate(across(starts_with("NeYear"), as.character)) %>%
    mutate(across(starts_with("NcYear"), as.character)) %>%
    mutate(across(starts_with("NcRangeDetails"), as.character))

  ###### Process attachment file
  # columns with population data that should exist
  required_pop_columns <- c("populationID", "PopulationName", "Origin", "IntroductionYear",
                            "Ne", "NeLower", "NeUpper", "NeYear", "GeneticMarkers", "GeneticMarkersOther",
                            "MethodNe", "SourceNe", "NcType", "NcYear", "NcMethod", "NcRange", "NcRangeDetails",
                            "NcPoint", "NcLower", "NcUpper", "SourceNc", "Comments")

  ### 1) Check if all required columns are present; create the missing ones
  df <- temp_df
  missing_columns <- setdiff(required_pop_columns, names(df))
  if (length(missing_columns) == 0) {
    print("all requiered population columns are present in the file :)")
  } else {
    # tell the user which columns are missing (one message per column)
    print(paste("the following column is missing in the file and it will be created as an empity variable:", missing_columns))
    # create an empty (NA) variable for each missing column
    for (col_name in missing_columns) {
      df <- df %>%
        mutate(!!col_name := NA) # !! unquotes col_name so it is used as the column name
    }
  }

  ### 2) Join population data and metadata (metadata comes from kobo_output,
  ### not from the attachment file, to assure it is correct)
  # Keep only population data columns and add X_uuid column
  df <- df %>%
    select(all_of(required_pop_columns)) %>%
    mutate(X_uuid = Xuuid_filename)

  # Find the kobo_output row matching the Xuuid of the file
  matching_row <- kobo_output %>%
    filter(X_uuid == Xuuid_filename) %>%
    select(country_assessment, taxonomic_group, time_populations, taxon,
           scientific_authority, name_assessor, email_assessor, kobo_tabular, genus, species, subspecies_variety,
           X_validation_status, X_uuid, year_assesment, GBIF_taxonID, NCBI_taxonID, multiassessment,
           national_taxonID, defined_populations)

  # Zero matches leave every metadata column NA; several matches duplicate the
  # population rows in the join below. Neither is fatal, so warn and continue.
  if (nrow(matching_row) != 1) {
    warning("Expected exactly one row in kobo_output with X_uuid `", Xuuid_filename,
            "` (taken from the name of file ", file_path, ") but found ",
            nrow(matching_row), "; metadata may be missing or duplicated.",
            call. = FALSE)
  }

  # Join kobo_output metadata and file population data
  df <- left_join(df, matching_row, by = "X_uuid")

  ## Further clean population data:
  ## Make sure numeric columns are numbers
  for (x in c("Ne", "NeLower", "NeUpper", "NcPoint", "NcLower", "NcUpper")) {
    if (is.character(df[[x]])) {
      # cat() already writes the message; wrapping it in print() would add a
      # stray "NULL" line to the output
      cat("variable", x, "is stored as character and should be numeric, so this function will:\n",
          "1) check if there are (), for instance `86 (95% CI)`, and remove them keeping only the value outside ().\n",
          "2) convert ',' to '' IF 3 or more characters follow the ',' (ie we assumed ',' is separating thousands), OR\n",
          "   convert ',' to '.' IF fewer than 3 characters follow the ',' (ie we assumed ',' is separating decimal points); and\n",
          "3) use as.numeric().\n",
          "You should check the original data to make sure the transformation was correct\n")

      ## 1) remove "(...)" and any white space before it
      values <- gsub(pattern = "\\s*\\([^\\)]+\\)", replacement = "", df[[x]])

      ## 2) numeric variables appear as character if "," was used in the file
      # Count characters after the first comma (values without a comma are
      # left untouched by both replacements below)
      characters_after_comma <- nchar(sub("^[^,]*,", "", values))
      # Thousands separators: drop ALL commas (gsub), so "1,234,567" works too.
      # Decimal comma: replace the comma with "."
      values <- ifelse(characters_after_comma >= 3,
                       gsub(",", "", values, fixed = TRUE),
                       sub(",", ".", values, fixed = TRUE))

      ## 3) transform to numeric
      df[[x]] <- as.numeric(values)
    } else {
      df[[x]] <- as.numeric(df[[x]])
    }
  }

  ### 3) NcRange and NcType should have only the values specified in the
  ### template. Any other value is changed to NA.
  ## NcRange
  expected_nc_range <- c("less_5000", "less_5000_bymuch", "more_5000", "more_5000_bymuch", "range_includes_5000", NA)
  valid_nc_range <- df$NcRange %in% expected_nc_range
  if (!all(valid_nc_range)) {
    print("NcRange values could only be 'less_5000', 'less_5000_bymuch', 'more_5000', 'more_5000_bymuch', 'range_includes_5000'. Other values were found and were changed to NA")
  }
  df <- df %>%
    mutate(NcRange = ifelse(valid_nc_range, NcRange, NA))

  ## NcType
  expected_nc_type <- c("Nc_point", "Nc_range", NA)
  valid_nc_type <- df$NcType %in% expected_nc_type
  if (!all(valid_nc_type)) {
    print("NcType values could only be 'Nc_point', 'Nc_range'. Other values were found and were changed to NA")
  }
  df <- df %>%
    mutate(NcType = ifelse(valid_nc_type, NcType, NA))

  ### 4) Rename and fill columns that should not be empty, if needed
  # Rename id and name columns to match the desired ind1_data names. Both
  # columns are guaranteed to exist after select(all_of(required_pop_columns)).
  # Population ids should be pop1, pop2, ... Since people could have written
  # all sorts of things, they are all overwritten with that format.
  df <- df %>%
    rename(population = populationID, Name = PopulationName) %>%
    mutate(population = paste0("pop", row_number()))

  ## Fix problematic conditional columns
  # if NcPoint data was provided then NcType should exist
  point_without_type <- !is.na(df$NcPoint) & is.na(df$NcType)
  if (any(point_without_type)) {
    print("NcPoint data was provided so NcType should exist but was not provided, setting NcType = `Nc_point` for relevant pops")
  }
  df <- df %>%
    mutate(NcType = ifelse(point_without_type, "Nc_point", NcType))

  # if NcRange data was provided then NcType should exist
  range_without_type <- !is.na(df$NcRange) & is.na(df$NcType)
  if (any(range_without_type)) {
    print("NcRange data was provided so NcType should exist but was not provided, setting NcType = `Nc_range` for relevant pops")
  }
  df <- df %>%
    mutate(NcType = ifelse(range_without_type, "Nc_range", NcType))

  # If there is no Nc data, NcMethod and NcType should be NA
  no_nc_data <- is.na(df$NcRange) & is.na(df$NcRangeDetails) & is.na(df$NcPoint)
  # only report when a non-NA value is actually going to be replaced
  value_replaced <- no_nc_data & (!is.na(df$NcMethod) | !is.na(df$NcType))
  if (any(value_replaced)) {
    print("If there is no Nc data, NcMethod and NcType should be NA, this was not the case so NA were introduced to replace the value. Check original data to make sure it is correct")
  }
  df <- df %>%
    mutate(NcMethod = ifelse(no_nc_data, NA, NcMethod),
           NcType = ifelse(no_nc_data, NA, NcType))

  ## change all "" (empty) cells to NA
  # Only character columns can hold "". Restricting to them also avoids the
  # type error that na_if(<numeric>, "") raises in dplyr >= 1.1.0.
  df <- df %>%
    mutate(across(where(is.character), ~ na_if(.x, "")))

  ### 5) Change columns to desired order
  desired_order <- c(
    "country_assessment", "taxonomic_group", "taxon", "scientific_authority",
    "genus", "year_assesment", "name_assessor", "email_assessor", "kobo_tabular", "defined_populations",
    "time_populations", "X_validation_status", "X_uuid", "multiassessment", "population",
    "Name", "Origin", "IntroductionYear", "Ne", "NeLower", "NeUpper",
    "NeYear", "GeneticMarkers", "GeneticMarkersOther", "MethodNe", "SourceNe",
    "NcType", "NcYear", "NcMethod", "NcRange", "NcRangeDetails", "NcPoint",
    "NcLower", "NcUpper", "SourceNc", "Comments")
  # all_of() makes explicit that desired_order is an external character vector
  df <- df %>% select(all_of(desired_order))

  ### 6) Return data
  df
}