# Start from a clean workspace so objects left over from a previous dataset
# cannot leak into this run. `all.names = TRUE` also clears dot-prefixed
# (hidden) objects. The argument is spelled out in full and given the logical
# constant TRUE: the previous `all=t` passed the base function `t()`, which is
# not a logical value, so the "all names" request was not honored.
rm(list = ls(all.names = TRUE))
filename <- "SAP2016 Primaria RAW1 NOPII" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
# Load the project helper file. NOTE(review): it presumably defines the
# helpers called below and loads `mydata` from `filename` -- confirm in the
# helper file, since neither is defined in this script.
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag them for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (population <100,000)
# Large Location: Location (population >100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!Include any Direct PII variables
# Columns flagged as direct PII. The twelve P36_B<block>_<item> columns
# (blocks 1-3, items 1-4) are generated rather than typed out, in the same
# order as before.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI",
  paste0("P36_B", rep(1:3, each = 4), "_", rep(1:4, times = 3))
)
# Keep every column whose name is not in the drop list.
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!! DIGITA is removed because it contains identifying information
dropvars <- "DIGITA"
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!!Replace the value of the "variables" argument below with the relevant variable names
# Encode the field-team identifier ENCUES (interviewer code); the frequency
# tables printed before and after encoding are recorded in the output below.
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica 1 2 3 4 5 6 7
## 3457 102 93 2 73 125 171 1
## 8 9 10 11 12 13 14 15
## 1 166 160 95 3 218 287 52
## 17 18 20 21 22 23 24 25
## 137 2 57 1 163 1 143 76
## 27 28 30 31 33 36 38 123
## 1 1 1 2 1 1 1 1
## 141 143 160
## 1 1 1
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 3457 102 93 2 73 125 171 1 1 166 160 95 3 218 287 52 137
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## 2 57 1 163 1 143 76 1 1 1 2 1 1 1 1 1 1
## 35
## 1
# !!!List the relevant location variables, but first check their population size to confirm it is <100,000
locvars <- c("CODLOC", "CODMOD")
# NOTE(review): 999999 is passed as the helper's `missing` argument --
# presumably the code to be treated as missing; confirm in the helper file.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## No Indica 140034 140048 291443 292452 292517 292579 324434
## 18 1 1 2 1 7 5 2
## 344125 346836 365231 365373 373373 373537 528426 687528
## 5 15 8 1 22 5 19 5
## <NA>
## 5481
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
## 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 <NA>
## 1 8 15 1 2 5 22 18 5 19 2 7 5 1 5 1 5481
## [1] "Frequency table before encoding"
## CODMOD. C<f3>digo Modular
## No Indica 203414 203745 207845 207894 207951 208371 208538
## 18 30 5 18 24 10 1 18
## 208553 215723 315275 317495 317511 318949 319269 322875
## 12 27 1 19 13 9 1 31
## 322958 323311 323451 328120 328401 328435 328443 328450
## 9 2 25 2 2 1 34 21
## 329045 334912 334987 335000 335042 335083 335091 335109
## 26 14 46 19 44 28 30 15
## 335141 338517 338566 338822 338970 339036 339192 339275
## 25 17 16 4 13 18 13 26
## 339432 339606 339804 398081 398123 398479 398578 398669
## 105 21 44 8 1 13 22 2
## 400036 433086 433227 433235 433276 433490 433516 433540
## 23 38 65 13 33 30 52 46
## 433623 433680 433805 433821 433862 433961 434019 434035
## 20 37 2 42 24 65 77 15
## 434076 434159 434191 434282 434399 434464 434480 434498
## 11 32 58 78 10 55 54 38
## 434506 434548 434597 434720 434829 436170 436196 436212
## 38 63 31 9 84 13 26 55
## 436287 436303 436311 436345 436360 436410 436428 436451
## 24 52 19 11 35 76 21 28
## 436493 436501 436543 436550 436576 436600 436634 436675
## 50 20 32 19 25 28 66 28
## 436709 436766 436790 436816 436824 466342 468488 478404
## 25 64 26 1 25 18 56 20
## 478420 482059 489070 493734 496133 497024 501957 510305
## 12 1 1 26 14 25 13 22
## 512020 513614 513713 516674 516872 518548 523464 523662
## 32 7 8 49 19 10 62 56
## 523761 523860 524264 525857 526400 527473 527572 528380
## 62 11 30 21 13 20 27 1
## 528794 541011 542597 543645 555599 556241 556290 556357
## 1 24 31 29 9 2 21 25
## 583922 584946 590133 596932 607432 607457 628404 628826
## 62 64 26 1 29 24 58 25
## 629295 632299 632323 632356 633321 647784 649913 659698
## 68 23 1 27 1 24 23 14
## 662726 662742 663682 663690 664508 665372 681825 689836
## 28 17 26 1 7 21 23 10
## 693465 694315 694455 697557 703736 732461 744540 744557
## 1 25 1 21 53 31 16 51
## 763151 764779 765164 765859 776229 777144 778076 781096
## 31 25 15 60 14 26 32 22
## 781351 781385 817916 820803 821058 821082 824003 826263
## 45 59 27 2 21 44 1 19
## 826479 834465 834853 834960 846048 855213 855270 869032
## 29 22 111 21 28 42 24 34
## 872127 873679 1009802 1033729 1053628 1062942 1070390 1072685
## 19 3 26 22 4 16 16 11
## 1088400 1098102 1154160 1195841 1196047 1240357 1242072 1248467
## 19 19 14 22 24 17 7 13
## 1258649 1261742 1268150 1279637 1309392 1350008 1365816 1474899
## 35 6 32 1 5 22 31 36
## 1475045 1475755
## 21 8
## [1] "Frequency table after encoding"
## CODMOD. C<f3>digo Modular
## 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 42 13 56 10 17 24 30 30 32 20 15 1 1 49 21 31 29 23 26 77 13
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
## 36 55 65 22 25 58 8 29 8 25 1 56 17 33 10 84 18 16 24 24 52
## 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
## 21 6 42 25 13 16 31 15 14 26 19 14 27 66 10 24 37 3 12 1 44
## 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
## 2 1 78 25 30 13 2 14 20 24 62 9 17 22 19 24 19 1 26 63 28
## 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
## 46 30 1 13 1 44 22 16 64 28 62 21 12 13 1 25 19 55 20 19 21
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
## 32 15 31 4 58 26 11 11 38 1 1 28 45 68 18 1 111 14 26 21 2
## 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 21 29 105 24 26 1 31 19 32 1 28 9 14 21 52 25 5 76 7 62 18
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
## 2 26 46 65 23 16 38 20 50 7 5 28 32 24 27 21 11 54 13 53 44
## 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
## 11 64 27 18 34 13 2 31 59 21 2 10 35 25 8 25 4 9 1 25 1
## 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
## 9 32 23 22 19 35 26 22 21 31 23 38 22 25 1 28 2 7 13 27 60
## 406 407 408 409 410 411 412 413
## 1 22 19 26 34 18 51 19
# !!! NOMESC is removed because it contains identifying information
dropvars <- "NOMESC"
mydata <- mydata[!(names(mydata) %in% dropvars)]
# Focus on variables whose "Lowest Freq" in the dictionary is 30 or less.
# The D_* and F_* day (DD) and month (MM) columns are dropped here.
dropvars <- c("D_DD", "D_MM", "F_DD", "F_MM")
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!!List the relevant variables below (Indirect PII - Categorical, and Ordinal if not processed yet)
# The numbered P32_* and P36_A* items are generated in the same order as the
# original hand-written list.
indirect_PII <- c(
  "TURNO", "GRADO",
  "G_SEXO", "I_SEXO",
  "P28A", "P30",
  paste0("P32_", 1:4),
  paste0("P36_A", 1:3)
)
# Hand the variable list to the project helper for review.
capture_tables(indirect_PII)
# Recode those with very specific values.
# !!! No very specific values
# !!!Insufficient demographic data
# !!! Identify the open-ended variables here:
open_ends <- c("P15B", "P15D")
# Pass the open-ended variables to the project helper for reporting.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv" and identify the variables to delete or redact, with their row numbers.
# !!! Both open-ended variables are removed: they contain a lot of sensitive
# information and the responses are in Spanish.
mydata <- mydata[!(names(mydata) %in% c("P15B", "P15D"))]
# !!! No GPS data
# Export the processed data as public-use ("_PU") files in Stata and SPSS format.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)
# Build the report title from the dataset name
title_var <- paste0("DOL-ILAB SDC - ", filename)