# Start from a clean workspace. The flag is spelled out as `all.names = TRUE`:
# the previous `all=t` relied on partial argument matching and passed the
# transpose function `t` (or any stray variable named `t`) rather than a
# logical TRUE, so dot-prefixed objects were not reliably cleared.
rm(list = ls(all.names = TRUE))

# Base name of the dataset being processed.
filename <- "SAP2016 Primaria RAW2 NOPII" # !!!Update filename

# Helper script sourced into the session; presumably defines the
# encode_* / capture_* / report_* helpers called further down.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source(functions_vers)
# NOTE(review): `mydata` is used below but is not created in this visible
# section -- presumably it is loaded by the helper file; confirm.
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!Include any Direct PII variables
# Columns flagged as Direct PII; every one of them is removed outright.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI",
  "P36_B1_1", "P36_B1_2", "P36_B1_3", "P36_B1_4",
  "P36_B2_1", "P36_B2_2", "P36_B2_3", "P36_B2_4",
  "P36_B3_1", "P36_B3_2", "P36_B3_3", "P36_B3_4"
)
# Keep only the columns whose name has no match in the drop list.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# !!! Removed as it contains identifying information
dropvars <- "DIGITA"
mydata <- mydata[is.na(match(names(mydata), dropvars))]
# !!!Replace vector in "variables" field below with relevant variable names
# ENCUES (interviewer code) is handed to the helper as a Direct PII-team
# variable; the helper's result replaces the working dataset.
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica 1 2 4 6 7 8 9 10 11 12
## 2361 229 61 80 235 1 1 1 1 1 70
## 14 15 16 18 19 21 23 24 25 26 35
## 2 48 73 47 99 225 113 1 94 1 1
## 56 100 132 239
## 1 1 1 51
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## 2361 229 61 80 235 1 1 1 1 1 70 2 48 73 47 99 225 113 1 94 1 1 1
## 24 25 26
## 1 1 51
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Location-type identifiers passed to the location-encoding helper.
locvars <- c(
  "CODLOC",
  "CODMOD"
)
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## 144536 144602 144782 148129 148983 148997 292188 295097 295120 298835 305831 305911 305925 305973 313478 313727
## 4 6 1 1 2 4 1 5 1 7 1 1 3 1 2 12
## 320624 324486 325438 332165 340274 342843 343753 343767 343871 346638 365170 365311 365325 365759 366075 509239
## 4 2 6 17 9 13 1 7 5 4 6 3 1 7 11 2
## 593638 593657 594124 594435 <NA>
## 2 12 7 7 3621
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
## 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
## 11 2 1 1 3 7 5 6 7 6 6 2 1 4 9 7 2 1 1 13 1 12 2
## 483 484 485 486 487 488 489 490 491 492 493 494 495 <NA>
## 7 3 17 4 7 12 2 1 1 1 4 4 5 3621
## [1] "Frequency table before encoding"
## CODMOD. C<f3>digo Modular
## 202614 203505 207795 207852 207985 208058 208348 208389 208462 208546 208561 208579 208587 208652
## 15 11 19 17 37 38 18 19 7 51 32 34 29 25
## 215632 215848 215897 235010 317040 317131 317156 317214 317230 317263 317289 317313 317370 317438
## 64 23 30 2 1 57 1 25 18 17 60 26 25 29
## 317479 317529 317560 318352 319020 319061 319145 319160 319228 319285 322453 322461 322503 322685
## 71 17 36 4 53 53 38 27 36 21 19 17 17 81
## 322727 322768 322784 322891 322933 322974 323295 323378 323444 323865 327551 328146 328187 328252
## 5 19 1 25 14 31 26 53 30 29 22 2 30 27
## 328336 328344 328351 328369 328385 328393 328518 328526 328997 331702 332213 332239 334821 335166
## 18 21 18 13 1 7 58 85 18 12 13 1 23 27
## 335182 338525 339499 339697 398040 398446 398586 398859 405100 405167 405183 433078 434233 434258
## 18 7 2 1 16 15 13 8 19 1 20 32 1 3
## 434423 434761 436477 436774 482091 482109 489104 496521 496844 496877 497651 502534 510602 510701
## 1 1 1 1 48 29 31 23 61 35 38 24 2 1
## 510800 514224 526301 528281 531368 542720 556597 587279 587303 598482 607416 629329 639674 639732
## 37 22 36 29 36 14 42 31 1 14 1 20 1 1
## 647792 652081 656843 659664 659706 662734 664284 664722 665398 691808 694224 694422 694463 704445
## 30 19 14 23 60 1 35 2 25 51 10 11 10 16
## 705129 728642 759399 762500 762757 764076 772913 775312 780700 780825 835058 847087 870360 876219
## 12 23 46 39 9 56 20 24 34 26 31 6 16 1
## 885517 928820 1010107 1041474 1041623 1045111 1045277 1080068 1083187 1084987 1085919 1196526 1224104 1238542
## 1 20 36 1 1 49 33 31 1 19 13 47 1 16
## 1241082 1246792 1248350 1250695 1264639 1273150 1273234 1313444 1322973 1347798 1364967 1380690 1381342 1381581
## 7 1 40 1 1 23 22 33 1 19 2 31 14 33
## 1381862 1481662 1482546 1506724 1507276 1509496 1635689
## 27 1 1 1 28 7 1
## [1] "Frequency table after encoding"
## CODMOD. C<f3>digo Modular
## 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966
## 18 7 53 1 17 11 16 33 39 17 1 34 31 2 19 36 51 22 51 60 1 17 23
## 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989
## 19 38 8 2 57 1 1 16 1 20 1 25 42 1 7 14 30 31 31 4 38 37 1
## 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
## 19 23 31 13 23 81 18 26 20 1 53 23 1 20 1 23 40 27 18 10 33 32 31
## 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
## 12 5 19 58 10 2 6 71 53 13 33 25 18 61 1 14 30 56 1 48 1 9 15
## 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058
## 25 26 24 19 13 1 47 1 36 18 7 12 35 29 35 1 19 36 1 1 29 30 29
## 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
## 32 24 17 1 27 1 21 1 1 13 19 30 1 28 27 14 29 25 25 3 37 29 1
## 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104
## 22 2 36 1 2 1 31 15 1 18 1 23 46 27 7 38 22 1 17 26 14 14 21
## 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118
## 16 49 19 7 85 64 16 1 2 36 34 20 60 11
# !!! Removed as it contains identifying information
dropvars <- "NOMESC"
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# The D_* / F_* day and month columns are dropped from the dataset.
dropvars <- c("D_DD", "D_MM", "F_DD", "F_MM")
mydata <- mydata[is.na(match(names(mydata), dropvars))]
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c(
  "NIVESC",
  "G_SEXO", "I_SEXO",
  "P25A", "P30",
  "P36_A1", "P36_A2", "P36_A3",
  "P32_1", "P32_2", "P32_3", "P32_4", "P32_5"
)
# Pass the flagged variables to the helper that captures their tables.
capture_tables(indirect_PII)
# Recode those with very specific values.
# !!! No very specific values
# !!!Insufficient demographic data
# !!! Identify open-end variables here:
open_ends <- c("P15B", "P15D")
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# !!! Remove, as they contain a lot of sensitive information and they are in Spanish.
# Both open-ended columns are dropped together in a single step.
mydata <- mydata[is.na(match(names(mydata), c("P15B", "P15D")))]
# !!! No GPS data
# Write the processed dataset next to the input name with a "_PU" suffix,
# once as a Stata file and once as an SPSS file.
haven::write_dta(data = mydata, path = sprintf("%s_PU.dta", filename))
haven::write_sav(data = mydata, path = sprintf("%s_PU.sav", filename))

# Add report title dynamically
title_var <- sprintf("DOL-ILAB SDC - %s", filename)