# Start from a clean workspace: remove every object in the current
# environment, including dot-prefixed (hidden) names.
# `all.names` must be the logical TRUE; lowercase `t` is base R's transpose
# function, not a logical value, and `all` only worked via partial matching.
rm(list = ls(all.names = TRUE))

Set up filenames

# !!!Update filename
# Used further down as the prefix of the "_PU" output files and in the
# report title.
filename <- "SAP2016 Primaria RAW1 NOPII"

# !!!Update helper functions file
# Name of the R script that is passed to source() below.
functions_vers <- "functions_1.7.R"

Set up data and functions, and create dictionary for dataset review

# Run the helper script named in functions_vers.
# NOTE(review): `mydata` and the helpers called later (encode_location(),
# capture_tables(), ...) are not defined in this file; presumably this
# script creates them -- confirm.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (population <100,000)
# Large Location: Location (population >100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Column names flagged as Direct PII. The paste0() call expands to
# P36_B1_1 ... P36_B3_4 (3 groups x 4 items), in the same order as a
# hand-written list.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI",
  paste0("P36_B", rep(1:3, each = 4), "_", 1:4)
)
# Keep only the columns whose names are not listed in dropvars.
mydata <- mydata[!(names(mydata) %in% dropvars)]

Direct PII-team: Encode field team names

# !!! Removed as it contains identifying information
# DIGITA is dropped outright rather than encoded.
dropvars <- "DIGITA"
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!!Replace vector in "variables" field below with relevant variable names
# The helper is defined outside this file; its result replaces mydata, and
# the frequency tables printed before/after encoding are shown below.
# NOTE(review): mydata is not passed in, so the helper presumably reads the
# global mydata -- confirm.
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica         1         2         3         4         5         6         7 
##      3457       102        93         2        73       125       171         1 
##         8         9        10        11        12        13        14        15 
##         1       166       160        95         3       218       287        52 
##        17        18        20        21        22        23        24        25 
##       137         2        57         1       163         1       143        76 
##        27        28        30        31        33        36        38       123 
##         1         1         1         2         1         1         1         1 
##       141       143       160 
##         1         1         1 
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
## 3457  102   93    2   73  125  171    1    1  166  160   95    3  218  287   52  137 
##   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33   34 
##    2   57    1  163    1  143   76    1    1    1    2    1    1    1    1    1    1 
##   35 
##    1

Small locations: Encode locations with pop <100,000 using random large numbers

# !!!Include relevant variables, but check their population size first to
# confirm they are <100,000

# Location identifier columns handed to the encoding helper.
locvars <- c("CODLOC", "CODMOD")
# NOTE(review): 999999 is presumably the code the helper treats as
# "missing" -- confirm against the helper's definition.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## No Indica    140034    140048    291443    292452    292517    292579    324434 
##        18         1         1         2         1         7         5         2 
##    344125    346836    365231    365373    373373    373537    528426    687528 
##         5        15         8         1        22         5        19         5 
##      <NA> 
##      5481 
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
##  554  555  556  557  558  559  560  561  562  563  564  565  566  567  568  569 <NA> 
##    1    8   15    1    2    5   22   18    5   19    2    7    5    1    5    1 5481 
## [1] "Frequency table before encoding"
## CODMOD. C<f3>digo Modular
## No Indica    203414    203745    207845    207894    207951    208371    208538 
##        18        30         5        18        24        10         1        18 
##    208553    215723    315275    317495    317511    318949    319269    322875 
##        12        27         1        19        13         9         1        31 
##    322958    323311    323451    328120    328401    328435    328443    328450 
##         9         2        25         2         2         1        34        21 
##    329045    334912    334987    335000    335042    335083    335091    335109 
##        26        14        46        19        44        28        30        15 
##    335141    338517    338566    338822    338970    339036    339192    339275 
##        25        17        16         4        13        18        13        26 
##    339432    339606    339804    398081    398123    398479    398578    398669 
##       105        21        44         8         1        13        22         2 
##    400036    433086    433227    433235    433276    433490    433516    433540 
##        23        38        65        13        33        30        52        46 
##    433623    433680    433805    433821    433862    433961    434019    434035 
##        20        37         2        42        24        65        77        15 
##    434076    434159    434191    434282    434399    434464    434480    434498 
##        11        32        58        78        10        55        54        38 
##    434506    434548    434597    434720    434829    436170    436196    436212 
##        38        63        31         9        84        13        26        55 
##    436287    436303    436311    436345    436360    436410    436428    436451 
##        24        52        19        11        35        76        21        28 
##    436493    436501    436543    436550    436576    436600    436634    436675 
##        50        20        32        19        25        28        66        28 
##    436709    436766    436790    436816    436824    466342    468488    478404 
##        25        64        26         1        25        18        56        20 
##    478420    482059    489070    493734    496133    497024    501957    510305 
##        12         1         1        26        14        25        13        22 
##    512020    513614    513713    516674    516872    518548    523464    523662 
##        32         7         8        49        19        10        62        56 
##    523761    523860    524264    525857    526400    527473    527572    528380 
##        62        11        30        21        13        20        27         1 
##    528794    541011    542597    543645    555599    556241    556290    556357 
##         1        24        31        29         9         2        21        25 
##    583922    584946    590133    596932    607432    607457    628404    628826 
##        62        64        26         1        29        24        58        25 
##    629295    632299    632323    632356    633321    647784    649913    659698 
##        68        23         1        27         1        24        23        14 
##    662726    662742    663682    663690    664508    665372    681825    689836 
##        28        17        26         1         7        21        23        10 
##    693465    694315    694455    697557    703736    732461    744540    744557 
##         1        25         1        21        53        31        16        51 
##    763151    764779    765164    765859    776229    777144    778076    781096 
##        31        25        15        60        14        26        32        22 
##    781351    781385    817916    820803    821058    821082    824003    826263 
##        45        59        27         2        21        44         1        19 
##    826479    834465    834853    834960    846048    855213    855270    869032 
##        29        22       111        21        28        42        24        34 
##    872127    873679   1009802   1033729   1053628   1062942   1070390   1072685 
##        19         3        26        22         4        16        16        11 
##   1088400   1098102   1154160   1195841   1196047   1240357   1242072   1248467 
##        19        19        14        22        24        17         7        13 
##   1258649   1261742   1268150   1279637   1309392   1350008   1365816   1474899 
##        35         6        32         1         5        22        31        36 
##   1475045   1475755 
##        21         8 
## [1] "Frequency table after encoding"
## CODMOD. C<f3>digo Modular
## 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##  42  13  56  10  17  24  30  30  32  20  15   1   1  49  21  31  29  23  26  77  13 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 
##  36  55  65  22  25  58   8  29   8  25   1  56  17  33  10  84  18  16  24  24  52 
## 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 
##  21   6  42  25  13  16  31  15  14  26  19  14  27  66  10  24  37   3  12   1  44 
## 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 
##   2   1  78  25  30  13   2  14  20  24  62   9  17  22  19  24  19   1  26  63  28 
## 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 
##  46  30   1  13   1  44  22  16  64  28  62  21  12  13   1  25  19  55  20  19  21 
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 
##  32  15  31   4  58  26  11  11  38   1   1  28  45  68  18   1 111  14  26  21   2 
## 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 
##  21  29 105  24  26   1  31  19  32   1  28   9  14  21  52  25   5  76   7  62  18 
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 
##   2  26  46  65  23  16  38  20  50   7   5  28  32  24  27  21  11  54  13  53  44 
## 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 
##  11  64  27  18  34  13   2  31  59  21   2  10  35  25   8  25   4   9   1  25   1 
## 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 
##   9  32  23  22  19  35  26  22  21  31  23  38  22  25   1  28   2   7  13  27  60 
## 406 407 408 409 410 411 412 413 
##   1  22  19  26  34  18  51  19
# !!! Removed as it contains identifying information
# NOMESC is dropped outright rather than encoded.
dropvars <- "NOMESC"
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.

# These four columns are removed entirely instead of being recoded.
# NOTE(review): DD/MM presumably denote day and month parts of dates --
# confirm against the dictionary.
dropvars <- c("D_DD", "D_MM", "F_DD", "F_MM")
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical,
# and Ordinal if not processed yet)

# The paste0() calls expand to P32_1..P32_4 and P36_A1..P36_A3, giving the
# same 13 names in the same order as a hand-written list.
indirect_PII <- c(
  "TURNO", "GRADO",
  "G_SEXO", "I_SEXO",
  "P28A", "P30",
  paste0("P32_", 1:4),
  paste0("P36_A", 1:3)
)

# Helper defined outside this file; called here only for its side effects
# (its return value is not stored).
capture_tables(indirect_PII)

# Recode those with very specific values. 
# !!! No very specific values

Matching and crosstabulations: Run automated PII check

# !!!Insufficient demographic data

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here:
open_ends <- c("P15B", "P15D")

# Helper defined outside this file; receives the open-end variable names.
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and
# their row number
# !!! Remove, as they contain a lot of sensitive information and they are
# in Spanish.

# Both open-end columns are dropped in one step.
mydata <- mydata[!(names(mydata) %in% c("P15B", "P15D"))]

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

# Write the processed dataset next to the script, once per format. Both
# output names are the input filename with a "_PU" suffix.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)