# Start from a clean workspace: remove every object in the current
# environment, including hidden (dot-prefixed) ones.
# `all.names` is spelled out and given the literal TRUE; a bare `t` is base
# R's transpose function, not a logical, so it must not stand in for TRUE.
rm(list = ls(all.names = TRUE))

Set up filenames

# !!! Update both values for each new dataset.
# Base name of the dataset; reused at the end of the script to name the
# "_PU" output files and the report title.
filename <- "SAP2016 Secundaria RAW1 NOPII"
# Helper-functions script that gets sourced during setup.
functions_vers <- "functions_1.7.R"

Set up data and functions, and create dictionary for dataset review

# Run the helper script named in `functions_vers`.
# NOTE(review): `mydata` and the helpers used below (encode_*, capture_tables,
# report_open) are not defined in this script, so they presumably come from
# this sourced file -- confirm.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (population <100,000)
# Large Location: Location (population >100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!! List every Direct PII variable here; all of them are removed from
# `mydata` without replacement.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI"
)
# Keep only the columns whose names are not in the drop list.
mydata <- mydata[!(names(mydata) %in% dropvars)]

Direct PII-team: Encode field team names

# !!! DIGITA is removed outright because it contains identifying information.
dropvars <- "DIGITA"
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!! Replace the value passed as `variables` with the relevant variable names.
# NOTE(review): in the recorded output below, the 1989 "No indica" rows end up
# under an ordinary code (1) after encoding -- confirm that this is intended.
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica         1         2         3         4         5         6         7 
##      1989       537       429        21       517       896       701         3 
##         9        10        11        12        13        14        15        16 
##      1086       947       439         2      1410      1175       373         6 
##        17        19        20        21        22        24        25        26 
##       872         1       267        10       628       978       341         1 
##        33       112       202       234       300       666 
##         1         1         1         1         1         1 
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
## 1989  537  429   21  517  896  701    3 1086  947  439    2 1410 1175  373    6  872 
##   18   19   20   21   22   23   24   25   26   27   28   29   30 
##    1  267   10  628  978  341    1    1    1    1    1    1    1

Small locations: Encode locations with pop <100,000 using random large numbers

# !!! NOMESC is removed outright because it contains identifying information.
dropvars <- "NOMESC"
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!! Include the relevant variables, but first check their population size
# to confirm they are <100,000.
locvars <- c("CODLOC", "CODMOD")
# NOTE(review): in the recorded CODLOC output below, the 81 "No Indica" rows
# receive an ordinary code (505) after encoding -- confirm whether that label
# should be covered by the `missing` value instead.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## No Indica    140048    142491    148313    291278    292490    292579    292678 
##        81        24         8         4         8        17         7        11 
##    298053    300739    301376    301668    301687    301692    301809    306675 
##         1         8         5        14         2         4         1        13 
##    307458    307477    311120    314519    315062    318348    319319    320426 
##         9         3         2         6         3         5         8        21 
##    320450    324429    324542    324561    324617    324655    324721    324759 
##         2         5         2        11         9         3         4         1 
##    324976    324981    325235    325264    325400    325508    332943    332962 
##         2         1         6         4         9        10         1         8 
##    333103    333377    333396    333400    333443    338681    338695    338817 
##         1        39        46        10         1         5        16         9 
##    338841    343814    343828    344111    346681    365231    687383    687887 
##         6         6        12         6        14         7         3         1 
##      <NA> 
##     13110 
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
##   467   468   469   470   471   472   473   474   475   476   477   478   479   480 
##    21     6     8     3    11     3     2    10     1     4    17     5    10     1 
##   481   482   483   484   485   486   487   488   489   490   491   492   493   494 
##     6    11     9    39     1     1     3     9     2     2     9     4     5     8 
##   495   496   497   498   499   500   501   502   503   504   505   506   507   508 
##    16     2    14     6     6    14     1     8     3    13    81     5     6     5 
##   509   510   511   512   513   514   515   516   517   518   519   520   521   522 
##     2     8     7     4     1    12    24     9     8     1     7     1     4    46 
##  <NA> 
## 13110 
## [1] "Frequency table before encoding"
## CODMOD. Código Modular
## No Indica    209387    209510    209536    209908    209916    209965    209973 
##        81         5        40        75        45        18        80        22 
##    233130    236117    236224    236364    245688    302893    302968    318931 
##         1        54        12        12        91        14        37        15 
##    318949    325563    325589    325605    325647    325654    325670    325712 
##         3       123        40        27        87         1        77         4 
##    328047    328153    328872    329573    334680    334714    334722    334771 
##        21        11         8        73         7         5        26         7 
##    336560    336594    337436    337766    340281    340299    340323    340331 
##        58       106        89        21         6         4        11         8 
##    340356    340364    340380    340398    340414    340463    432773    434076 
##        12         6       123        10        58        10        12        10 
##    434175    434746    436709    437210    437228    437236    437277    437285 
##         1         2         8        32       137       140       108        37 
##    437335    437343    437368    437509    437707    437715    437723    449827 
##        26       126         2        10        33       111        42         3 
##    466342    466722    469700    481820    488619    488635    493544    495150 
##         4         6        70         1        59        38       118         2 
##    495812    496166    497081    499699    500124    500611    501676    501809 
##        14        58         2        93        14         7        81         4 
##    501908    502104    502484    502633    522318    523423    523621    525857 
##        47         3        92        62        14        47        37         1 
##    527572    536128    536326    546002    555862    556340    556472    556571 
##         6        27        40        56        45        82        28        76 
##    566141    566158    566430    566463    566471    567750    573352    578286 
##        87         3        64        57        73        42        37        77 
##    578294    578336    578351    578393    578435    578492    578542    579151 
##        74        64        55         3        47        75        83       101 
##    581710    581736    581876    581892    581900    582122    582148    582163 
##        11       111        88        65        53         5        45        68 
##    582189    582833    582866    582890    582981    583013    583203    583476 
##         1        79         4        57        49       134         3         7 
##    583567    590133    596007    598581    601492    605469    605501    607143 
##        46         3         1        57        30       145         5         1 
##    607432    607697    632356    639922    642892    643221    643783    643841 
##         3       100         6         5         4         2         4         3 
##    644880    647065    650036    659896    662940    662957    663005    663096 
##        51       123        32        85         5        12        15        88 
##    663120    663138    663559    664706    665265    682229    682260    690008 
##         5        35        92         2       107       115        30        73 
##    690024    691931    692434    692707    693499    694588    694604    703215 
##         1        94       114         4        94        59         8        90 
##    703223    703231    703249    703744    703751    705053    705772    720235 
##       106        87        99        61       109        77        10        25 
##    725770    725861    728337    728717    730515    732321    732347    743831 
##        46        22         4       166         6        16        85        66 
##    744573    751230    762773    762856    762864    763169    764928    764936 
##         2        19        77        69       107         1         1        45 
##    765297    765305    765370    773788    774455    776229    777680    777995 
##        52        17         2        68         7         4        80        46 
##    778027    778076    779041    781369    782045    782664    826479    832279 
##         1         3        81       134         1       101         6        21 
##    832311    869032    869198    869222    869230    872515    874206    884510 
##        11         3        42         2        57       122         8        11 
##    884528    884536    884544    884551    884585    900670    900761    900910 
##        48         5        30        91        90         1        65        98 
##    901033    901066    901124    915256    927814   1008440   1008960   1009802 
##        82        84         2        75       110        60        29         8 
##   1009844   1010149   1034016   1041631   1045434   1045632   1053628   1053693 
##        27        30         6       149        11         6        53       137 
##   1054196   1054352   1054394   1054436   1062942   1063106   1070036   1072727 
##       118        72        15       118        11        40        50        60 
##   1073212   1074301   1083674   1083716   1083815   1084508   1147933   1148014 
##        50        18        14        28        59        33         4        31 
##   1152941   1152982   1153022   1153261   1194380   1194810   1195189   1195577 
##       119        62         2         3        36         4        29         1 
##   1195874   1210137   1222595   1240357   1240720   1247832   1248509   1254192 
##        68        73         1         2        66         7        77         1 
##   1258334   1258649   1262211   1279124   1309392   1322593   1336072   1346576 
##         5         1        11        50       110         3         8        40 
##   1369503   1370378   1381987   1390137   1390798   1393453   1398148   1423615 
##        60         1         7         1        54       138        35        23 
##   1432772   1473644   1474964   1475011   1475201   1475284   1475755   1484443 
##         3        57       100        92        69        46         1        48 
##   1495365   1495407   1496355   1497056   1500354   1501451   1505494   1507532 
##        33        56         2        93         5        94       138        62 
##   1512789   1527316   1558725   1641521   1661271 
##        54         3       125        17        46 
## [1] "Frequency table after encoding"
## CODMOD. Código Modular
## 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 
##  42  77  11  40  61  33   8  26  52   2  27   8  33  85  15  75   1  94  70  77  56 
## 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 
##  65 106 125   6   1  10  11   4  14  29  33  18   1   1  54   5   4 106  18  35 118 
## 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 
##   3  48 123   1  58   7  12   5  45  37  10 110 110  12  30 111 107   3  60 100   5 
## 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
## 138  91  77   1   2  87 145   7  92  10  21  14  11   8   8  72  62  60  90  73  73 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 
##  68   7   2 166   4 119  17   5   3  93 138   3  12   7  90   1  75  81   6   1  73 
## 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 
##  56  45  15   8  46   3  11   2   1 109   5  50  31  69  91   1   8  12  92  57  76 
## 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 
##  46  77  28  40   3  45  80   2   3  87   2  64  47  93  50   6   6  14  84   4 149 
## 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 
##  10   2   2  30 123  11  27  27  89   8  87 140   3  36   7  40  92  57   1 137   4 
## 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 
##   1 101  49   3  32   1  35  46   3  59  57 118  45   7   6   4  94  81 122   1   1 
## 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 
##  99  62  29  74  54  60  98   4  58  59  22  22  28 137  88  68  25  11  26   5   1 
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 
##  77  82 114   2  58  66  57  80   2   1  12   1   3  14   3   3  23 111  10  59   5 
## 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 
##   6  51   1  37  15 134  38  46  69  53   1  83  54   3 107 126 118  55  64   1   7 
## 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 
##  62  30  66  14  21   4   2 100  53  47   4   4  21  79  82   6  88 115  40  46  40 
## 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 
## 134 101  11  17  94  19   6   4  37  85  42 123  11  37   3  30  16   4  48   2   2 
## 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 
##  42  68  57   6 108  50   5  65  81  75  32  47  73   5   5

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables whose "Lowest Freq" in the dictionary is 30 or less.
# The four variables below are dropped rather than recoded.
# NOTE(review): presumably day/month components of dates -- confirm against
# the dictionary.
dropvars <- c("D_DD", "D_MM", "F_DD", "F_MM")
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!! List the relevant variables below (Indirect PII - Categorical, plus any
# Ordinal ones that have not been processed yet).
indirect_PII <- c(
  "NIVESC",
  "G_SEXO", "I_SEXO",
  "Q35A", "Q40",
  paste0("Q42_", 1:5)  # Q42_1 ... Q42_5
)

# Pass the listed variables to the capture_tables helper for review.
capture_tables(indirect_PII)

# Recode those with very specific values. 
# !!! No very specific values

Matching and crosstabulations: Run automated PII check

# !!!Insufficient demographic data

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Name the open-ended (free-text) variables here.
open_ends <- c(
  "Q11_B", "Q12_B",
  "Q17_B", "Q18_B",
  "Q23B", "Q23D"
)

# Hand the list to the report_open helper.
# NOTE(review): presumably this is what produces "verbatims.csv", which is
# reviewed in the next step -- confirm in the helper file.
report_open(list_open_ends = open_ends)

# Review "verbatims.csv" and identify the variables to delete or redact,
# together with their row numbers.
# !!! All six open-ended variables are removed: they contain a lot of
# sensitive information and the responses are in Spanish.
mydata <- mydata[!(names(mydata) %in% c(
  "Q11_B", "Q12_B", "Q17_B", "Q18_B", "Q23B", "Q23D"
))]

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

# Write the processed data next to the script as "<filename>_PU", once in
# Stata (.dta) and once in SPSS (.sav) format.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Report title, built from the dataset name so it follows `filename`.
title_var <- paste0("DOL-ILAB SDC - ", filename)