# Start from a clean workspace before running the anonymisation steps.
# `all.names = TRUE` is spelled out in full: the previous `all=t` relied on
# partial argument matching and passed `t` (the base transpose function)
# instead of `TRUE`, so dot-prefixed objects were not reliably cleared.
rm(list = ls(all.names = TRUE))

Set up filenames

# Base name of the raw dataset; it is reused further down to build the
# output file names and the report title.
filename <- "SAP2016 Primaria RAW2 NOPII"  # !!!Update filename

# Script holding the helper functions that is sourced below.
functions_vers <- "functions_1.7.R"  # !!!Update helper functions file

Set up data and functions, and create the dictionary for dataset review

# Run the helper script named in `functions_vers`.
# NOTE(review): presumably this also loads `mydata` (used below) based on
# `filename` -- confirm against the helper file.
source(functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent names, addresses, identification numbers, phone numbers
# Direct PII-team: Interviewer names, other field team names
# Indirect PII-ordinal: Date of birth, age, income, education, household composition
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, latitude
# Small Location: Location with population <100,000
# Large Location: Location with population >100,000
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary

Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Columns flagged as direct PII; they are removed outright rather than encoded.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI",
  # P36_B1_1 ... P36_B1_4, P36_B2_1 ... P36_B2_4, P36_B3_1 ... P36_B3_4
  paste0("P36_B", rep(1:3, each = 4), "_", rep(1:4, times = 3))
)

# Keep every column whose name is not in the drop list.
mydata <- mydata[!(names(mydata) %in% dropvars)]

Direct PII-team: Encode field team names

# !!! Removed as it contains identifying information
dropvars <- "DIGITA"
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!!Replace vector in "variables" field below with relevant variable names
# The helper prints frequency tables before and after encoding (see the
# echoed output that follows).
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica         1         2         4         6         7         8         9        10        11        12 
##      2361       229        61        80       235         1         1         1         1         1        70 
##        14        15        16        18        19        21        23        24        25        26        35 
##         2        48        73        47        99       225       113         1        94         1         1 
##        56       100       132       239 
##         1         1         1        51 
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23 
## 2361  229   61   80  235    1    1    1    1    1   70    2   48   73   47   99  225  113    1   94    1    1    1 
##   24   25   26 
##    1    1   51

Small locations: Encode locations with pop <100,000 using random large numbers

# !!!Include relevant variables, but check their population size first to
# confirm they are <100,000

# Location-code columns passed to the encoding helper; 999999 is handed over
# as the `missing` argument.
locvars <- c("CODLOC", "CODMOD")
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## 144536 144602 144782 148129 148983 148997 292188 295097 295120 298835 305831 305911 305925 305973 313478 313727 
##      4      6      1      1      2      4      1      5      1      7      1      1      3      1      2     12 
## 320624 324486 325438 332165 340274 342843 343753 343767 343871 346638 365170 365311 365325 365759 366075 509239 
##      4      2      6     17      9     13      1      7      5      4      6      3      1      7     11      2 
## 593638 593657 594124 594435   <NA> 
##      2     12      7      7   3621 
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
##  460  461  462  463  464  465  466  467  468  469  470  471  472  473  474  475  476  477  478  479  480  481  482 
##   11    2    1    1    3    7    5    6    7    6    6    2    1    4    9    7    2    1    1   13    1   12    2 
##  483  484  485  486  487  488  489  490  491  492  493  494  495 <NA> 
##    7    3   17    4    7   12    2    1    1    1    4    4    5 3621 
## [1] "Frequency table before encoding"
## CODMOD. Código Modular
##  202614  203505  207795  207852  207985  208058  208348  208389  208462  208546  208561  208579  208587  208652 
##      15      11      19      17      37      38      18      19       7      51      32      34      29      25 
##  215632  215848  215897  235010  317040  317131  317156  317214  317230  317263  317289  317313  317370  317438 
##      64      23      30       2       1      57       1      25      18      17      60      26      25      29 
##  317479  317529  317560  318352  319020  319061  319145  319160  319228  319285  322453  322461  322503  322685 
##      71      17      36       4      53      53      38      27      36      21      19      17      17      81 
##  322727  322768  322784  322891  322933  322974  323295  323378  323444  323865  327551  328146  328187  328252 
##       5      19       1      25      14      31      26      53      30      29      22       2      30      27 
##  328336  328344  328351  328369  328385  328393  328518  328526  328997  331702  332213  332239  334821  335166 
##      18      21      18      13       1       7      58      85      18      12      13       1      23      27 
##  335182  338525  339499  339697  398040  398446  398586  398859  405100  405167  405183  433078  434233  434258 
##      18       7       2       1      16      15      13       8      19       1      20      32       1       3 
##  434423  434761  436477  436774  482091  482109  489104  496521  496844  496877  497651  502534  510602  510701 
##       1       1       1       1      48      29      31      23      61      35      38      24       2       1 
##  510800  514224  526301  528281  531368  542720  556597  587279  587303  598482  607416  629329  639674  639732 
##      37      22      36      29      36      14      42      31       1      14       1      20       1       1 
##  647792  652081  656843  659664  659706  662734  664284  664722  665398  691808  694224  694422  694463  704445 
##      30      19      14      23      60       1      35       2      25      51      10      11      10      16 
##  705129  728642  759399  762500  762757  764076  772913  775312  780700  780825  835058  847087  870360  876219 
##      12      23      46      39       9      56      20      24      34      26      31       6      16       1 
##  885517  928820 1010107 1041474 1041623 1045111 1045277 1080068 1083187 1084987 1085919 1196526 1224104 1238542 
##       1      20      36       1       1      49      33      31       1      19      13      47       1      16 
## 1241082 1246792 1248350 1250695 1264639 1273150 1273234 1313444 1322973 1347798 1364967 1380690 1381342 1381581 
##       7       1      40       1       1      23      22      33       1      19       2      31      14      33 
## 1381862 1481662 1482546 1506724 1507276 1509496 1635689 
##      27       1       1       1      28       7       1 
## [1] "Frequency table after encoding"
## CODMOD. Código Modular
##  944  945  946  947  948  949  950  951  952  953  954  955  956  957  958  959  960  961  962  963  964  965  966 
##   18    7   53    1   17   11   16   33   39   17    1   34   31    2   19   36   51   22   51   60    1   17   23 
##  967  968  969  970  971  972  973  974  975  976  977  978  979  980  981  982  983  984  985  986  987  988  989 
##   19   38    8    2   57    1    1   16    1   20    1   25   42    1    7   14   30   31   31    4   38   37    1 
##  990  991  992  993  994  995  996  997  998  999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 
##   19   23   31   13   23   81   18   26   20    1   53   23    1   20    1   23   40   27   18   10   33   32   31 
## 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 
##   12    5   19   58   10    2    6   71   53   13   33   25   18   61    1   14   30   56    1   48    1    9   15 
## 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 
##   25   26   24   19   13    1   47    1   36   18    7   12   35   29   35    1   19   36    1    1   29   30   29 
## 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 
##   32   24   17    1   27    1   21    1    1   13   19   30    1   28   27   14   29   25   25    3   37   29    1 
## 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 
##   22    2   36    1    2    1   31   15    1   18    1   23   46   27    7   38   22    1   17   26   14   14   21 
## 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 
##   16   49   19    7   85   64   16    1    2   36   34   20   60   11
# !!! Removed as it contains identifying information
dropvars <- "NOMESC"
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.

# D_DD, D_MM, F_DD, F_MM are dropped from the dataset (no recoding applied).
dropvars <- paste0(
  rep(c("D", "F"), each = 2),
  "_",
  rep(c("DD", "MM"), times = 2)
)
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical,
# and Ordinal if not processed yet)

indirect_PII <- c(
  "NIVESC",
  "G_SEXO", "I_SEXO",
  "P25A", "P30",
  paste0("P36_A", 1:3),  # P36_A1 ... P36_A3
  paste0("P32_", 1:5)    # P32_1 ... P32_5
)

# Hand the variable list to the tabulation helper for review.
capture_tables(indirect_PII)

# Recode those with very specific values.
# !!! No very specific values

Matching and crosstabulations: Run automated PII check

# !!!Insufficient demographic data

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here:
open_ends <- c("P15B", "P15D")

# Report the open-ended responses for manual review.
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and
# their row number
# !!! Remove, as they contain a lot of sensitive information and they are in
# Spanish.

# Both open-end columns are dropped in a single step.
mydata <- mydata[!(names(mydata) %in% c("P15B", "P15D"))]

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

# Write the processed data next to the script: "<filename>_PU.dta" for Stata
# and "<filename>_PU.sav" for SPSS.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)