# Start from a clean workspace: remove every object in the current
# environment, including hidden (dot-prefixed) names.
# `all.names` is spelled out in full and set to the logical constant TRUE;
# a lowercase `t` is base R's transpose function, not a logical value.
rm(list = ls(all.names = TRUE))

Setup filenames

# Base name of the dataset being processed.
# !!!Update filename
filename <- "SAP2016 Secundaria RAW2 NOPII"

# Script holding the helper functions used by this review.
# !!!Update helper functions file
functions_vers <- "functions_1.7.R"

Setup data, functions and create dictionary for dataset review

# Run the helper script named in `functions_vers`.
# NOTE(review): `mydata` is not assigned anywhere in the visible script
# before this point, so it is presumably created by the sourced file - confirm.
source (functions_vers)
# Tabulate the `Seccion` column of `mydata`.
# NOTE(review): the recorded output below shows the column was not found and
# the table is empty - confirm the intended column name or drop this check.
table(mydata$Seccion)
## Warning: Unknown or uninitialised column: `Seccion`.
## < table of extent 0 >

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Columns listed here are removed from `mydata` outright.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI"
)
# Keep only the columns whose name is not listed in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]

Direct PII-team: Encode field team names

# !!! Removed as it contains identifying information
dropvars <- "DIGITA"
# Keep only the columns whose name is not listed in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!!Replace vector in "variables" field below with relevant variable names
# The helper's return value replaces `mydata`; the before/after frequency
# tables it printed are recorded below.
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica         1         3         4         6         7        10        11 
##       165       527         1       602       790         3         1         1 
##        13        15        16        18        19        21        23        24 
##         1       501       543       499         1      1094       465         1 
##        25        26        41        42        65        66       101       158 
##       490         1         1         1         1         1         1         1 
##       196       201 
##         1         1 
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
##  165  527    1  602  790    3    1    1    1  501  543  499    1 1094  465    1  490 
##   18   19   20   21   22   23   24   25   26 
##    1    1    1    1    1    1    1    1    1

Small locations: Encode locations with pop <100,000 using random large numbers

# !!! Removed as it contains identifying information
dropvars <- "NOMESC"
# Keep only the columns whose name is not listed in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]

#  !!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("CODLOC", "CODMOD")
# The helper's return value replaces `mydata`; 999999 is passed as its
# `missing` argument.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## 144536 144579 144621 144683 144796 144824 288134 288214 292150 292254 305789 305925 
##     10     15      1     12      3      3      1     17      6      1      1      8 
## 308943 313478 313727 320619 320624 324014 324486 324491 324740 324896 325221 325424 
##     11     10      3     10      4      3      5      1      6      3      1      2 
## 325438 326838 329691 343866 346638 593638 594124 594435 594459   <NA> 
##      5      1     12      1      1      9      1      6     11   5510 
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
##  737  738  739  740  741  742  743  744  745  746  747  748  749  750  751  752  753 
##    1    1    3   12   10    1    6    5    9   10    6   15   11    2   11    1    3 
##  754  755  756  757  758  759  760  761  762  763  764  765  766  767  768  769 <NA> 
##    3   17    1    6    3    8   12    4   10    1    1    5    3    1    1    1 5510 
## [1] "Frequency table before encoding"
## CODMOD. Código Modular
##  245662  325704  329326  336495  336511  336545  336628  337592  340315  340349 
##      99      57     115      68      47      11     100      10      62      68 
##  437244  437293  437319  437327  449868  493742  493841  495259  500348  501502 
##     118      91      67       8      95      26      63      26      18       3 
##  501601  535823  556449  565200  565234  565267  566414  578401  578468  578526 
##      57      50       5      22      82      15       4      51       3      85 
##  578534  582262  582304  582312  582932  583021  583104  583591  603878  642801 
##      95      81      47      12     125     125     106      57      53      89 
##  642926  643163  643262  643692  644690  647172  650002  659953  664292  664748 
##      74       1      69       7       6      68       9      95       1      91 
##  665281  665463  693622  694547  694562  694570  694596  743773  759613  762849 
##     127       2      95      58       4      97       9      50      63      80 
##  762880  762914  765313  765321  774026  774679  774703  775346  777656  778233 
##     138      61      11      51      58     101      88       1      61      47 
##  778738  781930  782102  832337  870931  870956  874214  884627  900647  900738 
##      53     165      82      49      11       3      61      10       1       4 
##  900944 1007160 1010180 1053669 1070077 1071919 1074509 1083633 1087295 1194265 
##       3      12       1      58     130      80      16       1       4      89 
## 1238948 1248392 1264670 1273275 1335637 1364975 1380740 1381219 1381334 1381896 
##       3     169       3     122     163       1     116     142      92      11 
## 1475250 1476258 1477264 
##      78      14       8 
## [1] "Frequency table after encoding"
## CODMOD. Código Modular
## 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 
##  88  16   9   4 165  91   4  50  51   3   3  11   8  89  95 125  47  80   1   2  12 
## 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 
##  10   1  12  68  97 125  81 163  57  61   1  26  11  53  58  57 100  80  68  95  74 
## 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 
##   9   5 122  53  11  62  91  47  58 101  92  82 118  58 142  26 169 115  49   7   6 
## 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 
##  63  61   1  63 127  68   1  15  47 138  18  67  22  61   1  57 106   3  89  78   4 
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 
## 116   8  14  99   3  95 130  85  50  11   3   4  82  95  51  10   3   1  69

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.

# These four columns are removed from `mydata` rather than recoded.
dropvars <- c("D_DD", "D_MM", "F_DD", "F_MM")
# Keep only the columns whose name is not listed in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables handed to capture_tables() for review.
indirect_PII <- c(
  "NIVESC",
  "G_SEXO", "I_SEXO",
  "Q42_1", "Q42_2", "Q42_3", "Q42_4",
  "Q43"
)

capture_tables(indirect_PII)

# Recode those with very specific values.
# !!! No very specific values

Matching and crosstabulations: Run automated PII check

# !!!Insufficient demographic data

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here: 
open_ends <- c("Q11_B", "Q12_B", "Q17_B", "Q18_B", "Q23B", "Q23D")

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 
# !!! Remove, as they contain a lot of sensitive information and they are in Spanish.
# Every variable in `open_ends` is dropped in one step; list names explicitly
# here instead if some open-end variables should be kept.
mydata <- mydata[!(names(mydata) %in% open_ends)]

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

# Export `mydata` twice, as "<filename>_PU.dta" and "<filename>_PU.sav".
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)