# Start from a clean workspace. `all.names = TRUE` also removes objects whose
# names begin with a dot. The earlier `all=t` passed the function `t`
# (transpose) rather than the logical `TRUE`, and relied on partial matching
# of the `all.names` argument.
rm(list = ls(all.names = TRUE))

Setup filenames

# Name of the dataset under review; the same string is reused further down to
# build the export file names and the report title. !!!Update filename
filename <- "SAP2016 Secundaria RAW3 NOPII_relabelled"

# Script holding the helper functions that is sourced next.
# !!!Update helper functions file
functions_vers <- "functions_1.7.R"

Setup data, functions and create dictionary for dataset review

# Run the helper script named in functions_vers.
# NOTE(review): `mydata` is used later without being created in this script,
# so it is presumably loaded by the helper file - confirm there.
source(functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!!List every Direct PII variable that must be removed
dropvars <- c(
  "a_nom", "c_nom",
  "a_apepat", "c_apepat",
  "a_apemat", "c_apemat",
  "j_dni", "l_dni"
)
# Keep only the columns whose names are not listed in dropvars.
mydata <- mydata[!(names(mydata) %in% dropvars)]

Direct PII-team: Encode field team names

# !!!Replace the vector passed as `variables` with the relevant variable names
# NOTE(review): the helper receives only variable names, so it presumably reads
# `mydata` from the calling environment - confirm in the helper file.
mydata <- encode_direct_PII_team(variables = c("dig", "encues"))
## [1] "Frequency table before encoding"
## dig. dig
##    1    3    4    5 
##    2 1668 1650    2 
## [1] "Frequency table after encoding"
## dig. dig
##    1    2    3    4 
##    2 1668 1650    2 
## [1] "Frequency table before encoding"
## encues. c<f3>digo del encuestador
## no indica         1         2         7        12        15        19        22 
##       970       612       621         1       464         1       650         1 
##        29 
##         2 
## [1] "Frequency table after encoding"
## encues. c<f3>digo del encuestador
##   1   2   3   4   5   6   7   8   9 
## 970 612 621   1 464   1 650   1   2

Small locations: Encode locations with pop <100,000 using random large numbers

# !!! Dropped because it contains identifying information
dropvars <- "nomesc"
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!!List the location variables to encode, but first check their population
# size to confirm they are <100,000
locvars <- c("codloc", "codmod")
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## codloc. c<f3>digo de local
##     -9 144560 144782 144800 146132 146187 146192 146253 146432 148129 148346 148964 
##      1     82     88     97      1     17      6    102     12    124     13      2 
## 148983 148997 149020 149044 149063 149077 149261 165170 288469 288520 288860 295064 
##    159    120    136      3     97      1    168      1      3      7    124     14 
## 295097 295115 295120 295158 298736 298779 298835 298939 299023 299024 299043 304860 
##     10     66     13      8     39     26     46      7      1     32      4      7 
## 304898 304916 304935 304940 304983 305020 305066 305082 305124 305142 308900 316641 
##     92     41     41      5     72      3      1    107     21      1     23    118 
## 316679 319239 320105 332165 341117 342843 361641 365066 365170 365311 365325 365330 
##     30     48    152     76      1    131      1     93     78     44     63      3 
## 365532 365759 365900 365995 366075 366706 366872 366886 367051 725870 
##      1     56    134      8     68      3      5     33     25    107 
## [1] "Frequency table after encoding"
## codloc. c<f3>digo de local
## 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 
##   8   1   1 159  33 107   2   3 131  13   1  14  48 107 118   7  44   1  39   8   4 
## 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 
##   3   5  97  26  41  12   6  66  17 124  63  88  78  93   1   1  76 136  68   1   7 
## 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 
##  46   1  13 124   1   3 152   7  25   3 134  82  21  41  30  56  97 102 168  72   5 
## 703 704 705 706 707 708 709 
##  23 120  92   3  32  10   1 
## [1] "Frequency table before encoding"
## codmod. c<f3>digo modular
##      -9  207449  233056  236109  236174  245696  302943  302950  304444  305656 
##      22      13      17       6     159      23       5      33     134      10 
##  324772  325449  325456  325464  325472  325506  325548  325662  330464  334672 
##      12      33      14     131     119       8      26       1      66      22 
##  334730  336586  337741  466730  504993  536714  578260  578278  591131  591164 
##       7     107       3     124      39      25       7       3       2      13 
##  591198  599365  616185  663971  663974  700446  704460  735035  785097  928200 
##     136     152     124      91       1       1      29       3     102       1 
##  933598 1008929 1056944 1063148 1063221 1063304 1085976 1147537 1147651 1153147 
##     120       5      46      41      41      72       4       8       3      79 
## 1227225 1260311 1262930 1263011 1324772 1381375 1381599 1386168 1386234 1390442 
##       1       1      68      63       1      88      96     168      97      44 
## 1411438 1467601 1470368 1497601 1511351 1527225 1563238 1640556 1697234 3181599 
##      76       1       3      47     107      72      56      82       7       1 
## [1] "Frequency table after encoding"
## codmod. c<f3>digo modular
## 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 
##   2 124   1  96 120  44  41  68   1  79  66 124  25  22 102 168   4  33  63 107   1 
## 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 
##  46  72  82   1   8   7  12   1  23 152   6  76  10 136   3   3   8  97  26  56   7 
## 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 
##   5 131   3   1   1  13 119   1  33   3   1  22 159  13   5  39  29  88   7  17 134 
## 624 625 626 627 628 629 630 
##  91   3  14  41  47 107  72

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables whose "Lowest Freq" in the dictionary is 30 or less.
# The variables listed below are removed from the dataset outright.
dropvars <- c("d_dd", "d_mm", "f_dd", "f_mm")
mydata <- mydata[!(names(mydata) %in% dropvars)]

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!List the relevant variables below (Indirect PII - Categorical, plus any
# Ordinal variables not processed yet)
indirect_PII <- c(
  "nivesc",
  "g_sexo", "i_sexo",
  "q42_1", "q42_2", "q42_3", "q42_4",
  "q43"
)

capture_tables(indirect_PII)

# Recode variables that have very specific values.
# !!! No very specific values

Matching and crosstabulations: Run automated PII check

# Categorical key variables used for the automated PII check.
# ##!!! Replace with candidate categorical demo vars
selectedKeyVars <- c("grado", "d_aa", "g_sexo")


# Build the sdcMicro object on those key variables and print its summary.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 3322 rows and 348 variables.
##   --> Categorical key variables: grado, d_aa, g_sexo
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size            Size of smallest (>0)
##         grado                    5  (5)   664.400  (664.400)                   511
##          d_aa                   14 (14)   237.286  (237.286)                     1
##        g_sexo                    3  (3)  1107.333 (1107.333)                    91
##       
##  (511)
##    (1)
##   (91)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 10 (0.301%)
##   - 3-anonymity: 20 (0.602%)
##   - 5-anonymity: 59 (1.776%)
## 
## ----------------------------------------------------------------------

Show values of key variable of records that violate k-anonymity

mydata <- labelDataset(mydata)
# Logical mask of the records that violate 2-anonymity: rows whose value in
# column 2 of the individual risk table is below 2.
# NOTE(review): column 2 is presumably the sample frequency count - confirm
# against the sdcMicro documentation.
notAnon <- sdcInitial@risk$individual[, 2] < 2
# Print the key variables of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 10 x 3
##        grado             d_aa     g_sexo
##    <dbl+lbl>        <dbl+lbl>  <dbl+lbl>
##  1     2 [2] 1994             1 [hombre]
##  2     3 [3] 2003             1 [hombre]
##  3     3 [3] 1997             1 [hombre]
##  4     1 [1] 1998             1 [hombre]
##  5     5 [5]   -9 [no indica] 1 [hombre]
##  6     1 [1] 1999             2 [mujer] 
##  7     5 [5] 1995             1 [hombre]
##  8     4 [4] 1996             2 [mujer] 
##  9     2 [2] 2006             2 [mujer] 
## 10     3 [3] 1998             2 [mujer]
# Run local suppression on the sdcMicro object.
sdcFinal <- localSuppression(sdcInitial)

# Print the key variables of the flagged records as they appear in the
# manipulated data held by sdcFinal.
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
##      grado d_aa g_sexo
## 268      2   NA      1
## 274      3   NA      1
## 837      3   NA      1
## 946      1   NA      1
## 2219     5   NA      1
## 2497     1   NA      2
## 2697     5   NA      1
## 2774     4   NA      2
## 2829     2   NA      2
## 2856     3   NA      2
# Set g_sexo and grado to NA for the flagged records, then rebuild the sdcMicro
# object on the modified data and print it to re-check k-anonymity.
# NOTE(review): in the code visible here sdcFinal is never written back into
# mydata, and d_aa is left untouched for these records - confirm that this
# manual suppression is the intended one.
mydata[notAnon, "g_sexo"] <- NA
mydata[notAnon, "grado"] <- NA
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 3322 rows and 348 variables.
##   --> Categorical key variables: grado, d_aa, g_sexo
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size            Size of smallest (>0)
##         grado                    6  (6)   662.400  (662.400)                   508
##          d_aa                   14 (14)   237.286  (237.286)                     1
##        g_sexo                    4  (4)  1104.000 (1104.000)                    91
##       
##  (508)
##    (1)
##   (91)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 3 (0.090%)
##   - 3-anonymity: 9 (0.271%)
##   - 5-anonymity: 36 (1.084%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! List the open-ended variables here:
open_ends <- c("q11_b", "q12_b", "q17_b", "q18_b", "q23b", "q23d")

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and
# their row number.
# !!! All of them are removed: they contain a lot of sensitive information and
# they are in Spanish.
mydata <- mydata[!(names(mydata) %in% open_ends)]

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

# Write the processed data next to the script, once as a Stata file and once
# as an SPSS file, both named "<filename>_PU".
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Report title, built dynamically from the dataset name
title_var <- paste0("DOL-ILAB SDC - ", filename)