# Start from an empty workspace, including dot-prefixed (hidden) objects.
# The argument is spelled out as `all.names = TRUE`: the previous `all = t`
# relied on partial argument matching and passed base::t (the transpose
# function) where a logical was intended, so hidden objects were not listed.
rm(list = ls(all.names = TRUE))

# Base name of the dataset being processed; reused for the output file names
# and the report title further down.
filename <- "SAP2016 Secundaria RAW3 NOPII_relabelled" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file

# Load the helper functions called throughout the rest of this script.
# NOTE(review): `mydata` is not assigned anywhere in the visible script before
# its first use, so it is presumably created while sourcing this file --
# confirm against the helper file.
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!Include any Direct PII variables
# Every column named here is removed from `mydata`; names that are not
# present in the data are simply ignored by the membership test.
dropvars <- c(
  "a_nom", "c_nom",
  "a_apepat", "c_apepat",
  "a_apemat", "c_apemat",
  "j_dni", "l_dni"
)
mydata <- mydata[!is.element(names(mydata), dropvars)]
# !!!Replace vector in "variables" field below with relevant variable names
# The helper comes from the sourced functions file; its return value replaces
# `mydata`. Going by the recorded output that follows, it prints a frequency
# table for each listed column before and after encoding.
mydata <- encode_direct_PII_team(
  variables = c("dig", "encues")
)
## [1] "Frequency table before encoding"
## dig. dig
## 1 3 4 5
## 2 1668 1650 2
## [1] "Frequency table after encoding"
## dig. dig
## 1 2 3 4
## 2 1668 1650 2
## [1] "Frequency table before encoding"
## encues. código del encuestador
## no indica 1 2 7 12 15 19 22
## 970 612 621 1 464 1 650 1
## 29
## 2
## [1] "Frequency table after encoding"
## encues. código del encuestador
## 1 2 3 4 5 6 7 8 9
## 970 612 621 1 464 1 650 1 2
# !!! Removed as it contains identifying information
# Single column to drop from `mydata`.
dropvars <- "nomesc"
mydata <- mydata[!is.element(names(mydata), dropvars)]
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("codloc", "codmod")
# The helper comes from the sourced functions file; its return value replaces
# `mydata`.
# NOTE(review): the recorded "before encoding" tables for both columns contain
# a -9 category, while `missing` is passed as 999999 -- confirm which value the
# helper should treat as the missing code.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## codloc. código de local
## -9 144560 144782 144800 146132 146187 146192 146253 146432 148129 148346 148964
## 1 82 88 97 1 17 6 102 12 124 13 2
## 148983 148997 149020 149044 149063 149077 149261 165170 288469 288520 288860 295064
## 159 120 136 3 97 1 168 1 3 7 124 14
## 295097 295115 295120 295158 298736 298779 298835 298939 299023 299024 299043 304860
## 10 66 13 8 39 26 46 7 1 32 4 7
## 304898 304916 304935 304940 304983 305020 305066 305082 305124 305142 308900 316641
## 92 41 41 5 72 3 1 107 21 1 23 118
## 316679 319239 320105 332165 341117 342843 361641 365066 365170 365311 365325 365330
## 30 48 152 76 1 131 1 93 78 44 63 3
## 365532 365759 365900 365995 366075 366706 366872 366886 367051 725870
## 1 56 134 8 68 3 5 33 25 107
## [1] "Frequency table after encoding"
## codloc. código de local
## 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660
## 8 1 1 159 33 107 2 3 131 13 1 14 48 107 118 7 44 1 39 8 4
## 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681
## 3 5 97 26 41 12 6 66 17 124 63 88 78 93 1 1 76 136 68 1 7
## 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702
## 46 1 13 124 1 3 152 7 25 3 134 82 21 41 30 56 97 102 168 72 5
## 703 704 705 706 707 708 709
## 23 120 92 3 32 10 1
## [1] "Frequency table before encoding"
## codmod. código modular
## -9 207449 233056 236109 236174 245696 302943 302950 304444 305656
## 22 13 17 6 159 23 5 33 134 10
## 324772 325449 325456 325464 325472 325506 325548 325662 330464 334672
## 12 33 14 131 119 8 26 1 66 22
## 334730 336586 337741 466730 504993 536714 578260 578278 591131 591164
## 7 107 3 124 39 25 7 3 2 13
## 591198 599365 616185 663971 663974 700446 704460 735035 785097 928200
## 136 152 124 91 1 1 29 3 102 1
## 933598 1008929 1056944 1063148 1063221 1063304 1085976 1147537 1147651 1153147
## 120 5 46 41 41 72 4 8 3 79
## 1227225 1260311 1262930 1263011 1324772 1381375 1381599 1386168 1386234 1390442
## 1 1 68 63 1 88 96 168 97 44
## 1411438 1467601 1470368 1497601 1511351 1527225 1563238 1640556 1697234 3181599
## 76 1 3 47 107 72 56 82 7 1
## [1] "Frequency table after encoding"
## codmod. código modular
## 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581
## 2 124 1 96 120 44 41 68 1 79 66 124 25 22 102 168 4 33 63 107 1
## 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
## 46 72 82 1 8 7 12 1 23 152 6 76 10 136 3 3 8 97 26 56 7
## 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623
## 5 131 3 1 1 13 119 1 33 3 1 22 159 13 5 39 29 88 7 17 134
## 624 625 626 627 628 629 630
## 91 3 14 41 47 107 72
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# These four columns are dropped from `mydata`
# (presumably day/month components of two dates -- confirm in the dictionary).
dropvars <- c("d_dd", "d_mm", "f_dd", "f_mm")
mydata <- mydata[!is.element(names(mydata), dropvars)]
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c(
  "nivesc",
  "g_sexo", "i_sexo",
  "q42_1", "q42_2", "q42_3", "q42_4",
  "q43"
)
# Hand the list to the helper from the sourced functions file.
capture_tables(indirect_PII)
# Recode those with very specific values.
# !!! No very specific values

# selected categorical key variables: gender, occupation/education and age
## !!! Replace with candidate categorical demo vars
selectedKeyVars <- c("grado", "d_aa", "g_sexo")

# Build the sdcMicro object on those key variables, then print its summary
# (category sizes and k-anonymity violation counts).
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 3322 rows and 348 variables.
## --> Categorical key variables: grado, d_aa, g_sexo
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable  Number of categories  Mean size            Size of smallest (>0)
##         grado                 5 (5)   664.400 (664.400)   511 (511)
##          d_aa               14 (14)   237.286 (237.286)     1 (1)
##        g_sexo                 3 (3)  1107.333 (1107.333)   91 (91)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 10 (0.301%)
## - 3-anonymity: 20 (0.602%)
## - 5-anonymity: 59 (1.776%)
##
## ----------------------------------------------------------------------
# Show values of the key variables for records that violate k-anonymity
# Pass the data through the labelDataset helper before displaying it.
mydata <- labelDataset(mydata)
# Logical row mask: TRUE where column 2 of the per-record risk table is below 2
# (presumably the sample frequency of the record's key combination -- confirm
# against the sdcMicro documentation).
notAnon <- sdcInitial@risk[["individual"]][, 2] < 2 # for 2-anonymity
# Display the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 10 x 3
## grado d_aa g_sexo
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 2 [2] 1994 1 [hombre]
## 2 3 [3] 2003 1 [hombre]
## 3 3 [3] 1997 1 [hombre]
## 4 1 [1] 1998 1 [hombre]
## 5 5 [5] -9 [no indica] 1 [hombre]
## 6 1 [1] 1999 2 [mujer]
## 7 5 [5] 1995 1 [hombre]
## 8 4 [4] 1996 2 [mujer]
## 9 2 [2] 2006 2 [mujer]
## 10 3 [3] 1998 2 [mujer]
# Run local suppression with its default settings on the sdcMicro object.
sdcFinal <- localSuppression(obj = sdcInitial)
# Recombining anonymized variables
# Display the key variables of the flagged records as they look after
# suppression; `mydata` itself is not modified here.
extractManipData(obj = sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## grado d_aa g_sexo
## 268 2 NA 1
## 274 3 NA 1
## 837 3 NA 1
## 946 1 NA 1
## 2219 5 NA 1
## 2497 1 NA 2
## 2697 5 NA 1
## 2774 4 NA 2
## 2829 2 NA 2
## 2856 3 NA 2
# Blank out grade and sex for the records flagged in `notAnon`, then rebuild
# the sdcMicro object on the edited data and print its summary again.
# NOTE(review): the local-suppression preview recorded earlier in this file
# sets d_aa (not g_sexo/grado) to NA for these rows, and the summary recorded
# after this step still reports 3 records violating 2-anonymity -- confirm
# that masking g_sexo and grado rather than d_aa is the intended choice.
mydata[notAnon, "grado"] <- NA
mydata[notAnon, "g_sexo"] <- NA
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 3322 rows and 348 variables.
## --> Categorical key variables: grado, d_aa, g_sexo
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable  Number of categories  Mean size            Size of smallest (>0)
##         grado                 6 (6)   662.400 (662.400)   508 (508)
##          d_aa               14 (14)   237.286 (237.286)     1 (1)
##        g_sexo                 4 (4)  1104.000 (1104.000)   91 (91)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 3 (0.090%)
## - 3-anonymity: 9 (0.271%)
## - 5-anonymity: 36 (1.084%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
open_ends <- c(
  "q11_b", "q12_b",
  "q17_b", "q18_b",
  "q23b", "q23d"
)
# Hand the list to the helper from the sourced functions file.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# !!! Remove, as they contain a lot of sensitive information and they are in Spanish.
# All open-end columns are dropped from `mydata` rather than redacted.
mydata <- mydata[!is.element(names(mydata), open_ends)]
# !!! No GPS data
# Write the processed data as Stata (.dta) and SPSS (.sav) files, named from
# `filename` with a "_PU" suffix.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))
# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)