# Start from an empty workspace so objects from an earlier run cannot leak in.
# `all.names = TRUE` is spelled out in full: a bare `t` is the transpose
# function, not a logical, and would not select dot-prefixed objects.
rm(list = ls(all.names = TRUE))

# Base name of the dataset; reused at the end to name the output files.
filename <- "InDepthParents2016_Rural_Raw_NOPII" # !!!Update filename

# Helper-function file sourced below.
# NOTE(review): presumably defines the encode/recode helpers called later.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .csv format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!Include any Direct PII variables
# Columns removed outright from the dataset (direct PII and other flagged
# fields). Element order matches the original listing.
dropvars <- c(
  "student_name", "cto_padre_nom", "name_pad",
  "dia_nac", "mes_nac", "fecha_nac",
  "telf_yesno", "num_telf",
  "future_parent", "school_parent", "education_parent",
  "treated_2015",
  "video_start", "video_end", "pic_home", "audio_video",
  "key", "fecha_nac_fixed",
  paste0("p27a", 1:10),
  paste0("p27d", 1:3)
)
# Keep only the columns whose names are not listed in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!!Encode ID variables
# Encode the student ID column with the project helper.
mydata <- encode_direct_PII_team(variables = "id_alumno")
## [1] "Frequency table before encoding"
## id_alumno. Ingrese el código del estudiante cuyo papá/mamá/apoderado va a encuestar
## NONPII VERSION
## 1070
## [1] "Frequency table after encoding"
## id_alumno. Ingrese el código del estudiante cuyo papá/mamá/apoderado va a encuestar
## 1
## 1070
# Encode the preloaded student ID column with the same helper.
mydata <- encode_direct_PII_team(variables = "id_alumno_preloaded")
## [1] "Frequency table before encoding"
## id_alumno_preloaded. Selecciona al alumno que corresponde.
## NONPII VERSION
## 15 1055
## [1] "Frequency table after encoding"
## id_alumno_preloaded. Selecciona al alumno que corresponde.
## 1 2
## 15 1055
# Interviewer names, for example may be useful for analysis of interviewer effects
# !!!Replace vector in "variables" field below with relevant variable names
# Encode the interviewer column (i5) rather than dropping it.
mydata <- encode_direct_PII_team(variables = "i5")
## [1] "Frequency table before encoding"
## i5. Encuestador
## -99999
## 1070
## [1] "Frequency table after encoding"
## i5. Encuestador
## 1
## 1070
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Columns passed to the location encoder: province (i8a), district (i7) and
# the place where the GPS points were taken (geo_points1).
locvars <- c("i8a", "i7", "geo_points1")
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## i8a. Provincia
## AREQUIPA CAMANA CASTILLA CAYLLOMA CONDESUYOS LA UNION CUSCO ACOMAYO ANTA
## 6 1 55 16 17 70 24 22 125
## CALCA CANAS CHUMBIVILCAS ESPINAR PARURO PAUCARTAMBO QUISPICANCHI URUBAMBA
## 46 91 249 21 119 140 7 61
## [1] "Frequency table after encoding"
## i8a. Provincia
## 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857
## 249 7 70 140 24 125 16 17 119 6 55 46 61 21 91 1 22
## [1] "Frequency table before encoding"
## i7. Distrito
## AREQUIPA ALTO SELVA ALEGRE CERRO COLORADO JACOBO HUNTER YURA JOSE MARIA QUIMPER
## 1 1 1 1 2 1
## ANDAGUA CHACHAS CHILCAYMARCA CHOCO ORCOPAMPA PAMPACOLCA
## 1 9 4 4 28 5
## TIPAN VIRACO CHIVAY CAYLLOMA SIBAYO TAPAY
## 2 2 2 12 1 1
## CHUQUIBAMBA CAYARANI IRAY SALAMANCA COTAHUASI ALCA
## 3 10 3 1 9 8
## HUAYNACOTAS PAMPAMARCA PUYCA TOMEPAMPA CUSCO CCORCA
## 13 11 24 5 10 5
## SAN JERONIMO SAN SEBASTIAN SANTIAGO ACOS RONDOCAN ANTA
## 6 1 2 2 20 43
## ANCAHUASI CHINCHAYPUJIO HUAROCONDO PUCYURA ZURITE LAMAY
## 40 20 13 1 8 10
## PISAC SAN SALVADOR YANAOCA CHECCA KUNTURKANKI LANGUI
## 19 17 3 48 31 9
## SANTO TOMAS CAPACMARCA CHAMACA COLQUEMARCA LIVITACA LLUSCO
## 32 13 34 35 43 31
## QUIÑOTA VELILLE ESPINAR COPORAQUE PARURO ACCHA
## 20 41 2 19 6 16
## CCAPI COLCHA HUANOQUITE OMACHA PACCARITAMBO YAURISQUE
## 7 12 17 34 13 14
## PAUCARTAMBO CAICAY CHALLABAMBA COLQUEPATA HUANCARANI ANDAHUAYLILLAS
## 6 16 27 48 43 1
## LUCRE URUBAMBA CHINCHERO HUAYLLABAMBA MARAS OLLANTAYTAMBO
## 6 14 20 1 11 15
## [1] "Frequency table after encoding"
## i7. Distrito
## 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
## 10 20 14 16 5 2 34 8 32 19 4 2 19 48 10 40 1 41 1 28 3 2 9
## 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046
## 11 17 31 1 6 12 1 20 43 3 1 10 13 2 6 48 6 34 20 1 13 1 2
## 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069
## 14 1 12 13 9 17 1 9 31 20 1 24 8 1 43 2 11 2 1 1 35 3 6
## 1070 1071 1072 1073 1074 1075 1076 1077 1078
## 5 4 13 43 5 16 7 15 27
## [1] "Frequency table before encoding"
## geo_points1. ¿Dónde se tomaron los puntos de georeferencia?
## En el hogar o frente al hogar En la escuela del niño
## 762 43
## En la chacra o en el centro de trabajo del papá Otro
## 150 89
## <NA>
## 26
## [1] "Frequency table after encoding"
## geo_points1. ¿Dónde se tomaron los puntos de georeferencia?
## 673 674 675 676 <NA>
## 89 762 43 150 26
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# Work with age as a plain numeric vector before binning.
mydata$age1 <- as.numeric(mydata$age1)

# Lower edges of the age bins passed to the recode helper.
# NOTE(review): values below 29 lie under the first edge; confirm how the
# helper treats them.
break_age <- c(29, 31:33, 35, 37:46, 48:50)

# Value labels, numbered in bin order; code 19 carries the label "NA".
labels_age <- setNames(
  as.numeric(1:19),
  c(
    "30 or younger",
    "31", "32", "33", "35",
    as.character(37:46),
    "48", "49",
    "50 or older",
    "NA"
  )
)

mydata <- ordinal_recode(
  variable = "age1",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## age1.
## 29 30 31 32 33 35 37 38 39 40 41 42 43 44 45 46 48 49 50 51 52 53 54
## 3 2 1 2 1 1 5 1 3 3 4 4 2 5 4 3 2 3 2 4 2 3 2
## 55 56 57 58 75 <NA>
## 2 1 1 2 1 1001
## recoded
## [29,31) [31,32) [32,33) [33,35) [35,37) [37,38) [38,39) [39,40) [40,41) [41,42) [42,43) [43,44) [44,45) [45,46)
## 29 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 30 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 31 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 32 0 0 2 0 0 0 0 0 0 0 0 0 0 0
## 33 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 35 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 37 0 0 0 0 0 5 0 0 0 0 0 0 0 0
## 38 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 39 0 0 0 0 0 0 0 3 0 0 0 0 0 0
## 40 0 0 0 0 0 0 0 0 3 0 0 0 0 0
## 41 0 0 0 0 0 0 0 0 0 4 0 0 0 0
## 42 0 0 0 0 0 0 0 0 0 0 4 0 0 0
## 43 0 0 0 0 0 0 0 0 0 0 0 2 0 0
## 44 0 0 0 0 0 0 0 0 0 0 0 0 5 0
## 45 0 0 0 0 0 0 0 0 0 0 0 0 0 4
## 46 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 48 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 49 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 50 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 51 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 52 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 53 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 54 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 55 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 56 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 57 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 58 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 75 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## recoded
## [46,48) [48,49) [49,50) [50,1e+06)
## 29 0 0 0 0
## 30 0 0 0 0
## 31 0 0 0 0
## 32 0 0 0 0
## 33 0 0 0 0
## 35 0 0 0 0
## 37 0 0 0 0
## 38 0 0 0 0
## 39 0 0 0 0
## 40 0 0 0 0
## 41 0 0 0 0
## 42 0 0 0 0
## 43 0 0 0 0
## 44 0 0 0 0
## 45 0 0 0 0
## 46 3 0 0 0
## 48 0 2 0 0
## 49 0 0 3 0
## 50 0 0 0 2
## 51 0 0 0 4
## 52 0 0 0 2
## 53 0 0 0 3
## 54 0 0 0 2
## 55 0 0 0 2
## 56 0 0 0 1
## 57 0 0 0 1
## 58 0 0 0 2
## 75 0 0 0 1
## [1] "Frequency table after encoding"
## age1
## 30 or younger 31 32 33 35 37 38 39
## 5 1 2 1 1 5 1 3
## 40 41 42 43 44 45 46 48
## 3 4 4 2 5 4 3 2
## 49 50 or older <NA>
## 3 20 1001
## [1] "Inspect value labels and relabel as necessary"
## 30 or younger 31 32 33 35 37 38 39
## 1 2 3 4 5 6 7 8
## 40 41 42 43 44 45 46 48
## 9 10 11 12 13 14 15 16
## 49 50 or older NA
## 17 18 19
# Recode education attainment of adults to reduce risk of re-identification
# Education bins for p6_1 (father): edges are the original codes.
# NOTE(review): the final edge 8 opens a seventh bin that has no label below.
break_edu <- c(-98, seq(-1, 3), 8)
labels_edu <- setNames(
  as.numeric(1:6),
  c("No se", "Sin nivel", "Inicial", "Prim Comp", "Sec Comp", "Tec Incomp or more")
)
mydata <- ordinal_recode(
  variable = "p6_1",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## p6_1. Padre
## No sé Sin nivel Inicial
## 6 92 93
## Primaria completa Secundaria completa Superior técnica incompleta
## 561 138 3
## Superior técnica completa Superior universitaria completa <NA>
## 5 1 171
## recoded
## [-98,-1) [-1,0) [0,1) [1,2) [2,3) [3,8) [8,1e+06)
## -98 6 0 0 0 0 0 0
## -1 0 92 0 0 0 0 0
## 0 0 0 93 0 0 0 0
## 1 0 0 0 561 0 0 0
## 2 0 0 0 0 138 0 0
## 3 0 0 0 0 0 3 0
## 4 0 0 0 0 0 5 0
## 6 0 0 0 0 0 1 0
## [1] "Frequency table after encoding"
## p6_1. Padre
## No se Sin nivel Inicial Prim Comp Sec Comp Tec Incomp or more
## 6 92 93 561 138 9
## <NA>
## 171
## [1] "Inspect value labels and relabel as necessary"
## No se Sin nivel Inicial Prim Comp Sec Comp Tec Incomp or more
## 1 2 3 4 5 6
# Education bins for p6_2 (mother); same edges and labels as for the father.
break_edu <- c(-98, seq(-1, 3), 8)
labels_edu <- setNames(
  as.numeric(1:6),
  c("No se", "Sin nivel", "Inicial", "Prim Comp", "Sec Comp", "Tec Incomp or more")
)
mydata <- ordinal_recode(
  variable = "p6_2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## p6_2. Madre
## No sé Sin nivel Inicial Primaria completa
## 1 290 135 493
## Secundaria completa Superior técnica incompleta Superior técnica completa <NA>
## 57 2 4 88
## recoded
## [-98,-1) [-1,0) [0,1) [1,2) [2,3) [3,8) [8,1e+06)
## -98 1 0 0 0 0 0 0
## -1 0 290 0 0 0 0 0
## 0 0 0 135 0 0 0 0
## 1 0 0 0 493 0 0 0
## 2 0 0 0 0 57 0 0
## 3 0 0 0 0 0 2 0
## 4 0 0 0 0 0 4 0
## [1] "Frequency table after encoding"
## p6_2. Madre
## No se Sin nivel Inicial Prim Comp Sec Comp Tec Incomp or more
## 1 290 135 493 57 6
## <NA>
## 88
## [1] "Inspect value labels and relabel as necessary"
## No se Sin nivel Inicial Prim Comp Sec Comp Tec Incomp or more
## 1 2 3 4 5 6
# Education bins for p6b1 (first grandparent): everything from completed
# primary upward is pooled into the top bin.
break_edu <- c(-98, seq(-1, 1))
labels_edu <- setNames(
  as.numeric(1:4),
  c("No se", "Sin nivel", "Inicial", "Prim Comp or more")
)
mydata <- ordinal_recode(
  variable = "p6b1",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## p6b1. Abuelo(a) ${p_a1}
## No sé Sin nivel Inicial Primaria completa Secundaria completa
## 4 56 4 39 2
## <NA>
## 965
## recoded
## [-98,-1) [-1,0) [0,1) [1,1e+06)
## -98 4 0 0 0
## -1 0 56 0 0
## 0 0 0 4 0
## 1 0 0 0 39
## 2 0 0 0 2
## [1] "Frequency table after encoding"
## p6b1. Abuelo(a) ${p_a1}
## No se Sin nivel Inicial Prim Comp or more <NA>
## 4 56 4 41 965
## [1] "Inspect value labels and relabel as necessary"
## No se Sin nivel Inicial Prim Comp or more
## 1 2 3 4
# Education bins for p6b2 (second grandparent): two bins only.
break_edu <- c(-1, 0)
labels_edu <- setNames(
  as.numeric(1:2),
  c("Sin nivel", "Inicial or more")
)
mydata <- ordinal_recode(
  variable = "p6b2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## p6b2. Abuelo(a) ${p_a2}
## Sin nivel Inicial Primaria completa <NA>
## 13 1 14 1042
## recoded
## [-1,0) [0,1e+06)
## -1 13 0
## 0 0 1
## 1 0 14
## [1] "Frequency table after encoding"
## p6b2. Abuelo(a) ${p_a2}
## Sin nivel Inicial or more <NA>
## 13 15 1042
## [1] "Inspect value labels and relabel as necessary"
## Sin nivel Inicial or more
## 1 2
# Top code household composition variables with large and unusual numbers
# Top-code total household size (p1): 10 or more people are pooled.
mydata <- top_recode(
  "p1",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## p1. ¿Cuántas personas viven en total en el hogar?
## 1 2 3 4 5 6 7 8 9 10 12 16 <NA>
## 2 26 102 194 249 205 145 70 33 24 3 1 16
## [1] "Frequency table after encoding"
## p1. ¿Cuántas personas viven en total en el hogar?
## 1 2 3 4 5 6 7 8 9 10 or more
## 2 26 102 194 249 205 145 70 33 28
## <NA>
## 16
# Top-code number of co-resident siblings (p2c): 5 or more are pooled.
mydata <- top_recode(
  "p2c",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## p2c. ¿Con cuántos hermanos o hermanas vive?
## 0 1 2 3 4 5 6 7 9 10 <NA>
## 128 218 256 212 127 68 28 14 2 1 16
## [1] "Frequency table after encoding"
## p2c. ¿Con cuántos hermanos o hermanas vive?
## 0 1 2 3 4 5 or more <NA>
## 128 218 256 212 127 113 16
# Top-code number of co-resident grandparents (p2d): 2 or more are pooled.
mydata <- top_recode(
  "p2d",
  break_point = 2,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## p2d. ¿Con cuántos abuelos o abuelas vive?
## 0 1 2 3 <NA>
## 945 81 27 1 16
## [1] "Frequency table after encoding"
## p2d. ¿Con cuántos abuelos o abuelas vive?
## 0 1 2 or more <NA>
## 945 81 28 16
# Top-code number of co-resident uncles/aunts (p2e): 1 or more are pooled.
mydata <- top_recode(
  "p2e",
  break_point = 1,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## p2e. ¿Con cuántos tíos o tías vive?
## 0 1 2 <NA>
## 1035 13 6 16
## [1] "Frequency table after encoding"
## p2e. ¿Con cuántos tíos o tías vive?
## 0 1 or more <NA>
## 1035 19 16
# Top-code number of co-resident nephews/nieces (p2f): 2 or more are pooled.
mydata <- top_recode(
  "p2f",
  break_point = 2,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## p2f. ¿Con cuántos sobrinos vive?
## 0 1 2 3 <NA>
## 1015 30 7 2 16
## [1] "Frequency table after encoding"
## p2f. ¿Con cuántos sobrinos vive?
## 0 1 2 or more <NA>
## 1015 30 9 16
# Top-code number of other household members (p2g): 3 or more are pooled.
mydata <- top_recode(
  "p2g",
  break_point = 3,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## p2g. ¿Con cuántos otros miembros del hogar vive el/la niño/a?
## 0 1 2 3 5 <NA>
## 980 60 7 6 1 16
## [1] "Frequency table after encoding"
## p2g. ¿Con cuántos otros miembros del hogar vive el/la niño/a?
## 0 1 2 3 or more <NA>
## 980 60 7 7 16
# Top code high income to the 99.5 percentile
# Cut-off for p42a (amount given per week): 99.5th percentile of the values
# that are neither NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p42a)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p42a",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p42a. ¿Cuánto le da a la semana?
## 0.00999999977648258 0.5 1 1.20000004768372 1.5
## 1 1 39 1 5
## 1.60000002384186 2 2.5 3 3.5
## 1 52 52 69 2
## 4 4.5 5 6 7
## 25 3 312 22 15
## 7.5 8 10 11.5 12
## 10 8 155 1 5
## 12.5 13 15 16 20
## 4 1 49 1 29
## 21 25 27 30 35
## 2 13 1 5 1
## 48 50 120 <NA>
## 1 1 1 182
## [1] "Frequency table after encoding"
## p42a. ¿Cuánto le da a la semana?
## 0.00999999977648258 0.5 1 1.20000004768372 1.5
## 1 1 39 1 5
## 1.60000002384186 2 2.5 3 3.5
## 1 52 52 69 2
## 4 4.5 5 6 7
## 25 3 312 22 15
## 7.5 8 10 11.5 12
## 10 8 155 1 5
## 12.5 13 15 16 20
## 4 1 49 1 29
## 21 25 27 30 or more <NA>
## 2 13 1 9 182
# Cut-off for p7_1 (father): 99.5th percentile of the values that are neither
# NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7_1)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7_1",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7_1. Padre
## -99 0 4 10 30 50 60 70 72 75 80 90 100 105 120 125 130 140 150 160 175 180 200
## 1 2 1 1 2 4 4 3 1 2 1 2 12 1 9 5 1 1 17 1 1 7 24
## 210 220 240 250 255 280 300 325 350 380 400 500 600 700 750 800 810 900 960 1000 1200 1300 1500
## 3 2 4 26 2 1 24 1 8 1 15 7 6 2 1 4 1 4 1 7 5 1 2
## 1600 1900 2000 3000 <NA>
## 1 1 1 1 835
## [1] "Frequency table after encoding"
## p7_1. Padre
## -99 0 4 10 30 50 60 70 72
## 1 2 1 1 2 4 4 3 1
## 75 80 90 100 105 120 125 130 140
## 2 1 2 12 1 9 5 1 1
## 150 160 175 180 200 210 220 240 250
## 17 1 1 7 24 3 2 4 26
## 255 280 300 325 350 380 400 500 600
## 2 1 24 1 8 1 15 7 6
## 700 750 800 810 900 960 1000 1200 1300
## 2 1 4 1 4 1 7 5 1
## 1500 1600 1900 1983 or more <NA>
## 2 1 1 2 835
# Cut-off for p7_2 (mother): 99.5th percentile of the values that are neither
# NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7_2)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7_2",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7_2. Madre
## 0 20 25 30 35 50 65 75 80 90 100 125 150 160 187 200 210 225 250 400 900 1000 <NA>
## 3 1 1 1 1 4 1 3 2 1 9 2 6 2 1 8 1 1 1 1 1 1 1018
## [1] "Frequency table after encoding"
## p7_2. Madre
## 0 20 25 30 35 50 65 75 80
## 3 1 1 1 1 4 1 3 2
## 90 100 125 150 160 187 200 210 225
## 1 9 2 6 2 1 8 1 1
## 250 400 900 974 or more <NA>
## 1 1 1 1 1018
# Cut-off for p7a1 (first sibling): 99.5th percentile of the values that are
# neither NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7a1)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7a1",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7a1. Hermano(a) ${p_h1}
## -99 0 15 20 50 60 100 120 125 150 180 200 210 250 270 300 350 400 500 600 900 1000 1200
## 1 1 2 3 5 2 3 2 1 6 1 11 1 4 1 6 1 3 3 2 3 2 2
## 1350 1500 <NA>
## 1 1 1002
## [1] "Frequency table after encoding"
## p7a1. Hermano(a) ${p_h1}
## -99 0 15 20 50 60 100 120 125
## 1 1 2 3 5 2 3 2 1
## 150 180 200 210 250 270 300 350 400
## 6 1 11 1 4 1 6 1 3
## 500 600 900 1000 1200 1350 1449 or more <NA>
## 3 2 3 2 2 1 1 1002
# Cut-off for p7a2 (second sibling): 99.5th percentile of the values that are
# neither NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7a2)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7a2",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7a2. Hermano(a) ${p_h2}
## -99 0 15 50 75 125 150 160 200 250 300 350 500 800 900 1000 <NA>
## 1 3 2 2 1 1 3 1 3 2 1 1 1 1 1 1 1045
## [1] "Frequency table after encoding"
## p7a2. Hermano(a) ${p_h2}
## -99 0 15 50 75 125 150 160 200
## 1 3 2 2 1 1 3 1 3
## 250 300 350 500 800 900 987 or more <NA>
## 2 1 1 1 1 1 1 1045
# Cut-off for p7a3 (third sibling): 99.5th percentile of the values that are
# neither NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7a3)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7a3",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7a3. Hermano(a) ${p_h3}
## 0 75 90 150 200 500 800 1000 <NA>
## 1 1 1 1 1 1 1 1 1062
## [1] "Frequency table after encoding"
## p7a3. Hermano(a) ${p_h3}
## 0 75 90 150 200 500 800 993 or more <NA>
## 1 1 1 1 1 1 1 1 1062
# Cut-off for p7a4 (fourth sibling): 99.5th percentile of the values that are
# neither NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7a4)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7a4",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7a4. Hermano(a) ${p_h4}
## 0 200 <NA>
## 2 1 1067
## [1] "Frequency table after encoding"
## p7a4. Hermano(a) ${p_h4}
## 0 198 or more <NA>
## 2 1 1067
# Cut-off for p7c1 (first uncle/aunt): 99.5th percentile of the values that
# are neither NA nor the -97 code, rounded down.
percentile_99.5 <- local({
  observed <- na.exclude(mydata$p7c1)
  floor(quantile(observed[observed != -97], probs = 0.995))
})
mydata <- top_recode(
  variable = "p7c1",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## p7c1. Tío(a) ${p_t1}
## 200 800 <NA>
## 1 1 1068
## [1] "Frequency table after encoding"
## p7c1. Tío(a) ${p_t1}
## 200 797 or more <NA>
## 1 1 1068
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
# Indirect-PII variables to tabulate. Numbered families are generated with
# paste0(); element order matches the original one-per-line listing.
indirect_PII <- c(
  "i14", "i15", "dropout", "p10_1", "p42", "p2a", "p2b",
  paste0("p3a", 1:10), paste0("p3b", 1:3),
  paste0("p3c", 1:2), paste0("p3d", 1:3),
  paste0("p26a", 1:10), paste0("p26c", 1:2), paste0("p26d", 1:3),
  "p4_1", "p4_2",
  paste0("p4a", 1:10), paste0("p4b", 1:3),
  paste0("p4c", 1:2), paste0("p4d", 1:3),
  "p5_a_1", "p5_a_2",
  paste0("p5_aa", 1:9), "p5_ab1",
  paste0("p5_ac", 1:2), paste0("p5_ad", 1:2),
  "p23_1", "p23_2",
  paste0("p23a", 1:9), paste0("p23b", 1:3),
  paste0("p23c", 1:2), paste0("p23d", 1:2),
  "juntos_ben", "juntos_year"
)
# Hand the indirect-PII variable list to the project's table helper.
# NOTE(review): presumably tabulates each variable; confirm in the helper file.
capture_tables(indirect_PII)
#Recode those with very specific values.
# Activity recode for p4_1 (father): bin edges are the original codes 1-5.
break_activity <- as.numeric(1:5)
# Codes 2 and 5 both carry the pooled label "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
labels_activity <- setNames(
  as.numeric(1:5),
  c(
    "Estudia",
    "Otros",
    "Trabajo remunerado",
    "Quehaceres del hogar o trabajo no remunerado",
    "Otros"
  )
)
mydata <- ordinal_recode(
  variable = "p4_1",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4_1. Padre
## Estudia y tiene un trabajo remunerado Trabajo remunerado
## 4 231
## Quehaceres del hogar o trabajo no remunerado Infante pre-escolar (menor a 2 años)
## 668 6
## <NA>
## 161
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 2 0 4 0 0 0
## 3 0 0 231 0 0
## 4 0 0 0 668 0
## 5 0 0 0 0 6
## [1] "Frequency table after encoding"
## p4_1. Padre
## Otros Trabajo remunerado
## 10 231
## Quehaceres del hogar o trabajo no remunerado <NA>
## 668 161
## [1] "Inspect value labels and relabel as necessary"
## Estudia Otros
## 1 2
## Trabajo remunerado Quehaceres del hogar o trabajo no remunerado
## 3 4
## Otros
## 5
# Activity recode for p4_2 (mother): bin edges are the original codes 1-5.
break_activity <- as.numeric(1:5)
# Codes 1, 2 and 5 all carry the pooled label "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
labels_activity <- setNames(
  as.numeric(1:5),
  c(
    "Otros",
    "Otros",
    "Trabajo remunerado",
    "Quehaceres del hogar o trabajo no remunerado",
    "Otros"
  )
)
mydata <- ordinal_recode(
  variable = "p4_2",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4_2. Madre
## Estudia Estudia y tiene un trabajo remunerado
## 2 1
## Trabajo remunerado Quehaceres del hogar o trabajo no remunerado
## 51 931
## Infante pre-escolar (menor a 2 años) <NA>
## 9 76
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 1 2 0 0 0 0
## 2 0 1 0 0 0
## 3 0 0 51 0 0
## 4 0 0 0 931 0
## 5 0 0 0 0 9
## [1] "Frequency table after encoding"
## p4_2. Madre
## Otros Trabajo remunerado
## 12 51
## Quehaceres del hogar o trabajo no remunerado <NA>
## 931 76
## [1] "Inspect value labels and relabel as necessary"
## Otros Otros
## 1 2
## Trabajo remunerado Quehaceres del hogar o trabajo no remunerado
## 3 4
## Otros
## 5
# Activity recode for p4b1 (first grandparent): only code 4 keeps its own
# label; codes 1, 2, 3 and 5 are labelled "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
break_activity <- as.numeric(1:5)
labels_activity <- setNames(
  as.numeric(1:5),
  c(
    "Otros",
    "Otros",
    "Otros",
    "Quehaceres del hogar o trabajo no remunerado",
    "Otros"
  )
)
mydata <- ordinal_recode(
  variable = "p4b1",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4b1. Abuelo(a) ${p_a1}
## Estudia Quehaceres del hogar o trabajo no remunerado
## 3 105
## Infante pre-escolar (menor a 2 años) <NA>
## 1 961
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 1 3 0 0 0 0
## 4 0 0 0 105 0
## 5 0 0 0 0 1
## [1] "Frequency table after encoding"
## p4b1. Abuelo(a) ${p_a1}
## Otros Quehaceres del hogar o trabajo no remunerado
## 4 105
## <NA>
## 961
## [1] "Inspect value labels and relabel as necessary"
## Otros Otros
## 1 2
## Otros Quehaceres del hogar o trabajo no remunerado
## 3 4
## Otros
## 5
# Activity recode for p4c1 (first uncle/aunt): every code is labelled "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
break_activity <- as.numeric(1:5)
labels_activity <- setNames(as.numeric(1:5), rep("Otros", 5))
mydata <- ordinal_recode(
  variable = "p4c1",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4c1. Tío(a) ${p_t1}
## Estudia Estudia y tiene un trabajo remunerado
## 3 1
## Trabajo remunerado Quehaceres del hogar o trabajo no remunerado
## 1 14
## <NA>
## 1051
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 1 3 0 0 0 0
## 2 0 1 0 0 0
## 3 0 0 1 0 0
## 4 0 0 0 14 0
## [1] "Frequency table after encoding"
## p4c1. Tío(a) ${p_t1}
## Otros <NA>
## 19 1051
## [1] "Inspect value labels and relabel as necessary"
## Otros Otros Otros Otros Otros
## 1 2 3 4 5
# Activity recode for p4b3 (third grandparent): every code is labelled "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
break_activity <- as.numeric(1:5)
labels_activity <- setNames(as.numeric(1:5), rep("Otros", 5))
mydata <- ordinal_recode(
  variable = "p4b3",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4b3. Abuelo(a) ${p_a3}
## Quehaceres del hogar o trabajo no remunerado <NA>
## 1 1069
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 4 0 0 0 1 0
## [1] "Frequency table after encoding"
## p4b3. Abuelo(a) ${p_a3}
## Otros <NA>
## 1 1069
## [1] "Inspect value labels and relabel as necessary"
## Otros Otros Otros Otros Otros
## 1 2 3 4 5
# Activity recode for p4c2 (second uncle/aunt): every code is labelled "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
break_activity <- as.numeric(1:5)
labels_activity <- setNames(as.numeric(1:5), rep("Otros", 5))
mydata <- ordinal_recode(
  variable = "p4c2",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4c2. Tío(a) ${p_t2}
## Estudia Quehaceres del hogar o trabajo no remunerado
## 2 4
## <NA>
## 1064
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 1 2 0 0 0 0
## 4 0 0 0 4 0
## [1] "Frequency table after encoding"
## p4c2. Tío(a) ${p_t2}
## Otros <NA>
## 6 1064
## [1] "Inspect value labels and relabel as necessary"
## Otros Otros Otros Otros Otros
## 1 2 3 4 5
# Activity recode for p4d1 (first nephew/niece): every code is labelled "Otros".
# NOTE(review): confirm the helper merges the underlying codes, not only the
# labels; otherwise the pooled categories stay distinguishable.
break_activity <- as.numeric(1:5)
labels_activity <- setNames(as.numeric(1:5), rep("Otros", 5))
mydata <- ordinal_recode(
  variable = "p4d1",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## p4d1. Sobrino(a) ${p_s1}
## Estudia Quehaceres del hogar o trabajo no remunerado
## 22 3
## Infante pre-escolar (menor a 2 años) <NA>
## 14 1031
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,1e+06)
## 1 22 0 0 0 0
## 4 0 0 0 3 0
## 5 0 0 0 0 14
## [1] "Frequency table after encoding"
## p4d1. Sobrino(a) ${p_s1}
## Otros <NA>
## 39 1031
## [1] "Inspect value labels and relabel as necessary"
## Otros Otros Otros Otros Otros
## 1 2 3 4 5
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# Categorical key variables used to build the sdcMicro object.
selectedKeyVars <- c("i14", "age1") ## !!! Replace with candidate categorical demo vars
# Optional weight variable (add if available):
# selectedWeightVar = c('projwt') ##!!! Replace with weight var
# Optional household id variable (cluster):
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# Build the sdcMicro object from the chosen key variables, then display it.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 1070 rows and 739 variables.
## --> Categorical key variables: i14, age1
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## i14 3 (3) 527.000 (527.000) 381 (381)
## age1 19 (19) 3.833 (3.833) 1 (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 0 (0.000%)
## - 5-anonymity: 0 (0.000%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
# Open-ended (verbatim) variables. Numbered families are generated with
# paste0(); element order matches the original one-per-line listing.
open_ends <- c(
  "dropout_reasons", "dropout_reasons_otro",
  "p15a", "p29a", "p44b", "p44c", "p51", "p51a",
  paste0("p_h", 1:12),
  paste0("p_a", 1:4),
  paste0("p_t", 1:12),
  paste0("p_s", 1:12),
  "text_geo"
)
# Pass the open-ended variables to the project's verbatim report helper.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# !!! Remove, as they contain a lot of sensitive information and they are in Spanish.
# Drop every open-ended column in one vectorised step. The columns removed
# are exactly those listed in `open_ends` (defined above), so the list is
# maintained in one place instead of being repeated once per column.
# To keep an open-ended variable, exclude it here, e.g.
#   setdiff(open_ends, "p51")
mydata <- mydata[!(names(mydata) %in% open_ends)]
# Setup map
# World map restricted to the study country.
countrymap <- filter(map_data("world"), region == "Peru") # !!! Select correct country
# Country-level (level 0) GADM boundary.
# !!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
admin <- raster::getData("GADM", country = "PE", level = 0)

# Displace all pairs of GPS variables (Longitude, Latitude). Check summary
# statistics and maps before and after displacement.
# !!!Include relevant variables, always longitude first, latitude second.
gps.vars <- c("geo_pointslongitude", "geo_pointslatitude")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 43 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## geo_pointslongitude geo_pointslatitude
## Min. :-73.00 Min. :-16.59
## 1st Qu.:-72.17 1st Qu.:-14.51
## Median :-71.91 Median :-14.19
## Mean :-71.96 Mean :-14.12
## 3rd Qu.:-71.71 3rd Qu.:-13.49
## Max. :-71.23 Max. :-13.14
## NA's :43 NA's :43
## Warning: Removed 43 rows containing missing values (geom_point).
## Warning: Removed 43 rows containing missing values (geom_point).
## Warning: Removed 43 rows containing missing values (geom_point).
## Warning: Removed 43 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## geo_pointslongitude geo_pointslatitude
## Min. :-73.01 Min. :-16.58
## 1st Qu.:-72.17 1st Qu.:-14.52
## Median :-71.91 Median :-14.19
## Mean :-71.96 Mean :-14.12
## 3rd Qu.:-71.71 3rd Qu.:-13.49
## Max. :-71.21 Max. :-13.15
## NA's :43 NA's :43
## [1] "Processing time = 41.7003061532974"
# !!! Remove altitude data
# Keep every column except the altitude reading.
mydata <- mydata[names(mydata) != "geo_pointsaltitude"]
# Adds "_PU" (Public Use) to the end of the name
# Export the processed dataset in Stata (.dta) and SPSS (.sav) formats.
haven::write_dta(mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(mydata, path = paste0(filename, "_PU.sav"))
# Add report title dynamically
# Report title: fixed prefix joined to the dataset name.
title_var <- paste("DOL-ILAB SDC", filename, sep = " - ")