# Start from a clean workspace so objects left over from a previous run cannot
# leak into this one. `all.names = TRUE` also clears dot-prefixed objects; the
# earlier `all=t` passed the transpose function `t` instead of a logical.
rm(list = ls(all.names = TRUE))
filename <- "SAP2016 Secundaria RAW2 NOPII" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
# Load the helper functions used throughout this script.
# NOTE(review): `mydata` is never created in this file, so presumably the
# sourced helper file loads it based on `filename` -- confirm.
source(functions_vers)
# NOTE(review): the recorded output for this call reports `Seccion` as an
# unknown column and an empty table -- confirm the variable name for this
# dataset.
table(mydata$Seccion)
## Warning: Unknown or uninitialised column: `Seccion`.
## < table of extent 0 >
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!Include any Direct PII variables
# Columns listed here are removed from `mydata` outright.
dropvars <- c(
  "A_NOM", "C_NOM",
  "A_APEPAT", "C_APEPAT",
  "A_APEMAT", "C_APEMAT",
  "J_DNI", "L_DNI"
)
# Keep only the columns whose name has no match in `dropvars`.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# !!! Removed as it contains identifying information
dropvars <- "DIGITA"
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# !!!Replace vector in "variables" field below with relevant variable names
# The helper is defined in the sourced functions file; its recorded output
# shows the ENCUES codes replaced by consecutive integers.
mydata <- encode_direct_PII_team(variables = "ENCUES")
## [1] "Frequency table before encoding"
## ENCUES. Codigo del Encuestador
## No indica 1 3 4 6 7 10 11
## 165 527 1 602 790 3 1 1
## 13 15 16 18 19 21 23 24
## 1 501 543 499 1 1094 465 1
## 25 26 41 42 65 66 101 158
## 490 1 1 1 1 1 1 1
## 196 201
## 1 1
## [1] "Frequency table after encoding"
## ENCUES. Codigo del Encuestador
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 165 527 1 602 790 3 1 1 1 501 543 499 1 1094 465 1 490
## 18 19 20 21 22 23 24 25 26
## 1 1 1 1 1 1 1 1 1
# !!! Removed as it contains identifying information
dropvars <- "NOMESC"
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# !!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("CODLOC", "CODMOD")
# `encode_location` comes from the sourced helper file; 999999 is passed as
# its `missing` argument.
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## CODLOC. Codigo de Local
## 144536 144579 144621 144683 144796 144824 288134 288214 292150 292254 305789 305925
## 10 15 1 12 3 3 1 17 6 1 1 8
## 308943 313478 313727 320619 320624 324014 324486 324491 324740 324896 325221 325424
## 11 10 3 10 4 3 5 1 6 3 1 2
## 325438 326838 329691 343866 346638 593638 594124 594435 594459 <NA>
## 5 1 12 1 1 9 1 6 11 5510
## [1] "Frequency table after encoding"
## CODLOC. Codigo de Local
## 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
## 1 1 3 12 10 1 6 5 9 10 6 15 11 2 11 1 3
## 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 <NA>
## 3 17 1 6 3 8 12 4 10 1 1 5 3 1 1 1 5510
## [1] "Frequency table before encoding"
## CODMOD. C<f3>digo Modular
## 245662 325704 329326 336495 336511 336545 336628 337592 340315 340349
## 99 57 115 68 47 11 100 10 62 68
## 437244 437293 437319 437327 449868 493742 493841 495259 500348 501502
## 118 91 67 8 95 26 63 26 18 3
## 501601 535823 556449 565200 565234 565267 566414 578401 578468 578526
## 57 50 5 22 82 15 4 51 3 85
## 578534 582262 582304 582312 582932 583021 583104 583591 603878 642801
## 95 81 47 12 125 125 106 57 53 89
## 642926 643163 643262 643692 644690 647172 650002 659953 664292 664748
## 74 1 69 7 6 68 9 95 1 91
## 665281 665463 693622 694547 694562 694570 694596 743773 759613 762849
## 127 2 95 58 4 97 9 50 63 80
## 762880 762914 765313 765321 774026 774679 774703 775346 777656 778233
## 138 61 11 51 58 101 88 1 61 47
## 778738 781930 782102 832337 870931 870956 874214 884627 900647 900738
## 53 165 82 49 11 3 61 10 1 4
## 900944 1007160 1010180 1053669 1070077 1071919 1074509 1083633 1087295 1194265
## 3 12 1 58 130 80 16 1 4 89
## 1238948 1248392 1264670 1273275 1335637 1364975 1380740 1381219 1381334 1381896
## 3 169 3 122 163 1 116 142 92 11
## 1475250 1476258 1477264
## 78 14 8
## [1] "Frequency table after encoding"
## CODMOD. C<f3>digo Modular
## 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657
## 88 16 9 4 165 91 4 50 51 3 3 11 8 89 95 125 47 80 1 2 12
## 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678
## 10 1 12 68 97 125 81 163 57 61 1 26 11 53 58 57 100 80 68 95 74
## 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
## 9 5 122 53 11 62 91 47 58 101 92 82 118 58 142 26 169 115 49 7 6
## 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
## 63 61 1 63 127 68 1 15 47 138 18 67 22 61 1 57 106 3 89 78 4
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739
## 116 8 14 99 3 95 130 85 50 11 3 4 82 95 51 10 3 1 69
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# The variables listed below are dropped from `mydata`.
dropvars <- c("D_DD", "D_MM", "F_DD", "F_MM")
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c(
  "NIVESC",
  "G_SEXO", "I_SEXO",
  "Q42_1", "Q42_2", "Q42_3", "Q42_4",
  "Q43"
)
# Helper from the sourced functions file, called with the variable list above.
capture_tables(indirect_PII)
# Recode those with very specific values.
# !!! No very specific values
# !!!Insufficient demographic data
# !!! Identify open-end variables here:
open_ends <- c(
  "Q11_B",
  "Q12_B",
  "Q17_B",
  "Q18_B",
  "Q23B",
  "Q23D"
)
# Helper from the sourced functions file, called with the open-end list.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# !!! Remove, as they contain a lot of sensitive information and they are in Spanish.
# Every open-end variable is dropped, so reuse `open_ends` rather than
# repeating each name; this keeps the reported and removed sets in sync.
mydata <- mydata[!names(mydata) %in% open_ends]
# !!! No GPS data
# Export the processed data as Stata (.dta) and SPSS (.sav) files, both named
# "<filename>_PU".
haven::write_dta(mydata, sprintf("%s_PU.dta", filename))
haven::write_sav(mydata, sprintf("%s_PU.sav", filename))
# Build the report title from the dataset file name.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)