Introduction
In this vignette we will explain how some functions of the package
are used in order to estimate a contingency table. We will work on the
eusilc
dataset of the laeken
package. All the
functions presented in the following are explained in the proposed
manuscript by Raphaël Jauslin and Yves Tillé (2021)
<arXiv:2105.08379>.
Contingency table
We will estimate the contingency table when the factor variable which
represents the economic status pl030
is crossed with a
discretized version of the equivalized household income
eqIncome
. In order to discretize the equivalized income, we
calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable
and define the category as intervals between the values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Le chargement a nécessité le package : Matrix
data("eusilc")
<- na.omit(eusilc)
eusilc <- nrow(eusilc)
N
# Xm are the matching variables and id are identity of the units
<- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xm <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xmcat <- cbind(Xmcat,Xm[,-c(2,4,5)])
Xm <- eusilc$rb030
id
# categorial income splitted by the percentile
<- eusilc$eqIncome
c_income <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
q which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]"
c_income[
# variable of interests
<- data.frame(ecostat = eusilc$pl030)
Y <- data.frame(c_income = c_income)
Z
# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id
<- table(cbind(Y,Z))
YZ addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
Sampling schemes
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.
# size of sample
<- 1000
n1 <- 500
n2
# samples
<- srswor(n1,N)
s1 <- srswor(n2,N)
s2
# extract matching units
<- Xm[s1 == 1,]
X1 <- Xm[s2 == 1,]
X2
# extract variable of interest
<- data.frame(Y[s1 == 1,])
Y1 colnames(Y1) <- colnames(Y)
<- as.data.frame(Z[s2 == 1,])
Z2 colnames(Z2) <- colnames(Z)
# extract correct identities
<- id[s1 == 1]
id1 <- id[s2 == 1]
id2
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
# here weights are inverse of inclusion probabilities
<- rep(N/n1,n1)
d1 <- rep(N/n2,n2)
d2
# disjunctive form
<- sampling::disjunctive(as.matrix(Y))
Y_dis <- sampling::disjunctive(as.matrix(Z))
Z_dis
<- Y_dis[s1 ==1,]
Y1_dis <- Z_dis[s2 ==1,] Z2_dis
Harmonization
Then the harmonization step must be performed. The
harmonize
function returns the harmonized weights. If by
chance the true population totals are known, it is possible to use these
instead of the estimate made within the function.
<- harmonize(X1,d1,id1,X2,d2,id2)
re
# if we want to use the population totals to harmonize we can use
<- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))
re
<- re$w1
w1 <- re$w2
w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
Optimal transport matching
The statistical matching is done by using the otmatch
function. The estimation of the contingency table is calculated by
extracting the id1
units (respectively id2
units) and by using the function tapply
with the correct
weights.
# Optimal transport matching
<- otmatch(X1,id1,X2,id2,w1,w2)
object head(object[,1:3])
#> id1 id2 weight
#> 1101 1101 10101 13.105554
#> 2201 2201 92002 13.228948
#> 2506 2506 284804 11.137493
#> 3401 3401 75003 15.535169
#> 3502 3502 573801 12.459401
#> 5601 5601 94301 2.045844
<- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Y1_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
Z2_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)
YZ_ot
# transform NA into 0
is.na(YZ_ot)] <- 0
YZ_ot[
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 695.307 631.512 821.460 801.524 710.964 919.513 705.936 5286.216
#> 2 168.740 248.835 219.567 168.099 207.663 192.223 114.867 1319.994
#> 3 93.764 100.265 35.517 41.781 129.307 119.098 91.409 611.140
#> 4 125.379 77.033 79.099 66.112 49.029 48.917 75.149 520.718
#> 5 499.483 478.886 416.906 660.976 381.548 382.080 333.852 3153.731
#> 6 0.000 19.531 12.861 30.103 21.410 49.620 0.000 133.525
#> 7 213.890 143.905 181.229 113.050 174.820 166.721 88.061 1081.676
#> Sum 1796.562 1699.967 1766.639 1881.645 1674.741 1878.172 1409.273 12107.000
Balanced sampling
As you can see from the previous section, the optimal transport
results generally do not have a one-to-one match, meaning that for every
unit in sample 1, we have more than one unit with weights not equal to 0
in sample 2. The bsmatch
function creates a one-to-one
match by selecting a balanced stratified sampling to obtain a data.frame
where each unit in sample 1 has only one imputed unit from sample 2.
# Balanced Sampling
<- bsmatch(object)
BS head(BS$object[,1:3])
#> id1 id2 weight
#> 1101 1101 10101 13.105554
#> 2201 2201 92002 13.228948
#> 2506 2506 284804 11.137493
#> 3401 3401 75003 15.535169
#> 3502 3502 573801 12.459401
#> 5601.1 5601 504101 9.651876
<- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Y1_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
Z2_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs is.na(YZ_bs)] <- 0
YZ_bs[round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 687.894 680.868 825.285 803.873 711.913 876.420 699.962 5286.216
#> 2 147.184 245.493 214.871 169.883 209.014 224.932 108.616 1319.994
#> 3 108.774 96.795 40.816 30.807 90.390 111.556 132.003 611.140
#> 4 134.655 80.550 68.865 75.608 54.709 37.404 68.927 520.718
#> 5 493.231 473.863 380.201 668.284 380.821 366.602 390.728 3153.731
#> 6 0.000 19.531 12.861 30.103 21.410 49.620 0.000 133.525
#> 7 186.107 154.341 160.995 143.815 164.853 171.316 100.248 1081.676
#> Sum 1757.845 1751.442 1703.895 1922.373 1633.111 1837.850 1500.484 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
<- bsmatch(object,Z2)
BS
<- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Y1_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
Z2_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs is.na(YZ_bs)] <- 0
YZ_bs[round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 657.957 650.504 763.609 787.167 747.249 932.068 747.661 5286.216
#> 2 157.964 234.473 238.515 194.087 209.863 179.352 105.741 1319.994
#> 3 96.399 121.868 28.117 42.742 113.689 123.004 85.322 611.140
#> 4 146.005 67.274 80.790 64.629 32.928 37.404 91.688 520.718
#> 5 529.678 474.721 411.914 679.078 358.354 375.266 324.720 3153.731
#> 6 0.000 19.531 12.861 30.103 21.410 49.620 0.000 133.525
#> 7 211.686 123.079 230.840 82.714 191.263 182.047 60.046 1081.676
#> Sum 1799.689 1691.451 1766.647 1880.521 1674.756 1878.760 1415.177 12107.000
Prediction
# split the weight by id1
<- split(object$weight,f = object$id1)
q_l # normalize in each id1
<- lapply(q_l, function(x){x/sum(x)})
q_l <- as.numeric(do.call(c,q_l))
q
<- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
Z_pred colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 1.0000000 0 0 0 0 0.0000000 0
#> [2,] 0.0000000 0 0 0 0 0.0000000 1
#> [3,] 0.0000000 0 1 0 0 0.0000000 0
#> [4,] 0.0000000 0 1 0 0 0.0000000 0
#> [5,] 0.0000000 0 0 0 0 0.0000000 1
#> [6,] 0.8251074 0 0 0 0 0.1748926 0