* ------------------------------------------------------------------
* Session setup: empty memory, disable output paging, and move to the
* folder that holds the PSDP public-release datasets.
* ------------------------------------------------------------------
clear
set more off
cd "/Users/vojtabartos/Documents/4 Teaching/2018 Development Economics LMU/Tutorials/Exercise 4/PSDP_public_data_2014-11/"

**************************************************
**************************************************
* Replicating Table 1: randomization balance check

use "namelist", clear

* We want baseline characteristics, so only the pre-treatment visit is kept
drop if visit != 981

/* Duplicate pupil records are flagged with dupid==2; they stem from typos in
   the intermediate dataset used by the original study and are removed. */
keep if dupid != 2

* Attach the pupil questionnaire data
merge 1:1 pupid using pupq

* --- Variables needed for Panel B ---

* Pre-program attendance, computed from days absent in the previous four weeks
gen preatt_98 = (20-absdays_98_6)/20

* Indicator "child sick often" (left missing when the source answer is missing)
gen soften_98 = (fallsick_98_37==3) if fallsick_98_37 != .

* Indicator "child clean" (left missing when the source answer is missing)
gen clean_98 = (clean_98_15==1) if clean_98_15 != .

/* Table 1 reports school averages weighted by pupil population, so each panel
   collapses to one row per school and carries the pupil count as a weight. */

*** Panel A (full sample across all grades)
preserve
    collapse (mean) sex elg98 stdgap yrbirth preatt_98 bloodst_98_58 soften_98 ///
        malaria_98_48 clean_98 wgrp* (count) npup = pupid, by(schid)

    * Group means, then regressions on the group dummies as "t-tests"
    local panelA sex elg98 stdgap yrbirth
    bysort wgrp: summarize `panelA' [aw=npup]
    foreach outcome of local panelA {
        regress `outcome' wgrp1 wgrp2 [aw=npup]
    }
restore

*** Panel B
preserve
    * Keep only pupils with 1998 questionnaire data
    keep if !(pupdate_98_1=="" & schid_98_2==.)

    collapse (mean) sex elg98 stdgap yrbirth preatt_98 bloodst_98_58 soften_98 ///
        malaria_98_48 clean_98 wgrp* (count) npup = pupid, by(schid)

    * Group means, then regressions on the group dummies as "t-tests"
    local panelB preatt_98 bloodst_98_58 soften_98 malaria_98_48 clean_98
    bysort wgrp: summarize `panelB' [aw=npup]
    foreach outcome of local panelB {
        regress `outcome' wgrp1 wgrp2 [aw=npup]
    }
restore

*** Panel C (school-level data)
use "schoolvar", clear

/* NOTE: pop1_3km and pop1_36k differ somewhat from the published numbers; that
   was another coding mistake in the original paper. The "_updated" variables
   used here give the correct results. */
local panelC distlake pup_pop latr_pup z_inf98 pop1_3km_updated pop1_36k_updated ///
    popT_3km_updated popT_36k_updated
bysort wgrp: summarize `panelC'

* schoolvar has no treatment dummies, so build wgrp1-wgrp3 from wgrp
forvalues g = 1/3 {
    gen wgrp`g' = (wgrp==`g')
}

* Alternative: xi: regress <var> i.wgrp -- but that yields wgrp2 and wgrp3,
* i.e. a different baseline group
foreach outcome of local panelC {
    regress `outcome' wgrp1 wgrp2
}

**************************************************
**************************************************
* Replicating Table 3: deworming treatment compliance

* Merge compliance data with namelist data
use namelist, clear
*reshape wide date schid std obs prs Tmonths Isem*, i(pupid dupid) j(visit)
keep if dupid != 2
keep if visit == 981

* Build comply2: the compliance file with every duplicated pupid removed,
* so that it can be merged 1:1. Later tables reuse this file.
preserve
    use comply, clear
    duplicates tag pupid, gen(dup)
    keep if dup == 0
    saveold comply2, replace
restore

merge 1:1 pupid using "comply2"

* Top panel: any medical treatment in 1998
* First girls < 13 and all boys (elg98==1), then girls >= 13 (elg98==0)
foreach eligible in 1 0 {
    bys wgrp: tab any98 if elg98==`eligible'
}

* Middle panel: any medical treatment in 1999
* Pupils who were in standard 8 in 1998 are dropped first
keep if std981 != 8
* First girls < 13 and all boys (elg99==1), then girls >= 13 (elg99==0)
foreach eligible in 1 0 {
    bys wgrp: tab any99 if elg99==`eligible'
}

* Same tabulations after dropping pupils not present during 1999 school visits
keep if totprs99!=0 & totprs99!=.
foreach eligible in 1 0 {
    bys wgrp: tab any99 if elg99==`eligible'
}

**************************************************
**************************************************
* Replicating Table 5: health and health behavior by treatment (1999)

* Bring in eligibility and parasitological exam data
use "namelist", clear

/* Only the baseline namelist rows are used; namelist is needed because it
   holds the treatment variable (the wormed data come after the intervention). */
drop if visit != 981
keep if dupid != 2          // duplicate IDs, see Table 1

merge 1:1 pupid using "wormed"

/* Keep matched records only: group assignment is known only for these pupils */
drop if _merge != 3

* Low-hemoglobin (anemia) indicator
/* A cutoff of 100 is rather strict: "Diagnosis in men is based on a hemoglobin
   of less than 130 to 140 g/L, while in women, it must be less than 120 to
   130 g/L." */
gen hb100 = (hb<100) if hb != .

* The outcome of interest is 1999 infection, so require it to be non-missing
drop if any_ics99 == .
*** Panel A
* Regression sample restricted to groups 1 and 2 (no data for group 3 kids anyway)
bysort wgrp: sum any_ics98 any_ics99
* Pupil-level regression with school-clustered standard errors: an alternative
* to the school averages used in Table 1
regress any_ics99 wgrp1 if (wgrp==1 | wgrp==2), robust cluster(schid)

*** Panel B (only look at anemia)
* Using the new data there are no longer significant differences in anemia.
* The variable is spelled out as hb100: the previous `hb10' only resolved
* through Stata's variable-name abbreviation, which fails under
* `set varabbrev off' or if another variable beginning with hb10 exists.
bysort wgrp: sum hb100
regress hb100 wgrp1, robust cluster(schid)

*** Panel C (plus "sick often" data)
* Same data-loading steps as in Table 1
use "namelist", clear

/* Only the baseline namelist rows are used; namelist is needed because it
   holds the treatment variable. */
keep if visit==981

/* Duplicate pupil records (dupid==2) come from typos in the intermediate
   dataset used in the original study and are dropped. */
drop if dupid==2

* Merge with the pupil questionnaire dataset
merge 1:1 pupid using pupq

* Indicator "child clean" (missing when the source answer is missing)
gen clean_99 = (clean_99_13==1)
replace clean_99 = . if clean_99_13==.

* Indicator "child wears shoes" (missing when the source answer is missing)
gen shoes_99 = (shoes_99_10==1 | shoes_99_10==2)
replace shoes_99 = . if shoes_99_10==.

/* For some reason, the table in the paper pools groups 2 and 3 in this case;
   we just proceed as we are used to. This should have been stressed more
   clearly in the paper.
   NOTE(review): the regressions below carry no sample restriction, so the
   wgrp1 coefficient contrasts group 1 with all other observations in memory. */
bysort wgrp: sum clean_99 shoes_99 dayswat_99_36
foreach var in clean_99 shoes_99 dayswat_99_36 {
    regress `var' wgrp1, robust cluster(schid)
}

**************************************************
**************************************************
* Replicating Table 6: health externalities within school (1999)

/* NOTE: This uses the same dataset as Panels A and B of Table 5 (the loading
   code is copied), and additionally merges the compliance dataset comply2.dta. */

* Bring in eligibility and parasitological exam data
use "namelist", clear

/* Only the baseline namelist rows are used; namelist is needed because it
   holds the treatment variable (the wormed data come after the intervention). */
keep if visit==981
drop if dupid==2            // duplicate IDs, see above

merge 1:1 pupid using "wormed"

/* Drop pupils that appear only in namelist (_merge==1). Unlike Table 5, this
   keeps both matched records and records found only in wormed. */
keep if _merge!=1
rename _merge _merge_wormed

/* Merge with the compliance dataset. comply2 is the version with duplicate
   IDs removed, which makes a 1:1 merge possible; it is created in the
   Table 3 section. */
merge 1:1 pupid using "comply2"
keep if _merge!=1
rename _merge _merge_comply

* Restrict to pupils with non-missing eligibility data
keep if elg98!=.

* The outcome of interest is 1999 infection, so require it to be non-missing
keep if any_ics99!=.

/* Split the sample into eligible and non-eligible pupils (Table 5 pooled them)
   and replicate the moderate-heavy infection rates for each.
   The loop runs elg98==1 first (girls < 13 years and all boys), then
   elg98==0 (girls >= 13 years). Within each pass:
     - means for group 1 treated / untreated in 1998,
     - means for group 2 treated / untreated in 1999,
     - group 1 vs group 2 among the treated, then among the untreated. */
foreach eligible in 1 0 {
    sum any_ics99 if (wgrp==1 & any98==1) & elg98==`eligible'
    sum any_ics99 if (wgrp==1 & any98==0) & elg98==`eligible'
    sum any_ics99 if (wgrp==2 & any99==1) & elg98==`eligible'
    sum any_ics99 if (wgrp==2 & any99==0) & elg98==`eligible'
    regress any_ics99 wgrp1 if ((wgrp==1 & any98==1) | (wgrp==2 & any99==1)) & elg98==`eligible', robust cluster(schid)
    regress any_ics99 wgrp1 if ((wgrp==1 & any98==0) | (wgrp==2 & any99==0)) & elg98==`eligible', robust cluster(schid)
}
**************************************************
**************************************************
* Replicating Table VII

* Bring in eligibility and parasitological exam data
use "namelist", clear

/* Only the baseline namelist rows are used; namelist is needed because it
   holds the treatment variable (the wormed data come after the intervention). */
keep if visit==981
drop if dupid==2            // duplicate IDs, see above

merge 1:1 pupid using "wormed"

/* Drop pupils that appear only in namelist (_merge==1); matched records and
   records found only in wormed are kept. */
keep if _merge!=1
rename _merge _merge_wormed

/* Merge with the compliance dataset. comply2 is the version with duplicate
   IDs removed, which makes a 1:1 merge possible; it is created in the
   Table 3 section. */
merge 1:1 pupid using "comply2"
keep if _merge!=1
rename _merge _merge_comply

* Attach school-level variables
merge m:1 schid using "schoolvar"
rename _merge _merge_schoolvar

* Selection-into-treatment indicator: 1 for group 1 pupils treated in 1998 or
* group 2 pupils treated in 1999; missing when both any98 and any99 are missing
gen select=0
replace select=1 if (wgrp==1 & any98==1) | (wgrp==2 & any99==1)
replace select=. if (any98==. & any99==.)
gen wgrp1_select = wgrp1*select

* Generate eligibility interaction
*gen Iwgrp1_elg = wgrp1*elg98

* Rescale the population variables to thousands of pupils
local popctl pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original
foreach var of local popctl {
    replace `var'=`var'/1000
}

* Make sure the output folder exists before outreg2 writes into it;
* capture suppresses the error when the folder is already there
capture mkdir "Tables"

dprobit any_ics99 wgrp1 `popctl' if (wgrp==1 | wgrp==2), robust cluster(schid)
outreg2 using "Tables/table_mk2004.xls", symbol(***, **, *) bdec(2) sdec(2) se replace excel

* NOTE(review): the bare outreg2 calls from here on presumably rely on outreg2
* reusing the previously specified file and options -- confirm against the
* installed outreg2 version
dprobit any_ics99 wgrp1 `popctl' select wgrp1_select if (wgrp==1 | wgrp==2), robust cluster(schid)
outreg2

/* We only have about 2326 observations (two are missing because of the
   duplicates in the original data that we dropped). The reason is that the
   parasitological survey was conducted on a smaller sample. Hence it is better
   to weight the sample by the original school population. Let's do it. */

* Weight each school by its total initial namelist population
saveold "comprehensive", replace

* Count baseline namelist pupils per school
use "namelist", clear
keep if visit==981
drop if dupid==2            // duplicate IDs, see above
collapse (count) nsch=pupid, by(schid)

* Return to the analysis data and turn the count into a per-pupil weight:
* namelist pupils in the school divided by pupils of that school in the data
merge 1:m schid using "comprehensive"
keep if _merge==3
egen ndata=count(pupid), by(schid)
replace nsch=nsch/ndata

dprobit any_ics99 wgrp1 `popctl' if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2
dprobit any_ics99 wgrp1 `popctl' select wgrp1_select if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2

* MK also use some control variables in their regressions; let's add them.
* The list contains i.std, hence the xi: prefix on the commands below.
global obs sap1 sap2 sap3 sap4 i.std mk96_s
xi: dprobit any_ics99 wgrp1 `popctl' $obs if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2
xi: dprobit any_ics99 wgrp1 `popctl' select wgrp1_select $obs if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2

/* Now we can examine the different direct and externality treatment effects on
   schistosomiasis and geohelminth worms, using the same specification as above. */
xi: dprobit sm99_who wgrp1 `popctl' $obs if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2
xi: dprobit sm99_who wgrp1 `popctl' select wgrp1_select $obs if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2
xi: dprobit any_geo99_original wgrp1 `popctl' $obs if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2
xi: dprobit any_geo99_original wgrp1 `popctl' select wgrp1_select $obs if (wgrp==1 | wgrp==2) [pw=nsch], robust cluster(schid)
outreg2

/* For the rest, just try to follow the tables in the text. We won't have
   enough time in the seminar to make it this far anyway. */