* Session setup: empty the data in memory, disable output paging, and move to the data folder.
clear
set more off
* NOTE(review): this absolute path is specific to one machine; every later -use-/-merge- refers to
* files by bare name, so the script only runs if the working directory holds the PSDP data files.
cd "/Users/vojtabartos/Documents/4 Teaching/2018 Development Economics LMU/Tutorials/Exercise 4/PSDP_public_data_2014-11/"


**************************************************
**************************************************
* Replicating Table 1: randomization balance check
* Load the pupil namelist and build the pupil-level variables needed for Table 1.
use "namelist", clear

* Baseline characteristics only: discard everything except the pre-treatment visit (code 981).
drop if visit != 981

/* The namelist carries a few duplicated pupil records (flagged dupid==2). They stem from the
intermediate dataset used in the original study, i.e. they are typos. Removing them makes
pupid unique, which the 1:1 merge below requires. */
keep if dupid != 2

* Attach the pupil questionnaire data.
merge 1:1 pupid using "pupq"

* --- Variables for Panel B ---
* Pre-program attendance: share of 20 school days (previous four weeks) the pupil was NOT absent.
gen preatt_98 = (20-absdays_98_6)/20

* "Child sick often": 1 if fallsick_98_37 equals 3, 0 otherwise, missing when the answer is missing.
gen soften_98 = (fallsick_98_37 == 3) if fallsick_98_37 != .

* "Child clean": 1 if clean_98_15 equals 1, 0 otherwise, missing when the observation is missing.
gen clean_98 = (clean_98_15 == 1) if clean_98_15 != .

/* Now we are interested in replicating table 1; the authors examine school averages 
weighted by pupil population */

*** Panel A (full sample across all grades)
* Work on school-level means; npup (number of pupils behind each school mean) is the weight.
* preserve/restore keeps the pupil-level data available for Panel B.
preserve
collapse (mean) sex elg98 stdgap yrbirth preatt_98 bloodst_98_58 soften_98 malaria_98_48 clean_98 ///
	wgrp* (count) npup = pupid, by(schid)

* Group means, weighted by school population
by wgrp, sort: summarize sex elg98 stdgap yrbirth [aweight = npup]

* "t-tests": regress each characteristic on the group 1 and group 2 dummies
* (the group without a dummy is the reference category)
foreach outcome of varlist sex elg98 stdgap yrbirth {
	regress `outcome' wgrp1 wgrp2 [aweight = npup]
}
restore

*** Panel B
preserve
* Keep pupils with 1998 data: a record is discarded only when BOTH the 1998 questionnaire date
* and the 1998 school id are missing.
keep if pupdate_98_1 != "" | schid_98_2 != .

* School-level means plus the number of pupils per school (used as weight)
collapse (mean) sex elg98 stdgap yrbirth preatt_98 bloodst_98_58 soften_98 malaria_98_48 clean_98 ///
	wgrp* (count) npup = pupid, by(schid)

* Group means, weighted by school population
by wgrp, sort: summarize preatt_98 bloodst_98_58 soften_98 malaria_98_48 clean_98 [aweight = npup]

* "t-tests": regress each characteristic on the group 1 and group 2 dummies
foreach outcome of varlist preatt_98 bloodst_98_58 soften_98 malaria_98_48 clean_98 {
	regress `outcome' wgrp1 wgrp2 [aweight = npup]
}
restore

*** Panel C: school-level characteristics
use "schoolvar", clear

* NOTE: pop1_3km and pop1_36k differ somewhat from the published numbers; this was another coding
* mistake in the original paper. The *_updated variables used here give the correct results.
by wgrp, sort: summarize distlake pup_pop latr_pup z_inf98 pop1_3km_updated pop1_36k_updated popT_3km_updated popT_36k_updated

* The school file has no treatment dummies yet: create wgrp1, wgrp2 and wgrp3
forvalues g = 1/3 {
	gen wgrp`g' = (wgrp == `g')
}

* Regress each characteristic on the group 1 and group 2 dummies.
* Alternative: xi: regress <var> i.wgrp (but that yields wgrp2 and wgrp3, i.e. a different baseline).
foreach outcome of varlist distlake pup_pop latr_pup z_inf98 pop1_3km_updated pop1_36k_updated popT_3km_updated popT_36k_updated {
	regress `outcome' wgrp1 wgrp2
}

**************************************************
**************************************************
* Replicating Table 3: deworming treatment compliance
* Merge the compliance data with the namelist data.

* Step 1: write comply2, a copy of the compliance file in which every pupid that occurs more than
* once has been removed entirely (all copies), so that it can be used in 1:1 merges.
* comply2 is reused by later sections of this script.
use "comply", clear
duplicates tag pupid, generate(dup)
keep if dup == 0
saveold "comply2", replace

* Step 2: baseline namelist (visit 981) without the duplicate records.
* (A wide reshape was considered instead:
*  reshape wide date schid std obs prs Tmonths Isem*, i(pupid dupid) j(visit) )
use "namelist", clear
keep if dupid != 2
keep if visit == 981

* Step 3: attach the compliance data.
merge 1:1 pupid using "comply2"

* In all loops below the eligibility flag takes the values
*   1 = girls < 13 years and all boys
*   0 = girls >= 13 years
* and the tabulations are produced in that order.

* Top panel: any medical treatment in 1998, by treatment group
foreach e in 1 0 {
	bysort wgrp: tab any98 if elg98 == `e'
}

* Middle panel: any medical treatment in 1999, by treatment group,
* excluding pupils who were in standard 8 in 1998
keep if std981 != 8
foreach e in 1 0 {
	bysort wgrp: tab any99 if elg99 == `e'
}

* Bottom panel: any medical treatment in 1999, additionally excluding pupils who were
* not present during the 1999 school visits (totprs99 zero or missing)
keep if totprs99 != 0 & totprs99 != .
foreach e in 1 0 {
	bysort wgrp: tab any99 if elg99 == `e'
}


**************************************************
**************************************************
* Replicating Table 5: health and health behavior by treatment (1999)

* Incorporate data on eligibility and parasitological exams
use "namelist", clear
/* Only the baseline namelist records are used, and only because this dataset contains the
treatment group variable (the wormed data were collected after the intervention). */
keep if visit==981
drop if dupid==2 // we need to drop duplicate IDs (see above)
merge 1:1 pupid using "wormed"
/* Keep only the records that were merged successfully (group assignment is known only
for these pupils) */
keep if _merge==3

* Generate low-Hb indicator (anemia indicator): 1 if hb < 100, 0 otherwise, missing if hb is missing
/* A cutoff of 100 is rather strict: "Diagnosis in men is based on a hemoglobin of less than 130 to 140 g/L,
while in women, it must be less than 120 to 130 g/L." */
gen hb100 = (hb<100)
replace hb100=. if hb==.

* We are interested in 1999 infection data, restrict the sample accordingly
keep if any_ics99!=.

*** Panel A
* Restrict the regression to groups 1 and 2 (there are no data for group 3 pupils anyway)
bysort wgrp: sum any_ics98 any_ics99
regress any_ics99 wgrp1 if (wgrp==1 | wgrp==2), robust cluster(schid) // alternative to the school averages used in table 1

*** Panel B (only look at anemia)
* With the new data the differences in anemia are no longer significant
* Summarize the indicator under its full name (hb100) rather than relying on variable abbreviation
bysort wgrp: sum hb100
regress hb100 wgrp1, robust cluster(schid)

*** Panel C (plus "sick often" data)
* Same data loading steps as for Table 1: baseline namelist merged with the pupil questionnaire.
use "namelist", clear

/* Only the baseline namelist records (visit 981) are used; the namelist is needed because it
holds the treatment group variable. */
drop if visit != 981

/* A few pupil records are duplicated (typos inherited from the intermediate dataset of the
original study); removing those flagged dupid==2 makes pupid unique for the merge. */
keep if dupid != 2

* Attach the pupil questionnaire data
merge 1:1 pupid using "pupq"

* "Child clean" (1999): 1 if clean_99_13 equals 1, 0 otherwise, missing if not observed
gen clean_99 = (clean_99_13 == 1) if clean_99_13 != .

* "Child wears shoes" (1999): 1 if shoes_99_10 equals 1 or 2, 0 otherwise, missing if not observed
gen shoes_99 = inlist(shoes_99_10, 1, 2) if shoes_99_10 != .

/* For some reason the table in the paper pools groups 2 and 3 here; we simply proceed the way we
are used to. The paper should have stated this more clearly. */
by wgrp, sort: summarize clean_99 shoes_99 dayswat_99_36
foreach outcome of varlist clean_99 shoes_99 dayswat_99_36 {
	regress `outcome' wgrp1, robust cluster(schid)
}

**************************************************
**************************************************
* Replicating Table 6: health externalities within school (1999)

/* NOTE: Here, use the same dataset as in Panel A and B of Table 5 (just copy the code).
Further, merge also the compliance dataset, comply2.dta. */

* Incorporate data on eligibility and parasitological exams
use "namelist", clear
/* Only the baseline namelist records (visit 981) are used; the namelist is needed because it
holds the treatment group variable (the wormed data were collected after the intervention). */
drop if visit != 981
keep if dupid != 2 // duplicate IDs must go before the 1:1 merges

* Parasitological data: discard pupils that appear only in the namelist (no wormed record).
* NOTE(review): this keeps _merge==2 rows as well, unlike Table 5 which keeps _merge==3 only;
* confirm whether that difference is intended.
merge 1:1 pupid using "wormed"
drop if _merge == 1
rename _merge _merge_wormed

/* Compliance data: comply2 is the version without duplicate IDs that was written while
replicating Table 3, so that section must have been run before this one. Pupils without a
compliance record are discarded. */
merge 1:1 pupid using "comply2"
drop if _merge == 1
rename _merge _merge_comply

* Require non-missing eligibility and non-missing 1999 infection data
keep if elg98 != .
keep if any_ics99 != .

/* Table 5 pooled eligible and non-eligible pupils; here the sample is split.
We replicate the moderate-heavy infection rates separately for
   elg98 == 1 : girls < 13 years and all boys
   elg98 == 0 : girls >= 13 years
For each subsample: group 1 treated / untreated in 1998, group 2 treated / untreated in 1999,
then the group 1 vs. group 2 comparison among the treated and among the untreated. */
foreach e in 1 0 {
	sum any_ics99 if wgrp==1 & any98==1 & elg98==`e'
	sum any_ics99 if wgrp==1 & any98==0 & elg98==`e'
	sum any_ics99 if wgrp==2 & any99==1 & elg98==`e'
	sum any_ics99 if wgrp==2 & any99==0 & elg98==`e'
	regress any_ics99 wgrp1 if ((wgrp==1 & any98==1) | (wgrp==2 & any99==1)) & elg98==`e', robust cluster(schid)
	regress any_ics99 wgrp1 if ((wgrp==1 & any98==0) | (wgrp==2 & any99==0)) & elg98==`e', robust cluster(schid)
}

**************************************************
**************************************************
* Replicating Table VII

* Incorporate data on eligibility and parasitological exams
use "namelist", clear
/* Only the baseline namelist records are used, and only because this dataset contains the
treatment group variable (the wormed data were collected after the intervention). */
keep if visit==981
drop if dupid==2 // we need to drop duplicate IDs (see above)
merge 1:1 pupid using "wormed"
/* Drop pupils that appear only in the namelist, i.e. have no parasitological record */
keep if _merge!=1
rename _merge _merge_wormed
/* Merge with the compliance dataset. We need the version without duplicate IDs (comply2),
which was created when replicating Table 3, so that the 1:1 merge succeeds. */
merge 1:1 pupid using "comply2"
keep if _merge!=1
rename _merge _merge_comply

* Attach the school-level variables (many pupils per school)
merge m:1 schid using "schoolvar"
rename _merge _merge_schoolvar

* Generate selection into treatment indicator variable:
* 1 if treated in the year the own group was treated (group 1: 1998, group 2: 1999),
* 0 otherwise, missing if both compliance variables are missing
gen select=0
replace select=1 if (wgrp==1 & any98==1) | (wgrp==2 & any99==1)
replace select=. if (any98==. & any99==.)
gen wgrp1_select = wgrp1*select

* Generate eligibility interaction
*gen Iwgrp1_elg = wgrp1*elg98

* Rescale the population variables so that they are expressed per 1000 pupils
foreach var in pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original {
	replace `var'=`var'/1000
	}

* outreg2 cannot create folders: make sure the output folder exists
* (-capture- swallows the error raised when it is already there)
capture mkdir "Tables"

dprobit any_ics99 wgrp1 pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original if (wgrp==1 | wgrp==2), robust cluster(schid)
outreg2 using "Tables/table_mk2004.xls", symbol(***, **, *) bdec(2) sdec(2) se replace excel
dprobit any_ics99 wgrp1 pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original select wgrp1_select if (wgrp==1 | wgrp==2), robust cluster(schid)
* outreg2 without arguments reuses the previous specification
outreg2
/* The regressions above use only about 2326 observations (two are missing because of the
duplicates in the original data that we dropped). The reason is that the parasitological survey
was carried out on a smaller sample. It is therefore better to weight the sample by the
original school population, which is what we do next. */

* Store the pupil-level analysis data so it can be merged back below
saveold "comprehensive", replace

* nsch = number of baseline namelist pupils per school
use "namelist", clear
drop if visit != 981
keep if dupid != 2 // duplicate IDs are removed as before
collapse (count) nsch = pupid, by(schid)

* Bring the school counts back to the pupil-level data; keep matched records only
merge 1:m schid using "comprehensive"
drop if _merge != 3

* Weight = school population divided by the number of records from that school in the data
egen ndata = count(pupid), by(schid)
replace nsch = nsch/ndata

* Re-run both specifications with probability weights
dprobit any_ics99 wgrp1 pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original if (wgrp==1 | wgrp==2) [pweight=nsch], robust cluster(schid)
outreg2
dprobit any_ics99 wgrp1 pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original select wgrp1_select if (wgrp==1 | wgrp==2) [pweight=nsch], robust cluster(schid)
outreg2

* MK also include control variables in their regressions; add them here.
* The control set is kept in the global macro obs (i.std is expanded by the xi: prefix).
global obs sap1 sap2 sap3 sap4 i.std mk96_s

/* Same two weighted specifications as before, now with controls, for three outcomes in turn:
   any_ics99           - the infection outcome used so far
   sm99_who            - schistosomiasis
   any_geo99_original  - geohelminth worms
which lets us examine the direct and externality treatment effects by type of infection.
For every outcome the first regression omits, and the second includes, the selection terms. */
foreach outcome in any_ics99 sm99_who any_geo99_original {
	xi: dprobit `outcome' wgrp1 pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original $obs if (wgrp==1 | wgrp==2) [pweight=nsch], robust cluster(schid)
	outreg2
	xi: dprobit `outcome' wgrp1 pop1_3km_original pop1_36k_original popT_3km_original popT_36k_original select wgrp1_select $obs if (wgrp==1 | wgrp==2) [pweight=nsch], robust cluster(schid)
	outreg2
}

/* For the rest, just try to follow the tables in the text. We won't have enough time 
in the seminar to make it this far anyways */