* This do-file creates the graphs used in Lecture 6 of Principles of Empirical Analysis 
* Matti Sarvimäki, January 2021

* Start from an empty session
clear all

* Fix the random-number seed so that the simulation further down is reproducible
set seed 12345

* Raise the matrix size limit; the simulation below stores its results in a
* matrix that grows by one row per round
set matsize 11000

* Folder locations used throughout this do-file.
* NOTE(review): $data ends with "/" and is later combined as "$data/...", which
* yields a double slash in the path; $output is not located under $home
* (there is no "teaching" folder in it) -- confirm that both are intended.
global home   "/Users/sarvimm1/Dropbox (Aalto)/teaching/Principles_of_Empirical_Analysis/matti"
global data   "$home/rct_papers/ChattopadhyayDuflo_data/"
global output "/Users/sarvimm1/Dropbox (Aalto)/Principles_of_Empirical_Analysis/matti/graphs"

* --- Setting up the data ---

* Inspect the full contents of the four data files downloaded from
* https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/USBFNOMLAT
foreach part in a b c d {
	describe using "$data/womenpolicymakers_part`part'.dta"
}

* Load only the variables needed for the analysis: the GP identifier, the
* sample-selection flag and the components of the outcome
use gpnum prvill twprrt twprbt twpubt twpurt twgprt twgpbt wwpubt wwpurt wwprbt wwprrt ///
	using "$data/womenpolicymakers_partc.dta", clear

* Merge with treatment status (womres), which is stored once per GP in part A.
* NOTE(review): no keep() option is given, so rows found in only one of the
* two files are retained -- confirm that every gpnum matches.
merge m:1 gpnum using "$data/womenpolicymakers_parta.dta", nogen keepusing(womres)

* Use the same sample criterion as in the paper.
* The variable name is written out in full so that this line does not depend
* on variable-name abbreviation being switched on.
drop if prvill == "YES"

* Outcome variable: row-wise sum of the components (missing values count as 0).
* NOTE(review): twgprt and twgpbt are listed twice, so they most likely enter
* the sum twice. This looks unintended (possibly wwgprt/wwgpbt were meant), but
* it is left unchanged because the point estimate hard-coded in the figure
* section of this do-file was obtained with this definition. If the list is
* corrected, update that number as well.
egen y = rowtotal(twprrt twprbt twpubt twpurt twgprt twgpbt twgprt twgpbt wwpubt wwpurt wwprbt wwprrt)

* Treatment indicator: 1 if womres==1, 0 if womres==2, missing otherwise.
* NOTE(review): the original comment here read "coding for womres: 1 = no,
* 2 = yes", which would make D==1 the group WITHOUT a reservation -- verify the
* coding against the codebook.
generate D = cond(womres == 1, 1, cond(womres == 2, 0, .))

* Aggregate the data to the GP level (mean of y and D within each gpnum)
collapse (mean) y D, by(gpnum)


* --- Compare to Chattopadhyay and Duflo, Table V  ---

* Approach 1: take the group means from -summarize- and gather them in a
* 1 x 3 matrix (treated mean, control mean, difference)
summarize y if D == 0
local mean_control = r(mean)
summarize y if D == 1
local mean_treated = r(mean)
local dif = `mean_treated' - `mean_control'
display "Difference between treated and control: " `dif'
matrix main_result = (`mean_treated', `mean_control', `dif')
matrix list main_result

* Approach 2: two-sample t-test of y across the two values of D
ttest y, by(D)

* Approach 3: regression of y on the treatment indicator
regress y D

* Note that all approaches give us the same results, but they are not exactly
* the same as in Chattopadhyay and Duflo, Table V. This is likely due to
* me constructing the outcome variable slightly differently than they do.
* In any case, for the purpose of this lecture, we get close enough.


* --- Simulate the randomization inference test distribution ---	

* Results matrix for the simulation. Columns: (placebo-treated mean,
* placebo-control mean, difference). It starts as a single all-missing row;
* every call to -randinf- appends one row below it.
matrix rand_inf = J(1, 3, .)

* randinf [n_treated]
*
* Randomly allocates n_treated observations (default 54, the number used in
* this lecture) to a placebo "treatment" group, computes the mean of y in the
* placebo treatment and control groups, and appends
* (treated mean, control mean, difference) as a new row to matrix rand_inf.
*
* Requires: a variable y in memory and an existing matrix rand_inf with
* three columns. The data in memory are left without any extra variables.
capture program drop randinf
program define randinf
	args n_treated
	if "`n_treated'" == "" local n_treated 54

	* Temporary variables are dropped automatically when the program ends,
	* also if it stops with an error, so a failed call cannot break the next one
	tempvar draw order placebo

	* One random number per observation. It is stored as a double because
	* rounding to float could make two draws equal.
	generate double `draw' = uniform()

	* Rank the observations by their random number. The unique option gives
	* every observation a distinct rank, so exactly n_treated of them
	* satisfy the condition below.
	egen `order' = rank(`draw'), unique

	* The n_treated lowest-ranked observations form the placebo treatment group
	generate byte `placebo' = (`order' <= `n_treated')

	* Take the group averages
	summarize y if `placebo' == 0
	local yC = r(mean)
	summarize y if `placebo' == 1
	local yT = r(mean)

	* Add a new row to the results matrix with the averages and their difference
	matrix rand_inf = rand_inf \ (`yT', `yC', (`yT' - `yC'))
end

* Let's now create a loop that runs the program many times
* 10,000 simulation rounds: each call to -randinf- appends one row to matrix
* rand_inf. The round number is printed after every call to show progress.
forvalues r = 1/10000 {
	quietly randinf
	display `r'
}

* --- Draw figures of the simulation results ---	
* Turn the results matrix into the variables rand_inf1-rand_inf3
* (treated mean, control mean, difference)
svmat rand_inf

* Observed treated-minus-control difference. When matrix main_result from the
* comparison section of this do-file is in memory, its third element is used,
* so the figures and p-values always match the estimate computed in this run.
* Otherwise (e.g. when only this section is run) the recorded estimate is used.
local dif = 9.5577189
capture confirm matrix main_result
if _rc == 0 {
	local dif = main_result[1,3]
}

* Options shared by all histograms. Compound double quotes are used because
* the option string itself contains double quotes.
local opt `"bin(100) xtitle("") ytitle("") xsize(18) ysize(20) xlab(-15(5)15)"'

* Drawing histograms of the simulated differences.
* Note that the positions of the text labels are fixed numbers chosen for a
* point estimate of about 9.56.
* Figure 1: line at the point estimate
histogram rand_inf3, xline(`dif', lc(gs10)) `opt' text(.08 10.5 "point estimate", orient(rvertical))
graph export "$output/CD_ri_fig1.pdf", replace
* Figure 2: lines at plus and minus the point estimate
histogram rand_inf3, xline(-`dif' `dif', lc(gs10)) `opt' text(.08 10.5 "point estimate", orient(rvertical)) text(.085 -10.3 "-point estimate", orient(rvertical))
graph export "$output/CD_ri_fig2.pdf", replace
* Figure 3: histogram only
histogram rand_inf3, `opt'
graph export "$output/CD_ri_fig3.pdf", replace
* Figure 4: histogram with a normal density overlaid
histogram rand_inf3, `opt' norm
graph export "$output/CD_ri_fig4.pdf", replace

* Share of simulation rounds giving a larger difference than the one actually
* observed (one-sided), or a difference larger in absolute value (two-sided).
* Rows without a simulated difference (such as the all-missing first row of
* the results matrix) are left missing and do not enter the means below.
generate ri_pvalue_1s = (rand_inf3 > `dif') if !missing(rand_inf3)
generate ri_pvalue_2s = (rand_inf3 > `dif' | rand_inf3 < -`dif') if !missing(rand_inf3)
* The means reported here are the randomization-inference p-values
summarize ri_pvalue*

* Standard deviation and range of the simulated distribution
summarize rand_inf3, detail

* Save the data in case we'd like to look at them again
* (without running all the simulations again)
save "$data/randomization_inference", replace