clear all
set matsize 5000

* --- Setting things up ---

* Defining the folder structure.
* All paths below are built from these global macros, so the script can be
* moved to another machine by editing only the "home" line.
global home "/Users/sarvimm1/Dropbox (Aalto)/Principles_of_Empirical_Analysis/"
global data "$home/otto/fleed/Fleed_tyossakaynti/"
global output "$home/matti/graphs"

* Make sure the folder for the exported graphs exists
* ("capture" silences the error Stata raises if the folder is already there)
capture mkdir "$output"

* Fixing the seed makes the random samples drawn below reproducible
set seed 31

* Let's use the teaching FLEED data
* Note: always use forward slashes in paths. Stata accepts them on every
* operating system, whereas a backslash is not a path separator on Mac/Unix
* (and a backslash placed directly before a macro would block its expansion).
import excel "$data/fleed_puf_julk.xlsx", firstrow
* Note that I defined the folder "data" above using Stata's "global" macros
* Once we have defined them, we can refer to them using the $ symbol

* Translating the raw variable names into more informative English ones
rename (vuosi shtun syntyv kieli) (year id year_of_birth language)

* Income: observations with missing income are set to zero
* (log income is consequently missing for everyone with zero income)
generate income = cond(svatva == ., 0, svatva)
generate lnincome = ln(income)

* Gender indicator: the original coding shifted down by one
generate gender = sukup - 1

* Binary variable for having children under 7 years old
* (a missing count of children is coded as 0, i.e. "no children")
generate children_u7 = (a7lkm > 0 & a7lkm < .)

* Let's focus on only one year ...
drop if year != 10

* ... and on the variables we use in the analysis
* (I'm not actually using all of these, but they may be helpful for you in the exercises)
order id year income lnincome gender year_of_birth language children*
keep id year income lnincome gender year_of_birth language children*
summarize

* Saving a temporary file that contains only the variables needed for the
* actual analysis. This is typically good practice, particularly when
* working with large datasets.
capture mkdir "$home/matti/temp/"
save "$home/matti/temp/lect6_temp.dta", replace


* --- Simulations ---

* ------------------------------------------------------------------------------------------------------
* ------------------------------------------------------------------------------------------------------
capture program drop SIMULATION
program SIMULATION
	* SIMULATION effect
	*
	* Purpose: for each sample size n = 50, 60, ..., 2500, draw 20 random samples
	* from the temporary data file, split each sample randomly into a "treatment"
	* and a "control" half, add `effect' to the income of the treated and t-test
	* the difference in mean income between the two groups.
	*
	* Argument: effect = the number added to income in the treatment group
	*           (0 gives a placebo experiment).
	*
	* Requires: the globals $home and $output, and the file
	*           "$home/matti/temp/lect6_temp.dta" created earlier in this do-file.
	*
	* Output:   a results file in "$home/matti/temp/", three confidence-interval
	*           figures (pdf) and one scatter plot (png) in "$output".
	*           The data in memory are replaced by the simulation results.
	args effect

	* Stop right away with a clear message if the effect size is missing or
	* not a number (otherwise the error would only show up inside the loop)
	if "`effect'"=="" {
		display as error "SIMULATION requires one argument: the effect size, e.g. SIMULATION 1500"
		exit 198
	}
	confirm number `effect'

	* I start by creating an empty (all-missing) one-row matrix that will later
	* be expanded to store the results
	matrix results=J(1,7,.)

	* I now start running a loop, where the sample size (n)
	* increases in steps of 10 from 50 to 2500
	forvalues n=50(10)2500{
		* For each sample size, I then run 20 regressions
		forvalues i=1(1)20{
			use "$home/matti/temp/lect6_temp.dta", clear

				* taking a random sample of n observations
				sample `n', count
				* Note that the n here comes from the "forvalues" command above.
				* Also note the left-quote/right-quote notation around n, which
				* tells Stata that you are calling a "local" macro
				* (this is just how Stata loops work)

				* splitting the sample into equally sized "treatment" and "control" groups
				gen random_number=uniform()
				sort random_number
				gen rank=_n
				gen D=(rank<=`n'/2)

				* Adding the "effect"
				replace income=income + `effect' if D==1

				* testing for the difference between the two groups
				* (ttest calls the D==0 group "1" and the D==1 group "2")
				ttest income, by(D)

				* storing the results as a new row at the bottom of the matrix
				matrix results=results\[r(mu_1),r(mu_2),r(se),r(t),r(p),r(N_1),r(N_2)]
		}
	}

	* Dropping the all-missing first row that was only used to initialize the
	* matrix. Otherwise it would become an observation with a missing p-value
	* and be counted among the statistically insignificant results below.
	matrix results=results[2...,1...]

	* Now, let's look at the results
	matrix list results
	* ... and turn them into data
	svmat results
	* and rename variables to something meaningful
	* (suffix 0 = control group, D==0; suffix 1 = treatment group, D==1)
	rename results1 mean0
	rename results2 mean1
	rename results3 se
	rename results4 t
	rename results5 p
	rename results6 N0
	rename results7 N1
	gen diff=mean1-mean0
	gen N=N0+N1

	* I also keep a temporary file of the results in case I want to go back to
	* the results without running all the simulations again.
	* (keeping the range mean1-N also drops the sample data still in memory)
	keep mean1-N
	save "$home/matti/temp/placebo_results_`effect'.dta", replace

	* --- Analyzing the results ---
	* (to skip the simulation, load the saved placebo_results_<effect>.dta file here)
	* First, let's check the share of results where p-value < .05
	gen significant=(p<.05)
	tab significant
	bysort N: egen nr_significant=total(significant)
	tab nr_significant

	* 95% CI figure
	gen lb=diff-1.96*se
	gen ub=diff+1.96*se
	format diff %9.0f

	* Run a loop where n takes values 50, 500 and 2500
	foreach n in 50 500 2500{
	preserve
		* keep only the results with n observations
		keep if N==`n'

		* order the results in ascending order
		sort diff

		* auxiliary variable for the y co-ordinate in the graphs
		gen order=_n

		* draw several graphs on top of each other using the "twoway" command
		twoway	(scatter order diff if p>=.05, ms(O)) ///
				(scatter order diff if p<.05, ms(X)) ///
				(rspike lb ub order if p>=.05, lcol(navy) horizontal) ///
				(rspike lb ub order if p<.05, lcol(maroon) horizontal) ///
				(scatter order diff if p<.05, mlabel(diff) mlabpos(12) msymbol(none) mlabcol(maroon) mlabsize(medium)) ///
				(scatter order diff if p>=.05, mlabel(diff) mlabpos(12) msymbol(none) mlabcol(navy) mlabsize(medium)) ///
				, note("n = `n'" " ") name(g`n', replace) ylab("") yscale(lstyle(none)) ///
				legend(lab(2 "p < .05") lab(1 "p > .05") order(2 1) ring(0) pos(5) col(1) size(large)) ///
				ytitle("") xtitle("") xline(0, lw(thin) lc(black)) xline(`effect', lw(thin) lc(maroon)) ysize(20) xsize(11.5) xlab(-20000(10000)20000, labsize(large))

		* save the resulting graph in pdf format
		graph export "$output/simulation_n`n'_effect_`effect'.pdf", replace
	restore
	}


	* Another figure: statistically significant and insignificant estimates by sample size
	twoway 	(scatter diff N if p>=.05, mfc(navy%25) mlc(navy%0) msize(vsmall)) ///
			(scatter diff N if p<.05, ms(X))  ///
			, legend( lab(2 "Statistically significant (p < .05)") lab(1 "p > .05") order(2 1) ring(0) pos(5) col(1)) ///
			yline(`effect', lw(thin) lc(maroon))  ytitle("") xtitle(Sample size) yline(0, lw(thin) lc(black)) name(scatter, replace) xsize(20) ysize(10)
	graph export "$output/simulation_all_effect_`effect'.png", replace
	* using the .png format here, because scatter plots can lead to large pdf-files

	* width of the confidence interval (for one bullet point in the lecture slides)
	gen CI_width=ub-lb
	tab N if N==50 | N==500 | N==2500, s(CI_width)
end
* ------------------------------------------------------------------------------------------------------
* ------------------------------------------------------------------------------------------------------

* Running the simulation for a placebo (effect = 0) and for two true effect sizes
foreach effect in 0 1500 2000 {
	SIMULATION `effect'
}

* When working with shared resources (e.g. Statistics Finland remote servers),
* it is good practice to end every do-file with "clear" or "exit" in order to
* release the resources for other users
clear