* ------------------------------------------------------------------------------------------------------
* Simulation do-file using the teaching FLEED data.
*
* 1. Imports the data, renames variables, keeps a single year and saves a small temporary file.
* 2. Defines the program SIMULATION, which repeatedly draws random samples of increasing size,
*    randomly splits each sample into two equally sized groups, adds a constant "effect" to the
*    income of one group and t-tests the difference in mean income between the groups.
* 3. Summarizes and plots the results, and runs the program for effects of 0, 1500 and 2000.
* ------------------------------------------------------------------------------------------------------

clear all
set matsize 5000

* --- Setting things up ---

* Defining the folder structure
global home "/Users/sarvimm1/Dropbox (Aalto)/Principles_of_Empirical_Analysis/"
global data "$home/otto/fleed/Fleed_tyossakaynti/"
global output "$home/matti/graphs"

set seed 31

* Let's use the teaching FLEED data.
* Note that I defined the folder "data" above using Stata's "global" macros.
* Once we have defined them, we can refer to them using the $ symbol.
* A forward slash is used as the path separator: it works on every platform,
* whereas a backslash is not a directory separator on Mac/Unix and is also
* Stata's escape character for macros.
import excel "$data/fleed_puf_julk.xlsx", firstrow

* Giving variables more informative names
rename vuosi year
rename shtun id
gen income = svatva
replace income=0 if income==.
* ln(0) is missing, so lnincome is missing for observations with zero income
gen lnincome = ln(income)
* shifts the original coding down by one
* (presumably from 1/2 to 0/1 - confirm against the codebook)
gen gender = sukup - 1
rename syntyv year_of_birth
rename kieli language

* Binary variable for having children under 7 years old
* (the "< ." condition makes sure that a missing count is coded as 0)
gen children_u7=(a7lkm>0 & a7lkm<.)

* Let's focus on only one year
keep if year==10

* ... and the variables we use in the analysis
* (I'm not actually using all of these, but they may be helpful for you in the exercises)
keep id year income lnincome gender year_of_birth language children*
order id year income lnincome gender year_of_birth language children*
sum

* Make sure the folders we write to exist ("capture" ignores the error if they already do)
capture mkdir "$home/matti/temp/"
capture mkdir "$output"
save "$home/matti/temp/lect6_temp.dta", replace
* Note that here I saved a temporary file with just the variables we need
* for the actual analysis. This is typically good practice, particularly
* when working with large datasets.


* --- Simulations ---

* ------------------------------------------------------------------------------------------------------
* SIMULATION effect
*
*   effect: number added to the income of the "treatment" group (0 gives a placebo simulation)
*
*   Reads:  $home/matti/temp/lect6_temp.dta
*   Writes: $home/matti/temp/placebo_results_<effect>.dta
*           $output/simulation_n<n>_effect_<effect>.pdf   for n = 50, 500 and 2500
*           $output/simulation_all_effect_<effect>.png
*   The data in memory are replaced by the simulation results.
* ------------------------------------------------------------------------------------------------------
capture program drop SIMULATION
program SIMULATION

    * The effect is the first (and only) argument; stop right away with a clear
    * error message if it is missing or not a number
    args effect
    confirm number `effect'

    * I start by creating a matrix with one empty row that will later be
    * expanded to store the results (one row per t-test)
    matrix results=J(1,7,.)

    * I now start running a loop, where the sample size (n)
    * increases in steps of 10 from 50 to 2500
    forvalues n=50(10)2500{

        * For each sample size, I then run 20 t-tests
        forvalues i=1(1)20{

            use "$home/matti/temp/lect6_temp.dta", clear

            * taking a random sample of n observations
            sample `n', count
            * Note that the n here comes from the "forvalues" command above.
            * Also note the left and right single quotes around n, which tell Stata
            * that you are calling a "local" macro (this is just how Stata loops work)

            * splitting the sample into equally sized "treatment" and "control" groups
            gen random_number=uniform()
            sort random_number
            gen rank=_n
            gen D=(rank<=`n'/2)

            * Adding the "effect"
            replace income=income + `effect' if D==1

            * testing for the difference between the two groups
            ttest income, by(D)

            * storing the results; ttest reports the groups in ascending order of D,
            * so r(mu_1) and r(N_1) refer to D==0 and r(mu_2) and r(N_2) to D==1
            matrix results=results\[r(mu_1),r(mu_2),r(se),r(t),r(p),r(N_1),r(N_2)]
        }
    }

    * Remove the empty first row that was only used to initialize the matrix.
    * Otherwise it would become an all-missing observation that is counted as
    * a "not significant" result below, because (p<.05) is 0 when p is missing.
    matrix results=results[2...,1...]

    * Now, let's look at the results
    matrix list results

    * ... and turn them into data. The last random sample is still in memory,
    * so I drop it first to get a dataset that contains only the results.
    drop _all
    svmat results

    * and rename variables to something meaningful
    rename results1 mean0
    rename results2 mean1
    rename results3 se
    rename results4 t
    rename results5 p
    rename results6 N0
    rename results7 N1

    gen diff=mean1-mean0
    gen N=N0+N1

    * I also keep a temporary file of the results in case I want to go back to
    * the results without running all the simulations again.
    save "$home/matti/temp/placebo_results_`effect'.dta", replace


    * --- Analyzing the results ---
    * (to start from here, load the saved results instead of rerunning the loop)
    * use "$home/matti/temp/placebo_results_`effect'.dta", clear

    * First, let's check the share of results where p-value < .05
    gen significant=(p<.05)
    tab significant

    * ... and the number of significant results for each sample size
    bysort N: egen nr_significant=total(significant)
    tab nr_significant

    * 95% CI figure
    gen lb=diff-1.96*se
    gen ub=diff+1.96*se
    format diff %9.0f

    * Run a loop where n takes values 50, 500 and 2500
    foreach n in 50 500 2500{

        preserve

        * keep only the results with n observations
        keep if N==`n'

        * order the results in ascending order
        sort diff

        * auxiliary variable for the y co-ordinate in the graphs
        gen order=_n

        * draw several graphs on top of each other using the "twoway" command:
        * markers, confidence intervals and point estimate labels, each drawn
        * separately for insignificant (navy) and significant (maroon) results
        twoway (scatter order diff if p>=.05, ms(O)) ///
            (scatter order diff if p<.05, ms(X)) ///
            (rspike lb ub order if p>=.05, lcol(navy) horizontal) ///
            (rspike lb ub order if p<.05, lcol(maroon) horizontal) ///
            (scatter order diff if p<.05, mlabel(diff) mlabpos(12) msymbol(none) mlabcol(maroon) mlabsize(medium)) ///
            (scatter order diff if p>=.05, mlabel(diff) mlabpos(12) msymbol(none) mlabcol(navy) mlabsize(medium)) ///
            , note("n = `n'" " ") name(g`n', replace) ylab("") yscale(lstyle(none)) ///
            legend(lab(2 "p < .05") lab(1 "p > .05") order(2 1) ring(0) pos(5) col(1) size(large)) ///
            ytitle("") xtitle("") xline(0, lw(thin) lc(black)) xline(`effect', lw(thin) lc(maroon)) ysize(20) xsize(11.5) xlab(-20000(10000)20000, labsize(large))

        * save the resulting graph in pdf format
        graph export "$output/simulation_n`n'_effect_`effect'.pdf", replace

        restore
    }

    * Another figure: statistically significant and insignificant estimates by sample size
    twoway (scatter diff N if p>=.05, mfc(navy%25) mlc(navy%0) msize(vsmall)) ///
        (scatter diff N if p<.05, ms(X)) ///
        , legend( lab(2 "Statistically significant (p < .05)") lab(1 "p > .05") order(2 1) ring(0) pos(5) col(1)) ///
        yline(`effect', lw(thin) lc(maroon)) ytitle("") xtitle(Sample size) yline(0, lw(thin) lc(black)) name(scatter, replace) ///
        xsize(20) ysize(10)

    * using the .png format here, because scatter plots can lead to large pdf files
    graph export "$output/simulation_all_effect_`effect'.png", replace

    * width of the confidence interval (for one bullet point in the lecture slides)
    gen CI_width=ub-lb
    tab N if N==50 | N==500 | N==2500, summarize(CI_width)

end
* ------------------------------------------------------------------------------------------------------
* ------------------------------------------------------------------------------------------------------

* Run the simulation without an effect (placebo) and with two different effect sizes
SIMULATION 0
SIMULATION 1500
SIMULATION 2000

clear
* When working with shared resources (e.g. Statistics Finland remote servers),
* it is good practice to end every do-file with "clear" or "exit" in order to
* release the resources for other users