# ECON-L1300
# March 2021 / iivo.vehvilainen@aalto.fi
#
# Mostly copied from Conlon & Gortmaker
# see the excellent website: https://pyblp.readthedocs.io

# preliminaries ----

# basic R
library(data.table)
library(ggplot2)
library(reshape)
library(RColorBrewer)

# for iv and logit regression
library(AER)
library(mlogit)

# excel input
library(openxlsx)

# output
library(stargazer)

# Python
library(reticulate)

# for display in class
options(digits=4)

# helper function to convert R integers to Python integers
fi = function(i) return(as.integer(i))

# this needs to point to a valid installation of Python
# this is for a HomeBrew install on MacOS
use_python("/usr/local/bin/python3")

# setup Python libraries (these need to be installed for the Python above)
pyblp = import("pyblp")
np = import("numpy")
pd = import("pandas")

# set pyblp options
pyblp$options$digits = fi(2)
pyblp$options$verbose = TRUE
print(pyblp$'__version__')

# set working directory
setwd("/Users/iivo/Dropbox (Aalto)/Teaching/misc/IO exercises/cereal case")

# Lecture 9 Mergers -----------------------------------------------------------------

# Setting Up and Solving the Problem Without Demographics ----

# load data
product_data = as.data.table(pd$read_csv(pyblp$data$NEVO_PRODUCTS_LOCATION))
head(product_data[, 1:11])

# the following data is used in demand estimation
#   market_ids
#   product_ids
#   shares
#   prices
#   characteristics (sugar, mushy)
#   demand_instruments0..19
#
# for costs and markups, also
#   firm_ids

#stargazer(product_data[1:20,1:10], summary=F, rownames = F, font.size = "tiny")

# X1: matrix for the linear demand model (theta1)
#   - here product_ids are used as fixed effects
X1_formulation = pyblp$Formulation('0 + prices', absorb='C(product_ids)')

# X2: matrix for the nonlinear demand model (theta2)
X2_formulation = pyblp$Formulation('1 + prices + sugar + mushy')
product_formulations = c(X1_formulation, X2_formulation)
product_formulations

# method to integrate over heterogeneous consumer choices
mc_integration = pyblp$Integration('monte_carlo', size=fi(50), specification_options=py_dict('seed', fi(0)))
mc_integration

# compare with Gaussian-Hermite product rule
pr_integration = pyblp$Integration('product', size=fi(5))
pr_integration

# formulate problem
#   T   number of markets
#   N   number of products across all markets
#   F   number of firms
#   I   number of agents
#   K1  number of linear demand characteristics
#   K2  number of nonlinear demand characteristics
#   D   number of demographic variables
#   MD  demand instruments
#   ED  demand side fixed effects

mc_problem = pyblp$Problem(product_formulations, product_data, integration=mc_integration)
mc_problem

pr_problem = pyblp$Problem(product_formulations, product_data, integration=pr_integration)
pr_problem

# select optimization method
#   bfgs without bounds used to match nevo
bfgs = pyblp$Optimization('bfgs', py_dict('gtol', 1e-4))
bfgs

# solve using mc and with unrestricted covariance matrix for random tastes
results1 = mc_problem$solve(sigma=np$ones(fi(c(4, 4))), optimization=bfgs)
results1

# solve using gaussian product rule and with unrestricted covariance matrix for random tastes
results2 = pr_problem$solve(sigma=np$ones(fi(c(4, 4))), optimization=bfgs)
results2

# solve using mc and with diagonal covariance matrix for random tastes
results3 = mc_problem$solve(sigma=np$eye(fi(4)), optimization=bfgs)
results3

# Adding Demographics to the Problem ----

# the following data is used in demand estimation
#   weights give the weight given to each observation in the integration
#   nodes are the points in which the unobservables are evaluated
#   other columns give the observables

# load data
agent_data = as.data.table(pd$read_csv(pyblp$data$NEVO_AGENTS_LOCATION))
head(agent_data)

#stargazer(agent_data[1:20,], summary=F, rownames = F, font.size = "tiny", digits=2)

# add observable demographics
agent_formulation = pyblp$Formulation('0 + income + income_squared + age + child')
agent_formulation

# reformulate problem
nevo_problem = pyblp$Problem(
  product_formulations,
  product_data,
  agent_formulation,
  agent_data
)
nevo_problem

# initialize covariance matrix for random tastes
initial_sigma = np$diag(c(0.3302, 2.4526, 0.0163, 0.2441))

# initialize covariance matrix for demographics
initial_pi = np$array(matrix(c(
  c( 5.4819,  0.0,      0.2037,  0.0     ),
  c(15.8935, -1.2000, 0.0,       2.6342  ),
  c(-0.2506,  0.0,      0.0511,  0.0     ),
  c( 1.2650,  0.0,     -0.8091,  0.0     )
), ncol=4, byrow=T))

# bfgs without bounds used to match nevo
tighter_bfgs = pyblp$Optimization('bfgs', py_dict('gtol', 1e-5))

# reformulate problem
nevo_results = nevo_problem$solve(
  initial_sigma,
  initial_pi,
  optimization=tighter_bfgs,
  method='1s'
)

# see the results
nevo_results

# Mergers ----

# Marginal Costs and Markups ----

# calculate (constant) marginal costs by using firm_ids
costs = nevo_results$compute_costs()
p.mc = ggplot(data.table(costs), aes(x=V1))+geom_histogram(bins=50, fill="white", color="black", alpha=0.5)+ggtitle("Marginal Costs")
p.mc

# pdf(file="p_nevo2001_mc.pdf", width=3.94, height=2.62, pointsize=9)
# print(p.mc+theme_bw(base_family="serif", base_size = 8))
# dev.off()

# calculate markups
markups = nevo_results$compute_markups()
p.marks = ggplot(data.table(markups), aes(x=V1))+geom_histogram(bins=50, fill="white", color="black", alpha=0.5)+ggtitle("Markups")
p.marks

# pdf(file="p_nevo2001_marks.pdf", width=3.94, height=2.62, pointsize=9)
# print(p.marks+theme_bw(base_family="serif", base_size = 8))
# dev.off()

# calculate hhi
hhi = nevo_results$compute_hhi()

# calculate profits 
profits = nevo_results$compute_profits(costs=costs)

# calculate surpluses 
cs = nevo_results$compute_consumer_surpluses()

# merger simulation
product_data[, merger_ids:=firm_ids]
product_data[merger_ids==2, merger_ids:=1]

# calculate new prices (assumes costs don't change)
changed_prices = nevo_results$compute_prices(
  firm_ids=product_data[, merger_ids],
  costs=costs
)

# calculate new market shares (assumes costs don't change)
changed_shares = nevo_results$compute_shares(changed_prices)

# calculate new hhi, markups, profits and cs with changed prices
changed_hhi = nevo_results$compute_hhi(firm_ids=product_data[, merger_ids], shares=changed_shares)
changed_markups = nevo_results$compute_markups(changed_prices, costs)
changed_profits = nevo_results$compute_profits(changed_prices, changed_shares, costs)
changed_cs = nevo_results$compute_consumer_surpluses(changed_prices)

dt.pic = as.data.table(melt(cbind(changed_hhi,hhi)))
dt.pic$name = c('New HHI', 'Old HHI')[dt.pic$X2]
ggplot(dt.pic, aes(x=value, color=name))+geom_density(fill="white", alpha=0.5)+ggtitle("HHI Changes")
#ggplot(data.table(changed_hhi-hhi), aes(x=V1))+geom_histogram(bins=50, fill="white", color="black", alpha=0.5)+ggtitle("HHI Changes")

dt.pic = as.data.table(melt(cbind(changed_markups,markups)))
dt.pic$name = c('New markups', 'Old markups')[dt.pic$X2]
ggplot(dt.pic, aes(x=value, color=name))+geom_density(fill="white", alpha=0.5)+ggtitle("Markup Changes")
#ggplot(data.table(changed_markups-markups), aes(x=V1))+geom_histogram(bins=50, fill="white", color="black", alpha=0.5)+ggtitle("Markup Changes")

dt.pic = as.data.table(melt(cbind(changed_profits,profits)))
dt.pic$name = c('New profits', 'Old profits')[dt.pic$X2]
ggplot(dt.pic, aes(x=value, color=name))+geom_density(fill="white", alpha=0.5)+ggtitle("Profit Changes") + xlim(c(0,0.01))
#ggplot(data.table(changed_profits-profits), aes(x=V1))+geom_histogram(bins=50, fill="white", color="black", alpha=0.5)+ggtitle("Profit Changes")

dt.pic = as.data.table(melt(cbind(changed_cs,cs)))
dt.pic$name = c('New CS', 'Old CS')[dt.pic$X2]
ggplot(dt.pic, aes(x=value, color=name))+geom_density(fill="white", alpha=0.5)+ggtitle("Consumer Surplus Changes")# + xlim(c(0,0.01))
#ggplot(data.table(changed_cs-cs), aes(x=V1))+geom_histogram(bins=50, fill="white", color="black", alpha=0.5)+ggtitle("Consumer Surplus Changes")


# Lecture 10 Market power -----------------------------------------------------------------

# Logit demand (manually) ----
c.charnames = names(agent_data)[5:12]

# combine mead
dt.logit = product_data[agent_data[, lapply(.SD, median), .SDcols=c.charnames, by=market_ids], on=c("market_ids")]
dt.logit = dt.logit[dt.logit[, .(outside.shares=1-sum(shares)), by=market_ids], on="market_ids"]
dt.logit[, y:=log(shares)-log(outside.shares)]
dt.logit[, city_ids := as.factor(city_ids)]

c.instruments = names(dt.logit)[grep("demand_", names(dt.logit))]

l.logitfm = list()
l.logitfm[[1]] = as.formula(paste("y ~ 0 + prices + sugar + mushy | sugar + mushy + factor(brand_ids)"))
l.logitfm[[2]] = as.formula(paste("y ~ 0 + prices + product_ids | product_ids + ", paste(c.instruments, collapse="+"), sep=""))
l.logitfm[[3]] = as.formula(paste("y ~ 0 + prices + income + age + child + product_ids | income + age + child + product_ids +", paste(c.instruments, collapse="+"), sep=""))
l.logitfm[[4]] = as.formula(paste("y ~ 0 + prices + product_ids + city_ids | product_ids + city_ids +", paste(c.instruments, collapse="+"), sep=""))

l.logit = list()
l.logit[[1]] = lm(y ~ 0 + prices + sugar + mushy, data = dt.logit)
l.logit[[2]] = lm(y ~ 0 + prices + product_ids, data = dt.logit)
l.logit[[3]] = lm(y ~ 0 + prices + income + age + child + product_ids, data = dt.logit)
l.logit[[4]] = ivreg(l.logitfm[[1]], data = dt.logit)
l.logit[[5]] = ivreg(l.logitfm[[2]], data = dt.logit)
l.logit[[6]] = ivreg(l.logitfm[[3]], data = dt.logit)
l.logit[[7]] = ivreg(l.logitfm[[4]], data = dt.logit)

c.omitfe = c(paste("product_ids", unique(dt.logit$product_ids), sep=""), paste("city_ids", unique(dt.logit$city_ids), sep=""))
stargazer(l.logit, title = "Nevo 2001: Table V", omit=c.omitfe, omit.stat = c("F", "ser"), report=c("vsc"), type="text", single.row = T)
#stargazer(l.logit, title = "Nevo 2001: Table V", omit=c.omitfe, omit.stat = c("F", "ser"), report=c("vcs"), type="latex", single.row = F, no.space = T)


# Elasticities and Diversion Ratios ----

elasticities = nevo_results$compute_elasticities()
diversions = nevo_results$compute_diversion_ratios()

# median elasticities
dt.ela = data.table(product_data[, .(market_ids, product_ids)], elasticities)[, lapply(.SD, function(x) sprintf("%.3f", median(x),3)), .SDcols=3:26, by=product_ids]
setnames(dt.ela, c("product_ids", unique(dt.ela$product_ids)))
stargazer(dt.ela, summary=F, rownames = F, type="text")
#stargazer(dt.ela, summary=F, rownames = F, type="latex", font.size="tiny", column.sep.width = "-5pt")

# median diversions, diversion to outside good on the diagonal
dt.dive = data.table(product_data[, .(market_ids, product_ids)], diversions)[, lapply(.SD, function(x) sprintf("%.3f", median(x),3)), .SDcols=3:26, by=product_ids]
setnames(dt.dive, c("product_ids", unique(dt.dive$product_ids)))
stargazer(dt.dive, summary=F, rownames = F, type="text")
#stargazer(dt.dive, summary=F, rownames = F, type="latex", font.size="tiny", column.sep.width = "-5pt")

# single market example
single_market = product_data[market_ids== 'C01Q1', .I]

dt.pic = melt(elasticities[single_market, single_market])
dt.pic$market1 = as.factor((dt.pic$X2-1))
dt.pic$market2 = as.factor((dt.pic$X1-1))
p.ela = ggplot(dt.pic, aes(market1,market2,fill=value))+geom_tile()+scale_y_discrete(limits=rev)+scale_fill_distiller(palette = "YlGnBu")
p.ela

# pdf(file="p_nevo2001_ela.pdf", width=3.94, height=2.62, pointsize=9)
# print(p.ela+theme_bw(base_family="serif", base_size = 8))
# dev.off()

# single market example
dt.pic = melt(diversions[single_market, single_market])
dt.pic$market1 = as.factor((dt.pic$X2-1))
dt.pic$market2 = as.factor((dt.pic$X1-1))
p.dive = ggplot(dt.pic, aes(market1,market2,fill=value))+geom_tile()+scale_y_discrete(limits=rev)+scale_fill_distiller(palette = "YlGnBu")
p.dive

# pdf(file="p_nevo2001_dive.pdf", width=3.94, height=2.62, pointsize=9)
# print(p.dive+theme_bw(base_family="serif", base_size = 8))
# dev.off()

# histogram of elasticities
means = nevo_results$extract_diagonal_means(elasticities)
aggregates = nevo_results$compute_aggregate_elasticities(factor=0.1)

dt.pic = as.data.table(melt(cbind(means,aggregates)))
dt.pic$name = c('Mean Own Elasticities', 'Aggregate Elasticities')[dt.pic$X2]
p.hista = ggplot(dt.pic, aes(x=value, color=name))+geom_histogram(bins=50, fill="white", alpha=0.5)
p.hista

# pdf(file="p_nevo2001_hista.pdf", width=3.94, height=2.62, pointsize=9)
# print(p.hista+theme_bw(base_family="serif", base_size = 8))
# dev.off()

# Full model ----

nevo_results


# set up different industry structures

product_data[, single_product_ids:=brand_ids]
product_data[, joint_ownership_ids:=1]

# calculate new prices (assumes costs don't change)
single_product = nevo_results$compute_prices(firm_ids=product_data[, single_product_ids], costs=costs)
joint_ownership = nevo_results$compute_prices(firm_ids=product_data[, joint_ownership_ids], costs=costs)

# calculate new market shares (assumes costs don't change)
single_product_markups = nevo_results$compute_markups(single_product, costs)
joint_ownership_markups = nevo_results$compute_markups(joint_ownership, costs)

# collect and plot
dt.data = data.table(t=1:length(markups), cbind(single_product_markups, joint_ownership_markups, markups))
setnames(dt.data, c('t', 'Single', 'Joint', 'Current'))
ggplot(melt(dt.data, id="t"), aes(x=value, color=variable))+geom_density(fill="white", alpha=0.5)+ggtitle("Markup Changes")

dt.data[, lapply(.SD, mean), .SDcols=c('Single', 'Current', 'Joint')]


# Bootstrapping Results ----

# parametric bootstrapping
bootstrapped_results = nevo_results$bootstrap(draws=fi(100), seed=fi(0)) # this slow
bootstrapped_results

bounds = np$percentile(
  bootstrapped_results$extract_diagonal_means(
    bootstrapped_results$compute_elasticities()
  ),
  q=c(fi(10), fi(90)),
  axis=fi(0)
)

dt.table = data.table(index=as.character(mc_problem$unique_market_ids), "Lower bound"=bounds[1,,1], "Mean Own Elasticity"=as.numeric(means), 'Upper Bound'=bounds[2,,1])
head(dt.table)

# Optimal Instruments ----

instrument_results = nevo_results$compute_optimal_instruments(method='approximate')
instrument_results

updated_problem = instrument_results$to_problem()
updated_problem

updated_results = updated_problem$solve(
  nevo_results$sigma,
  nevo_results$pi,
  optimization=pyblp$Optimization('bfgs', py_dict('gtol', 1e-5)),
  method='1s'
)
updated_results