Article Summary: These are the technical notes and code samples that accompany our main p-value replicability article.

Quick Guide

  • CTR simulation
  • Cookie Cats analysis
  • Average order value (AOV) simulation
  • Average order size (AOS) simulation

CTR simulation

Output .csv files can be found here. These files include:

  • Summarized results of 100,000 trials for each difference in CTR lift from 0% to 20%

  • Summarized results for all significant trials for each difference in CTR lift from 0% to 20%

  • Summarized results from 10 trials for each difference in CTR lift from 0% to 20%

  • Summarized results for the 2.5th and 97.5th percentile of estimated CTR lift from 0% to 20%

  • Summarized results for all significant trials for the 2.5th and 97.5th percentile of estimated CTR lift from 0% to 20%

  • Summarized results of the maximum and minimum estimated CTR lift from 0% to 20%

  • Summarized results for all significant trials of the maximum and minimum estimated CTR lift from 0% to 20%

  • Raw output of all 2.1 million simulations

  • Lift misestimation diagrams in JPEG format

  • The code below in a .R file

######################################################################################
# CTR success metric simulation
######################################################################################

# This simulation uses the rbinom function to randomly draw either a 1 or a 0
# with the specified probability, simulating a click or a non-click. The result
# is a test of proportions, the same test used to determine the outcome of A/B CTR tests.

# Load libraries
library(tidyverse)

# A simple example simulation is shown below.
ctr1 = .3
ctr2 = .33
n = 10000
button_1_ctr = rbinom(n, 1, ctr1)
button_2_ctr = rbinom(n, 1, ctr2)
results = c(sum(button_1_ctr)/n, sum(button_2_ctr)/n)
barplot(results, names.arg=c("Button 1 CTR", "Button 2 CTR"))
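
# The corresponding test of proportions on the example data. This is the same
# prop.test call the simulation functions below use to judge significance.
prop.test(x=c(sum(button_1_ctr), sum(button_2_ctr)), n=c(n, n), correct=FALSE)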


###
# Define simulation functions
###
ctr_sim = function(n, base_ctr, ctr2) {
  x = rbinom(n, 1, ctr2)
  y = rbinom(n, 1, base_ctr)
  prop.test(x=c(sum(x), sum(y)), n=c(n, n), correct=FALSE)
}
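
# Example call (a single simulated A/B test at the base scenario used later):
# ctr_sim(53210, 0.03, 0.033)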

simulate = function(n_sims, n, base_ctr, ctr2) {
  result = tibble()
  
  for(i in 1:n_sims) {
    sim = ctr_sim(n, base_ctr, ctr2)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    ctr2_estimate = sim[["estimate"]][[1]]
    base_ctr_estimate = sim[["estimate"]][[2]]
    result = bind_rows(result, tibble(base_ctr, ctr2, p_value, CI_lower, CI_upper, base_ctr_estimate, ctr2_estimate))
  }
  
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = ctr2_estimate - base_ctr_estimate) %>%
    mutate(point_estimate_in_lift = point_estimate / base_ctr_estimate * 100) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(actual_diff_in_prop = ctr2 - base_ctr) %>%
    mutate(point_estimate_diff = point_estimate - actual_diff_in_prop) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_prop & actual_diff_in_prop <= CI_upper, TRUE, FALSE))
}

print_sim_stats = function(sim) {
  print("------------------ General Summary ------------------ ")
  print(paste("Largest p-value from simulation:", max(sim[["p_value"]])))
  print(paste("Smallest p-value from simulation:", min(sim[["p_value"]])))
  print(paste("Average point estimate:", mean(sim[["point_estimate"]])))
  print(paste("Percent total sign errors:", (sum(sim[["estimate_sign_error"]]) / nrow(sim)) * 100))
  print("")
  
  print("------------------ CI Summary ------------------ ")
  print(paste("Percent of simulation CIs that cover true difference in proportion:", sum(sim[["CI_coverage"]]) / nrow(sim)))
  print(paste("Average CI width:", mean(sim[["CI_width"]])))
  print(paste("Max CI width:", max(sim[["CI_width"]])))
  print(paste("Min CI width:", min(sim[["CI_width"]])))
  print(paste("Average CI lower bound:", mean(sim[["CI_lower"]])))
  print(paste("Average CI upper bound:", mean(sim[["CI_upper"]])))
  print("")
  
  print("------------------ Stat Sig Summary ------------------ ")
  print(paste("Percent of simulation trials that are significant:", sum(sim[["significant"]]) / nrow(sim)))
  sig_trials = sim %>% filter(significant)
  print(paste("Average point estimate of statistically significant trials:", mean(sig_trials[["point_estimate"]])))
  print(paste("Percent sign errors for statistically significant trials:", (sum(sig_trials[["estimate_sign_error"]]) / nrow(sig_trials)) * 100))
  print(paste("Percent of simulation CIs that cover true difference in proportion for statistically significant trials:", sum(sig_trials[["CI_coverage"]]) / nrow(sig_trials)))
  print("")
  
  print("------------------ Lift Summary ------------------ ")
  base_ctr = sim[["base_ctr"]][1] # use the sim's own base CTR rather than the global ctr1
  print(paste("Lift estimate:", (mean(sim[["point_estimate"]]) / base_ctr) * 100))
  print(paste("Lift estimate of sig trials:", (mean(sig_trials[["point_estimate"]]) / base_ctr) * 100))
}

###
# Assume a base scenario where the lift is expected to be 10% (from 3% CTR to 3.3% CTR).
# Then examine the impact if the actual population lift is different.
# Sample size is determined by: power.prop.test(p1=0.03, p2=0.033, sig.level=0.05, power=0.8)
# This leads to a sample size of 53,210 customers for each of the control and treatment groups (106,420 total).
# First examine individual cases and use the print_sim_stats function to print the results.
# Then systematically examine all population lifts from 0% to 20%.
###

# Sample size calculation assumptions:
# Conversion rate: 3%
# 95% confidence level
# 80% power
# 10% lift
n = 53210
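# Reproduce the sample size directly; power.prop.test returns the per-group n,
# which rounds up to the 53,210 used above.
power.prop.test(p1=0.03, p2=0.033, sig.level=0.05, power=0.8)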
n_sims = 1000

# Actual lift
# 10%
ctr1 = .03
ctr2 = .03 * 1.10
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 10%.")
print_sim_stats(sim)

# Actual lift
# 6.5%
ctr1 = .03
ctr2 = .03 * 1.065
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 6.5%.")
print_sim_stats(sim)

# Actual lift
# 5%
ctr1 = .03
ctr2 = .03 * 1.05
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 5%.")
print_sim_stats(sim)

# Actual lift
# 20%
ctr1 = .03
ctr2 = .03 * 1.2
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 20%.")
print_sim_stats(sim)


###
# Simulate all lifts from 0% to 20% with 100,000 trials each
###
n = 53210
n_sims = 100000 # 100,000 trials per lift, per the header above; reduce to 10 for a quick run
base_ctr = .03
lifts = seq(1, 1.2, by=.01)
sims = tibble()
for(lift in lifts) {
  ctr2 = base_ctr * lift
  sim = simulate(n_sims, n, base_ctr, ctr2) %>%
    mutate(population_lift = (lift - 1) * 100)
  sims = bind_rows(sims, sim)
}

cooked_sims = sims %>%
  mutate(estimated_lift = point_estimate / base_ctr_estimate * 100) %>%
  mutate(negative_estimate = point_estimate < 0) %>%
  mutate(CI_width_in_lift_percent = CI_width /  base_ctr_estimate * 100) %>%
  mutate(CI_lower_bound_in_lift_percent = CI_lower / base_ctr_estimate * 100) %>%
  mutate(CI_upper_bound_in_lift_percent = CI_upper / base_ctr_estimate * 100)

summary_results = cooked_sims %>%
  group_by(population_lift) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
          mean_estimated_lift = mean(estimated_lift),
          percent_sign_error = sum(estimate_sign_error) / n() * 100,
          percent_CI_coverage = sum(CI_coverage) / n() * 100,
          average_CI_width_in_lift_percent = mean(CI_width_in_lift_percent),
          average_CI_lower_bound_in_lift_percent = mean(CI_lower_bound_in_lift_percent),
          average_CI_upper_bound_in_lift_percent = mean(CI_upper_bound_in_lift_percent),
          average_pval = mean(p_value),
          max_pval = max(p_value),
          min_pval = min(p_value))

sig_results = cooked_sims %>%
  filter(significant == TRUE) %>%
  group_by(population_lift) %>%
  summarise(trial_count = n(),
            mean_estimated_lift = mean(estimated_lift),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width_in_lift_percent),
            average_CI_lower_bound_in_lift_percent = mean(CI_lower_bound_in_lift_percent),
            average_CI_upper_bound_in_lift_percent = mean(CI_upper_bound_in_lift_percent))

quantile_summary_all = cooked_sims %>%
  group_by(population_lift) %>%
  summarize(lower_q = quantile(estimated_lift, 0.025),
            upper_q = quantile(estimated_lift, 0.975),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift),
            lq_mean_diff = abs(lower_q - mean_pe),
            uq_mean_diff = abs(upper_q - mean_pe),
            lq_median_diff = abs(lower_q - median_pe),
            uq_median_diff = abs(upper_q - median_pe))
  
quantile_summary_sig = cooked_sims %>%
  filter(significant == TRUE) %>%
  group_by(population_lift) %>%
  summarize(lower_q = quantile(estimated_lift, 0.025),
            upper_q = quantile(estimated_lift, 0.975),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift),
            lq_mean_diff = abs(lower_q - mean_pe),
            uq_mean_diff = abs(upper_q - mean_pe),
            lq_median_diff = abs(lower_q - median_pe),
            uq_median_diff = abs(upper_q - median_pe))

estimate_range_all = cooked_sims %>%
  group_by(population_lift) %>%
  summarize(min_estimate = min(estimated_lift),
            max_estimate = max(estimated_lift),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift))
  
estimate_range_sig = cooked_sims %>%
  filter(significant == TRUE) %>%
  group_by(population_lift) %>%
  summarize(min_estimate = min(estimated_lift),
            max_estimate = max(estimated_lift),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift))


###
# P-value analysis if the CI lower bound were the actual population lift
###
# Analyze results of p-values of 0.01
# Count is 13,171
# Average point estimate is  9.12 
# Average CI lower bound is 2.14
cooked_sims %>%
  filter(p_value <= 0.01) %>%
  filter(p_value >= 0.009) %>%
  summarise(count = n(),
            average_point_estimate = mean(point_estimate_in_lift),
            average_CI_lower_bound = mean(CI_lower_bound_in_lift_percent),
            average_CI_upper_bound = mean(CI_upper_bound_in_lift_percent))

# What are the results if the CTR lift is 2.1%?
n = 53210
n_sims = 100000
base_ctr = .03
ctr2 = base_ctr + (base_ctr * 0.021)
sim = simulate(n_sims, n, base_ctr, ctr2)

sim %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            avg_pval = mean(p_value))


###
# Example of a p-value histogram
###
breaks = seq(from=0, to=1, by=0.05)

lift3_hist = sims %>%
  filter(near(ctr2, 0.0315)) %>% # near() avoids exact floating-point comparison
  select("p_value") %>%
  pull %>%
  hist(breaks=breaks)
lift3_hist$counts / sum(lift3_hist$counts) * 100 # scale counts to percentages


Cookie Cats analysis

Output .csv files can be found here. These files include:

  • Cookie cats raw data

  • Effect on 1-day retention of keeping a random X% of users (where X is set to 50%, 60%, 70%, 80%, 90%, 99%; 10 trials each)

  • Effect on 1-day retention of removing the top X% (where X varies from 0% to 20% by 0.1%)

  • Effect on 1-day retention of removing the bottom X% (where X varies from 0% to 20% by 0.1%)

  • Effect on 7-day retention of keeping a random X% of users (where X is set to 50%, 60%, 70%, 80%, 90%, 99%; 10 trials each)

  • Effect on 7-day retention of removing the top X% (where X varies from 0% to 20% by 0.1%)

  • Effect on 7-day retention of removing the bottom X% (where X varies from 0% to 20% by 0.1%)

  • Effect on 7-day retention of X% missing data, if that data is added back into the full dataset (where X varies from 5% to 20% by 5%)

  • The code below in a .R file

######################################################################################
# Simulation using real data (cookie cats)
######################################################################################

# This dataset represents the results of a real A/B test from the mobile game
# Cookie Cats. The dataset can be accessed at the URL below.
# https://www.kaggle.com/yufengsui/mobile-games-ab-testing#cookie_cats.csv

# Dictionary
# cc = cookie cats (the mobile game the A/B test was run on)
# rr = retention rate (the key outcome measure of the A/B test)
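# The raw file has one row per user with columns:
# userid, version ("gate_30" or "gate_40"), sum_gamerounds, retention_1, retention_7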

# Load libraries
library(tidyverse)

# Load data
cc_data = read_csv("cookie_cats.csv")

# Ensure every row is a user
length(unique(cc_data[["userid"]])) == nrow(cc_data)


###
# Define simulation functions
###
# Helper function to output data in convenient table form
get_cc_table = function(cc_data, retention_period) {
  if(retention_period != 1 & retention_period != 7) {
    stop("Retention period must be 1 or 7.")
  }
  
  retention = sym(paste0("retention_", retention_period))
  
  cc_data %>%
    select(version, !!retention) %>%
    group_by(version) %>%
    summarise(successes = sum(!!retention),
              totals = n(),
              rr = successes / totals)
}
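
# Example usage: get_cc_table(cc_data, 7) returns one row per version with
# columns version, successes, totals, and rr (about 0.190 for gate_30 and
# 0.182 for gate_40; see the missing-data section below).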

# Helper function to get rrs
cc_get_rrs = function(cc_data, retention_period) {
  t = get_cc_table(cc_data, retention_period)
  list(
    "gate_30_rr" = filter(t, version=="gate_30")[["rr"]],
    "gate_40_rr" = filter(t, version=="gate_40")[["rr"]]
  )
}

# Calculate prop.test
cc_rr_test = function(cc_data, retention_period) {
  t = get_cc_table(cc_data, retention_period)
  prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)
}

# Simulate trials
simulate = function(cc_data, n_sims, sample_size, retention_period) {
  result = tibble()
  
  for(i in 1:n_sims) {
    reduced_data = cc_data %>%
      sample_n(sample_size)
    
    sim = cc_rr_test(reduced_data, retention_period)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    estimate1 = sim[["estimate"]][[1]]
    estimate2 = sim[["estimate"]][[2]]
    cc_rrs = cc_get_rrs(reduced_data, retention_period)
    gate_30_rr = cc_rrs[["gate_30_rr"]]
    gate_40_rr = cc_rrs[["gate_40_rr"]]
    result = bind_rows(result, tibble(sample_size, gate_30_rr, gate_40_rr, p_value, CI_lower, CI_upper, estimate1, estimate2))
  }
  
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = estimate1 - estimate2) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(point_estimate_diff = actual_diff_in_prop - point_estimate) %>% # actual_diff_in_prop is a global set before each run
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_prop & actual_diff_in_prop <= CI_upper, TRUE, FALSE))
}


######################################################################################
# Analyze results of one-day retention
######################################################################################

# p-value of all rows: 0.07441
t = get_cc_table(cc_data, 1)
prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)


###
# p-value of random subsets of data from 50% to 99%
###

# Set global retention variables
cc_n = nrow(cc_data)
retention_period = 1
rr_for_test_population = cc_get_rrs(cc_data, retention_period)
actual_diff_in_prop = rr_for_test_population[["gate_30_rr"]] - rr_for_test_population[["gate_40_rr"]]

# Set sim parameters
n_sims = 10
sample_percentages = seq(from=.5, to=.9, by=.1)
sample_percentages = c(sample_percentages, .99)
sample_sizes = round(cc_n * sample_percentages)
sample_size_sims = tibble()

for(sample_size in sample_sizes) {
  sim = simulate(cc_data, n_sims, sample_size, retention_period)
  sample_size_sims = bind_rows(sample_size_sims, sim)
}

# Summarize sim results
summary_results = sample_size_sims %>%
  group_by(sample_size) %>%
  summarise(sample_percentages = mean(sample_size) / cc_n,
            percent_sig = sum(significant) / n() * 100,
            mean_estimate = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value))

View(summary_results)

# Examine the 99% case in more detail
n_sims = 1000000
sample_percentage = .99
sample_size = round(cc_n * sample_percentage)

sim = simulate(cc_data, n_sims, sample_size, retention_period)

# Summarize sim results
summary_results = sim %>%
  summarise(sample_percentages = mean(sample_size) / cc_n,
            percent_sig = sum(significant) / n() * 100,
            mean_estimate = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value))

View(summary_results)


###
# p-value of specified subsets of data
###

# Remove X% of users from top of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()

# Remove from top of list
for(pct in pct_removed) {
  cc_data_subset = tail(cc_data, round(cc_n * (1 - pct)))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}
plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "1-day_retention_top_removed_users_p-vals.csv")

# Remove X% of users from bottom of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()
for(pct in pct_removed) {
  cc_data_subset = head(cc_data, round(cc_n * (1 - pct)))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}
plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "1-day_retention_bottom_removed_users_p-vals.csv")


######################################################################################
# Analyze results of seven-day retention
######################################################################################

# p-value of all rows: 0.001554
t = get_cc_table(cc_data, 7)
prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)


###
# p-value of random subsets of data from 50% to 99%
###

# Set global retention variables
cc_n = nrow(cc_data)
retention_period = 7
rr_for_test_population = cc_get_rrs(cc_data, retention_period)
actual_diff_in_prop = rr_for_test_population[["gate_30_rr"]] - rr_for_test_population[["gate_40_rr"]]

# Set sim parameters
n_sims = 10
sample_percentages = seq(from=.5, to=.9, by=.1)
sample_percentages = c(sample_percentages, .99)
sample_sizes = round(cc_n * sample_percentages)
sample_size_sims = tibble()

for(sample_size in sample_sizes) {
  sim = simulate(cc_data, n_sims, sample_size, retention_period)
  sample_size_sims = bind_rows(sample_size_sims, sim)
}

# Summarize sim results
summary_results = sample_size_sims %>%
  group_by(sample_size) %>%
  summarise(sample_percentages = mean(sample_size) / cc_n,
            percent_sig = sum(significant) / n() * 100,
            mean_estimate = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)

View(summary_results)
write_csv(summary_results, "7-day_retention_random_pct_removed_p-vals.csv")


###
# p-value of specified subsets of data
###

# Remove X% of users from top of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()

for(pct in pct_removed) {
  cc_data_subset = tail(cc_data, round(cc_n * (1 - pct)))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}

plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "7-day_retention_top_removed_users_p-vals.csv")

# Remove X% of users from bottom of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()

for(pct in pct_removed) {
  cc_data_subset = head(cc_data, round(cc_n * (1 - pct)))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}

plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "7-day_retention_bottom_removed_users_p-vals.csv")


###
# Effect of missing data
###

# Original p_value for 7-day retention
t = get_cc_table(cc_data, 7)
t
pt = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)
pt[["estimate"]][["prop 1"]] # gate_30 retention rate is 0.1902013
pt[["estimate"]][["prop 2"]] # gate_40 retention rate is 0.1820000 
pt[["estimate"]][["prop 1"]] - pt[["estimate"]][["prop 2"]] # 0.008201298
pt[["p.value"]] # 0.00155425

# Test different retention rates for Gate 30 and Gate 40
n = nrow(cc_data)
percentages_missing = c(.05, .1, .15, .2)
gate_rrs = list(c(0.19, 0.182), c(0.182, 0.19), c(0.19, 0.19), c(0.19, 0.2), c(0.19, 0.21), c(0.19, 0.22), c(0.19, 0.23),
                c(0.19, 0.24), c(0.19, 0.25), c(0.19, 0.26), c(0.19, 0.27), c(0.19, 0.28), c(0.19, 0.29), c(0.19, 0.30),
                c(0.19, 0.31), c(0.19, 0.32), c(0.19, 0.33), c(0.19, 0.34), c(0.19, 0.35), c(0.19, 0.36), c(0.19, 0.37),
                c(0.19, 0.38), c(0.19, 0.39), c(0.19, 0.40), c(.01, .01), c(.01, .02), c(.01, .03), c(0.5, 0.51))
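# The first pair mirrors the observed 7-day retention rates (0.190, 0.182);
# the later pairs probe progressively more extreme rates for the missing users.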
missing_data_result = tibble()
n_trials = 10

# Iterate through different retention rates for different amounts of missing data and repeat 10 times for each combination
for(rrs in gate_rrs) {
  gate_30_new_rr = rrs[1]
  gate_40_new_rr = rrs[2]

  for(percent_missing in percentages_missing) {
    
    for(trial in 1:n_trials) {
      n_extra_users = round(n * (1/(1-percent_missing) - 1))
      assignment = sample(c("gate_30", "gate_40"), n_extra_users, replace=TRUE) # assign missing users to one of the treatments at random
      gate_30_new_tot = sum(assignment == "gate_30")
      gate_40_new_tot = sum(assignment == "gate_40")
      # rep(TRUE, x) truncates x, so these are floor(total * rate) successes for the added users
      gate_30_new_suc = floor(gate_30_new_tot * gate_30_new_rr)
      gate_40_new_suc = floor(gate_40_new_tot * gate_40_new_rr)
      
      # Add in the missing users to the original data
      t = get_cc_table(cc_data, 7)
      t[1, "successes"] = t[1, "successes"] + gate_30_new_suc 
      t[2, "successes"] = t[2, "successes"] + gate_40_new_suc
      t[1, "totals"] = t[1, "totals"] + gate_30_new_tot 
      t[2, "totals"] = t[2, "totals"] + gate_40_new_tot
      
      new_g30_rr = t[[1, "successes"]] / t[[1, "totals"]]
      new_g40_rr = t[[2, "successes"]] / t[[2, "totals"]]
      t = add_column(t, new_rr = c(new_g30_rr, new_g40_rr))
      
      # Calculate new p-value and estimates
      pt = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)
      p_value = pt[["p.value"]]
      prop1 = pt[["estimate"]][["prop 1"]]
      prop2 = pt[["estimate"]][["prop 2"]]
      prop_diff = pt[["estimate"]][["prop 1"]] - pt[["estimate"]][["prop 2"]]
      
      missing_data_result = bind_rows(missing_data_result,
                                      tibble(
                                        gate_30_new_rr,
                                        gate_40_new_rr,
                                        percent_missing,
                                        p_value,
                                        prop1,
                                        prop2,
                                        prop_diff))
    }
  }
}

# Analyze results
missing_data_summary = missing_data_result %>%
  group_by(percent_missing, gate_40_new_rr, gate_30_new_rr) %>%
  summarise(avg_pval = mean(p_value),
            avg_diff = mean(prop_diff)) %>%
  mutate(significant = if_else(avg_pval <= 0.05, TRUE, FALSE, missing = NULL))
View(missing_data_summary)


Average order value (AOV) simulation

Output .csv files can be found here. These files include:

  • Summarized results of the 100,000 trials for each difference in AOV of $0 to $20

  • Summarized results for all significant trials for each difference in AOV of $0 to $20

  • The code below in a .R file

######################################################################################
# Average order value (AOV) success metric simulation
######################################################################################

# This simulation uses a Gamma distribution to simulate differences in average order
# value (AOV) due to an A/B test. The Gamma distribution gives a good approximation of
# what customer orders might look like if they were plotted by the size of the order
# in dollars.

# Load libraries
library(tidyverse)

# Example of Gamma distribution
# With shape=2.1 and rate=.01 the mean is shape/rate = $210, a realistic
# average order value for some electronics retailers, for example.
x=seq(from=0, to=1000, by=.25)
y=dgamma(x, shape=2.1, rate=.01)
plot(x, y, type='l')
write_csv(data.frame(x,y), "example_gamma_dist.csv")


###
# Proof of normality
###

# Since we're simulating an A/B test we need two such distributions, simulating the
# effect of a test having an impact on the average order value. By the central limit
# theorem the sampling distribution of the difference of sample means should be
# approximately normal for large n, even though the underlying Gamma distribution
# is skewed. This gives us the ability to use a standard t-test to evaluate the
# success of the test.

diff_in_means = numeric()
for(i in 1:10000) {
  n = 1000
  order_amounts_1 = mean(rgamma(n, shape=2, rate=.01))
  order_amounts_2 = mean(rgamma(n, shape=2.5, rate=.01))
  diff_in_means = c(diff_in_means, order_amounts_2 - order_amounts_1)
}

hist(diff_in_means, breaks=100)
qqnorm(diff_in_means)

# Maximum sample size for the Shapiro-Wilk test is 5,000
diff_in_means = numeric()
for(i in 1:5000) {
  n = 1000
  order_amounts_1 = mean(rgamma(n, shape=2, rate=.01))
  order_amounts_2 = mean(rgamma(n, shape=2.5, rate=.01))
  diff_in_means = c(diff_in_means, order_amounts_2 - order_amounts_1)
}
shapiro.test(diff_in_means) # Should be insignificant


###
# Determination of shape and rate parameters that give 80% power
###

# Here the expected value and variance can be determined exactly
# using the specified shape and rate parameters.
# Wikipedia, among other sources, gives the closed form calculations:
# https://en.wikipedia.org/wiki/Gamma_distribution
# You could also use this online calculator which allows you to
# change the shape and rate parameters and observe the shape
# of the resulting distribution along with its statistics:
# https://homepage.divms.uiowa.edu/~mbognar/applets/gamma.html
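
# For example, with shape=2 and rate=.01 (the base distribution below) the
# closed forms give mean = shape/rate = 200 and variance = shape/rate^2 = 20,000.
# A quick empirical check of those moments:
g = rgamma(1000000, shape=2, rate=.01)
c(exact_mean = 200, sample_mean = mean(g))   # both ~200
c(exact_var = 20000, sample_var = var(g))    # both ~20,000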

shape1 = 2
shape2 = 2.1
rate = .01

population_diff_in_means = shape2/rate - shape1/rate
sd = sqrt((shape2/rate^2 + shape1/rate^2)/2) # pooled standard deviation (Gamma variance = shape/rate^2)
delta = population_diff_in_means / sd
power.t.test(delta=delta, power=.8, type = "two.sample", strict=T)
# n = 3,219

# You can also determine the sample size empirically and get a similar result.
# If you set 'n' and 'estimates' to be very large (for example the 
# values below) you can approximate the 3,219 sample size above
# very closely. Our simulation gave 3,220. However, it takes
# about 30 minutes to run. Smaller values give less accurate
# estimates.

n = 1000000
sample_size_estimates = numeric()
estimates = 10000

for(estimate in 1:estimates) {
  aov1 = rgamma(n, shape=2, rate=.01)
  aov2 = rgamma(n, shape=2.1, rate=.01)
  
  population_diff_in_means = mean(aov2) - mean(aov1)
  sd = sqrt((var(aov1) + var(aov2))/2)
  delta = population_diff_in_means / sd
  
  sample_size_estimate = power.t.test(delta=delta, power=.8, type = "two.sample", strict=T)[["n"]]
  sample_size_estimates = c(sample_size_estimates, sample_size_estimate)
}
mean(sample_size_estimates) # n = 3,220 in my trial

# A further check on power is to run repeated simulations with this sample size
# and the shape and rate parameters specified above for two different Gamma
# distributions and note that a t-test leads to a statistically significant
# difference 80% of the time. This simulation is done below.
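
# A compact version of that check (a sketch; 1,000 trials keeps the runtime
# short, so expect the result to vary around 0.80 by a point or so):
power_check = mean(replicate(1000, {
  t.test(rgamma(3219, shape=2.1, rate=.01),
         rgamma(3219, shape=2, rate=.01))[["p.value"]] <= 0.05
}))
power_check # should be close to 0.80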


###
# Define simulation functions
###

aov_sim = function(n, shape1, rate1, shape2, rate2) {
  aov1 = rgamma(n, shape=shape1, rate=rate1)
  aov2 = rgamma(n, shape=shape2, rate=rate2)
  t.test(aov2, aov1)
}

simulate_orders = function(n_sims, n, shape1, rate1, shape2, rate2) {
  result = tibble()
  
  population_aov1 = shape1 / rate1 # expected value
  population_aov2 = shape2 / rate2 # expected value
  
  for(i in 1:n_sims) {
    sim = aov_sim(n, shape1, rate1, shape2, rate2)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    estimate1 = sim[["estimate"]][[1]]
    estimate2 = sim[["estimate"]][[2]]
    result = bind_rows(result, tibble(population_aov1, population_aov2, p_value, CI_lower, CI_upper, estimate1, estimate2))
  }
  
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = estimate1 - estimate2) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(actual_diff_in_means = population_aov2 - population_aov1) %>%
    mutate(point_estimate_diff = actual_diff_in_means - point_estimate) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_means & actual_diff_in_means <= CI_upper, TRUE, FALSE))
}


###
# Simulate differences in AOV from $0 to $20
###
n_sims = 100000
n = 3219
shape1 = 2
rate1 = .01
shapes2 = seq(from=2, to=2.2, by=.01)
rate2 = .01

sims = tibble()
for(shape2 in shapes2) {
  sim = simulate_orders(n_sims, n, shape1, rate1, shape2, rate2)
  sims = bind_rows(sims, sim)
}

summary_results = sims %>%
  group_by(actual_diff_in_means) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            vibration_ratio = max(p_value) / min(p_value), # max p / min p, as in the other sections
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value))

sig_results = sims %>%
  filter(significant == TRUE) %>%
  group_by(actual_diff_in_means) %>%
  summarise(trial_count = n(),
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper))


Average order size (AOS) simulation

Output .csv files can be found here. These files include:

  • Summarized results of the 100,000 trials for differences in AOS that create powers of 10% to 100% by 10%

  • Summarized results for all significant trials for each difference in AOS that create powers of 10% to 100% by 10%

  • Summarized results of the 100,000 trials for differences in AOS of 0.1, 0.25, 0.5, 0.75, and 1.0 products per order

  • Summarized results for all significant trials for differences in AOS of 0.1, 0.25, 0.5, 0.75, and 1.0 products per order

  • The code below in a .R file

######################################################################################
# Average order size (AOS) success metric simulation
######################################################################################
# Load libraries
library(tidyverse)

# No pre-existing discrete distribution gives a good basis for a products per order
# metric. However, using the 'sample' function we can define this manually by specifying 
# an order size and its probability. An example distribution is shown below. What this
# means in practice is that 30% of customers buy just a single item, 20% buy two items,
# 10% buy three items and so on. Just 0.01% buy 40 items.

n = 1000000
max_products_per_order = 40
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .0185, .01, seq(.01, .0001, length.out=30))

order_sizes = sample(1:max_products_per_order, size=n, prob=probs1, replace=TRUE) # one simulated size per order
h = hist(order_sizes, breaks=40)
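
# Sanity check: the 40 probabilities must sum to 1
sum(probs1) # 1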


###
# Proof of normality
###

# Since we're simulating an A/B test we need two such distributions, simulating the
# effect of a test having an impact on the average order size. By the central limit
# theorem the sampling distribution of the difference of sample means should be
# approximately normal for large n, even if the original underlying distribution
# is discrete. This gives us the ability to use a standard t-test to evaluate the
# success of the test.

max_products_per_order = 40
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.33, .17, .15, .05, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Takes 30 seconds to run
diff_in_means = numeric()
for(i in 1:100000) {
  n = 3000
  o1=sample(1:max_products_per_order, size=n, prob=probs1, replace=TRUE)
  o2=sample(1:max_products_per_order, size=n, prob=probs2, replace=TRUE)
  diff_in_means = c(diff_in_means, mean(o2) - mean(o1))
}

hist(diff_in_means, breaks=100)
qqnorm(diff_in_means)

# Maximum sample size for the Shapiro-Wilk test is 5,000
diff_in_means = numeric()
for(i in 1:5000) {
  n = 3000
  o1 = sample(1:max_products_per_order, size=n, prob=probs1, replace=TRUE)
  o2 = sample(1:max_products_per_order, size=n, prob=probs2, replace=TRUE)
  diff_in_means = c(diff_in_means, mean(o2) - mean(o1))
}
shapiro.test(diff_in_means) # Should be insignificant


###
# Determination of probability mass functions that give powers of
# 0.1 to 1 in 0.1 increments.
###

# It is possible to simply play around with two distributions until the
# power happens to be at the specified level. For example, the below
# distributions were created via trial and error and give powers very close
# to those they are attempting to estimate. Note that the solutions are
# not unique. The distributions were designed to be somewhat realistic
# of actual distributions one might see in A/B test results.

# Estimate of 10% power. Actual power = 0.1002428
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1115, .0985, .09, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 20% power. Actual power = 0.2000198
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1525, .1, .05, .0475, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 30% power. Actual power = 0.2999245
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1771, .0729, .06, .05, .03, .02, .01, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 40% power. Actual power = 0.3999956
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.368, .232, .1, .04, .02, .02, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 50% power. Actual power = 0.4999833
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.38, .25, .05515, .05, .03, .02485, .02, .01, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 60% power. Actual power = 0.5999171
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .0826, .0774, .03, .01, .01, .01, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 70% power. Actual power = 0.6999192
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .1, .076, .014, .01, .01, .0185, .01, .01, seq(.01, .0001, length.out=30))

# Estimate of 80% power. Actual power = 0.800426
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .13, .05, .0185, .01, .01, .01, .01, .01, seq(.01, .0001, length.out=30))

# Estimate of 90% power. Actual power = 0.9000084
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.431735, .218265, .1, .02, .02, .0185, .01, .01, .01, .01, seq(.01, .0001, length.out=30))

# Estimate of 100% power. Actual power = 1
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .35, .155, .0185, .01, .01, .01, .01, .01, .01, seq(.001, .0001, length.out=30))

# It is also possible to simply play around with two distributions
# until the difference in means happens to be at the specified level.
# The below distributions were created via trial and error and give
# exact differences in means. Note that the solutions are not unique.
# The distributions were designed to be somewhat realistic of actual
# distributions one might see in A/B test results.

# AOS difference of 0.1 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# AOS difference of 0.25 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .05, .05, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# AOS difference of 0.5 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .165, .09, .069, .04, .031, .03, .0185, .01, .01, seq(.009, .0001, length.out=30))

# AOS difference of 0.75 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .17, .093, .067, .05, .03, .03, .0185, .01, .01, seq(.008, .0001, length.out=30))

# AOS difference of 1 product per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.45, .125, .121, .074, .06, .0185, .01, .01, .005, .005, seq(.008, .0001, length.out=30))

# A second distribution giving an AOS difference of 0.5 products per order (the solutions are not unique)
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.5, .14, .06, .04, .03, .01, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# AOS difference of 2 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.3, .18, .14, .14, .065, .05, .04, .03, .02, .0185, seq(.001, .0001, length.out=30))

# AOS difference of 3 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .35, .088, .0635, .0285, .02, .0135, .01, .005, .005, seq(.001, .0001, length.out=30))

# Plug in a pair of probability vectors from above to see the power it produces.
# Assign probs1 and probs2 here before running the block below (a worked
# example follows the power.t.test call):
# probs1 = c(...)
# probs2 = c(...)

counts = 1:40
population_diff_in_means = sum(counts * probs1) - sum(counts * probs2)
var1 = sum((counts)^2 * probs1) - sum((counts) * probs1)^2
var2 = sum((counts)^2 * probs2) - sum((counts) * probs2)^2
sd = sqrt((var1 + var2)/2)
delta = population_diff_in_means / sd

power.t.test(n=3000, delta=delta, type = "two.sample", strict=T)
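
# Worked example (a sketch): plugging in the 80%-power pair from above should
# report a power of roughly 0.80 (the simulation above put it at 0.800426):
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .13, .05, .0185, .01, .01, .01, .01, .01, seq(.01, .0001, length.out=30))
counts = 1:40
population_diff_in_means = sum(counts * probs1) - sum(counts * probs2)
var1 = sum(counts^2 * probs1) - sum(counts * probs1)^2
var2 = sum(counts^2 * probs2) - sum(counts * probs2)^2
delta = population_diff_in_means / sqrt((var1 + var2)/2)
power.t.test(n=3000, delta=delta, type="two.sample", strict=T)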

# A different method to find distributions would be to use constrained optimization.
# For a given sample size, the power is determined by the effect size (the difference
# in means). It is therefore possible to set a sample size and power and have
# 'power.t.test' return the difference in means that would generate that power
# (the function always returns whichever of 'n', 'power', and 'delta' is missing
# and requires the other two to be specified). Constrained optimization can then
# be used to find probabilities for the various order sizes that produce the
# specified difference in means. The first step is to get a list of the desired
# differences in means (called 'delta' by the t-test function). The code below does this.

powers = seq(from=.1, to=1, by=.1)
deltas = sapply(powers, FUN=function(power) {
  power.t.test(n=3000, power=power, type = "two.sample", strict=T)[["delta"]]
})
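
# 'deltas' now holds, for each power level, the standardized difference in
# means required at n = 3000; these are the targets for the optimization.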

# Take deltas to Excel and use solver constrained optimization add-in to produce
# two probability mass functions that give the specified delta. A sample Excel
# workbook is provided in the resources section of the article accompanying this code.


###
# Define simulation functions
###
products_per_order_sim = function(n, pmf1, pmf2) {
  products_per_order1 = sample(pmf1$counts, size=n, prob=pmf1$probs, replace=TRUE)
  products_per_order2 = sample(pmf2$counts, size=n, prob=pmf2$probs, replace=TRUE)
  t.test(products_per_order1, products_per_order2)
}

simulate_orders = function(n_sims, n, order_pmf1, order_pmf2) {
  result = tibble()
  
  population_products_per_order1 = sum(order_pmf1$counts * order_pmf1$probs) # expected value
  population_products_per_order2 = sum(order_pmf2$counts * order_pmf2$probs) # expected value
  
  for(i in 1:n_sims) {
    sim = products_per_order_sim(n, order_pmf1, order_pmf2)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    estimate1 = sim[["estimate"]][[1]]
    estimate2 = sim[["estimate"]][[2]]
    result = bind_rows(result, tibble(population_products_per_order1, population_products_per_order2, p_value, CI_lower, CI_upper, estimate1, estimate2))
  }
  
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = estimate1 - estimate2) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(actual_diff_in_means = population_products_per_order1 - population_products_per_order2) %>%
    mutate(point_estimate_diff = actual_diff_in_means - point_estimate) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_means & actual_diff_in_means <= CI_upper, TRUE, FALSE))
}

###
# Simulations based on power
###
base_order_size_probs = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

treatment_order_size_probs = list(
  p10 = c(.4, .1115, .0985, .09, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  p20 = c(.4, .1525, .1, .05, .0475, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  p30 = c(.4, .1771, .0729, .06, .05, .03, .02, .01, .01, .0185, seq(.01, .0001, length.out=30)),
  p40 = c(.368, .232, .1, .04, .02, .02, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  p50 = c(.38, .25, .05515, .05, .03, .02485, .02, .01, .01, .0185, seq(.01, .0001, length.out=30)),
  p60 = c(.4, .2, .0826, .0774, .03, .01, .01, .01, .01, .0185, seq(.01, .0001, length.out=30)),
  p70 = c(.4, .2, .1, .076, .014, .01, .01, .0185, .01, .01, seq(.01, .0001, length.out=30)),
  p80 = c(.4, .2, .13, .05, .0185, .01, .01, .01, .01, .01, seq(.01, .0001, length.out=30)),
  p90 = c(.431735, .218265, .1, .02, .02, .0185, .01, .01, .01, .01, seq(.01, .0001, length.out=30)),
  p100 = c(.4, .35, .155, .0185, .01, .01, .01, .01, .01, .01, seq(.001, .0001, length.out=30))
)

n = 3000 # n that was assumed in power calculations above
n_sims = 100000 # 100,000 trials per power level, per the output files above; reduce for a quick run
max_products_per_order = 40
# All power calculations are in relation to the base probabilities
base_order_size_distribution = list(counts=1:max_products_per_order, probs=base_order_size_probs)
sims = tibble()
power = 0.1

for(prob in treatment_order_size_probs) {
  treatment_order_size_distribution = list(counts=1:max_products_per_order, probs=prob)

  sim = simulate_orders(n_sims, n, base_order_size_distribution, treatment_order_size_distribution)
  sims = bind_rows(sims, cbind.data.frame(power, sim))
  power = power + 0.1
}

summary_results = sims %>%
  group_by(power) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            actual_diff_in_means = mean(actual_diff_in_means),
            average_estimated_lift = mean(point_estimate),
            average_point_estimate_diff = mean(point_estimate_diff),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)

sig_results = sims %>%
  filter(significant == TRUE) %>%
  group_by(power) %>%
  summarise(trial_count = n(),
            actual_diff_in_means = mean(actual_diff_in_means),
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper))

###
# Simulations based on differences in AOS
###
base_order_size_probs = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

treatment_order_size_probs = list(
  d.10 = c(.4, .1, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  d.25 = c(.4, .2, .05, .05, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  d.50 = c(.4, .165, .09, .069, .04, .031, .03, .0185, .01, .01, seq(.009, .0001, length.out=30)),
  d.75 = c(.4, .17, .093, .067, .05, .03, .03, .0185, .01, .01, seq(.008, .0001, length.out=30)),
  d1.0 = c(.45, .125, .121, .074, .06, .0185, .01, .01, .005, .005, seq(.008, .0001, length.out=30))
)

n = 3000 # n that was assumed in power calculations above
n_sims = 100000
max_products_per_order = 40
# All power calculations are in relation to the base probabilities
base_order_size_distribution = list(counts=1:max_products_per_order, probs=base_order_size_probs)
sims = tibble()

for(prob in treatment_order_size_probs) {
  treatment_order_size_distribution = list(counts=1:max_products_per_order, probs=prob)
  
  sim = simulate_orders(n_sims, n, base_order_size_distribution, treatment_order_size_distribution)
  sims = bind_rows(sims, sim)
}

summary_results = sims %>%
  group_by(actual_diff_in_means) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            average_estimated_lift = mean(point_estimate),
            average_point_estimate_diff = mean(point_estimate_diff),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)

sig_results = sims %>%
  filter(significant == TRUE) %>%
  group_by(actual_diff_in_means) %>%
  summarise(trial_count = n(),
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper))