Article Summary: These are the technical notes and code samples that accompany our main p-value replicability article.
Quick Guide
CTR simulation
Output .csv files can be found here. These files include:
Summarized results of 100,000 trials for each difference in CTR lift from 0% to 20%
Summarized results for all significant trials for each difference in CTR lift from 0% to 20%
Summarized results from 10 trials for each difference in CTR lift from 0% to 20%
Summarized results for the 2.5th and 97.5th percentile of estimated CTR lift from 0% to 20%
Summarized results for all significant trials for the 2.5th and 97.5th percentile of estimated CTR lift from 0% to 20%
Summarized results of the maximum and minimum estimated CTR lift from 0% to 20%
Summarized results for all significant trials of the maximum and minimum estimated CTR lift from 0% to 20%
Raw output of all 2.1 million simulations
Lift misestimation diagrams in JPEG format
The code below in a .R file
######################################################################################
# CTR success metric simulation
######################################################################################

# This simulation uses the rbinom function to randomly select either a 1 or a 0
# with the specified probability, simulating either a click or a non-click. This
# results in a test of proportions, the same test used to determine the outcome
# of A/B CTR tests.

# Load libraries
library(tidyverse)

# A simple example simulation is shown below.
ctr1 = .3
ctr2 = .33
n = 10000
button_1_ctr = rbinom(n, 1, ctr1)
button_2_ctr = rbinom(n, 1, ctr2)
results = c(sum(button_1_ctr)/n, sum(button_2_ctr)/n)
barplot(results, names.arg=c("Button 1 CTR", "Button 2 CTR"))

###
# Define simulation functions
###

ctr_sim = function(n, base_ctr, ctr2) {
  x = rbinom(n, 1, ctr2)
  y = rbinom(n, 1, base_ctr)
  prop.test(x=c(sum(x), sum(y)), n=c(n, n), correct=FALSE)
}

simulate = function(n_sims, n, base_ctr, ctr2) {
  result = tibble()
  for(i in 1:n_sims) {
    sim = ctr_sim(n, base_ctr, ctr2)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    ctr2_estimate = sim[["estimate"]][[1]]
    base_ctr_estimate = sim[["estimate"]][[2]]
    result = bind_rows(result, tibble(base_ctr, ctr2, p_value, CI_lower, CI_upper,
                                      base_ctr_estimate, ctr2_estimate))
  }
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = ctr2_estimate - base_ctr_estimate) %>%
    mutate(point_estimate_in_lift = point_estimate / base_ctr_estimate * 100) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(actual_diff_in_prop = ctr2 - base_ctr) %>%
    mutate(point_estimate_diff = point_estimate - actual_diff_in_prop) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_prop &
                                 actual_diff_in_prop <= CI_upper, TRUE, FALSE))
}

# Note: the lift summary at the end of this function uses the global ctr1
# defined in the scenarios below.
print_sim_stats = function(sim) {
  print("------------------ General Summary ------------------ ")
  print(paste("Largest p-value from simulation:", max(sim[["p_value"]])))
  print(paste("Smallest p-value from simulation:", min(sim[["p_value"]])))
  print(paste("Average point estimate:", sim %>% summarize(mean(point_estimate))))
  print(paste("Percent total sign errors:",
              (sum(sim[["estimate_sign_error"]]) / nrow(sim)) * 100))
  print("")
  print("------------------ CI Summary ------------------ ")
  print(paste("Proportion of simulation CIs that cover true difference in proportion:",
              sum(sim[["CI_coverage"]]) / nrow(sim)))
  print(paste("Average CI width:", sim %>% summarize(mean(CI_width))))
  print(paste("Max CI width:", max(sim[["CI_width"]])))
  print(paste("Min CI width:", min(sim[["CI_width"]])))
  print(paste("Average CI lower bound:", sim %>% summarize(mean(CI_lower))))
  print(paste("Average CI upper bound:", sim %>% summarize(mean(CI_upper))))
  print("")
  print("------------------ Stat Sig Summary ------------------ ")
  print(paste("Proportion of simulation trials that are significant:",
              sum(sim[["significant"]]) / nrow(sim)))
  sig_trials = sim %>% filter(significant == TRUE)
  print(paste("Average point estimate of statistically significant trials:",
              sig_trials %>% summarize(mean(point_estimate))))
  print(paste("Percent sign errors for statistically significant trials:",
              (sum(sig_trials[["estimate_sign_error"]]) / nrow(sig_trials)) * 100))
  print(paste("Proportion of simulation CIs that cover true difference in proportion for statistically significant trials:",
              sum(sig_trials[["CI_coverage"]]) / nrow(sig_trials)))
  print("")
  print("------------------ Lift Summary ------------------ ")
  print(paste("Lift estimate:",
              ((sim %>% summarize(mean(point_estimate))) / ctr1) * 100))
  print(paste("Lift estimate of sig trials:",
              ((sig_trials %>% summarize(mean(point_estimate))) / ctr1) * 100))
}

###
# Assume a base scenario where lift is expected to be 10% (from a 3% CTR to a
# 3.3% CTR). Then examine the impact if the actual population lift is different.
# Sample size is determined by:
#   power.prop.test(p1=0.03, p2=0.033, sig.level=0.05, power=0.8)
# This leads to a sample size of 53,210 customers for both the control and the
# treatment (106,420 total).
# First examine individual cases and use the print_sim_stats function to print
# the results. Then systematically examine all population lifts from 0% to 20%.
###

# Sample size calculation
# Conversion rate: 3%
# 95% CI
# 80% power
# 10% lift
n = 53210
n_sims = 1000

# Actual lift: 10%
ctr1 = .03
ctr2 = .03 * 1.10
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 10%.")
print_sim_stats(sim)

# Actual lift: 6.5%
ctr1 = .03
ctr2 = .03 * 1.065
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 6.5%.")
print_sim_stats(sim)

# Actual lift: 5%
ctr1 = .03
ctr2 = .03 * 1.05
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 5%.")
print_sim_stats(sim)

# Actual lift: 20%
ctr1 = .03
ctr2 = .03 * 1.2
sim = simulate(n_sims, n, ctr1, ctr2)
print("Sample size based on 10% lift at 80% power and actual population lift is 20%.")
print_sim_stats(sim)

###
# Simulate all lifts from 0% to 20%
###

n = 53210
n_sims = 10  # quick-run value; the published summaries and the 2.1 million raw rows used n_sims = 100000
base_ctr = .03
lifts = seq(1, 1.2, by=.01)
sims = tibble()
for(lift in lifts) {
  ctr2 = base_ctr * lift
  sim = simulate(n_sims, n, base_ctr, ctr2) %>%
    mutate(population_lift = (lift - 1) * 100)
  sims = bind_rows(sims, sim)
}

cooked_sims = sims %>%
  mutate(estimated_lift = point_estimate / base_ctr_estimate * 100) %>%
  mutate(negative_estimate = point_estimate < 0) %>%
  mutate(CI_width_in_lift_percent = CI_width / base_ctr_estimate * 100) %>%
  mutate(CI_lower_bound_in_lift_percent = CI_lower / base_ctr_estimate * 100) %>%
  mutate(CI_upper_bound_in_lift_percent = CI_upper / base_ctr_estimate * 100)

summary_results = cooked_sims %>%
  group_by(population_lift) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            mean_estimated_lift = mean(estimated_lift),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width_in_lift_percent),
            average_CI_lower_bound_in_lift_percent = mean(CI_lower_bound_in_lift_percent),
            average_CI_upper_bound_in_lift_percent = mean(CI_upper_bound_in_lift_percent),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value))

sig_results = cooked_sims %>%
  filter(significant == TRUE) %>%
  group_by(population_lift) %>%
  summarise(trial_count = n(),
            mean_estimated_lift = mean(estimated_lift),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width_in_lift_percent),
            average_CI_lower_bound_in_lift_percent = mean(CI_lower_bound_in_lift_percent),
            average_CI_upper_bound_in_lift_percent = mean(CI_upper_bound_in_lift_percent))

quantile_summary_all = cooked_sims %>%
  group_by(population_lift) %>%
  summarize(lower_q = quantile(estimated_lift, 0.025),
            upper_q = quantile(estimated_lift, 0.975),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift),
            lq_mean_diff = abs(lower_q - mean_pe),
            uq_mean_diff = abs(upper_q - mean_pe),
            lq_median_diff = abs(lower_q - median_pe),
            uq_median_diff = abs(upper_q - median_pe))

quantile_summary_sig = cooked_sims %>%
  filter(significant == TRUE) %>%
  group_by(population_lift) %>%
  summarize(lower_q = quantile(estimated_lift, 0.025),
            upper_q = quantile(estimated_lift, 0.975),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift),
            lq_mean_diff = abs(lower_q - mean_pe),
            uq_mean_diff = abs(upper_q - mean_pe),
            lq_median_diff = abs(lower_q - median_pe),
            uq_median_diff = abs(upper_q - median_pe))

estimate_range_all = cooked_sims %>%
  group_by(population_lift) %>%
  summarize(min_estimate = min(estimated_lift),
            max_estimate = max(estimated_lift),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift))

estimate_range_sig = cooked_sims %>%
  filter(significant == TRUE) %>%
  group_by(population_lift) %>%
  summarize(min_estimate = min(estimated_lift),
            max_estimate = max(estimated_lift),
            mean_pe = mean(estimated_lift),
            median_pe = median(estimated_lift))

###
# P-value analysis if the CI lower bound were the actual population lift
###

# Analyze results with p-values between 0.009 and 0.01
# Count is 13,171
# Average point estimate is 9.12
# Average CI lower bound is 2.14
cooked_sims %>%
  filter(p_value <= 0.01) %>%
  filter(p_value >= 0.009) %>%
  summarise(count = n(),
            average_point_estimate = mean(point_estimate_in_lift),
            average_CI_lower_bound = mean(CI_lower_bound_in_lift_percent),
            average_CI_upper_bound = mean(CI_upper_bound_in_lift_percent))

# What are the results if the CTR lift is 2.1%?
n = 53210
n_sims = 100000
base_ctr = .03
ctr2 = base_ctr + (base_ctr * 0.021)
sim = simulate(n_sims, n, base_ctr, ctr2)
sim %>%
  summarise(n_sig = sum(significant) / n() * 100,
            avg_pval = mean(p_value))

###
# Example of a p-value histogram
###

breaks = seq(from=0, to=1, by=0.05)
lift3_hist = sims %>%
  filter(ctr2 == 0.0315) %>%  # a ctr2 of 0.0315 corresponds to a 5% population lift
  select("p_value") %>%
  pull %>%
  hist(breaks)
lift3_hist$counts / 100000 * 100  # scale counts to percentages (assumes 100,000 trials per lift)
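As a sanity check on the simulated percent_sig column, the power that theory predicts at each population lift can be computed directly with power.prop.test. The short sketch below is our addition (the theoretical_power name is illustrative, not one of the published outputs); it assumes the same n = 53,210 per arm used above.

# Theoretical power at each population lift, for comparison with percent_sig.
# The 0% lift case is skipped: with no true difference, the rejection rate is
# simply the significance level, 5%.
lifts = seq(1.01, 1.2, by=.01)
theoretical_power = sapply(lifts, function(l) {
  power.prop.test(n=53210, p1=.03, p2=.03*l, sig.level=.05)[["power"]]
})
data.frame(population_lift = (lifts - 1) * 100, theoretical_power)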
Cookie Cats analysis
Output .csv files can be found here. These files include:
Cookie cats raw data
Effect on 1-day retention of removing a random X% of users (where X is set to 50%, 60%, 70%, 80%, 90%, 99%, 10 trials each)
Effect on 1-day retention of removing the top X% (where X varies from 0% to 20% by 0.1%)
Effect on 1-day retention of removing the bottom X% (where X varies from 0% to 20% by 0.1%)
Effect on 7-day retention of removing a random X% of users (where X is set to 50%, 60%, 70%, 80%, 90%, 99%, 10 trials each)
Effect on 7-day retention of removing the top X% (where X varies from 0% to 20% by 0.1%)
Effect on 7-day retention of removing the bottom X% (where X varies from 0% to 20% by 0.1%)
Effect on 7-day retention of X% missing data, if that data is added back into the full dataset (where X varies from 5% to 20% by 5%)
The code below in a .R file
######################################################################################
# Simulation using real data (Cookie Cats)
######################################################################################

# This dataset represents the results of a real A/B test from the mobile game
# Cookie Cats. The dataset can be accessed at the URL below.
# https://www.kaggle.com/yufengsui/mobile-games-ab-testing#cookie_cats.csv

# Dictionary
# cc = cookie cats (the mobile game the A/B test was implemented on)
# rr = retention rate (the key outcome measure of the A/B test)

# Load libraries
library(tidyverse)

# Load data
cc_data = read_csv("cookie_cats.csv")

# Ensure every row is a user
length(unique(cc_data[["userid"]])) == nrow(cc_data)

###
# Define simulation functions
###

# Helper function to output data in convenient table form
get_cc_table = function(cc_data, retention_period) {
  if(retention_period != 1 & retention_period != 7) {
    stop("Retention period must be 1 or 7.")
  }
  retention = sym(paste0("retention_", retention_period))
  cc_data %>%
    select(version, !!retention) %>%
    group_by(version) %>%
    summarise(successes = sum(!!retention), totals = n(), rr = successes / totals)
}

# Helper function to get retention rates
cc_get_rrs = function(cc_data, retention_period) {
  t = get_cc_table(cc_data, retention_period)
  list(
    "gate_30_rr" = filter(t, version=="gate_30")[["rr"]],
    "gate_40_rr" = filter(t, version=="gate_40")[["rr"]]
  )
}

# Calculate prop.test
cc_rr_test = function(cc_data, retention_period) {
  t = get_cc_table(cc_data, retention_period)
  prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)
}

# Simulate trials
# Note: this function reads actual_diff_in_prop from the global environment,
# so that variable must be set (as it is below) before simulate() is called.
simulate = function(cc_data, n_sims, sample_size, retention_period) {
  result = tibble()
  for(i in 1:n_sims) {
    reduced_data = cc_data %>% sample_n(sample_size)
    sim = cc_rr_test(reduced_data, retention_period)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    estimate1 = sim[["estimate"]][[1]]
    estimate2 = sim[["estimate"]][[2]]
    cc_rrs = cc_get_rrs(reduced_data, retention_period)
    gate_30_rr = cc_rrs[["gate_30_rr"]]
    gate_40_rr = cc_rrs[["gate_40_rr"]]
    result = bind_rows(result, tibble(sample_size, gate_30_rr, gate_40_rr, p_value,
                                      CI_lower, CI_upper, estimate1, estimate2))
  }
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = estimate1 - estimate2) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(point_estimate_diff = actual_diff_in_prop - point_estimate) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_prop &
                                 actual_diff_in_prop <= CI_upper, TRUE, FALSE))
}

######################################################################################
# Analyze results of one-day retention
######################################################################################

# p-value of all rows: 0.07441
t = get_cc_table(cc_data, 1)
prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)

###
# p-value of random subsets of data from 50% to 99%
###

# Set global retention variables
cc_n = nrow(cc_data)
retention_period = 1
rr_for_test_population = cc_get_rrs(cc_data, retention_period)
actual_diff_in_prop = rr_for_test_population[["gate_30_rr"]] -
                      rr_for_test_population[["gate_40_rr"]]

# Set sim parameters
n_sims = 10
sample_percentages = seq(from=.5, to=.9, by=.1)
sample_percentages = c(sample_percentages, .99)
sample_sizes = round(cc_n * sample_percentages)  # sample_n needs whole numbers

sample_size_sims = tibble()
for(sample_size in sample_sizes) {
  sim = simulate(cc_data, n_sims, sample_size, retention_period)
  sample_size_sims = bind_rows(sample_size_sims, sim)
}

# Summarize sim results
summary_results = sample_size_sims %>%
  group_by(sample_size) %>%
  summarise(sample_percentages = mean(sample_size) / cc_n,
            percent_sig = sum(significant) / n() * 100,
            mean_estimate = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value))
View(summary_results)

# Examine the 99% case in more detail
n_sims = 1000000
sample_percentage = .99
sample_size = round(cc_n * sample_percentage)
sim = simulate(cc_data, n_sims, sample_size, retention_period)

# Summarize sim results
summary_results = sim %>%
  summarise(sample_percentages = mean(sample_size) / cc_n,
            percent_sig = sum(significant) / n() * 100,
            mean_estimate = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value))
View(summary_results)

###
# p-value of specified subsets of data
###

# Remove X% of users from top of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()
for(pct in pct_removed) {
  cc_data_subset = tail(cc_data, cc_n * (1 - pct))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}
plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "1-day_retention_top_removed_users_p-vals.csv")

# Remove X% of users from bottom of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()
for(pct in pct_removed) {
  cc_data_subset = head(cc_data, cc_n * (1 - pct))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}
plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "1-day_retention_bottom_removed_users_p-vals.csv")

######################################################################################
# Analyze results of seven-day retention
######################################################################################

# p-value of all rows: 0.001554
t = get_cc_table(cc_data, 7)
prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)

###
# p-value of random subsets of data from 50% to 99%
###

# Set global retention variables
cc_n = nrow(cc_data)
retention_period = 7
rr_for_test_population = cc_get_rrs(cc_data, retention_period)
actual_diff_in_prop = rr_for_test_population[["gate_30_rr"]] -
                      rr_for_test_population[["gate_40_rr"]]

# Set sim parameters
n_sims = 10
sample_percentages = seq(from=.5, to=.9, by=.1)
sample_percentages = c(sample_percentages, .99)
sample_sizes = round(cc_n * sample_percentages)

sample_size_sims = tibble()
for(sample_size in sample_sizes) {
  sim = simulate(cc_data, n_sims, sample_size, retention_period)
  sample_size_sims = bind_rows(sample_size_sims, sim)
}

# Summarize sim results
summary_results = sample_size_sims %>%
  group_by(sample_size) %>%
  summarise(sample_percentages = mean(sample_size) / cc_n,
            percent_sig = sum(significant) / n() * 100,
            mean_estimate = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width_in_lift_percent = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)
View(summary_results)
write_csv(summary_results, "7-day_retention_random_pct_removed_p-vals.csv")

###
# p-value of specified subsets of data
###

# Remove X% of users from top of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()
for(pct in pct_removed) {
  cc_data_subset = tail(cc_data, cc_n * (1 - pct))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}
plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "7-day_retention_top_removed_users_p-vals.csv")

# Remove X% of users from bottom of list
pct_removed = seq(from=0, to=.20, by=0.001)
p_vals = numeric()
for(pct in pct_removed) {
  cc_data_subset = head(cc_data, cc_n * (1 - pct))
  t = get_cc_table(cc_data_subset, retention_period)
  p_val = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)[["p.value"]]
  p_vals = c(p_vals, p_val)
}
plot(x=pct_removed, y=p_vals, type='l')
write_csv(data.frame(pct_removed, p_vals), "7-day_retention_bottom_removed_users_p-vals.csv")

###
# Effect of missing data
###

# Original p-value for 7-day retention
t = get_cc_table(cc_data, 7)
t
pt = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)
pt[["estimate"]][["prop 1"]]  # gate_30 retention rate is 0.1902013
pt[["estimate"]][["prop 2"]]  # gate_40 retention rate is 0.1820000
pt[["estimate"]][["prop 1"]] - pt[["estimate"]][["prop 2"]]  # 0.008201298
pt[["p.value"]]  # 0.00155425

# Test different retention rates for gate_30 and gate_40
n = nrow(cc_data)
percentages_missing = c(.05, .1, .15, .2)
gate_rrs = list(c(0.19, 0.182), c(0.182, 0.19), c(0.19, 0.19), c(0.19, 0.2),
                c(0.19, 0.21), c(0.19, 0.22), c(0.19, 0.23), c(0.19, 0.24),
                c(0.19, 0.25), c(0.19, 0.26), c(0.19, 0.27), c(0.19, 0.28),
                c(0.19, 0.29), c(0.19, 0.30), c(0.19, 0.31), c(0.19, 0.32),
                c(0.19, 0.33), c(0.19, 0.34), c(0.19, 0.35), c(0.19, 0.36),
                c(0.19, 0.37), c(0.19, 0.38), c(0.19, 0.39), c(0.19, 0.40),
                c(.01, .01), c(.01, .02), c(.01, .03), c(0.5, 0.51))
missing_data_result = tibble()
n_trials = 10

# Iterate through different retention rates for different amounts of missing data
# and repeat 10 times for each combination
for(rrs in gate_rrs) {
  gate_30_new_rr = rrs[1]
  gate_40_new_rr = rrs[2]
  for(percent_missing in percentages_missing) {
    for(trial in 1:n_trials) {
      n_extra_users = round(n * (1/(1-percent_missing) - 1))
      # Assign missing users to one of the treatments at random
      assignment = sample(c("gate_30", "gate_40"), n_extra_users, replace=TRUE)
      gate_30_new_tot = sum(assignment == "gate_30")
      gate_40_new_tot = sum(assignment == "gate_40")
      gate_30_new_suc = sum(rep(TRUE, gate_30_new_tot * gate_30_new_rr))
      gate_40_new_suc = sum(rep(TRUE, gate_40_new_tot * gate_40_new_rr))
      # Add the missing users to the original data
      t = get_cc_table(cc_data, 7)
      t[1, "successes"] = t[1, "successes"] + gate_30_new_suc
      t[2, "successes"] = t[2, "successes"] + gate_40_new_suc
      t[1, "totals"] = t[1, "totals"] + gate_30_new_tot
      t[2, "totals"] = t[2, "totals"] + gate_40_new_tot
      new_g30_rr = t[[1, "successes"]] / t[[1, "totals"]]
      new_g40_rr = t[[2, "successes"]] / t[[2, "totals"]]
      t = add_column(t, new_rr = c(new_g30_rr, new_g40_rr))
      # Calculate new p-value and estimates
      pt = prop.test(x=t[["successes"]], n=t[["totals"]], correct=FALSE)
      p_value = pt[["p.value"]]
      prop1 = pt[["estimate"]][["prop 1"]]
      prop2 = pt[["estimate"]][["prop 2"]]
      prop_diff = prop1 - prop2
      missing_data_result = bind_rows(missing_data_result,
                                      tibble(gate_30_new_rr, gate_40_new_rr,
                                             percent_missing, p_value,
                                             prop1, prop2, prop_diff))
    }
  }
}

# Analyze results
missing_data_summary = missing_data_result %>%
  group_by(percent_missing, gate_40_new_rr, gate_30_new_rr) %>%
  summarise(avg_pval = mean(p_value), avg_diff = mean(prop_diff)) %>%
  mutate(significant = if_else(avg_pval <= 0.05, TRUE, FALSE, missing = NULL))
View(missing_data_summary)
Average order value (AOV) simulation
Output .csv files can be found here. These files include:
Summarized results of 100,000 trials for each difference in AOV from $0 to $20
Summarized results for all significant trials for each difference in AOV from $0 to $20
The code below in a .R file
######################################################################################
# Average order value (AOV) success metric simulation
######################################################################################

# This simulation uses a Gamma distribution to simulate differences in average order
# value (AOV) due to an A/B test. The Gamma distribution gives a good approximation of
# what customer orders might look like if they were plotted by the size of the order
# in dollars.

# Load libraries
library(tidyverse)

# Example of a Gamma distribution
# This distribution is equivalent to the average order being $210 (shape/rate),
# a realistic estimate for some electronics retailers, for example.
x = seq(from=0, to=1000, by=.25)
y = dgamma(x, shape=2.1, rate=.01)
plot(x, y, type='l')
write_csv(data.frame(x, y), "example_gamma_dist.csv")

###
# Proof of normality
###

# Since we're simulating an A/B test we need two such distributions, simulating the
# effect of a test having an impact on the average order size. By the central limit
# theorem the sampling distribution of the difference of sample means should be
# approximately normal for large n, even if the underlying distribution is not
# normal. This gives us the ability to use a standard t-test to evaluate the
# success of the test.
diff_in_means = numeric()
for(i in 1:10000) {
  n = 1000
  order_amounts_1 = mean(rgamma(n, shape=2, rate=.01))
  order_amounts_2 = mean(rgamma(n, shape=2.5, rate=.01))
  diff_in_means = c(diff_in_means, order_amounts_2 - order_amounts_1)
}
hist(diff_in_means, breaks=100)
qqnorm(diff_in_means)

# The maximum sample size for the Shapiro-Wilk test is 5,000
diff_in_means = numeric()
for(i in 1:5000) {
  n = 1000
  order_amounts_1 = mean(rgamma(n, shape=2, rate=.01))
  order_amounts_2 = mean(rgamma(n, shape=2.5, rate=.01))
  diff_in_means = c(diff_in_means, order_amounts_2 - order_amounts_1)
}
shapiro.test(diff_in_means)  # should be insignificant

###
# Determination of shape and rate parameters that give 80% power
###

# Here the expected value and variance can be determined exactly using the
# specified shape and rate parameters. Wikipedia, among other sources, gives the
# closed-form calculations:
# https://en.wikipedia.org/wiki/Gamma_distribution
# You could also use this online calculator, which allows you to change the shape
# and rate parameters and observe the shape of the resulting distribution along
# with its statistics:
# https://homepage.divms.uiowa.edu/~mbognar/applets/gamma.html
shape1 = 2
shape2 = 2.1
rate = .01
population_diff_in_means = shape2/rate - shape1/rate
sd = sqrt((shape2/rate^2 + shape1/rate^2)/2)  # pooled standard deviation
delta = population_diff_in_means / sd
power.t.test(delta=delta, power=.8, type="two.sample", strict=T)  # n = 3,219
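As a quick empirical check of those closed forms (this snippet is our addition, not part of the original outputs), draws from rgamma should reproduce mean = shape/rate and variance = shape/rate^2:

# Empirical moments of Gamma draws versus the closed forms
set.seed(42)
x = rgamma(1000000, shape=2.1, rate=.01)
c(empirical_mean = mean(x), theoretical_mean = 2.1/.01)   # both near 210
c(empirical_var = var(x), theoretical_var = 2.1/.01^2)    # both near 21,000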
# You can also determine the sample size empirically and get a similar result.
# If you set 'n' and 'estimates' to be very large (for example the values below)
# you can approximate the 3,219 sample size above very closely. Our simulation
# gave 3,220. However, it takes about 30 minutes to run. Smaller values give
# less accurate estimates.
n = 1000000
sample_size_estimates = numeric()
estimates = 10000
for(estimate in 1:estimates) {
  aov1 = rgamma(n, shape=2, rate=.01)
  aov2 = rgamma(n, shape=2.1, rate=.01)
  population_diff_in_means = mean(aov2) - mean(aov1)
  sd = sqrt((var(aov1) + var(aov2))/2)
  delta = population_diff_in_means / sd
  sample_size_estimate = power.t.test(delta=delta, power=.8, type="two.sample",
                                      strict=T)[["n"]]
  sample_size_estimates = c(sample_size_estimates, sample_size_estimate)
}
mean(sample_size_estimates)  # n = 3,220 in my trial

# A further check on power is to run repeated simulations with this sample size
# and the shape and rate parameters specified above for two different Gamma
# distributions, and note that a t-test leads to a statistically significant
# difference 80% of the time. This simulation is done below.

###
# Define simulation functions
###

aov_sim = function(n, shape1, rate1, shape2, rate2) {
  aov1 = rgamma(n, shape=shape1, rate=rate1)
  aov2 = rgamma(n, shape=shape2, rate=rate2)
  t.test(aov2, aov1)
}

simulate_orders = function(n_sims, n, shape1, rate1, shape2, rate2) {
  result = tibble()
  population_aov1 = shape1 / rate1  # expected value
  population_aov2 = shape2 / rate2  # expected value
  for(i in 1:n_sims) {
    sim = aov_sim(n, shape1, rate1, shape2, rate2)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    estimate1 = sim[["estimate"]][[1]]
    estimate2 = sim[["estimate"]][[2]]
    result = bind_rows(result, tibble(population_aov1, population_aov2, p_value,
                                      CI_lower, CI_upper, estimate1, estimate2))
  }
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = estimate1 - estimate2) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(actual_diff_in_means = population_aov2 - population_aov1) %>%
    mutate(point_estimate_diff = actual_diff_in_means - point_estimate) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_means &
                                 actual_diff_in_means <= CI_upper, TRUE, FALSE))
}

###
# Simulate differences in AOV from $0 to $20
###

n_sims = 100000
n = 3219
shape1 = 2
rate1 = .01
shapes2 = seq(from=2, to=2.2, by=.01)
rate2 = .01
sims = tibble()
for(shape2 in shapes2) {
  sim = simulate_orders(n_sims, n, shape1, rate1, shape2, rate2)
  sims = bind_rows(sims, sim)
}

summary_results = sims %>%
  group_by(actual_diff_in_means) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)

sig_results = sims %>%
  filter(significant == TRUE) %>%
  group_by(actual_diff_in_means) %>%
  summarise(trial_count = n(),
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper))
Average order size (AOS) simulation
Output .csv files can be found here. These files include:
Summarized results of 100,000 trials for each difference in AOS that produces a power of 10% to 100%, in 10% increments
Summarized results for all significant trials for each difference in AOS that produces a power of 10% to 100%, in 10% increments
Summarized results of 100,000 trials for differences in AOS of 0.1, 0.25, 0.5, 0.75, and 1.0 products per order
Summarized results for all significant trials for differences in AOS of 0.1, 0.25, 0.5, 0.75, and 1.0 products per order
The code below in a .R file
######################################################################################
# Average order size (AOS) success metric simulation
######################################################################################

# Load libraries
library(tidyverse)

# No pre-existing discrete distribution gives a good basis for a products-per-order
# metric. However, using the 'sample' function we can define one manually by
# specifying each order size and its probability. An example distribution is shown
# below. What this means in practice is that 30% of customers buy just a single
# item, 20% buy two items, 10% buy three items, and so on. Just 0.01% buy 40 items.
n = 1000000
max_products_per_order = 40
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .0185, .01, seq(.01, .0001, length.out=30))
avg_order_size = sample(1:max_products_per_order, size=n, prob=probs1, replace=TRUE)
h = hist(avg_order_size, breaks=40)

###
# Proof of normality
###

# Since we're simulating an A/B test we need two such distributions, simulating the
# effect of a test having an impact on the average order size. By the central limit
# theorem the sampling distribution of the difference of sample means should be
# approximately normal for large n, even if the original underlying distribution
# is discrete. This gives us the ability to use a standard t-test to evaluate the
# success of the test.
max_products_per_order = 40
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.33, .17, .15, .05, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Takes 30 seconds to run
diff_in_means = numeric()
for(i in 1:100000) {
  n = 3000
  o1 = sample(1:max_products_per_order, size=n, prob=probs1, replace=TRUE)
  o2 = sample(1:max_products_per_order, size=n, prob=probs2, replace=TRUE)
  diff_in_means = c(diff_in_means, mean(o2) - mean(o1))
}
hist(diff_in_means, breaks=100)
qqnorm(diff_in_means)

# The maximum sample size for the Shapiro-Wilk test is 5,000
diff_in_means = numeric()
for(i in 1:5000) {
  n = 3000
  o1 = sample(1:max_products_per_order, size=n, prob=probs1, replace=TRUE)
  o2 = sample(1:max_products_per_order, size=n, prob=probs2, replace=TRUE)
  diff_in_means = c(diff_in_means, mean(o2) - mean(o1))
}
shapiro.test(diff_in_means)  # should be insignificant

###
# Determination of probability mass functions that give power of 0.1 to 1
# in 0.1 increments
###

# It is possible to simply play around with two distributions until the power
# happens to land at the specified level. For example, the distributions below
# were created via trial and error and give powers very close to those they are
# attempting to estimate. Note that the solutions are not unique. The
# distributions were designed to be somewhat realistic of what one might see in
# actual A/B test results.

# Estimate of 10% power. Actual power = 0.1002428
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1115, .0985, .09, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 20% power. Actual power = 0.2000198
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1525, .1, .05, .0475, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 30% power. Actual power = 0.2999245
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1771, .0729, .06, .05, .03, .02, .01, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 40% power. Actual power = 0.3999956
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.368, .232, .1, .04, .02, .02, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 50% power. Actual power = 0.4999833
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.38, .25, .05515, .05, .03, .02485, .02, .01, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 60% power. Actual power = 0.5999171
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .0826, .0774, .03, .01, .01, .01, .01, .0185, seq(.01, .0001, length.out=30))

# Estimate of 70% power. Actual power = 0.6999192
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .1, .076, .014, .01, .01, .0185, .01, .01, seq(.01, .0001, length.out=30))

# Estimate of 80% power. Actual power = 0.800426
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .13, .05, .0185, .01, .01, .01, .01, .01, seq(.01, .0001, length.out=30))

# Estimate of 90% power. Actual power = 0.9000084
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.431735, .218265, .1, .02, .02, .0185, .01, .01, .01, .01, seq(.01, .0001, length.out=30))

# Estimate of 100% power. Actual power = 1
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .35, .155, .0185, .01, .01, .01, .01, .01, .01, seq(.001, .0001, length.out=30))

# It is also possible to simply play around with two distributions until the
# difference in means happens to land at the specified level. The distributions
# below were created via trial and error and give exact differences in means.
# Note that the solutions are not unique. The distributions were designed to be
# somewhat realistic of what one might see in actual A/B test results. Each pair
# can be verified with the small helper shown after this list.
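The verification helper below is our own addition (aos_diff is an illustrative name): the population mean of a pmf over order sizes 1 to 40 is just the probability-weighted sum of the counts, so the claimed AOS difference for any base/treatment pair is one subtraction.

# Check a claimed AOS difference for any probs1/probs2 pair from the list below
aos_diff = function(probs1, probs2, counts = 1:40) {
  sum(counts * probs1) - sum(counts * probs2)
}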
# AOS difference of 0.1 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# AOS difference of 0.25 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .2, .05, .05, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# AOS difference of 0.5 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .165, .09, .069, .04, .031, .03, .0185, .01, .01, seq(.009, .0001, length.out=30))

# AOS difference of 0.75 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .17, .093, .067, .05, .03, .03, .0185, .01, .01, seq(.008, .0001, length.out=30))

# AOS difference of 1 product per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.45, .125, .121, .074, .06, .0185, .01, .01, .005, .005, seq(.008, .0001, length.out=30))

# AOS difference of 0.5 products per order (an alternative solution; as noted
# above, solutions are not unique)
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.5, .14, .06, .04, .03, .01, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))

# AOS difference of 2 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.3, .18, .14, .14, .065, .05, .04, .03, .02, .0185, seq(.001, .0001, length.out=30))

# AOS difference of 3 products per order
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .35, .088, .0635, .0285, .02, .0135, .01, .005, .005, seq(.001, .0001, length.out=30))

# Plug in any probs1/probs2 pair from above to see the corresponding power
# (the 0.1-difference pair is used here as an example)
probs1 = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
probs2 = c(.4, .1, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30))
counts = 1:40
population_diff_in_means = sum(counts * probs1) - sum(counts * probs2)
var1 = sum(counts^2 * probs1) - sum(counts * probs1)^2
var2 = sum(counts^2 * probs2) - sum(counts * probs2)^2
sd = sqrt((var1 + var2)/2)
delta = population_diff_in_means / sd
power.t.test(n=3000, delta=delta, type="two.sample", strict=T)

# A different method to find distributions would be to use constrained
# optimization. For a given sample size, the power is determined by the effect
# size (the standardized difference in means). It is therefore possible to set a
# sample size and a power and have power.t.test return the difference in means
# ('delta') that generates that power (the function always returns whichever of
# 'n', 'power', and 'delta' is missing and requires the other two to be
# specified). It is then possible to use constrained optimization to set the
# probabilities of the various order sizes so that they produce the specified
# difference in means. The first step is to get the list of desired differences
# in means (called 'delta' by the t-test function). The code below does this.
powers = seq(from=.1, to=1, by=.1)
deltas = sapply(powers, FUN=function(power) {
  power.t.test(n=3000, power=power, type="two.sample", strict=T)[["delta"]]
})

# Take the deltas to Excel and use the Solver constrained-optimization add-in to
# produce two probability mass functions that give each specified delta. A sample
# Excel workbook is provided in the resources section of the article accompanying
# this code.
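For readers who would rather stay in R than round-trip through Excel, the same search can be sketched with unconstrained optimization over a softmax parameterization, which keeps the candidate probabilities positive and summing to one. This is our own illustration (find_pmf_for_delta is a hypothetical helper, not the method used to produce the published distributions), and nothing in it forces the result to look like a realistic order-size distribution, which is why the pmfs above were shaped by hand.

# Sketch: search for a treatment pmf whose standardized difference in means
# from the base pmf hits a target delta. The softmax transform turns an
# unconstrained parameter vector into a valid pmf, so plain optim() suffices.
find_pmf_for_delta = function(target_delta, base_probs, counts = 1:40) {
  base_mean = sum(counts * base_probs)
  base_var = sum(counts^2 * base_probs) - base_mean^2
  objective = function(theta) {
    p = exp(theta) / sum(exp(theta))  # softmax: positive and sums to one
    m = sum(counts * p)
    v = sum(counts^2 * p) - m^2
    delta = (m - base_mean) / sqrt((v + base_var) / 2)  # pooled-sd standardization
    (delta - target_delta)^2
  }
  fit = optim(log(base_probs), objective, method="BFGS")
  exp(fit$par) / sum(exp(fit$par))
}

# Example: target the delta that gives 80% power at n = 3000 per group
base_probs = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185,
               seq(.01, .0001, length.out=30))
target = power.t.test(n=3000, power=.8, type="two.sample", strict=T)[["delta"]]
probs2 = find_pmf_for_delta(target, base_probs)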
###
# Define simulation functions
###

products_per_order_sim = function(n, pmf1, pmf2) {
  products_per_order1 = sample(pmf1$counts, size=n, prob=pmf1$probs, replace=TRUE)
  products_per_order2 = sample(pmf2$counts, size=n, prob=pmf2$probs, replace=TRUE)
  t.test(products_per_order1, products_per_order2)
}

simulate_orders = function(n_sims, n, order_pmf1, order_pmf2) {
  result = tibble()
  population_orders_per_product1 = sum(order_pmf1$counts * order_pmf1$probs)
  population_orders_per_product2 = sum(order_pmf2$counts * order_pmf2$probs)
  for(i in 1:n_sims) {
    sim = products_per_order_sim(n, order_pmf1, order_pmf2)
    p_value = sim[["p.value"]]
    CI_lower = sim[["conf.int"]][1]
    CI_upper = sim[["conf.int"]][2]
    estimate1 = sim[["estimate"]][[1]]
    estimate2 = sim[["estimate"]][[2]]
    result = bind_rows(result, tibble(population_orders_per_product1,
                                      population_orders_per_product2, p_value,
                                      CI_lower, CI_upper, estimate1, estimate2))
  }
  result %>%
    mutate(significant = if_else(p_value <= 0.05, TRUE, FALSE, missing = NULL)) %>%
    mutate(point_estimate = estimate1 - estimate2) %>%
    mutate(estimate_sign_error = point_estimate < 0) %>%
    mutate(actual_diff_in_means = population_orders_per_product1 -
                                  population_orders_per_product2) %>%
    mutate(point_estimate_diff = actual_diff_in_means - point_estimate) %>%
    mutate(CI_width = abs(CI_upper - CI_lower)) %>%
    mutate(CI_coverage = if_else(CI_lower <= actual_diff_in_means &
                                 actual_diff_in_means <= CI_upper, TRUE, FALSE))
}

###
# Simulations based on power
###

base_order_size_probs = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185,
                          seq(.01, .0001, length.out=30))
treatment_order_size_probs = list(
  p10 = c(.4, .1115, .0985, .09, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  p20 = c(.4, .1525, .1, .05, .0475, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  p30 = c(.4, .1771, .0729, .06, .05, .03, .02, .01, .01, .0185, seq(.01, .0001, length.out=30)),
  p40 = c(.368, .232, .1, .04, .02, .02, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  p50 = c(.38, .25, .05515, .05, .03, .02485, .02, .01, .01, .0185, seq(.01, .0001, length.out=30)),
  p60 = c(.4, .2, .0826, .0774, .03, .01, .01, .01, .01, .0185, seq(.01, .0001, length.out=30)),
  p70 = c(.4, .2, .1, .076, .014, .01, .01, .0185, .01, .01, seq(.01, .0001, length.out=30)),
  p80 = c(.4, .2, .13, .05, .0185, .01, .01, .01, .01, .01, seq(.01, .0001, length.out=30)),
  p90 = c(.431735, .218265, .1, .02, .02, .0185, .01, .01, .01, .01, seq(.01, .0001, length.out=30)),
  p100 = c(.4, .35, .155, .0185, .01, .01, .01, .01, .01, .01, seq(.001, .0001, length.out=30))
)

n = 3000  # the n that was assumed in the power calculations above
n_sims = 100  # quick-run value; the published power summaries used 100,000 trials
max_products_per_order = 40

# All power calculations are in relation to the base probabilities
base_order_size_distribution = list(counts=1:max_products_per_order,
                                    probs=base_order_size_probs)
sims = tibble()
power = 0.1
for(prob in treatment_order_size_probs) {
  treatment_order_size_distribution = list(counts=1:max_products_per_order, probs=prob)
  sim = simulate_orders(n_sims, n, base_order_size_distribution,
                        treatment_order_size_distribution)
  sims = bind_rows(sims, cbind.data.frame(power, sim))
  power = power + 0.1
}

summary_results = sims %>%
  group_by(power) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            actual_diff_in_means = mean(actual_diff_in_means),
            average_estimated_lift = mean(point_estimate),
            average_point_estimate_diff = mean(point_estimate_diff),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)

sig_results = sims %>%
  filter(significant == TRUE) %>%
  group_by(power) %>%
  summarise(trial_count = n(),
            actual_diff_in_means = mean(actual_diff_in_means),
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper))

###
# Simulations based on differences in average AOS
###

base_order_size_probs = c(.3, .2, .1, .1, .05, .03, .02, .02, .01, .0185,
                          seq(.01, .0001, length.out=30))
treatment_order_size_probs = list(
  d.1 = c(.4, .1, .1, .1, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  d.25 = c(.4, .2, .05, .05, .05, .03, .02, .02, .01, .0185, seq(.01, .0001, length.out=30)),
  d.50 = c(.4, .165, .09, .069, .04, .031, .03, .0185, .01, .01, seq(.009, .0001, length.out=30)),
  d.75 = c(.4, .17, .093, .067, .05, .03, .03, .0185, .01, .01, seq(.008, .0001, length.out=30)),
  d1.0 = c(.45, .125, .121, .074, .06, .0185, .01, .01, .005, .005, seq(.008, .0001, length.out=30))
)

n = 3000  # the n that was assumed in the power calculations above
n_sims = 100000
max_products_per_order = 40

# All power calculations are in relation to the base probabilities
base_order_size_distribution = list(counts=1:max_products_per_order,
                                    probs=base_order_size_probs)
sims = tibble()
for(prob in treatment_order_size_probs) {
  treatment_order_size_distribution = list(counts=1:max_products_per_order, probs=prob)
  sim = simulate_orders(n_sims, n, base_order_size_distribution,
                        treatment_order_size_distribution)
  sims = bind_rows(sims, sim)
}

summary_results = sims %>%
  group_by(actual_diff_in_means) %>%
  summarise(percent_sig = sum(significant) / n() * 100,
            average_estimated_lift = mean(point_estimate),
            average_point_estimate_diff = mean(point_estimate_diff),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            percent_CI_coverage = sum(CI_coverage) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper),
            average_pval = mean(p_value),
            max_pval = max(p_value),
            min_pval = min(p_value),
            vibration_ratio = max_pval / min_pval)

sig_results = sims %>%
  filter(significant == TRUE) %>%
  group_by(actual_diff_in_means) %>%
  summarise(trial_count = n(),
            estimated_lift = mean(point_estimate),
            percent_sign_error = sum(estimate_sign_error) / n() * 100,
            average_CI_width = mean(CI_width),
            average_CI_lower_bound = mean(CI_lower),
            average_CI_upper_bound = mean(CI_upper))