P-value function code sample
Article Summary: These are the technical notes and code samples that accompany our main p-value function article. P-value functions expand on standard null hypothesis significance testing by allowing the null hypothesis to range between all reasonable values instead of fixing it at the standard value of 0 (an assumption of no difference between group means).
P-value function code sample
###################################################################################### # P-value Function Using IBM Watson Insurance Data ###################################################################################### # This file creates a p-value function based on the difference in customer lifetime # value of two different insurance policies. # The dataset can be found here: # https://www.kaggle.com/pankajjsh06/ibm-watson-marketing-customer-value-data/version/1 # Load libraries library(tidyverse) # Load data data = read_csv("WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv") ### # Examine p-value function for Policy Type ### x = data %>% filter(`Policy Type` == "Personal Auto") %>% select(`Customer Lifetime Value`) %>% pull y = data %>% filter(`Policy Type` == "Corporate Auto") %>% select(`Customer Lifetime Value`) %>% pull t = t.test(x=x, y=y) # What is the original p-value? t[["p.value"]] # 0.02 # What is the effect size/ t[["estimate"]][1] - t[["estimate"]][2] # $212.95 # What is the total sample size? length(x) + length(y) # 8,756 # Original effect size is $212, set effect size tests so that the entire p-value # function is seen in the plot window. Plot all p-values for these nulls. nulls = seq(from=-500, to=1000, by=1) results = tibble() for(null in nulls) { p_value = t.test(x=x, y=y, mu=null)[["p.value"]] results = bind_rows(results, tibble(null, p_value)) } plot(x=results[["null"]], y=results[["p_value"]], type='l') abline(h = 0.05) # What is the p-value of 0 difference? t.test(x=x, y=y)[["p.value"]] # 0.2 # What is the "mirror"sister" null that also produces this effect size? results %>% filter(between(p_value, 0.2, 0.201)) # $426 ### # P-value functions for vehicle size ### x = data %>% filter(`Vehicle Size` == "Small") %>% select(`Customer Lifetime Value`) %>% pull y = data %>% filter(`Vehicle Size` == "Large") %>% select(`Customer Lifetime Value`) %>% pull t = t.test(x=x, y=y) # What is the original p-value? t[["p.value"]] # What is effect size? t[["estimate"]][1] - t[["estimate"]][2] # 540.1 # What is the total sample size? length(x) + length(y) # 2,710 # Original effect size is $540, set effect size tests so that the entire p-value # function is seen in the plot window. Plot all p-values for these nulls. nulls = seq(from=-1000, to=2000, by=1) results = tibble() for(null in nulls) { p_value = t.test(x=x, y=y, mu=null)[["p.value"]] results = bind_rows(results, tibble(null, p_value)) } plot(x=results[["null"]], y=results[["p_value"]], type='l') abline(h = 0.05) # Examine results right around where the p-value crosses above 0.05 threshold results %>% filter(between(p_value, 0.049, 0.051)) # $2 to $3 ### # Examine p-value function for Policy Type using different sample sizes ### # Set mean and sd mean_small_vech = mean(x) mean_large_vech = mean(y) sd_small_vech = sd(x) sd_large_vech = sd(y) # Plot small sample size sample_size = 1000 x_s = rnorm(sample_size, mean=mean_small_vech, sd=sd_small_vech) y_s = rnorm(sample_size, mean=mean_large_vech, sd=sd_large_vech) nulls = seq(from=-800, to=2000, by=1) results_s = tibble() for(null in nulls) { p_value = t.test(x=x_s, y=y_s, mu=null)[["p.value"]] results_s = bind_rows(results_s, tibble(null, p_value)) } plot(x=results_s[["null"]], y=results_s[["p_value"]], type='l') # Plot medium sample size sample_size = 10000 x_m = rnorm(sample_size, mean=mean_small_vech, sd=sd_small_vech) y_m = rnorm(sample_size, mean=mean_large_vech, sd=sd_large_vech) nulls = seq(from=-800, to=2000, by=1) results_m = tibble() for(null in nulls) { p_value = t.test(x=x_m, y=y_m, mu=null)[["p.value"]] results_m = bind_rows(results_m, tibble(null, p_value)) } lines(x=results_m[["null"]], y=results_m[["p_value"]], type='l') # Plot large sample size sample_size = 1000000 x_l = rnorm(sample_size, mean=mean_small_vech, sd=sd_small_vech) y_l = rnorm(sample_size, mean=mean_large_vech, sd=sd_large_vech) nulls = seq(from=-800, to=2000, by=1) results_l = tibble() for(null in nulls) { p_value = t.test(x=x_l, y=y_l, mu=null)[["p.value"]] results_l = bind_rows(results_l, tibble(null, p_value)) } lines(x=results_l[["null"]], y=results_l[["p_value"]], type='l') # Center results for more useful visual max_s = which(results_s[["p_value"]] == max(results_s[["p_value"]])) max_m = which(results_m[["p_value"]] == max(results_m[["p_value"]])) max_l = which(results_l[["p_value"]] == max(results_l[["p_value"]])) results_m[["null"]] = results_m[["null"]] + (max_s - max_m) results_l[["null"]] = results_l[["null"]] + (max_s - max_l) plot(x=results_s[["null"]], y=results_s[["p_value"]], type='l') lines(x=results_m[["null"]], y=results_m[["p_value"]], type='l') lines(x=results_l[["null"]], y=results_l[["p_value"]], type='l')