A/B Testing Guide — Hypothesis, Sample Size & Statistical Significance
A/B testing (split testing) is a controlled experiment that compares two or more variants of a web page, email, ad, or feature to determine which performs better — using statistical methods to ensure results are reliable, not random.
What You’ll Learn
This tutorial covers the complete A/B testing workflow: forming testable hypotheses, calculating required sample sizes, understanding statistical significance (p-values, confidence intervals, power), running multivariate tests (MVT), selecting the right tools, and avoiding common experimentation pitfalls.
Why It Matters
Companies that run continuous A/B tests see 20-30% improvement in conversion rates within the first year. Without proper statistical rigor, teams make decisions based on noise — running tests too short, peeking at results, or drawing conclusions from insignificant data. DodaTech A/B tests every landing page and email campaign for Doda Browser and Durga Antivirus Pro.
Real-World Use
Booking.com runs 1,000+ concurrent A/B tests, contributing to 50%+ annual conversion growth. Netflix tests thumbnail images, saving $1B+ annually in content discovery. Microsoft’s Bing increased revenue by 12% through a single color A/B test on ad backgrounds.
flowchart LR
subgraph Planning
A[Observe Data] --> B[Form Hypothesis]
B --> C[Define Metrics]
C --> D[Calculate Sample]
end
subgraph Execution
D --> E[Randomize Traffic]
E --> F[Run Experiment]
F --> G[Collect Data]
end
subgraph Analysis
G --> H[Calculate<br/>Significance]
H --> I{Statistically<br/>Significant?}
I -->|Yes| J[Implement Winner]
I -->|No| K[Iterate or Stop]
end
Hypothesis Formulation
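Every A/B test starts with a hypothesis: a specific, testable statement of what you will change, which metric you expect to move, and why. When you have several candidate hypotheses, prioritize them with a scoring framework such as ICE (Impact, Confidence, Ease) or PIE (Potential, Importance, Ease).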
class HypothesisFormulator:
    """Formulate and score A/B test hypotheses.

    Every hypothesis is a plain dict; all hypotheses created through this
    object are kept in ``self.hypotheses`` in creation order.
    """

    def __init__(self):
        self.hypotheses = []

    def create_hypothesis(self, observation, proposed_change, expected_impact, metric, rationale):
        """Create a structured hypothesis, register it, and return it.

        The returned dict carries the five arguments under keys of the same
        name plus ``"ice_score"``, which starts at 0 until ``score_ice`` is
        called on it.
        """
        h = {
            "observation": observation,
            "proposed_change": proposed_change,
            "expected_impact": expected_impact,
            "metric": metric,
            "rationale": rationale,
            "ice_score": 0,
        }
        self.hypotheses.append(h)
        return h

    def score_ice(self, hypothesis, impact, confidence, ease):
        """Score a hypothesis using the ICE framework (1-10 each).

        The ICE score is the arithmetic mean of the three inputs. It is
        written into ``hypothesis["ice_score"]`` (the dict is mutated in
        place) and also returned together with the individual components.
        """
        hypothesis["ice_score"] = (impact + confidence + ease) / 3
        return {
            "impact": impact,
            "confidence": confidence,
            "ease": ease,
            "ice": hypothesis["ice_score"],
        }

    def prioritize(self):
        """Return a new list of hypotheses sorted by ICE score, highest first."""
        return sorted(self.hypotheses, key=lambda h: h["ice_score"], reverse=True)

    def formulate_ab(self, control_desc, variant_desc, metric, min_detectable_effect):
        """Formulate a complete A/B test plan.

        ``min_detectable_effect`` is a fraction (0.15 means 15%). A matching
        hypothesis is created and registered as a side effect.

        Returns a dict with the null and alternative hypothesis statements,
        the metric, the MDE, and the registered hypothesis dict.
        """
        hypothesis = self.create_hypothesis(
            observation=f"Current {metric} is below benchmark",
            proposed_change=f"Test {control_desc} vs {variant_desc}",
            # ``:g`` drops binary floating-point noise: 0.15 * 100 evaluates to
            # 15.000000000000002, which would otherwise leak into the text.
            expected_impact=f"Increase {metric} by {min_detectable_effect * 100:g}%",
            metric=metric,
            rationale=f"{variant_desc} should improve {metric} because...",
        )
        null_hypothesis = f"There is no difference in {metric} between {control_desc} and {variant_desc}"
        alt_hypothesis = f"There is a difference in {metric} between {control_desc} and {variant_desc}"
        return {
            "null": null_hypothesis,
            "alternative": alt_hypothesis,
            "metric": metric,
            "min_detectable_effect": min_detectable_effect,
            "hypothesis": hypothesis,
        }
# Demo: build a plan for a landing-page test, print it, then score it with ICE.
form = HypothesisFormulator()
plan = form.formulate_ab(
    "Current landing page (long form)",
    "Short landing page with CTA above fold",
    "conversion_rate",
    0.15,
)
print(f"Null hypothesis: {plan['null']}")
print(f"Alternative: {plan['alternative']}")
# Format to one decimal place: 0.15 * 100 is 15.000000000000002 in binary
# floating point, which would otherwise be printed verbatim.
print(f"MDE: {plan['min_detectable_effect'] * 100:.1f}% improvement")
score = form.score_ice(plan["hypothesis"], impact=8, confidence=7, ease=9)
print(f"ICE score: {score['ice']:.1f}/10")
Expected output:
Null hypothesis: There is no difference in conversion_rate between Current landing page (long form) and Short landing page with CTA above fold
Alternative: There is a difference in conversion_rate between Current landing page (long form) and Short landing page with CTA above fold
MDE: 15.0% improvement
ICE score: 8.0/10
Sample Size Calculation
Running a test with too few visitors risks false negatives (Type II error). The required sample size depends on:
- Baseline conversion rate (current performance)
- Minimum detectable effect (MDE — smallest improvement worth detecting)
- Significance level (α, typically 0.05 — 5% risk of false positive)
- Statistical power (1-β, typically 0.80 — 80% chance of detecting true effect)
class SampleSizeCalculator:
    """Calculate required sample size for A/B tests."""

    @staticmethod
    def required_sample_size(baseline_rate, mde, alpha=0.05, power=0.80):
        """Calculate sample size per variant using the normal approximation.

        Args:
            baseline_rate: Control conversion rate as a fraction in (0, 1).
            mde: Minimum detectable effect, relative to the baseline
                (0.20 means the variant rate is baseline * 1.20).
            alpha: Two-sided significance level.
            power: Desired statistical power (1 - beta).

        Returns:
            Visitors needed in each variant, rounded up to an integer.

        Raises:
            ValueError: If a rate or probability is outside (0, 1), or
                ``mde`` is zero (no difference to detect).
        """
        import math
        from scipy import stats

        if not 0 < baseline_rate < 1:
            raise ValueError("baseline_rate must be between 0 and 1 (exclusive)")
        if mde == 0:
            raise ValueError("mde must be non-zero")
        if not 0 < alpha < 1:
            raise ValueError("alpha must be between 0 and 1 (exclusive)")
        if not 0 < power < 1:
            raise ValueError("power must be between 0 and 1 (exclusive)")

        p1 = baseline_rate
        p2 = baseline_rate * (1 + mde)
        if not 0 < p2 < 1:
            raise ValueError("baseline_rate * (1 + mde) must be between 0 and 1 (exclusive)")

        # Derive the z-scores from the arguments so alpha and power are
        # actually honored (the defaults give roughly 1.96 and 0.84).
        z_alpha = float(stats.norm.ppf(1 - alpha / 2))  # two-sided test
        z_beta = float(stats.norm.ppf(power))

        # n = (z_a*sqrt(2*p*(1-p)) + z_b*sqrt(p1*(1-p1) + p2*(1-p2)))^2 / (p2-p1)^2
        # where p is the average of the two rates.
        p_pool = (p1 + p2) / 2
        numerator = (z_alpha * math.sqrt(2 * p_pool * (1 - p_pool)) +
                     z_beta * math.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2
        denominator = (p2 - p1) ** 2
        return math.ceil(numerator / denominator)

    @staticmethod
    def estimate_duration(n_per_variant, daily_traffic, variants=2):
        """Estimate days needed to reach the sample size.

        Returns a dict with ``total_visitors_needed`` (all variants combined)
        and ``est_days`` (rounded up to whole days).

        Raises:
            ValueError: If ``daily_traffic`` is not positive.
        """
        # Imported here because this method does not share the local import
        # made inside required_sample_size.
        import math

        if daily_traffic <= 0:
            raise ValueError("daily_traffic must be positive")
        total = n_per_variant * variants
        days = math.ceil(total / daily_traffic)
        return {"total_visitors_needed": total, "est_days": days}

    def summary(self, baseline, mde, daily_traffic, alpha=0.05, power=0.80):
        """Print a sample-size report and return the per-variant sample size."""
        n = self.required_sample_size(baseline, mde, alpha=alpha, power=power)
        duration = self.estimate_duration(n, daily_traffic)
        print(f"\n=== Sample Size Calculation ===")
        print(f"Baseline conversion: {baseline*100:.1f}%")
        print(f"Minimum detectable effect: {mde*100:.0f}% relative increase")
        print(f"Visitors per variant needed: {n:,}")
        print(f"Total visitors: {duration['total_visitors_needed']:,}")
        print(f"Estimated duration: {duration['est_days']} days at {daily_traffic}/day")
        return n
# Demo: create a calculator to print a sample-size report for one scenario.
calc = SampleSizeCalculator()
calc.summary(baseline=0.05, mde=0.20, daily_traffic=5000)
Expected output:
=== Sample Size Calculation ===
Baseline conversion: 5.0%
Minimum detectable effect: 20% relative increase
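Visitors per variant needed: 8,158
Total visitors: 16,316
Estimated duration: 4 days at 5000/day
(These figures use exact z-scores; with the rounded constants 1.96 and 0.84 the result is 8,149 per variant.)
Running the Experiment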
class ABTestRunner:
    """Simulate running an A/B test and analyzing results.

    Counts are kept in ``self.results`` under the keys ``"control"`` and
    ``"variant"``, each holding ``visitors`` and ``conversions``.
    """

    def __init__(self, control_name="Control", variant_name="Variant"):
        self.control_name = control_name
        self.variant_name = variant_name
        self.results = {
            "control": {"visitors": 0, "conversions": 0},
            "variant": {"visitors": 0, "conversions": 0},
        }
        self.is_running = False

    def start(self):
        """Mark the test as running and announce it."""
        self.is_running = True
        print(f"[AB Test] Started: {self.control_name} vs {self.variant_name}")

    def record_conversion(self, variant, converted=True):
        """Record one visitor for ``variant`` and, if ``converted``, one conversion.

        ``variant`` must be ``"control"`` or ``"variant"``; any other key
        raises ``KeyError``.
        """
        self.results[variant]["visitors"] += 1
        if converted:
            self.results[variant]["conversions"] += 1

    def simulate_traffic(self, n_visitors, true_effect=0, baseline_rate=0.05):
        """Simulate ``n_visitors`` visitors in each arm.

        Args:
            n_visitors: Visitors to add to both control and variant.
            true_effect: Absolute lift added to the variant's conversion
                probability (0.02 means +2 percentage points).
            baseline_rate: Conversion probability of the control arm.
        """
        import random

        variant_rate = baseline_rate + true_effect
        for _ in range(n_visitors):
            # Draw control first, then variant, one pair per iteration.
            self.record_conversion("control", random.random() < baseline_rate)
            self.record_conversion("variant", random.random() < variant_rate)

    def analyze(self):
        """Run a two-sided, pooled two-proportion z-test on the recorded data.

        Returns a dict with rates and lift in percent, the z-score, the
        p-value, a ``significant`` flag (p < 0.05) and ``confidence``, which
        is simply ``(1 - p_value) * 100``.

        If either arm has no visitors yet, no test can be computed: the
        z-score is reported as 0 and the p-value as 1.0 (not significant)
        instead of raising ``ZeroDivisionError``.
        """
        import math
        from scipy import stats

        c = self.results["control"]
        v = self.results["variant"]
        c_n = c["visitors"]
        v_n = v["visitors"]
        c_rate = c["conversions"] / c_n if c_n > 0 else 0
        v_rate = v["conversions"] / v_n if v_n > 0 else 0

        if c_n > 0 and v_n > 0:
            # Pooled standard error under the null hypothesis of equal rates
            p_pool = (c["conversions"] + v["conversions"]) / (c_n + v_n)
            se = math.sqrt(p_pool * (1 - p_pool) * (1 / c_n + 1 / v_n))
        else:
            se = 0

        z = (v_rate - c_rate) / se if se > 0 else 0
        # The survival function avoids the cancellation in 1 - cdf for large
        # |z|; float() keeps numpy scalar types out of the result so that
        # ``significant`` is a plain bool.
        p_value = float(2 * stats.norm.sf(abs(z)))  # Two-tailed
        lift = ((v_rate - c_rate) / c_rate * 100) if c_rate > 0 else 0
        return {
            "control_rate": round(c_rate * 100, 2),
            "variant_rate": round(v_rate * 100, 2),
            "lift_pct": round(lift, 2),
            "z_score": round(z, 4),
            "p_value": round(p_value, 4),
            "significant": p_value < 0.05,
            "confidence": round((1 - p_value) * 100, 2),
        }

    def report(self):
        """Print a formatted table of the counts followed by the analysis."""
        result = self.analyze()
        print(f"\n{'='*45}")
        print(f"A/B Test Results")
        print(f"{'='*45}")
        print(f"{'':<15} {'Visitors':<12} {'Conversions':<12} {'Rate':<10}")
        print("-" * 45)
        print(f"{self.control_name:<15} {self.results['control']['visitors']:<12} "
              f"{self.results['control']['conversions']:<12} {result['control_rate']:<9.2f}%")
        print(f"{self.variant_name:<15} {self.results['variant']['visitors']:<12} "
              f"{self.results['variant']['conversions']:<12} {result['variant_rate']:<9.2f}%")
        print("-" * 45)
        print(f"Lift: {result['lift_pct']}%")
        print(f"p-value: {result['p_value']}")
        print(f"Statistically significant: {'YES ✅' if result['significant'] else 'NO ❌'}")
        print(f"Confidence: {result['confidence']}%")
# Simulate a test where the variant's conversion probability is 2 percentage
# points higher than the control's (true_effect is added to the rate, so it is
# an absolute lift, not a relative one). No random seed is set here, so the
# printed counts differ from run to run.
test = ABTestRunner("Current Page", "New Page")
test.start()
test.simulate_traffic(30000, true_effect=0.02)
test.report()
Expected output:
[AB Test] Started: Current Page vs New Page
=============================================
A/B Test Results
=============================================
Visitors Conversions Rate
---------------------------------------------
Current Page 30000 1483 4.94%
New Page 30000 2098 6.99%
---------------------------------------------
Lift: 41.5%
p-value: 0.0
Statistically significant: YES ✅
Confidence: 100.0%
Understanding Statistical Significance
Key Concepts
class SignificanceExplainer:
    """Helpers that put significance-testing quantities into plain words."""

    @staticmethod
    def explain_p_value(p):
        """Return a plain-language strength-of-evidence label for p-value ``p``."""
        # Ordered from strictest to loosest; the first cutoff that p falls
        # below decides the label.
        verdicts = (
            (0.001, "Highly significant — strong evidence against null hypothesis"),
            (0.01, "Very significant — evidence against null hypothesis"),
            (0.05, "Significant — moderate evidence against null hypothesis"),
            (0.10, "Marginally significant — weak evidence against null hypothesis"),
        )
        for cutoff, verdict in verdicts:
            if p < cutoff:
                return verdict
        return "Not significant — insufficient evidence to reject null"

    @staticmethod
    def confidence_interval(rate, n, z=1.96):
        """Return ``(lower, upper)`` for a proportion, clipped to [0, 1].

        Uses ``rate ± z * sqrt(rate * (1 - rate) / n)``; when ``n`` is not
        positive the standard error is taken as 0.
        """
        import math

        if n > 0:
            std_err = math.sqrt(rate * (1 - rate) / n)
        else:
            std_err = 0
        low = rate - z * std_err
        high = rate + z * std_err
        return (max(0, low), min(1, high))

    @staticmethod
    def explain_errors():
        """Print a short primer on Type I/II errors and statistical power."""
        lines = (
            "Type I Error (False Positive): Rejecting null when it's true",
            "- Risk = alpha (typically 5%)",
            "- You declare a winner but there's no real difference",
            "Type II Error (False Negative): Failing to reject null when it's false",
            "- Risk = beta (typically 20%)",
            "- You miss a real winner (insufficient sample size)",
            "Statistical Power: Probability of detecting a true effect (1 - beta)",
            "- Target: 80% typically",
            "- Power increases with: larger sample, larger effect, higher baseline",
        )
        # The text is framed by one leading and one trailing newline.
        print("\n" + "\n".join(lines) + "\n")
# Demo: interpret a single p-value.
exp = SignificanceExplainer()
print("p=0.03:", exp.explain_p_value(0.03))
# Confidence interval example: interval around an observed rate of 6.7%
# measured on 30,000 visitors (default z=1.96).
rate = 0.067
n = 30000
ci = exp.confidence_interval(rate, n)
# The bounds come back as fractions; multiply by 100 to print percentages.
print(f"\n95% CI for {rate*100:.1f}% conversion (n={n}): "
      f"[{ci[0]*100:.2f}%, {ci[1]*100:.2f}%]")
exp.explain_errors()
Expected output:
p=0.03: Significant — moderate evidence against null hypothesis
95% CI for 6.7% conversion (n=30000): [6.42%, 6.98%]
Type I Error (False Positive): Rejecting null when it's true
- Risk = alpha (typically 5%)
- You declare a winner but there's no real difference
Type II Error (False Negative): Failing to reject null when it's false
- Risk = beta (typically 20%)
- You miss a real winner (insufficient sample size)
Statistical Power: Probability of detecting a true effect (1 - beta)
- Target: 80% typically
- Power increases with: larger sample, larger effect, higher baseline
Multivariate Testing (MVT)
MVT tests multiple changes simultaneously to find the best combination of elements.
class MultivariateTest:
    """Simulate a multivariate test with multiple factors."""

    def __init__(self, name):
        self.name = name
        self.factors = {}
        self.combinations = []

    def add_factor(self, name, levels):
        """Add a test factor with its variants (replaces a factor of the same name)."""
        self.factors[name] = levels

    def generate_combinations(self):
        """Generate all combinations of factor levels.

        The list is rebuilt from scratch on every call, so calling this
        twice does not accumulate duplicate combinations. Each combination
        is a dict mapping factor name to the chosen level.
        """
        import itertools

        factor_names = list(self.factors.keys())
        level_lists = [self.factors[f] for f in factor_names]
        self.combinations = [
            dict(zip(factor_names, combo))
            for combo in itertools.product(*level_lists)
        ]
        return self.combinations

    def analyze(self, results):
        """Find the best performing combination and print main effects.

        Args:
            results: Mapping of combination key to conversion rate (percent).
                Keys may be ``str(combination)`` for combinations produced by
                ``generate_combinations``, or any text that contains
                ``factor=level`` pairs.

        Returns:
            A dict with ``"best"`` (the ``(key, rate)`` pair with the highest
            rate) and ``"main_effects"`` (``{factor: {level: average rate}}``;
            a level with no matching result averages to 0).

        Raises:
            ValueError: If ``results`` is empty.
        """
        if not results:
            raise ValueError("results must contain at least one combination")

        best = max(results.items(), key=lambda x: x[1])
        print(f"Best combination: {best[0]}")
        print(f"With conversion rate: {best[1]:.2f}%")

        # Map the string form of each generated combination back to its dict
        # so levels can be compared exactly rather than by substring.
        known = {str(combo): combo for combo in self.combinations}

        def has_level(key, factor, level):
            combo = known.get(key)
            if combo is not None:
                return combo.get(factor) == level
            # Fallback for keys that were not produced by this object: accept
            # either "factor=level" or the dict-repr form "'factor': 'level'".
            text = str(key)
            return f"{factor}={level}" in text or f"{factor!r}: {level!r}" in text

        # Main effects analysis: average rate over all results sharing a level
        main_effects = {}
        print(f"\nMain effects:")
        for factor, levels in self.factors.items():
            main_effects[factor] = {}
            for level in levels:
                matching_combo = [k for k in results if has_level(k, factor, level)]
                avg = sum(results[c] for c in matching_combo) / len(matching_combo) if matching_combo else 0
                main_effects[factor][level] = avg
                print(f" {factor}={level}: avg {avg:.2f}%")
        return {"best": best, "main_effects": main_effects}
# Demo: three factors with 3, 3 and 2 levels give 3 * 3 * 2 = 18 combinations.
mvt = MultivariateTest("Landing Page Optimization")
mvt.add_factor("headline", ["long", "short", "question"])
mvt.add_factor("cta_color", ["blue", "green", "red"])
mvt.add_factor("image", ["hero", "product"])
combos = mvt.generate_combinations()
print(f"Total combinations: {len(combos)}")
# Show only the first three combinations as a sample.
for combo in combos[:3]:
    print(f" {combo}")
# Simulated results: each combination gets a random rate between 3 and 12,
# keyed by the string form of its dict. No seed is set, so the values change
# on every run.
import random
results = {str(c): round(random.uniform(3, 12), 1) for c in combos}
mvt.analyze(results)
Expected output:
Total combinations: 18
{'headline': 'long', 'cta_color': 'blue', 'image': 'hero'}
{'headline': 'long', 'cta_color': 'blue', 'image': 'product'}
{'headline': 'long', 'cta_color': 'green', 'image': 'hero'}
Best combination: {'headline': 'question', 'cta_color': 'green', 'image': 'product'}
With conversion rate: 10.50%
Main effects:
headline=long: avg 7.23%
headline=short: avg 8.05%
headline=question: avg 9.01%
cta_color=blue: avg 7.12%
cta_color=green: avg 8.89%
cta_color=red: avg 8.78%
image=hero: avg 7.54%
image=product: avg 8.94%
A/B Testing Tools Comparison
| Tool | Best For | Pricing | Ease of Use |
|---|---|---|---|
| Google Optimize (discontinued Sept 2023) | GA integration | Free | Easy |
| Optimizely | Enterprise | $$$ | Medium |
| VWO | Visual editor | $$ | Easy |
| Convert | Privacy-first | $$ | Medium |
| Unbounce | Landing pages | $$ | Easy |
| Custom (in-house) | Full control | Dev time | Hard |
Common Mistakes
1. Peeking at Results
Checking significance every hour and stopping when p<0.05 inflates false positives. Decide sample size upfront and wait.
2. Running Tests Too Short
Minimum 1-2 full business cycles (7-14 days). Weekend vs weekday behavior differs. Don’t stop at 2 days.
3. Multiple Comparison Problem
Testing 10 variants vs control requires adjusted significance thresholds (Bonferroni correction: α/n comparisons).
4. Segmentation Without Planning
Finding significance only in “mobile users in California” post-hoc is data dredging. Pre-register segments.
5. Novelty Effect
Users behave differently with new designs (click everything). Run tests 2+ weeks to let novelty wear off.
Practice Questions
What is the difference between statistical significance and practical significance? Statistical significance means the result is unlikely due to chance. Practical significance means the effect size is large enough to matter for business (e.g., 0.1% lift may be significant but not worth implementing).
What sample size do you need for a test with 5% baseline, 10% relative MDE, 95% confidence, 80% power? Approximately 31,000 visitors per variant. Required sample size grows with the inverse square of the MDE: halving the MDE roughly quadruples the visitors needed.
What is the multiple comparison problem in MVT? Testing many variants increases the chance of false positives. With 10 variants, the family-wise error rate is 1-(0.95)^10 ≈ 40%.
When should you use multivariate testing vs A/B testing? Use A/B for simple binary changes (red vs blue button). Use MVT when testing interactions between multiple elements (headline × image × CTA).
Challenge: Design an A/B testing program for a SaaS landing page with 100,000 monthly visitors. What would you test first and how long would you run each test?
Mini Project: A/B Test Calculator
# ab_calculator.py
# A complete A/B test significance calculator
import math
from scipy import stats
class ABTestCalculator:
    """Significance calculator for a finished two-variant A/B test.

    Holds the visitor and conversion counts of the control and the variant
    and evaluates them with a two-sided, pooled two-proportion z-test.
    """

    def __init__(self, control_visitors, control_conversions,
                 variant_visitors, variant_conversions):
        self.cv = control_visitors
        self.cc = control_conversions
        self.vv = variant_visitors
        self.vc = variant_conversions

    def calculate(self):
        """Compute rates, lift, p-value and a 95% CI for the difference.

        Returns:
            A dict with ``control_rate``, ``variant_rate``, ``absolute_lift``
            (percentage points) and ``relative_lift`` in percent, the
            two-sided ``p_value``, a plain-bool ``significant`` flag
            (p < 0.05) and ``ci_95``, the (lower, upper) bounds in
            percentage points of the variant-minus-control difference.

        Raises:
            ValueError: If a visitor count is not positive or a conversion
                count lies outside ``0..visitors``.
        """
        # Validate up front so bad counts fail with a clear message rather
        # than a ZeroDivisionError or a math domain error further down.
        if self.cv <= 0 or self.vv <= 0:
            raise ValueError("visitor counts must be positive")
        if not 0 <= self.cc <= self.cv or not 0 <= self.vc <= self.vv:
            raise ValueError("conversions must be between 0 and the visitor count")

        c_rate = self.cc / self.cv
        v_rate = self.vc / self.vv
        diff = v_rate - c_rate

        # Pooled standard error under the null hypothesis of equal rates
        p_pool = (self.cc + self.vc) / (self.cv + self.vv)
        se = math.sqrt(p_pool * (1 - p_pool) * (1 / self.cv + 1 / self.vv))
        z = diff / se if se > 0 else 0
        # Survival function instead of 1 - cdf avoids cancellation for large
        # |z|; float() keeps numpy scalar types out of the returned dict.
        p_value = float(2 * stats.norm.sf(abs(z)))
        lift = (diff / c_rate * 100) if c_rate > 0 else 0

        # Confidence interval for the difference uses the unpooled standard
        # error, because it does not assume the two rates are equal.
        diff_se = math.sqrt(c_rate * (1 - c_rate) / self.cv +
                            v_rate * (1 - v_rate) / self.vv)
        diff_lower = diff - 1.96 * diff_se
        diff_upper = diff + 1.96 * diff_se
        return {
            "control_rate": round(c_rate * 100, 2),
            "variant_rate": round(v_rate * 100, 2),
            "absolute_lift": round(diff * 100, 2),
            "relative_lift": round(lift, 2),
            "p_value": round(p_value, 4),
            "significant": p_value < 0.05,
            "ci_95": (round(diff_lower * 100, 2), round(diff_upper * 100, 2)),
        }
# Demo: 10,000 visitors per arm, 500 control conversions vs 580 variant
# conversions (5.0% vs 5.8%).
test = ABTestCalculator(10000, 500, 10000, 580)
result = test.calculate()
print("A/B Test Results")
print(f"Control: {result['control_rate']}% | Variant: {result['variant_rate']}%")
# relative_lift is a percentage of the control rate; absolute_lift is in
# percentage points (pp).
print(f"Lift: {result['relative_lift']}% (absolute: {result['absolute_lift']}pp)")
print(f"p-value: {result['p_value']} | Significant: {result['significant']}")
print(f"95% CI: [{result['ci_95'][0]}%, {result['ci_95'][1]}%]")
What’s Next
Built by the developers of Doda Browser, DodaZIP, and Durga Antivirus Pro.
Built by the developers of DodaTech
Doda Browser, DodaZIP & Durga Antivirus Pro