awk — Probability & Combinatorics

Factorial
# Factorial of the integer read from stdin; awk numbers are floating point,
# so the result is printed with %.0f to avoid exponent notation.
printf '10\n' | awk '{
    n = $1
    fact = 1
    k = 2
    while (k <= n) {        # ascending product 2*3*...*n; stays 1 when n < 2
        fact *= k
        k++
    }
    printf "%d! = %.0f\n", n, fact
}'
# 10! = 3628800
Permutations P(n,r) — ordered selections
# P(10,3) = 10!/(10-3)! = 720
# P(n,r): product of the r factors n, n-1, ..., n-r+1 (no full factorials needed).
awk 'BEGIN{
    n = 10
    r = 3
    perms = 1
    factor = n
    floor = n - r            # first value that is NOT part of the product
    while (factor > floor) {
        perms *= factor
        factor--
    }
    printf "P(%d,%d) = %.0f\n", n, r, perms
}'
Combinations C(n,r) — unordered selections
# C(10,3) = 10!/(3!×7!) = 120
# C(n,r): falling product n*(n-1)*...*(n-r+1) divided by r!.
awk 'BEGIN{
    n = 10
    r = 3
    top = 1         # n * (n-1) * ... * (n-r+1)
    bottom = 1      # 1 * 2 * ... * r
    step = 1
    while (step <= r) {
        top *= (n - step + 1)
        bottom *= step
        step++
    }
    printf "C(%d,%d) = %.0f\n", n, r, top / bottom
}'
Password entropy — bits of entropy for a given character set and length
# How strong is a 16-char password from 95 printable ASCII characters?
# Entropy in bits of a random password: len * log2(charset).
awk 'BEGIN{
    # "length" is a built-in awk function name and cannot be used as a
    # variable, so the password length is stored in "len".
    charset=95; len=16
    # awk only provides the natural log; dividing by log(2) converts to bits.
    entropy=len*log(charset)/log(2)
    printf "%d chars from %d symbols = %.1f bits of entropy\n", len, charset, entropy
    # Second exponent converts bits to decimal digits: bits * log10(2).
    printf "keyspace: 2^%.0f ≈ 10^%.0f\n", entropy, entropy*log(2)/log(10)
}'
Binomial probability — P(X=k) = C(n,k) * p^k * (1-p)^(n-k)
# 10 servers, each with an independent 1% chance of failure. P(exactly 2 fail)?
# Binomial point probability P(X = hits) for a fixed number of trials.
awk 'BEGIN{
    trials = 10; hits = 2; p_event = 0.01
    # Binomial coefficient C(trials, hits), built one ratio at a time.
    coeff = 1
    j = 1
    while (j <= hits) {
        coeff *= (trials - j + 1) / j
        j++
    }
    p_hits = p_event ^ hits                      # the chosen trials all hit
    p_rest = (1 - p_event) ^ (trials - hits)     # the remaining trials all miss
    prob = coeff * p_hits * p_rest
    printf "P(X=%d) = %.6f (%.4f%%)\n", hits, prob, prob * 100
}'
Binomial distribution table — all outcomes
# 5 deployments, 90% success rate each
# Full binomial table: one row per outcome k = 0..trials with the point
# probability and the running cumulative probability.
awk 'BEGIN{
    trials = 5; p_ok = 0.9
    q_fail = 1 - p_ok
    printf "%3s %10s %10s\n", "k", "P(X=k)", "P(X<=k)"
    running = 0
    k = 0
    while (k <= trials) {
        # C(trials, k), recomputed from scratch for every row
        coeff = 1
        for (j = 1; j <= k; j++)
            coeff *= (trials - j + 1) / j
        pmf = coeff * p_ok ^ k * q_fail ^ (trials - k)
        running += pmf
        printf "%3d %10.6f %10.6f\n", k, pmf, running
        k++
    }
}'
Expected value and variance
# Dice rolls — mean and (population) variance computed from observed data
# Mean, population variance (E[X^2] - E[X]^2) and standard deviation of the
# values in the here-doc, one observation per line.
awk '
{
    total += $1
    total_sq += $1^2
    count++
}
END{
    mean = total / count
    variance = total_sq / count - mean^2
    printf "E[X]=%.4f Var(X)=%.4f σ=%.4f\n", mean, variance, sqrt(variance)
    printf "(fair die: E=3.5 Var=2.917)\n"
}' <<'EOF'
3
5
2
6
1
4
3
6
2
5
4
1
EOF
Poisson probability — P(X=k) = (lambda^k * e^-lambda) / k!
# Average 3 alerts per hour. P(exactly 5 in an hour)?
# Poisson point probability P(X = k) for an average rate lambda.
awk 'BEGIN{
    lambda=3; k=5
    # k! as a running product (stays 1 for k = 0 or 1)
    f=1; for(i=2;i<=k;i++) f*=i
    prob=(lambda^k * exp(-lambda))/f
    # lambda is a rate and need not be an integer, so it is printed with %g;
    # %d would truncate a value such as 2.5 to 2 in the label while the
    # probability is still computed from 2.5.
    printf "P(X=%d | λ=%g) = %.6f (%.4f%%)\n", k, lambda, prob, prob*100
}'
SLA uptime probability — compound availability
# 3 services in series, each 99.9% uptime
# Availability of a serial chain: every service must be up, so the
# individual availabilities multiply.
awk 'BEGIN{
    services = 3
    uptime = 0.999
    minutes_per_year = 525960        # 365.25 days * 24 h * 60 min
    availability = uptime ^ services
    downtime_fraction = 1 - availability
    printf "%d services @ %.3f%% each\n", services, uptime * 100
    printf "combined availability: %.6f%%\n", availability * 100
    printf "expected downtime: %.2f minutes/year\n", downtime_fraction * minutes_per_year
}'
Monte Carlo estimation of pi
# Monte Carlo estimate of pi: the fraction of random points in the unit
# square that land inside the quarter circle approaches pi/4.
awk 'BEGIN{
    srand()                       # seeded from the clock, so each run differs
    samples = 100000
    hits = 0
    pi_ref = atan2(0, -1)         # reference value of pi for the error column
    for (s = 0; s < samples; s++) {
        px = rand()
        py = rand()
        hits += (px^2 + py^2 <= 1)    # comparison yields 1 inside, 0 outside
    }
    estimate = 4.0 * hits / samples
    printf "π ≈ %.6f (n=%d, error=%.6f)\n", estimate, samples, estimate - pi_ref
}'
Bayes' theorem — test accuracy
# Disease prevalence: 1%. Test sensitivity: 95%. Specificity: 90%.
# P(disease | positive test)?
# Posterior probability of disease given a positive test, via Bayes rule.
awk 'BEGIN{
    prevalence = 0.01; sensitivity = 0.95; specificity = 0.90
    true_pos = sensitivity * prevalence                   # sick and flagged
    false_pos = (1 - specificity) * (1 - prevalence)      # healthy but flagged
    any_pos = true_pos + false_pos
    posterior = true_pos / any_pos
    printf "P(disease|positive) = %.4f (%.2f%%)\n", posterior, posterior * 100
    printf "Most positives are false positives when prevalence is low\n"
}'
Birthday problem — P(collision) for n items in m slots
# How many random session IDs before 50% collision chance? (m = 2^32)
# Birthday problem: smallest n for which n random draws from a pool of m
# values contain a duplicate with probability >= 50%, found by direct
# iteration and compared with the closed-form approximation.
awk 'BEGIN{
    m=2^32
    max_n=100000        # search cap; raise it when trying a larger pool
    p=1.0               # probability that the first n draws are all distinct
    found=0
    for(n=1;n<=max_n;n++){
        p*=(m-n+1)/m    # the n-th draw must avoid the n-1 values already seen
        if(1-p>=0.5){
            # The pool size is printed with %.0f rather than %d: 2^32 does not
            # fit in a 32-bit int and some awk builds clamp %d to 2147483647.
            printf "50%% collision at n=%d (pool=%.0f)\n", n, m
            found=1
            break
        }
    }
    # Say so explicitly instead of printing nothing when the cap is too low.
    if(!found) printf "no 50%% collision within n=%d (pool=%.0f)\n", max_n, m
    printf "approximation: sqrt(2*m*ln(2)) = %.0f\n", sqrt(2*m*log(2))
}'