# Statistische Inferenz: Bootstrap-Konfidenzintervalle



In [2]:
import pandas as pd
import numpy as np

# Read the library usage export; the literal string "None" is parsed as missing.
df = pd.read_csv("../data/Library_Usage.csv", na_values="None")

# Restrict to rows with registration year 2010 and circulation-active year 2016.
keep = (
    df['Year Patron Registered'].eq(2010)
    & df['Circulation Active Year'].eq(2016)
)
df = df.loc[keep]
df.head()

Unnamed: 0,Patron Type Code,Patron Type Definition,Total Checkouts,Total Renewals,Age Range,Home Library Code,Home Library Definition,Circulation Active Month,Circulation Active Year,Notice Preference Code,Notice Preference Definition,Provided Email Address,Year Patron Registered,Outside of County,Supervisor District
2,0,ADULT,31,22,25 to 34 years,S7,Sunset,April,2016.0,z,email,True,2010,False,4.0
15,0,ADULT,22,6,45 to 54 years,M2,Marina,May,2016.0,z,email,True,2010,False,2.0
46,0,ADULT,47,16,25 to 34 years,M4,Merced,June,2016.0,p,phone,False,2010,False,11.0
48,0,ADULT,87,38,25 to 34 years,W2,West Portal,February,2016.0,z,email,True,2010,False,7.0
155,0,ADULT,28,15,45 to 54 years,S7,Sunset,June,2016.0,z,email,True,2010,False,4.0


## CI for the mean of `Total Renewals`

In [4]:
# Point estimate: sample mean of the `Total Renewals` column.
renewals = df['Total Renewals']
renewals.mean()

91.6004630193737

In [5]:
# Bootstrap distribution of the mean of `Total Renewals`.
alpha = 0.10  # significance level; later cells use it for the (1 - alpha) interval
S = 10000     # number of bootstrap replicates

# Seeded generator so that re-running the notebook reproduces the same
# replicates (and therefore the same interval).
rng = np.random.default_rng(42)

# Look the column up once instead of on every iteration.
renewals = df['Total Renewals']

# Each replicate: resample with replacement at the original sample size and
# record the mean of the resample.
bt_means = [
    renewals.sample(len(renewals), replace=True, random_state=rng).mean()
    for _ in range(S)
]

In [6]:
# Fraction of bootstrap means that fall inside [89, 92] (both bounds inclusive).
boot_means = pd.Series(bt_means)
boot_means.between(89, 92).mean()

0.4437

In [7]:
# Percentile bootstrap interval: cut alpha/2 of the replicates off each tail.
lower_q, upper_q = alpha / 2, 1 - alpha / 2
pd.Series(bt_means).quantile((lower_q, upper_q))

0.05    87.807469
0.95    95.467436
dtype: float64

## Case-Study: CI for young and senior library users

In [8]:
# Total checkouts for the two patron groups being compared.
patron_type = df['Patron Type Definition']
young = df.loc[patron_type == 'YOUNG ADULT', 'Total Checkouts']
senior = df.loc[patron_type == 'SENIOR', 'Total Checkouts']

# Group sizes first, then one line each for mean, median and variance,
# always in the order (young, senior).
print((len(young), len(senior)))
for summarize in (pd.Series.mean, pd.Series.median, pd.Series.var):
    print(summarize(young), summarize(senior))

(507, 698)
324.55424063116374 327.4928366762178
140.0 116.5
295188.887862416 385846.37369323324


### Bootstrap CI for the difference in medians

In [9]:
# Percentile bootstrap for median(young) - median(senior).
alpha = 0.1   # significance level -> (1 - alpha) interval
repl = 10000  # number of bootstrap replicates

# Seeded generator: re-running this cell yields the same replicates, so the
# reported interval is reproducible.
rng = np.random.default_rng(42)

bt_diffs = []
for _ in range(repl):
    # Resample each group on its own, with replacement, at its original size.
    x = young.sample(len(young), replace=True, random_state=rng).median()
    y = senior.sample(len(senior), replace=True, random_state=rng).median()
    bt_diffs.append(x - y)

# Lower and upper alpha/2 quantiles of the replicate differences.
pd.Series(bt_diffs).quantile((alpha/2, 1-alpha/2))

0.05    -1.5
0.95    49.5
dtype: float64

### Bootstrap CI for the difference in means

In [10]:
# Percentile bootstrap for mean(young) - mean(senior).
alpha = 0.1   # significance level -> (1 - alpha) interval
repl = 10000  # number of bootstrap replicates

# Seeded generator: re-running this cell yields the same replicates, so the
# reported interval is reproducible.
rng = np.random.default_rng(42)

bt_diffs = []
for _ in range(repl):
    # Resample each group on its own, with replacement, at its original size.
    x = young.sample(len(young), replace=True, random_state=rng).mean()
    y = senior.sample(len(senior), replace=True, random_state=rng).mean()
    bt_diffs.append(x - y)

# Lower and upper alpha/2 quantiles of the replicate differences.
pd.Series(bt_diffs).quantile((alpha/2, 1-alpha/2))

0.05   -59.918641
0.95    52.128678
dtype: float64

## Theoretically derived confidence intervals for the difference in means

In [11]:
import statsmodels.stats.api as sms

# t-based interval for mean(young) - mean(senior), not assuming equal
# variances (usevar='unequal'), at the same alpha as the bootstrap cells.
young_stats = sms.DescrStatsW(young)
senior_stats = sms.DescrStatsW(senior)
cm = sms.CompareMeans(young_stats, senior_stats)
cm.tconfint_diff(alpha=0.10, usevar='unequal')

(-58.397892481879445, 52.520700391771335)