{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Statistische Inferenz: Bootstratp Konfidenzintervalle\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Patron Type CodePatron Type DefinitionTotal CheckoutsTotal RenewalsAge RangeHome Library CodeHome Library DefinitionCirculation Active MonthCirculation Active YearNotice Preference CodeNotice Preference DefinitionProvided Email AddressYear Patron RegisteredOutside of CountySupervisor District
20ADULT312225 to 34 yearsS7SunsetApril2016.0zemailTrue2010False4.0
150ADULT22645 to 54 yearsM2MarinaMay2016.0zemailTrue2010False2.0
460ADULT471625 to 34 yearsM4MercedJune2016.0pphoneFalse2010False11.0
480ADULT873825 to 34 yearsW2West PortalFebruary2016.0zemailTrue2010False7.0
1550ADULT281545 to 54 yearsS7SunsetJune2016.0zemailTrue2010False4.0
\n", "
" ], "text/plain": [ " Patron Type Code Patron Type Definition Total Checkouts Total Renewals \\\n", "2 0 ADULT 31 22 \n", "15 0 ADULT 22 6 \n", "46 0 ADULT 47 16 \n", "48 0 ADULT 87 38 \n", "155 0 ADULT 28 15 \n", "\n", " Age Range Home Library Code Home Library Definition \\\n", "2 25 to 34 years S7 Sunset \n", "15 45 to 54 years M2 Marina \n", "46 25 to 34 years M4 Merced \n", "48 25 to 34 years W2 West Portal \n", "155 45 to 54 years S7 Sunset \n", "\n", " Circulation Active Month Circulation Active Year Notice Preference Code \\\n", "2 April 2016.0 z \n", "15 May 2016.0 z \n", "46 June 2016.0 p \n", "48 February 2016.0 z \n", "155 June 2016.0 z \n", "\n", " Notice Preference Definition Provided Email Address \\\n", "2 email True \n", "15 email True \n", "46 phone False \n", "48 email True \n", "155 email True \n", "\n", " Year Patron Registered Outside of County Supervisor District \n", "2 2010 False 4.0 \n", "15 2010 False 2.0 \n", "46 2010 False 11.0 \n", "48 2010 False 7.0 \n", "155 2010 False 4.0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "df = pd.read_csv(\"../data/Library_Usage.csv\", na_values=\"None\")\n", "df = df[(\n", " (df['Year Patron Registered'] == 2010) & \n", " (df['Circulation Active Year'] == 2016)\n", ")]\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CI for the mean of `Total Renewals`" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "91.6004630193737" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Total Renewals'].mean()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "bt_means = []\n", "alpha = 0.10\n", "S= 10000\n", "for i in range(S):\n", " stat = df['Total Renewals'].sample(len(df), replace=True).mean()\n", " bt_means.append(stat)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4437" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(bt_means).between(89, 92).mean()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.05 87.807469\n", "0.95 95.467436\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(bt_means).quantile((alpha/2, 1-alpha/2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Case-Study: CI for young and senior library users" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(507, 698)\n", "324.55424063116374 327.4928366762178\n", "140.0 116.5\n", "295188.887862416 385846.37369323324\n" ] } ], "source": [ "young = df['Total Checkouts'][df['Patron Type Definition'] == 'YOUNG ADULT']\n", "senior = df['Total Checkouts'][df['Patron Type Definition'] == 'SENIOR']\n", "print((len(young), len(senior)))\n", "print(young.mean(), senior.mean())\n", "print(young.median(), senior.median())\n", "print(young.var(), senior.var())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Bootstrap CI for the median difference" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.05 -1.5\n", "0.95 49.5\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bt_diffs = []\n", "alpha = 0.1\n", "repl = 10000\n", "for i in range(repl):\n", " x = young.sample(len(young), replace=True).median()\n", " y = senior.sample(len(senior), replace=True).median()\n", " bt_diffs.append(x - y)\n", "pd.Series(bt_diffs).quantile((alpha/2, 1-alpha/2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Bootstrap CI for the mean difference" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.05 -59.918641\n", "0.95 52.128678\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bt_diffs = []\n", "alpha = 0.1\n", "repl = 10000\n", "for i in range(repl):\n", " x = young.sample(len(young), replace=True).mean()\n", " y = senior.sample(len(senior), replace=True).mean()\n", " bt_diffs.append(x- y)\n", "pd.Series(bt_diffs).quantile((alpha/2, 1-alpha/2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Theoretically derived confidence intervals for the difference in means" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(-58.397892481879445, 52.520700391771335)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import statsmodels.stats.api as sms\n", "\n", "cm = sms.CompareMeans(sms.DescrStatsW(young), \n", " sms.DescrStatsW(senior))\n", "cm.tconfint_diff(usevar='unequal', alpha=0.10)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }