Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions obbba_district_impacts/Congressional-Hackathon-2025
Submodule Congressional-Hackathon-2025 added at 3f6d05
397 changes: 397 additions & 0 deletions us/states/pa/data_exploration.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,397 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PA Dataset Exploration\n",
"\n",
"This notebook explores the PolicyEngine Pennsylvania (PA) dataset: weighted household and person counts, the household AGI distribution, households with children, children by age, and household counts by income bracket."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from policyengine_us import Microsimulation\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Location of the PA state dataset (PA.h5) passed to Microsimulation below;\n",
"# the hf:// scheme presumably points at a Hugging Face repository - confirm.\n",
"PA_DATASET = \"hf://policyengine/policyengine-us-data/states/PA.h5\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5801ad26ee654449ab3be3dc62d09e8b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"PA.h5: 0%| | 0.00/149M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load PA dataset\n",
"# Builds the Microsimulation object that every later cell queries through\n",
"# sim.calculate(...); the saved output shows PA.h5 (149M) being fetched on this run.\n",
"sim = Microsimulation(dataset=PA_DATASET)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of households in dataset: 68,351\n",
"Household count (weighted): 4,662,650\n",
"Person count (weighted): 13,217,679\n"
]
}
],
"source": [
"# Dataset size: household record count plus household and person totals\n",
"household_weight = sim.calculate(\"household_weight\", period=2025)\n",
"household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n",
"person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n",
"\n",
"# len() gives the number of household records; the .sum() results are\n",
"# reported as the weighted household and person totals.\n",
"size_report = [\n",
"    f\"Number of households in dataset: {len(household_weight):,}\",\n",
"    f\"Household count (weighted): {household_count.sum():,.0f}\",\n",
"    f\"Person count (weighted): {person_count.sum():,.0f}\",\n",
"]\n",
"print(\"\\n\".join(size_report))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Income distribution:\n",
" Median AGI: $73,962\n",
" 75th percentile: $169,351\n",
" 90th percentile: $404,412\n",
" 95th percentile: $511,573\n",
" Max AGI: $3,229,514\n"
]
}
],
"source": [
"# Household-level AGI for 2025: median, upper percentiles and maximum\n",
"agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
"\n",
"print(\"Income distribution:\")\n",
"print(f\" Median AGI: ${agi.median():,.0f}\")\n",
"# Upper-tail percentiles share one format, so they are printed in a loop\n",
"for pct in (75, 90, 95):\n",
"    print(f\" {pct}th percentile: ${agi.quantile(pct / 100):,.0f}\")\n",
"print(f\" Max AGI: ${agi.max():,.0f}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Households with children (weighted):\n",
" Total households with children: 1,489,087\n",
" Households with 1 child: 720,370\n",
" Households with 2 children: 504,238\n",
" Households with 3+ children: 264,479\n"
]
}
],
"source": [
"# Check households with children\n",
"# Person-level records: a child flag, the owning household id, and the household\n",
"# weight mapped onto each person so it survives the per-household aggregation.\n",
"is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n",
"household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n",
"# Kept under its own name so the household-level `household_weight` defined in the\n",
"# dataset-size cell is not silently replaced by a person-level series.\n",
"household_weight_by_person = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n",
"\n",
"# Create DataFrame\n",
"df_households = pd.DataFrame({\n",
"    'household_id': household_id,\n",
"    'is_child': is_child,\n",
"    'household_weight': household_weight_by_person\n",
"})\n",
"\n",
"# Count children per household; every member row carries the same household\n",
"# weight, so 'first' recovers one weight per household.\n",
"children_per_household = df_households.groupby('household_id').agg({\n",
"    'is_child': 'sum',\n",
"    'household_weight': 'first'\n",
"}).reset_index()\n",
"\n",
"# Calculate weighted household counts by summing household weights per group\n",
"n_children = children_per_household['is_child']\n",
"hh_weight = children_per_household['household_weight']\n",
"\n",
"total_households_with_children = hh_weight[n_children > 0].sum()\n",
"households_with_1_child = hh_weight[n_children == 1].sum()\n",
"households_with_2_children = hh_weight[n_children == 2].sum()\n",
"households_with_3plus_children = hh_weight[n_children >= 3].sum()\n",
"\n",
"print(\"\\nHouseholds with children (weighted):\")\n",
"print(f\" Total households with children: {total_households_with_children:,.0f}\")\n",
"print(f\" Households with 1 child: {households_with_1_child:,.0f}\")\n",
"print(f\" Households with 2 children: {households_with_2_children:,.0f}\")\n",
"print(f\" Households with 3+ children: {households_with_3plus_children:,.0f}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Children by age:\n",
" Total children under 18: 2,597,022\n",
" Children under 6: 799,168\n"
]
}
],
"source": [
"# Check children by age groups\n",
"# Every variable is requested for 2025 explicitly so these counts use the same\n",
"# period as the other cells rather than the simulation's default period.\n",
"person_columns = [\"household_id\", \"tax_unit_id\", \"person_id\", \"age\", \"person_weight\"]\n",
"df = pd.DataFrame({\n",
"    column: sim.calculate(column, period=2025, map_to=\"person\")\n",
"    for column in person_columns\n",
"})\n",
"\n",
"# Filter person rows by age threshold\n",
"children_under_18_df = df[df['age'] < 18]\n",
"children_under_6_df = df[df['age'] < 6]\n",
"\n",
"# Calculate weighted totals by summing person weights within each age group\n",
"total_children = children_under_18_df['person_weight'].sum()\n",
"children_under_6 = children_under_6_df['person_weight'].sum()\n",
"\n",
"print(\"\\nChildren by age:\")\n",
"print(f\" Total children under 18: {total_children:,.0f}\")\n",
"print(f\" Children under 6: {children_under_6:,.0f}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"============================================================\n",
"PA DATASET SUMMARY - WEIGHTED (Population Estimates)\n",
"============================================================\n",
" Metric Value\n",
" Household count (weighted) 4,662,650\n",
" Person count (weighted) 13,217,679\n",
" Median AGI $73,962\n",
" 75th percentile AGI $169,351\n",
" 90th percentile AGI $404,412\n",
" 95th percentile AGI $511,573\n",
" Max AGI $3,229,514\n",
"Total households with children 1,489,087\n",
" Households with 1 child 720,370\n",
" Households with 2 children 504,238\n",
" Households with 3+ children 264,479\n",
" Total children under 18 2,597,022\n",
" Children under 6 799,168\n",
"============================================================\n",
"\n",
"Summary saved to: pa_dataset_summary_weighted.csv\n"
]
}
],
"source": [
"# Weighted summary table: one (metric, formatted value) row per statistic,\n",
"# built from the values computed in the earlier cells.\n",
"summary_rows = [\n",
"    ('Household count (weighted)', f\"{household_count.sum():,.0f}\"),\n",
"    ('Person count (weighted)', f\"{person_count.sum():,.0f}\"),\n",
"    ('Median AGI', f\"${agi.median():,.0f}\"),\n",
"    ('75th percentile AGI', f\"${agi.quantile(0.75):,.0f}\"),\n",
"    ('90th percentile AGI', f\"${agi.quantile(0.90):,.0f}\"),\n",
"    ('95th percentile AGI', f\"${agi.quantile(0.95):,.0f}\"),\n",
"    ('Max AGI', f\"${agi.max():,.0f}\"),\n",
"    ('Total households with children', f\"{total_households_with_children:,.0f}\"),\n",
"    ('Households with 1 child', f\"{households_with_1_child:,.0f}\"),\n",
"    ('Households with 2 children', f\"{households_with_2_children:,.0f}\"),\n",
"    ('Households with 3+ children', f\"{households_with_3plus_children:,.0f}\"),\n",
"    ('Total children under 18', f\"{total_children:,.0f}\"),\n",
"    ('Children under 6', f\"{children_under_6:,.0f}\"),\n",
"]\n",
"\n",
"weighted_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])\n",
"\n",
"rule = \"=\" * 60\n",
"print(\"\\n\" + rule)\n",
"print(\"PA DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n",
"print(rule)\n",
"print(weighted_df.to_string(index=False))\n",
"print(rule)\n",
"\n",
"# Write the table to a CSV at a path relative to the working directory\n",
"summary_path = 'pa_dataset_summary_weighted.csv'\n",
"weighted_df.to_csv(summary_path, index=False)\n",
"print(\"\\nSummary saved to: pa_dataset_summary_weighted.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================================================================\n",
"HOUSEHOLDS WITH $0 INCOME\n",
"======================================================================\n",
"Household count: 368,283\n",
"Percentage of all households: 7.90%\n",
"======================================================================\n"
]
}
],
"source": [
"# Households whose AGI is exactly $0 (the mask is an equality test, so\n",
"# negative AGI is not counted here)\n",
"# agi_hh, weights and total_households are reused by the income-bracket cell.\n",
"agi_hh = np.array(sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\"))\n",
"weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
"total_households = weights.sum()\n",
"\n",
"zero_income_mask = agi_hh == 0\n",
"zero_income_count = weights[zero_income_mask].sum()\n",
"zero_income_pct = zero_income_count / total_households * 100\n",
"\n",
"divider = \"=\" * 70\n",
"print(\"\\n\" + divider)\n",
"print(\"HOUSEHOLDS WITH $0 INCOME\")\n",
"print(divider)\n",
"print(f\"Household count: {zero_income_count:,.0f}\")\n",
"print(f\"Percentage of all households: {zero_income_pct:.2f}%\")\n",
"print(divider)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================================================================\n",
"HOUSEHOLD COUNTS BY INCOME BRACKET\n",
"======================================================================\n",
"Income Bracket Households % of All Households\n",
" $0-$10k 786,029 16.86%\n",
" $10k-$20k 177,932 3.82%\n",
" $20k-$30k 151,871 3.26%\n",
" $30k-$40k 394,030 8.45%\n",
" $40k-$50k 240,967 5.17%\n",
" $50k-$60k 200,283 4.30%\n",
"======================================================================\n",
"\n",
"Total households in $0-$60k range: 1,951,112\n",
"Percentage of all households in $0-$60k range: 41.85%\n"
]
}
],
"source": [
"# Household counts by income brackets\n",
"# Each entry is (lower bound, inclusive; upper bound, exclusive; display label).\n",
"income_brackets = [\n",
"    (0, 10000, \"$0-$10k\"),\n",
"    (10000, 20000, \"$10k-$20k\"),\n",
"    (20000, 30000, \"$20k-$30k\"),\n",
"    (30000, 40000, \"$30k-$40k\"),\n",
"    (40000, 50000, \"$40k-$50k\"),\n",
"    (50000, 60000, \"$50k-$60k\")\n",
"]\n",
"\n",
"# Sum of household weights per bracket, keyed by label; dict insertion order\n",
"# keeps the brackets in the order listed above.\n",
"bracket_totals = {\n",
"    label: weights[(agi_hh >= lower) & (agi_hh < upper)].sum()\n",
"    for lower, upper, label in income_brackets\n",
"}\n",
"\n",
"bracket_data = [\n",
"    {\n",
"        \"Income Bracket\": label,\n",
"        \"Households\": f\"{count:,.0f}\",\n",
"        \"% of All Households\": f\"{(count / total_households) * 100:.2f}%\"\n",
"    }\n",
"    for label, count in bracket_totals.items()\n",
"]\n",
"\n",
"income_df = pd.DataFrame(bracket_data)\n",
"\n",
"divider = \"=\" * 70\n",
"print(\"\\n\" + divider)\n",
"print(\"HOUSEHOLD COUNTS BY INCOME BRACKET\")\n",
"print(divider)\n",
"print(income_df.to_string(index=False))\n",
"print(divider)\n",
"\n",
"# Total in $0-$60k range: add up the per-bracket totals computed above\n",
"total_in_range = sum(bracket_totals.values())\n",
"print(f\"\\nTotal households in $0-$60k range: {total_in_range:,.0f}\")\n",
"print(f\"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading