PolicyEngine · DTrim99 · Nov 19, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025
diff --git a/obbba_district_impacts/Congressional-Hackathon-2025 b/obbba_district_impacts/Congressional-Hackathon-2025
diff --git a/us/states/pa/data_exploration.ipynb b/us/states/pa/data_exploration.ipynb
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PA Dataset Exploration\n",
+    "\n",
+    "This notebook explores the Pennsylvania (PA) dataset for 2025 to understand weighted household and person counts, the household income (AGI) distribution, and the number and ages of children."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from policyengine_us import Microsimulation\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "PA_DATASET = \"hf://policyengine/policyengine-us-data/states/PA.h5\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5801ad26ee654449ab3be3dc62d09e8b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "PA.h5:   0%|          | 0.00/149M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Load PA dataset\n",
+    "sim = Microsimulation(dataset=PA_DATASET)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of households in dataset: 68,351\n",
+      "Household count (weighted): 4,662,650\n",
+      "Person count (weighted): 13,217,679\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Dataset size: number of household records vs. weighted totals\n",
+    "household_weight = sim.calculate(\"household_weight\", period=2025)\n",
+    "household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n",
+    "person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n",
+    "\n",
+    "# Reduce once, then report\n",
+    "n_household_records = len(household_weight)\n",
+    "weighted_households = household_count.sum()\n",
+    "weighted_persons = person_count.sum()\n",
+    "\n",
+    "print(f\"Number of households in dataset: {n_household_records:,}\")\n",
+    "print(f\"Household count (weighted): {weighted_households:,.0f}\")\n",
+    "print(f\"Person count (weighted): {weighted_persons:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Income distribution:\n",
+      "  Median AGI: $73,962\n",
+      "  75th percentile: $169,351\n",
+      "  90th percentile: $404,412\n",
+      "  95th percentile: $511,573\n",
+      "  Max AGI: $3,229,514\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Household-level AGI for 2025 and a few points of its distribution\n",
+    "agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
+    "\n",
+    "print(\"Income distribution:\")\n",
+    "print(f\"  Median AGI: ${agi.median():,.0f}\")\n",
+    "# Upper percentiles share one format, so print them from a small table\n",
+    "for pct_label, q in ((\"75th\", 0.75), (\"90th\", 0.90), (\"95th\", 0.95)):\n",
+    "    print(f\"  {pct_label} percentile: ${agi.quantile(q):,.0f}\")\n",
+    "print(f\"  Max AGI: ${agi.max():,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Households with children (weighted):\n",
+      "  Total households with children: 1,489,087\n",
+      "  Households with 1 child: 720,370\n",
+      "  Households with 2 children: 504,238\n",
+      "  Households with 3+ children: 264,479\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check households with children\n",
+    "# All three series are mapped to person level: one row per person carrying that\n",
+    "# person's household id and household weight.\n",
+    "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n",
+    "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n",
+    "# Distinct name on purpose: `household_weight` already holds the household-level\n",
+    "# series from the dataset-size cell and must not be overwritten with a\n",
+    "# person-level one (re-running that cell's print would then report a wrong count).\n",
+    "person_household_weight = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n",
+    "\n",
+    "# Create DataFrame\n",
+    "df_households = pd.DataFrame({\n",
+    "    'household_id': household_id,\n",
+    "    'is_child': is_child,\n",
+    "    'household_weight': person_household_weight\n",
+    "})\n",
+    "\n",
+    "# Collapse to one row per household: number of children and the household weight.\n",
+    "# 'first' assumes map_to=\"person\" repeats the household's weight on every member's row.\n",
+    "children_per_household = df_households.groupby('household_id').agg({\n",
+    "    'is_child': 'sum',\n",
+    "    'household_weight': 'first'\n",
+    "}).reset_index()\n",
+    "\n",
+    "# After the groupby, 'is_child' holds the child count for each household\n",
+    "n_children = children_per_household['is_child']\n",
+    "hh_weight = children_per_household['household_weight']\n",
+    "\n",
+    "# Calculate weighted household counts\n",
+    "total_households_with_children = hh_weight[n_children > 0].sum()\n",
+    "households_with_1_child = hh_weight[n_children == 1].sum()\n",
+    "households_with_2_children = hh_weight[n_children == 2].sum()\n",
+    "households_with_3plus_children = hh_weight[n_children >= 3].sum()\n",
+    "\n",
+    "print(\"\\nHouseholds with children (weighted):\")\n",
+    "print(f\"  Total households with children: {total_households_with_children:,.0f}\")\n",
+    "print(f\"  Households with 1 child: {households_with_1_child:,.0f}\")\n",
+    "print(f\"  Households with 2 children: {households_with_2_children:,.0f}\")\n",
+    "print(f\"  Households with 3+ children: {households_with_3plus_children:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Children by age:\n",
+      "  Total children under 18: 2,597,022\n",
+      "  Children under 6: 799,168\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check children by age groups\n",
+    "# Every series is requested for 2025 explicitly, like the other cells, instead of\n",
+    "# relying on the simulation's default calculation period.\n",
+    "df = pd.DataFrame({\n",
+    "    \"household_id\": sim.calculate(\"household_id\", period=2025, map_to=\"person\"),\n",
+    "    \"tax_unit_id\": sim.calculate(\"tax_unit_id\", period=2025, map_to=\"person\"),\n",
+    "    \"person_id\": sim.calculate(\"person_id\", period=2025, map_to=\"person\"),\n",
+    "    \"age\": sim.calculate(\"age\", period=2025, map_to=\"person\"),\n",
+    "    \"person_weight\": sim.calculate(\"person_weight\", period=2025, map_to=\"person\")\n",
+    "})\n",
+    "\n",
+    "# Person-level rows for each age group\n",
+    "children_under_18_df = df[df['age'] < 18]\n",
+    "children_under_6_df = df[df['age'] < 6]\n",
+    "\n",
+    "# Weighted totals: sum of person weights within each group\n",
+    "total_children = children_under_18_df['person_weight'].sum()\n",
+    "children_under_6 = children_under_6_df['person_weight'].sum()\n",
+    "\n",
+    "print(\"\\nChildren by age:\")\n",
+    "print(f\"  Total children under 18: {total_children:,.0f}\")\n",
+    "print(f\"  Children under 6: {children_under_6:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "============================================================\n",
+      "PA DATASET SUMMARY - WEIGHTED (Population Estimates)\n",
+      "============================================================\n",
+      "                        Metric      Value\n",
+      "    Household count (weighted)  4,662,650\n",
+      "       Person count (weighted) 13,217,679\n",
+      "                    Median AGI    $73,962\n",
+      "           75th percentile AGI   $169,351\n",
+      "           90th percentile AGI   $404,412\n",
+      "           95th percentile AGI   $511,573\n",
+      "                       Max AGI $3,229,514\n",
+      "Total households with children  1,489,087\n",
+      "       Households with 1 child    720,370\n",
+      "    Households with 2 children    504,238\n",
+      "   Households with 3+ children    264,479\n",
+      "       Total children under 18  2,597,022\n",
+      "              Children under 6    799,168\n",
+      "============================================================\n",
+      "\n",
+      "Summary saved to: pa_dataset_summary_weighted.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Weighted summary table, built from (metric label, formatted value) pairs in display order\n",
+    "summary_rows = [\n",
+    "    ('Household count (weighted)', f\"{household_count.sum():,.0f}\"),\n",
+    "    ('Person count (weighted)', f\"{person_count.sum():,.0f}\"),\n",
+    "    ('Median AGI', f\"${agi.median():,.0f}\"),\n",
+    "    ('75th percentile AGI', f\"${agi.quantile(0.75):,.0f}\"),\n",
+    "    ('90th percentile AGI', f\"${agi.quantile(0.90):,.0f}\"),\n",
+    "    ('95th percentile AGI', f\"${agi.quantile(0.95):,.0f}\"),\n",
+    "    ('Max AGI', f\"${agi.max():,.0f}\"),\n",
+    "    ('Total households with children', f\"{total_households_with_children:,.0f}\"),\n",
+    "    ('Households with 1 child', f\"{households_with_1_child:,.0f}\"),\n",
+    "    ('Households with 2 children', f\"{households_with_2_children:,.0f}\"),\n",
+    "    ('Households with 3+ children', f\"{households_with_3plus_children:,.0f}\"),\n",
+    "    ('Total children under 18', f\"{total_children:,.0f}\"),\n",
+    "    ('Children under 6', f\"{children_under_6:,.0f}\"),\n",
+    "]\n",
+    "\n",
+    "weighted_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])\n",
+    "\n",
+    "# Print the table between horizontal rules\n",
+    "rule = \"=\" * 60\n",
+    "print(\"\\n\" + rule)\n",
+    "print(\"PA DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n",
+    "print(rule)\n",
+    "print(weighted_df.to_string(index=False))\n",
+    "print(rule)\n",
+    "\n",
+    "# Write the same table to a CSV file (relative path, so it lands in the working directory)\n",
+    "weighted_df.to_csv('pa_dataset_summary_weighted.csv', index=False)\n",
+    "print(\"\\nSummary saved to: pa_dataset_summary_weighted.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "======================================================================\n",
+      "HOUSEHOLDS WITH $0 INCOME\n",
+      "======================================================================\n",
+      "Household count: 368,283\n",
+      "Percentage of all households: 7.90%\n",
+      "======================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Weighted count and share of households whose household-level AGI is exactly 0.\n",
+    "# agi_hh, weights and total_households are reused by the income-bracket cell.\n",
+    "agi_hh = np.array(sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\"))\n",
+    "weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
+    "\n",
+    "total_households = weights.sum()\n",
+    "zero_income_mask = agi_hh == 0\n",
+    "zero_income_count = weights[zero_income_mask].sum()\n",
+    "zero_income_pct = zero_income_count / total_households * 100\n",
+    "\n",
+    "rule = \"=\" * 70\n",
+    "print(\"\\n\" + rule)\n",
+    "print(\"HOUSEHOLDS WITH $0 INCOME\")\n",
+    "print(rule)\n",
+    "print(f\"Household count: {zero_income_count:,.0f}\")\n",
+    "print(f\"Percentage of all households: {zero_income_pct:.2f}%\")\n",
+    "print(rule)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "======================================================================\n",
+      "HOUSEHOLD COUNTS BY INCOME BRACKET\n",
+      "======================================================================\n",
+      "Income Bracket Households % of All Households\n",
+      "       $0-$10k    786,029              16.86%\n",
+      "     $10k-$20k    177,932               3.82%\n",
+      "     $20k-$30k    151,871               3.26%\n",
+      "     $30k-$40k    394,030               8.45%\n",
+      "     $40k-$50k    240,967               5.17%\n",
+      "     $50k-$60k    200,283               4.30%\n",
+      "======================================================================\n",
+      "\n",
+      "Total households in $0-$60k range: 1,951,112\n",
+      "Percentage of all households in $0-$60k range: 41.85%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Weighted household counts per AGI bracket. Each bracket is [lower, upper).\n",
+    "# Uses agi_hh, weights and total_households from the $0-income cell.\n",
+    "income_brackets = [\n",
+    "    (0, 10000, \"$0-$10k\"),\n",
+    "    (10000, 20000, \"$10k-$20k\"),\n",
+    "    (20000, 30000, \"$20k-$30k\"),\n",
+    "    (30000, 40000, \"$30k-$40k\"),\n",
+    "    (40000, 50000, \"$40k-$50k\"),\n",
+    "    (50000, 60000, \"$50k-$60k\")\n",
+    "]\n",
+    "\n",
+    "bracket_data = []\n",
+    "total_in_range = 0  # running weighted total across all brackets\n",
+    "for lower, upper, label in income_brackets:\n",
+    "    in_bracket = (agi_hh >= lower) & (agi_hh < upper)\n",
+    "    count = weights[in_bracket].sum()\n",
+    "    total_in_range += count\n",
+    "    bracket_data.append({\n",
+    "        \"Income Bracket\": label,\n",
+    "        \"Households\": f\"{count:,.0f}\",\n",
+    "        \"% of All Households\": f\"{count / total_households * 100:.2f}%\"\n",
+    "    })\n",
+    "\n",
+    "income_df = pd.DataFrame(bracket_data)\n",
+    "\n",
+    "rule = \"=\" * 70\n",
+    "print(\"\\n\" + rule)\n",
+    "print(\"HOUSEHOLD COUNTS BY INCOME BRACKET\")\n",
+    "print(rule)\n",
+    "print(income_df.to_string(index=False))\n",
+    "print(rule)\n",
+    "\n",
+    "# Total in $0-$60k range\n",
+    "print(f\"\\nTotal households in $0-$60k range: {total_in_range:,.0f}\")\n",
+    "print(f\"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}