From e6a0ac506376158e359b6060e218b88a8efce9cc Mon Sep 17 00:00:00 2001
From: David Trimmer <david@policyengine.org>
Date: Tue, 18 Nov 2025 19:09:45 -0500
Subject: [PATCH 1/5] PA EITC Fixes #95

---
 .../Congressional-Hackathon-2025              |   1 +
 us/states/pa/data_exploration.ipynb           | 397 ++++++++++++++++
 us/states/pa/pa_dataset_summary_weighted.csv  |  14 +
 us/states/pa/pa_eitc_reform_analysis.ipynb    | 445 ++++++++++++++++++
 us/states/pa/pa_eitc_reform_results.csv       |   2 +
 5 files changed, 859 insertions(+)
 create mode 160000 obbba_district_impacts/Congressional-Hackathon-2025
 create mode 100644 us/states/pa/data_exploration.ipynb
 create mode 100644 us/states/pa/pa_dataset_summary_weighted.csv
 create mode 100644 us/states/pa/pa_eitc_reform_analysis.ipynb
 create mode 100644 us/states/pa/pa_eitc_reform_results.csv

diff --git a/obbba_district_impacts/Congressional-Hackathon-2025 b/obbba_district_impacts/Congressional-Hackathon-2025
new file mode 160000
index 0000000..3f6d05e
--- /dev/null
+++ b/obbba_district_impacts/Congressional-Hackathon-2025
@@ -0,0 +1 @@
+Subproject commit 3f6d05e76400c6e396a3a4eddd34a7b3f6919fc3
diff --git a/us/states/pa/data_exploration.ipynb b/us/states/pa/data_exploration.ipynb
new file mode 100644
index 0000000..843c424
--- /dev/null
+++ b/us/states/pa/data_exploration.ipynb
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PA Dataset Exploration\n",
+    "\n",
+    "This notebook explores the Pennsylvania (PA) dataset to understand household counts, income distribution, and demographic characteristics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\dtsax\\envs\\pe\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from policyengine_us import Microsimulation\n",
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load PA dataset\n",
+    "sim = Microsimulation(dataset='hf://policyengine/test/PA.h5')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of households in dataset: 20,180\n",
+      "Household count (weighted): 4,435,467\n",
+      "Person count (weighted): 12,863,313\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check dataset size\n",
+    "household_weight = sim.calculate(\"household_weight\", period=2025)\n",
+    "household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n",
+    "person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n",
+    "\n",
+    "print(f\"Number of households in dataset: {len(household_weight):,}\")\n",
+    "print(f\"Household count (weighted): {household_count.sum():,.0f}\")\n",
+    "print(f\"Person count (weighted): {person_count.sum():,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Income distribution:\n",
+      "  Median AGI: $71,734\n",
+      "  75th percentile: $149,456\n",
+      "  90th percentile: $268,015\n",
+      "  95th percentile: $379,910\n",
+      "  Max AGI: $1,838,621\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check household income distribution\n",
+    "agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
+    "print(f\"Income distribution:\")\n",
+    "print(f\"  Median AGI: ${agi.median():,.0f}\")\n",
+    "print(f\"  75th percentile: ${agi.quantile(0.75):,.0f}\")\n",
+    "print(f\"  90th percentile: ${agi.quantile(0.90):,.0f}\")\n",
+    "print(f\"  95th percentile: ${agi.quantile(0.95):,.0f}\")\n",
+    "print(f\"  Max AGI: ${agi.max():,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Households with children (weighted):\n",
+      "  Total households with children: 1,457,610\n",
+      "  Households with 1 child: 734,446\n",
+      "  Households with 2 children: 481,892\n",
+      "  Households with 3+ children: 241,273\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check households with children\n",
+    "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n",
+    "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n",
+    "household_weight_person = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n",
+    "\n",
+    "# Person-level weights use their own name so the household-level household_weight from the dataset-size cell is not overwritten\n",
+    "df_households = pd.DataFrame({\n",
+    "    'household_id': household_id,\n",
+    "    'is_child': is_child,\n",
+    "    'household_weight': household_weight_person\n",
+    "})\n",
+    "\n",
+    "# Count children per household\n",
+    "children_per_household = df_households.groupby('household_id').agg({\n",
+    "    'is_child': 'sum',\n",
+    "    'household_weight': 'first'\n",
+    "}).reset_index()\n",
+    "\n",
+    "# Calculate weighted household counts\n",
+    "total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()\n",
+    "households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()\n",
+    "households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()\n",
+    "households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()\n",
+    "\n",
+    "print(f\"\\nHouseholds with children (weighted):\")\n",
+    "print(f\"  Total households with children: {total_households_with_children:,.0f}\")\n",
+    "print(f\"  Households with 1 child: {households_with_1_child:,.0f}\")\n",
+    "print(f\"  Households with 2 children: {households_with_2_children:,.0f}\")\n",
+    "print(f\"  Households with 3+ children: {households_with_3plus_children:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Children by age:\n",
+      "  Total children under 18: 2,494,202\n",
+      "  Children under 6: 780,623\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check children by age groups\n",
+    "df = pd.DataFrame({\n",
+    "    \"household_id\": sim.calculate(\"household_id\", map_to=\"person\"),\n",
+    "    \"tax_unit_id\": sim.calculate(\"tax_unit_id\", map_to=\"person\"),\n",
+    "    \"person_id\": sim.calculate(\"person_id\", map_to=\"person\"),\n",
+    "    \"age\": sim.calculate(\"age\", map_to=\"person\"),\n",
+    "    \"person_weight\": sim.calculate(\"person_weight\", map_to=\"person\")\n",
+    "})\n",
+    "\n",
+    "# Filter for children and apply weights\n",
+    "children_under_18_df = df[df['age'] < 18]\n",
+    "children_under_6_df = df[df['age'] < 6]\n",
+    "\n",
+    "# Calculate weighted totals\n",
+    "total_children = children_under_18_df['person_weight'].sum()\n",
+    "children_under_6 = children_under_6_df['person_weight'].sum()\n",
+    "\n",
+    "print(f\"\\nChildren by age:\")\n",
+    "print(f\"  Total children under 18: {total_children:,.0f}\")\n",
+    "print(f\"  Children under 6: {children_under_6:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "============================================================\n",
+      "PA DATASET SUMMARY - WEIGHTED (Population Estimates)\n",
+      "============================================================\n",
+      "                        Metric      Value\n",
+      "    Household count (weighted)  4,435,467\n",
+      "       Person count (weighted) 12,863,313\n",
+      "                    Median AGI    $71,734\n",
+      "           75th percentile AGI   $149,456\n",
+      "           90th percentile AGI   $268,015\n",
+      "           95th percentile AGI   $379,910\n",
+      "                       Max AGI $1,838,621\n",
+      "Total households with children  1,457,610\n",
+      "       Households with 1 child    734,446\n",
+      "    Households with 2 children    481,892\n",
+      "   Households with 3+ children    241,273\n",
+      "       Total children under 18  2,494,202\n",
+      "              Children under 6    780,623\n",
+      "============================================================\n",
+      "\n",
+      "Summary saved to: pa_dataset_summary_weighted.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create weighted summary table\n",
+    "weighted_summary_data = {\n",
+    "    'Metric': [\n",
+    "        'Household count (weighted)',\n",
+    "        'Person count (weighted)',\n",
+    "        'Median AGI',\n",
+    "        '75th percentile AGI',\n",
+    "        '90th percentile AGI',\n",
+    "        '95th percentile AGI',\n",
+    "        'Max AGI',\n",
+    "        'Total households with children',\n",
+    "        'Households with 1 child',\n",
+    "        'Households with 2 children',\n",
+    "        'Households with 3+ children',\n",
+    "        'Total children under 18',\n",
+    "        'Children under 6'\n",
+    "    ],\n",
+    "    'Value': [\n",
+    "        f\"{household_count.sum():,.0f}\",\n",
+    "        f\"{person_count.sum():,.0f}\",\n",
+    "        f\"${agi.median():,.0f}\",\n",
+    "        f\"${agi.quantile(0.75):,.0f}\",\n",
+    "        f\"${agi.quantile(0.90):,.0f}\",\n",
+    "        f\"${agi.quantile(0.95):,.0f}\",\n",
+    "        f\"${agi.max():,.0f}\",\n",
+    "        f\"{total_households_with_children:,.0f}\",\n",
+    "        f\"{households_with_1_child:,.0f}\",\n",
+    "        f\"{households_with_2_children:,.0f}\",\n",
+    "        f\"{households_with_3plus_children:,.0f}\",\n",
+    "        f\"{total_children:,.0f}\",\n",
+    "        f\"{children_under_6:,.0f}\"\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "weighted_df = pd.DataFrame(weighted_summary_data)\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*60)\n",
+    "print(\"PA DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n",
+    "print(\"=\"*60)\n",
+    "print(weighted_df.to_string(index=False))\n",
+    "print(\"=\"*60)\n",
+    "\n",
+    "# Save table\n",
+    "weighted_df.to_csv('pa_dataset_summary_weighted.csv', index=False)\n",
+    "print(\"\\nSummary saved to: pa_dataset_summary_weighted.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Households with $0 income (uses agi and household_weight from earlier cells)\n",
+    "zero_income_mask = agi == 0\n",
+    "zero_income_weighted = household_weight[zero_income_mask].sum()\n",
+    "zero_income_unweighted = zero_income_mask.sum()\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*70)\n",
+    "print(\"HOUSEHOLDS WITH $0 INCOME\")\n",
+    "print(\"=\"*70)\n",
+    "print(f\"Weighted count:   {zero_income_weighted:,.0f}\")\n",
+    "print(f\"Unweighted count: {zero_income_unweighted:,}\")\n",
+    "print(f\"\\nPercentage of all households with $0 income:\")\n",
+    "print(f\"  {zero_income_weighted / household_weight.sum() * 100:.2f}%\")\n",
+    "print(\"=\"*70)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "==========================================================================================\n",
+      "HOUSEHOLD COUNTS BY INCOME BRACKET\n",
+      "==========================================================================================\n",
+      "Income Bracket Households (Weighted) % of All Households Households (Unweighted)\n",
+      "       $0-$10k           239,725,630              15.27%                   4,540\n",
+      "     $10k-$20k            37,046,016               2.36%                     765\n",
+      "     $20k-$30k            44,020,114               2.80%                     723\n",
+      "     $30k-$40k           108,601,465               6.92%                   1,264\n",
+      "     $40k-$50k            77,534,722               4.94%                   1,034\n",
+      "     $50k-$60k            66,831,837               4.26%                     937\n",
+      "==========================================================================================\n",
+      "\n",
+      "Total households in $0-$60k range:\n",
+      "  Weighted: 573,759,784\n",
+      "  Unweighted: 9,263\n",
+      "\n",
+      "Percentage of all households in $0-$60k range:\n",
+      "  36.56%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Household counts by income brackets (uses agi and household_weight from earlier cells)\n",
+    "# Define income brackets from $0-$10k up to $50k-$60k\n",
+    "income_brackets = [\n",
+    "    (0, 10000, \"$0-$10k\"),\n",
+    "    (10000, 20000, \"$10k-$20k\"),\n",
+    "    (20000, 30000, \"$20k-$30k\"),\n",
+    "    (30000, 40000, \"$30k-$40k\"),\n",
+    "    (40000, 50000, \"$40k-$50k\"),\n",
+    "    (50000, 60000, \"$50k-$60k\")\n",
+    "]\n",
+    "\n",
+    "# Get total households for percentage calculation\n",
+    "total_households_weighted = household_weight.sum()\n",
+    "\n",
+    "# Calculate weighted household counts for each bracket\n",
+    "bracket_data = []\n",
+    "for lower, upper, label in income_brackets:\n",
+    "    mask = (agi >= lower) & (agi < upper)\n",
+    "    weighted_count = household_weight[mask].sum()\n",
+    "    unweighted_count = mask.sum()\n",
+    "    pct_of_total = (weighted_count / total_households_weighted) * 100\n",
+    "    \n",
+    "    bracket_data.append({\n",
+    "        \"Income Bracket\": label,\n",
+    "        \"Households (Weighted)\": f\"{weighted_count:,.0f}\",\n",
+    "        \"% of All Households\": f\"{pct_of_total:.2f}%\",\n",
+    "        \"Households (Unweighted)\": f\"{unweighted_count:,}\"\n",
+    "    })\n",
+    "\n",
+    "income_df = pd.DataFrame(bracket_data)\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*90)\n",
+    "print(\"HOUSEHOLD COUNTS BY INCOME BRACKET\")\n",
+    "print(\"=\"*90)\n",
+    "print(income_df.to_string(index=False))\n",
+    "print(\"=\"*90)\n",
+    "\n",
+    "# Also calculate total across all brackets\n",
+    "total_weighted = sum([household_weight[(agi >= lower) & (agi < upper)].sum() for lower, upper, _ in income_brackets])\n",
+    "total_unweighted = sum([((agi >= lower) & (agi < upper)).sum() for lower, upper, _ in income_brackets])\n",
+    "print(f\"\\nTotal households in $0-$60k range:\")\n",
+    "print(f\"  Weighted: {total_weighted:,.0f}\")\n",
+    "print(f\"  Unweighted: {total_unweighted:,}\")\n",
+    "print(f\"\\nPercentage of all households in $0-$60k range:\")\n",
+    "print(f\"  {total_weighted / total_households_weighted * 100:.2f}%\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pe",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/us/states/pa/pa_dataset_summary_weighted.csv b/us/states/pa/pa_dataset_summary_weighted.csv
new file mode 100644
index 0000000..3d520be
--- /dev/null
+++ b/us/states/pa/pa_dataset_summary_weighted.csv
@@ -0,0 +1,14 @@
+Metric,Value
+Household count (weighted),"4,435,467"
+Person count (weighted),"12,863,313"
+Median AGI,"$71,734"
+75th percentile AGI,"$149,456"
+90th percentile AGI,"$268,015"
+95th percentile AGI,"$379,910"
+Max AGI,"$1,838,621"
+Total households with children,"1,457,610"
+Households with 1 child,"734,446"
+Households with 2 children,"481,892"
+Households with 3+ children,"241,273"
+Total children under 18,"2,494,202"
+Children under 6,"780,623"
diff --git a/us/states/pa/pa_eitc_reform_analysis.ipynb b/us/states/pa/pa_eitc_reform_analysis.ipynb
new file mode 100644
index 0000000..525ea0b
--- /dev/null
+++ b/us/states/pa/pa_eitc_reform_analysis.ipynb
@@ -0,0 +1,445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pennsylvania EITC Reform Analysis (2025)\n",
+    "\n",
+    "This notebook analyzes the impact of the Working Pennsylvanians Tax Credit, Pennsylvania's state match of the federal EITC.\n",
+    "\n",
+    "## Baseline\n",
+    "- PA EITC is set to 0% (no state EITC)\n",
+    "\n",
+    "## Reform (Current Law)\n",
+    "- PA EITC matches 10% of the federal EITC\n",
+    "\n",
+    "## Metrics\n",
+    "We calculate:\n",
+    "- Budgetary impact (net cost)\n",
+    "- Winners (percentage of population affected)\n",
+    "- Overall poverty impact\n",
+    "- Child poverty impact"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from policyengine_us import Microsimulation\n",
+    "from policyengine_core.reforms import Reform\n",
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helper Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_poverty(sim, period=2025, child_only=False):\n",
+    "    \"\"\"\n",
+    "    Calculate poverty rate and count.\n",
+    "    \n",
+    "    Args:\n",
+    "        sim: Microsimulation object\n",
+    "        period: Year to analyze\n",
+    "        child_only: If True, only count children under 18\n",
+    "    \n",
+    "    Returns:\n",
+    "        poverty_rate: Weighted poverty rate\n",
+    "        people_in_poverty: Unweighted count\n",
+    "    \"\"\"\n",
+    "    age = sim.calculate(\"age\", period=period)\n",
+    "    is_in_poverty = sim.calculate(\"person_in_poverty\", period=period)\n",
+    "    person_weight = sim.calculate(\"person_weight\", period=period)\n",
+    "    \n",
+    "    if child_only:\n",
+    "        mask = age < 18\n",
+    "    else:\n",
+    "        mask = np.ones_like(age, dtype=bool)\n",
+    "    \n",
+    "    # Weighted poverty rate\n",
+    "    weighted_in_poverty = (is_in_poverty[mask] * person_weight[mask]).sum()\n",
+    "    weighted_total = person_weight[mask].sum()\n",
+    "    poverty_rate = weighted_in_poverty / weighted_total if weighted_total > 0 else 0\n",
+    "    \n",
+    "    # Unweighted count\n",
+    "    unweighted_in_poverty = is_in_poverty[mask].sum()\n",
+    "    unweighted_total = mask.sum()\n",
+    "    \n",
+    "    return {\n",
+    "        \"poverty_rate\": poverty_rate,\n",
+    "        \"people_in_poverty\": unweighted_in_poverty,\n",
+    "        \"total_people\": unweighted_total\n",
+    "    }\n",
+    "\n",
+    "def calculate_budgetary_impact(baseline_sim, reform_sim, variable, period=2025):\n",
+    "    \"\"\"\n",
+    "    Calculate the budgetary impact (net cost) of a reform.\n",
+    "    \"\"\"\n",
+    "    baseline_value = baseline_sim.calculate(variable, period=period, map_to=\"household\").sum()\n",
+    "    reform_value = reform_sim.calculate(variable, period=period, map_to=\"household\").sum()\n",
+    "    \n",
+    "    return reform_value - baseline_value\n",
+    "\n",
+    "def calculate_winners(baseline_sim, reform_sim, period=2025):\n",
+    "    \"\"\"\n",
+    "    Calculate winners from a reform at the person level.\n",
+    "    Winners: People in households with higher net income under reform\n",
+    "    Returns percentage of total population.\n",
+    "    \"\"\"\n",
+    "    # Get household-level income change\n",
+    "    baseline_income = baseline_sim.calculate(\"household_net_income\", period=period, map_to=\"household\")\n",
+    "    reform_income = reform_sim.calculate(\"household_net_income\", period=period, map_to=\"household\")\n",
+    "    income_change = reform_income - baseline_income\n",
+    "    \n",
+    "    # Map to person level\n",
+    "    household_id_person = baseline_sim.calculate(\"household_id\", period=period, map_to=\"person\")\n",
+    "    household_id_household = baseline_sim.calculate(\"household_id\", period=period, map_to=\"household\")\n",
+    "    \n",
+    "    # Create mapping of household_id to income_change\n",
+    "    income_change_dict = dict(zip(household_id_household, income_change))\n",
+    "    \n",
+    "    # Map income change to each person\n",
+    "    person_income_change = np.array([income_change_dict.get(hh_id, 0) for hh_id in household_id_person])\n",
+    "    \n",
+    "    # Count people who are winners\n",
+    "    people_winning = (person_income_change > 1).sum()  # Gained more than $1\n",
+    "    total_people = len(person_income_change)\n",
+    "    \n",
+    "    # Calculate percentage\n",
+    "    pct_winners = (people_winning / total_people * 100) if total_people > 0 else 0\n",
+    "    \n",
+    "    # Average gain for winning households\n",
+    "    avg_gain = income_change[income_change > 1].mean() if (income_change > 1).sum() > 0 else 0\n",
+    "    \n",
+    "    return {\n",
+    "        \"people_winning\": people_winning,\n",
+    "        \"total_people\": total_people,\n",
+    "        \"pct_winners\": pct_winners,\n",
+    "        \"avg_gain\": avg_gain\n",
+    "    }\n",
+    "\n",
+    "def format_currency(value):\n",
+    "    \"\"\"Format value as currency in millions.\"\"\"\n",
+    "    return f\"${value/1e6:.2f}M\"\n",
+    "\n",
+    "def format_percent(value):\n",
+    "    \"\"\"Format value as percentage.\"\"\"\n",
+    "    return f\"{value*100:.2f}%\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define Baseline and Reform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reform functions defined!\n"
+     ]
+    }
+   ],
+   "source": [
+    "def create_baseline():\n",
+    "    \"\"\"Baseline: PA EITC at 0%\"\"\"\n",
+    "    return Reform.from_dict(\n",
+    "        {\n",
+    "            \"gov.states.pa.tax.income.credits.eitc.match\": {\n",
+    "                \"2025-01-01.2100-12-31\": 0.0\n",
+    "            }\n",
+    "        },\n",
+    "        country_id=\"us\",\n",
+    "    )\n",
+    "\n",
+    "def create_reform():\n",
+    "    \"\"\"Reform: PA EITC at 10% (current law)\"\"\"\n",
+    "    return Reform.from_dict(\n",
+    "        {\n",
+    "            \"gov.states.pa.tax.income.credits.eitc.match\": {\n",
+    "                \"2025-01-01.2100-12-31\": 0.1\n",
+    "            }\n",
+    "        },\n",
+    "        country_id=\"us\",\n",
+    "    )\n",
+    "\n",
+    "print(\"Reform functions defined!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Simulations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading baseline (PA EITC at 0%)...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✓ Baseline loaded\n",
+      "\n",
+      "Loading reform (PA EITC at 10%)...\n",
+      "✓ Reform loaded\n",
+      "\n",
+      "============================================================\n",
+      "All simulations ready!\n",
+      "============================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Loading baseline (PA EITC at 0%)...\")\n",
+    "baseline_reform = create_baseline()\n",
+    "baseline = Microsimulation(dataset='hf://policyengine/test/PA.h5', reform=baseline_reform)\n",
+    "print(\"✓ Baseline loaded\")\n",
+    "\n",
+    "print(\"\\nLoading reform (PA EITC at 10%)...\")\n",
+    "reform = create_reform()\n",
+    "reform_sim = Microsimulation(dataset='hf://policyengine/test/PA.h5', reform=reform)\n",
+    "print(\"✓ Reform loaded\")\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*60)\n",
+    "print(\"All simulations ready!\")\n",
+    "print(\"=\"*60)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Calculate Impacts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✓ All impacts calculated\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Baseline metrics\n",
+    "baseline_overall_pov = calculate_poverty(baseline, child_only=False)\n",
+    "baseline_child_pov = calculate_poverty(baseline, child_only=True)\n",
+    "\n",
+    "# Reform metrics\n",
+    "reform_overall_pov = calculate_poverty(reform_sim, child_only=False)\n",
+    "reform_child_pov = calculate_poverty(reform_sim, child_only=True)\n",
+    "\n",
+    "# Budgetary impact\n",
+    "eitc_cost = calculate_budgetary_impact(baseline, reform_sim, \"pa_eitc\")\n",
+    "\n",
+    "# Winners (at person level)\n",
+    "winners = calculate_winners(baseline, reform_sim)\n",
+    "\n",
+    "print(\"✓ All impacts calculated\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Results Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "================================================================================\n",
+      "PA EITC REFORM IMPACTS (2025)\n",
+      "Baseline: PA EITC at 0% | Reform: PA EITC at 10% federal match\n",
+      "================================================================================\n",
+      "\n",
+      "================================BUDGETARY IMPACT================================\n",
+      "PA EITC net cost:              $88.92M\n",
+      "\n",
+      "==============================WINNERS (POPULATION)==============================\n",
+      "People gaining income:         7,408 (14.42% of population)\n",
+      "  Average household gain:      $170.28\n",
+      "\n",
+      "============================POVERTY IMPACT - OVERALL============================\n",
+      "Baseline poverty rate:         16.11%\n",
+      "Reform poverty rate:           15.93%\n",
+      "Absolute reduction:            0.18%\n",
+      "Relative reduction:            1.10%\n",
+      "People lifted from poverty:    19,239\n",
+      "\n",
+      "===========================POVERTY IMPACT - CHILDREN============================\n",
+      "Baseline child poverty rate:   20.61%\n",
+      "Reform child poverty rate:     20.24%\n",
+      "Absolute reduction:            0.38%\n",
+      "Relative reduction:            1.83%\n",
+      "Children lifted from poverty:  6,983\n",
+      "================================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"PA EITC REFORM IMPACTS (2025)\")\n",
+    "print(\"Baseline: PA EITC at 0% | Reform: PA EITC at 10% federal match\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "print(f\"\\n{'BUDGETARY IMPACT':=^80}\")\n",
+    "print(f\"PA EITC net cost:              {format_currency(eitc_cost)}\")\n",
+    "\n",
+    "print(f\"\\n{'WINNERS (POPULATION)':=^80}\")\n",
+    "print(f\"People gaining income:         {winners['people_winning']:,} ({winners['pct_winners']:.2f}% of population)\")\n",
+    "print(f\"  Average household gain:      ${winners['avg_gain']:,.2f}\")\n",
+    "\n",
+    "print(f\"\\n{'POVERTY IMPACT - OVERALL':=^80}\")\n",
+    "print(f\"Baseline poverty rate:         {format_percent(baseline_overall_pov['poverty_rate'])}\")\n",
+    "print(f\"Reform poverty rate:           {format_percent(reform_overall_pov['poverty_rate'])}\")\n",
+    "overall_pov_reduction = baseline_overall_pov['poverty_rate'] - reform_overall_pov['poverty_rate']\n",
+    "overall_pov_pct_reduction = (overall_pov_reduction / baseline_overall_pov['poverty_rate'] * 100) if baseline_overall_pov['poverty_rate'] > 0 else 0\n",
+    "print(f\"Absolute reduction:            {format_percent(overall_pov_reduction)}\")\n",
+    "print(f\"Relative reduction:            {overall_pov_pct_reduction:.2f}%\")\n",
+    "print(f\"People lifted from poverty:    {int(baseline_overall_pov['people_in_poverty'] - reform_overall_pov['people_in_poverty']):,}\")\n",
+    "\n",
+    "print(f\"\\n{'POVERTY IMPACT - CHILDREN':=^80}\")\n",
+    "print(f\"Baseline child poverty rate:   {format_percent(baseline_child_pov['poverty_rate'])}\")\n",
+    "print(f\"Reform child poverty rate:     {format_percent(reform_child_pov['poverty_rate'])}\")\n",
+    "child_pov_reduction = baseline_child_pov['poverty_rate'] - reform_child_pov['poverty_rate']\n",
+    "child_pov_pct_reduction = (child_pov_reduction / baseline_child_pov['poverty_rate'] * 100) if baseline_child_pov['poverty_rate'] > 0 else 0\n",
+    "print(f\"Absolute reduction:            {format_percent(child_pov_reduction)}\")\n",
+    "print(f\"Relative reduction:            {child_pov_pct_reduction:.2f}%\")\n",
+    "print(f\"Children lifted from poverty:  {int(baseline_child_pov['people_in_poverty'] - reform_child_pov['people_in_poverty']):,}\")\n",
+    "print(\"=\"*80)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Export Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "==============================================================================================================\n",
+      "PA EITC REFORM SUMMARY\n",
+      "==============================================================================================================\n",
+      "Scenario PA EITC Match Net Cost Overall Poverty Change (%) Child Poverty Change (%) % Population Winning\n",
+      "  Reform           10%  $88.92M                      1.10%                    1.83%               14.42%\n",
+      "==============================================================================================================\n",
+      "\n",
+      "✓ Exported to: pa_eitc_reform_results.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate poverty changes\n",
+    "overall_pov_reduction = baseline_overall_pov['poverty_rate'] - reform_overall_pov['poverty_rate']\n",
+    "overall_pov_pct_reduction = (overall_pov_reduction / baseline_overall_pov['poverty_rate'] * 100) if baseline_overall_pov['poverty_rate'] > 0 else 0\n",
+    "child_pov_reduction = baseline_child_pov['poverty_rate'] - reform_child_pov['poverty_rate']\n",
+    "child_pov_pct_reduction = (child_pov_reduction / baseline_child_pov['poverty_rate'] * 100) if baseline_child_pov['poverty_rate'] > 0 else 0\n",
+    "\n",
+    "# Create results DataFrame (reform only)\n",
+    "results = [\n",
+    "    {\n",
+    "        \"Scenario\": \"Reform\",\n",
+    "        \"PA EITC Match\": \"10%\",\n",
+    "        \"Net Cost\": format_currency(eitc_cost),\n",
+    "        \"Overall Poverty Change (%)\": f\"{overall_pov_pct_reduction:.2f}%\",\n",
+    "        \"Child Poverty Change (%)\": f\"{child_pov_pct_reduction:.2f}%\",\n",
+    "        \"% Population Winning\": f\"{winners['pct_winners']:.2f}%\"\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "df_results = pd.DataFrame(results)\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*110)\n",
+    "print(\"PA EITC REFORM SUMMARY\")\n",
+    "print(\"=\"*110)\n",
+    "print(df_results.to_string(index=False))\n",
+    "print(\"=\"*110)\n",
+    "\n",
+    "# Export to CSV\n",
+    "df_results.to_csv(\"pa_eitc_reform_results.csv\", index=False)\n",
+    "print(\"\\n✓ Exported to: pa_eitc_reform_results.csv\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pe",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/us/states/pa/pa_eitc_reform_results.csv b/us/states/pa/pa_eitc_reform_results.csv
new file mode 100644
index 0000000..826ebaf
--- /dev/null
+++ b/us/states/pa/pa_eitc_reform_results.csv
@@ -0,0 +1,2 @@
+Scenario,PA EITC Match,Net Cost,Overall Poverty Change (%),Child Poverty Change (%),% Population Winning
+Reform,10%,$88.92M,1.10%,1.83%,14.42%

From 7c7479901d346b04f1075f83000434fd5bb14321 Mon Sep 17 00:00:00 2001
From: David Trimmer <david@policyengine.org>
Date: Mon, 1 Dec 2025 14:23:29 -0500
Subject: [PATCH 2/5] PA EITC Fixes #95

---
 us/states/pa/data_exploration.ipynb        | 48 ++++++++++++++-----
 us/states/pa/pa_eitc_reform_analysis.ipynb | 55 +++++++++-------------
 us/states/pa/pa_eitc_reform_results.csv    |  2 +-
 3 files changed, 57 insertions(+), 48 deletions(-)

diff --git a/us/states/pa/data_exploration.ipynb b/us/states/pa/data_exploration.ipynb
index 843c424..7338362 100644
--- a/us/states/pa/data_exploration.ipynb
+++ b/us/states/pa/data_exploration.ipynb
@@ -274,13 +274,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "======================================================================\n",
+      "HOUSEHOLDS WITH $0 INCOME\n",
+      "======================================================================\n",
+      "Weighted count:   106,517,064\n",
+      "Unweighted count: 445,821.65727878007\n",
+      "\n",
+      "Percentage of all households with $0 income:\n",
+      "  6.79%\n",
+      "======================================================================\n"
+     ]
+    }
+   ],
    "source": [
-    "# Households with $0 income (uses agi and household_weight from earlier cells)\n",
-    "zero_income_mask = agi == 0\n",
-    "zero_income_weighted = household_weight[zero_income_mask].sum()\n",
+    "# Households with $0 income\n",
+    "# Re-fetch household_weight at household level (was overwritten in cell 5 at person level)\n",
+    "household_weight_hh = sim.calculate(\"household_weight\", period=2025)\n",
+    "agi_hh = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
+    "\n",
+    "zero_income_mask = agi_hh == 0\n",
+    "zero_income_weighted = household_weight_hh[zero_income_mask].sum()\n",
     "zero_income_unweighted = zero_income_mask.sum()\n",
     "\n",
     "print(\"\\n\" + \"=\"*70)\n",
@@ -289,13 +310,13 @@
     "print(f\"Weighted count:   {zero_income_weighted:,.0f}\")\n",
     "print(f\"Unweighted count: {zero_income_unweighted:,}\")\n",
     "print(f\"\\nPercentage of all households with $0 income:\")\n",
-    "print(f\"  {zero_income_weighted / household_weight.sum() * 100:.2f}%\")\n",
+    "print(f\"  {zero_income_weighted / household_weight_hh.sum() * 100:.2f}%\")\n",
     "print(\"=\"*70)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -325,7 +346,8 @@
     }
    ],
    "source": [
-    "# Household counts by income brackets (uses agi and household_weight from earlier cells)\n",
+    "# Household counts by income brackets\n",
+    "# Use household-level variables from previous cell\n",
     "# Define income brackets from $0-$10k up to $50k-$60k\n",
     "income_brackets = [\n",
     "    (0, 10000, \"$0-$10k\"),\n",
@@ -337,13 +359,13 @@
     "]\n",
     "\n",
     "# Get total households for percentage calculation\n",
-    "total_households_weighted = household_weight.sum()\n",
+    "total_households_weighted = household_weight_hh.sum()\n",
     "\n",
     "# Calculate weighted household counts for each bracket\n",
     "bracket_data = []\n",
     "for lower, upper, label in income_brackets:\n",
-    "    mask = (agi >= lower) & (agi < upper)\n",
-    "    weighted_count = household_weight[mask].sum()\n",
+    "    mask = (agi_hh >= lower) & (agi_hh < upper)\n",
+    "    weighted_count = household_weight_hh[mask].sum()\n",
     "    unweighted_count = mask.sum()\n",
     "    pct_of_total = (weighted_count / total_households_weighted) * 100\n",
     "    \n",
@@ -363,8 +385,8 @@
     "print(\"=\"*90)\n",
     "\n",
     "# Also calculate total across all brackets\n",
-    "total_weighted = sum([household_weight[(agi >= lower) & (agi < upper)].sum() for lower, upper, _ in income_brackets])\n",
-    "total_unweighted = sum([((agi >= lower) & (agi < upper)).sum() for lower, upper, _ in income_brackets])\n",
+    "total_weighted = sum([household_weight_hh[(agi_hh >= lower) & (agi_hh < upper)].sum() for lower, upper, _ in income_brackets])\n",
+    "total_unweighted = sum([((agi_hh >= lower) & (agi_hh < upper)).sum() for lower, upper, _ in income_brackets])\n",
     "print(f\"\\nTotal households in $0-$60k range:\")\n",
     "print(f\"  Weighted: {total_weighted:,.0f}\")\n",
     "print(f\"  Unweighted: {total_unweighted:,}\")\n",
diff --git a/us/states/pa/pa_eitc_reform_analysis.ipynb b/us/states/pa/pa_eitc_reform_analysis.ipynb
index 525ea0b..d76387a 100644
--- a/us/states/pa/pa_eitc_reform_analysis.ipynb
+++ b/us/states/pa/pa_eitc_reform_analysis.ipynb
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -149,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -195,27 +195,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Loading baseline (PA EITC at 0%)...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "Loading baseline (PA EITC at 0%)...\n",
       "✓ Baseline loaded\n",
       "\n",
       "Loading reform (PA EITC at 10%)...\n",
@@ -252,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -290,7 +277,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -311,18 +298,18 @@
       "  Average household gain:      $170.28\n",
       "\n",
       "============================POVERTY IMPACT - OVERALL============================\n",
-      "Baseline poverty rate:         16.11%\n",
-      "Reform poverty rate:           15.93%\n",
-      "Absolute reduction:            0.18%\n",
-      "Relative reduction:            1.10%\n",
-      "People lifted from poverty:    19,239\n",
+      "Baseline poverty rate:         16.33%\n",
+      "Reform poverty rate:           16.31%\n",
+      "Absolute reduction:            0.02%\n",
+      "Relative reduction:            0.14%\n",
+      "People lifted from poverty:    5,273\n",
       "\n",
       "===========================POVERTY IMPACT - CHILDREN============================\n",
-      "Baseline child poverty rate:   20.61%\n",
-      "Reform child poverty rate:     20.24%\n",
-      "Absolute reduction:            0.38%\n",
-      "Relative reduction:            1.83%\n",
-      "Children lifted from poverty:  6,983\n",
+      "Baseline child poverty rate:   21.03%\n",
+      "Reform child poverty rate:     21.03%\n",
+      "Absolute reduction:            0.00%\n",
+      "Relative reduction:            0.00%\n",
+      "Children lifted from poverty:  0\n",
       "================================================================================\n"
      ]
     }
@@ -369,7 +356,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -381,7 +368,7 @@
       "PA EITC REFORM SUMMARY\n",
       "==============================================================================================================\n",
       "Scenario PA EITC Match Net Cost Overall Poverty Change (%) Child Poverty Change (%) % Population Winning\n",
-      "  Reform           10%  $88.92M                      1.10%                    1.83%               14.42%\n",
+      "  Reform           10%  $88.92M                      0.14%                    0.00%               14.42%\n",
       "==============================================================================================================\n",
       "\n",
       "✓ Exported to: pa_eitc_reform_results.csv\n"
@@ -423,7 +410,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "pe",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -437,7 +424,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.10"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,
diff --git a/us/states/pa/pa_eitc_reform_results.csv b/us/states/pa/pa_eitc_reform_results.csv
index 826ebaf..9c2ae22 100644
--- a/us/states/pa/pa_eitc_reform_results.csv
+++ b/us/states/pa/pa_eitc_reform_results.csv
@@ -1,2 +1,2 @@
 Scenario,PA EITC Match,Net Cost,Overall Poverty Change (%),Child Poverty Change (%),% Population Winning
-Reform,10%,$88.92M,1.10%,1.83%,14.42%
+Reform,10%,$88.92M,0.14%,0.00%,14.42%

From c17c07619341ffa38518e19eb703eb3d20fee110 Mon Sep 17 00:00:00 2001
From: David Trimmer <david@policyengine.org>
Date: Mon, 1 Dec 2025 15:11:53 -0500
Subject: [PATCH 3/5] Switch PA notebooks to policyengine-us-data PA.h5 and refresh results

---
 us/states/pa/data_exploration.ipynb          | 180 ++++++++-----------
 us/states/pa/pa_dataset_summary_weighted.csv |  26 +--
 us/states/pa/pa_eitc_reform_analysis.ipynb   |  46 +++--
 us/states/pa/pa_eitc_reform_results.csv      |   2 +-
 us/states/pa/test_dataset.ipynb              |  62 +++++++
 5 files changed, 183 insertions(+), 133 deletions(-)
 create mode 100644 us/states/pa/test_dataset.ipynb

diff --git a/us/states/pa/data_exploration.ipynb b/us/states/pa/data_exploration.ipynb
index 7338362..1438223 100644
--- a/us/states/pa/data_exploration.ipynb
+++ b/us/states/pa/data_exploration.ipynb
@@ -13,20 +13,13 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\dtsax\\envs\\pe\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from policyengine_us import Microsimulation\n",
     "import pandas as pd\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "\n",
+    "PA_DATASET = \"hf://policyengine/policyengine-us-data/states/PA.h5\""
    ]
   },
   {
@@ -35,16 +28,23 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5801ad26ee654449ab3be3dc62d09e8b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "PA.h5:   0%|          | 0.00/149M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
     "# Load PA dataset\n",
-    "sim = Microsimulation(dataset='hf://policyengine/test/PA.h5')"
+    "sim = Microsimulation(dataset=PA_DATASET)"
    ]
   },
   {
@@ -56,9 +56,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of households in dataset: 20,180\n",
-      "Household count (weighted): 4,435,467\n",
-      "Person count (weighted): 12,863,313\n"
+      "Number of households in dataset: 68,351\n",
+      "Household count (weighted): 4,662,650\n",
+      "Person count (weighted): 13,217,679\n"
      ]
     }
    ],
@@ -83,11 +83,11 @@
      "output_type": "stream",
      "text": [
       "Income distribution:\n",
-      "  Median AGI: $71,734\n",
-      "  75th percentile: $149,456\n",
-      "  90th percentile: $268,015\n",
-      "  95th percentile: $379,910\n",
-      "  Max AGI: $1,838,621\n"
+      "  Median AGI: $73,962\n",
+      "  75th percentile: $169,351\n",
+      "  90th percentile: $404,412\n",
+      "  95th percentile: $511,573\n",
+      "  Max AGI: $3,229,514\n"
      ]
     }
    ],
@@ -113,10 +113,10 @@
      "text": [
       "\n",
       "Households with children (weighted):\n",
-      "  Total households with children: 1,457,610\n",
-      "  Households with 1 child: 734,446\n",
-      "  Households with 2 children: 481,892\n",
-      "  Households with 3+ children: 241,273\n"
+      "  Total households with children: 1,489,087\n",
+      "  Households with 1 child: 720,370\n",
+      "  Households with 2 children: 504,238\n",
+      "  Households with 3+ children: 264,479\n"
      ]
     }
    ],
@@ -163,8 +163,8 @@
      "text": [
       "\n",
       "Children by age:\n",
-      "  Total children under 18: 2,494,202\n",
-      "  Children under 6: 780,623\n"
+      "  Total children under 18: 2,597,022\n",
+      "  Children under 6: 799,168\n"
      ]
     }
    ],
@@ -205,19 +205,19 @@
       "PA DATASET SUMMARY - WEIGHTED (Population Estimates)\n",
       "============================================================\n",
       "                        Metric      Value\n",
-      "    Household count (weighted)  4,435,467\n",
-      "       Person count (weighted) 12,863,313\n",
-      "                    Median AGI    $71,734\n",
-      "           75th percentile AGI   $149,456\n",
-      "           90th percentile AGI   $268,015\n",
-      "           95th percentile AGI   $379,910\n",
-      "                       Max AGI $1,838,621\n",
-      "Total households with children  1,457,610\n",
-      "       Households with 1 child    734,446\n",
-      "    Households with 2 children    481,892\n",
-      "   Households with 3+ children    241,273\n",
-      "       Total children under 18  2,494,202\n",
-      "              Children under 6    780,623\n",
+      "    Household count (weighted)  4,662,650\n",
+      "       Person count (weighted) 13,217,679\n",
+      "                    Median AGI    $73,962\n",
+      "           75th percentile AGI   $169,351\n",
+      "           90th percentile AGI   $404,412\n",
+      "           95th percentile AGI   $511,573\n",
+      "                       Max AGI $3,229,514\n",
+      "Total households with children  1,489,087\n",
+      "       Households with 1 child    720,370\n",
+      "    Households with 2 children    504,238\n",
+      "   Households with 3+ children    264,479\n",
+      "       Total children under 18  2,597,022\n",
+      "              Children under 6    799,168\n",
       "============================================================\n",
       "\n",
       "Summary saved to: pa_dataset_summary_weighted.csv\n"
@@ -274,7 +274,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -285,38 +285,32 @@
       "======================================================================\n",
       "HOUSEHOLDS WITH $0 INCOME\n",
       "======================================================================\n",
-      "Weighted count:   106,517,064\n",
-      "Unweighted count: 445,821.65727878007\n",
-      "\n",
-      "Percentage of all households with $0 income:\n",
-      "  6.79%\n",
+      "Household count: 368,283\n",
+      "Percentage of all households: 7.90%\n",
       "======================================================================\n"
      ]
     }
    ],
    "source": [
     "# Households with $0 income\n",
-    "# Re-fetch household_weight at household level (was overwritten in cell 5 at person level)\n",
-    "household_weight_hh = sim.calculate(\"household_weight\", period=2025)\n",
-    "agi_hh = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
+    "agi_hh = np.array(sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\"))\n",
+    "weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
     "\n",
     "zero_income_mask = agi_hh == 0\n",
-    "zero_income_weighted = household_weight_hh[zero_income_mask].sum()\n",
-    "zero_income_unweighted = zero_income_mask.sum()\n",
+    "zero_income_count = weights[zero_income_mask].sum()\n",
+    "total_households = weights.sum()\n",
     "\n",
     "print(\"\\n\" + \"=\"*70)\n",
     "print(\"HOUSEHOLDS WITH $0 INCOME\")\n",
     "print(\"=\"*70)\n",
-    "print(f\"Weighted count:   {zero_income_weighted:,.0f}\")\n",
-    "print(f\"Unweighted count: {zero_income_unweighted:,}\")\n",
-    "print(f\"\\nPercentage of all households with $0 income:\")\n",
-    "print(f\"  {zero_income_weighted / household_weight_hh.sum() * 100:.2f}%\")\n",
+    "print(f\"Household count: {zero_income_count:,.0f}\")\n",
+    "print(f\"Percentage of all households: {zero_income_count / total_households * 100:.2f}%\")\n",
     "print(\"=\"*70)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -324,31 +318,25 @@
      "output_type": "stream",
      "text": [
       "\n",
-      "==========================================================================================\n",
+      "======================================================================\n",
       "HOUSEHOLD COUNTS BY INCOME BRACKET\n",
-      "==========================================================================================\n",
-      "Income Bracket Households (Weighted) % of All Households Households (Unweighted)\n",
-      "       $0-$10k           239,725,630              15.27%                   4,540\n",
-      "     $10k-$20k            37,046,016               2.36%                     765\n",
-      "     $20k-$30k            44,020,114               2.80%                     723\n",
-      "     $30k-$40k           108,601,465               6.92%                   1,264\n",
-      "     $40k-$50k            77,534,722               4.94%                   1,034\n",
-      "     $50k-$60k            66,831,837               4.26%                     937\n",
-      "==========================================================================================\n",
-      "\n",
-      "Total households in $0-$60k range:\n",
-      "  Weighted: 573,759,784\n",
-      "  Unweighted: 9,263\n",
+      "======================================================================\n",
+      "Income Bracket Households % of All Households\n",
+      "       $0-$10k    786,029              16.86%\n",
+      "     $10k-$20k    177,932               3.82%\n",
+      "     $20k-$30k    151,871               3.26%\n",
+      "     $30k-$40k    394,030               8.45%\n",
+      "     $40k-$50k    240,967               5.17%\n",
+      "     $50k-$60k    200,283               4.30%\n",
+      "======================================================================\n",
       "\n",
-      "Percentage of all households in $0-$60k range:\n",
-      "  36.56%\n"
+      "Total households in $0-$60k range: 1,951,112\n",
+      "Percentage of all households in $0-$60k range: 41.85%\n"
      ]
     }
    ],
    "source": [
     "# Household counts by income brackets\n",
-    "# Use household-level variables from previous cell\n",
-    "# Define income brackets from $0-$10k up to $50k-$60k\n",
     "income_brackets = [\n",
     "    (0, 10000, \"$0-$10k\"),\n",
     "    (10000, 20000, \"$10k-$20k\"),\n",
@@ -358,46 +346,36 @@
     "    (50000, 60000, \"$50k-$60k\")\n",
     "]\n",
     "\n",
-    "# Get total households for percentage calculation\n",
-    "total_households_weighted = household_weight_hh.sum()\n",
-    "\n",
-    "# Calculate weighted household counts for each bracket\n",
     "bracket_data = []\n",
     "for lower, upper, label in income_brackets:\n",
     "    mask = (agi_hh >= lower) & (agi_hh < upper)\n",
-    "    weighted_count = household_weight_hh[mask].sum()\n",
-    "    unweighted_count = mask.sum()\n",
-    "    pct_of_total = (weighted_count / total_households_weighted) * 100\n",
+    "    count = weights[mask].sum()\n",
+    "    pct_of_total = (count / total_households) * 100\n",
     "    \n",
     "    bracket_data.append({\n",
     "        \"Income Bracket\": label,\n",
-    "        \"Households (Weighted)\": f\"{weighted_count:,.0f}\",\n",
-    "        \"% of All Households\": f\"{pct_of_total:.2f}%\",\n",
-    "        \"Households (Unweighted)\": f\"{unweighted_count:,}\"\n",
+    "        \"Households\": f\"{count:,.0f}\",\n",
+    "        \"% of All Households\": f\"{pct_of_total:.2f}%\"\n",
     "    })\n",
     "\n",
     "income_df = pd.DataFrame(bracket_data)\n",
     "\n",
-    "print(\"\\n\" + \"=\"*90)\n",
+    "print(\"\\n\" + \"=\"*70)\n",
     "print(\"HOUSEHOLD COUNTS BY INCOME BRACKET\")\n",
-    "print(\"=\"*90)\n",
+    "print(\"=\"*70)\n",
     "print(income_df.to_string(index=False))\n",
-    "print(\"=\"*90)\n",
+    "print(\"=\"*70)\n",
     "\n",
-    "# Also calculate total across all brackets\n",
-    "total_weighted = sum([household_weight_hh[(agi_hh >= lower) & (agi_hh < upper)].sum() for lower, upper, _ in income_brackets])\n",
-    "total_unweighted = sum([((agi_hh >= lower) & (agi_hh < upper)).sum() for lower, upper, _ in income_brackets])\n",
-    "print(f\"\\nTotal households in $0-$60k range:\")\n",
-    "print(f\"  Weighted: {total_weighted:,.0f}\")\n",
-    "print(f\"  Unweighted: {total_unweighted:,}\")\n",
-    "print(f\"\\nPercentage of all households in $0-$60k range:\")\n",
-    "print(f\"  {total_weighted / total_households_weighted * 100:.2f}%\")"
+    "# Total in $0-$60k range\n",
+    "total_in_range = sum([weights[(agi_hh >= lower) & (agi_hh < upper)].sum() for lower, upper, _ in income_brackets])\n",
+    "print(f\"\\nTotal households in $0-$60k range: {total_in_range:,.0f}\")\n",
+    "print(f\"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%\")"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "pe",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -411,7 +389,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.10"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,
diff --git a/us/states/pa/pa_dataset_summary_weighted.csv b/us/states/pa/pa_dataset_summary_weighted.csv
index 3d520be..a81a10c 100644
--- a/us/states/pa/pa_dataset_summary_weighted.csv
+++ b/us/states/pa/pa_dataset_summary_weighted.csv
@@ -1,14 +1,14 @@
 Metric,Value
-Household count (weighted),"4,435,467"
-Person count (weighted),"12,863,313"
-Median AGI,"$71,734"
-75th percentile AGI,"$149,456"
-90th percentile AGI,"$268,015"
-95th percentile AGI,"$379,910"
-Max AGI,"$1,838,621"
-Total households with children,"1,457,610"
-Households with 1 child,"734,446"
-Households with 2 children,"481,892"
-Households with 3+ children,"241,273"
-Total children under 18,"2,494,202"
-Children under 6,"780,623"
+Household count (weighted),"4,662,650"
+Person count (weighted),"13,217,679"
+Median AGI,"$73,962"
+75th percentile AGI,"$169,351"
+90th percentile AGI,"$404,412"
+95th percentile AGI,"$511,573"
+Max AGI,"$3,229,514"
+Total households with children,"1,489,087"
+Households with 1 child,"720,370"
+Households with 2 children,"504,238"
+Households with 3+ children,"264,479"
+Total children under 18,"2,597,022"
+Children under 6,"799,168"
diff --git a/us/states/pa/pa_eitc_reform_analysis.ipynb b/us/states/pa/pa_eitc_reform_analysis.ipynb
index d76387a..51b3393 100644
--- a/us/states/pa/pa_eitc_reform_analysis.ipynb
+++ b/us/states/pa/pa_eitc_reform_analysis.ipynb
@@ -24,14 +24,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "from policyengine_us import Microsimulation\n",
     "from policyengine_core.reforms import Reform\n",
     "import pandas as pd\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "\n",
+    "PA_DATASET = \"hf://policyengine/policyengine-us-data/states/PA.h5\""
    ]
   },
   {
@@ -217,12 +219,12 @@
    "source": [
     "print(\"Loading baseline (PA EITC at 0%)...\")\n",
     "baseline_reform = create_baseline()\n",
-    "baseline = Microsimulation(dataset='hf://policyengine/test/PA.h5', reform=baseline_reform)\n",
+    "baseline = Microsimulation(dataset=PA_DATASET, reform=baseline_reform)\n",
     "print(\"✓ Baseline loaded\")\n",
     "\n",
     "print(\"\\nLoading reform (PA EITC at 10%)...\")\n",
     "reform = create_reform()\n",
-    "reform_sim = Microsimulation(dataset='hf://policyengine/test/PA.h5', reform=reform)\n",
+    "reform_sim = Microsimulation(dataset=PA_DATASET, reform=reform)\n",
     "print(\"✓ Reform loaded\")\n",
     "\n",
     "print(\"\\n\" + \"=\"*60)\n",
@@ -242,6 +244,14 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Invalid values for enum StateGroup: ['PA']. These will be encoded as index 0.\n",
+      "Invalid values for enum StateGroup: ['PA']. These will be encoded as index 0.\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -291,25 +301,25 @@
       "================================================================================\n",
       "\n",
       "================================BUDGETARY IMPACT================================\n",
-      "PA EITC net cost:              $88.92M\n",
+      "PA EITC net cost:              $221.73M\n",
       "\n",
       "==============================WINNERS (POPULATION)==============================\n",
-      "People gaining income:         7,408 (14.42% of population)\n",
-      "  Average household gain:      $170.28\n",
+      "People gaining income:         18,872 (9.03% of population)\n",
+      "  Average household gain:      $298.55\n",
       "\n",
       "============================POVERTY IMPACT - OVERALL============================\n",
-      "Baseline poverty rate:         16.33%\n",
-      "Reform poverty rate:           16.31%\n",
-      "Absolute reduction:            0.02%\n",
-      "Relative reduction:            0.14%\n",
-      "People lifted from poverty:    5,273\n",
+      "Baseline poverty rate:         12.24%\n",
+      "Reform poverty rate:           11.68%\n",
+      "Absolute reduction:            0.56%\n",
+      "Relative reduction:            4.58%\n",
+      "People lifted from poverty:    43,702\n",
       "\n",
       "===========================POVERTY IMPACT - CHILDREN============================\n",
-      "Baseline child poverty rate:   21.03%\n",
-      "Reform child poverty rate:     21.03%\n",
-      "Absolute reduction:            0.00%\n",
-      "Relative reduction:            0.00%\n",
-      "Children lifted from poverty:  0\n",
+      "Baseline child poverty rate:   10.96%\n",
+      "Reform child poverty rate:     10.82%\n",
+      "Absolute reduction:            0.14%\n",
+      "Relative reduction:            1.29%\n",
+      "Children lifted from poverty:  4,276\n",
       "================================================================================\n"
      ]
     }
@@ -368,7 +378,7 @@
       "PA EITC REFORM SUMMARY\n",
       "==============================================================================================================\n",
       "Scenario PA EITC Match Net Cost Overall Poverty Change (%) Child Poverty Change (%) % Population Winning\n",
-      "  Reform           10%  $88.92M                      0.14%                    0.00%               14.42%\n",
+      "  Reform           10% $221.73M                      4.58%                    1.29%                9.03%\n",
       "==============================================================================================================\n",
       "\n",
       "✓ Exported to: pa_eitc_reform_results.csv\n"
diff --git a/us/states/pa/pa_eitc_reform_results.csv b/us/states/pa/pa_eitc_reform_results.csv
index 9c2ae22..0c80080 100644
--- a/us/states/pa/pa_eitc_reform_results.csv
+++ b/us/states/pa/pa_eitc_reform_results.csv
@@ -1,2 +1,2 @@
 Scenario,PA EITC Match,Net Cost,Overall Poverty Change (%),Child Poverty Change (%),% Population Winning
-Reform,10%,$88.92M,0.14%,0.00%,14.42%
+Reform,10%,$221.73M,4.58%,1.29%,9.03%
diff --git a/us/states/pa/test_dataset.ipynb b/us/states/pa/test_dataset.ipynb
new file mode 100644
index 0000000..09e4470
--- /dev/null
+++ b/us/states/pa/test_dataset.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cell-0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "194e07960bcb4dffbe230453d20cdfd5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "PA.h5:   0%|          | 0.00/149M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset loaded successfully!\n",
+      "Number of households: 68,351\n"
+     ]
+    }
+   ],
+   "source": [
+    "from policyengine_us import Microsimulation\n",
+    "\n",
+    "sim = Microsimulation(dataset=\"hf://policyengine/policyengine-us-data/states/PA.h5\")\n",
+    "print(\"Dataset loaded successfully!\")\n",
+    "print(f\"Number of households: {len(sim.calculate('household_weight', period=2025)):,}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From c2dfc88ad3406ecfb4a5743ef52aab66aebc9c1e Mon Sep 17 00:00:00 2001
From: David Trimmer <david@policyengine.org>
Date: Mon, 1 Dec 2025 15:22:34 -0500
Subject: [PATCH 4/5] Report weighted counts in PA EITC reform analysis

---
 us/states/pa/pa_eitc_reform_analysis.ipynb | 100 +++++++++++----------
 us/states/pa/pa_eitc_reform_results.csv    |   2 +-
 2 files changed, 55 insertions(+), 47 deletions(-)

diff --git a/us/states/pa/pa_eitc_reform_analysis.ipynb b/us/states/pa/pa_eitc_reform_analysis.ipynb
index 51b3393..8ff23aa 100644
--- a/us/states/pa/pa_eitc_reform_analysis.ipynb
+++ b/us/states/pa/pa_eitc_reform_analysis.ipynb
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -60,11 +60,11 @@
     "    \n",
     "    Returns:\n",
     "        poverty_rate: Weighted poverty rate\n",
-    "        people_in_poverty: Unweighted count\n",
+    "        people_in_poverty: Weighted count\n",
     "    \"\"\"\n",
-    "    age = sim.calculate(\"age\", period=period)\n",
-    "    is_in_poverty = sim.calculate(\"person_in_poverty\", period=period)\n",
-    "    person_weight = sim.calculate(\"person_weight\", period=period)\n",
+    "    age = np.array(sim.calculate(\"age\", period=period))\n",
+    "    is_in_poverty = np.array(sim.calculate(\"person_in_poverty\", period=period))\n",
+    "    person_weight = np.array(sim.calculate(\"person_weight\", period=period))\n",
     "    \n",
     "    if child_only:\n",
     "        mask = age < 18\n",
@@ -76,14 +76,13 @@
     "    weighted_total = person_weight[mask].sum()\n",
     "    poverty_rate = weighted_in_poverty / weighted_total if weighted_total > 0 else 0\n",
     "    \n",
-    "    # Unweighted count\n",
-    "    unweighted_in_poverty = is_in_poverty[mask].sum()\n",
-    "    unweighted_total = mask.sum()\n",
+    "    # Weighted count of people in poverty\n",
+    "    people_in_poverty = weighted_in_poverty\n",
     "    \n",
     "    return {\n",
     "        \"poverty_rate\": poverty_rate,\n",
-    "        \"people_in_poverty\": unweighted_in_poverty,\n",
-    "        \"total_people\": unweighted_total\n",
+    "        \"people_in_poverty\": people_in_poverty,\n",
+    "        \"total_people\": weighted_total\n",
     "    }\n",
     "\n",
     "def calculate_budgetary_impact(baseline_sim, reform_sim, variable, period=2025):\n",
@@ -97,18 +96,20 @@
     "\n",
     "def calculate_winners(baseline_sim, reform_sim, period=2025):\n",
     "    \"\"\"\n",
-    "    Calculate winners from a reform at the person level.\n",
-    "    Winners: People in households with higher net income under reform\n",
-    "    Returns percentage of total population.\n",
+    "    Calculate winners from a reform at the person level (weighted).\n",
+    "    Winners: People in households with higher net income under reform.\n",
+    "    Returns weighted count and percentage of total population.\n",
     "    \"\"\"\n",
     "    # Get household-level income change\n",
-    "    baseline_income = baseline_sim.calculate(\"household_net_income\", period=period, map_to=\"household\")\n",
-    "    reform_income = reform_sim.calculate(\"household_net_income\", period=period, map_to=\"household\")\n",
+    "    baseline_income = np.array(baseline_sim.calculate(\"household_net_income\", period=period, map_to=\"household\"))\n",
+    "    reform_income = np.array(reform_sim.calculate(\"household_net_income\", period=period, map_to=\"household\"))\n",
+    "    household_weight = np.array(baseline_sim.calculate(\"household_weight\", period=period))\n",
     "    income_change = reform_income - baseline_income\n",
     "    \n",
-    "    # Map to person level\n",
-    "    household_id_person = baseline_sim.calculate(\"household_id\", period=period, map_to=\"person\")\n",
-    "    household_id_household = baseline_sim.calculate(\"household_id\", period=period, map_to=\"household\")\n",
+    "    # Get person-level data\n",
+    "    household_id_person = np.array(baseline_sim.calculate(\"household_id\", period=period, map_to=\"person\"))\n",
+    "    household_id_household = np.array(baseline_sim.calculate(\"household_id\", period=period, map_to=\"household\"))\n",
+    "    person_weight = np.array(baseline_sim.calculate(\"person_weight\", period=period))\n",
     "    \n",
     "    # Create mapping of household_id to income_change\n",
     "    income_change_dict = dict(zip(household_id_household, income_change))\n",
@@ -116,15 +117,20 @@
     "    # Map income change to each person\n",
     "    person_income_change = np.array([income_change_dict.get(hh_id, 0) for hh_id in household_id_person])\n",
     "    \n",
-    "    # Count people who are winners\n",
-    "    people_winning = (person_income_change > 1).sum()  # Gained more than $1\n",
-    "    total_people = len(person_income_change)\n",
+    "    # Weighted count of people who are winners (gained more than $1)\n",
+    "    winners_mask = person_income_change > 1\n",
+    "    people_winning = person_weight[winners_mask].sum()\n",
+    "    total_people = person_weight.sum()\n",
     "    \n",
     "    # Calculate percentage\n",
     "    pct_winners = (people_winning / total_people * 100) if total_people > 0 else 0\n",
     "    \n",
-    "    # Average gain for winning households\n",
-    "    avg_gain = income_change[income_change > 1].mean() if (income_change > 1).sum() > 0 else 0\n",
+    "    # Average gain for winning households (weighted)\n",
+    "    winning_hh_mask = income_change > 1\n",
+    "    if winning_hh_mask.sum() > 0:\n",
+    "        avg_gain = np.average(income_change[winning_hh_mask], weights=household_weight[winning_hh_mask])\n",
+    "    else:\n",
+    "        avg_gain = 0\n",
     "    \n",
     "    return {\n",
     "        \"people_winning\": people_winning,\n",
@@ -151,7 +157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -197,7 +203,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -241,7 +247,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -287,7 +293,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -304,22 +310,22 @@
       "PA EITC net cost:              $221.73M\n",
       "\n",
       "==============================WINNERS (POPULATION)==============================\n",
-      "People gaining income:         18,872 (9.03% of population)\n",
-      "  Average household gain:      $298.55\n",
+      "People gaining income:         3,066,550 (23.20% of population)\n",
+      "Average gain per household:    $298.55\n",
       "\n",
       "============================POVERTY IMPACT - OVERALL============================\n",
-      "Baseline poverty rate:         12.24%\n",
-      "Reform poverty rate:           11.68%\n",
-      "Absolute reduction:            0.56%\n",
-      "Relative reduction:            4.58%\n",
-      "People lifted from poverty:    43,702\n",
+      "Baseline poverty rate:         13.54%\n",
+      "Reform poverty rate:           13.21%\n",
+      "Absolute reduction:            0.33%\n",
+      "Relative reduction:            2.44%\n",
+      "People lifted from poverty:    43,703\n",
       "\n",
       "===========================POVERTY IMPACT - CHILDREN============================\n",
-      "Baseline child poverty rate:   10.96%\n",
-      "Reform child poverty rate:     10.82%\n",
-      "Absolute reduction:            0.14%\n",
-      "Relative reduction:            1.29%\n",
-      "Children lifted from poverty:  4,276\n",
+      "Baseline child poverty rate:   11.31%\n",
+      "Reform child poverty rate:     11.15%\n",
+      "Absolute reduction:            0.16%\n",
+      "Relative reduction:            1.43%\n",
+      "Children lifted from poverty:  4,277\n",
       "================================================================================\n"
      ]
     }
@@ -334,8 +340,8 @@
     "print(f\"PA EITC net cost:              {format_currency(eitc_cost)}\")\n",
     "\n",
     "print(f\"\\n{'WINNERS (POPULATION)':=^80}\")\n",
-    "print(f\"People gaining income:         {winners['people_winning']:,} ({winners['pct_winners']:.2f}% of population)\")\n",
-    "print(f\"  Average household gain:      ${winners['avg_gain']:,.2f}\")\n",
+    "print(f\"People gaining income:         {winners['people_winning']:,.0f} ({winners['pct_winners']:.2f}% of population)\")\n",
+    "print(f\"Average gain per household:    ${winners['avg_gain']:,.2f}\")\n",
     "\n",
     "print(f\"\\n{'POVERTY IMPACT - OVERALL':=^80}\")\n",
     "print(f\"Baseline poverty rate:         {format_percent(baseline_overall_pov['poverty_rate'])}\")\n",
@@ -344,7 +350,8 @@
     "overall_pov_pct_reduction = (overall_pov_reduction / baseline_overall_pov['poverty_rate'] * 100) if baseline_overall_pov['poverty_rate'] > 0 else 0\n",
     "print(f\"Absolute reduction:            {format_percent(overall_pov_reduction)}\")\n",
     "print(f\"Relative reduction:            {overall_pov_pct_reduction:.2f}%\")\n",
-    "print(f\"People lifted from poverty:    {int(baseline_overall_pov['people_in_poverty'] - reform_overall_pov['people_in_poverty']):,}\")\n",
+    "people_lifted = baseline_overall_pov['people_in_poverty'] - reform_overall_pov['people_in_poverty']\n",
+    "print(f\"People lifted from poverty:    {people_lifted:,.0f}\")\n",
     "\n",
     "print(f\"\\n{'POVERTY IMPACT - CHILDREN':=^80}\")\n",
     "print(f\"Baseline child poverty rate:   {format_percent(baseline_child_pov['poverty_rate'])}\")\n",
@@ -353,7 +360,8 @@
     "child_pov_pct_reduction = (child_pov_reduction / baseline_child_pov['poverty_rate'] * 100) if baseline_child_pov['poverty_rate'] > 0 else 0\n",
     "print(f\"Absolute reduction:            {format_percent(child_pov_reduction)}\")\n",
     "print(f\"Relative reduction:            {child_pov_pct_reduction:.2f}%\")\n",
-    "print(f\"Children lifted from poverty:  {int(baseline_child_pov['people_in_poverty'] - reform_child_pov['people_in_poverty']):,}\")\n",
+    "children_lifted = baseline_child_pov['people_in_poverty'] - reform_child_pov['people_in_poverty']\n",
+    "print(f\"Children lifted from poverty:  {children_lifted:,.0f}\")\n",
     "print(\"=\"*80)"
    ]
   },
@@ -366,7 +374,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -378,7 +386,7 @@
       "PA EITC REFORM SUMMARY\n",
       "==============================================================================================================\n",
       "Scenario PA EITC Match Net Cost Overall Poverty Change (%) Child Poverty Change (%) % Population Winning\n",
-      "  Reform           10% $221.73M                      4.58%                    1.29%                9.03%\n",
+      "  Reform           10% $221.73M                      2.44%                    1.43%               23.20%\n",
       "==============================================================================================================\n",
       "\n",
       "✓ Exported to: pa_eitc_reform_results.csv\n"
diff --git a/us/states/pa/pa_eitc_reform_results.csv b/us/states/pa/pa_eitc_reform_results.csv
index 0c80080..7c4f0b1 100644
--- a/us/states/pa/pa_eitc_reform_results.csv
+++ b/us/states/pa/pa_eitc_reform_results.csv
@@ -1,2 +1,2 @@
 Scenario,PA EITC Match,Net Cost,Overall Poverty Change (%),Child Poverty Change (%),% Population Winning
-Reform,10%,$221.73M,4.58%,1.29%,9.03%
+Reform,10%,$221.73M,2.44%,1.43%,23.20%

From a4104d31f078a5372e59fc8fb7ed91a2e7bc25dc Mon Sep 17 00:00:00 2001
From: David Trimmer <david@policyengine.org>
Date: Mon, 1 Dec 2025 15:27:00 -0500
Subject: [PATCH 5/5] household winners

---
 us/states/pa/pa_eitc_reform_analysis.ipynb | 41 ++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/us/states/pa/pa_eitc_reform_analysis.ipynb b/us/states/pa/pa_eitc_reform_analysis.ipynb
index 8ff23aa..30b4f24 100644
--- a/us/states/pa/pa_eitc_reform_analysis.ipynb
+++ b/us/states/pa/pa_eitc_reform_analysis.ipynb
@@ -365,6 +365,47 @@
     "print(\"=\"*80)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "======================================================================\n",
+      "HOUSEHOLDS BENEFITTING FROM PA EITC\n",
+      "======================================================================\n",
+      "Households benefitting:        742,696\n",
+      "Total households:              4,662,650\n",
+      "Percentage of households:      15.93%\n",
+      "======================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Weighted count and share of households whose net income rises by more than $1 under the reform\n",
+    "baseline_hh_income = np.array(baseline.calculate(\"household_net_income\", period=2025, map_to=\"household\"))\n",
+    "reform_hh_income = np.array(reform_sim.calculate(\"household_net_income\", period=2025, map_to=\"household\"))\n",
+    "household_weight = np.array(baseline.calculate(\"household_weight\", period=2025))\n",
+    "\n",
+    "hh_income_change = reform_hh_income - baseline_hh_income\n",
+    "hh_benefitting_mask = hh_income_change > 1  # Gained more than $1 (same threshold as calculate_winners)\n",
+    "\n",
+    "households_benefitting = household_weight[hh_benefitting_mask].sum()\n",
+    "total_households = household_weight.sum()\n",
+    "pct_households_benefitting = (households_benefitting / total_households * 100) if total_households > 0 else 0  # guard zero total weight\n",
+    "\n",
+    "print(\"=\"*70)\n",
+    "print(\"HOUSEHOLDS BENEFITTING FROM PA EITC\")\n",
+    "print(\"=\"*70)\n",
+    "print(f\"Households benefitting:        {households_benefitting:,.0f}\")\n",
+    "print(f\"Total households:              {total_households:,.0f}\")\n",
+    "print(f\"Percentage of households:      {pct_households_benefitting:.2f}%\")\n",
+    "print(\"=\"*70)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},