From 52a921560f977786dc893594bb170845d0f748a4 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Mon, 13 Oct 2025 15:42:38 -0400 Subject: [PATCH 1/3] Rhode Island dataset exploration Fixes #87 --- ri_dataset_exploration.ipynb | 393 ++++++++++++++++++++++++++++++ ri_dataset_summary_unweighted.csv | 11 + ri_dataset_summary_weighted.csv | 20 ++ 3 files changed, 424 insertions(+) create mode 100644 ri_dataset_exploration.ipynb create mode 100644 ri_dataset_summary_unweighted.csv create mode 100644 ri_dataset_summary_weighted.csv diff --git a/ri_dataset_exploration.ipynb b/ri_dataset_exploration.ipynb new file mode 100644 index 0000000..b104e9c --- /dev/null +++ b/ri_dataset_exploration.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "id": "be1cea7a", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us import Microsimulation\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d21b774", + "metadata": {}, + "outputs": [], + "source": [ + "# Load RI dataset\n", + "sim = Microsimulation(dataset=\"hf://policyengine/test/RI.h5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1870e7ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of households in dataset: 2,368\n", + "Household count (mapped): 388,376\n", + "Person count (mapped): 1,106,390\n" + ] + } + ], + "source": [ + "# Check dataset size\n", + "household_weight = sim.calculate(\"household_weight\", period=2025)\n", + "household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n", + "person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n", + "\n", + "print(f\"Number of households in dataset: {len(household_weight):,}\")\n", + "print(f\"Household count (mapped): {household_count.sum():,.0f}\")\n", + "print(f\"Person count (mapped): {person_count.sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f0c79a50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Income distribution:\n", + " Median AGI: $73,149\n", + " 75th percentile: $152,410\n", + " 90th percentile: $271,943\n", + " 95th percentile: $400,420\n", + " Max AGI: $1,740,956\n", + "\n", + "Households by income threshold:\n", + " Households over $80k: 180,850.09423405278\n", + " Households over $120k: 128,983.09166995375\n", + " Households over $160k: 88,148.79416422347\n", + " Households over $240k: 47,853.16035188042\n" + ] + } + ], + "source": [ + "# Check household income distribution (aggregate to household level using map_to)\n", + "agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "print(f\"Income distribution:\")\n", + "print(f\" Median AGI: ${agi.median():,.0f}\")\n", + "print(f\" 75th percentile: ${agi.quantile(0.75):,.0f}\")\n", + "print(f\" 90th percentile: ${agi.quantile(0.90):,.0f}\")\n", + "print(f\" 95th percentile: ${agi.quantile(0.95):,.0f}\")\n", + "print(f\" Max AGI: ${agi.max():,.0f}\")\n", + "print(f\"\\nHouseholds by income threshold:\")\n", + "print(f\" Households over $80k: {(agi > 80_000).sum():,}\")\n", + "print(f\" Households over $120k: {(agi > 120_000).sum():,}\")\n", + "print(f\" Households over $160k: {(agi > 160_000).sum():,}\")\n", + "print(f\" Households over $240k: {(agi > 240_000).sum():,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "71b548db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Households with children (weighted):\n", + " Total households with children: 121,492\n", + " Households with 1 child: 66,113\n", + " Households with 2 children: 37,589\n", + " Households with 3+ children: 17,790\n" + ] + } + ], + "source": [ + "# Check households with children (count at person level, aggregate to household)\n", + "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n", + "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n", + "household_weight = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n", + "\n", + "# Create DataFrame for easier manipulation\n", + "df_households = pd.DataFrame({\n", + " 'household_id': household_id,\n", + " 'is_child': is_child,\n", + " 'household_weight': household_weight\n", + "})\n", + "\n", + "# Count children per household\n", + "children_per_household = df_households.groupby('household_id').agg({\n", + " 'is_child': 'sum',\n", + " 'household_weight': 'first' # household_weight is same for all members\n", + "}).reset_index()\n", + "\n", + "# Calculate weighted household counts\n", + "total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()\n", + "households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()\n", + "households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()\n", + "households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()\n", + "\n", + "print(f\"\\nHouseholds with children (weighted):\")\n", + "print(f\" Total households with children: {total_households_with_children:,.0f}\")\n", + "print(f\" Households with 1 child: {households_with_1_child:,.0f}\")\n", + "print(f\" Households with 2 children: {households_with_2_children:,.0f}\")\n", + "print(f\" Households with 3+ children: {households_with_3plus_children:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a215302f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Children by age:\n", + " Total children under 18: 203,860\n", + " Children under 4: 41,525\n", + " Children under 6: 61,595\n", + " Children ages 6-17: 138,203\n", + "\n", + "Sample of children under 4:\n", + " household_id tax_unit_id person_id age\n", + "7 2730011 5 7730007 1.0\n", + "16 2730006 10 7730016 2.0\n", + "37 2730018 26 7730037 3.0\n", + "83 2730035 55 7730083 0.0\n", + "101 2730041 66 7730101 3.0\n", + "102 2730041 66 7730102 2.0\n", + "103 2730041 66 7730103 0.0\n", + "108 2730043 69 7730108 1.0\n", + "111 2730045 71 7730111 1.0\n", + "115 2730045 71 7730115 3.0\n" + ] + } + ], + "source": [ + "# Check children by age groups using Ben's workaround\n", + "import pandas as pd\n", + "df = pd.DataFrame({\n", + " \"household_id\": sim.calculate(\"household_id\", map_to=\"person\"),\n", + " \"tax_unit_id\": sim.calculate(\"tax_unit_id\", map_to=\"person\"),\n", + " \"person_id\": sim.calculate(\"person_id\", map_to=\"person\"),\n", + " \"age\": sim.calculate(\"age\", map_to=\"person\"),\n", + " \"person_weight\": sim.calculate(\"person_weight\", map_to=\"person\")\n", + "})\n", + "\n", + "# Filter for children and apply weights\n", + "children_under_4_df = df[df['age'] < 4]\n", + "children_under_6_df = df[df['age'] < 6]\n", + "children_under_18_df = df[df['age'] < 18]\n", + "children_6_17_df = df[(df['age'] >= 6) & (df['age'] < 18)]\n", + "\n", + "# Calculate weighted totals\n", + "is_child = sim.calculate(\"is_child\", period=2025)\n", + "total_children = is_child.sum()\n", + "children_under_4 = children_under_4_df['person_weight'].sum()\n", + "children_under_6 = children_under_6_df['person_weight'].sum()\n", + "children_6_17 = children_6_17_df['person_weight'].sum()\n", + "\n", + "print(f\"\\nChildren by age:\")\n", + "print(f\" Total children under 18: {total_children:,.0f}\")\n", + "print(f\" Children under 4: {children_under_4:,.0f}\")\n", + "print(f\" Children under 6: {children_under_6:,.0f}\")\n", + "print(f\" Children ages 6-17: {children_6_17:,.0f}\")\n", + "\n", + "print(f\"\\nSample of children under 4:\")\n", + "print(children_under_4_df[['household_id', 'tax_unit_id', 'person_id', 'age']].head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9468033e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "RI DATASET SUMMARY - WEIGHTED (Population Estimates)\n", + "============================================================\n", + " Metric Value\n", + " Household count (weighted) 388,376\n", + " Person count (weighted) 1,106,390\n", + " Median AGI $73,149\n", + " 75th percentile AGI $152,410\n", + " 90th percentile AGI $271,943\n", + " 95th percentile AGI $400,420\n", + " Max AGI $1,740,956\n", + " Households over $80k 180,850\n", + " Households over $120k 128,983\n", + " Households over $160k 88,149\n", + " Households over $240k 47,853\n", + "Total households with children 121,492\n", + " Households with 1 child 66,113\n", + " Households with 2 children 37,589\n", + " Households with 3+ children 17,790\n", + " Total children under 18 203,860\n", + " Children under 4 41,525\n", + " Children under 6 61,595\n", + " Children ages 6-17 138,203\n", + "============================================================\n", + "\n", + "============================================================\n", + "RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\n", + "============================================================\n", + " Metric Value\n", + " Number of households in dataset 2,368\n", + " Number of persons in dataset 5,888\n", + " Households with children (unweighted) 616\n", + " Households with 1 child (unweighted) 305\n", + " Households with 2 children (unweighted) 201\n", + "Households with 3+ children (unweighted) 110\n", + " Children under 18 (unweighted) 1,079\n", + " Children under 4 (unweighted) 254\n", + " Children under 6 (unweighted) 373\n", + " Children ages 6-17 (unweighted) 706\n", + "============================================================\n", + "\n", + "Summaries saved to:\n", + " - ri_dataset_summary_weighted.csv\n", + " - ri_dataset_summary_unweighted.csv\n" + ] + } + ], + "source": [ + "# Create weighted summary table\n", + "weighted_summary_data = {\n", + " 'Metric': [\n", + " 'Household count (weighted)',\n", + " 'Person count (weighted)',\n", + " 'Median AGI',\n", + " '75th percentile AGI',\n", + " '90th percentile AGI',\n", + " '95th percentile AGI',\n", + " 'Max AGI',\n", + " 'Households over $80k',\n", + " 'Households over $120k',\n", + " 'Households over $160k',\n", + " 'Households over $240k',\n", + " 'Total households with children',\n", + " 'Households with 1 child',\n", + " 'Households with 2 children',\n", + " 'Households with 3+ children',\n", + " 'Total children under 18',\n", + " 'Children under 4',\n", + " 'Children under 6',\n", + " 'Children ages 6-17'\n", + " ],\n", + " 'Value': [\n", + " f\"{household_count.sum():,.0f}\",\n", + " f\"{person_count.sum():,.0f}\",\n", + " f\"${agi.median():,.0f}\",\n", + " f\"${agi.quantile(0.75):,.0f}\",\n", + " f\"${agi.quantile(0.90):,.0f}\",\n", + " f\"${agi.quantile(0.95):,.0f}\",\n", + " f\"${agi.max():,.0f}\",\n", + " f\"{(agi > 80_000).sum():,.0f}\",\n", + " f\"{(agi > 120_000).sum():,.0f}\",\n", + " f\"{(agi > 160_000).sum():,.0f}\",\n", + " f\"{(agi > 240_000).sum():,.0f}\",\n", + " f\"{total_households_with_children:,.0f}\",\n", + " f\"{households_with_1_child:,.0f}\",\n", + " f\"{households_with_2_children:,.0f}\",\n", + " f\"{households_with_3plus_children:,.0f}\",\n", + " f\"{total_children:,.0f}\",\n", + " f\"{children_under_4:,.0f}\",\n", + " f\"{children_under_6:,.0f}\",\n", + " f\"{children_6_17:,.0f}\"\n", + " ]\n", + "}\n", + "\n", + "# Get unique counts for unweighted table\n", + "unique_households = df['household_id'].nunique()\n", + "unique_persons = len(df)\n", + "\n", + "# Create unweighted summary table\n", + "unweighted_summary_data = {\n", + " 'Metric': [\n", + " 'Number of households in dataset',\n", + " 'Number of persons in dataset',\n", + " 'Households with children (unweighted)',\n", + " 'Households with 1 child (unweighted)',\n", + " 'Households with 2 children (unweighted)',\n", + " 'Households with 3+ children (unweighted)',\n", + " 'Children under 18 (unweighted)',\n", + " 'Children under 4 (unweighted)',\n", + " 'Children under 6 (unweighted)',\n", + " 'Children ages 6-17 (unweighted)'\n", + " ],\n", + " 'Value': [\n", + " f\"{unique_households:,}\",\n", + " f\"{unique_persons:,}\",\n", + " f\"{(children_per_household['is_child'] > 0).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 1).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 2).sum():,}\",\n", + " f\"{(children_per_household['is_child'] >= 3).sum():,}\",\n", + " f\"{len(children_under_18_df):,}\",\n", + " f\"{len(children_under_4_df):,}\",\n", + " f\"{len(children_under_6_df):,}\",\n", + " f\"{len(children_6_17_df):,}\"\n", + " ]\n", + "}\n", + "\n", + "weighted_df = pd.DataFrame(weighted_summary_data)\n", + "unweighted_df = pd.DataFrame(unweighted_summary_data)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n", + "print(\"=\"*60)\n", + "print(weighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\")\n", + "print(\"=\"*60)\n", + "print(unweighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "# Save both tables\n", + "weighted_df.to_csv('ri_dataset_summary_weighted.csv', index=False)\n", + "unweighted_df.to_csv('ri_dataset_summary_unweighted.csv', index=False)\n", + "print(\"\\nSummaries saved to:\")\n", + "print(\" - ri_dataset_summary_weighted.csv\")\n", + "print(\" - ri_dataset_summary_unweighted.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ri_dataset_summary_unweighted.csv b/ri_dataset_summary_unweighted.csv new file mode 100644 index 0000000..8567486 --- /dev/null +++ b/ri_dataset_summary_unweighted.csv @@ -0,0 +1,11 @@ +Metric,Value +Number of households in dataset,"2,368" +Number of persons in dataset,"5,888" +Households with children (unweighted),616 +Households with 1 child (unweighted),305 +Households with 2 children (unweighted),201 +Households with 3+ children (unweighted),110 +Children under 18 (unweighted),"1,079" +Children under 4 (unweighted),254 +Children under 6 (unweighted),373 +Children ages 6-17 (unweighted),706 diff --git a/ri_dataset_summary_weighted.csv b/ri_dataset_summary_weighted.csv new file mode 100644 index 0000000..c4597a9 --- /dev/null +++ b/ri_dataset_summary_weighted.csv @@ -0,0 +1,20 @@ +Metric,Value +Household count (weighted),"388,376" +Person count (weighted),"1,106,390" +Median AGI,"$73,149" +75th percentile AGI,"$152,410" +90th percentile AGI,"$271,943" +95th percentile AGI,"$400,420" +Max AGI,"$1,740,956" +Households over $80k,"180,850" +Households over $120k,"128,983" +Households over $160k,"88,149" +Households over $240k,"47,853" +Total households with children,"121,492" +Households with 1 child,"66,113" +Households with 2 children,"37,589" +Households with 3+ children,"17,790" +Total children under 18,"203,860" +Children under 4,"41,525" +Children under 6,"61,595" +Children ages 6-17,"138,203" From f92b594e0a9d4d91ff4e69a4375dcd16245f0747 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Wed, 15 Oct 2025 09:45:38 -0400 Subject: [PATCH 2/3] agi deep dive --- ri_dataset_exploration.ipynb | 126 +++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/ri_dataset_exploration.ipynb b/ri_dataset_exploration.ipynb index b104e9c..0796319 100644 --- a/ri_dataset_exploration.ipynb +++ b/ri_dataset_exploration.ipynb @@ -367,6 +367,132 @@ "print(\" - ri_dataset_summary_weighted.csv\")\n", "print(\" - ri_dataset_summary_unweighted.csv\")" ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dzvou2zqia4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Median AGI by aggregation level:\n", + " Household level: $73,149\n", + " Tax unit level: $35,546\n", + " Person level: $47,592\n", + "\n", + "Total AGI for Rhode Island (by aggregation level):\n", + " Using tax unit level: $43,501,430,523\n", + " Using household level: $43,501,430,523\n", + " Using person level: $77,844,195,552\n" + ] + } + ], + "source": [ + "# Compare median AGI at different aggregation levels\n", + "agi_household = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "agi_tax_unit = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "agi_person = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"person\")\n", + "\n", + "print(\"Median AGI by aggregation level:\")\n", + "print(f\" Household level: ${agi_household.median():,.0f}\")\n", + "print(f\" Tax unit level: ${agi_tax_unit.median():,.0f}\")\n", + "print(f\" Person level: ${agi_person.median():,.0f}\")\n", + "\n", + "# Calculate total AGI - just sum the values (weights are already built into the arrays)\n", + "total_agi_tax_unit = agi_tax_unit.sum()\n", + "total_agi_household = agi_household.sum()\n", + "total_agi_person = agi_person.sum()\n", + "\n", + "print(f\"\\nTotal AGI for Rhode Island (by aggregation level):\")\n", + "print(f\" Using tax unit level: ${total_agi_tax_unit:,.0f}\")\n", + "print(f\" Using household level: ${total_agi_household:,.0f}\")\n", + "print(f\" Using person level: ${total_agi_person:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "gispfkxpnph", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AGI Component Breakdown (Tax Unit Level)\n", + "============================================================\n", + "\n", + "Total Income (Statewide):\n", + " Employment Income: $ 31,034,426,346\n", + " Self-Employment Income: $ 1,890,240,187\n", + " Capital Gains: $ 1,086,347,982\n", + " Qualified Dividends: $ 1,002,331,804\n", + " Interest Income: $ 670,462,607\n", + " Taxable Social Security: $ 1,123,366,624\n", + " Pension Income: $ 1,384,610,313\n", + " Adjusted Gross Income (AGI): $ 43,501,430,523\n", + "\n", + "Median Values:\n", + " Employment Income: $ 29,531\n", + " Self-Employment Income: $ 0\n", + " Capital Gains: $ 0\n", + " Qualified Dividends: $ 0\n", + " Interest Income: $ 0\n", + " Taxable Social Security: $ 0\n", + " Pension Income: $ 0\n", + " Adjusted Gross Income (AGI): $ 35,546\n", + "\n", + "Sum of income components: $ 38,191,785,863\n", + "AGI (for comparison): $ 43,501,430,523\n", + "Difference (potential missing income or deductions): $ -5,309,644,660\n" + ] + } + ], + "source": [ + "# Break down AGI components at tax unit level\n", + "print(\"AGI Component Breakdown (Tax Unit Level)\")\n", + "print(\"=\"*60)\n", + "\n", + "# Calculate key income components\n", + "employment_income = sim.calculate(\"employment_income\", period=2025, map_to=\"tax_unit\")\n", + "self_employment_income = sim.calculate(\"self_employment_income\", period=2025, map_to=\"tax_unit\")\n", + "capital_gains = sim.calculate(\"capital_gains\", period=2025, map_to=\"tax_unit\")\n", + "qualified_dividend_income = sim.calculate(\"qualified_dividend_income\", period=2025, map_to=\"tax_unit\")\n", + "interest_income = sim.calculate(\"interest_income\", period=2025, map_to=\"tax_unit\")\n", + "taxable_social_security = sim.calculate(\"taxable_social_security\", period=2025, map_to=\"tax_unit\")\n", + "pension_income = sim.calculate(\"pension_income\", period=2025, map_to=\"tax_unit\")\n", + "adjusted_gross_income = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "\n", + "print(\"\\nTotal Income (Statewide):\")\n", + "print(f\" Employment Income: ${employment_income.sum():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.sum():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.sum():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.sum():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.sum():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.sum():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.sum():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "\n", + "print(\"\\nMedian Values:\")\n", + "print(f\" Employment Income: ${employment_income.median():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.median():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.median():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.median():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.median():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.median():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.median():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.median():>15,.0f}\")\n", + "\n", + "# Calculate sum of components to compare with AGI\n", + "total_components = (employment_income + self_employment_income + capital_gains + \n", + " qualified_dividend_income + interest_income + taxable_social_security + pension_income)\n", + "print(f\"\\nSum of income components: ${total_components.sum():>15,.0f}\")\n", + "print(f\"AGI (for comparison): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "print(f\"Difference (potential missing income or deductions): ${(total_components.sum() - adjusted_gross_income.sum()):>15,.0f}\")" + ] } ], "metadata": { From 167514454c8c259f5e7065485236a477eca7f02e Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Wed, 26 Nov 2025 16:45:34 -0500 Subject: [PATCH 3/3] Rhode Island dataset exploration Fixes #87 --- .../Congressional-Hackathon-2025 | 1 + ri_dataset_exploration.ipynb | 197 ++++++++++-------- ri_dataset_summary_unweighted.csv | 20 +- ri_dataset_summary_weighted.csv | 38 ++-- 4 files changed, 137 insertions(+), 119 deletions(-) create mode 160000 obbba_district_impacts/Congressional-Hackathon-2025 diff --git a/obbba_district_impacts/Congressional-Hackathon-2025 b/obbba_district_impacts/Congressional-Hackathon-2025 new file mode 160000 index 0000000..3f6d05e --- /dev/null +++ b/obbba_district_impacts/Congressional-Hackathon-2025 @@ -0,0 +1 @@ +Subproject commit 3f6d05e76400c6e396a3a4eddd34a7b3f6919fc3 diff --git a/ri_dataset_exploration.ipynb b/ri_dataset_exploration.ipynb index 0796319..3e656ea 100644 --- a/ri_dataset_exploration.ipynb +++ b/ri_dataset_exploration.ipynb @@ -2,10 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "id": "be1cea7a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\dtsax\\envs\\pe\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us import Microsimulation\n", "import pandas as pd\n", @@ -14,10 +23,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "id": "0d21b774", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n" + ] + } + ], "source": [ "# Load RI dataset\n", "sim = Microsimulation(dataset=\"hf://policyengine/test/RI.h5\")" @@ -25,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "id": "1870e7ac", "metadata": {}, "outputs": [ @@ -33,9 +50,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of households in dataset: 2,368\n", - "Household count (mapped): 388,376\n", - "Person count (mapped): 1,106,390\n" + "Number of households in dataset: 8,617\n", + "Household count (mapped): 401,236\n", + "Person count (mapped): 1,117,161\n" ] } ], @@ -52,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "id": "f0c79a50", "metadata": {}, "outputs": [ @@ -61,17 +78,17 @@ "output_type": "stream", "text": [ "Income distribution:\n", - " Median AGI: $73,149\n", - " 75th percentile: $152,410\n", - " 90th percentile: $271,943\n", - " 95th percentile: $400,420\n", - " Max AGI: $1,740,956\n", + " Median AGI: $79,994\n", + " 75th percentile: $168,598\n", + " 90th percentile: $405,000\n", + " 95th percentile: $518,164\n", + " Max AGI: $2,600,478\n", "\n", "Households by income threshold:\n", - " Households over $80k: 180,850.09423405278\n", - " Households over $120k: 128,983.09166995375\n", - " Households over $160k: 88,148.79416422347\n", - " Households over $240k: 47,853.16035188042\n" + " Households over $80k: 200,330.7008952641\n", + " Households over $120k: 146,947.59684899804\n", + " Households over $160k: 110,723.5024763195\n", + " Households over $240k: 72,041.0344688301\n" ] } ], @@ -93,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "id": "71b548db", "metadata": {}, "outputs": [ @@ -103,10 +120,10 @@ "text": [ "\n", "Households with children (weighted):\n", - " Total households with children: 121,492\n", - " Households with 1 child: 66,113\n", - " Households with 2 children: 37,589\n", - " Households with 3+ children: 17,790\n" + " Total households with children: 122,610\n", + " Households with 1 child: 65,074\n", + " Households with 2 children: 38,411\n", + " Households with 3+ children: 19,126\n" ] } ], @@ -144,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "id": "a215302f", "metadata": {}, "outputs": [ @@ -154,23 +171,23 @@ "text": [ "\n", "Children by age:\n", - " Total children under 18: 203,860\n", - " Children under 4: 41,525\n", - " Children under 6: 61,595\n", - " Children ages 6-17: 138,203\n", + " Total children under 18: 206,993\n", + " Children under 4: 43,318\n", + " Children under 6: 64,240\n", + " Children ages 6-17: 138,628\n", "\n", "Sample of children under 4:\n", " household_id tax_unit_id person_id age\n", - "7 2730011 5 7730007 1.0\n", - "16 2730006 10 7730016 2.0\n", - "37 2730018 26 7730037 3.0\n", - "83 2730035 55 7730083 0.0\n", - "101 2730041 66 7730101 3.0\n", - "102 2730041 66 7730102 2.0\n", - "103 2730041 66 7730103 0.0\n", - "108 2730043 69 7730108 1.0\n", - "111 2730045 71 7730111 1.0\n", - "115 2730045 71 7730115 3.0\n" + "27 6825009 12 11825027 3.0\n", + "112 6825079 54 11825112 1.0\n", + "140 6825054 69 11825140 2.0\n", + "143 6825055 70 11825143 2.0\n", + "146 6825056 71 11825146 1.0\n", + "173 6825065 80 11825173 2.0\n", + "174 6825065 80 11825174 0.0\n", + "200 6825076 96 11825200 0.0\n", + "224 6825085 109 11825224 2.0\n", + "292 6825109 145 11825292 3.0\n" ] } ], @@ -210,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "id": "9468033e", "metadata": {}, "outputs": [ @@ -223,41 +240,41 @@ "RI DATASET SUMMARY - WEIGHTED (Population Estimates)\n", "============================================================\n", " Metric Value\n", - " Household count (weighted) 388,376\n", - " Person count (weighted) 1,106,390\n", - " Median AGI $73,149\n", - " 75th percentile AGI $152,410\n", - " 90th percentile AGI $271,943\n", - " 95th percentile AGI $400,420\n", - " Max AGI $1,740,956\n", - " Households over $80k 180,850\n", - " Households over $120k 128,983\n", - " Households over $160k 88,149\n", - " Households over $240k 47,853\n", - "Total households with children 121,492\n", - " Households with 1 child 66,113\n", - " Households with 2 children 37,589\n", - " Households with 3+ children 17,790\n", - " Total children under 18 203,860\n", - " Children under 4 41,525\n", - " Children under 6 61,595\n", - " Children ages 6-17 138,203\n", + " Household count (weighted) 401,236\n", + " Person count (weighted) 1,117,161\n", + " Median AGI $79,994\n", + " 75th percentile AGI $168,598\n", + " 90th percentile AGI $405,000\n", + " 95th percentile AGI $518,164\n", + " Max AGI $2,600,478\n", + " Households over $80k 200,331\n", + " Households over $120k 146,948\n", + " Households over $160k 110,724\n", + " Households over $240k 72,041\n", + "Total households with children 122,610\n", + " Households with 1 child 65,074\n", + " Households with 2 children 38,411\n", + " Households with 3+ children 19,126\n", + " Total children under 18 206,993\n", + " Children under 4 43,318\n", + " Children under 6 64,240\n", + " Children ages 6-17 138,628\n", "============================================================\n", "\n", "============================================================\n", "RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\n", "============================================================\n", - " Metric Value\n", - " Number of households in dataset 2,368\n", - " Number of persons in dataset 5,888\n", - " Households with children (unweighted) 616\n", - " Households with 1 child (unweighted) 305\n", - " Households with 2 children (unweighted) 201\n", - "Households with 3+ children (unweighted) 110\n", - " Children under 18 (unweighted) 1,079\n", - " Children under 4 (unweighted) 254\n", - " Children under 6 (unweighted) 373\n", - " Children ages 6-17 (unweighted) 706\n", + " Metric Value\n", + " Number of households in dataset 8,617\n", + " Number of persons in dataset 26,217\n", + " Households with children (unweighted) 3,717\n", + " Households with 1 child (unweighted) 1,427\n", + " Households with 2 children (unweighted) 1,438\n", + "Households with 3+ children (unweighted) 852\n", + " Children under 18 (unweighted) 7,257\n", + " Children under 4 (unweighted) 1,221\n", + " Children under 6 (unweighted) 1,975\n", + " Children ages 6-17 (unweighted) 5,282\n", "============================================================\n", "\n", "Summaries saved to:\n", @@ -370,7 +387,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "dzvou2zqia4", "metadata": {}, "outputs": [ @@ -379,14 +396,14 @@ "output_type": "stream", "text": [ "Median AGI by aggregation level:\n", - " Household level: $73,149\n", - " Tax unit level: $35,546\n", - " Person level: $47,592\n", + " Household level: $79,994\n", + " Tax unit level: $38,552\n", + " Person level: $49,057\n", "\n", "Total AGI for Rhode Island (by aggregation level):\n", - " Using tax unit level: $43,501,430,523\n", - " Using household level: $43,501,430,523\n", - " Using person level: $77,844,195,552\n" + " Using tax unit level: $57,748,447,798\n", + " Using household level: $57,748,447,798\n", + " Using person level: $122,937,416,952\n" ] } ], @@ -414,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "id": "gispfkxpnph", "metadata": {}, "outputs": [ @@ -426,28 +443,28 @@ "============================================================\n", "\n", "Total Income (Statewide):\n", - " Employment Income: $ 31,034,426,346\n", - " Self-Employment Income: $ 1,890,240,187\n", - " Capital Gains: $ 1,086,347,982\n", - " Qualified Dividends: $ 1,002,331,804\n", - " Interest Income: $ 670,462,607\n", - " Taxable Social Security: $ 1,123,366,624\n", - " Pension Income: $ 1,384,610,313\n", - " Adjusted Gross Income (AGI): $ 43,501,430,523\n", + " Employment Income: $ 41,313,375,713\n", + " Self-Employment Income: $ 1,895,414,487\n", + " Capital Gains: $ 4,284,511,812\n", + " Qualified Dividends: $ 998,165,965\n", + " Interest Income: $ 674,707,426\n", + " Taxable Social Security: $ 1,115,904,003\n", + " Pension Income: $ 1,507,391,122\n", + " Adjusted Gross Income (AGI): $ 57,748,447,798\n", "\n", "Median Values:\n", - " Employment Income: $ 29,531\n", + " Employment Income: $ 32,484\n", " Self-Employment Income: $ 0\n", " Capital Gains: $ 0\n", " Qualified Dividends: $ 0\n", " Interest Income: $ 0\n", " Taxable Social Security: $ 0\n", " Pension Income: $ 0\n", - " Adjusted Gross Income (AGI): $ 35,546\n", + " Adjusted Gross Income (AGI): $ 38,552\n", "\n", - "Sum of income components: $ 38,191,785,863\n", - "AGI (for comparison): $ 43,501,430,523\n", - "Difference (potential missing income or deductions): $ -5,309,644,660\n" + "Sum of income components: $ 51,789,470,528\n", + "AGI (for comparison): $ 57,748,447,798\n", + "Difference (potential missing income or deductions): $ -5,958,977,270\n" ] } ], diff --git a/ri_dataset_summary_unweighted.csv b/ri_dataset_summary_unweighted.csv index 8567486..ab9b7b0 100644 --- a/ri_dataset_summary_unweighted.csv +++ b/ri_dataset_summary_unweighted.csv @@ -1,11 +1,11 @@ Metric,Value -Number of households in dataset,"2,368" -Number of persons in dataset,"5,888" -Households with children (unweighted),616 -Households with 1 child (unweighted),305 -Households with 2 children (unweighted),201 -Households with 3+ children (unweighted),110 -Children under 18 (unweighted),"1,079" -Children under 4 (unweighted),254 -Children under 6 (unweighted),373 -Children ages 6-17 (unweighted),706 +Number of households in dataset,"8,617" +Number of persons in dataset,"26,217" +Households with children (unweighted),"3,717" +Households with 1 child (unweighted),"1,427" +Households with 2 children (unweighted),"1,438" +Households with 3+ children (unweighted),852 +Children under 18 (unweighted),"7,257" +Children under 4 (unweighted),"1,221" +Children under 6 (unweighted),"1,975" +Children ages 6-17 (unweighted),"5,282" diff --git a/ri_dataset_summary_weighted.csv b/ri_dataset_summary_weighted.csv index c4597a9..7822171 100644 --- a/ri_dataset_summary_weighted.csv +++ b/ri_dataset_summary_weighted.csv @@ -1,20 +1,20 @@ Metric,Value -Household count (weighted),"388,376" -Person count (weighted),"1,106,390" -Median AGI,"$73,149" -75th percentile AGI,"$152,410" -90th percentile AGI,"$271,943" -95th percentile AGI,"$400,420" -Max AGI,"$1,740,956" -Households over $80k,"180,850" -Households over $120k,"128,983" -Households over $160k,"88,149" -Households over $240k,"47,853" -Total households with children,"121,492" -Households with 1 child,"66,113" -Households with 2 children,"37,589" -Households with 3+ children,"17,790" -Total children under 18,"203,860" -Children under 4,"41,525" -Children under 6,"61,595" -Children ages 6-17,"138,203" +Household count (weighted),"401,236" +Person count (weighted),"1,117,161" +Median AGI,"$79,994" +75th percentile AGI,"$168,598" +90th percentile AGI,"$405,000" +95th percentile AGI,"$518,164" +Max AGI,"$2,600,478" +Households over $80k,"200,331" +Households over $120k,"146,948" +Households over $160k,"110,724" +Households over $240k,"72,041" +Total households with children,"122,610" +Households with 1 child,"65,074" +Households with 2 children,"38,411" +Households with 3+ children,"19,126" +Total children under 18,"206,993" +Children under 4,"43,318" +Children under 6,"64,240" +Children ages 6-17,"138,628"