diff --git a/us/salt/non_salt_deductions_2026_joint_filers.csv b/us/salt/non_salt_deductions_2026_joint_filers.csv
new file mode 100644
index 0000000..87b88e9
--- /dev/null
+++ b/us/salt/non_salt_deductions_2026_joint_filers.csv
@@ -0,0 +1,10 @@
+Wages and salaries (+/- 10%),Children,Mean Deductions,Median Deductions,90th Percentile Deductions,Filers (Weighted Count),Records in ECPS data
+"$250,000",All,11336.08,0.00,24681.37,821040.38,982
+"$250,000",0,2641.99,0.00,0.00,129337.88,480
+"$250,000",2,28121.12,3556.79,24825.72,171572.94,206
+"$500,000",All,62069.63,0.00,57143.67,695217.75,469
+"$500,000",0,71487.45,0.00,57355.90,426164.66,227
+"$500,000",2,40970.18,0.00,0.00,82024.90,82
+"$1,000,000",All,25125.78,9007.26,42805.29,299692.81,193
+"$1,000,000",0,7938.19,7742.64,24296.01,142421.59,86
+"$1,000,000",2,37298.07,49037.14,49591.25,32978.49,30
diff --git a/us/salt/non_salt_deductions_by_income.ipynb b/us/salt/non_salt_deductions_by_income.ipynb
new file mode 100644
index 0000000..81e656f
--- /dev/null
+++ b/us/salt/non_salt_deductions_by_income.ipynb
@@ -0,0 +1,683 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from policyengine_us import Microsimulation\n",
+    "import pandas as pd\n",
+    "import microdf as mdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bf9ef09102bc43d7b2451fe2aefb1162",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "enhanced_cps_2024.h5:   0%|          | 0.00/107M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Build the simulation; the Enhanced CPS dataset is downloaded on first use.\n",
+    "sim = Microsimulation()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "YEAR = 2026\n",
+    "\n",
+    "# Tax-unit-level dataframe for 2026. Here non_salt_deductions is taken to be\n",
+    "# total itemized deductions net of the SALT deduction (variable names follow\n",
+    "# policyengine-us conventions).\n",
+    "df = pd.DataFrame({\n",
+    "    'employment_income': sim.calculate('employment_income', map_to='tax_unit', period=YEAR).values,\n",
+    "    'filing_status': sim.calculate('filing_status', period=YEAR).values,\n",
+    "    'tax_unit_children': sim.calculate('tax_unit_children', period=YEAR).values,\n",
+    "    'non_salt_deductions': (\n",
+    "        sim.calculate('itemized_taxable_income_deductions', period=YEAR)\n",
+    "        - sim.calculate('salt_deduction', period=YEAR)\n",
+    "    ).values,\n",
+    "    'tax_unit_weight': sim.calculate('tax_unit_weight', period=YEAR).values,\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DataFrame 'df' and module 'mdf' are assumed to be loaded. Proceeding with CSV generation...\n",
+      "Successfully generated CSV: non_salt_itemized_deductions_2026_joint_filers.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# IMPORTANT:\n",
+    "# 1. Ensure your Pandas DataFrame 'df' is loaded with your ECPS data BEFORE running this script.\n",
+    "# 2. Ensure your 'mdf' module (with the MicroSeries class) is imported and available.\n",
+    "\n",
+    "def get_stats_for_row(min_wages, max_wages, filing_status_val, children_filter_val, df_data, mdf_module):\n",
+    "    \"\"\"\n",
+    "    Calculates mean and median non-SALT deductions, the weighted number of filers,\n",
+    "    and the record count for a single row of the output table.\n",
+    "    \"\"\"\n",
+    "    children_col_name = 'tax_unit_children'\n",
+    "\n",
+    "    # Filter on employment income and filing status\n",
+    "    filtered_df = df_data[\n",
+    "        df_data['employment_income'].between(min_wages, max_wages) &\n",
+    "        (df_data['filing_status'] == filing_status_val)\n",
+    "    ]\n",
+    "\n",
+    "    # Children filter (if requested)\n",
+    "    if children_filter_val != \"All\":\n",
+    "        filtered_df = filtered_df[filtered_df[children_col_name] == children_filter_val]\n",
+    "\n",
+    "    total_filers = filtered_df['tax_unit_weight'].sum()\n",
+    "\n",
+    "    # Weighted mean deductions\n",
+    "    mean_deductions = (filtered_df['non_salt_deductions'] * filtered_df['tax_unit_weight']).sum() / total_filers if total_filers > 0 else 0\n",
+    "\n",
+    "    # Calculate median deductions\n",
+    "    median_deductions = 0\n",
+    "    if not filtered_df.empty and total_filers > 0:\n",
+    "        # Ensure data passed to MicroSeries is not empty and weights are valid\n",
+    "        series_data = filtered_df['non_salt_deductions']\n",
+    "        series_weights = filtered_df['tax_unit_weight']\n",
+    "        if not series_data.empty and not series_weights.empty and series_weights.sum() > 0:\n",
+    "            median_deductions = mdf_module.MicroSeries(series_data, weights=series_weights).median()\n",
+    "\n",
+    "    # Number of records\n",
+    "    num_records = len(filtered_df)\n",
+    "\n",
+    "    return mean_deductions, median_deductions, total_filers, num_records\n",
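+    "\n",
+    "# Commented example (toy data, for illustration only; microdf supplies the\n",
+    "# weighted statistics). With 100 units at $0 and 300 units at $20,000:\n",
+    "#   toy = pd.DataFrame({'employment_income': [250000.0, 260000.0],\n",
+    "#                       'filing_status': ['JOINT', 'JOINT'],\n",
+    "#                       'tax_unit_children': [0, 2],\n",
+    "#                       'non_salt_deductions': [0.0, 20000.0],\n",
+    "#                       'tax_unit_weight': [100.0, 300.0]})\n",
+    "#   get_stats_for_row(225000, 275000, 'JOINT', 'All', toy, mdf)\n",
+    "#   # -> (15000.0, 20000.0, 400.0, 2): weighted mean 15000, weighted median\n",
+    "#   # 20000 (75% of the weight sits there), 400 weighted filers, 2 records.\n",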
+    "\n",
+    "def generate_deductions_csv(output_filename=\"non_salt_deductions_2026_joint_filers.csv\"):\n",
+    "    \"\"\"\n",
+    "    Generates and saves a CSV file for non-SALT itemized deductions.\n",
+    "    This function assumes 'df' (Pandas DataFrame) and 'mdf' (module with MicroSeries)\n",
+    "    are available in the calling scope.\n",
+    "\n",
+    "    Args:\n",
+    "        output_filename (str): The name of the CSV file to be created.\n",
+    "    \"\"\"\n",
+    "    # --- User Configuration ---\n",
+    "    # Verify these column names match your DataFrame 'df'\n",
+    "    employment_income_col = 'employment_income'\n",
+    "    filing_status_col = 'filing_status'\n",
+    "    children_column_name = 'tax_unit_children'\n",
+    "    non_salt_deductions_col = 'non_salt_deductions'\n",
+    "    tax_unit_weight_col = 'tax_unit_weight'\n",
+    "\n",
+    "    # Verify the identifier for joint filers in your 'filing_status' column\n",
+    "    filing_status_joint = 'JOINT'  # Adjust if your identifier differs (e.g., 2 for joint)\n",
+    "    year = 2026  # Analysis year (for context; not written to the CSV)\n",
+    "    # --- End User Configuration ---\n",
+    "\n",
+    "    # Check that the required columns exist in 'df', which is expected to be\n",
+    "    # available globally (or passed in if this function is adapted).\n",
+    "    required_cols = [employment_income_col, filing_status_col, children_column_name,\n",
+    "                     non_salt_deductions_col, tax_unit_weight_col]\n",
+    "    missing_cols = [col for col in required_cols if col not in df.columns]\n",
+    "    if missing_cols:\n",
+    "        print(f\"Error: the following required columns are missing from 'df': {', '.join(missing_cols)}\")\n",
+    "        print(\"Ensure 'df' is loaded correctly and column names match before calling this function.\")\n",
+    "        return\n",
+    "\n",
+    "    # Check that mdf and mdf.MicroSeries are available\n",
+    "    if 'mdf' not in globals() or not hasattr(mdf, 'MicroSeries'):\n",
+    "        print(\"Error: the 'mdf' module or 'mdf.MicroSeries' is not available.\")\n",
+    "        print(\"Ensure the 'mdf' module is correctly imported and accessible.\")\n",
+    "        return\n",
+    "\n",
+    "    wage_configs = [\n",
+    "        (250000, 0.10),\n",
+    "        (500000, 0.10),\n",
+    "        (1000000, 0.10)\n",
+    "    ]\n",
+    "    children_categories = [\"All\", 0, 2]  # \"All\" means no filter on children\n",
+    "\n",
+    "    results_list = []\n",
+    "\n",
+    "    # Header for the CSV file\n",
+    "    csv_headers = [\n",
+    "        \"Wages and salaries (+/- 10%)\",\n",
+    "        \"Children\",  # Corresponds to the 'tax_unit_children' filter\n",
+    "        \"Mean Deductions\",\n",
+    "        \"Median Deductions\",\n",
+    "        \"Filers (Weighted Count)\",\n",
+    "        \"Records in ECPS data\"\n",
+    "    ]\n",
+    "\n",
+    "    for base_wage, pct_range in wage_configs:\n",
+    "        min_w = base_wage * (1 - pct_range)\n",
+    "        max_w = base_wage * (1 + pct_range)\n",
+    "        wage_label = f\"${base_wage:,.0f}\"\n",
+    "\n",
+    "        for children_cat in children_categories:\n",
+    "            children_label = str(children_cat)  # \"All\", \"0\", \"2\"\n",
+    "\n",
+    "            # Call get_stats_for_row, passing the globally available df and mdf\n",
+    "            mean_val, median_val, filers_val, records_val = get_stats_for_row(\n",
+    "                min_w, max_w, filing_status_joint, children_cat, df, mdf\n",
+    "            )\n",
+    "            results_list.append({\n",
+    "                csv_headers[0]: wage_label,\n",
+    "                csv_headers[1]: children_label,\n",
+    "                csv_headers[2]: mean_val,\n",
+    "                csv_headers[3]: median_val,\n",
+    "                csv_headers[4]: filers_val,\n",
+    "                csv_headers[5]: records_val\n",
+    "            })\n",
+    "\n",
+    "    # Create a DataFrame from the results and order columns to match the headers\n",
+    "    results_df = pd.DataFrame(results_list)\n",
+    "    if not results_df.empty:\n",
+    "        results_df = results_df[csv_headers]\n",
+    "\n",
+    "    # Save the DataFrame to a CSV file, formatting floats to 2 decimal places\n",
+    "    try:\n",
+    "        results_df.to_csv(output_filename, index=False, float_format='%.2f')\n",
+    "        print(f\"Successfully generated CSV: {output_filename}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error writing to CSV file {output_filename}: {e}\")\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    # --- SCRIPT EXECUTION STARTS HERE ---\n",
+    "\n",
+    "    # 1. LOAD YOUR DATAFRAME 'df' HERE:\n",
+    "    # Example:\n",
+    "    # try:\n",
+    "    #     df = pd.read_csv('path_to_your_ecps_data.csv')\n",
+    "    #     print(\"DataFrame 'df' loaded successfully.\")\n",
+    "    # except FileNotFoundError:\n",
+    "    #     print(\"Error: ECPS data CSV file not found. Set the correct path for 'df'.\")\n",
+    "    #     exit()\n",
+    "    # except Exception as e:\n",
+    "    #     print(f\"An error occurred while loading the DataFrame 'df': {e}\")\n",
+    "    #     exit()\n",
+    "\n",
+    "    # 2. IMPORT/DEFINE YOUR 'mdf' MODULE AND 'MicroSeries' HERE:\n",
+    "    # Example:\n",
+    "    # try:\n",
+    "    #     import microdf as mdf\n",
+    "    # except ImportError:\n",
+    "    #     print(\"Error: could not import 'mdf'. Ensure it is on your Python path.\")\n",
+    "    #     exit()\n",
+    "\n",
+    "    # --- PRE-RUN CHECKS ---\n",
+    "    if 'df' not in globals() or 'mdf' not in globals():\n",
+    "        print(\"CRITICAL ERROR: DataFrame 'df' or module 'mdf' is not loaded or defined.\")\n",
+    "        print(\"Load your data into 'df' and import 'mdf' before running this block;\")\n",
+    "        print(\"otherwise the script will fail inside generate_deductions_csv.\")\n",
+    "    else:\n",
+    "        print(\"DataFrame 'df' and module 'mdf' are assumed to be loaded. Proceeding with CSV generation...\")\n",
+    "        # Generate the CSV\n",
+    "        generate_deductions_csv(output_filename=\"non_salt_itemized_deductions_2026_joint_filers.csv\")\n"
+   ]
+  },
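+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The revised script in the next cell also reports a weighted 90th percentile. Where `MicroSeries.quantile` is unavailable it falls back to a manual weighted quantile; the sketch below (illustrative only, toy numbers) shows the logic: the weighted p-quantile is the smallest value whose cumulative weight reaches p of the total weight."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def weighted_quantile(values, weights, p):\n",
+    "    # Sort by value, accumulate weights, and return the first value whose\n",
+    "    # cumulative weight reaches p of the total.\n",
+    "    order = np.argsort(values)\n",
+    "    v = np.asarray(values, dtype=float)[order]\n",
+    "    w = np.asarray(weights, dtype=float)[order]\n",
+    "    cum = np.cumsum(w)\n",
+    "    return v[np.searchsorted(cum, p * cum[-1])]\n",
+    "\n",
+    "# Toy check: cumulative weights [3, 4, 5]; the 90% cutoff is 4.5, first\n",
+    "# reached at the last value, so the weighted p90 is 20000.\n",
+    "weighted_quantile([0, 10000, 20000], [3, 1, 1], 0.9)"
+   ]
+  },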
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CSV written to non_salt_deductions_2026_joint_filers.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# IMPORTANT:\n",
+    "# 1. Ensure your Pandas DataFrame 'df' is loaded with your ECPS data BEFORE running\n",
+    "#    this script, e.g. df = pd.read_csv('your_ecps_data.csv').\n",
+    "# 2. Ensure your 'mdf' module (with the MicroSeries class) is imported and available,\n",
+    "#    e.g. import microdf as mdf.\n",
+    "\n",
+    "\n",
+    "def get_stats_for_row(min_wages, max_wages, filing_status_val, children_filter_val, df_data, mdf_module):\n",
+    "    \"\"\"\n",
+    "    Calculates mean, median, and 90th-percentile non-SALT deductions, the weighted\n",
+    "    number of filers, and the record count for a single row of the output table.\n",
+    "\n",
+    "    Args:\n",
+    "        min_wages (float): Minimum employment income for filtering.\n",
+    "        max_wages (float): Maximum employment income for filtering.\n",
+    "        filing_status_val (str | int): Identifier for the filing status.\n",
+    "        children_filter_val (str | int): Number of children to filter by; \"All\" for no filter.\n",
+    "        df_data (pd.DataFrame): The input DataFrame with ECPS data. Must contain\n",
+    "            'employment_income', 'filing_status', 'tax_unit_children',\n",
+    "            'non_salt_deductions', and 'tax_unit_weight'.\n",
+    "        mdf_module: Module containing the MicroSeries class for weighted statistics.\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: (mean, median, p90, total_filers, num_records)\n",
+    "    \"\"\"\n",
+    "\n",
+    "    children_col_name = 'tax_unit_children'\n",
+    "\n",
+    "    # Income & filing-status filter\n",
+    "    filtered_df = df_data[\n",
+    "        df_data['employment_income'].between(min_wages, max_wages) &\n",
+    "        (df_data['filing_status'] == filing_status_val)\n",
+    "    ]\n",
+    "\n",
+    "    # Children filter (if requested)\n",
+    "    if children_filter_val != \"All\":\n",
+    "        filtered_df = filtered_df[filtered_df[children_col_name] == children_filter_val]\n",
+    "\n",
+    "    if filtered_df.empty:\n",
+    "        return 0, 0, 0, 0, 0\n",
+    "\n",
+    "    deductions = filtered_df['non_salt_deductions']\n",
+    "    weights = filtered_df['tax_unit_weight']\n",
+    "    total_filers = weights.sum()\n",
+    "\n",
+    "    # Weighted mean\n",
+    "    mean_val = (deductions * weights).sum() / total_filers if total_filers > 0 else 0\n",
+    "\n",
+    "    # Weighted median & 90th percentile\n",
+    "    median_val = 0\n",
+    "    p90_val = 0\n",
+    "    if total_filers > 0:\n",
+    "        ms = mdf_module.MicroSeries(deductions, weights=weights)\n",
+    "        median_val = ms.median()\n",
+    "        # Some MicroSeries versions expose a quantile method; fall back if absent\n",
+    "        if hasattr(ms, 'quantile'):\n",
+    "            p90_val = ms.quantile(0.9)\n",
+    "        else:  # Manual weighted quantile\n",
+    "            qdf = filtered_df[['non_salt_deductions', 'tax_unit_weight']].sort_values('non_salt_deductions')\n",
+    "            cum_w = qdf['tax_unit_weight'].cumsum()\n",
+    "            cutoff = 0.9 * total_filers\n",
+    "            p90_val = qdf.loc[cum_w >= cutoff, 'non_salt_deductions'].iloc[0]\n",
+    "\n",
+    "    num_records = len(filtered_df)\n",
+    "\n",
+    "    return mean_val, median_val, p90_val, total_filers, num_records\n",
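+    "\n",
+    "# Commented self-check (toy numbers, illustrative only). With deductions\n",
+    "# [0, 10000, 20000] and weights [3, 1, 1] among in-band joint filers:\n",
+    "#   get_stats_for_row(...) -> (6000.0, 0.0, 20000.0, 5.0, 3)\n",
+    "# i.e. weighted mean 6000, median 0 (60% of the weight sits at 0, which is\n",
+    "# why several output rows pair a large mean with a $0 median), and p90 20000.\n",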
+    "\n",
+    "\n",
+    "def generate_deductions_csv(output_filename=\"non_salt_deductions_2026_joint_filers.csv\"):\n",
+    "    \"\"\"\n",
+    "    Generates a CSV summarizing non-SALT itemized deductions by wage band and child\n",
+    "    category, including mean, median, and 90th-percentile values.\n",
+    "    Assumes global variables 'df' (DataFrame) and 'mdf' (module with MicroSeries).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Column and constant configuration\n",
+    "    employment_income_col = 'employment_income'\n",
+    "    filing_status_col = 'filing_status'\n",
+    "    children_column_name = 'tax_unit_children'\n",
+    "    non_salt_deductions_col = 'non_salt_deductions'\n",
+    "    tax_unit_weight_col = 'tax_unit_weight'\n",
+    "    filing_status_joint = 'JOINT'  # Adjust if different in your data\n",
+    "\n",
+    "    required_cols = [employment_income_col, filing_status_col, children_column_name,\n",
+    "                     non_salt_deductions_col, tax_unit_weight_col]\n",
+    "    missing_cols = [c for c in required_cols if c not in df.columns]\n",
+    "    if missing_cols:\n",
+    "        raise ValueError(f\"Missing required columns in df: {', '.join(missing_cols)}\")\n",
+    "\n",
+    "    if 'mdf' not in globals() or not hasattr(mdf, 'MicroSeries'):\n",
+    "        raise ImportError(\"'mdf.MicroSeries' is not available. Ensure the module is imported.\")\n",
+    "\n",
+    "    # Wage bands (±10%) centered at $250k, $500k, and $1M\n",
+    "    wage_configs = [\n",
+    "        (250000, 0.10),\n",
+    "        (500000, 0.10),\n",
+    "        (1000000, 0.10)\n",
+    "    ]\n",
+    "    children_categories = [\"All\", 0, 2]\n",
+    "\n",
+    "    results = []\n",
+    "    headers = [\n",
+    "        \"Wages and salaries (+/- 10%)\",\n",
+    "        \"Children\",\n",
+    "        \"Mean Deductions\",\n",
+    "        \"Median Deductions\",\n",
+    "        \"90th Percentile Deductions\",\n",
+    "        \"Filers (Weighted Count)\",\n",
+    "        \"Records in ECPS data\"\n",
+    "    ]\n",
+    "\n",
+    "    for base_w, pct in wage_configs:\n",
+    "        min_w, max_w = base_w * (1 - pct), base_w * (1 + pct)\n",
+    "        wage_lbl = f\"${base_w:,.0f}\"\n",
+    "\n",
+    "        for child_cat in children_categories:\n",
+    "            mean_v, med_v, p90_v, filers_v, rec_v = get_stats_for_row(\n",
+    "                min_w, max_w, filing_status_joint, child_cat, df, mdf\n",
+    "            )\n",
+    "            results.append({\n",
+    "                headers[0]: wage_lbl,\n",
+    "                headers[1]: str(child_cat),\n",
+    "                headers[2]: mean_v,\n",
+    "                headers[3]: med_v,\n",
+    "                headers[4]: p90_v,\n",
+    "                headers[5]: filers_v,\n",
+    "                headers[6]: rec_v\n",
+    "            })\n",
+    "\n",
+    "    pd.DataFrame(results)[headers].to_csv(\n",
+    "        output_filename, index=False, float_format='%.2f'\n",
+    "    )\n",
+    "    print(f\"CSV written to {output_filename}\")\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Quick sanity checks (expects df & mdf loaded in the environment)\n",
+    "    if 'df' not in globals() or 'mdf' not in globals():\n",
+    "        raise RuntimeError(\"Load DataFrame 'df' and module 'mdf' before running.\")\n",
+    "\n",
+    "    generate_deductions_csv()"
+   ]
+  },
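+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check (a sketch, not executed here): reload the generated CSV and confirm that, within each wage band, the weighted filer count of every child subgroup is bounded by the \"All\" row."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pd.read_csv(\"non_salt_deductions_2026_joint_filers.csv\")\n",
+    "band_col = \"Wages and salaries (+/- 10%)\"\n",
+    "# The 'All' row should carry at least as much weight as any single subgroup row.\n",
+    "for band, grp in out.groupby(band_col):\n",
+    "    all_w = grp.loc[grp[\"Children\"] == \"All\", \"Filers (Weighted Count)\"].iloc[0]\n",
+    "    assert (grp[\"Filers (Weighted Count)\"] <= all_w).all(), band"
+   ]
+  },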
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "policyengine",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/us/salt/non_salt_itemized_deductions_2026_joint_filers.csv b/us/salt/non_salt_itemized_deductions_2026_joint_filers.csv
new file mode 100644
index 0000000..6591379
--- /dev/null
+++ b/us/salt/non_salt_itemized_deductions_2026_joint_filers.csv
@@ -0,0 +1,10 @@
+Wages and salaries (+/- 10%),Children,Mean Deductions,Median Deductions,Filers (Weighted Count),Records in ECPS data
+"$250,000",All,11336.08,0.00,821040.38,982
+"$250,000",0,2641.99,0.00,129337.88,480
+"$250,000",2,28121.12,3556.79,171572.94,206
+"$500,000",All,62069.63,0.00,695217.75,469
+"$500,000",0,71487.45,0.00,426164.66,227
+"$500,000",2,40970.18,0.00,82024.90,82
+"$1,000,000",All,25125.78,9007.26,299692.81,193
+"$1,000,000",0,7938.19,7742.64,142421.59,86
+"$1,000,000",2,37298.07,49037.14,32978.49,30