diff --git a/cps_stage2/dataprep.py b/cps_stage2/dataprep.py index 047fa171..7c60cc96 100644 --- a/cps_stage2/dataprep.py +++ b/cps_stage2/dataprep.py @@ -24,10 +24,16 @@ def target(target_val, pop, factor, value): weights * factors["APOPSNR"][year], weights * factors["ARETS"][year], ) - single_returns = np.where((data["mars"] == 1) & (data["filer"] == 1), s006, 0) - joint_returns = np.where((data["mars"] == 2) & (data["filer"] == 1), s006, 0) + single_returns = np.where( + (data["mars"] == 1) & (data["filer"] == 1), s006, 0 + ) + joint_returns = np.where( + (data["mars"] == 2) & (data["filer"] == 1), s006, 0 + ) hh_returns = np.where((data["mars"] == 4) & (data["filer"] == 1), s006, 0) - returns_w_ss = np.where((data["e02400"] > 0) & (data["filer"] == 1), s006, 0) + returns_w_ss = np.where( + (data["e02400"] > 0) & (data["filer"] == 1), s006, 0 + ) dep_exemptions = ( np.where(data["mars"] == 2, data["XTOT"] - 2, data["XTOT"] - 1) * s006 ) @@ -45,27 +51,39 @@ def target(target_val, pop, factor, value): # wage distribution wage1 = np.where(data["agi"] <= 10000, data["e00200"], 0) * s006 wage2 = ( - np.where((data["agi"] > 10000) & (data["agi"] <= 20000), data["e00200"], 0) + np.where( + (data["agi"] > 10000) & (data["agi"] <= 20000), data["e00200"], 0 + ) * s006 ) wage3 = ( - np.where((data["agi"] > 20000) & (data["agi"] <= 30000), data["e00200"], 0) + np.where( + (data["agi"] > 20000) & (data["agi"] <= 30000), data["e00200"], 0 + ) * s006 ) wage4 = ( - np.where((data["agi"] > 30000) & (data["agi"] <= 40000), data["e00200"], 0) + np.where( + (data["agi"] > 30000) & (data["agi"] <= 40000), data["e00200"], 0 + ) * s006 ) wage5 = ( - np.where((data["agi"] > 40000) & (data["agi"] <= 50000), data["e00200"], 0) + np.where( + (data["agi"] > 40000) & (data["agi"] <= 50000), data["e00200"], 0 + ) * s006 ) wage6 = ( - np.where((data["agi"] > 50000) & (data["agi"] <= 75000), data["e00200"], 0) + np.where( + (data["agi"] > 50000) & (data["agi"] <= 75000), data["e00200"], 0 + ) * s006 ) wage7 = ( - np.where((data["agi"] > 75000) & (data["agi"] <= 100_000), data["e00200"], 0) + np.where( + (data["agi"] > 75000) & (data["agi"] <= 100_000), data["e00200"], 0 + ) * s006 ) wage8 = np.where(data["agi"] > 100_000, data["e00200"], 0) * s006 @@ -118,15 +136,27 @@ def target(target_val, pop, factor, value): target_name = "SS_return" rhs_vars["returns_w_ss"] = targets[year][target_name] - returns_w_ss.sum() target_name = "Dep_return" - rhs_vars["dep_exemptions"] = targets[year][target_name] - dep_exemptions.sum() - rhs_vars["interest"] = target(targets[year]["INTS"], apopn, aints, interest.sum()) - rhs_vars["dividend"] = target(targets[year]["DIVS"], apopn, adivs, dividend.sum()) + rhs_vars["dep_exemptions"] = ( + targets[year][target_name] - dep_exemptions.sum() + ) + rhs_vars["interest"] = target( + targets[year]["INTS"], apopn, aints, interest.sum() + ) + rhs_vars["dividend"] = target( + targets[year]["DIVS"], apopn, adivs, dividend.sum() + ) rhs_vars["biz_income"] = target( targets[year]["SCHCI"], apopn, aschci, biz_income.sum() ) - rhs_vars["biz_loss"] = target(targets[year]["SCHCL"], apopn, aschcl, biz_loss.sum()) - rhs_vars["cap_gain"] = target(targets[year]["CGNS"], apopn, acgns, cap_gain.sum()) - rhs_vars["pension"] = target(targets[year]["Pension"], apopn, atxpy, pension.sum()) + rhs_vars["biz_loss"] = target( + targets[year]["SCHCL"], apopn, aschcl, biz_loss.sum() + ) + rhs_vars["cap_gain"] = target( + targets[year]["CGNS"], apopn, acgns, cap_gain.sum() + ) + rhs_vars["pension"] = target( + 
targets[year]["Pension"], apopn, atxpy, pension.sum() + ) rhs_vars["sch_e_income"] = target( targets[year]["SCHEI"], apopn, aschei, sch_e_income.sum() ) @@ -136,15 +166,33 @@ def target(target_val, pop, factor, value): rhs_vars["ss_income"] = target( targets[year]["SS"], apopsnr, asocsec, ss_income.sum() ) - rhs_vars["ucomp"] = target(targets[year]["UCOMP"], apopn, aucomp, ucomp.sum()) - rhs_vars["wage1"] = target(targets[year]["wage1"], apopn, awage, wage1.sum()) - rhs_vars["wage2"] = target(targets[year]["wage2"], apopn, awage, wage2.sum()) - rhs_vars["wage3"] = target(targets[year]["wage3"], apopn, awage, wage3.sum()) - rhs_vars["wage4"] = target(targets[year]["wage4"], apopn, awage, wage4.sum()) - rhs_vars["wage5"] = target(targets[year]["wage5"], apopn, awage, wage5.sum()) - rhs_vars["wage6"] = target(targets[year]["wage6"], apopn, awage, wage6.sum()) - rhs_vars["wage7"] = target(targets[year]["wage7"], apopn, awage, wage7.sum()) - rhs_vars["wage8"] = target(targets[year]["wage8"], apopn, awage, wage8.sum()) + rhs_vars["ucomp"] = target( + targets[year]["UCOMP"], apopn, aucomp, ucomp.sum() + ) + rhs_vars["wage1"] = target( + targets[year]["wage1"], apopn, awage, wage1.sum() + ) + rhs_vars["wage2"] = target( + targets[year]["wage2"], apopn, awage, wage2.sum() + ) + rhs_vars["wage3"] = target( + targets[year]["wage3"], apopn, awage, wage3.sum() + ) + rhs_vars["wage4"] = target( + targets[year]["wage4"], apopn, awage, wage4.sum() + ) + rhs_vars["wage5"] = target( + targets[year]["wage5"], apopn, awage, wage5.sum() + ) + rhs_vars["wage6"] = target( + targets[year]["wage6"], apopn, awage, wage6.sum() + ) + rhs_vars["wage7"] = target( + targets[year]["wage7"], apopn, awage, wage7.sum() + ) + rhs_vars["wage8"] = target( + targets[year]["wage8"], apopn, awage, wage8.sum() + ) model_vars = [ "single_returns", diff --git a/cps_stage2/stage2.py b/cps_stage2/stage2.py index 21c6c106..8ce2a26f 100644 --- a/cps_stage2/stage2.py +++ b/cps_stage2/stage2.py @@ -63,7 +63,9 @@ def main(): for year in range(START_YEAR, END_YEAR + 1): try: factor_match = _factors[year].equals(CUR_FACTORS[year]) - target_match = stage_2_targets[f"{year}"].equals(CUR_TARGETS[f"{year}"]) + target_match = stage_2_targets[f"{year}"].equals( + CUR_TARGETS[f"{year}"] + ) if files_match and factor_match and target_match: print(f"Skipping {year}") skipped_years.append(year) diff --git a/createpuf.py b/createpuf.py index 2c010ece..552c7a0f 100644 --- a/createpuf.py +++ b/createpuf.py @@ -49,13 +49,18 @@ def dataprep(data): """ # we use a slightly modified version of mars for matching. 
# _mars = 1 if single, 3 if HoH, 2 any type of joint filer - data["_mars"] = np.where(data["mars"] == 1, 1, np.where(data["mars"] == 4, 3, 2)) + data["_mars"] = np.where( + data["mars"] == 1, 1, np.where(data["mars"] == 4, 3, 2) + ) data["const"] = 1 data["bil"] = np.maximum(0, data["e00900"]) data["fil"] = np.maximum(0, data["e02100"]) data["tpi"] = data[INC_VARS].sum(axis=1) data["wage_share"] = np.divide( - data["e00200"], data["tpi"], out=np.zeros(data.shape[0]), where=data["tpi"] != 0 + data["e00200"], + data["tpi"], + out=np.zeros(data.shape[0]), + where=data["tpi"] != 0, ) data["cap_inc"] = data[CAP_VARS].sum(axis=1) data["cap_share"] = np.divide( @@ -73,17 +78,23 @@ def dataprep(data): ), 0, ) - data["people"] = np.where(data["_mars"] == 2, data["depne"] + 2, data["depne"] + 1) + data["people"] = np.where( + data["_mars"] == 2, data["depne"] + 2, data["depne"] + 1 + ) data["people"] = np.minimum(5, data["people"]) wage_flag = (data["e00200"] != 0).astype(int) # self employment flag - se_flag = np.logical_or(data["e00900"] != 0, data["e02100"] != 0).astype(int) + se_flag = np.logical_or(data["e00900"] != 0, data["e02100"] != 0).astype( + int + ) # income source flags data["se1"] = np.where(wage_flag & ~se_flag, 1, 0) data["se2"] = np.where(~wage_flag & se_flag, 1, 0) data["se3"] = np.where(wage_flag & se_flag, 1, 0) data["_depne"] = np.where( - np.logical_and(data["mars"] == 3, data["_depne"] == 0), 1, data["_depne"] + np.logical_and(data["mars"] == 3, data["_depne"] == 0), + 1, + data["_depne"], ) return data @@ -91,7 +102,9 @@ def dataprep(data): # create CPS tax units print("Creating CPS tax units") -raw_cps = cps.create(DATA_PATH, exportpkl=True, cps_files=[CPS_YEAR], benefits=False) +raw_cps = cps.create( + DATA_PATH, exportpkl=True, cps_files=[CPS_YEAR], benefits=False +) # minor PUF prep print("Prepping PUF") puf2011 = pd.read_csv(Path(DATA_PATH, "puf2011.csv")) @@ -158,7 +171,9 @@ def dataprep(data): # merge all the data together print("Merging matched data") -data = pd.merge(raw_puf, match_index, how="inner", left_on="recid", right_on="recip") +data = pd.merge( + raw_puf, match_index, how="inner", left_on="recid", right_on="recip" +) data = pd.merge( data, filers, diff --git a/history/report.py b/history/report.py index d51cf3b6..699886c8 100644 --- a/history/report.py +++ b/history/report.py @@ -174,7 +174,9 @@ def report(): img_path = Path(CUR_PATH, f"{var}.png") plot.write_image(str(img_path)) plot_paths.append(img_path) - growth_rate_projections.append(f"![]({str(img_path)})" + "{.center}") + growth_rate_projections.append( + f"![]({str(img_path)})" + "{.center}" + ) template_args["growth_rate_projections"] = growth_rate_projections @@ -204,13 +206,18 @@ def report(): # compare tax calculator projections # baseline CPS calculator - base_cps = tc.Calculator(records=tc.Records.cps_constructor(), policy=tc.Policy()) + base_cps = tc.Calculator( + records=tc.Records.cps_constructor(), policy=tc.Policy() + ) base_cps.advance_to_year(first_year) base_cps.calc_all() # updated CPS calculator - cps = pd.read_csv(Path(CUR_PATH, "..", "data", "cps.csv.gz"), index_col=None) + cps = pd.read_csv( + Path(CUR_PATH, "..", "data", "cps.csv.gz"), index_col=None + ) cps_weights = pd.read_csv( - Path(CUR_PATH, "..", "cps_stage2", "cps_weights.csv.gz"), index_col=None + Path(CUR_PATH, "..", "cps_stage2", "cps_weights.csv.gz"), + index_col=None, ) gfactor_path_str = str(GROW_FACTORS_PATH) gft = tc.GrowFactors(growfactors_filename=gfactor_path_str) @@ -232,13 +239,18 @@ def report(): # 
Validation with CBO tax model # baseline CPS calculator - base_cps = tc.Calculator(records=tc.Records.cps_constructor(), policy=tc.Policy()) + base_cps = tc.Calculator( + records=tc.Records.cps_constructor(), policy=tc.Policy() + ) base_cps.advance_to_year(first_year) base_cps.calc_all() # updated CPS calculator - cps = pd.read_csv(Path(CUR_PATH, "..", "data", "cps.csv.gz"), index_col=None) + cps = pd.read_csv( + Path(CUR_PATH, "..", "data", "cps.csv.gz"), index_col=None + ) cps_weights = pd.read_csv( - Path(CUR_PATH, "..", "cps_stage2", "cps_weights.csv.gz"), index_col=None + Path(CUR_PATH, "..", "cps_stage2", "cps_weights.csv.gz"), + index_col=None, ) gfactor_path_str = str(GROW_FACTORS_PATH) gft = tc.GrowFactors(growfactors_filename=gfactor_path_str) @@ -267,7 +279,8 @@ def report(): base_puf.calc_all() # updated puf calculator puf_weights = pd.read_csv( - Path(CUR_PATH, "..", "puf_stage2", "puf_weights.csv.gz"), index_col=None + Path(CUR_PATH, "..", "puf_stage2", "puf_weights.csv.gz"), + index_col=None, ) puf_ratios = pd.read_csv( Path(CUR_PATH, "..", "puf_stage3", "puf_ratios.csv"), index_col=0 ) @@ -293,7 +306,8 @@ def report(): base_puf.calc_all() # updated puf calculator puf_weights = pd.read_csv( - Path(CUR_PATH, "..", "puf_stage2", "puf_weights.csv.gz"), index_col=None + Path(CUR_PATH, "..", "puf_stage2", "puf_weights.csv.gz"), + index_col=None, ) puf_ratios = pd.read_csv( Path(CUR_PATH, "..", "puf_stage3", "puf_ratios.csv"), index_col=0 ) @@ -317,7 +331,9 @@ def report(): template_args["puf_income_table"] = None template_args["puf_payroll_table"] = None template_args["puf_salaries_and_wages_table"] = None - template_args["puf_taxable_interest_and_ordinary_dividends_table"] = None + template_args["puf_taxable_interest_and_ordinary_dividends_table"] = ( + None + ) template_args["puf_qualified_dividends_table"] = None template_args["puf_capital_table"] = None template_args["puf_business_table"] = None diff --git a/history/report_utils.py b/history/report_utils.py index 6e7f3ad5..35e0d545 100644 --- a/history/report_utils.py +++ b/history/report_utils.py @@ -83,7 +83,9 @@ def add_bins( bin_width = cumsum_range / float(num_bins) bin_edges = list(min_cumsum + np.arange(0, (num_bins + 1)) * bin_width) bin_edges[-1] = 9e99 # raise top of last bin to include all observations - bin_edges[0] = -9e99 # lower bottom of 1st bin to include all observation00s + bin_edges[0] = ( + -9e99 + ) # lower bottom of 1st bin to include all observations if decile_details: assert bin_edges[1] > 1e-9 # bin_edges[1] is top of bottom decile bin_edges.insert(1, 1e-9) # top of zeros @@ -164,42 +166,53 @@ def distribution(item, weight, agi): total = (item * weight).sum() agi_1 = (item[agi < 0] * weight[agi < 0]).sum() pct1 = round(agi_1 / total, 2) - agi_2 = (item[(agi > 1) & (agi < 5000)] * weight[(agi > 1) & (agi < 5000)]).sum() + agi_2 = ( + item[(agi > 1) & (agi < 5000)] * weight[(agi > 1) & (agi < 5000)] + ).sum() - pct2 = round(agi_1 / total, 2) + pct2 = round(agi_2 / total, 2) agi_3 = ( - item[(agi > 5000) & (agi < 10000)] * weight[(agi > 5000) & (agi < 10000)] + item[(agi > 5000) & (agi < 10000)] + * weight[(agi > 5000) & (agi < 10000)] ).sum() pct3 = round(agi_3 / total, 2) agi_4 = ( - item[(agi > 10000) & (agi < 15000)] * weight[(agi > 10000) & (agi < 15000)] + item[(agi > 10000) & (agi < 15000)] + * weight[(agi > 10000) & (agi < 15000)] ).sum() pct4 = round(agi_4 / total, 2) agi_5 = ( - item[(agi > 15000) & (agi < 20000)] * weight[(agi > 15000) & (agi < 20000)] + item[(agi > 15000) & (agi < 20000)] + * weight[(agi > 15000) & (agi <
20000)] ).sum() pct5 = round(agi_5 / total, 2) agi_6 = ( - item[(agi > 20000) & (agi < 25000)] * weight[(agi > 20000) & (agi < 25000)] + item[(agi > 20000) & (agi < 25000)] + * weight[(agi > 20000) & (agi < 25000)] ).sum() pct6 = round(agi_6 / total, 2) agi_7 = ( - item[(agi > 25000) & (agi < 30000)] * weight[(agi > 25000) & (agi < 30000)] + item[(agi > 25000) & (agi < 30000)] + * weight[(agi > 25000) & (agi < 30000)] ).sum() pct7 = round(agi_7 / total, 2) agi_8 = ( - item[(agi > 30000) & (agi < 40000)] * weight[(agi > 30000) & (agi < 40000)] + item[(agi > 30000) & (agi < 40000)] + * weight[(agi > 30000) & (agi < 40000)] ).sum() pct8 = round(agi_8 / total, 2) agi_9 = ( - item[(agi > 40000) & (agi < 50000)] * weight[(agi > 40000) & (agi < 50000)] + item[(agi > 40000) & (agi < 50000)] + * weight[(agi > 40000) & (agi < 50000)] ).sum() pct9 = round(agi_9 / total, 2) agi_10 = ( - item[(agi > 50000) & (agi < 75000)] * weight[(agi > 50000) & (agi < 75000)] + item[(agi > 50000) & (agi < 75000)] + * weight[(agi > 50000) & (agi < 75000)] ).sum() pct10 = round(agi_10 / total, 2) agi_11 = ( - item[(agi > 75000) & (agi < 100_000)] * weight[(agi > 75000) & (agi < 100_000)] + item[(agi > 75000) & (agi < 100_000)] + * weight[(agi > 75000) & (agi < 100_000)] ).sum() pct11 = round(agi_11 / total, 2) agi_12 = ( @@ -434,8 +447,12 @@ def form_output(meta, var_list): msg = "'file' must be either 'cps' or 'puf'" raise ValueError(msg) _file = f"taxdata_{file_}" - cur_avail = set(cur_meta[cur_meta["availability"].str.contains(_file)].index) - new_avail = set(new_meta[new_meta["availability"].str.contains(_file)].index) + cur_avail = set( + cur_meta[cur_meta["availability"].str.contains(_file)].index + ) + new_avail = set( + new_meta[new_meta["availability"].str.contains(_file)].index + ) _added_vars = new_avail - cur_avail _removed_vars = cur_avail - new_avail # get detailed information @@ -602,9 +619,9 @@ def CBO_projections(rev_proj): "https://www.cbo.gov/about/products/budget-economic-data" """ # extract values for AGI rows in the excel file - salary_wage = rev_proj.loc["Calculation of adjusted gross income (AGI)"].loc[ - "Salaries and wages" - ] + salary_wage = rev_proj.loc[ + "Calculation of adjusted gross income (AGI)" + ].loc["Salaries and wages"] for indx in salary_wage.index: if type(indx) != int: @@ -612,14 +629,18 @@ def CBO_projections(rev_proj): taxable_interest_ordinary_divid = rev_proj.loc[ "Calculation of adjusted gross income (AGI)" - ].loc["Taxable interest and ordinary dividends (excludes qualified dividends)"] + ].loc[ + "Taxable interest and ordinary dividends (excludes qualified dividends)" + ] q_div = rev_proj.loc["Calculation of adjusted gross income (AGI)"].loc[ "Qualified dividends " ] - capital_g_l = rev_proj.loc["Calculation of adjusted gross income (AGI)"].loc[ - "Capital gain or lossa" - ] - business_inc = rev_proj.loc["Calculation of adjusted gross income (AGI)"].loc[ + capital_g_l = rev_proj.loc[ + "Calculation of adjusted gross income (AGI)" + ].loc["Capital gain or lossa"] + business_inc = rev_proj.loc[ + "Calculation of adjusted gross income (AGI)" + ].loc[ "Net business income (all income and loss reported on Schedules C, E, and F)b" ] pension_annuities_IRAdis = rev_proj.loc[ @@ -658,7 +679,9 @@ def CBO_projections(rev_proj): "Total exemptions and deductions after limitsf" ] taxable_inc = ( - rev_proj.loc["Calculation of taxable income"].loc["Taxable incomeg"].iloc[0] + rev_proj.loc["Calculation of taxable income"] + .loc["Taxable incomeg"] + .iloc[0] ) tot_inctax = 
rev_proj.loc["Calculation of income tax liability"].loc[ "Total income tax (including AMT) before credits" @@ -868,9 +891,9 @@ def compare_calcs(base, new, name, template_args, plot_paths): cur_taxable_inc = run_calc_var(base, year, "c04800") cur_tot_inctax = run_calc_var(base, year, "c05800") cur_tot_cdt = run_calc_var(base, year, "c07100") - cur_inctax_af_credit = run_calc_var(base, year, "c05800") - run_calc_var( - base, year, "c07100" - ) + cur_inctax_af_credit = run_calc_var( + base, year, "c05800" + ) - run_calc_var(base, year, "c07100") new_salary_wage = run_calc_var(new, year, "e00200") new_taxable_interest_ordinary_divid = ( run_calc_var(new, year, "e00300") @@ -888,9 +911,9 @@ def compare_calcs(base, new, name, template_args, plot_paths): + run_calc_var(new, year, "e02000") + run_calc_var(new, year, "e02100") ) - new_pension_annuities_IRAdis = run_calc_var(new, year, "e01400") + run_calc_var( - new, year, "e01700" - ) + new_pension_annuities_IRAdis = run_calc_var( + new, year, "e01400" + ) + run_calc_var(new, year, "e01700") new_ssb = run_calc_var(new, year, "c02500") new_total_inc = run_calc_var(new, year, "c00100") + run_calc_var( new, year, "c02900" @@ -918,9 +941,9 @@ def compare_calcs(base, new, name, template_args, plot_paths): new_taxable_inc = run_calc_var(new, year, "c04800") new_tot_inctax = run_calc_var(new, year, "c05800") new_tot_cdt = run_calc_var(new, year, "c07100") - new_inctax_af_credit = run_calc_var(new, year, "c05800") - run_calc_var( - new, year, "c07100" - ) + new_inctax_af_credit = run_calc_var( + new, year, "c05800" + ) - run_calc_var(new, year, "c07100") aggs["Tax Liability"].append(base_aggs["payrolltax"]) aggs["Tax"].append("Current Payroll") aggs["Year"].append(year) @@ -1084,9 +1107,15 @@ def compare_calcs(base, new, name, template_args, plot_paths): template_args[f"{name}_agg_plot"] = f"![]({str(img_path)})" + "{.center}" # create tax liability and projection tables - template_args[f"{name}_combined_table"] = agg_liability_table(agg_df, "Combined") - template_args[f"{name}_payroll_table"] = agg_liability_table(agg_df, "Payroll") - template_args[f"{name}_income_table"] = agg_liability_table(agg_df, "Income") + template_args[f"{name}_combined_table"] = agg_liability_table( + agg_df, "Combined" + ) + template_args[f"{name}_payroll_table"] = agg_liability_table( + agg_df, "Payroll" + ) + template_args[f"{name}_income_table"] = agg_liability_table( + agg_df, "Income" + ) agi_table_name_list = [ f"{name}_salaries_and_wages_table", @@ -1189,9 +1218,9 @@ def CBO_validation(cbo_df, new, name, template_args): + run_calc_var(new, year, "e02000") + run_calc_var(new, year, "e02100") ) - new_pension_annuities_IRAdis = run_calc_var(new, year, "e01400") + run_calc_var( - new, year, "e01700" - ) + new_pension_annuities_IRAdis = run_calc_var( + new, year, "e01400" + ) + run_calc_var(new, year, "e01700") new_ssb = run_calc_var(new, year, "c02500") new_total_inc = run_calc_var(new, year, "c00100") + run_calc_var( new, year, "c02900" @@ -1221,9 +1250,9 @@ def CBO_validation(cbo_df, new, name, template_args): new_tot_cdt = run_calc_var(new, year, "c07100") + run_calc_var( new, year, "refund" ) - new_inctax_af_credit = run_calc_var(new, year, "c05800") - run_calc_var( - new, year, "c07100" - ) + new_inctax_af_credit = run_calc_var( + new, year, "c05800" + ) - run_calc_var(new, year, "c07100") agi_list = [ new_salary_wage, @@ -1355,6 +1384,8 @@ def CBO_validation(cbo_df, new, name, template_args): f"{name}_validation_Top50_percent_income_group_shares_of_AGI_table", ] 
share_keyword_list = ["Top1p", "Top5p", "Top10p", "Top25p", "Top50p"] - for table_name, keyword in zip(shareval_table_name_list, share_keyword_list): + for table_name, keyword in zip( + shareval_table_name_list, share_keyword_list + ): template_args[table_name] = validation_table(agg3_df, cbo_df, keyword) return template_args diff --git a/puf_stage1/cbo_chained_cpiu.py b/puf_stage1/cbo_chained_cpiu.py index 15de68f0..b349409c 100644 --- a/puf_stage1/cbo_chained_cpiu.py +++ b/puf_stage1/cbo_chained_cpiu.py @@ -149,7 +149,9 @@ def read_check_spreadsheet(fname): sys.stderr.write(f"STRUCT: {msg}\n") ok_years = False lby = taxcalc.Policy.LAST_BUDGET_YEAR - valid_lby = range(CBO_YEAR["first"]["year"] + 2, CBO_YEAR["last"]["year"] + 1) + valid_lby = range( + CBO_YEAR["first"]["year"] + 2, CBO_YEAR["last"]["year"] + 1 + ) if lby not in valid_lby: msg = f"Policy.LAST_BUDGET_YEAR={lby} inconsistent with CBO_YEAR info" sys.stderr.write(f"STRUCT: {msg}\n") diff --git a/puf_stage1/stage1.py b/puf_stage1/stage1.py index cfdffb20..9774375c 100644 --- a/puf_stage1/stage1.py +++ b/puf_stage1/stage1.py @@ -23,7 +23,9 @@ # # projection for 2014+ -pop_projection = pd.read_csv(os.path.join(CUR_PATH, "NP2014_D1.csv"), index_col="year") +pop_projection = pd.read_csv( + os.path.join(CUR_PATH, "NP2014_D1.csv"), index_col="year" +) pop_projection = pop_projection[ (pop_projection.sex == 0) & (pop_projection.race == 0) @@ -39,12 +41,16 @@ # data for 2010-2014 historical1 = pd.read_csv(os.path.join(CUR_PATH, "NC-EST2014-AGESEX-RES.csv")) historical1 = historical1[historical1.SEX == 0] -historical1 = historical1.drop(["SEX", "CENSUS2010POP", "ESTIMATESBASE2010"], axis=1) +historical1 = historical1.drop( + ["SEX", "CENSUS2010POP", "ESTIMATESBASE2010"], axis=1 +) pop_dep1 = historical1[historical1.AGE <= DEP].sum() pop_dep1 = pop_dep1.drop(["AGE"], axis=0) -pop_snr1 = historical1[(historical1.AGE >= SENIOR) & (historical1.AGE < TOTES)].sum() +pop_snr1 = historical1[ + (historical1.AGE >= SENIOR) & (historical1.AGE < TOTES) +].sum() pop_snr1 = pop_snr1.drop(["AGE"], axis=0) total_pop1 = historical1[historical1.AGE == TOTES] @@ -53,7 +59,9 @@ # data for 2008-2009 historical2 = pd.read_csv(os.path.join(CUR_PATH, "US-EST00INT-ALLDATA.csv")) historical2 = historical2[ - (historical2.MONTH == 7) & (historical2.YEAR >= 2008) & (historical2.YEAR < 2010) + (historical2.MONTH == 7) + & (historical2.YEAR >= 2008) + & (historical2.YEAR < 2010) ] historical2 = historical2.drop(historical2.columns[4:], axis=1) historical2 = historical2.drop(historical2.columns[0], axis=1) @@ -65,10 +73,14 @@ pop_dep2.append(historical2.TOT_POP[year09under19].sum()) year08over65 = ( - (historical2.YEAR == 2008) & (historical2.AGE >= SENIOR) & (historical2.AGE < TOTES) + (historical2.YEAR == 2008) + & (historical2.AGE >= SENIOR) + & (historical2.AGE < TOTES) ) year09over65 = ( - (historical2.YEAR == 2009) & (historical2.AGE >= SENIOR) & (historical2.AGE < TOTES) + (historical2.YEAR == 2009) + & (historical2.AGE >= SENIOR) + & (historical2.AGE < TOTES) ) pop_snr2 = [] pop_snr2.append(historical2.TOT_POP[year08over65].sum()) @@ -120,16 +132,30 @@ pop_growth_rates = pop_growth_rates.drop(pop_growth_rates.index[0], axis=0) # import CBO baseline projection -cbo_baseline = pd.read_csv(os.path.join(CUR_PATH, "CBO_baseline.csv"), index_col=0) +cbo_baseline = pd.read_csv( + os.path.join(CUR_PATH, "CBO_baseline.csv"), index_col=0 +) cbobase = cbo_baseline.transpose() cbobase.index = index cbobase = cbobase.astype(float) -Stage_I_factors["AGDPN"] = 
pd.DataFrame(cbobase.GDP / cbobase.GDP[SYR], index=index) -Stage_I_factors["ATXPY"] = pd.DataFrame(cbobase.TPY / cbobase.TPY[SYR], index=index) -Stage_I_factors["ASCHF"] = pd.DataFrame(cbobase.SCHF / cbobase.SCHF[SYR], index=index) -Stage_I_factors["ABOOK"] = pd.DataFrame(cbobase.BOOK / cbobase.BOOK[SYR], index=index) -Stage_I_factors["ACPIU"] = pd.DataFrame(cbobase.CPIU / cbobase.CPIU[SYR], index=index) -Stage_I_factors["ACPIM"] = pd.DataFrame(cbobase.CPIM / cbobase.CPIM[SYR], index=index) +Stage_I_factors["AGDPN"] = pd.DataFrame( + cbobase.GDP / cbobase.GDP[SYR], index=index +) +Stage_I_factors["ATXPY"] = pd.DataFrame( + cbobase.TPY / cbobase.TPY[SYR], index=index +) +Stage_I_factors["ASCHF"] = pd.DataFrame( + cbobase.SCHF / cbobase.SCHF[SYR], index=index +) +Stage_I_factors["ABOOK"] = pd.DataFrame( + cbobase.BOOK / cbobase.BOOK[SYR], index=index +) +Stage_I_factors["ACPIU"] = pd.DataFrame( + cbobase.CPIU / cbobase.CPIU[SYR], index=index +) +Stage_I_factors["ACPIM"] = pd.DataFrame( + cbobase.CPIM / cbobase.CPIM[SYR], index=index +) cbo_growth_rates = cbobase.pct_change() + 1.0 cbo_growth_rates = cbo_growth_rates.drop(cbo_growth_rates.index[0], axis=0) @@ -149,7 +175,9 @@ return_growth_rate.Returns.index = index # read SOI estimates for 2008+ -soi_estimates = pd.read_csv(os.path.join(CUR_PATH, "SOI_estimates.csv"), index_col=0) +soi_estimates = pd.read_csv( + os.path.join(CUR_PATH, "SOI_estimates.csv"), index_col=0 +) soi_estimates = soi_estimates.transpose() historical_index = list(range(2008, SOI_YR + 1)) soi_estimates.index = historical_index @@ -161,7 +189,9 @@ Joint = return_projection.Joint[i] * return_growth_rate.Returns[i + 1] HH = return_projection.HH[i] * return_growth_rate.Returns[i + 1] SS_return = return_projection.SS_return[i] * pop_growth_rates.POPSNR[i + 1] - Dep_return = return_projection.Dep_return[i] * pop_growth_rates.POPDEP[i + 1] + Dep_return = ( + return_projection.Dep_return[i] * pop_growth_rates.POPDEP[i + 1] + ) INTS = return_projection.INTS[i] * cbo_growth_rates.INTS[i + 1] DIVS = return_projection.DIVS[i] * cbo_growth_rates.DIVS[i + 1] SCHCI = return_projection.SCHCI[i] * cbo_growth_rates.SCHC[i + 1] @@ -238,23 +268,34 @@ Stage_I_factors["AWAGE"] = total_wage / total_wage.AWAGE[SYR] -Stage_I_factors["ASCHCI"] = Stage_II_targets.SCHCI / Stage_II_targets.SCHCI[SYR] -Stage_I_factors["ASCHCL"] = Stage_II_targets.SCHCL / Stage_II_targets.SCHCL[SYR] +Stage_I_factors["ASCHCI"] = ( + Stage_II_targets.SCHCI / Stage_II_targets.SCHCI[SYR] +) +Stage_I_factors["ASCHCL"] = ( + Stage_II_targets.SCHCL / Stage_II_targets.SCHCL[SYR] +) -Stage_I_factors["ASCHEI"] = Stage_II_targets.SCHEI / Stage_II_targets.SCHEI[SYR] -Stage_I_factors["ASCHEL"] = Stage_II_targets.SCHEL / Stage_II_targets.SCHEL[SYR] +Stage_I_factors["ASCHEI"] = ( + Stage_II_targets.SCHEI / Stage_II_targets.SCHEI[SYR] +) +Stage_I_factors["ASCHEL"] = ( + Stage_II_targets.SCHEL / Stage_II_targets.SCHEL[SYR] +) Stage_I_factors["AINTS"] = Stage_II_targets.INTS / Stage_II_targets.INTS[SYR] Stage_I_factors["ADIVS"] = Stage_II_targets.DIVS / Stage_II_targets.DIVS[SYR] Stage_I_factors["ACGNS"] = Stage_II_targets.CGNS / Stage_II_targets.CGNS[SYR] Stage_I_factors["ASOCSEC"] = Stage_II_targets.SS / Stage_II_targets.SS[SYR] -Stage_I_factors["AUCOMP"] = Stage_II_targets.UCOMP / Stage_II_targets.UCOMP[SYR] +Stage_I_factors["AUCOMP"] = ( + Stage_II_targets.UCOMP / Stage_II_targets.UCOMP[SYR] +) Stage_I_factors["AIPD"] = Stage_II_targets.IPD / Stage_II_targets.IPD[SYR] # Add benefit growth rates to Stage 1 factors benefit_programs 
= pd.read_csv( - os.path.join(CUR_PATH, "../taxdata/cps/benefitprograms.csv"), index_col="Program" + os.path.join(CUR_PATH, "../taxdata/cps/benefitprograms.csv"), + index_col="Program", ) benefit_sums = benefit_programs[benefit_programs.columns[2:]].apply(sum) # Find growth rate between 2020 and 2021 and extrapolate out to EYR @@ -263,7 +304,9 @@ prev_year = year - 1 prev_value = benefit_sums["{}_cost".format(prev_year)] benefit_sums["{}_cost".format(year)] = prev_value * gr -ABENEFITS = (benefit_sums / benefit_sums["{}_cost".format(BEN_SYR)]).transpose() +ABENEFITS = ( + benefit_sums / benefit_sums["{}_cost".format(BEN_SYR)] +).transpose() benefit_factors = pd.DataFrame() for year in range(SYR, EYR + 1): if year <= BEN_SYR: diff --git a/puf_stage1/updatecbo.py b/puf_stage1/updatecbo.py index c36cc054..5e49ed0c 100644 --- a/puf_stage1/updatecbo.py +++ b/puf_stage1/updatecbo.py @@ -123,7 +123,10 @@ def update_econproj(url, baseline, text_args): else: # read in economic projections econ_proj = pd.read_excel( - econ_url, sheet_name="2. Calendar Year", skiprows=6, index_col=[0, 1, 2, 3] + econ_url, + sheet_name="2. Calendar Year", + skiprows=6, + index_col=[0, 1, 2, 3], ) # extract values for needed rows in the excel file # some variables have a missing value in the multi-index. Use iloc @@ -250,7 +253,10 @@ def update_socsec(url, baseline, text_args): html = pd.read_html(socsec_url, match=match_txt)[0] # merge the columns with years and data into one sub_data = pd.concat( - [html["Fiscal year", "Fiscal year.1"], html["Cost", "Scheduled benefits"]], + [ + html["Fiscal year", "Fiscal year.1"], + html["Cost", "Scheduled benefits"], + ], axis=1, ) sub_data.columns = ["year", "cost"] @@ -317,7 +323,9 @@ def update_rets(url, baseline, text_args): if re.match(pattern, link): spreadsheet_url = link break - data = pd.read_excel(spreadsheet_url, sheet_name="1B-BOD", index_col=0, header=2) + data = pd.read_excel( + spreadsheet_url, sheet_name="1B-BOD", index_col=0, header=2 + ) projections = data.loc["Forms 1040 and 1040-SR, Total"] projections /= 1_000_000 # convert units pct_change = projections.pct_change() + 1 @@ -412,7 +420,9 @@ def fill_text_args(text): Provide initial values for all text arguments """ text_args = {} - previous_cbo_doc = re.search(r"Previous Document: ([\w \d]+)", text).groups()[0] + previous_cbo_doc = re.search( + r"Previous Document: ([\w \d]+)", text + ).groups()[0] cur_cbo_doc = re.search(r"Current Document: ([\w \d]+)", text).groups()[0] text_args["previous_cbo"] = previous_cbo_doc text_args["current_cbo"] = cur_cbo_doc @@ -450,16 +460,16 @@ def update_cbo(): "methods", "CBO_Baseline_Updating_Instructions.md", ) - template_str = Path(CUR_PATH, "doc", "cbo_instructions_template.md").open().read() + template_str = ( + Path(CUR_PATH, "doc", "cbo_instructions_template.md").open().read() + ) current_text = out_path.open().read() text_args = fill_text_args(current_text) baseline = pd.read_csv(Path(CUR_PATH, "CBO_baseline.csv"), index_col=0) CBO_URL = "https://www.cbo.gov/about/products/budget-economic-data" SOCSEC_URL = "https://www.ssa.gov/oact/TR/" RETS_URL = "https://www.irs.gov/statistics/soi-tax-stats-calendar-year-projections-publication-6187" - UCOMP_URL = ( - "https://www.cbo.gov/about/products/baseline-projections-selected-programs" - ) + UCOMP_URL = "https://www.cbo.gov/about/products/baseline-projections-selected-programs" baseline, text_args = update_econproj(CBO_URL, baseline, text_args) baseline, text_args = update_cpim(baseline, text_args) diff --git 
a/puf_stage1/updatesoi.py b/puf_stage1/updatesoi.py index f42eb79a..fd8786a0 100644 --- a/puf_stage1/updatesoi.py +++ b/puf_stage1/updatesoi.py @@ -21,7 +21,13 @@ "Unnamed: 69_level_2", "Number of\nreturns", ), - ("INTS", "Taxable interest", "Unnamed: 8_level_1", "Unnamed: 8_level_2", "Amount"), + ( + "INTS", + "Taxable interest", + "Unnamed: 8_level_1", + "Unnamed: 8_level_2", + "Amount", + ), ( "DIVS", "Ordinary dividends", @@ -29,8 +35,20 @@ "Unnamed: 12_level_2", "Amount", ), - ("SCHCI", "Business or profession", "Net\nincome", "Unnamed: 20_level_2", "Amount"), - ("SCHCL", "Business or profession", "Net\nloss", "Unnamed: 22_level_2", "Amount"), + ( + "SCHCI", + "Business or profession", + "Net\nincome", + "Unnamed: 20_level_2", + "Amount", + ), + ( + "SCHCL", + "Business or profession", + "Net\nloss", + "Unnamed: 22_level_2", + "Amount", + ), ( "CGNS", "Sales of capital assets reported on Form 1040, Schedule D [2]", @@ -38,7 +56,13 @@ "Unnamed: 26_level_2", "Amount", ), - ("Pension", "Pensions and\nannuities", "Unnamed: 38_level_1", "Taxable", "Amount"), + ( + "Pension", + "Pensions and\nannuities", + "Unnamed: 38_level_1", + "Taxable", + "Amount", + ), ( "SCHEI", "Total rental and royalty", @@ -53,8 +77,20 @@ "Unnamed: 56_level_2", "Amount", ), - ("SCHEI", "Estate and trust", "Net\nincome", "Unnamed: 60_level_2", "Amount"), - ("SCHEL", "Total rental and royalty", "Net\nloss", "Unnamed: 54_level_2", "Amount"), + ( + "SCHEI", + "Estate and trust", + "Net\nincome", + "Unnamed: 60_level_2", + "Amount", + ), + ( + "SCHEL", + "Total rental and royalty", + "Net\nloss", + "Unnamed: 54_level_2", + "Amount", + ), ( "SCHEL", "Partnership and S corporation", @@ -62,8 +98,20 @@ "Unnamed: 58_level_2", "Amount", ), - ("SCHEL", "Estate and trust", "Net\nloss", "Unnamed: 62_level_2", "Amount"), - ("SS", "Social security benefits", "Total [1]", "Unnamed: 70_level_2", "Amount"), + ( + "SCHEL", + "Estate and trust", + "Net\nloss", + "Unnamed: 62_level_2", + "Amount", + ), + ( + "SS", + "Social security benefits", + "Total [1]", + "Unnamed: 70_level_2", + "Amount", + ), ( "UCOMP", "Unemployment compensation", @@ -88,7 +136,16 @@ (15, 15), (17, 20), ] -CPSWAGES = [(2, 4), (5, 6), (7, 8), (9, 9), (10, 10), (11, 11), (12, 12), (13, 20)] +CPSWAGES = [ + (2, 4), + (5, 6), + (7, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 20), +] def update_soi(year, datapath, wage_indicies, file_): @@ -174,7 +231,9 @@ def table21(year, datapath): datapath: path to the directory holding the SOI files """ file_ = f"{str(year)[-2:]}in21id.xls" - data = pd.read_excel(Path(datapath, file_), header=[2, 3, 4, 5, 6], index_col=0) + data = pd.read_excel( + Path(datapath, file_), header=[2, 3, 4, 5, 6], index_col=0 + ) itemded = "Itemized deductions" ipd = "Interest paid deduction" un = "Unnamed: 83_level_3" @@ -209,7 +268,9 @@ def table14_wages(data, indicies): """ was = [] assert len(data) == 21 # they sometimes change up the wage bins they use - data = data["Salaries and wages"]["Unnamed: 6_level_1"]["Unnamed: 6_level_2"] + data = data["Salaries and wages"]["Unnamed: 6_level_1"][ + "Unnamed: 6_level_2" + ] for i, j in indicies: val = data.iloc[i : j + 1].sum()[0].astype(int) was.append(val) @@ -227,7 +288,9 @@ def table14(year, wage_indicies, datapath): datapath: path to directory where the SOI data is stored """ data = pd.read_excel( - Path(datapath, f"{str(year)[-2:]}in14ar.xls"), header=[2, 3, 4, 5], index_col=0 + Path(datapath, f"{str(year)[-2:]}in14ar.xls"), + header=[2, 3, 4, 5], + index_col=0, ) data = 
data.iloc[:21] nonwages = table14_nonwages(data, TABLE14COLS) @@ -239,7 +302,9 @@ def table14(year, wage_indicies, datapath): parser = argparse.ArgumentParser() parser.add_argument("year", help="Year of the update", type=str) parser.add_argument( - "path", help="Path to a directory with all of the SOI files needed", type=str + "path", + help="Path to a directory with all of the SOI files needed", + type=str, ) args = parser.parse_args() year = args.year diff --git a/puf_stage2/dataprep.py b/puf_stage2/dataprep.py index a37f711e..589c6139 100644 --- a/puf_stage2/dataprep.py +++ b/puf_stage2/dataprep.py @@ -17,13 +17,16 @@ def dataprep(puf, Stage_I_factors, Stage_II_targets, year): hh_return = np.where((puf.mars == 4) & (puf.filer == 1), s006, 0) return_w_SS = np.where((puf.e02400 > 0) & (puf.filer == 1), s006, 0) - dependent_exempt_num = (puf.xocah + puf.xocawh + puf.xoodep + puf.xopar) * s006 + dependent_exempt_num = ( + puf.xocah + puf.xocawh + puf.xoodep + puf.xopar + ) * s006 interest = puf.e00300 * s006 dividend = puf.e00600 * s006 biz_income = np.where(puf.e00900 > 0, puf.e00900, 0) * s006 biz_loss = np.where(puf.e00900 < 0, -puf.e00900, 0) * s006 cap_gain = ( - np.where((puf.p23250 + puf.p22250) > 0, puf.p23250 + puf.p22250, 0) * s006 + np.where((puf.p23250 + puf.p22250) > 0, puf.p23250 + puf.p22250, 0) + * s006 ) annuity_pension = puf.e01700 * s006 sch_e_income = np.where(puf.e02000 > 0, puf.e02000, 0) * s006 @@ -33,33 +36,47 @@ def dataprep(puf, Stage_I_factors, Stage_II_targets, year): # Wage distribution wage_1 = np.where(puf.e00100 <= 0, puf.e00200, 0) * s006 - wage_2 = np.where((puf.e00100 > 0) & (puf.e00100 <= 10000), puf.e00200, 0) * s006 + wage_2 = ( + np.where((puf.e00100 > 0) & (puf.e00100 <= 10000), puf.e00200, 0) + * s006 + ) wage_3 = ( - np.where((puf.e00100 > 10000) & (puf.e00100 <= 20000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 10000) & (puf.e00100 <= 20000), puf.e00200, 0) + * s006 ) wage_4 = ( - np.where((puf.e00100 > 20000) & (puf.e00100 <= 30000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 20000) & (puf.e00100 <= 30000), puf.e00200, 0) + * s006 ) wage_5 = ( - np.where((puf.e00100 > 30000) & (puf.e00100 <= 40000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 30000) & (puf.e00100 <= 40000), puf.e00200, 0) + * s006 ) wage_6 = ( - np.where((puf.e00100 > 40000) & (puf.e00100 <= 50000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 40000) & (puf.e00100 <= 50000), puf.e00200, 0) + * s006 ) wage_7 = ( - np.where((puf.e00100 > 50000) & (puf.e00100 <= 75000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 50000) & (puf.e00100 <= 75000), puf.e00200, 0) + * s006 ) wage_8 = ( - np.where((puf.e00100 > 75000) & (puf.e00100 <= 100000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 75000) & (puf.e00100 <= 100000), puf.e00200, 0) + * s006 ) wage_9 = ( - np.where((puf.e00100 > 100000) & (puf.e00100 <= 200000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 100000) & (puf.e00100 <= 200000), puf.e00200, 0) + * s006 ) wage_10 = ( - np.where((puf.e00100 > 200000) & (puf.e00100 <= 500000), puf.e00200, 0) * s006 + np.where((puf.e00100 > 200000) & (puf.e00100 <= 500000), puf.e00200, 0) + * s006 ) wage_11 = ( - np.where((puf.e00100 > 500000) & (puf.e00100 <= 1000000), puf.e00200, 0) * s006 + np.where( + (puf.e00100 > 500000) & (puf.e00100 <= 1000000), puf.e00200, 0 + ) + * s006 ) wage_12 = np.where(puf.e00100 > 1000000, puf.e00200, 0) * s006 @@ -118,7 +135,10 @@ def dataprep(puf, Stage_I_factors, Stage_II_targets, year): AINTS = Stage_I_factors[year]["AINTS"] INTEREST = ( - 
Stage_II_targets[ystr]["Taxable Interest Income"] * APOPN / AINTS * 1000 + Stage_II_targets[ystr]["Taxable Interest Income"] + * APOPN + / AINTS + * 1000 - interest.sum() ) @@ -130,19 +150,28 @@ def dataprep(puf, Stage_I_factors, Stage_II_targets, year): ASCHCI = Stage_I_factors[year]["ASCHCI"] BIZ_INCOME = ( - Stage_II_targets[ystr]["Business Income (Schedule C)"] * APOPN / ASCHCI * 1000 + Stage_II_targets[ystr]["Business Income (Schedule C)"] + * APOPN + / ASCHCI + * 1000 - biz_income.sum() ) ASCHCL = Stage_I_factors[year]["ASCHCL"] BIZ_LOSS = ( - Stage_II_targets[ystr]["Business Loss (Schedule C)"] * APOPN / ASCHCL * 1000 + Stage_II_targets[ystr]["Business Loss (Schedule C)"] + * APOPN + / ASCHCL + * 1000 - biz_loss.sum() ) ACGNS = Stage_I_factors[year]["ACGNS"] CAP_GAIN = ( - Stage_II_targets[ystr]["Net Capital Gains in AGI"] * APOPN / ACGNS * 1000 + Stage_II_targets[ystr]["Net Capital Gains in AGI"] + * APOPN + / ACGNS + * 1000 - cap_gain.sum() ) @@ -156,12 +185,16 @@ def dataprep(puf, Stage_I_factors, Stage_II_targets, year): ASCHEI = Stage_I_factors[year]["ASCHEI"] target_name = "Supplemental Income (Schedule E)" SCH_E_INCOME = ( - Stage_II_targets[ystr][target_name] * APOPN / ASCHEI * 1000 - sch_e_income.sum() + Stage_II_targets[ystr][target_name] * APOPN / ASCHEI * 1000 + - sch_e_income.sum() ) ASCHEL = Stage_I_factors[year]["ASCHEL"] SCH_E_LOSS = ( - Stage_II_targets[ystr]["Supplemental Loss (Schedule E)"] * APOPN / ASCHEL * 1000 + Stage_II_targets[ystr]["Supplemental Loss (Schedule E)"] + * APOPN + / ASCHEL + * 1000 - sch_e_loss.sum() ) @@ -177,35 +210,74 @@ def dataprep(puf, Stage_I_factors, Stage_II_targets, year): AUCOMP = Stage_I_factors[year]["AUCOMP"] UNEMPLOYMENT_COMP = ( - Stage_II_targets[ystr]["Unemployment Compensation"] * APOPN / AUCOMP * 1000 + Stage_II_targets[ystr]["Unemployment Compensation"] + * APOPN + / AUCOMP + * 1000 - unemployment_comp.sum() ) AWAGE = Stage_I_factors[year]["AWAGE"] target_name = "Wages and Salaries: Zero or Less" - WAGE_1 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_1.sum() + WAGE_1 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_1.sum() + ) target_name = "Wages and Salaries: $1 Less Than $10,000" - WAGE_2 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_2.sum() + WAGE_2 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_2.sum() + ) target_name = "Wages and Salaries: $10,000 Less Than $20,000" - WAGE_3 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_3.sum() + WAGE_3 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_3.sum() + ) target_name = "Wages and Salaries: $20,000 Less Than $30,000" - WAGE_4 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_4.sum() + WAGE_4 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_4.sum() + ) target_name = "Wages and Salaries: $30,000 Less Than $40,000" - WAGE_5 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_5.sum() + WAGE_5 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_5.sum() + ) target_name = "Wages and Salaries: $40,000 Less Than $50,000" - WAGE_6 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_6.sum() + WAGE_6 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_6.sum() + ) target_name = "Wages and Salaries: $50,000 Less Than $75,000" - WAGE_7 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_7.sum() + WAGE_7 = ( + 
Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_7.sum() + ) target_name = "Wages and Salaries: $75,000 Less Than $100,000" - WAGE_8 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_8.sum() + WAGE_8 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_8.sum() + ) target_name = "Wages and Salaries: $100,000 Less Than $200,000" - WAGE_9 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_9.sum() + WAGE_9 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_9.sum() + ) target_name = "Wages and Salaries: $200,000 Less Than $500,000" - WAGE_10 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_10.sum() + WAGE_10 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_10.sum() + ) target_name = "Wages and Salaries: $500,000 Less Than $1 Million" - WAGE_11 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_11.sum() + WAGE_11 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_11.sum() + ) target_name = "Wages and Salaries: $1 Million and Over" - WAGE_12 = Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 - wage_12.sum() + WAGE_12 = ( + Stage_II_targets[ystr][target_name] * APOPN / AWAGE * 1000 + - wage_12.sum() + ) temp = [ INTEREST, diff --git a/puf_stage2/stage2.py b/puf_stage2/stage2.py index 985ebac1..f1e78087 100644 --- a/puf_stage2/stage2.py +++ b/puf_stage2/stage2.py @@ -99,7 +99,11 @@ # Write all weights (rounded to nearest integer) to puf_weights.csv file z = z.round(0).astype("int64") -z.to_csv(os.path.join(CUR_PATH, "puf_weights.csv.gz"), index=False, compression="gzip") +z.to_csv( + os.path.join(CUR_PATH, "puf_weights.csv.gz"), + index=False, + compression="gzip", +) # remove all .npz (numpy array) files for file in glob.glob("*.npz"): diff --git a/puf_stage3/stage3.py b/puf_stage3/stage3.py index 1faba02e..a631c7a5 100644 --- a/puf_stage3/stage3.py +++ b/puf_stage3/stage3.py @@ -66,8 +66,12 @@ def adjustment(agi, var, var_name, target, weights, blowup): bin_8 = np.where((agi >= 40000) & (agi < 50000), var * s006, 0).sum() bin_9 = np.where((agi >= 50000) & (agi < 75000), var * s006, 0).sum() bin_10 = np.where((agi >= 75000) & (agi < 100000), var * s006, 0).sum() - bin_11 = np.where((agi >= 100000) & (agi < 200000), var * s006, 0).sum() - bin_12 = np.where((agi >= 200000) & (agi < 500000), var * s006, 0).sum() + bin_11 = np.where( + (agi >= 100000) & (agi < 200000), var * s006, 0 + ).sum() + bin_12 = np.where( + (agi >= 200000) & (agi < 500000), var * s006, 0 + ).sum() bin_13 = np.where((agi >= 500000) & (agi < 1e6), var * s006, 0).sum() bin_14 = np.where((agi >= 1e6) & (agi < 1.5e6), var * s006, 0).sum() bin_15 = np.where((agi >= 1.5e6) & (agi < 2e6), var * s006, 0).sum() @@ -133,9 +137,13 @@ def adjustment(agi, var, var_name, target, weights, blowup): # Read all necessary files puf = pd.read_csv(os.path.join(CUR_PATH, "../data/cps-matched-puf.csv")) -targets = pd.read_csv(os.path.join(CUR_PATH, "stage3_targets.csv"), index_col=0) +targets = pd.read_csv( + os.path.join(CUR_PATH, "stage3_targets.csv"), index_col=0 +) wght = pd.read_csv(os.path.join(CUR_PATH, "../puf_stage2/puf_weights.csv.gz")) -bf = pd.read_csv(os.path.join(CUR_PATH, "../puf_stage1/growfactors.csv"), index_col=0) +bf = pd.read_csv( + os.path.join(CUR_PATH, "../puf_stage1/growfactors.csv"), index_col=0 +) # Call adjustment function for each variable being adjusted ints = adjustment(puf.e00100, puf.e00300, "INT", targets, wght, bf.AINTS) diff 
--git a/taxdata/cps/benefits.py b/taxdata/cps/benefits.py index 16ba95d7..8f53b42c 100644 --- a/taxdata/cps/benefits.py +++ b/taxdata/cps/benefits.py @@ -49,16 +49,18 @@ def read_ben(path_prefix, usecols): wic_infants = read_ben( WIC_STR.format("infants"), ["peridnum", "WIC_impute"] ).rename(columns={"WIC_impute": "wic_infants"}) - wic_women = read_ben(WIC_STR.format("women"), ["peridnum", "WIC_impute"]).rename( - columns={"WIC_impute": "wic_women"} - ) + wic_women = read_ben( + WIC_STR.format("women"), ["peridnum", "WIC_impute"] + ).rename(columns={"WIC_impute": "wic_women"}) # combine all WIC imputation into one variable wic = reduce( lambda left, right: pd.merge(left, right, on="peridnum"), [wic_children, wic_infants, wic_women], ) - wic["wic_impute"] = wic[["wic_women", "wic_infants", "wic_children"]].sum(axis=1) + wic["wic_impute"] = wic[["wic_women", "wic_infants", "wic_children"]].sum( + axis=1 + ) # merge housing and snap cps_merged = cps.merge(housing, on=["fh_seq", "ffpos"], how="left") @@ -72,7 +74,9 @@ def read_ben(path_prefix, usecols): if export: print("Saving {} Data".format(year)) - cps_merged.to_csv(Path(data_path, f"cpsmar{year}_ben.csv"), index=False) + cps_merged.to_csv( + Path(data_path, f"cpsmar{year}_ben.csv"), index=False + ) del mcaid, mcare, vb, ssi, ss, tanf, ui, wic, housing, snap # assert that no additional rows have been introduced by bad merges @@ -114,7 +118,9 @@ def distribute_benefits(data, other_ben): # Distribute other benefits data["dist_ben"] = data[["mcaid_ben", "ssi_ben", "snap_ben"]].sum(axis=1) data["ratio"] = ( - data["dist_ben"] * data["s006"] / (data["dist_ben"] * data["s006"]).sum() + data["dist_ben"] + * data["s006"] + / (data["dist_ben"] * data["s006"]).sum() ) # ... remove TANF and WIC from other_ben total tanf_total = (data["tanf_ben"] * data["s006"]).sum() diff --git a/taxdata/cps/cpsmar.py b/taxdata/cps/cpsmar.py index eb295ab4..21b2baa3 100644 --- a/taxdata/cps/cpsmar.py +++ b/taxdata/cps/cpsmar.py @@ -38,7 +38,14 @@ def person_details(record, benefits, h_seq, fhseq, ffpos, year): # calculate earned and unearned income EARNED_INC_VARS = ["wsal_val", "semp_val", "frse_val", "rnt_val"] - UNEARNED_INC_VARS = ["int_val", "div_val", "rtm_val", "alimony", "uc_val", "ss_val"] + UNEARNED_INC_VARS = [ + "int_val", + "div_val", + "rtm_val", + "alimony", + "uc_val", + "ss_val", + ] record["earned_inc"] = sum([record[var] for var in EARNED_INC_VARS]) record["unearned_inc"] = sum([record[var] for var in UNEARNED_INC_VARS]) record["tot_inc"] = record["earned_inc"] + record["unearned_inc"] diff --git a/taxdata/cps/create.py b/taxdata/cps/create.py index 58127998..e281657c 100644 --- a/taxdata/cps/create.py +++ b/taxdata/cps/create.py @@ -83,11 +83,15 @@ def create( cps_dfs[year] = pickle.load(pkl_path.open("rb")) else: # check if .dat file exists, and, if not, download it - data_file = Path(os.path.join(datapath, "asec" + str(year) + "_pubuse.dat")) + data_file = Path( + os.path.join(datapath, "asec" + str(year) + "_pubuse.dat") + ) if data_file.exists(): pass else: - cpsmar_url = "http://data.nber.org/cps/cpsmar" + str(year) + ".zip" + cpsmar_url = ( + "http://data.nber.org/cps/cpsmar" + str(year) + ".zip" + ) r = requests.get(cpsmar_url) z = zipfile.ZipFile(io.BytesIO(r.content)) z.extractall(datapath) diff --git a/taxdata/cps/finalprep.py b/taxdata/cps/finalprep.py index 4b2c23a0..23b86dfb 100644 --- a/taxdata/cps/finalprep.py +++ b/taxdata/cps/finalprep.py @@ -113,8 +113,12 @@ def adjust_helper(agi, var, target, weight, agi_bin): bin_8 = 
np.where((agi >= 40000) & (agi < 50000), var * weight, 0).sum() bin_9 = np.where((agi >= 50000) & (agi < 75000), var * weight, 0).sum() bin_10 = np.where((agi >= 75000) & (agi < 100_000), var * weight, 0).sum() - bin_11 = np.where((agi >= 100_000) & (agi < 200_000), var * weight, 0).sum() - bin_12 = np.where((agi >= 200_000) & (agi < 500_000), var * weight, 0).sum() + bin_11 = np.where( + (agi >= 100_000) & (agi < 200_000), var * weight, 0 + ).sum() + bin_12 = np.where( + (agi >= 200_000) & (agi < 500_000), var * weight, 0 + ).sum() bin_13 = np.where((agi >= 500_000) & (agi < 1e6), var * weight, 0).sum() bin_14 = np.where((agi >= 1e6) & (agi < 1.5e6), var * weight, 0).sum() bin_15 = np.where((agi >= 1.5e6) & (agi < 2e6), var * weight, 0).sum() diff --git a/taxdata/cps/helpers.py b/taxdata/cps/helpers.py index 4180dc10..be6abac1 100644 --- a/taxdata/cps/helpers.py +++ b/taxdata/cps/helpers.py @@ -64,31 +64,41 @@ def read_ben(path_prefix, usecols, index_col=None): - # Set global variables so they can be accested later + # Set global variables so they can be accessed later global MCAID, MCARE, VB, SNAP, SSI, SS, HOUSING, TANF, UI, WIC # read in benefit imputations - MCAID = read_ben("medicaid", ["MedicaidX", "peridnum"], "peridnum").to_dict("index") - MCARE = read_ben("medicare", ["MedicareX", "peridnum"], "peridnum").to_dict("index") - VB = read_ben("VB_Imputation", ["vb_impute", "peridnum"], "peridnum").to_dict( - "index" - ) - SNAP = read_ben("SNAP_Imputation_", ["h_seq", "snap_impute"], "h_seq").to_dict( - "index" - ) - SSI = read_ben("SSI_Imputation", ["ssi_impute", "peridnum"], "peridnum").to_dict( - "index" - ) + MCAID = read_ben( + "medicaid", ["MedicaidX", "peridnum"], "peridnum" + ).to_dict("index") + MCARE = read_ben( + "medicare", ["MedicareX", "peridnum"], "peridnum" + ).to_dict("index") + VB = read_ben( + "VB_Imputation", ["vb_impute", "peridnum"], "peridnum" + ).to_dict("index") + SNAP = read_ben( + "SNAP_Imputation_", ["h_seq", "snap_impute"], "h_seq" + ).to_dict("index") + SSI = read_ben( + "SSI_Imputation", ["ssi_impute", "peridnum"], "peridnum" + ).to_dict("index") SS = ( read_ben("SS_augmentation_", ["ss_val", "peridnum"], "peridnum") .rename(columns={"ss_val": "ss_impute"}) .to_dict("index") ) HOUSING = read_ben( - "Housing_Imputation_logreg_", ["fh_seq", "ffpos", "housing_impute"], None + "Housing_Imputation_logreg_", + ["fh_seq", "ffpos", "housing_impute"], + None, ) # make a unique index from fh_seq and ffpos - HOUSING["index"] = HOUSING["fh_seq"].astype(str) + HOUSING["ffpos"].astype(str) + HOUSING["index"] = HOUSING["fh_seq"].astype(str) + HOUSING["ffpos"].astype( + str + ) HOUSING.set_index("index", inplace=True) HOUSING = HOUSING.to_dict("index") # TODO: Look up how to drop duplicated index - TANF = read_ben("TANF_Imputation_", ["peridnum", "tanf_impute"], "peridnum") + TANF = read_ben( + "TANF_Imputation_", ["peridnum", "tanf_impute"], "peridnum" + ) # drop duplicated people in tanf TANF = TANF.loc[~TANF.index.duplicated(keep="first")] TANF = TANF.to_dict("index") @@ -103,16 +113,18 @@ def read_ben(path_prefix, usecols, index_col=None): wic_infants = read_ben( WIC_STR.format("infants"), ["peridnum", "WIC_impute"] ).rename(columns={"WIC_impute": "wic_infants"}) - wic_women = read_ben(WIC_STR.format("women"), ["peridnum", "WIC_impute"]).rename( - columns={"WIC_impute": "wic_women"} - ) + wic_women = read_ben( + WIC_STR.format("women"), ["peridnum", "WIC_impute"] + ).rename(columns={"WIC_impute": "wic_women"}) # combine all WIC imputation into one variable WIC = reduce( lambda left, right: pd.merge(left, right,
on="peridnum"), [wic_children, wic_infants, wic_women], ) - WIC["wic_impute"] = WIC[["wic_women", "wic_infants", "wic_children"]].sum(axis=1) + WIC["wic_impute"] = WIC[["wic_women", "wic_infants", "wic_children"]].sum( + axis=1 + ) # Set index to pernumid WIC = WIC.set_index("peridnum") WIC = WIC.to_dict("index") diff --git a/taxdata/cps/impute.py b/taxdata/cps/impute.py index 5989e28e..1cf05d9c 100644 --- a/taxdata/cps/impute.py +++ b/taxdata/cps/impute.py @@ -84,9 +84,13 @@ def imputation(data, logit_betas, ols_betas): data["joint_filer"] = np.where(data["mars"] == 2, 1, 0) # cap family size at 5 to match with the PUF data["fam_size"] = np.minimum(data["XTOT"], 5) - data["agede"] = (data["age_head"] >= filingparams.elderly_age[cps_yr_idx]).astype( + data["agede"] = ( + data["age_head"] >= filingparams.elderly_age[cps_yr_idx] + ).astype(int) + ( + data["age_spouse"] >= filingparams.elderly_age[cps_yr_idx] + ).astype( int - ) + (data["age_spouse"] >= filingparams.elderly_age[cps_yr_idx]).astype(int) + ) data["constant"] = np.ones(len(data)) np.random.seed(5410) # set random seed before imputations @@ -174,8 +178,12 @@ def imputation(data, logit_betas, ols_betas): # tobit models TOBIT_XVARS = ["lntot_inc", "joint_filer", "fam_size", "agede", "constant"] - data["CHARITABLE"] = tobit(data, ols_betas["char_ols"], TOBIT_XVARS, 48765.45, 1.0) - data["MISCITEM"] = tobit(data, ols_betas["misc_ols"], TOBIT_XVARS, 14393.99, 0.3) + data["CHARITABLE"] = tobit( + data, ols_betas["char_ols"], TOBIT_XVARS, 48765.45, 1.0 + ) + data["MISCITEM"] = tobit( + data, ols_betas["misc_ols"], TOBIT_XVARS, 14393.99, 0.3 + ) # add imputed capital gains and IRA distributions to total income data["tot_inc"] += data["CGAGIX"] + data["TIRAD"] @@ -195,7 +203,9 @@ def imputation(data, logit_betas, ols_betas): 1_000_000, np.inf, ] - dpad_bin = pd.cut(data["tot_inc"], DPAD_BINS, labels=np.arange(1, 10), right=False) + dpad_bin = pd.cut( + data["tot_inc"], DPAD_BINS, labels=np.arange(1, 10), right=False + ) DPAD_probs = [ 0.01524, 0.00477, @@ -208,10 +218,14 @@ def imputation(data, logit_betas, ols_betas): 0.54408, ] dpad_prob_dict = {x: prob for x, prob in zip(range(1, 10), DPAD_probs)} - dpad_prob = pd.Series([dpad_prob_dict[x] for x in dpad_bin]) * dpad_indicator + dpad_prob = ( + pd.Series([dpad_prob_dict[x] for x in dpad_bin]) * dpad_indicator + ) DPAD_bases = [20686, 1784, 2384, 2779, 3312, 4827, 10585, 24358, 116_275] dpad_base_dict = {x: base for x, base in zip(range(1, 10), DPAD_bases)} - dpad_base = pd.Series([dpad_base_dict[x] for x in dpad_bin]) * dpad_indicator + dpad_base = ( + pd.Series([dpad_base_dict[x] for x in dpad_bin]) * dpad_indicator + ) _prob = dpad_prob * 0.7 z1 = np.random.uniform(0, 1, len(dpad_prob)) diff --git a/taxdata/cps/pycps.py b/taxdata/cps/pycps.py index 13e7096f..300cb90f 100644 --- a/taxdata/cps/pycps.py +++ b/taxdata/cps/pycps.py @@ -31,7 +31,9 @@ def find_person(data: list, lineno: int) -> dict: raise ValueError(msg) -def eic_eligible(person: dict, age_head: int, age_spouse: int, mars: int) -> int: +def eic_eligible( + person: dict, age_head: int, age_spouse: int, mars: int +) -> int: """ Function to determine if a dependent is an EIC eligible child df: DataFrame of just dependents @@ -62,7 +64,9 @@ def eic_eligible(person: dict, age_head: int, age_spouse: int, mars: int) -> int return eligible -def find_claimer(claimerno: int, head_lineno: int, a_lineno: int, data: list) -> bool: +def find_claimer( + claimerno: int, head_lineno: int, a_lineno: int, data: list +) -> bool: """ Determine 
if an individual is the dependent of the head of the tax unit @@ -79,7 +83,9 @@ def find_claimer(claimerno: int, head_lineno: int, a_lineno: int, data: list) -> return True # see if person is dependent or spouse of head claimer = find_person(data, claimerno) - spouse_dep = claimer["a_spouse"] == claimerno | claimer["dep_stat"] == claimerno + # parenthesize each comparison: "|" binds more tightly than "==" + spouse_dep = (claimer["a_spouse"] == claimerno) | ( + claimer["dep_stat"] == claimerno + ) if spouse_dep: return True # follow any potential spouse/dependent trails to find the one @@ -251,7 +257,9 @@ def create_units(data, year, verbose=False, ctam_benefits=True): if filer: if verbose: print("dep filer", person["a_lineno"]) - tu = TaxUnit(person, year, dep_status=True, ctam_benefits=ctam_benefits) + tu = TaxUnit( + person, year, dep_status=True, ctam_benefits=ctam_benefits + ) # remove dependent from person claiming them units[person["claimer"]].remove_dependent(person) if verbose: @@ -300,7 +308,9 @@ def _create_units(data, year, verbose=False, ctam_benefits=False): if person["a_lineno"] == _person["dep_stat"]: if verbose: print("adding dependent", _person["a_lineno"]) - _eic = eic_eligible(_person, tu.age_head, tu.age_spouse, tu.mars) + _eic = eic_eligible( + _person, tu.age_head, tu.age_spouse, tu.mars + ) tu.add_dependent(_person, _eic) dependents.append(_person) units[person["a_lineno"]] = tu @@ -311,7 +321,9 @@ if person["filestat"] != 6: if verbose: print("dep filer", person["a_lineno"]) - tu = TaxUnit(person, year, dep_status=True, ctam_benefits=ctam_benefits) + tu = TaxUnit( + person, year, dep_status=True, ctam_benefits=ctam_benefits + ) # remove dependent from person claiming them units[person["claimer"]].remove_dependent(person) if verbose: diff --git a/taxdata/cps/taxunit.py b/taxdata/cps/taxunit.py index d187f121..3e9f3ca8 100644 --- a/taxdata/cps/taxunit.py +++ b/taxdata/cps/taxunit.py @@ -1,4 +1,9 @@ -from .helpers import filingparams, cps_yr_idx, C_TAM_BENEFIT_TUPLES, CPS_BENEFIT_TUPLES +from .helpers import ( + filingparams, + cps_yr_idx, + C_TAM_BENEFIT_TUPLES, + CPS_BENEFIT_TUPLES, +) INCOME_TUPLES = [ diff --git a/taxdata/cps/transform_sas.py b/taxdata/cps/transform_sas.py index 44d3eaa3..43082224 100644 --- a/taxdata/cps/transform_sas.py +++ b/taxdata/cps/transform_sas.py @@ -77,7 +77,11 @@ def main(): person = parse_sas(sas) sas.close() - master_dict[year] = {"household": household, "family": family, "person": person} + master_dict[year] = { + "household": household, + "family": family, + "person": person, + } with Path(CUR_PATH, "master_cps_dict.pkl").open("wb") as f: pickle.dump(master_dict, f) diff --git a/taxdata/cps/validation.py b/taxdata/cps/validation.py index 5881f06a..0d30bf0c 100644 --- a/taxdata/cps/validation.py +++ b/taxdata/cps/validation.py @@ -74,7 +74,11 @@ def record_error(var, h_seq, pycps, cps, year): # elderly people in the household if elderly_deps_data > elderly_deps_cps: record_error( - "elderly_dependents", h_seq, elderly_deps_data, elderly_deps_cps, year + "elderly_dependents", + h_seq, + elderly_deps_data, + elderly_deps_cps, + year, ) num_errors += 1 diff --git a/taxdata/matching/statmatch.py b/taxdata/matching/statmatch.py index b015aa88..de117dca 100644 --- a/taxdata/matching/statmatch.py +++ b/taxdata/matching/statmatch.py @@ -71,9 +71,13 @@ def match( # get cell counts and ID's if they partition if groupby: d_count = counts(donor, groupby, donor_wt) - d_count.rename(columns={"count": "d_count", "wt": "d_wt"}, inplace=True) +
+        d_count.rename(
+            columns={"count": "d_count", "wt": "d_wt"}, inplace=True
+        )
         r_count = counts(recipient, groupby, recipient_wt)
-        r_count.rename(columns={"count": "r_count", "wt": "r_wt"}, inplace=True)
+        r_count.rename(
+            columns={"count": "r_count", "wt": "r_wt"}, inplace=True
+        )
         full_count = pd.merge(r_count, d_count, on=groupby, how="inner")
         full_count["cellid"] = full_count.index + 1
         full_count["factor"] = full_count["r_wt"] / full_count["d_wt"]
@@ -90,7 +94,9 @@ def match(

     # run regression on each cell id
     gdf = recipient.groupby("cellid", as_index=False)
-    params = gdf.apply(reg, dep_var=predict_var, indep_vars=match_on, wt=recipient_wt)
+    params = gdf.apply(
+        reg, dep_var=predict_var, indep_vars=match_on, wt=recipient_wt
+    )
     params = params.add_prefix("param_")
     params["cellid"] = params.index + 1
     donor = pd.merge(donor, params, on="cellid", how="inner")
diff --git a/taxdata/puf/finalprep.py b/taxdata/puf/finalprep.py
index b3b78345..ba70fe06 100644
--- a/taxdata/puf/finalprep.py
+++ b/taxdata/puf/finalprep.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import taxcalc as tc
 from .impute_pencon import impute_pension_contributions
+from .impute_qbid_w2 import impute_PT_binc_w2_wages
 from .constants import UNUSED_READ_VARS
 from pathlib import Path

@@ -34,7 +35,9 @@ def finalprep(data):
     medical_limit = np.maximum(
         zero, data["e17500"] - np.maximum(zero, data["e00100"]) * 0.075
     )
-    med_adj = np.minimum(medical_limit, 0.025 * np.maximum(zero, data["e00100"]))
+    med_adj = np.minimum(
+        medical_limit, 0.025 * np.maximum(zero, data["e00100"])
+    )
     stx_adj = np.maximum(zero, data["e18400"])
     cmbtp_itemizer = (
         cmbtp_standard
@@ -57,6 +60,9 @@ def finalprep(data):
     # - Replace e20500 with g20500:
     data = replace_20500(data)

+    # - Impute PT_binc_w2_wages:
+    data = impute_PT_binc_w2_wages(data)
+
     data["s006"] = data["matched_weight"] * 100

     # - Remove variables not expected by Tax-Calculator:
@@ -226,9 +232,9 @@ def split_earnings_variables(data, data_year):
     # split wage-and-salary earnings subject to FICA taxation
     # the two e00200x variables come from the CPS. We'll use them just for
     # the wage ratio that we split up the PUF wages from
-    total = np.where(data["MARS"] == 2, data["e00200p"] + data["e00200s"], 0).astype(
-        float
-    )
+    total = np.where(
+        data["MARS"] == 2, data["e00200p"] + data["e00200s"], 0
+    ).astype(float)
     frac_p = np.where(total != 0, data["e00200p"] / total, 1.0)
     frac_s = 1.0 - frac_p
     data["e00200p"] = np.around(frac_p * data["e00200"], 2)
@@ -319,7 +325,9 @@ def replace_20500(data):
     (gross loss values less than 10% AGI are unknown and assumed to be zero)
     """
     gross = np.where(
-        data.e20500 > 0.0, data.e20500 + 0.10 * np.maximum(0.0, data.e00100), 0.0
+        data.e20500 > 0.0,
+        data.e20500 + 0.10 * np.maximum(0.0, data.e00100),
+        0.0,
     )
     data["g20500"] = np.int_(gross.round())
     return data
diff --git a/taxdata/puf/impute_itmexp.py b/taxdata/puf/impute_itmexp.py
index 375be343..e5c7344a 100644
--- a/taxdata/puf/impute_itmexp.py
+++ b/taxdata/puf/impute_itmexp.py
@@ -46,7 +46,12 @@ def impute(
-    ievar, logit_prob_af, log_amount_af, exogenous_vars, itemizer_data, nonitemizer_data
+    ievar,
+    logit_prob_af,
+    log_amount_af,
+    exogenous_vars,
+    itemizer_data,
+    nonitemizer_data,
 ):
     """
     Function that estimates imputation equations for ievar with itemizer_data
@@ -78,7 +83,8 @@ def impute(
     # with the Heckman sample selection problems present in this imputation
     # process.
     tpi_data = itemizer_data[
-        (itemizer_data[ievar] > 0) & (itemizer_data[ievar] < itemizer_data["stdded"])
+        (itemizer_data[ievar] > 0)
+        & (itemizer_data[ievar] < itemizer_data["stdded"])
     ]
     ols_y = np.log(tpi_data[ievar])
     ols_x = tpi_data[exogenous_vars]
@@ -208,8 +214,12 @@ def standard_deduction(row):
     print("PUF fraction of ALL = {:.4f}".format(data["filer"].mean()))
     ier = data["itemizer"]
     print("ALL itemizer mean = {:.4f}".format(ier.mean()))
-    print("PUF itemizer mean = {:.4f}".format(ier[data["filer"] == 1].mean()))
-    print("CPS itemizer mean = {:.4f}".format(ier[data["filer"] == 0].mean()))
+    print(
+        "PUF itemizer mean = {:.4f}".format(ier[data["filer"] == 1].mean())
+    )
+    print(
+        "CPS itemizer mean = {:.4f}".format(ier[data["filer"] == 0].mean())
+    )
     for iev in iev_names:
         var = itemizer_data[iev]
         varpos = var > 0
@@ -227,7 +237,11 @@ def standard_deduction(row):
     for iev in iev_names:
         var = nonitemizer_data[iev]
         varpos = var > 0
-        print("frac of non-itemizers with {}>0 = {:.4f}".format(iev, varpos.mean()))
+        print(
+            "frac of non-itemizers with {}>0 = {:.4f}".format(
+                iev, varpos.mean()
+            )
+        )

     # specify 2011 JCT count/amount targets for nonitemizers
     # (When JCX-75-15 Table 2 contains more than one line item for a
@@ -336,7 +350,9 @@ def standard_deduction(row):
     if DUMP2:
         r_a = nonitemizer_data[iev_names].sum(axis=1) / stdded
         print(
-            "AFTER: num of nonitemizers with sum>stdded = {}".format(len(r_a[r_a > 1]))
+            "AFTER: num of nonitemizers with sum>stdded = {}".format(
+                len(r_a[r_a > 1])
+            )
         )
         print(
             "AFTER: frac of nonitemizers with sum>stdded = {:.4f}".format(
diff --git a/taxdata/puf/impute_pencon.py b/taxdata/puf/impute_pencon.py
index 36d886ba..a4134ae7 100644
--- a/taxdata/puf/impute_pencon.py
+++ b/taxdata/puf/impute_pencon.py
@@ -71,8 +71,12 @@ def targets(year):
     revised data specified here.
     Also, the top two wage groups (5u10M and 10u30M) are combined into
     a single group (5Mplus).
""" - cnt_df = pd.read_csv(Path(CURPATH, f"dcpentargetcnt{year}.csv"), index_col=0) - amt_df = pd.read_csv(Path(CURPATH, f"dcpentargetamt{year}.csv"), index_col=0) + cnt_df = pd.read_csv( + Path(CURPATH, f"dcpentargetcnt{year}.csv"), index_col=0 + ) + amt_df = pd.read_csv( + Path(CURPATH, f"dcpentargetamt{year}.csv"), index_col=0 + ) return cnt_df, amt_df @@ -183,7 +187,9 @@ def impute(idata, target_cnt, target_amt): if wgrp >= MIN_HIWAGE_GROUP: prob *= HIWAGE_PROB_SF if DUMP1 and prob > 1.0: - print("agrp={};wgrp={} ==> prob= {:.3f}".format(agrp, wgrp, prob)) + print( + "agrp={};wgrp={} ==> prob= {:.3f}".format(agrp, wgrp, prob) + ) cell_idata["pencon"] = np.where(cell_idata["urn"] < prob, 1, 0) pos_pc = cell_idata["pencon"] > 0 if pos_pc.sum() == 0: # no positive pension contributions in cell @@ -198,15 +204,23 @@ def impute(idata, target_cnt, target_amt): cell_target_amt = target_amt.iloc[wgrp, agrp] rate0 = min(1.0, cell_target_amt / wgt_pos_pc_wages) if DUMP2: - print("agrp={};wgrp={} ==> rate0= {:.4f}".format(agrp, wgrp, rate0)) + print( + "agrp={};wgrp={} ==> rate0= {:.4f}".format( + agrp, wgrp, rate0 + ) + ) # iteratively raise non-capped deferral rate to hit target_amt num_iterations = 10 for itr in range(0, num_iterations): - uncapped_amt = np.where(pos_pc, np.round(wage * rate0).astype(int), 0) + uncapped_amt = np.where( + pos_pc, np.round(wage * rate0).astype(int), 0 + ) capped_amt = np.minimum(uncapped_amt, MAX_PENCON_AMT) over_amt = uncapped_amt - capped_amt over_tot = (over_amt * wgt).sum() * 1e-9 - rate1 = min(1.0, (cell_target_amt + over_tot) / wgt_pos_pc_wages) + rate1 = min( + 1.0, (cell_target_amt + over_tot) / wgt_pos_pc_wages + ) if np.allclose([rate1], [rate0]): if DUMP2 and itr > 0: print(" iter={} ==> rate= {:.4f}".format(itr, rate0)) @@ -238,10 +252,14 @@ def impute_pension_contributions(alldata, year): target_amt.drop(labels="total", axis="columns", inplace=True) if DUMP0: print( - "target_cnt.shape={} and size={}".format(target_cnt.shape, target_cnt.size) + "target_cnt.shape={} and size={}".format( + target_cnt.shape, target_cnt.size + ) ) print( - "target_amt.shape={} and size={}".format(target_amt.shape, target_amt.size) + "target_amt.shape={} and size={}".format( + target_amt.shape, target_amt.size + ) ) print("len(UNDER_AGE)={}".format(len(UNDER_AGE))) print("len(UNDER_WAGE)={}".format(len(UNDER_WAGE))) @@ -267,7 +285,9 @@ def impute_pension_contributions(alldata, year): idata_s["spouse"] = np.ones(len(idata_s.index), dtype=np.int8) idata_s["weight"] = alldata["s006"] * 0.01 idata_s.rename( - {"age_spouse": "age", "e00200s": "e00200"}, axis="columns", inplace=True + {"age_spouse": "age", "e00200s": "e00200"}, + axis="columns", + inplace=True, ) # ... combine the _p and _s DataFrames idata = pd.concat([idata_p, idata_s], copy=True) diff --git a/taxdata/puf/impute_qbid_w2.py b/taxdata/puf/impute_qbid_w2.py new file mode 100644 index 00000000..da19a51c --- /dev/null +++ b/taxdata/puf/impute_qbid_w2.py @@ -0,0 +1,89 @@ +""" +Impute W-2 wages share in Qualified Business Income in PUF. + +The objective of this imputation is to improve the accuracy of Qualified +Business Income Deduction (QBID) revenue estimates produced by Tax-Calculator. +In the absence of a valid W-2 wage share variable (e.g., when it is set to zero), +Tax-Calculatorgenerates QBID values that diverge significantly from benchmark +estimates published by agencies such as the Congressional Budget Office (CBO). 
+
+Since no microdata source contains reliable information on the W-2 wage share
+associated with pass-through income, a statistical imputation is required.
+We calibrate the imputation to match CBO's published QBID revenue estimates.
+Note that there is an observed inconsistency between the CBO QBID estimates
+published for the years before 2020 and those for the years after 2021, so
+only the QBID values for the years 2021-2025 are used as targets; they better
+reflect the policy environment of recent years. Targets will be updated as
+new CBO QBID projections become available.
+
+The imputation strategy is to minimize the distance between Tax-Calculator's
+simulated QBID revenue and CBO's published targets through an optimization
+process. Specifically, we define a loss function to measure the year-by-year
+squared deviations in aggregate QBID amounts and solve for the W-2 wage share
+parameter that minimizes this loss.
+
+The imputation adds the PT_binc_w2_wages variable to the puf.csv file.
+"""
+
+import pandas as pd
+import taxcalc as tc
+import numpy as np
+from scipy.optimize import minimize_scalar
+from functools import partial
+
+
+def impute_PT_binc_w2_wages(data):
+    """
+    Main function in the impute_qbid_w2.py file.
+    Argument: puf.csv DataFrame just before imputation is done.
+    Returns: puf.csv DataFrame with imputed W-2 wage share variable.
+    """
+    # add the qualified business income variable; note that the
+    # self-employment part is not included, to simplify the calculation
+    data["qbinc"] = (
+        data["e00900"] + data["e26270"] + data["e02100"] + data["e27200"]
+    )
+    # solve for the W-2 wage share in QBI
+    w2_ratio = opt_ratio(data)
+    data["PT_binc_w2_wages"] = w2_ratio * data["qbinc"]
+    del data["qbinc"]
+    return data
+
+
+def qbid_value(data, ratio):
+    """
+    Function that calculates aggregate QBID for each target year.
+    """
+    qbided = []
+    df = data.copy()
+    df["PT_binc_w2_wages"] = ratio * df["qbinc"]
+    pol = tc.Policy()
+    recs = tc.Records(data=df)
+    calc0 = tc.Calculator(policy=pol, records=recs)
+    # advance the calculator through each target year with the
+    # candidate W-2 wage share and record aggregate QBID
+    for year in range(2021, 2026):
+        calc0.advance_to_year(year)
+        calc0.calc_all()
+        qbided.append(calc0.weighted_total("qbided"))
+    return np.array(qbided)
+
+
+def loss_function(data, ratio):
+    """
+    Function that computes the loss between Tax-Calculator's simulated
+    QBID revenue and CBO's published targets for aggregate QBID.
+    """
+    # CBO QBID aggregates for the years 2021-2025
+    target = [205.1e9, 215.7e9, 229.5e9, 247.1e9, 258.3e9]
+    return np.sum((qbid_value(data, ratio) - target) ** 2)
+
+
+def opt_ratio(data):
+    """
+    Function that solves for the W-2 wage share by minimizing the loss
+    function.
+    Argument: puf.csv DataFrame before imputation.
+    Returns: the W-2 wage share of Qualified Business Income (a fraction
+    in [0, 1]).
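+
+    Example (as wired up in this module): impute_PT_binc_w2_wages calls
+    this once and scales qbinc by the returned share, i.e.
+
+        data["PT_binc_w2_wages"] = opt_ratio(data) * data["qbinc"]
+
+    Each loss evaluation runs Tax-Calculator over 2021-2025, so the search
+    is deliberately limited to a single bounded scalar parameter.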
+ """ + loss = partial(loss_function, data) + res = minimize_scalar(loss, bounds=(0, 1), method="bounded") + # print(f"Optimal ratio: {res.x:.4f}, target: {res.fun:.4f}") + return res.x diff --git a/tests/test_data.py b/tests/test_data.py index 70450c81..4fcb7df8 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -37,9 +37,13 @@ def min_max(data, meta, dataname): if dataname not in availability: in_data = False if in_data: - m = "{}-{} contains values less than min value".format(dataname, var) + m = "{}-{} contains values less than min value".format( + dataname, var + ) assert np.all(data[var] >= min_value), m - m = "{}-{} contains values greater than max value".format(dataname, var) + m = "{}-{} contains values greater than max value".format( + dataname, var + ) assert np.all(data[var] <= max_value), m @@ -131,7 +135,9 @@ def variable_check(test_path, data, dataname): expected_max[var] = int(split[3]) # loop through each column in the dataset and check sum, min, max - actual_txt = "{:20}{:>15}{:>15}{:>15}\n".format("VARIABLE", "SUM", "MIN", "MAX") + actual_txt = "{:20}{:>15}{:>15}{:>15}\n".format( + "VARIABLE", "SUM", "MIN", "MAX" + ) var_inform = "{:20}{:15d}{:15d}{:15d}\n" diffs = False diff_list_str = "" # string to hold all of the variables with errors @@ -196,7 +202,17 @@ def check_cps_benefits(data, expect_ben_stat): and average value for each of the benefits in the CPS. That information can be found in cps_benefits_metadata.json """ - BNAMES = ["mcare", "mcaid", "ssi", "snap", "wic", "tanf", "housing", "vet", "other"] + BNAMES = [ + "mcare", + "mcaid", + "ssi", + "snap", + "wic", + "tanf", + "housing", + "vet", + "other", + ] # # compare actual and expected benefit statistics error_msg = "" wgt = data["s006"] * 0.01 diff --git a/tests/test_ratios.py b/tests/test_ratios.py index c53aceee..95d68fce 100644 --- a/tests/test_ratios.py +++ b/tests/test_ratios.py @@ -43,7 +43,11 @@ def test_ratios( for col in ratios: if ratios[col].min() < min_ratio: msg = "{} ratios[{}].min()={} < {}" - raise ValueError(msg.format(kind, col, ratios[col].min(), min_ratio)) + raise ValueError( + msg.format(kind, col, ratios[col].min(), min_ratio) + ) if ratios[col].max() > max_ratio: msg = "{} ratios[{}].max()={} > {}" - raise ValueError(msg.format(kind, col, ratios[col].max(), max_ratio)) + raise ValueError( + msg.format(kind, col, ratios[col].max(), max_ratio) + ) diff --git a/tests/test_weights.py b/tests/test_weights.py index adb09bb9..3ef8a7c4 100644 --- a/tests/test_weights.py +++ b/tests/test_weights.py @@ -50,10 +50,14 @@ def test_weights( for col in weights: if weights[col].min() <= MIN_WEIGHT: msg = "{} weights[{}].min()={} <= {}" - raise ValueError(msg.format(kind, col, weights[col].min(), MIN_WEIGHT)) + raise ValueError( + msg.format(kind, col, weights[col].min(), MIN_WEIGHT) + ) if weights[col].max() > MAX_WEIGHT: msg = "{} weights[{}].max()={} > {}" - raise ValueError(msg.format(kind, col, weights[col].max(), MAX_WEIGHT)) + raise ValueError( + msg.format(kind, col, weights[col].max(), MAX_WEIGHT) + ) # test sum of weights (in millions) for each year MIN_WEIGHT_SUM = 144 MAX_WEIGHT_SUM = 258