Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
!demographics.csv
!incomes_projection.csv
!policyengine_uk_data/datasets/local_areas/**/*.csv
!policyengine_uk_data/datasets/firm/**/*.csv
**/_build
!policyengine_uk_data/storage/*.csv
**/version.json
3 changes: 3 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- bump: minor
changes:
- Firm synthetic data generation script with ONS and HMRC data integration
234 changes: 234 additions & 0 deletions policyengine_uk_data/datasets/firm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
"""
Firm dataset for PolicyEngine UK.

This module processes synthetic firm data into PolicyEngine UK dataset format,
handling firm demographics, turnover, VAT, employment, and other business variables.
The synthetic firm data represents the UK business population for tax-benefit modelling.
"""

from policyengine_core.data import Dataset
from pathlib import Path
import pandas as pd
import numpy as np
from policyengine_uk_data.utils.datasets import STORAGE_FOLDER
import logging

logger = logging.getLogger(__name__)


def create_firm(year: int = 2023):
    """
    Generate synthetic firm microdata and package it as firm-level tables.

    A ``SyntheticFirmGenerator`` (imported from the sibling ``firm``
    directory) is run on the CPU, and its output is reshaped into three
    DataFrames: one row per firm, one row per distinct SIC sector and one
    row per business group.

    Args:
        year: Value written to the ``year`` column of every table and to
            the metadata. It is not passed to the generator.

    Returns:
        A lightweight ``FirmDataset`` object exposing ``firm``, ``sector``
        and ``business_group`` DataFrames, a ``metadata`` dict of weighted
        summary totals, and a ``save(path)`` method that writes the three
        tables to a single HDF5 file.
    """
    # Always generate fresh synthetic data using generate_synthetic_data.py
    logger.info("Generating synthetic firm data...")
    import sys

    # The generator lives in a plain directory next to this module, so it is
    # made importable via sys.path. Guard against adding a duplicate entry
    # every time this function is called.
    generator_dir = str(Path(__file__).parent / "firm")
    if generator_dir not in sys.path:
        sys.path.append(generator_dir)
    from generate_synthetic_data import SyntheticFirmGenerator

    generator = SyntheticFirmGenerator(device="cpu")
    synthetic_df = generator.generate_synthetic_firms()

    # Create entity DataFrames for firm structure
    pe_firm = pd.DataFrame()
    pe_sector = pd.DataFrame()
    pe_business_group = pd.DataFrame()

    # Add primary keys and identifiers.
    # Every column copied from synthetic_df is assigned positionally
    # (.values): pe_firm carries a fresh 0..n-1 index, so a label-aligned
    # Series assignment would produce NaN or misordered rows whenever the
    # generator returns a frame with a non-default index.
    pe_firm["firm_id"] = range(len(synthetic_df))
    pe_firm["firm_sector_id"] = synthetic_df["sic_code"].astype(int).values
    # Consecutive runs of 100 firm ids share one business group id.
    pe_firm["firm_business_group_id"] = pe_firm["firm_id"] // 100

    # Create unique sectors
    pe_sector["sector_id"] = synthetic_df["sic_code"].astype(int).unique()

    # Create business groups
    pe_business_group["business_group_id"] = pe_firm[
        "firm_business_group_id"
    ].unique()

    # Add grossing weights
    pe_firm["firm_weight"] = synthetic_df["weight"].values

    # Add basic firm variables - exactly from synthetic data
    pe_firm["sic_code"] = synthetic_df["sic_code"].values
    pe_firm["annual_turnover_k"] = synthetic_df["annual_turnover_k"].values
    pe_firm["annual_input_k"] = synthetic_df["annual_input_k"].values
    pe_firm["vat_liability_k"] = synthetic_df["vat_liability_k"].values
    pe_firm["employment"] = synthetic_df["employment"].astype(int).values
    pe_firm["vat_registered"] = (
        synthetic_df["vat_registered"].astype(bool).values
    )

    # Turnover bands, with edges in thousands of pounds (the 150 edge pairs
    # with the "£150k" label). 85 is presumably the VAT registration
    # threshold - TODO confirm against the generator.
    turnover_bins = [0, 85, 150, 300, 500, 1000, 10000, float("inf")]
    turnover_labels = [
        "£1_to_Threshold",
        "£Threshold_to_£150k",
        "£150k_to_£300k",
        "£300k_to_£500k",
        "£500k_to_£1m",
        "£1m_to_£10m",
        "Greater_than_£10m",
    ]
    zero_band = "Negative_or_Zero"
    # Intervals are right-closed and the lowest edge is deliberately NOT
    # included: a turnover of exactly 0 must fall outside every bin so that
    # it lands in "Negative_or_Zero" rather than "£1_to_Threshold".
    turnover_band = pd.cut(
        pe_firm["annual_turnover_k"],
        bins=turnover_bins,
        labels=turnover_labels,
    )
    # Fill unbinned rows through the categorical API instead of matching the
    # literal string "nan", which depends on how astype(str) renders
    # missing values.
    pe_firm["hmrc_band"] = (
        turnover_band.cat.add_categories([zero_band])
        .fillna(zero_band)
        .astype(str)
    )

    # Employment bands; include_lowest keeps firms with 0 employees in "0-4".
    employment_bins = [0, 4, 9, 19, 49, 99, 249, float("inf")]
    employment_labels = [
        "0-4",
        "5-9",
        "10-19",
        "20-49",
        "50-99",
        "100-249",
        "250+",
    ]
    pe_firm["employment_band"] = pd.cut(
        pe_firm["employment"],
        bins=employment_bins,
        labels=employment_labels,
        include_lowest=True,
    ).astype(str)

    pe_firm["sic_numeric"] = pe_firm["sic_code"].astype(int)

    # Add year field
    pe_firm["year"] = year
    pe_sector["year"] = year
    pe_business_group["year"] = year

    # Create the dataset - use a simple object to hold the data
    class FirmDataset:
        def __init__(self):
            self.firm = pe_firm
            self.sector = pe_sector
            self.business_group = pe_business_group

        def save(self, path):
            # Save as HDF5 for compatibility. The first write truncates the
            # file (mode="w"); the remaining tables are appended to it.
            self.firm.to_hdf(path, key="firm", mode="w")
            self.sector.to_hdf(path, key="sector", mode="a")
            self.business_group.to_hdf(path, key="business_group", mode="a")

    dataset = FirmDataset()

    # Add metadata about the dataset. Monetary columns are in thousands of
    # pounds, so dividing the weighted sum by 1e6 yields billions.
    weights = pe_firm["firm_weight"]
    dataset.metadata = {
        "source": "synthetic_firm_generator",
        "year": year,
        "n_firms": len(pe_firm),
        "total_weighted_firms": weights.sum(),
        "vat_registered_firms": pe_firm[pe_firm["vat_registered"]][
            "firm_weight"
        ].sum(),
        "total_employment": (pe_firm["employment"] * weights).sum(),
        "total_turnover_billions": (
            pe_firm["annual_turnover_k"] * weights
        ).sum()
        / 1e6,
        "total_vat_liability_billions": (
            pe_firm["vat_liability_k"] * weights
        ).sum()
        / 1e6,
    }

    logger.info(f"Created firm dataset with {len(pe_firm):,} firms")
    logger.info(
        f"Total weighted population: {dataset.metadata['total_weighted_firms']:,.0f}"
    )
    logger.info(
        f"Total employment: {dataset.metadata['total_employment']:,.0f}"
    )
    logger.info(
        f"Total turnover: £{dataset.metadata['total_turnover_billions']:.1f}bn"
    )

    return dataset


# Dataset class for direct import like FRS
class firm_2023_24:
    """UK Firm dataset for 2023-24, following the FRS pattern.

    On construction the three tables (``firm``, ``sector`` and
    ``business_group``) are read from ``firm_2023_24.h5`` in the storage
    folder. If that file does not exist, the dataset is built with
    ``create_firm(year=2023)``, saved to that path, and the freshly built
    tables are used instead.
    """

    def __init__(self):
        dataset_path = STORAGE_FOLDER / "firm_2023_24.h5"

        if not dataset_path.exists():
            # Nothing cached yet: build the dataset and persist it.
            built = create_firm(year=2023)
            built.save(dataset_path)
            self.firm = built.firm
            self.sector = built.sector
            self.business_group = built.business_group
            return

        # Cached file present: each table is stored under its own HDF key,
        # which doubles as the attribute name.
        for table_key in ("firm", "sector", "business_group"):
            setattr(self, table_key, pd.read_hdf(dataset_path, key=table_key))


# Main execution for testing
if __name__ == "__main__":
    # Script entry point: build the firm dataset, write it to the storage
    # folder and print a summary of its metadata.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

    logger.info("Creating firm dataset...")

    # Create the dataset
    firm_dataset = create_firm(year=2023)

    # Save to storage
    output_path = STORAGE_FOLDER / "firm_2023_24.h5"
    firm_dataset.save(output_path)

    logger.info(f"Saved firm dataset to {output_path}")

    # Display summary statistics
    banner = "=" * 60
    # Numeric entries printed as whole numbers with thousands separators.
    count_keys = (
        "n_firms",
        "total_weighted_firms",
        "vat_registered_firms",
        "total_employment",
    )

    print("\n" + banner)
    print("FIRM DATASET SUMMARY")
    print(banner)

    for key, value in firm_dataset.metadata.items():
        is_number = isinstance(value, (int, float))
        if is_number and "billions" in key:
            rendered = f"£{value:.2f}bn"
        elif is_number and key in count_keys:
            rendered = f"{value:,.0f}"
        else:
            # Non-numeric values, and numeric ones with no special format.
            rendered = f"{value}"
        print(f"{key}: {rendered}")

    print(banner)
    print("Dataset creation complete!")
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
Trade_Sector,Trade_Sub_Sector,2023-24
00001,"Crop and animal production, hunting and related service activities",-2330
00002,Forestry and logging,100
00003,Fishing and aquaculture,-100
00005,Mining of coal and lignite,10
00006,Extraction of crude petroleum and natural gas,-460
00007,Mining of metal ores,-10
00008,Other mining and quarrying,270
00009,Mining and support service activities,-330
00010,Manufacture of food products,-2150
00011,Manufacture of beverages,1820
00012,Manufacture of tobacco products,1700
00013,Manufacture of textiles,450
00014,Manufacture of wearing apparel,280
00015,Manufacture of leather and related products,140
00016,"Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials",650
00017,Manufacture of paper and paper products,920
00018,Printing and reproduction of recorded media,480
00019,Manufacture of coke and refined petroleum products,7810
00020,Manufacture of chemicals and chemical products,1500
00021,Manufacture of basic pharmaceutical products and pharmaceutical preparations,570
00022,Manufacture of rubber and rubber products,1440
00023,Manufacture of other non-metallic mineral products,1130
00024,Manufacture of basic metals,-830
00025,"Manufacture of fabricated metal products, except machinery and equipment",2000
00026,"Manufacture of computer, electronic and optical products",1050
00027,Manufacture of electrical equipment,770
00028,Manufacture of machinery and equipment n.e.c,1150
00029,"Manufacture of motor vehicles, trailers and semi-trailers",2040
00030,Manufacture of other transport equipment,-1240
00031,Manufacture of furniture,730
00032,Other manufacturing,1090
00033,Repair and installation of machinery and equipment,810
00035,"Electricity, gas, steam and air conditioning supply",6020
00036,"Water collection, treatment and supply",-2250
00037,Sewerage,-10
00038,"Waste collection, treatment and disposal activities; materials recovery",680
00039,Remediation activities and other waste management services,370
00041,Construction of buildings,2780
00042,Civil engineering,4130
00043,Specialised construction activities,3950
00045,Wholesale and retail trade and repair of motor vehicles and motorcycles,13040
00046,"Wholesale trade, except of motor vehicles and motorcycles",25980
00047,Retail trade except of motor vehicles and motorcycles,16440
00049,Land transport and transport via pipelines,-420
00050,Water transport,-100
00051,Air transport,-1080
00052,Warehousing and support activities for transportation,1930
00053,Postal and courier activities,1140
00055,Accommodation,2760
00056,Food and beverage service activities,8450
00058,Publishing activities,700
00059,"Motion picture, video and television programme production, sound recording and music publishing activities",210
00060,Programming and broadcasting activities,-20
00061,Telecommunications,4320
00062,"Computer programming, consultancy and related activities",13720
00063,Information service activities,460
00064,"Financial service activities, except insurance and pension funding",-190
00065,"Insurance, reinsurance and pension funding, except compulsory social security",200
00066,Activities auxiliary to financial services and insurance activities,1180
00068,Real estate activities,4180
00069,Legal and accounting services,9120
00070,Activities of head offices; management consultancy services,9010
00071,Architectural and engineering activities; technical testing and analysis,4830
00072,Scientific research and development,160
00073,Advertising and market research,1990
00074,"Other professional, scientific and technical activities",1560
00075,Veterinary activities,770
00077,Rental and leasing activities,3410
00078,Employment activities,6000
00079,"Travel agency, tour operator and other reservation service and related activities",-30
00080,Security and investigation activities,1050
00081,Services to buildings and landscape activities,2150
00082,Office administrative and support activities,6230
00084,Public administration and defence; compulsory social security,-11080
00085,Education,410
00086,Human health activities,60
00087,Residential care activities,260
00088,Social work activities without accommodation,370
00090,"Creative, arts and entertainment activities",1690
00091,"Libraries, archives, museums and other cultural activities",40
00092,Gambling and betting activities,50
00093,Sports activities and amusement and recreation activities,1680
00094,Activities of membership organisations,520
00095,Repair of computers and personal and household goods,280
00096,Other personal service activities,2780
00097,Activities of households as employers of domestic personnel,0
00098,Undifferentiated goods- and services-producing activities of private households for own use,0
00099,Activities of extraterritorial organisations and bodies,40
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Financial_Year,Negative_or_Zero,£1_to_Threshold,£Threshold_to_£150k,£150k_to_£300k,£300k_to_£500k,£500k_to_£1m,£1m_to_£10m,Greater_than_£10m,Total
2004-05,-980,150,2660,3020,2650,4100,14080,35270,60940
2005-06,-1850,200,2630,2980,2640,3880,13060,32450,55980
2006-07,-1490,170,2780,3150,2810,4250,14750,36330,62760
2007-08,-1700,90,2680,3130,2720,4410,15900,40920,68140
2008-09,-1540,120,2750,3060,2640,4150,15030,37590,63810
2009-10,-960,470,2490,2770,2340,3550,14200,38780,63640
2010-11,-1010,570,2880,3220,2630,4120,15750,41020,69200
2011-12,-1300,680,3170,3760,2940,4630,17180,47370,78430
2012-13,-1270,820,3190,3810,2980,4710,17700,50140,82070
2013-14,-1490,800,3190,4000,3150,4800,19020,54700,88170
2014-15,-1770,800,3300,4250,3220,5090,20210,57960,93060
2015-16,-2230,720,3420,4370,3400,5300,21060,61120,97160
2016-17,-2200,750,3510,4700,3460,5470,21460,62960,100110
2017-18,-2250,770,3740,4830,3650,5640,22740,63820,102930
2018-19,-2270,820,3760,4980,3650,5690,23360,65300,105300
2019-20,-2270,770,3730,5170,3790,5870,23820,68270,109130
2020-21,-2220,440,2530,3540,2770,4550,19620,67000,98230
2021-22,-2500,1260,2290,3600,3060,5040,22720,105180,140650
2022-23,-3050,-10,2300,4380,3690,5880,26340,117230,156740
2023-24,-2640,1460,2430,4590,3900,6170,27520,129960,173380
Loading