From 9ff9927a10e56f88b4c0b1098c875d1ba1ff3047 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 01:02:03 +0000 Subject: [PATCH 1/9] net:mana: Create seperate EQs for each vport To prepare for assigning vPorts to dedicated MSIx vectors, removing EQ sharing among the vPorts and create dedicated EQs for each vPort. --- drivers/infiniband/hw/mana/main.c | 14 ++- drivers/infiniband/hw/mana/qp.c | 4 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 111 ++++++++++-------- include/net/mana/mana.h | 8 +- 4 files changed, 83 insertions(+), 54 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index eda9c5b971dee6..86e9ac0127a8bb 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -19,8 +19,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, pd->vport_use_count--; WARN_ON(pd->vport_use_count < 0); - if (!pd->vport_use_count) + if (!pd->vport_use_count) { + mana_destroy_eq(mpc); mana_uncfg_vport(mpc); + } mutex_unlock(&pd->vport_mutex); } @@ -54,15 +56,21 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, return err; } - mutex_unlock(&pd->vport_mutex); pd->tx_shortform_allowed = mpc->tx_shortform_allowed; pd->tx_vp_offset = mpc->tx_vp_offset; + err = mana_create_eq(mpc); + if (err) { + mana_uncfg_vport(mpc); + pd->vport_use_count--; + } + + mutex_unlock(&pd->vport_mutex); ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", mpc->port_handle, pd->pdn, doorbell_id); - return 0; + return err; } int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index c928af58f38bfe..9bdd413b807172 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -189,7 +189,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, cq_spec.gdma_region = cq->queue.gdma_region; cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq = &mpc->ac->eqs[cq->comp_vector]; + eq = &mpc->eqs[cq->comp_vector % mpc->num_queues]; cq_spec.attached_eq = eq->eq->id; ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, @@ -341,7 +341,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; eq_vec = send_cq->comp_vector; - eq = &mpc->ac->eqs[eq_vec]; + eq = &mpc->eqs[eq_vec % mpc->num_queues]; cq_spec.attached_eq = eq->eq->id; err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 2bac6be8f6a09c..019a1c99862ed5 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1239,79 +1239,83 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, } EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA"); -static void mana_destroy_eq(struct mana_context *ac) +void mana_destroy_eq(struct mana_port_context *apc) { + struct mana_context *ac = apc->ac; struct gdma_context *gc = ac->gdma_dev->gdma_context; struct gdma_queue *eq; int i; - if (!ac->eqs) + if (!apc->eqs) return; - debugfs_remove_recursive(ac->mana_eqs_debugfs); - ac->mana_eqs_debugfs = NULL; + debugfs_remove_recursive(apc->mana_eqs_debugfs); + apc->mana_eqs_debugfs = NULL; - for (i = 0; i < gc->max_num_queues; i++) { - eq = ac->eqs[i].eq; + for (i = 0; i < apc->num_queues; i++) { + 
eq = apc->eqs[i].eq; if (!eq) continue; mana_gd_destroy_queue(gc, eq); } - kfree(ac->eqs); - ac->eqs = NULL; + kfree(apc->eqs); + apc->eqs = NULL; } +EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA"); -static void mana_create_eq_debugfs(struct mana_context *ac, int i) +static void mana_create_eq_debugfs(struct mana_port_context *apc, int i) { - struct mana_eq eq = ac->eqs[i]; + struct mana_eq eq = apc->eqs[i]; char eqnum[32]; sprintf(eqnum, "eq%d", i); - eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs); + eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs); debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head); debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail); debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops); } -static int mana_create_eq(struct mana_context *ac) +int mana_create_eq(struct mana_port_context *apc) { - struct gdma_dev *gd = ac->gdma_dev; + struct gdma_dev *gd = apc->ac->gdma_dev; struct gdma_context *gc = gd->gdma_context; struct gdma_queue_spec spec = {}; int err; int i; - ac->eqs = kcalloc(gc->max_num_queues, sizeof(struct mana_eq), - GFP_KERNEL); - if (!ac->eqs) + WARN_ON(apc->eqs); + apc->eqs = kcalloc(apc->num_queues, sizeof(struct mana_eq), + GFP_KERNEL); + if (!apc->eqs) return -ENOMEM; spec.type = GDMA_EQ; spec.monitor_avl_buf = false; spec.queue_size = EQ_SIZE; spec.eq.callback = NULL; - spec.eq.context = ac->eqs; + spec.eq.context = apc->eqs; spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; - ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs); + apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs); - for (i = 0; i < gc->max_num_queues; i++) { + for (i = 0; i < apc->num_queues; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; - err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq); + err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq); if (err) { dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err); goto out; } - mana_create_eq_debugfs(ac, i); + mana_create_eq_debugfs(apc, i); } return 0; out: - mana_destroy_eq(ac); + mana_destroy_eq(apc); return err; } +EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA"); static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq) { @@ -2014,7 +2018,7 @@ static int mana_create_txq(struct mana_port_context *apc, spec.monitor_avl_buf = false; spec.queue_size = cq_size; spec.cq.callback = mana_schedule_napi; - spec.cq.parent_eq = ac->eqs[i].eq; + spec.cq.parent_eq = apc->eqs[i].eq; spec.cq.context = cq; err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); if (err) @@ -2398,13 +2402,12 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx) static int mana_add_rx_queues(struct mana_port_context *apc, struct net_device *ndev) { - struct mana_context *ac = apc->ac; struct mana_rxq *rxq; int err = 0; int i; for (i = 0; i < apc->num_queues; i++) { - rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev); + rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev); if (!rxq) { err = -ENOMEM; netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err); @@ -2423,9 +2426,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc, return err; } -static void mana_destroy_vport(struct mana_port_context *apc) +static void mana_destroy_rxqs(struct mana_port_context *apc) { - struct gdma_dev *gd = apc->ac->gdma_dev; struct mana_rxq *rxq; u32 rxq_idx; @@ -2437,8 +2439,12 @@ static void mana_destroy_vport(struct mana_port_context *apc) mana_destroy_rxq(apc, rxq, 
true); apc->rxqs[rxq_idx] = NULL; } +} + +static void mana_destroy_vport(struct mana_port_context *apc) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; - mana_destroy_txq(apc); mana_uncfg_vport(apc); if (gd->gdma_context->is_pf) @@ -2459,11 +2465,7 @@ static int mana_create_vport(struct mana_port_context *apc, return err; } - err = mana_cfg_vport(apc, gd->pdid, gd->doorbell); - if (err) - return err; - - return mana_create_txq(apc, net); + return mana_cfg_vport(apc, gd->pdid, gd->doorbell); } static int mana_rss_table_alloc(struct mana_port_context *apc) @@ -2655,21 +2657,36 @@ int mana_alloc_queues(struct net_device *ndev) err = mana_create_vport(apc, ndev); if (err) { - netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err); + netdev_err(ndev, "Failed to create vPort %u : %d\n", + apc->port_idx, err); return err; } + err = mana_create_eq(apc); + if (err) { + netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n", + apc->port_idx, err); + goto destroy_vport; + } + + err = mana_create_txq(apc, ndev); + if (err) { + netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n", + apc->port_idx, err); + goto destroy_eq; + } + err = netif_set_real_num_tx_queues(ndev, apc->num_queues); if (err) { netdev_err(ndev, "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n", apc->num_queues, err); - goto destroy_vport; + goto destroy_txq; } err = mana_add_rx_queues(apc, ndev); if (err) - goto destroy_vport; + goto destroy_txq; apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE; @@ -2678,7 +2695,7 @@ int mana_alloc_queues(struct net_device *ndev) netdev_err(ndev, "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n", apc->num_queues, err); - goto destroy_vport; + goto destroy_rxq; } mana_rss_table_init(apc); @@ -2686,19 +2703,25 @@ int mana_alloc_queues(struct net_device *ndev) err = mana_config_rss(apc, TRI_STATE_TRUE, true, true); if (err) { netdev_err(ndev, "Failed to configure RSS table: %d\n", err); - goto destroy_vport; + goto destroy_rxq; } if (gd->gdma_context->is_pf) { err = mana_pf_register_filter(apc); if (err) - goto destroy_vport; + goto destroy_rxq; } mana_chn_setxdp(apc, mana_xdp_get(apc)); return 0; +destroy_rxq: + mana_destroy_rxqs(apc); +destroy_txq: + mana_destroy_txq(apc); +destroy_eq: + mana_destroy_eq(apc); destroy_vport: mana_destroy_vport(apc); return err; @@ -2805,6 +2828,9 @@ static int mana_dealloc_queues(struct net_device *ndev) return err; } + mana_destroy_rxqs(apc); + mana_destroy_txq(apc); + mana_destroy_eq(apc); mana_destroy_vport(apc); return 0; @@ -3019,12 +3045,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming) gd->driver_data = ac; } - err = mana_create_eq(ac); - if (err) { - dev_err(dev, "Failed to create EQs: %d\n", err); - goto out; - } - err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, MANA_MICRO_VERSION, &num_ports); if (err) @@ -3138,7 +3158,6 @@ void mana_remove(struct gdma_dev *gd, bool suspending) free_netdev(ndev); } - mana_destroy_eq(ac); out: mana_gd_deregister_device(gd); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0f78065de8fe42..810d6c48a973c0 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -409,9 +409,6 @@ struct mana_context { u16 num_ports; - struct mana_eq *eqs; - struct dentry *mana_eqs_debugfs; - struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; }; @@ -421,6 +418,9 @@ struct mana_port_context { u8 mac_addr[ETH_ALEN]; + struct mana_eq *eqs; + struct dentry *mana_eqs_debugfs; + enum TRI_STATE 
rss_state; mana_handle_t default_rxobj; @@ -826,6 +826,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, u32 doorbell_pg_id); void mana_uncfg_vport(struct mana_port_context *apc); +int mana_create_eq(struct mana_port_context *apc); +void mana_destroy_eq(struct mana_port_context *apc); struct net_device *mana_get_primary_netdev(struct mana_context *ac, u32 port_index, From 837f86041cb11dd46e3f76c7490fcdfe51001f27 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 02:05:21 +0000 Subject: [PATCH 2/9] net:mana: Query device and decides msi sharing for EQs When querying the device, adjust the max number of queues to allow dedicated MSIx vectors for each vPort. The number of queues per vPort is clamped down to no less than 16. MSIx sharing among vPort is disabled by default and it is turned on only when it's not possible. --- .../net/ethernet/microsoft/mana/gdma_main.c | 53 ++++++++++++++++--- drivers/net/ethernet/microsoft/mana/mana_en.c | 30 ++++++----- include/net/mana/gdma.h | 6 +++ 3 files changed, 69 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index f4b82bccd1727a..3cb4319d19fa5d 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -70,6 +70,8 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_query_max_resources_resp resp = {}; struct gdma_general_req req = {}; + unsigned int max_num_queues; + u16 num_ports; int err; mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES, @@ -115,6 +117,30 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) if (gc->max_num_queues > gc->num_msix_usable - 1) gc->max_num_queues = gc->num_msix_usable - 1; + err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, + MANA_MICRO_VERSION, &num_ports); + if (err) + return err; + + /* + * Adjust gc->max_num_queues returned from the SOC to allow dedicated MSIx + * for each vPort. 
Reduce max_num_queues to no less than 16 if necessary + */ + max_num_queues = (gc->num_msix_usable - 1) / num_ports; + max_num_queues = roundup_pow_of_two(max_num_queues); + if (max_num_queues < 16) + max_num_queues = 16; + + /* + * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for + * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1) + */ + gc->max_num_queues = min(gc->max_num_queues, max_num_queues); + if (gc->max_num_queues * num_ports > gc->num_msix_usable - 1) + gc->msi_sharing = true; + + dev_info(gc->dev, "MSI sharing mode %d max queues %d\n", gc->msi_sharing, gc->max_num_queues); + return 0; } @@ -1639,6 +1665,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev) /* Need 1 interrupt for HWC */ max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1; min_irqs = 2; + gc->msi_sharing = true; } nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX); @@ -1744,20 +1771,30 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; - err = mana_gd_query_max_resources(pdev); + err = mana_gd_detect_devices(pdev); if (err) goto destroy_hwc; - err = mana_gd_setup_remaining_irqs(pdev); - if (err) { - dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); - goto destroy_hwc; - } - - err = mana_gd_detect_devices(pdev); + err = mana_gd_query_max_resources(pdev); if (err) goto destroy_hwc; + if (!gc->msi_sharing) { + gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL); + if (!gc->msi_bitmap) { + err = -ENOMEM; + goto destroy_hwc; + } + // Set bit for HWC + set_bit(0, gc->msi_bitmap); + } else { + err = mana_gd_setup_remaining_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); + goto destroy_hwc; + } + } + dev_dbg(&pdev->dev, "mana gdma setup successful\n"); return 0; diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 019a1c99862ed5..ed55b96de7af91 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -758,10 +758,9 @@ static int mana_init_port_context(struct mana_port_context *apc) return !apc->rxqs ? 
-ENOMEM : 0; } -static int mana_send_request(struct mana_context *ac, void *in_buf, - u32 in_len, void *out_buf, u32 out_len) +static int gdma_mana_send_request(struct gdma_context *gc, void *in_buf, + u32 in_len, void *out_buf, u32 out_len) { - struct gdma_context *gc = ac->gdma_dev->gdma_context; struct gdma_resp_hdr *resp = out_buf; struct gdma_req_hdr *req = in_buf; struct device *dev = gc->dev; @@ -790,6 +789,14 @@ static int mana_send_request(struct mana_context *ac, void *in_buf, return 0; } +static int mana_send_request(struct mana_context *ac, void *in_buf, + u32 in_len, void *out_buf, u32 out_len) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + + return gdma_mana_send_request(gc, in_buf, in_len, out_buf, out_len); +} + static int mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr, const enum mana_command_code expected_code, const u32 min_size) @@ -919,11 +926,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc) err, resp.hdr.status); } -static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, - u32 proto_minor_ver, u32 proto_micro_ver, - u16 *max_num_vports) +int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver, + u32 proto_minor_ver, u32 proto_micro_ver, + u16 *max_num_vports) { - struct gdma_context *gc = ac->gdma_dev->gdma_context; struct mana_query_device_cfg_resp resp = {}; struct mana_query_device_cfg_req req = {}; struct device *dev = gc->dev; @@ -938,7 +944,7 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, req.proto_minor_ver = proto_minor_ver; req.proto_micro_ver = proto_micro_ver; - err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp)); + err = gdma_mana_send_request(gc, &req, sizeof(req), &resp, sizeof(resp)); if (err) { dev_err(dev, "Failed to query config: %d", err); return err; @@ -961,8 +967,6 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, else gc->adapter_mtu = ETH_FRAME_LEN; - debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu); - return 0; } @@ -3045,11 +3049,13 @@ int mana_probe(struct gdma_dev *gd, bool resuming) gd->driver_data = ac; } - err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, - MANA_MICRO_VERSION, &num_ports); + err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, + MANA_MICRO_VERSION, &num_ports); if (err) goto out; + debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu); + if (!resuming) { ac->num_ports = num_ports; } else { diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 918411d4153bce..3aafd1c8b5c2d9 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -409,6 +409,9 @@ struct gdma_context { /* Azure RDMA adapter */ struct gdma_dev mana_ib; + + bool msi_sharing; + unsigned long *msi_bitmap; }; static inline bool mana_gd_is_mana(struct gdma_dev *gd) @@ -902,4 +905,7 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); void mana_register_debugfs(void); void mana_unregister_debugfs(void); +int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver, + u32 proto_minor_ver, u32 proto_micro_ver, + u16 *max_num_vports); #endif /* _GDMA_H */ From 09fcf8ea4906c728ccf4b1819a973ea7ca9b6c16 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 03:35:52 +0000 Subject: [PATCH 3/9] net:mana: Introduce new gic context and refcount for interrupt context To allow Ethernet EQs share or use MSIx and RDMA EQs on the 
same MSIx, introduce gic context and allow driver to create an interrupt context on an assigned/unassigned MSIx. --- .../net/ethernet/microsoft/mana/gdma_main.c | 124 ++++++++++++++++++ include/net/mana/gdma.h | 8 ++ 2 files changed, 132 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 3cb4319d19fa5d..a162736ce2a959 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1384,6 +1384,129 @@ static irqreturn_t mana_gd_intr(int irq, void *arg) return IRQ_HANDLED; } +void gdma_put_gic(struct gdma_context *gc, bool use_bitmap, int msi) +{ + struct pci_dev *dev = to_pci_dev(gc->dev); + struct msi_map irq_map; + struct gdma_irq_context *gic; + int irq; + + mutex_lock(&gc->gic_mutex); + + gic = xa_load(&gc->irq_contexts, msi); + if (WARN_ON(!gic)) { + mutex_unlock(&gc->gic_mutex); + return; + } + + if (!refcount_dec_and_test(&gic->refcount)) + goto clear_bitmap; + + irq = pci_irq_vector(dev, msi); + + irq_update_affinity_hint(irq, NULL); + free_irq(irq, gic); + + irq_map.virq = irq; + irq_map.index = msi; + pci_msix_free_irq(dev, irq_map); + + xa_erase(&gc->irq_contexts, msi); + kfree(gic); + +clear_bitmap: + if (use_bitmap) + clear_bit(msi, gc->msi_bitmap); + + mutex_unlock(&gc->gic_mutex); +} +EXPORT_SYMBOL_NS(gdma_put_gic, "NET_MANA"); + +struct gdma_irq_context *gdma_get_gic(struct gdma_context *gc, bool use_bitmap, + u16 port_index, int queue_index, + int *msi_requested) +{ + struct gdma_irq_context *gic; + struct pci_dev *dev = to_pci_dev(gc->dev); + struct msi_map irq_map; + int irq; + int msi; + int err; + + mutex_lock(&gc->gic_mutex); + + if (use_bitmap) { + msi = find_first_zero_bit(gc->msi_bitmap, gc->num_msix_usable); + *msi_requested = msi; + } else { + msi = *msi_requested; + } + + gic = xa_load(&gc->irq_contexts, msi); + if (gic) { + refcount_inc(&gic->refcount); + if (use_bitmap) + set_bit(msi, gc->msi_bitmap); + goto out; + } + + irq = pci_irq_vector(dev, msi); + if (irq == -EINVAL) { + irq_map = pci_msix_alloc_irq_at(dev, msi, NULL); + if (!irq_map.virq) { + err = irq_map.index; + dev_err(gc->dev, + "Failed to alloc irq_map msi %d err %d\n", + msi, err); + gic = NULL; + goto out; + } + irq = irq_map.virq; + msi = irq_map.index; + } + + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + dev_err(gc->dev, "Failed to allocate gic\n"); + goto out; + } + gic->handler = mana_gd_process_eq_events; + gic->msi = msi; + gic->irq = irq; + INIT_LIST_HEAD(&gic->eq_list); + spin_lock_init(&gic->lock); + + if (!gic->msi) + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s", + pci_name(dev)); + else if (use_bitmap) + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_p%dq%d@pci:%s", + port_index, queue_index, pci_name(dev)); + else + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", + queue_index, pci_name(dev)); + + err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); + if (err) { + dev_err(gc->dev, "Failed to request irq %d %s\n", + irq, gic->name); + kfree(gic); + gic = NULL; + goto out; + } + + refcount_set(&gic->refcount, 1); + xa_store(&gc->irq_contexts, msi, gic, GFP_KERNEL); + + if (use_bitmap) + set_bit(msi, gc->msi_bitmap); + +out: + mutex_unlock(&gc->gic_mutex); + return gic; +} +EXPORT_SYMBOL_NS(gdma_get_gic, "NET_MANA"); + int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r) { r->map = bitmap_zalloc(res_avail, GFP_KERNEL); @@ -1856,6 +1979,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent) goto release_region; mutex_init(&gc->eq_test_event_mutex); + mutex_init(&gc->gic_mutex); pci_set_drvdata(pdev, gc); gc->bar0_pa = pci_resource_start(pdev, 0); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 3aafd1c8b5c2d9..86906badbd6bd1 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -363,6 +363,9 @@ struct gdma_irq_context { spinlock_t lock; struct list_head eq_list; char name[MANA_IRQ_NAME_SZ]; + unsigned int msi; + unsigned int irq; + refcount_t refcount; }; struct gdma_context { @@ -410,6 +413,7 @@ struct gdma_context { /* Azure RDMA adapter */ struct gdma_dev mana_ib; + struct mutex gic_mutex; bool msi_sharing; unsigned long *msi_bitmap; }; @@ -905,6 +909,10 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); void mana_register_debugfs(void); void mana_unregister_debugfs(void); +struct gdma_irq_context *gdma_get_gic(struct gdma_context *gc, bool use_bitmap, + u16 port_index, int queue_index, + int *msi_requested); +void gdma_put_gic(struct gdma_context *gc, bool use_bitmap, int msi); int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver, u32 proto_minor_ver, u32 proto_micro_ver, u16 *max_num_vports); From 45e5260bf1984d3f0ec080b5fb41a2e5fe495a0e Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 03:43:54 +0000 Subject: [PATCH 4/9] net:mana: Use gic functions to allocate glocal EQs Replace the interrupt setup code with gic functions. Those functions keep track of interrupt context usage from vPorts EQs and RDMA EQs. --- .../net/ethernet/microsoft/mana/gdma_main.c | 74 ++----------------- 1 file changed, 8 insertions(+), 66 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index a162736ce2a959..397ab3c92021f9 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1622,17 +1622,11 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) * further used in irq_setup() */ for (i = 1; i <= nvec; i++) { - gic = kzalloc(sizeof(*gic), GFP_KERNEL); + gic = gdma_get_gic(gc, false, 0, i, &i); if (!gic) { err = -ENOMEM; goto free_irq; } - gic->handler = mana_gd_process_eq_events; - INIT_LIST_HEAD(&gic->eq_list); - spin_lock_init(&gic->lock); - - snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", - i - 1, pci_name(pdev)); /* one pci vector is already allocated for HWC */ irqs[i - 1] = pci_irq_vector(pdev, i); @@ -1640,12 +1634,6 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) err = irqs[i - 1]; goto free_current_gic; } - - err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_current_gic; - - xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } /* @@ -1672,14 +1660,8 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) free_irq: for (i -= 1; i > 0; i--) { irq = pci_irq_vector(pdev, i); - gic = xa_load(&gc->irq_contexts, i); - if (WARN_ON(!gic)) - continue; - irq_update_affinity_hint(irq, NULL); - free_irq(irq, gic); - xa_erase(&gc->irq_contexts, i); - kfree(gic); + gdma_put_gic(gc, false, i); } kfree(irqs); return err; @@ -1700,34 +1682,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) start_irqs = irqs; for (i = 0; i < nvec; i++) { - gic = kzalloc(sizeof(*gic), GFP_KERNEL); + gic = gdma_get_gic(gc, false, 0, i, &i); if (!gic) { err = -ENOMEM; goto free_irq; } - - gic->handler = mana_gd_process_eq_events; - INIT_LIST_HEAD(&gic->eq_list); 
- spin_lock_init(&gic->lock); - - if (!i) - snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s", - pci_name(pdev)); - else - snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", - i - 1, pci_name(pdev)); - - irqs[i] = pci_irq_vector(pdev, i); - if (irqs[i] < 0) { - err = irqs[i]; - goto free_current_gic; - } - - err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_current_gic; - - xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } /* If number of IRQ is one extra than number of online CPUs, @@ -1756,19 +1715,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) kfree(start_irqs); return 0; -free_current_gic: - kfree(gic); free_irq: for (i -= 1; i >= 0; i--) { irq = pci_irq_vector(pdev, i); - gic = xa_load(&gc->irq_contexts, i); - if (WARN_ON(!gic)) - continue; - irq_update_affinity_hint(irq, NULL); - free_irq(irq, gic); - xa_erase(&gc->irq_contexts, i); - kfree(gic); + gdma_put_gic(gc, false, i); } kfree(start_irqs); @@ -1843,26 +1794,17 @@ static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev) static void mana_gd_remove_irqs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); - struct gdma_irq_context *gic; int irq, i; if (gc->max_num_msix < 1) return; - for (i = 0; i < gc->max_num_msix; i++) { - irq = pci_irq_vector(pdev, i); - if (irq < 0) - continue; - - gic = xa_load(&gc->irq_contexts, i); - if (WARN_ON(!gic)) - continue; - + for (i = 0; i < (gc->msi_sharing ? gc->max_num_msix : 1); i++) { /* Need to clear the hint before free_irq */ + irq = pci_irq_vector(pdev, i); irq_update_affinity_hint(irq, NULL); - free_irq(irq, gic); - xa_erase(&gc->irq_contexts, i); - kfree(gic); + + gdma_put_gic(gc, !gc->msi_sharing, i); } pci_free_irq_vectors(pdev); From ca930b2781fea0ed20489b29f43a5132fe981b85 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 03:50:48 +0000 Subject: [PATCH 5/9] net:mana: Create or get interrupt for each EQ when creating vPort Use gic functions to create a dedicated interrupt context or acquire a shared interrupt context for this EQ. 
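
For reference, a condensed sketch of the per-EQ acquire/release pairing this
patch establishes. It uses only helpers that exist in this series
(gdma_get_gic(), gdma_put_gic(), mana_gd_create_mana_eq()); the
mana_eq_irq_get_sketch()/mana_eq_irq_put_sketch() names themselves are
illustrative, not functions added by the patch:

/* Dedicated mode (gc->msi_sharing == false): gdma_get_gic() picks a free
 * MSI-X entry from gc->msi_bitmap, so the EQ gets its own vector.
 * Shared mode: the proposed index below is used and the refcounted
 * gdma_irq_context on that vector is reused.
 */
static int mana_eq_irq_get_sketch(struct mana_port_context *apc, int i,
				  struct gdma_queue_spec *spec)
{
	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
	struct gdma_irq_context *gic;
	/* proposed index; only honoured when MSI sharing is on */
	int msi = (i + 1) % gc->num_msix_usable;
	int err;

	gic = gdma_get_gic(gc, !gc->msi_sharing, apc->port_idx, i, &msi);
	if (!gic)
		return -ENOMEM;

	spec->eq.msix_index = msi;
	err = mana_gd_create_mana_eq(apc->ac->gdma_dev, spec, &apc->eqs[i].eq);
	if (err) {
		gdma_put_gic(gc, !gc->msi_sharing, msi);
		return err;
	}

	apc->eqs[i].eq->eq.irq = gic->irq;
	return 0;
}

/* Teardown mirrors the acquire: the last user of the interrupt context
 * frees the IRQ and, in dedicated mode, clears the bit in gc->msi_bitmap.
 */
static void mana_eq_irq_put_sketch(struct mana_port_context *apc, int i)
{
	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;

	gdma_put_gic(gc, !gc->msi_sharing, apc->eqs[i].eq->eq.msix_index);
}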
--- drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 12 +++++++++++- include/net/mana/gdma.h | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 397ab3c92021f9..008753e17d0235 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -632,7 +632,6 @@ static void mana_gd_deregiser_irq(struct gdma_queue *queue) } spin_unlock_irqrestore(&gic->lock, flags); - queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; synchronize_rcu(); } @@ -746,6 +745,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd, out: dev_err(dev, "Failed to create EQ: %d\n", err); mana_gd_destroy_eq(gc, false, queue); + queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; return err; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ed55b96de7af91..a58d61f1a42e26 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1262,6 +1262,7 @@ void mana_destroy_eq(struct mana_port_context *apc) continue; mana_gd_destroy_queue(gc, eq); + gdma_put_gic(gc, !gc->msi_sharing, eq->eq.msix_index); } kfree(apc->eqs); @@ -1278,6 +1279,7 @@ static void mana_create_eq_debugfs(struct mana_port_context *apc, int i) eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs); debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head); debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail); + debugfs_create_u32("irq", 0400, eq.mana_eq_debugfs, &eq.eq->eq.irq); debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops); } @@ -1288,6 +1290,7 @@ int mana_create_eq(struct mana_port_context *apc) struct gdma_queue_spec spec = {}; int err; int i; + struct gdma_irq_context *gic; WARN_ON(apc->eqs); apc->eqs = kcalloc(apc->num_queues, sizeof(struct mana_eq), @@ -1305,12 +1308,19 @@ int mana_create_eq(struct mana_port_context *apc) apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs); for (i = 0; i < apc->num_queues; i++) { - spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + if (gc->msi_sharing) + spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + gic = gdma_get_gic(gc, !gc->msi_sharing, apc->port_idx, i, + &spec.eq.msix_index); + if (!gic) + goto out; + err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq); if (err) { dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err); goto out; } + apc->eqs[i].eq->eq.irq = gic->irq; mana_create_eq_debugfs(apc, i); } diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 86906badbd6bd1..0dcb1d97330110 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -317,6 +317,7 @@ struct gdma_queue { void *context; unsigned int msix_index; + unsigned int irq; u32 log2_throttle_limit; } eq; From 4846d29cc7a88ecf43b9950f14ac6e73f7f3cb55 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 03:56:54 +0000 Subject: [PATCH 6/9] RDMA/mana_ib: Create interrupts on EQs Use the newly introduced function to allocate interrupt context for EQs. Those interrupt contexts may be shared with Ethernet EQs. 
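
Note: the RDMA EQs pass use_bitmap == false, so each gdma_get_gic() call
here only takes a reference on the interrupt context already bound to that
MSI-X index (possibly one serving an Ethernet or HWC EQ), and the destroy
path must call gdma_put_gic() with exactly the same index for the refcounts
to balance. A condensed sketch of the pairing used by this patch (the
*_sketch names are illustrative only):

/* Completion EQ i sits on MSI-X index (i + 1) % num_msix_usable;
 * the fatal-error EQ uses index 0, shared with the HWC vector.
 */
static struct gdma_irq_context *
mana_ib_gic_get_sketch(struct gdma_context *gc, int comp_vector)
{
	int msi = (comp_vector + 1) % gc->num_msix_usable;

	return gdma_get_gic(gc, false, 0, 0, &msi);
}

static void mana_ib_gic_put_sketch(struct gdma_context *gc, int comp_vector)
{
	gdma_put_gic(gc, false, (comp_vector + 1) % gc->num_msix_usable);
}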
--- drivers/infiniband/hw/mana/main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 86e9ac0127a8bb..c454590078025c 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -738,6 +738,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) { struct gdma_context *gc = mdev_to_gc(mdev); struct gdma_queue_spec spec = {}; + struct gdma_irq_context *gic; int err, i; spec.type = GDMA_EQ; @@ -748,6 +749,8 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; spec.eq.msix_index = 0; + gic = gdma_get_gic(gc, false, 0, 0, &spec.eq.msix_index); + err = mana_gd_create_mana_eq(&gc->mana_ib, &spec, &mdev->fatal_err_eq); if (err) return err; @@ -761,6 +764,9 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) spec.eq.callback = NULL; for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + + gic = gdma_get_gic(gc, false, 0, 0, &spec.eq.msix_index); + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]); if (err) goto destroy_eqs; @@ -780,12 +786,16 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) void mana_ib_destroy_eqs(struct mana_ib_dev *mdev) { struct gdma_context *gc = mdev_to_gc(mdev); - int i; + int i, msi; mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + gdma_put_gic(gc, false, 0); - for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { mana_gd_destroy_queue(gc, mdev->eqs[i]); + msi = (i + 1) % gc->num_msix_usable; + gdma_put_gic(gc, false, msi); + } kfree(mdev->eqs); } From ca870d9589fec89990036cc9265909116ec2a0a8 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 4 Sep 2025 18:56:07 -0700 Subject: [PATCH 7/9] net:mana: Indicate the driver now supports dedicated EQs/MSIs for each vPort --- include/net/mana/gdma.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 0dcb1d97330110..0b90238ff077bd 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -572,6 +572,8 @@ enum { #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) /* Driver supports dynamic MSI-X vector allocation */ #define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) +/* Driver supports separate EQ/MSIs for each vPort */ +#define GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT BIT(19) #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ @@ -580,7 +582,8 @@ enum { GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ - GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT) + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT) #define GDMA_DRV_CAP_FLAGS2 0 From 5acd774ea7842b9ffce4c101f75c43b51ec7b74f Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 8 Sep 2025 18:52:14 -0700 Subject: [PATCH 8/9] staged --- .../net/ethernet/microsoft/mana/gdma_main.c | 43 ++++++- drivers/net/ethernet/microsoft/mana/mana_en.c | 119 ++++++++---------- include/net/mana/gdma.h | 9 ++ include/net/mana/mana.h | 10 +- 4 files changed, 108 insertions(+), 73 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 008753e17d0235..19f56a22f4ca80 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ 
b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -523,6 +523,7 @@ static void mana_gd_process_eq_events(void *arg) struct gdma_queue *eq = arg; struct gdma_context *gc; struct gdma_eqe *eqe; + unsigned int arm_bit; u32 head, num_eqe; int i; @@ -562,16 +563,48 @@ static void mana_gd_process_eq_events(void *arg) eq->head++; } + /* Always rearm the EQ for HWC. For MANA, rearm it when NAPI is done. */ + if (mana_gd_is_hwc(eq->gdma_dev)) { + arm_bit = SET_ARM_BIT; + } else if (eq->eq.work_done < eq->eq.budget && + napi_complete_done(&eq->eq.napi, eq->eq.work_done)) { + arm_bit = SET_ARM_BIT; + } else { + arm_bit = 0; + } + head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS); mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq->id, - head, SET_ARM_BIT); + head, arm_bit); +} + +static int mana_poll(struct napi_struct *napi, int budget) +{ + struct gdma_queue *eq = container_of(napi, struct gdma_queue, eq.napi); + + eq->eq.work_done = 0; + eq->eq.budget = budget; + + mana_gd_process_eq_events(eq); + + return min(eq->eq.work_done, budget); +} + +static void mana_gd_schedule_napi(void *arg) +{ + struct gdma_queue *eq = arg; + struct napi_struct *napi; + + napi = &eq->eq.napi; + napi_schedule_irqoff(napi); } static int mana_gd_register_irq(struct gdma_queue *queue, const struct gdma_queue_spec *spec) { struct gdma_dev *gd = queue->gdma_dev; + bool is_mana = mana_gd_is_mana(gd); struct gdma_irq_context *gic; struct gdma_context *gc; unsigned int msi_index; @@ -596,6 +629,14 @@ static int mana_gd_register_irq(struct gdma_queue *queue, if (WARN_ON(!gic)) return -EINVAL; + if (is_mana) { + netif_napi_add(spec->eq.ndev, &queue->eq.napi, mana_poll); +// netif_napi_add(spec->eq.ndev, &queue->eq.napi, mana_poll, +// NAPI_POLL_WEIGHT); + napi_enable(&queue->eq.napi); + gic->handler = mana_gd_schedule_napi; + } + spin_lock_irqsave(&gic->lock, flags); list_add_rcu(&queue->entry, &gic->eq_list); spin_unlock_irqrestore(&gic->lock, flags); diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index a58d61f1a42e26..18945a6f87d147 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1243,6 +1243,14 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, } EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA"); +static void mana_init_cqe_poll_buf(struct gdma_comp *cqe_poll_buf) +{ + int i; + + for (i = 0; i < CQE_POLLING_BUFFER; i++) + memset(&cqe_poll_buf[i], 0, sizeof(struct gdma_comp)); +} + void mana_destroy_eq(struct mana_port_context *apc) { struct mana_context *ac = apc->ac; @@ -1304,12 +1312,16 @@ int mana_create_eq(struct mana_port_context *apc) spec.eq.callback = NULL; spec.eq.context = apc->eqs; spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + spec.eq.ndev = apc->ndev; apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs); for (i = 0; i < apc->num_queues; i++) { + mana_init_cqe_poll_buf(apc->eqs[i].cqe_poll); + if (gc->msi_sharing) spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + gic = gdma_get_gic(gc, !gc->msi_sharing, apc->port_idx, i, &spec.eq.msix_index); if (!gic) @@ -1422,6 +1434,7 @@ static void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc) static void mana_poll_tx_cq(struct mana_cq *cq) { + struct gdma_queue *gdma_eq = cq->gdma_cq->cq.parent; struct gdma_comp *completions = cq->gdma_comp_buf; struct gdma_posted_wqe_info *wqe_info; unsigned int pkt_transmitted = 0; @@ -1443,9 +1456,6 @@ static void 
mana_poll_tx_cq(struct mana_cq *cq) comp_read = mana_gd_poll_cq(cq->gdma_cq, completions, CQE_POLLING_BUFFER); - if (comp_read < 1) - return; - for (i = 0; i < comp_read; i++) { struct mana_tx_comp_oob *cqe_oob; @@ -1501,7 +1511,7 @@ static void mana_poll_tx_cq(struct mana_cq *cq) mana_unmap_skb(skb, apc); - napi_consume_skb(skb, cq->budget); + napi_consume_skb(skb, gdma_eq->eq.budget); pkt_transmitted++; } @@ -1530,8 +1540,6 @@ static void mana_poll_tx_cq(struct mana_cq *cq) if (atomic_sub_return(pkt_transmitted, &txq->pending_sends) < 0) WARN_ON_ONCE(1); - - cq->work_done = pkt_transmitted; } static void mana_post_pkt_rxq(struct mana_rxq *rxq) @@ -1584,15 +1592,19 @@ static void mana_rx_skb(void *buf_va, bool from_pool, struct mana_stats_rx *rx_stats = &rxq->stats; struct net_device *ndev = rxq->ndev; uint pkt_len = cqe->ppi[0].pkt_len; + struct mana_port_context *apc; u16 rxq_idx = rxq->rxq_idx; struct napi_struct *napi; + struct gdma_queue *eq; struct xdp_buff xdp = {}; struct sk_buff *skb; u32 hash_value; u32 act; - rxq->rx_cq.work_done++; - napi = &rxq->rx_cq.napi; + apc = netdev_priv(ndev); + eq = apc->eqs[rxq_idx].eq; + eq->eq.work_done++; + napi = &eq->eq.napi; if (!buf_va) { ++ndev->stats.rx_dropped; @@ -1836,10 +1848,10 @@ static void mana_poll_rx_cq(struct mana_cq *cq) xdp_do_flush(); } -static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) +static void mana_cq_handler(void *context, struct gdma_queue *gdma_queue) { struct mana_cq *cq = context; - int w; +// int w; WARN_ON_ONCE(cq->gdma_cq != gdma_queue); @@ -1848,6 +1860,8 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) else mana_poll_tx_cq(cq); + mana_gd_ring_cq(gdma_queue, SET_ARM_BIT); +#if 0 w = cq->work_done; cq->work_done_since_doorbell += w; @@ -1867,26 +1881,7 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) } return w; -} - -static int mana_poll(struct napi_struct *napi, int budget) -{ - struct mana_cq *cq = container_of(napi, struct mana_cq, napi); - int w; - - cq->work_done = 0; - cq->budget = budget; - - w = mana_cq_handler(cq, cq->gdma_cq); - - return min(w, budget); -} - -static void mana_schedule_napi(void *context, struct gdma_queue *gdma_queue) -{ - struct mana_cq *cq = context; - - napi_schedule_irqoff(&cq->napi); +#endif } static void mana_deinit_cq(struct mana_port_context *apc, struct mana_cq *cq) @@ -1911,7 +1906,6 @@ static void mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq) static void mana_destroy_txq(struct mana_port_context *apc) { - struct napi_struct *napi; int i; if (!apc->tx_qp) @@ -1921,13 +1915,6 @@ static void mana_destroy_txq(struct mana_port_context *apc) debugfs_remove_recursive(apc->tx_qp[i].mana_tx_debugfs); apc->tx_qp[i].mana_tx_debugfs = NULL; - napi = &apc->tx_qp[i].tx_cq.napi; - if (apc->tx_qp[i].txq.napi_initialized) { - napi_synchronize(napi); - napi_disable(napi); - netif_napi_del(napi); - apc->tx_qp[i].txq.napi_initialized = false; - } mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object); mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); @@ -1956,8 +1943,8 @@ static void mana_create_txq_debugfs(struct mana_port_context *apc, int idx) &tx_qp->tx_cq.gdma_cq->head); debugfs_create_u32("cq_tail", 0400, tx_qp->mana_tx_debugfs, &tx_qp->tx_cq.gdma_cq->tail); - debugfs_create_u32("cq_budget", 0400, tx_qp->mana_tx_debugfs, - &tx_qp->tx_cq.budget); +// debugfs_create_u32("cq_budget", 0400, tx_qp->mana_tx_debugfs, +// &tx_qp->tx_cq.budget); debugfs_create_file("txq_dump", 0400, 
tx_qp->mana_tx_debugfs, tx_qp->txq.gdma_sq, &mana_dbg_q_fops); debugfs_create_file("cq_dump", 0400, tx_qp->mana_tx_debugfs, @@ -2023,6 +2010,7 @@ static int mana_create_txq(struct mana_port_context *apc, /* Create SQ's CQ */ cq = &apc->tx_qp[i].tx_cq; + cq->gdma_comp_buf = apc->eqs[i].cqe_poll; cq->type = MANA_CQ_TYPE_TX; cq->txq = txq; @@ -2031,7 +2019,7 @@ static int mana_create_txq(struct mana_port_context *apc, spec.type = GDMA_CQ; spec.monitor_avl_buf = false; spec.queue_size = cq_size; - spec.cq.callback = mana_schedule_napi; + spec.cq.callback = mana_cq_handler; spec.cq.parent_eq = apc->eqs[i].eq; spec.cq.context = cq; err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); @@ -2077,10 +2065,6 @@ static int mana_create_txq(struct mana_port_context *apc, mana_create_txq_debugfs(apc, i); - netif_napi_add_tx(net, &cq->napi, mana_poll); - napi_enable(&cq->napi); - txq->napi_initialized = true; - mana_gd_ring_cq(cq->gdma_cq, SET_ARM_BIT); } @@ -2092,6 +2076,21 @@ static int mana_create_txq(struct mana_port_context *apc, return err; } +static void mana_napi_sync_for_rx(struct mana_rxq *rxq) +{ + struct net_device *ndev = rxq->ndev; + struct mana_port_context *apc; + u16 rxq_idx = rxq->rxq_idx; + struct napi_struct *napi; + struct gdma_queue *eq; + + apc = netdev_priv(ndev); + eq = apc->eqs[rxq_idx].eq; + napi = &eq->eq.napi; + + napi_synchronize(napi); +} + static void mana_destroy_rxq(struct mana_port_context *apc, struct mana_rxq *rxq, bool napi_initialized) @@ -2099,7 +2098,6 @@ static void mana_destroy_rxq(struct mana_port_context *apc, struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; struct mana_recv_buf_oob *rx_oob; struct device *dev = gc->dev; - struct napi_struct *napi; struct page *page; int i; @@ -2109,17 +2107,11 @@ static void mana_destroy_rxq(struct mana_port_context *apc, debugfs_remove_recursive(rxq->mana_rx_debugfs); rxq->mana_rx_debugfs = NULL; - napi = &rxq->rx_cq.napi; - - if (napi_initialized) { - napi_synchronize(napi); - - napi_disable(napi); - - netif_napi_del(napi); - } xdp_rxq_info_unreg(&rxq->xdp_rxq); + if (napi_initialized) + mana_napi_sync_for_rx(rxq); + mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); mana_deinit_cq(apc, &rxq->rx_cq); @@ -2245,11 +2237,13 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc) { struct mana_port_context *mpc = netdev_priv(rxq->ndev); struct page_pool_params pprm = {}; + u16 rxq_idx = rxq->rxq_idx; + struct gdma_queue *eq = mpc->eqs[rxq_idx].eq; int ret; pprm.pool_size = mpc->rx_queue_size; pprm.nid = gc->numa_node; - pprm.napi = &rxq->rx_cq.napi; + pprm.napi = &eq->eq.napi; pprm.netdev = rxq->ndev; pprm.order = get_order(rxq->alloc_size); @@ -2318,6 +2312,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, /* Create RQ's CQ */ cq = &rxq->rx_cq; + cq->gdma_comp_buf = eq->cqe_poll; cq->type = MANA_CQ_TYPE_RX; cq->rxq = rxq; @@ -2325,7 +2320,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, spec.type = GDMA_CQ; spec.monitor_avl_buf = false; spec.queue_size = cq_size; - spec.cq.callback = mana_schedule_napi; + spec.cq.callback = mana_cq_handler; spec.cq.parent_eq = eq->eq; spec.cq.context = cq; err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); @@ -2367,15 +2362,11 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, gc->cq_table[cq->gdma_id] = cq->gdma_cq; - netif_napi_add_weight(ndev, &cq->napi, mana_poll, 1); - - WARN_ON(xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq_idx, - cq->napi.napi_id)); +// 
WARN_ON(xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq_idx, +// cq->napi.napi_id)); WARN_ON(xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL, rxq->page_pool)); - napi_enable(&cq->napi); - mana_gd_ring_cq(cq->gdma_cq, SET_ARM_BIT); out: if (!err) @@ -2407,7 +2398,7 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx) &rxq->rx_cq.gdma_cq->head); debugfs_create_u32("cq_tail", 0400, rxq->mana_rx_debugfs, &rxq->rx_cq.gdma_cq->tail); - debugfs_create_u32("cq_budget", 0400, rxq->mana_rx_debugfs, &rxq->rx_cq.budget); +// debugfs_create_u32("cq_budget", 0400, rxq->mana_rx_debugfs, &rxq->rx_cq.budget); debugfs_create_file("rxq_dump", 0400, rxq->mana_rx_debugfs, rxq->gdma_rq, &mana_dbg_q_fops); debugfs_create_file("cq_dump", 0400, rxq->mana_rx_debugfs, rxq->rx_cq.gdma_cq, &mana_dbg_q_fops); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 0b90238ff077bd..52dcb4a8c7e4fa 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -267,8 +267,10 @@ struct gdma_event { struct gdma_queue; +#define CQE_POLLING_BUFFER 512 struct mana_eq { struct gdma_queue *eq; + struct gdma_comp cqe_poll[CQE_POLLING_BUFFER]; struct dentry *mana_eq_debugfs; }; @@ -320,6 +322,11 @@ struct gdma_queue { unsigned int irq; u32 log2_throttle_limit; + + /* NAPI data */ + struct napi_struct napi; + int work_done; + int budget; } eq; struct { @@ -344,6 +351,8 @@ struct gdma_queue_spec { unsigned long log2_throttle_limit; unsigned int msix_index; + + struct net_device *ndev; } eq; struct { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 810d6c48a973c0..5e8d1f9e606e05 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -283,14 +283,8 @@ struct mana_cq { */ struct mana_txq *txq; - /* Buffer which the CQ handler can copy the CQE's into. */ - struct gdma_comp gdma_comp_buf[CQE_POLLING_BUFFER]; - - /* NAPI data */ - struct napi_struct napi; - int work_done; - int work_done_since_doorbell; - int budget; + /* Pointer to a buffer which the CQ handler can copy the CQE's into. */ + struct gdma_comp *gdma_comp_buf; }; struct mana_recv_buf_oob { From b520bd0e3ed6397d1127ea94df2fa2026c019b4f Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 9 Sep 2025 10:59:36 -0700 Subject: [PATCH 9/9] staged --- drivers/net/ethernet/microsoft/mana/mana_en.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 18945a6f87d147..935ca00b413080 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2362,8 +2362,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, gc->cq_table[cq->gdma_id] = cq->gdma_cq; -// WARN_ON(xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq_idx, -// cq->napi.napi_id)); + WARN_ON(xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq_idx, + eq->eq->eq.napi.napi_id)); WARN_ON(xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL, rxq->page_pool));
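
Note on the staged NAPI rework in the last two patches: the per-CQ NAPI
state (mana_cq.napi, work_done, budget) moves to the EQ
(gdma_queue.eq.napi), mana_cq_handler() re-arms the CQ directly, and
mana_gd_process_eq_events() only re-arms a MANA EQ once
napi_complete_done() has succeeded (HWC EQs are always re-armed). The
resulting interrupt path, restating the mana_poll() added above with
annotations:

/* hard IRQ: mana_gd_intr() -> gic->handler == mana_gd_schedule_napi()
 *           -> napi_schedule_irqoff(&eq->eq.napi)
 * softirq:  mana_poll() drains the EQ within the NAPI budget
 */
static int mana_poll(struct napi_struct *napi, int budget)
{
	struct gdma_queue *eq = container_of(napi, struct gdma_queue, eq.napi);

	eq->eq.work_done = 0;
	eq->eq.budget = budget;

	/* walks the EQEs, dispatches the CQ callbacks (mana_cq_handler())
	 * for completed CQs on this EQ, and rings the EQ doorbell on the
	 * way out, with SET_ARM_BIT only when NAPI work is complete
	 */
	mana_gd_process_eq_events(eq);

	return min(eq->eq.work_done, budget);
}

In the staged state only the RX path (mana_rx_skb()) increments
eq.work_done; the TX completion path no longer contributes to the budget
accounting. The follow-up patch restores the xdp_rxq_info_reg()
registration using the EQ's napi_id.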