From c733f2aee36076d1a0f2e29887b656b1716c98f2 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 19 Apr 2024 12:14:14 +0300 Subject: [PATCH 1/6] telemetry2: Add telemetry2 slot and chunk reservation code TELEMETRY2 is a generic memory structure for passing real-time debug information from the SOF system. It only provides the framework for passing pieces of abstract data from DSP side to host side debugging tools. This commit implements telemetry2 data passing from DSP to host side using debug memory window. The commit defines telemetry2 payload header and telemetry2 chunk header. Adds function to search, or if not found reserve, a chunk of specified size and id from telemetry2 slot. Each chunk is forced to star from 64-byte cache line boundary. A pointer to the chunk is returned. Signed-off-by: Jyri Sarha --- src/debug/telemetry/Kconfig | 20 +++++ src/debug/telemetry/telemetry2_slot.c | 112 ++++++++++++++++++++++++++ src/debug/telemetry/telemetry2_slot.h | 86 ++++++++++++++++++++ 3 files changed, 218 insertions(+) create mode 100644 src/debug/telemetry/telemetry2_slot.c create mode 100644 src/debug/telemetry/telemetry2_slot.h diff --git a/src/debug/telemetry/Kconfig b/src/debug/telemetry/Kconfig index 54f4b66f590d..2a61ee456ff6 100644 --- a/src/debug/telemetry/Kconfig +++ b/src/debug/telemetry/Kconfig @@ -9,3 +9,23 @@ config SOF_TELEMETRY systick_info measurement which measures scheduler task performance and may slightly affect overall performance. +config SOF_TELEMETRY2_SLOT + bool "Enable SOF telemetry2 slot in debug window" + help + TELEMETRY2 is a generic memory structure for passing real-time debug + information from the SOF system. It only provides the framework for + passing pieces of abstract data from DSP side to host side debugging + tools. The option enables TELEMETRY2 debug window slot for transferring + telemetry2 data. + +if SOF_TELEMETRY2_SLOT + +config SOF_TELEMETRY2_SLOT_NUMBER + int "Debug window slot where to put telemetry2 slot" + default 3 + range 0 14 + help + Which debug slot to reserve for telemetry2. Remember to map + the slot with MEMORY_WIN_2_SIZE in soc/xtensa/intel_adsp. + +endif diff --git a/src/debug/telemetry/telemetry2_slot.c b/src/debug/telemetry/telemetry2_slot.c new file mode 100644 index 000000000000..1462e870dd26 --- /dev/null +++ b/src/debug/telemetry/telemetry2_slot.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2024 Intel Corporation. + +#include +#include +#include +#include +#include +#include "telemetry2_slot.h" + +LOG_MODULE_REGISTER(telemetry2_slot); + +static struct k_spinlock lock; + +static +struct telemetry2_chunk_hdr *telemetry2_chunk_search(struct telemetry2_chunk_hdr *chunk, + int id) +{ + uint8_t *p; + + /* Loop through the chunks and look for 'id'. If found leave a + * pointer to it in 'chunk', if not leave 'chunk' pointing at + * the beginning of the free space. + */ + while (chunk->id != id && chunk->size != 0) { + p = (((uint8_t *) chunk) + chunk->size); + + chunk = (struct telemetry2_chunk_hdr *) p; + } + return chunk; +} + +/* + * Search, or if not found reserve, a chunk of specified size and id from + * telemetry2 slot. Each chuck must be aligned with a 64-byte cache line. + * A pointer to the chunk is returned. + */ +void *telemetry2_chunk_get(int id, size_t required_size) +{ + const int slot = CONFIG_SOF_TELEMETRY2_SLOT_NUMBER; + struct telemetry2_payload_hdr *payload = + (struct telemetry2_payload_hdr *)(ADSP_DW->slots[slot]); + size_t hdr_size = ALIGN_UP(sizeof(*payload), CONFIG_DCACHE_LINE_SIZE); + struct telemetry2_chunk_hdr *chunk = (struct telemetry2_chunk_hdr *) + (((uint8_t *) payload) + hdr_size); + size_t size = ALIGN_UP(required_size, CONFIG_DCACHE_LINE_SIZE); + k_spinlock_key_t key; + + if (ADSP_DW->descs[slot].type != ADSP_DW_SLOT_TELEMETRY2) { + if (ADSP_DW->descs[slot].type != 0) + LOG_WRN("Slot %d was not free: %u", slot, ADSP_DW->descs[slot].type); + LOG_INF("Initializing telemetry2 slot 0x%08x", ADSP_DW->descs[slot].type); + ADSP_DW->descs[slot].type = ADSP_DW_SLOT_TELEMETRY2; + payload->hdr_size = hdr_size; + payload->magic = TELEMETRY2_PAYLOAD_MAGIC; + payload->abi = TELEMETRY2_PAYLOAD_V0_0; + } + + LOG_INF("Add id %u size %u (after alignment %u)", id, required_size, size); + chunk = telemetry2_chunk_search(chunk, id); + if (id == chunk->id) + goto match_found; + + /* End of chunk list reached, but specified chunk not found. We + * need to reseeve one, so take the spin lock and check if more + * chunks were added since the previus search. + */ + key = k_spin_lock(&lock); + chunk = telemetry2_chunk_search(chunk, id); + /* Check if some other thread beat us to add the chunk we want */ + if (id == chunk->id) { + k_spin_unlock(&lock, key); + goto match_found; + } + + /* If the chunk was not found, reserve space for it after + * the last reserved chunk. + */ + if (((uint8_t *) chunk) + size >= ADSP_DW->slots[slot + 1]) { + LOG_ERR("No space for chunk %u of size %u in slot %u, offset %u", + id, size, slot, ((uint8_t *) chunk) - ADSP_DW->slots[slot]); + k_spin_unlock(&lock, key); + return NULL; + } + + if (chunk->id != TELEMETRY2_CHUNK_ID_EMPTY) + LOG_WRN("Chunk of size %u has type %u, assuming empty", + chunk->size, chunk->id); + + LOG_INF("Chunk %u reserved", id); + chunk->size = size; + chunk->id = id; + payload->total_size = size + ((uint8_t *) chunk) - ((uint8_t *) payload); + + k_spin_unlock(&lock, key); + + return chunk; + +match_found: + /* If a matching chunk was found check that the size is enough + * for what was requested. If yes return the value, if not + * return NULL. + */ + if (required_size > chunk->size) { + LOG_ERR("Chunk %d size too small %u > %u", + id, required_size, chunk->size); + return NULL; + } + LOG_INF("Chunk %u found", id); + return chunk; +} diff --git a/src/debug/telemetry/telemetry2_slot.h b/src/debug/telemetry/telemetry2_slot.h new file mode 100644 index 000000000000..554e7f966550 --- /dev/null +++ b/src/debug/telemetry/telemetry2_slot.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2024 Intel Corporation. + */ + +#ifndef __SOC_TELEMETRY2_SLOT_H__ +#define __SOC_TELEMETRY2_SLOT_H__ + +/* + * TELEMETRY2 is a generic memory structure for passing real-time debug + * information from the SOF system. It only provides the framework for + * passing pieces of abstract data from DSP side to host side debugging + * tools. + * + * This example describes how TELEMETRY2 data is transferred from DSP + * to host side using debug memory window. + * + * TELEMETRY2 slot is taken from SRAM window divided into several + * chunks with simple header declared below. + * + * "type" describes the contents of the chunk + * "size" describes the size of the chunk in bytes + * + * After each chunk another chunk is expected to follow, until an + * empty chunk header is found. If chunk type is 0 and size 0 it + * indicates an empty header, and that the rest of the slot is unused. + * A new chunk, with the header, can be reserved stating at the end of + * previous chunk (which has been aligned to the end of cache line). + * + * Each chunk is aligned to start at 64-byte cache line boundary. + * + * For example: + * -------------------------------------------------- --- + * | magic = TELEMETRY2_PAYLOAD_MAGIC | | + * | hdr_size = 64 | | + * | total_size = 320 | | + * | abi = TELEMETRY2_PAYLOAD_V0_0 | 64 bytes + * | tstamp = | | + * | | | + * -------------------------------------------------- --- + * | type = ADSP_DW_TELEMETRY2_ID_THREAD_INFO | | + * | size = 256 | 256 bytes + * | chunk data | | + * | ... | | + * -------------------------------------------------- --- + * | type = ADSP_DW_TELEMETRY2_TYPE_EMPTY | + * | size = 0 | + * -------------------------------------------------- + * + * The above depicts a telemetry2 slot with thread analyzer data + * of 256 bytes (including the header) in the first chunk and + * the rest of the slot being free. + */ + +#include + +#define TELEMETRY2_PAYLOAD_MAGIC 0x1ED15EED + +#define TELEMETRY2_PAYLOAD_V0_0 0 + +/* Telemetry2 payload header + * + * The payload header should be written in the beginning of the + * telemetry2 payload, before the chunks. + */ +struct telemetry2_payload_hdr { + uint32_t magic; // used to identify valid data + uint32_t hdr_size; // size of this header only in bytes + uint32_t total_size; // total size of the whole payload in bytes + uint32_t abi; // ABI version, can be used by tool to warn users if too old. + uint64_t tstamp; // aligned with host epoch. +} __packed __aligned(CONFIG_DCACHE_LINE_SIZE); + +/* Defined telemetry2 chunk types */ +#define TELEMETRY2_CHUNK_ID_EMPTY 0 +#define TELEMETRY2_ID_THREAD_INFO 1 + +/* Telemetry2 chunk header */ +struct telemetry2_chunk_hdr { + uint32_t id; // Chunk ID + uint32_t size; // Size of telemetry chunk +} __packed; + +void *telemetry2_chunk_get(int id, size_t size); + +#endif /* __SOC_TELEMETRY2_SLOT_H__ */ From 0acff8674cdc29b90c834be607ac7f9e65a6a94e Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 19 Apr 2024 12:16:38 +0300 Subject: [PATCH 2/6] telemetry2: thread_info: Add telemetry2 thread info module Adds code for collecting thread statistic for cpu and stack usage, and for writing that information to telemetry2 memory chuck. Signed-off-by: Jyri Sarha --- src/debug/telemetry/CMakeLists.txt | 4 + src/debug/telemetry/Kconfig | 23 ++ src/debug/telemetry/telemetry2_thread_info.c | 361 +++++++++++++++++++ 3 files changed, 388 insertions(+) create mode 100644 src/debug/telemetry/telemetry2_thread_info.c diff --git a/src/debug/telemetry/CMakeLists.txt b/src/debug/telemetry/CMakeLists.txt index f2249490db60..318d349778a6 100644 --- a/src/debug/telemetry/CMakeLists.txt +++ b/src/debug/telemetry/CMakeLists.txt @@ -1,3 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause add_local_sources_ifdef(CONFIG_SOF_TELEMETRY sof telemetry.c) + +add_local_sources_ifdef(CONFIG_SOF_TELEMETRY2_SLOT sof telemetry2_slot.c) + +add_local_sources_ifdef(CONFIG_SOF_TELEMETRY2_THREAD_INFO sof telemetry2_thread_info.c) diff --git a/src/debug/telemetry/Kconfig b/src/debug/telemetry/Kconfig index 2a61ee456ff6..653f5cab689c 100644 --- a/src/debug/telemetry/Kconfig +++ b/src/debug/telemetry/Kconfig @@ -28,4 +28,27 @@ config SOF_TELEMETRY2_SLOT_NUMBER Which debug slot to reserve for telemetry2. Remember to map the slot with MEMORY_WIN_2_SIZE in soc/xtensa/intel_adsp. +config SOF_TELEMETRY2_THREAD_INFO + bool "Enable thread info chunk in telemetry2 slot" + select INIT_STACKS + select THREAD_MONITOR + select THREAD_STACK_INFO + select THREAD_RUNTIME_STATS + help + This activates Zephyr thread info in telemetry2 slot. For + the thread names to be any more than hex numbers its suggested + to select THREAD_NAME=y too. + +if SOF_TELEMETRY2_THREAD_INFO + +config SOF_TELEMETRY2_THREAD_INFO_INTERVAL + int "Thread information collection interval in seconds" + default 2 + range 1 10 + help + Decides how often thread info runs and checks execution cycle + statistics and stack usage. + +endif + endif diff --git a/src/debug/telemetry/telemetry2_thread_info.c b/src/debug/telemetry/telemetry2_thread_info.c new file mode 100644 index 000000000000..28b9e7d0b6ff --- /dev/null +++ b/src/debug/telemetry/telemetry2_thread_info.c @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2024 Intel Corporation. + +#include +#include +#include +#include +#include + +#include "telemetry2_slot.h" + +LOG_MODULE_REGISTER(thread_info); + +#define THREAD_INFO_MAX_THREADS 16 + +struct thread_info { + char name[14]; + uint8_t stack_usage; /* Relative stack usage U(0,8) fixed point value */ + uint8_t cpu_usage; /* Relative cpu usage U(0,8) fixed point value */ +} __packed; + +#define THREAD_INFO_VERSION_0_0 0 + +#define THREAD_INFO_STATE_UNINITIALIZED 0 +#define THREAD_INFO_STATE_BEING_UPDATED 1 +#define THREAD_INFO_STATE_UPTODATE 2 + +/* Core specific data, updated each round, including the thread table. */ +struct thread_info_core { + uint8_t state; /* indicates if core data is in consistent state */ + uint8_t counter; /* incremented every round */ + uint8_t load; /* Core's load U(0,8) fixed point value */ + uint8_t thread_count; + struct thread_info thread[THREAD_INFO_MAX_THREADS]; +} __packed __aligned(CONFIG_DCACHE_LINE_SIZE); + +/* Thread info telemetry2 chunk header. Only initialized at first + * thread info thread start. Should be used to find the core specific + * sections where the thread info is when decoding. + */ +struct thread_info_chunk { + struct telemetry2_chunk_hdr hdr; + uint16_t core_count; + uint16_t core_offset[CONFIG_MP_MAX_NUM_CPUS]; + struct thread_info_core __aligned(CONFIG_DCACHE_LINE_SIZE) core[CONFIG_MP_MAX_NUM_CPUS]; +} __packed; + +#ifdef CONFIG_THREAD_RUNTIME_STATS +/* Data structure to store the cycle counter values from the previous + * round. The numbers are used to calculate what the load was on this + * round. + */ +static struct previous_counters { // Cached data from previous round + uint64_t active; // All execution cycles + uint64_t all; // All cycles including idle + struct thread_counters { + void *tid; // thread ID (the thread struct ptr) + uint64_t cycles; // cycle counter value + } threads[THREAD_INFO_MAX_THREADS]; // The max amount of threads we follow +} previous[CONFIG_MP_MAX_NUM_CPUS]; +#endif + +/* Data structure to be passed down to thread_info_cb() by + * k_thread_foreach_current_cpu(). + */ +struct user_data { + struct thread_info_core *core_data; + int thread_count; +#ifdef CONFIG_THREAD_RUNTIME_STATS + bool stats_valid; + uint32_t all_cycles; + void *active_threads[THREAD_INFO_MAX_THREADS]; + struct previous_counters *previous; +#endif +}; + +#ifdef CONFIG_THREAD_RUNTIME_STATS + +/* Finds and/or updates cached cycle counter value for 'tid'-thread + * and calculates used execution cycles since previous round and + * returns it. + */ + +static uint32_t get_cycles(void *tid, k_thread_runtime_stats_t *thread_stats, + struct user_data *ud, const char *name) + +{ + int i; + + ud->active_threads[ud->thread_count] = tid; // Mark the thread as active + // look for cached value from previous round for 'tid'-thread + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + if (ud->previous->threads[i].tid == tid) { + // Calculate number cycles since previous round + uint32_t cycles = (uint32_t) (thread_stats->execution_cycles - + ud->previous->threads[i].cycles); + + LOG_DBG("%p found at %d (%s %llu)", tid, i, + name, thread_stats->execution_cycles); + // updare cached value + ud->previous->threads[i].cycles = thread_stats->execution_cycles; + return cycles; + } + } + + // If no cached value was found, look for an empty slot to store the recent value + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + if (ud->previous->threads[i].tid == NULL) { + ud->previous->threads[i].tid = tid; + ud->previous->threads[i].cycles = thread_stats->execution_cycles; + LOG_DBG("%p placed at %d (%s %llu)", tid, i, + name, ud->previous->threads[i].cycles); + break; + } + } + + // If there is more than THREAD_INFO_MAX_THREADS threads in the system + if (i == ARRAY_SIZE(ud->previous->threads)) + LOG_INF("No place found for %s %p", name, tid); + + // If there was no previous counter value to compare, return 0 cycles. + return 0; +} + +static uint8_t thread_info_cpu_utilization(struct k_thread *thread, + struct user_data *ud, const char *name) +{ + k_thread_runtime_stats_t thread_stats; + uint32_t cycles; + int ret; + + if (!ud->stats_valid) + return 0; + + if (k_thread_runtime_stats_get(thread, &thread_stats) != 0) + return 0; + + cycles = get_cycles(thread, &thread_stats, ud, name); + + LOG_DBG("thread %s %u / %u", name, cycles, ud->all_cycles); + + return (uint8_t) ((255llu * cycles) / ud->all_cycles); +} +#else +static uint8_t thread_info_cpu_utilization(struct k_thread *thread, + struct user_data *ud) +{ + return 0; +} +#endif + +#ifdef CONFIG_THREAD_STACK_INFO +static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name) +{ + size_t stack_size, stack_unused; + int ret; + + stack_size = thread->stack_info.size; + ret = k_thread_stack_space_get(thread, &stack_unused); + if (ret) { + LOG_ERR(" %-20s: unable to get stack space (%d)", + name, ret); + stack_unused = 0; + } + return (UINT8_MAX * (stack_size - stack_unused)) / stack_size; +} +#else +static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name) +{ + return 0; +} +#endif + +static void thread_info_cb(const struct k_thread *cthread, void *user_data) +{ + struct k_thread *thread = (struct k_thread *)cthread; + struct user_data *ud = user_data; + struct thread_info *thread_info; + const char *name; + + if (ud->thread_count > ARRAY_SIZE(ud->core_data->thread)) { + LOG_ERR("Thread count %u exceeds the memory window size\n", + ud->thread_count++); + return; + } + + thread_info = &ud->core_data->thread[ud->thread_count]; + + name = k_thread_name_get((k_tid_t)thread); + if (!name || name[0] == '\0') { + snprintk(thread_info->name, sizeof(thread_info->name), "%p", + (void *)thread); + } else { + strncpy(thread_info->name, name, sizeof(thread_info->name)); + } + + thread_info->stack_usage = thread_info_stack_level(thread, name); + thread_info->cpu_usage = thread_info_cpu_utilization(thread, ud, name); + + LOG_DBG("core %u %s stack %u%% cpu %u%%", ud->core, + thread_info->name, thread_info->stack_usage * 100U / 255, + thread_info->cpu_usage * 100U / 255); + + ud->thread_count++; +} + +#ifdef CONFIG_THREAD_RUNTIME_STATS + +/* Marks cached thread cycle counter entries of removed threads + * free. This also happens to threads pinned to other cpu than the + * primary. In the beginning they are listed on primary cpu, until the + * pinned cpu is started up and the thread is executed for the fist + * time and it moves to the cpu its pinned on. + */ +static void cleanup_old_thread_cycles(struct user_data *ud) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + bool found = false; + + // This entry is already free, continue + if (ud->previous->threads[i].tid == NULL) + continue; + + // Check if the thread is any more active + for (j = 0; j < ud->thread_count; j++) { + if (ud->active_threads[j] == ud->previous->threads[i].tid) { + found = true; + break; + } + } + // If the thead is not any more active, mark the entry free + if (!found) { + ud->previous->threads[i].tid = NULL; + ud->previous->threads[i].cycles = 0; + } + } +} +#else +static void cleanup_old_thread_cycles(struct user_data *ud) { } +#endif + +static void thread_info_get(struct thread_info_core *core_data) +{ + k_thread_runtime_stats_t core_stats; + struct user_data ud = { + .core_data = core_data, + .thread_count = 0, +#ifdef CONFIG_THREAD_RUNTIME_STATS + .previous = &previous[arch_curr_cpu()->id], + .active_threads = { NULL }, +#endif + }; + uint8_t load = 0; +#ifdef CONFIG_THREAD_RUNTIME_STATS + int ret = k_thread_runtime_stats_current_cpu_get(&core_stats); + + if (ret == 0) { + uint32_t active_cycles = (uint32_t) (core_stats.total_cycles - + ud.previous->active); + uint32_t all_cycles = (uint32_t) (core_stats.execution_cycles - + ud.previous->all); + + ud.stats_valid = true; + load = (uint8_t) ((255LLU * active_cycles) / all_cycles); + LOG_DBG("Core %u load %u / %u total %llu / %llu", arch_curr_cpu()->id, + active_cycles, all_cycles, + core_stats.total_cycles, core_stats.execution_cycles); + ud.previous->active = core_stats.total_cycles; + ud.previous->all = core_stats.execution_cycles; + ud.all_cycles = all_cycles; + } +#endif + core_data->state = THREAD_INFO_STATE_BEING_UPDATED; + + core_data->load = load; + k_thread_foreach_current_cpu(thread_info_cb, &ud); + + cleanup_old_thread_cycles(&ud); + + core_data->counter++; + core_data->thread_count = ud.thread_count; + core_data->state = THREAD_INFO_STATE_UPTODATE; +} + +static void thread_info_run(void *data, void *cnum, void *a) +{ + struct thread_info_chunk *chunk = data; + int core = (int) cnum; + struct thread_info_core *core_data = &chunk->core[core]; + + for (;;) { + thread_info_get(core_data); + k_sleep(K_SECONDS(CONFIG_SOF_TELEMETRY2_THREAD_INFO_INTERVAL)); + } +} + +static struct thread_info_chunk *thread_info_init(void) +{ + struct thread_info_chunk *chunk; + int i; + + chunk = telemetry2_chunk_get(TELEMETRY2_ID_THREAD_INFO, + sizeof(*chunk)); + if (!chunk) + return NULL; + + chunk->core_count = CONFIG_MP_MAX_NUM_CPUS; + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) + chunk->core_offset[i] = offsetof(struct thread_info_chunk, core[i]); + + return chunk; +} + +static K_THREAD_STACK_ARRAY_DEFINE(info_thread_stacks, CONFIG_MP_MAX_NUM_CPUS, + 1024); +static struct k_thread info_thread[CONFIG_MP_MAX_NUM_CPUS]; + +static int thread_info_start(void) +{ + struct thread_info_chunk *chunk = thread_info_init(); + uint32_t i; + + if (!chunk) + return 0; + + for (i = 0; i < ARRAY_SIZE(info_thread); i++) { + char name[24]; + k_tid_t tid = NULL; + int ret; + + tid = k_thread_create(&info_thread[i], info_thread_stacks[i], + 1024, + thread_info_run, + chunk, (void *) i, NULL, + K_LOWEST_APPLICATION_THREAD_PRIO, 0, K_FOREVER); + if (!tid) { + LOG_ERR("k_thread_create() failed for core %u", i); + continue; + } + ret = k_thread_cpu_pin(tid, i); + if (ret < 0) { + LOG_ERR("Pinning thread to code core %u", i); + k_thread_abort(tid); + continue; + } + snprintf(name, sizeof(name), "%u thread info", i); + ret = k_thread_name_set(tid, name); + if (ret < 0) + LOG_INF("k_thread_name_set failed: %d for %u", ret, i); + + k_thread_start(tid); + LOG_DBG("Thread %p for core %u started", tid, i); + } + + return 0; +} + +SYS_INIT(thread_info_start, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); From 2f7a71313dbc862f14f41e2d0f66fa8535199405 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Wed, 27 Mar 2024 18:19:09 +0200 Subject: [PATCH 3/6] tools: thread-info: Thread info client for telemetry2 debugfs file Simple code to open telemetry2 debugfs file, check telemetry2 payload header, look for thread info chunk, decode it periodically, and print contents to stdout. Without any arguments this the script opens and polls telemetry2 debugfs file contents. The DSP should be up and running For there to be any data to be decoded. When DSP up, the telemetry2 thread info support is compiled into the FW, and the SOF Linux driver supports telemetry2 slot mapping to debugfs, the output should look like this: core_count: 3 Core 0 load 1% 0x401184a8 cpu 0.0% stack 2.0% 0x401183e0 cpu 0.0% stack 2.0% 1 thread info cpu 0.0% stack 8.2% 0 thread info cpu 0.0% stack 47.1% edf_workq cpu 0.0% stack 10.2% sysworkq cpu 0.0% stack 31.8% logging cpu 0.8% stack 19.6% idle 00 cpu 98.0% stack 33.7% Core 2 load 6% ll_thread2 cpu 0.0% stack 18.8% idle 02 cpu 93.3% stack 27.1% 0x40118570 cpu 0.4% stack 47.1% 2 thread info cpu 0.0% stack 47.1% Signed-off-by: Jyri Sarha --- src/debug/telemetry/telemetry2_thread_info.c | 38 ++-- tools/telemetry2/thread-info.py | 217 +++++++++++++++++++ 2 files changed, 238 insertions(+), 17 deletions(-) create mode 100644 tools/telemetry2/thread-info.py diff --git a/src/debug/telemetry/telemetry2_thread_info.c b/src/debug/telemetry/telemetry2_thread_info.c index 28b9e7d0b6ff..723bd5096a17 100644 --- a/src/debug/telemetry/telemetry2_thread_info.c +++ b/src/debug/telemetry/telemetry2_thread_info.c @@ -65,6 +65,7 @@ static struct previous_counters { // Cached data from previous round * k_thread_foreach_current_cpu(). */ struct user_data { + int core; struct thread_info_core *core_data; int thread_count; #ifdef CONFIG_THREAD_RUNTIME_STATS @@ -88,23 +89,24 @@ static uint32_t get_cycles(void *tid, k_thread_runtime_stats_t *thread_stats, { int i; - ud->active_threads[ud->thread_count] = tid; // Mark the thread as active - // look for cached value from previous round for 'tid'-thread + /* Mark the thread as active */ + ud->active_threads[ud->thread_count] = tid; + /* look for cached value from previous round for 'tid'-thread */ for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { if (ud->previous->threads[i].tid == tid) { - // Calculate number cycles since previous round + /* Calculate number cycles since previous round */ uint32_t cycles = (uint32_t) (thread_stats->execution_cycles - ud->previous->threads[i].cycles); LOG_DBG("%p found at %d (%s %llu)", tid, i, name, thread_stats->execution_cycles); - // updare cached value + /* update cached value */ ud->previous->threads[i].cycles = thread_stats->execution_cycles; return cycles; } } - // If no cached value was found, look for an empty slot to store the recent value + /* If no cached value was found, look for an empty slot to store the recent value */ for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { if (ud->previous->threads[i].tid == NULL) { ud->previous->threads[i].tid = tid; @@ -115,11 +117,11 @@ static uint32_t get_cycles(void *tid, k_thread_runtime_stats_t *thread_stats, } } - // If there is more than THREAD_INFO_MAX_THREADS threads in the system + /* If there is more than THREAD_INFO_MAX_THREADS threads in the system */ if (i == ARRAY_SIZE(ud->previous->threads)) LOG_INF("No place found for %s %p", name, tid); - // If there was no previous counter value to compare, return 0 cycles. + /* If there was no previous counter value to compare, return 0 cycles. */ return 0; } @@ -144,7 +146,7 @@ static uint8_t thread_info_cpu_utilization(struct k_thread *thread, } #else static uint8_t thread_info_cpu_utilization(struct k_thread *thread, - struct user_data *ud) + struct user_data *ud, const char *name) { return 0; } @@ -220,18 +222,18 @@ static void cleanup_old_thread_cycles(struct user_data *ud) for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { bool found = false; - // This entry is already free, continue + /* This entry is already free, continue */ if (ud->previous->threads[i].tid == NULL) continue; - // Check if the thread is any more active + /* Check if the thread was seen on previous round */ for (j = 0; j < ud->thread_count; j++) { if (ud->active_threads[j] == ud->previous->threads[i].tid) { found = true; break; } } - // If the thead is not any more active, mark the entry free + /* If the thead is not any more active, mark the entry free */ if (!found) { ud->previous->threads[i].tid = NULL; ud->previous->threads[i].cycles = 0; @@ -242,20 +244,21 @@ static void cleanup_old_thread_cycles(struct user_data *ud) static void cleanup_old_thread_cycles(struct user_data *ud) { } #endif -static void thread_info_get(struct thread_info_core *core_data) +static void thread_info_get(int core, struct thread_info_core *core_data) { k_thread_runtime_stats_t core_stats; struct user_data ud = { + .core = core, .core_data = core_data, .thread_count = 0, #ifdef CONFIG_THREAD_RUNTIME_STATS - .previous = &previous[arch_curr_cpu()->id], + .previous = &previous[core], .active_threads = { NULL }, #endif }; uint8_t load = 0; #ifdef CONFIG_THREAD_RUNTIME_STATS - int ret = k_thread_runtime_stats_current_cpu_get(&core_stats); + int ret = k_thread_runtime_stats_cpu_get(core, &core_stats); if (ret == 0) { uint32_t active_cycles = (uint32_t) (core_stats.total_cycles - @@ -265,7 +268,7 @@ static void thread_info_get(struct thread_info_core *core_data) ud.stats_valid = true; load = (uint8_t) ((255LLU * active_cycles) / all_cycles); - LOG_DBG("Core %u load %u / %u total %llu / %llu", arch_curr_cpu()->id, + LOG_DBG("Core %u load %u / %u total %llu / %llu", core, active_cycles, all_cycles, core_stats.total_cycles, core_stats.execution_cycles); ud.previous->active = core_stats.total_cycles; @@ -276,7 +279,8 @@ static void thread_info_get(struct thread_info_core *core_data) core_data->state = THREAD_INFO_STATE_BEING_UPDATED; core_data->load = load; - k_thread_foreach_current_cpu(thread_info_cb, &ud); + /* This is best effort debug tool. Unlocked version should be fine. */ + k_thread_foreach_unlocked_filter_by_cpu(core, thread_info_cb, &ud); cleanup_old_thread_cycles(&ud); @@ -292,7 +296,7 @@ static void thread_info_run(void *data, void *cnum, void *a) struct thread_info_core *core_data = &chunk->core[core]; for (;;) { - thread_info_get(core_data); + thread_info_get(core, core_data); k_sleep(K_SECONDS(CONFIG_SOF_TELEMETRY2_THREAD_INFO_INTERVAL)); } } diff --git a/tools/telemetry2/thread-info.py b/tools/telemetry2/thread-info.py new file mode 100644 index 000000000000..c5c5e5783361 --- /dev/null +++ b/tools/telemetry2/thread-info.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-3-Clause +# +# Copyright (c) 2024, Intel Corporation. + +import argparse +import mmap +import ctypes +import time +import sys + +TELEMETRY2_PAYLOAD_MAGIC = 0x1ED15EED + +TELEMETRY2_CHUNK_ID_EMPTY = 0 +TELEMETRY2_ID_THREAD_INFO = 1 + +TELEMETRY2_PAYLOAD_V0_0 = 0 + +#telemetry2 payload +class Telemetry2PayloadHdr(ctypes.Structure): + _fields_ = [ + ("magic", ctypes.c_uint), + ("hdr_size", ctypes.c_uint), + ("total_size", ctypes.c_uint), + ("abi", ctypes.c_uint), + ("tstamp", ctypes.c_ulonglong) + ] + +# telemetry2 header struct +class Telemetry2ChunkHdr(ctypes.Structure): + _fields_ = [("id", ctypes.c_uint), ("size", ctypes.c_uint)] + +def get_telemetry2_payload_ok(slot): + payload_hdr = ctypes.cast(slot, + ctypes.POINTER(Telemetry2PayloadHdr)) + if payload_hdr.contents.magic != TELEMETRY2_PAYLOAD_MAGIC: + print("Telemetry2 payload header bad magic 0x%x\n" % + payload_hdr.contents.magic) + return False + if payload_hdr.contents.abi != TELEMETRY2_PAYLOAD_V0_0: + print("Unknown Telemetry2 abi version %u\n" % + payload_hdr.contents.abi) + return True + +def get_telemetry2_chunk_offset(id, slot): + """ + Generic function to get telmetry2 chunk offset by type + """ + offset = 0 + while True: + struct_ptr = ctypes.cast(slot[offset:], + ctypes.POINTER(Telemetry2ChunkHdr)) + #print("Checking %u id 0x%x size %u\n" % + # (offset, struct_ptr.contents.id, struct_ptr.contents.size)) + if struct_ptr.contents.id == id: + return offset + if struct_ptr.contents.size == 0: + return -1 + offset = offset + struct_ptr.contents.size + +class ThreadInfo(ctypes.Structure): + _pack_ = 1 + _fields_ = [ + ("name", ctypes.c_char * 14), + ("stack_usage", ctypes.c_ubyte), + ("cpu_usage", ctypes.c_ubyte), + ] + +class CPUInfo(ctypes.Structure): + _pack_ = 1 + _fields_ = [ + ("state", ctypes.c_ubyte), + ("counter", ctypes.c_ubyte), + ("load", ctypes.c_ubyte), + ("thread_count", ctypes.c_ubyte), + ("thread", ThreadInfo * 16), + ] + +class ThreadInfoChunk(ctypes.Structure): + _pack_ = 1 + _fields_ = [ + ("hdr", Telemetry2ChunkHdr ), + ("core_count", ctypes.c_ushort), + ("core_offsets", ctypes.c_ushort * 16), + ] + +class CoreData: + """ Class for tracking thread analyzer info for one core """ + counter = 0 + offset = 0 + core = 0 + STATE_UNINITIALIZED = 0 + STATE_BEING_UPDATED = 1 + STATE_UPTODATE = 2 + + def __init__(self, offset, core): + self.counter = 0 + self.offset = offset + self.core = core + + def print(self, slot): + cpu_info = ctypes.cast(slot[self.offset:], + ctypes.POINTER(CPUInfo)) + if self.counter == cpu_info.contents.counter: + return + if cpu_info.contents.state != self.STATE_UPTODATE: + return + self.counter = cpu_info.contents.counter + print("Core %d load %02.1f%%" % + (self.core, cpu_info.contents.load / 2.55)) + for i in range(cpu_info.contents.thread_count): + thread = cpu_info.contents.thread[i] + print("\t%-20s cpu %02.1f%% stack %02.1f%%" % + (thread.name.decode('utf-8'), thread.cpu_usage / 2.55, + thread.stack_usage / 2.55)); + +class Telemetry2ThreadAnalyzerDecoder: + """ + Class for finding thread analyzer chuck and initializing CoreData objects. + """ + file_size = 4096 # ADSP debug slot size + f = None + chunk_offset = -1 + chunk = None + core_data = [] + + def set_file(self, f): + self.f = f + + def decode_chunk(self): + if self.chunk != None: + return + self.f.seek(0) + slot = self.f.read(self.file_size) + if not get_telemetry2_payload_ok(slot): + return + self.chunk_offset = get_telemetry2_chunk_offset(TELEMETRY2_ID_THREAD_INFO, slot) + if self.chunk_offset < 0: + print("Thread info chunk not found") + self.chunk = None + return + self.chunk = ctypes.cast(slot[self.chunk_offset:], + ctypes.POINTER(ThreadInfoChunk)) + hdr = self.chunk.contents.hdr + #print(f"Chunk at {self.chunk_offset} id: {hdr.id} size: {hdr.size}") + print(f"core_count: {self.chunk.contents.core_count}") + + def init_core_data(self): + if self.chunk == None: + return + if len(self.core_data) == self.chunk.contents.core_count: + return + self.core_data = [None] * self.chunk.contents.core_count + for i in range(self.chunk.contents.core_count): + offset = self.chunk_offset + self.chunk.contents.core_offsets[i] + self.core_data[i] = CoreData(offset, i) + #print(f"core_offsets {i}: {self.chunk.contents.core_offsets[i]}") + + def loop_cores(self): + if len(self.core_data) == 0: + return + self.f.seek(0) + slot = self.f.read(self.file_size) + for i in range(len(self.core_data)): + self.core_data[i].print(slot) + + def reset(self): + self.f = None + self.chunk = None + + def print_status(self, tag): + print(f"{tag} f {self.f} offset {self.chunk_offset} chunk {self.chunk} cores {len(self.core_data)}") + +def main_f(args): + """ + Map telemetry2 slot into memorey and find the chunk we are interested in + """ + decoder = Telemetry2ThreadAnalyzerDecoder() + prev_error = None + while True: + try: + with open(args.telemetry2_file, "rb") as f: + decoder.set_file(f) + decoder.decode_chunk() + decoder.init_core_data() + while True: + decoder.loop_cores() + time.sleep(args.update_interval) + except FileNotFoundError: + print(f"File {args.telemetry2_file} not found!") + break + except OSError as e: + if str(e) != prev_error: + print(f"Open {args.telemetry2_file} failed '{e}'") + prev_error = str(e) + decoder.reset() + time.sleep(args.update_interval) + +def parse_params(): + """ Parses parameters + """ + parser = argparse.ArgumentParser(description= + "SOF Telemetry2 thread info client. "+ + "Opens telemetry2 slot debugfs file, looks"+ + " for thread info chunk, and decodes its "+ + "contentes into stdout periodically.") + parser.add_argument('-t', '--update-interval', type=float, + help='Telemetry2 window polling interval in seconds, default 1', + default=1) + parser.add_argument('-f', '--telemetry2-file', + help='File to read the telemetry2 data from, default /sys/kernel/debug/sof/telemetry2', + default="/sys/kernel/debug/sof/telemetry2") + parsed_args = parser.parse_args() + return parsed_args + +args = parse_params() +main_f(args) From aad8ab43b7a504cf88ee939a369f712d3ba395a9 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Tue, 7 May 2024 22:59:10 +0300 Subject: [PATCH 4/6] memory_window: Increase Debug Window size from 12k to 16k Increase Debug Window (Memory window 2) size by one slot (4096 bytes). The extra space is needed for telemetry2 slot. See SOF_TELEMETRY2_SLOT option for details. Signed-off-by: Jyri Sarha --- app/boards/intel_adsp_ace15_mtpm.conf | 2 +- app/boards/intel_adsp_ace20_lnl.conf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/boards/intel_adsp_ace15_mtpm.conf b/app/boards/intel_adsp_ace15_mtpm.conf index 5923272fa986..bc7931633fc4 100644 --- a/app/boards/intel_adsp_ace15_mtpm.conf +++ b/app/boards/intel_adsp_ace15_mtpm.conf @@ -79,7 +79,7 @@ CONFIG_INTEL_ADSP_IPC=y CONFIG_WATCHDOG=y CONFIG_LL_WATCHDOG=y -CONFIG_MEMORY_WIN_2_SIZE=12288 +CONFIG_MEMORY_WIN_2_SIZE=16384 # Temporary disabled options CONFIG_TRACE=n diff --git a/app/boards/intel_adsp_ace20_lnl.conf b/app/boards/intel_adsp_ace20_lnl.conf index 88a8e6522cfe..dff8260f1f33 100644 --- a/app/boards/intel_adsp_ace20_lnl.conf +++ b/app/boards/intel_adsp_ace20_lnl.conf @@ -74,7 +74,7 @@ CONFIG_INTEL_ADSP_IPC=y CONFIG_PROBE=y CONFIG_PROBE_DMA_MAX=2 -CONFIG_MEMORY_WIN_2_SIZE=12288 +CONFIG_MEMORY_WIN_2_SIZE=16384 # Temporary disabled options From ab028a01951f4f49d2a4ad80b62a54609f2f08e7 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 26 Apr 2024 13:33:23 +0300 Subject: [PATCH 5/6] src/debug/telemetry/telemetry2_slot.h: HACK add ADSP_DW_SLOT_TELEMETRY2 --- src/debug/telemetry/telemetry2_slot.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/debug/telemetry/telemetry2_slot.h b/src/debug/telemetry/telemetry2_slot.h index 554e7f966550..49483c938b45 100644 --- a/src/debug/telemetry/telemetry2_slot.h +++ b/src/debug/telemetry/telemetry2_slot.h @@ -54,6 +54,9 @@ #include +// HACK: This should be in zephyr/soc/intel/intel_adsp/common/include/adsp_debug_window.h +#define ADSP_DW_SLOT_TELEMETRY2 0x4c455500 + #define TELEMETRY2_PAYLOAD_MAGIC 0x1ED15EED #define TELEMETRY2_PAYLOAD_V0_0 0 From 7747c684e17769dea0467b59f95146c33a2644b3 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Wed, 10 Apr 2024 20:49:31 +0300 Subject: [PATCH 6/6] app/prj.conf: HACK enable Thread Info --- app/prj.conf | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/app/prj.conf b/app/prj.conf index 5a681ac5b74b..88e4f33e66b4 100644 --- a/app/prj.conf +++ b/app/prj.conf @@ -57,3 +57,10 @@ CONFIG_HEAP_MEM_POOL_SIZE=2048 CONFIG_LLEXT=y CONFIG_LLEXT_STORAGE_WRITABLE=y CONFIG_MODULES=y + +#Thread info + +CONFIG_SOF_TELEMETRY2_SLOT=y +CONFIG_SOF_TELEMETRY2_THREAD_INFO=y +CONFIG_THREAD_NAME=y +CONFIG_MEMORY_WIN_2_SIZE=16384