diff --git a/cmake/findllvm.cmake b/cmake/findllvm.cmake new file mode 100644 index 0000000000..fbf38f0d07 --- /dev/null +++ b/cmake/findllvm.cmake @@ -0,0 +1,37 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + +if (NOT LLPC_LLVM_SRC_PATH) + # Find LLVM source. Allow client driver to override using its own name for overlay builds. + set(DEFAULT_LLPC_LLVM_SRC_PATH ${XGL_LLVM_SRC_PATH}) + if (NOT DEFAULT_LLPC_LLVM_SRC_PATH) + if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/../../../imported/llvm-project/llvm) + set(DEFAULT_LLPC_LLVM_SRC_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../imported/llvm-project/llvm) + elseif(EXISTS ${CMAKE_CURRENT_LIST_DIR}/../../llvm-project/llvm) + set(DEFAULT_LLPC_LLVM_SRC_PATH ${CMAKE_CURRENT_LIST_DIR}/../../llvm-project/llvm) + endif() + endif() + set(LLPC_LLVM_SRC_PATH ${DEFAULT_LLPC_LLVM_SRC_PATH} CACHE PATH "Specify the path to LLVM.") +endif() diff --git a/cmake/llvm.cmake b/cmake/llvm.cmake new file mode 100644 index 0000000000..ee26b47a75 --- /dev/null +++ b/cmake/llvm.cmake @@ -0,0 +1,129 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + +# Build LLVM, using previously set up LLVM_EXTERNAL_PROJECTS and LLVM_EXTERNAL_*_SOURCE_DIR. +# Relies on findllvm.cmake being run first. + +if (NOT LLPC_LLVM_SRC_PATH) + message(FATAL_ERROR "No LLPC_LLVM_SRC_PATH specified") +endif() + +# Set cached options. +set(LLVMRAYTRACING_BUILD_TESTS ${LLPC_BUILD_TESTS}) +set(LLVM_TARGETS_TO_BUILD AMDGPU CACHE STRING "LLVM targets to build") +set(LLVM_BUILD_TESTS OFF CACHE BOOL "LLVM build tests") +set(LLVM_BUILD_TOOLS ${LLPC_BUILD_LLVM_TOOLS} CACHE BOOL "LLVM build tools") +set(LLVM_BUILD_UTILS OFF CACHE BOOL "LLVM build utils") +set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "LLVM include docs") +set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "LLVM include examples") +set(LLVM_INCLUDE_GO_TESTS OFF CACHE BOOL "LLVM include go tests") +set(LLVM_INCLUDE_TESTS ${LLPC_BUILD_TESTS} CACHE BOOL "LLVM include tests") +set(LLVM_INCLUDE_TOOLS ON CACHE BOOL "LLVM include tools") +set(LLVM_INCLUDE_UTILS ON CACHE BOOL "LLVM include utils") +set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "LLVM enable terminfo") +set(LLVM_RAM_PER_TABLEGEN_JOB 4000 CACHE STRING "LLVM RAM per tablegen job") +set(LLVM_RAM_PER_LINK_JOB 5000 CACHE STRING "LLVM RAM per link job") +if(CMAKE_BUILD_TYPE_DEBUG) + # Build optimized version of llvm-tblgen even in debug builds, for faster build times. + set(LLVM_OPTIMIZED_TABLEGEN ON CACHE BOOL "Build optimized llvm-tblgen") +#if _WIN32 + if(LLVM_OPTIMIZED_TABLEGEN AND WIN32 AND (CMAKE_GENERATOR MATCHES "Ninja")) + # LLVM implements the Release build of llvm-tblgen as a cross-compile target, which fails to find + # our DK-based toolchain (created with amd_generate_msvc_toolchain). However, we can inject the toolchain + # argument into LLVM's add_custom_target that sets up this cross-compile build. + # See: llvm-project/llvm/cmake/modules/CrossCompile.cmake + set(CROSS_TOOLCHAIN_FLAGS_NATIVE "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" CACHE STRING + "Toolchain flags for native build" FORCE) + endif() +#endif +endif() + +# This will greatly speed up debug builds because we won't be listing all the symbols with llvm-nm. +set(LLVM_BUILD_LLVM_C_DYLIB OFF CACHE BOOL "LLVM build LLVM-C dylib") + +# Remove /nologo from CMAKE_RC_FLAGS to avoid getting an error from specifying it twice in LLVM. +if (CMAKE_RC_FLAGS) + string(REPLACE "/nologo" "" CMAKE_RC_FLAGS ${CMAKE_RC_FLAGS}) +endif() + +# Build LLVM. +if (NOT LLPC_LLVM_BUILD_PATH) + set(LLPC_LLVM_BUILD_PATH ${PROJECT_BINARY_DIR}/llvm) +endif() +if (ICD_BUILD_LLPC) + add_subdirectory(${LLPC_LLVM_SRC_PATH} ${LLPC_LLVM_BUILD_PATH}) +else() + add_subdirectory(${LLPC_LLVM_SRC_PATH} ${LLPC_LLVM_BUILD_PATH} EXCLUDE_FROM_ALL) +endif() + +# Get LLVMConfig onto cmake path. +list(APPEND CMAKE_MODULE_PATH + "${LLPC_LLVM_BUILD_PATH}/lib/cmake/llvm" + "${LLPC_LLVM_BUILD_PATH}/${CMAKE_CFG_INTDIR}/lib/cmake/llvm" # Workaround for VS generator with older LLVM. +) + +# Export LLVM build path for client driver. +# TODO: Change uses to LLPC_LLVM_BUILD_PATH. +set(XGL_LLVM_BUILD_PATH ${LLPC_LLVM_BUILD_PATH} PARENT_SCOPE) + +# Extract LLVM revision number for code outside the LLPC repository to use. 
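+# A client can then gate version-specific code on the exported value, e.g. +# (hypothetical usage sketch, not part of this change): +#   if (LLVM_MAIN_REVISION LESS 500000) +#     target_compile_definitions(xgl PRIVATE USE_OLD_LLVM_API) # placeholder define +#   endif()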
+file(READ "${LLPC_LLVM_SRC_PATH}/include/llvm/Config/llvm-config.h.cmake" LLVM_CONFIG_HEADER) +string(REGEX MATCH "#define LLVM_MAIN_REVISION ([0-9]+)" _ "${LLVM_CONFIG_HEADER}") +set(LLVM_MAIN_REVISION "${CMAKE_MATCH_1}") +set(LLVM_MAIN_REVISION ${LLVM_MAIN_REVISION} PARENT_SCOPE) + +# Some of the games using old versions of the tcmalloc lib are crashing +# when allocating aligned memory. C++17 enables aligned new by default, +# so we need to disable it to prevent those crashes. +if (ICD_BUILD_LLPC AND NOT WIN32) + llvm_map_components_to_libnames(llvm_libs + AMDGPUAsmParser + AMDGPUCodeGen + AMDGPUDisassembler + AMDGPUInfo + Analysis + BinaryFormat + Core + Coroutines + BitReader + BitWriter + CodeGen + InstCombine + ipo + IRPrinter + IRReader + Linker + LTO + MC + Passes + ScalarOpts + Support + Target + TransformUtils + ) + foreach (lib ${llvm_libs}) + target_compile_options(${lib} PRIVATE "-fno-aligned-new") + endforeach() +endif() diff --git a/cmake/vkgc.cmake b/cmake/vkgc.cmake new file mode 100644 index 0000000000..acde1624ca --- /dev/null +++ b/cmake/vkgc.cmake @@ -0,0 +1,113 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE.
+ # + ####################################################################################################################### + +### Top-level VKGC Interface ### +add_library(vkgc INTERFACE) + +### VKGC header-only library ### +add_library(vkgc_headers INTERFACE) + +target_link_libraries(vkgc_headers INTERFACE llpc_version) + +### Options that affect the headers #################################################################################### +#if LLPC_BUILD_GFX11 +if(LLPC_BUILD_GFX11) + target_compile_definitions(vkgc_headers INTERFACE LLPC_BUILD_GFX11) +endif() +#endif + +#if LLPC_RAY_TRACING +if(LLPC_RAY_TRACING) + if(NOT LLPC_IS_STANDALONE) + target_compile_definitions(vkgc_headers INTERFACE HAVE_GPURT_SHIM) + endif() + + target_compile_definitions(vkgc_headers INTERFACE LLPC_RAY_TRACING) + target_compile_definitions(vkgc_headers INTERFACE GPURT_CLIENT_INTERFACE_MAJOR_VERSION=${GPURT_CLIENT_INTERFACE_MAJOR_VERSION}) +endif() +#endif + +target_link_libraries(vkgc INTERFACE vkgc_headers) + +### Expose header files ################################################################################################ +target_include_directories(vkgc_headers + INTERFACE + ${PROJECT_SOURCE_DIR}/include +) + +### external SPIRV headers ######################################################### +if (NOT SPIRV_HEADERS_PATH) + if(EXISTS ${PROJECT_SOURCE_DIR}/../SPIRV-Headers) + set(SPIRV_HEADERS_PATH ${PROJECT_SOURCE_DIR}/../SPIRV-Headers CACHE PATH "The path of SPIRV headers.") + elseif(EXISTS ${PROJECT_SOURCE_DIR}/../../../../SPIRV-Headers) + set(SPIRV_HEADERS_PATH ${PROJECT_SOURCE_DIR}/../../../../SPIRV-Headers CACHE PATH "The path of SPIRV headers.") + endif() +endif() + +### Interface Target ################################################################################################### +### SPIRV Interface ### +add_library(khronos_spirv_interface INTERFACE) + +if(EXISTS ${SPIRV_HEADERS_PATH}) + target_include_directories(khronos_spirv_interface + INTERFACE + ${SPIRV_HEADERS_PATH}/include + ${PROJECT_SOURCE_DIR}/include/khronos + ) + if (NOT SPIRV_HEADERS_PATH_INTERNAL) + target_compile_definitions(khronos_spirv_interface + INTERFACE + EXTERNAL_SPIRV_HEADERS=1 + ) + endif() +else() + target_include_directories(khronos_spirv_interface + INTERFACE + ${PROJECT_SOURCE_DIR}/include/khronos + ) +endif() + +if(LLPC_BUILD_TOOLS) +# SPVGEN +if(EXISTS ${PROJECT_SOURCE_DIR}/../spvgen) + set(XGL_SPVGEN_PATH ${PROJECT_SOURCE_DIR}/../spvgen CACHE PATH "Specify the path to SPVGEN.") +elseif(EXISTS ${PROJECT_SOURCE_DIR}/../xgl/tools/spvgen) + set(XGL_SPVGEN_PATH ${PROJECT_SOURCE_DIR}/../xgl/tools/spvgen CACHE PATH "Specify the path to SPVGEN.") +else() + set(XGL_SPVGEN_PATH ${PROJECT_SOURCE_DIR}/../../../tools/spvgen CACHE PATH "Specify the path to SPVGEN.") +endif() + +if(EXISTS ${XGL_SPVGEN_PATH}) + set(XGL_SPVGEN_BUILD_PATH ${CMAKE_BINARY_DIR}/spvgen) + add_subdirectory(${XGL_SPVGEN_PATH} ${XGL_SPVGEN_BUILD_PATH} EXCLUDE_FROM_ALL) +endif() + +endif(LLPC_BUILD_TOOLS) + +if(ICD_BUILD_LLPC) + # Generate Strings for LLPC standalone tool and vkgc_gpurtshim + add_subdirectory(util ${PROJECT_BINARY_DIR}/util) + add_subdirectory(gfxruntime ${PROJECT_BINARY_DIR}/gfxruntime) +endif() diff --git a/compilerutils/include/compilerutils/IRSerializationUtils.h b/compilerutils/include/compilerutils/IRSerializationUtils.h new file mode 100644 index 0000000000..b6cda6761c --- /dev/null +++ b/compilerutils/include/compilerutils/IRSerializationUtils.h @@ -0,0 +1,58 @@ +/* + 
*********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +//===- IRSerializationUtils.h - Library for compiler frontends ------------===// +// +// Implements several shared helper functions for dumping IR in various forms +// including to DOT files and LL. +// +//===----------------------------------------------------------------------===// + +#ifndef IRSERIALIZATIONUTILS_H +#define IRSERIALIZATIONUTILS_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" + +namespace irserializationutils { + +// Returns an MD5 hash of the module LL. This is returned as a string so it can +// be used as part of a filename. +std::string getModuleHashStr(const llvm::Module &m); + +// Writes a DOT file with the CFG of the function. The filename is: +// FilenamePrefix.FuncName.Hash.dot where FuncName is determined by demangling +// the DXIL function name, and Hash is given by getModuleHashStr. +// Set cfgOnly = false to include instructions within the BBs. +void writeCFGToDotFile(const llvm::Function &f, llvm::StringRef filenamePrefix = "cfg", bool cfgOnly = true); + +// Writes an LL file with the module. The filename is: +// FilenamePrefix.Hash.ll where Hash is given by getModuleHashStr. +void writeModuleToLLFile(const llvm::Module &m, llvm::StringRef filenamePrefix = "module"); + +} // namespace irserializationutils + +#endif diff --git a/compilerutils/include/compilerutils/MbStandardInstrumentations.h b/compilerutils/include/compilerutils/MbStandardInstrumentations.h new file mode 100644 index 0000000000..d86f52318b --- /dev/null +++ b/compilerutils/include/compilerutils/MbStandardInstrumentations.h @@ -0,0 +1,124 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +// An alternative to LLVM's StandardInstrumentations that (partly) patches +// things up so they work on ModuleBunch passes. +// Most code here is copied from LLVM's StandardInstrumentations.cpp and +// edited. + +#pragma once + +#include "compilerutils/ModuleBunch.h" +#include "llvm/Passes/StandardInstrumentations.h" + +namespace llvm { + +// Copy of PrintIRInstrumentation with edits for ModuleBunch. +/// Instrumentation to print IR before/after passes. +/// +/// Needs state to be able to print module after pass that invalidates IR unit +/// (typically Loop or SCC). +class MbPrintIRInstrumentation { +public: + ~MbPrintIRInstrumentation(); + + void registerCallbacks(PassInstrumentationCallbacks &PIC); + +private: + void printBeforePass(StringRef PassID, Any IR); + void printAfterPass(StringRef PassID, Any IR); + void printAfterPassInvalidated(StringRef PassID); + + bool shouldPrintBeforePass(StringRef PassID); + bool shouldPrintAfterPass(StringRef PassID); + + using PrintModuleDesc = std::tuple<Any, std::string, StringRef>; + + void pushModuleDesc(StringRef PassID, Any IR); + PrintModuleDesc popModuleDesc(StringRef PassID); + + PassInstrumentationCallbacks *PIC; + /// Stack of Module descriptions, enough to print the module after a given + /// pass. + SmallVector<PrintModuleDesc, 2> ModuleDescStack; +}; + +// Debug logging for transformation and analysis passes. +class MbPrintPassInstrumentation { + raw_ostream &print(); + +public: + MbPrintPassInstrumentation(bool Enabled, PrintPassOptions Opts) : Enabled(Enabled), Opts(Opts) {} + void registerCallbacks(PassInstrumentationCallbacks &PIC); + +private: + bool Enabled; + PrintPassOptions Opts; + int Indent = 0; +}; + +// Copy of VerifyInstrumentation with edits for ModuleBunch. +class MbVerifyInstrumentation { + bool DebugLogging; + +public: + MbVerifyInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {} + void registerCallbacks(PassInstrumentationCallbacks &PIC); +}; + +/// This class provides an interface to register all the standard pass +/// instrumentations and manages their state (if any). +/// Instrumentations that have not yet been adapted for use with a ModuleBunch pass manager (those without an Mb prefix) +/// may well be broken.
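+/// +/// Typical wiring (a sketch only; everything except the Mb classes is standard LLVM API): +///   PassInstrumentationCallbacks PIC; +///   MbStandardInstrumentations SI(/*DebugLogging=*/false); +///   SI.registerCallbacks(PIC, &MAM); // MAM: an existing ModuleAnalysisManager +///   // ...then hand PIC to the pass builder / analysis managers as usual.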
+class MbStandardInstrumentations { + MbPrintIRInstrumentation PrintIR; + MbPrintPassInstrumentation PrintPass; + TimePassesHandler TimePasses; + TimeProfilingPassesHandler TimeProfilingPasses; + OptNoneInstrumentation OptNone; + // OptPassGateInstrumentation OptPassGate; // Cannot even attempt to use this as it needs LLVMContext + PreservedCFGCheckerInstrumentation PreservedCFGChecker; + IRChangedPrinter PrintChangedIR; + PseudoProbeVerifier PseudoProbeVerification; + InLineChangePrinter PrintChangedDiff; + DotCfgChangeReporter WebsiteChangeReporter; + PrintCrashIRInstrumentation PrintCrashIR; + IRChangedTester ChangeTester; + MbVerifyInstrumentation Verify; + + bool VerifyEach; + +public: + MbStandardInstrumentations(bool DebugLogging, bool VerifyEach = false, + PrintPassOptions PrintPassOpts = PrintPassOptions()); + + // Register all the standard instrumentation callbacks. If \p MAM is nullptr + // then PreservedCFGChecker is not enabled. + void registerCallbacks(PassInstrumentationCallbacks &PIC, ModuleAnalysisManager *MAM = nullptr); + + TimePassesHandler &getTimePasses() { return TimePasses; } +}; + +} // namespace llvm diff --git a/compilerutils/include/compilerutils/ModuleBunch.h b/compilerutils/include/compilerutils/ModuleBunch.h new file mode 100644 index 0000000000..2bfe8cabcd --- /dev/null +++ b/compilerutils/include/compilerutils/ModuleBunch.h @@ -0,0 +1,289 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +// The ModuleBunch class, representing a bunch of modules, and a pass manager +// and analysis manager for it, allowing you to run passes on it. + +#pragma once + +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" + +namespace llvm { + +class ModuleBunch; + +using ModuleBunchPassManager = PassManager<ModuleBunch>; +using ModuleBunchAnalysisManager = AnalysisManager<ModuleBunch>; +using ModuleAnalysisManagerModuleBunchProxy = InnerAnalysisManagerProxy<ModuleAnalysisManager, ModuleBunch>; +using ModuleBunchAnalysisManagerModuleProxy = OuterAnalysisManagerProxy<ModuleBunchAnalysisManager, Module>; + +/// ModuleBunch is a pseudo-IR construct for a bunch of modules that we want to run passes on.
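+/// +/// Construction sketch (hypothetical std::unique_ptr<Module> values; the bunch takes ownership): +///   ModuleBunch Bunch; +///   Bunch.addModule(std::move(ModA)); +///   Bunch.addModule(std::move(ModB));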
+class ModuleBunch { +public: + // Iterator for accessing the Modules in the ModuleBunch, without being able to free or replace + // any Module. The iterator does a "double dereference" from pointer-to-unique-ptr-to-Module + // down to Module &. + using iterator = llvm::pointee_iterator<SmallVectorImpl<std::unique_ptr<Module>>::const_iterator>; + + // Access the Modules in the ModuleBunch, without erasing/removing/replacing them. + iterator begin() const { + assert(isNormalized()); + return iterator(Modules.begin()); + } + iterator end() const { + assert(isNormalized()); + return iterator(Modules.end()); + } + size_t size() const { return end() - begin(); } + bool empty() const { return size() == 0; } + + // Access the array of Modules in the ModuleBunch, directly accessing the unique_ptrs + // for erasing/removing/replacing them. + // After doing that, call renormalize() to remove any holes. + MutableArrayRef<std::unique_ptr<Module>> getMutableModules() { return Modules; } + + // Add Module to ModuleBunch, taking ownership. Invalidates existing iterators. + void addModule(std::unique_ptr<Module> module); + + // Remove Module from ModuleBunch, returning ownership to the caller. + // Returns empty unique_ptr if Module not found. + std::unique_ptr<Module> removeModule(const Module *moduleToRemove); + + // Renormalize ModuleBunch's array of Modules after manipulation by user. + // Invalidates existing iterators. + void renormalize(); + + // Check that Modules list has been renormalized since caller removed/freed modules. + bool isNormalized() const; + + // Print the ModuleBunch to an output stream. The extra args are passed as-is + // to Module::print for each module. + void print(raw_ostream &OS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder = false, + bool IsForDebug = false) const; + + // Dump the ModuleBunch to stderr (for debugging). + void dump() const; + + // API used by PassManager.h. + void setIsNewDbgInfoFormat(bool UseNewFormat) { + IsNewDbgInfoFormat = UseNewFormat; + assert(isNormalized()); + for (const std::unique_ptr<Module> &Entry : Modules) + Entry->setIsNewDbgInfoFormat(UseNewFormat); + } + + // Public field used by PassManager.h. + bool IsNewDbgInfoFormat = false; + +private: + SmallVector<std::unique_ptr<Module>> Modules; +}; + +/// A raw_ostream inserter for ModuleBunch +inline raw_ostream &operator<<(raw_ostream &O, const ModuleBunch &MB) { + MB.print(O, nullptr); + return O; +} + +#if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 503109 +template <> inline void printIRUnitNameForStackTrace(raw_ostream &OS, const ModuleBunch &IR) { + OS << "Anonymous ModuleBunch"; +} +#endif + +extern template class PassManager<ModuleBunch>; +extern template class AnalysisManager<ModuleBunch>; +extern template class AllAnalysesOn<ModuleBunch>; + +/// Trivial adaptor that maps from a ModuleBunch to its modules. +/// +/// Designed to allow composition of a ModulePass(Manager) and +/// a ModuleBunchPassManager, by running the ModulePass(Manager) over every +/// module in the ModuleBunch. +/// +/// Module passes run within this adaptor can rely on having exclusive access +/// to the module they are run over. They should not read or modify any other +/// modules! Other threads or systems may be manipulating other modules in +/// the ModuleBunch, and so their state should never be relied on. +/// +/// Module passes can also read the ModuleBunch containing the module, but they +/// should not modify that ModuleBunch. +/// For example, a module pass is not permitted to add modules to the +/// ModuleBunch.
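+/// +/// Usage sketch (the pass name is a placeholder, not part of this header): +///   ModuleBunchPassManager MBPM; +///   MBPM.addPass(createModuleBunchToModulePassAdaptor(SomeModulePass())); +///   PreservedAnalyses PA = MBPM.run(Bunch, MBAM);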
+/// +/// Note that although module passes can access ModuleBunch analyses, ModuleBunch +/// analyses are not invalidated while the module passes are running, so they +/// may be stale. Module analyses will not be stale. +class ModuleBunchToModulePassAdaptor : public PassInfoMixin<ModuleBunchToModulePassAdaptor> { +public: + using PassConceptT = detail::PassConcept<Module, ModuleAnalysisManager>; + + /// Construct with a function that returns a pass. The adaptor can then parallelize compilation + /// by calling the function once for each parallel thread. + explicit ModuleBunchToModulePassAdaptor(function_ref<std::unique_ptr<PassConceptT>()> PassMaker, + bool EagerlyInvalidate = false) + : PassMaker(PassMaker), EagerlyInvalidate(EagerlyInvalidate) {} + + /// Construct with a pass. Compilation then cannot be parallelized. + explicit ModuleBunchToModulePassAdaptor(std::unique_ptr<PassConceptT> pass, bool eagerlyInvalidate) + : Pass(std::move(pass)), EagerlyInvalidate(eagerlyInvalidate) {} + + /// Runs the module pass across every module in the ModuleBunch. + PreservedAnalyses run(ModuleBunch &moduleBunch, ModuleBunchAnalysisManager &analysisMgr); + void printPipeline(raw_ostream &os, function_ref<StringRef(StringRef)> mapClassName2PassName); + + static bool isRequired() { return true; } + +private: + std::unique_ptr<PassConceptT> Pass; + function_ref<std::unique_ptr<PassConceptT>()> PassMaker; + bool EagerlyInvalidate; +}; + +// A function to deduce a module pass type and create a unique_ptr of it for returning from the PassMaker +// function. +template <typename ModulePassT> +std::unique_ptr<ModuleBunchToModulePassAdaptor::PassConceptT> +createForModuleBunchToModulePassAdaptor(ModulePassT Pass) { + // Analyses are always preserved. + using PassModelT = detail::PassModel<Module, ModulePassT, ModuleAnalysisManager>; + return std::unique_ptr<ModuleBunchToModulePassAdaptor::PassConceptT>(new PassModelT(std::forward<ModulePassT>(Pass))); +} + +// A function to deduce a module pass type and wrap it in the templated adaptor. +template <typename ModulePassT> +ModuleBunchToModulePassAdaptor createModuleBunchToModulePassAdaptor(ModulePassT Pass, bool EagerlyInvalidate = false) { + return ModuleBunchToModulePassAdaptor(createForModuleBunchToModulePassAdaptor(std::move(Pass)), EagerlyInvalidate); +} + +/// This class provides access to building LLVM's passes. +/// +/// Currently implemented as a subclass of LLVM's PassBuilder. If we merge ModuleBunch +/// into LLVM, then the functionality here would be merged into PassBuilder. +class MbPassBuilder : public PassBuilder { +public: + explicit MbPassBuilder(TargetMachine *TM = nullptr, PipelineTuningOptions PTO = PipelineTuningOptions(), + std::optional<PGOOptions> PGOOpt = std::nullopt, PassInstrumentationCallbacks *PIC = nullptr) + : PassBuilder(TM, PTO, PGOOpt, PIC) {} + + /// Parse a textual pass pipeline description into a \c + /// ModuleBunchPassManager. + /// + /// The format of the textual pass pipeline description looks something like: + /// + /// modulebunch(module(function(instcombine,sroa),dce,cgscc(inliner,function(...)),...)) + /// + /// Pass managers have ()s describing the nesting structure of passes. All passes + /// are comma separated. As a special shortcut, if the very first pass is not + /// a modulebunch pass (as a modulebunch pass manager is), this will automatically form + /// the shortest stack of pass managers that allow inserting that first pass.
/// So, assuming module passes 'mpassN', function passes 'fpassN', CGSCC passes + /// 'cgpassN', and loop passes 'lpassN', all of these are valid: + /// + /// mpass1,mpass2,mpass3 + /// fpass1,fpass2,fpass3 + /// cgpass1,cgpass2,cgpass3 + /// lpass1,lpass2,lpass3 + /// + /// And they are equivalent to the following (resp.): + /// + /// modulebunch(module(mpass1,mpass2,mpass3)) + /// modulebunch(module(function(fpass1,fpass2,fpass3))) + /// modulebunch(module(cgscc(cgpass1,cgpass2,cgpass3))) + /// modulebunch(module(function(loop(lpass1,lpass2,lpass3)))) + /// + /// This shortcut is especially useful for debugging and testing small pass + /// combinations. + /// + /// The passes in the sequence aren't necessarily all the same kind of pass. + /// You can mix different levels implicitly if adaptor passes are defined to + /// make them work. For example, + /// + /// mpass1,fpass1,fpass2,mpass2,lpass1 + /// + /// This pipeline uses only one pass manager: the top-level modulebunch manager. + /// fpass1,fpass2 and lpass1 are added into the top-level modulebunch manager + /// using only adaptor passes. No nested function/loop pass managers are + /// added. The purpose is to allow easy pass testing when the user + /// specifically wants the pass to run under an adaptor directly. This is + /// preferred when a pipeline is largely of one type, but one or just a few + /// passes are of different types (see PassBuilder.cpp for examples). + Error parsePassPipeline(ModuleBunchPassManager &passMgr, StringRef pipelineText); + + /// Register pipeline parsing callbacks with this pass builder instance. + /// Using these callbacks, callers can parse both a single pass name, as well + /// as entire sub-pipelines, and populate the PassManager instance + /// accordingly. + void registerPipelineParsingCallback( + const std::function<bool(StringRef, ModuleBunchPassManager &, ArrayRef<PipelineElement>)> &C) { + ModuleBunchPipelineParsingCallbacks.push_back(C); + } + + // Forward other overloads of registerPipelineParsingCallback to PassBuilder. + void registerPipelineParsingCallback( + const std::function<bool(StringRef, ModulePassManager &, ArrayRef<PipelineElement>)> &C) { + PassBuilder::registerPipelineParsingCallback(C); + } + + void registerPipelineParsingCallback( + const std::function<bool(StringRef, FunctionPassManager &, ArrayRef<PipelineElement>)> &C) { + PassBuilder::registerPipelineParsingCallback(C); + } + + void registerPipelineParsingCallback( + const std::function<bool(StringRef, LoopPassManager &, ArrayRef<PipelineElement>)> &C) { + PassBuilder::registerPipelineParsingCallback(C); + } + +private: + static std::optional<std::vector<PipelineElement>> parsePipelineText(StringRef Text); + + Error parseModuleBunchPassPipeline(ModuleBunchPassManager &MBPM, ArrayRef<PipelineElement> Pipeline); + + Error parseModuleBunchPass(ModuleBunchPassManager &MBPM, const PipelineElement &E); + + SmallVector<std::function<bool(StringRef, ModuleBunchPassManager &, ArrayRef<PipelineElement>)>, 2> + ModuleBunchPipelineParsingCallbacks; +}; + +// Copied from PrintModulePass in IRPrintingPasses.h and edited. +/// ModuleBunch pass to print the IR of the modules.
+class PrintModuleBunchPass : public llvm::PassInfoMixin<PrintModuleBunchPass> { + raw_ostream &OS; + std::string Banner; + bool ShouldPreserveUseListOrder = false; + +public: + PrintModuleBunchPass() : OS(dbgs()) {} + PrintModuleBunchPass(raw_ostream &OS, const std::string &Banner, bool ShouldPreserveUseListOrder) + : OS(OS), Banner(Banner), ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} + + PreservedAnalyses run(ModuleBunch &MB, AnalysisManager<ModuleBunch> &); + static bool isRequired() { return true; } +}; + +} // namespace llvm diff --git a/compilerutils/lib/IRSerializationUtils.cpp b/compilerutils/lib/IRSerializationUtils.cpp new file mode 100644 index 0000000000..625207add1 --- /dev/null +++ b/compilerutils/lib/IRSerializationUtils.cpp @@ -0,0 +1,98 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +#include "compilerutils/IRSerializationUtils.h" +#include "compilerutils/DxilUtils.h" +#include "llvm/Analysis/CFGPrinter.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/MD5.h" +#include <functional> +#include <sstream> + +using namespace llvm; +using namespace irserializationutils; + +// Returns an MD5 hash of the module LL. This is returned as a string so it can +// be used as part of a filename. +std::string irserializationutils::getModuleHashStr(const Module &m) { + std::string moduleStr; + raw_string_ostream os(moduleStr); + os << m; + + MD5 hash; + MD5::MD5Result result; + hash.update(moduleStr); + hash.final(result); + + SmallString<32> resStr; + MD5::stringifyResult(result, resStr); + std::stringstream hexStream; + for (char value : resStr) + hexStream << std::hex << value; + + return hexStream.str(); +} + +static void writeToHashedOutputFile(const Module &m, StringRef filenamePrefix, StringRef filenameExt, + std::function<void(raw_fd_ostream &)> callback) { + // LLVM_DEBUG is not used in this function because the call will already be + // guarded by a DEBUG macro, such as: DEBUG_WITH_TYPE(...); + + auto hash = getModuleHashStr(m); + auto fullName = filenamePrefix + "." + hash + "." + filenameExt; + + // If a file with an identical hash exists then we don't need to write it + // again.
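+  // (The hash covers the entire module text, so any IR change produces a new +  // filename and identical dumps are only ever written once.)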
+ if (sys::fs::exists(fullName)) + return; + + std::error_code ec; + raw_fd_ostream file(fullName.str(), ec, sys::fs::OF_Text); + + if (ec) { + errs() << "Error opening " << fullName << " : " << ec.message() << "\n"; + return; + } + + callback(file); + + dbgs() << "Wrote file '" << fullName << "'\n"; +} + +void irserializationutils::writeCFGToDotFile(const Function &f, StringRef filenamePrefix, bool cfgOnly) { + // LLVM_DEBUG is not used in this function because the call will already be + // guarded by a DEBUG macro, such as: DEBUG_WITH_TYPE(...); + auto funcName = CompilerUtils::dxil::tryDemangleFunctionName(f.getName()); + auto filenamePrefixFuncName = filenamePrefix.str() + "." + funcName.str(); + + writeToHashedOutputFile(*f.getParent(), filenamePrefixFuncName, "dot", [&](raw_fd_ostream &file) { + DOTFuncInfo cfgInfo(&f); + WriteGraph(file, &cfgInfo, cfgOnly); + }); +} + +void irserializationutils::writeModuleToLLFile(const Module &m, StringRef filenamePrefix) { + writeToHashedOutputFile(m, filenamePrefix, "ll", [&](raw_fd_ostream &file) { file << m; }); +} diff --git a/compilerutils/lib/MbStandardInstrumentations.cpp b/compilerutils/lib/MbStandardInstrumentations.cpp new file mode 100644 index 0000000000..6ac50aa3e9 --- /dev/null +++ b/compilerutils/lib/MbStandardInstrumentations.cpp @@ -0,0 +1,472 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +// An alternative to LLVM's StandardInstrumentations that (partly) patches +// things up so they work on ModuleBunch passes. +// Most code here is copied from LLVM's StandardInstrumentations.cpp and +// edited. + +#include "compilerutils/MbStandardInstrumentations.h" +#include "llvm/IR/PrintPasses.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace llvm; + +namespace { + +/// Extract the outermost IR unit out of \p IR unit. May return wrapped nullptr if \p IR does not match +/// certain global filters. Will never return wrapped nullptr if \p Force is true. 
+Any unwrapOuter(Any IR, bool Force = false) { + if (const auto **MB = any_cast<const ModuleBunch *>(&IR)) + return *MB; + if (const auto **M = any_cast<const Module *>(&IR)) + return *M; + + if (const auto **F = any_cast<const Function *>(&IR)) { + if (!Force && !isFunctionInPrintList((*F)->getName())) + return static_cast<const Module *>(nullptr); + + return (*F)->getParent(); + } + + if (const auto **C = any_cast<const LazyCallGraph::SCC *>(&IR)) { + for (const LazyCallGraph::Node &N : **C) { + const Function &F = N.getFunction(); + if (Force || (!F.isDeclaration() && isFunctionInPrintList(F.getName()))) { + return F.getParent(); + } + } + assert(!Force && "Expected a module"); + return static_cast<const Module *>(nullptr); + } + + if (const auto **L = any_cast<const Loop *>(&IR)) { + const Function *F = (*L)->getHeader()->getParent(); + if (!Force && !isFunctionInPrintList(F->getName())) + return static_cast<const Module *>(nullptr); + return F->getParent(); + } + + llvm_unreachable("Unknown IR unit"); +} + +void printIR(raw_ostream &OS, const Function *F) { + if (!isFunctionInPrintList(F->getName())) + return; + OS << *F; +} + +void printIR(raw_ostream &OS, const Module *M) { + if (isFunctionInPrintList("*") || forcePrintModuleIR()) { + M->print(OS, nullptr); + } else { + for (const auto &F : M->functions()) { + printIR(OS, &F); + } + } +} + +void printIR(raw_ostream &OS, const ModuleBunch *MB) { + for (Module &M : *MB) + printIR(OS, &M); +} + +void printIR(raw_ostream &OS, const LazyCallGraph::SCC *C) { + for (const LazyCallGraph::Node &N : *C) { + const Function &F = N.getFunction(); + if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) { + F.print(OS); + } + } +} + +void printIR(raw_ostream &OS, const Loop *L) { + const Function *F = L->getHeader()->getParent(); + if (!isFunctionInPrintList(F->getName())) + return; + printLoop(const_cast<Loop &>(*L), OS); +} + +std::string getIRName(Any IR) { + if (any_cast<const ModuleBunch *>(&IR)) + return "[moduleBunch]"; + + if (any_cast<const Module *>(&IR)) + return "[module]"; + + if (const auto **F = any_cast<const Function *>(&IR)) + return (*F)->getName().str(); + + if (const auto **C = any_cast<const LazyCallGraph::SCC *>(&IR)) + return (*C)->getName(); + + if (const auto **L = any_cast<const Loop *>(&IR)) + return (*L)->getName().str(); + + llvm_unreachable("Unknown wrapped IR type"); +} + +bool moduleContainsFilterPrintFunc(const Module &M) { + return any_of(M.functions(), [](const Function &F) { return isFunctionInPrintList(F.getName()); }) || + isFunctionInPrintList("*"); +} + +bool sccContainsFilterPrintFunc(const LazyCallGraph::SCC &C) { + return any_of(C, [](const LazyCallGraph::Node &N) { return isFunctionInPrintList(N.getName()); }) || + isFunctionInPrintList("*"); +} + +bool shouldPrintIR(Any IR) { + if (const auto **MB = any_cast<const ModuleBunch *>(&IR)) { + bool ShouldPrint = false; + for (Module &M : **MB) + ShouldPrint |= moduleContainsFilterPrintFunc(M); + return ShouldPrint; + } + + if (const auto **M = any_cast<const Module *>(&IR)) + return moduleContainsFilterPrintFunc(**M); + + if (const auto **F = any_cast<const Function *>(&IR)) + return isFunctionInPrintList((*F)->getName()); + + if (const auto **C = any_cast<const LazyCallGraph::SCC *>(&IR)) + return sccContainsFilterPrintFunc(**C); + + if (const auto **L = any_cast<const Loop *>(&IR)) + return isFunctionInPrintList((*L)->getHeader()->getParent()->getName()); + llvm_unreachable("Unknown wrapped IR type"); +} + +/// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into +/// Any and does the actual print job.
+void unwrapAndPrint(raw_ostream &OS, Any IR) { + if (!shouldPrintIR(IR)) + return; + + if (forcePrintModuleIR()) + IR = unwrapOuter(IR); + + if (const auto **MB = any_cast<const ModuleBunch *>(&IR)) { + printIR(OS, *MB); + return; + } + + if (const auto **M = any_cast<const Module *>(&IR)) { + printIR(OS, *M); + return; + } + + if (const auto **F = any_cast<const Function *>(&IR)) { + printIR(OS, *F); + return; + } + + if (const auto **C = any_cast<const LazyCallGraph::SCC *>(&IR)) { + printIR(OS, *C); + return; + } + + if (const auto **L = any_cast<const Loop *>(&IR)) { + printIR(OS, *L); + return; + } + llvm_unreachable("Unknown wrapped IR type"); +} + +// Return true when this is a pass for which changes should be ignored +bool isIgnored(StringRef PassID) { + return isSpecialPass(PassID, {"PassManager", "PassAdaptor", "AnalysisManagerProxy", "DevirtSCCRepeatedPass", + "ModuleInlinerWrapperPass"}); +} + +} // anonymous namespace + +MbPrintIRInstrumentation::~MbPrintIRInstrumentation() { + assert(ModuleDescStack.empty() && "ModuleDescStack is not empty at exit"); +} + +void MbPrintIRInstrumentation::pushModuleDesc(StringRef PassID, Any IR) { + ModuleDescStack.emplace_back(unwrapOuter(IR), getIRName(IR), PassID); +} + +MbPrintIRInstrumentation::PrintModuleDesc MbPrintIRInstrumentation::popModuleDesc(StringRef PassID) { + assert(!ModuleDescStack.empty() && "empty ModuleDescStack"); + PrintModuleDesc ModuleDesc = ModuleDescStack.pop_back_val(); + assert(std::get<2>(ModuleDesc) == PassID && "malformed ModuleDescStack"); + return ModuleDesc; +} + +void MbPrintIRInstrumentation::printBeforePass(StringRef PassID, Any IR) { + if (isIgnored(PassID)) + return; + + // Saving Module for AfterPassInvalidated operations. + // Note: here we rely on a fact that we do not change modules while + // traversing the pipeline, so the latest captured module is good + // for all print operations that have not happened yet. + if (shouldPrintAfterPass(PassID)) + pushModuleDesc(PassID, IR); + + if (!shouldPrintBeforePass(PassID)) + return; + + if (!shouldPrintIR(IR)) + return; + + dbgs() << "*** IR Dump Before " << PassID << " on " << getIRName(IR) << " ***\n"; + unwrapAndPrint(dbgs(), IR); +} + +void MbPrintIRInstrumentation::printAfterPass(StringRef PassID, Any IR) { + if (isIgnored(PassID)) + return; + + if (!shouldPrintAfterPass(PassID)) + return; + + Any OuterIR; + std::string IRName; + StringRef StoredPassID; + std::tie(OuterIR, IRName, StoredPassID) = popModuleDesc(PassID); + assert(StoredPassID == PassID && "mismatched PassID"); + + if (!shouldPrintIR(IR)) + return; + + dbgs() << "*** IR Dump After " << PassID << " on " << IRName << " ***\n"; + unwrapAndPrint(dbgs(), IR); +} + +void MbPrintIRInstrumentation::printAfterPassInvalidated(StringRef PassID) { + StringRef PassName = PIC->getPassNameForClassName(PassID); + if (!shouldPrintAfterPass(PassName)) + return; + + if (isIgnored(PassID)) + return; + + Any OuterIR; + std::string IRName; + StringRef StoredPassID; + std::tie(OuterIR, IRName, StoredPassID) = popModuleDesc(PassID); + assert(StoredPassID == PassID && "mismatched PassID"); + // Additional filtering (e.g. -filter-print-func) can lead to module + // printing being skipped.
+  if (const auto **M = any_cast<const Module *>(&OuterIR)) +    if (!*M) +      return; + + SmallString<20> Banner = formatv("*** IR Dump After {0} on {1} (invalidated) ***", PassID, IRName); + dbgs() << Banner << "\n"; + unwrapAndPrint(dbgs(), OuterIR); +} + +bool MbPrintIRInstrumentation::shouldPrintBeforePass(StringRef PassID) { + if (shouldPrintBeforeAll()) + return true; + + StringRef PassName = PIC->getPassNameForClassName(PassID); + return is_contained(printBeforePasses(), PassName); +} + +bool MbPrintIRInstrumentation::shouldPrintAfterPass(StringRef PassID) { + if (shouldPrintAfterAll()) + return true; + + StringRef PassName = PIC->getPassNameForClassName(PassID); + return is_contained(printAfterPasses(), PassName); +} + +void MbPrintIRInstrumentation::registerCallbacks(PassInstrumentationCallbacks &PIC) { + this->PIC = &PIC; + + // BeforePass callback is not just for printing, it also saves a Module + // for later use in AfterPassInvalidated. + if (shouldPrintBeforeSomePass() || shouldPrintAfterSomePass()) + PIC.registerBeforeNonSkippedPassCallback([this](StringRef P, Any IR) { this->printBeforePass(P, IR); }); + + if (shouldPrintAfterSomePass()) { + PIC.registerAfterPassCallback( + [this](StringRef P, Any IR, const PreservedAnalyses &) { this->printAfterPass(P, IR); }); + PIC.registerAfterPassInvalidatedCallback( + [this](StringRef P, const PreservedAnalyses &) { this->printAfterPassInvalidated(P); }); + } +} + +void MbVerifyInstrumentation::registerCallbacks(PassInstrumentationCallbacks &PIC) { + PIC.registerAfterPassCallback([this](StringRef P, Any IR, const PreservedAnalyses &PassPA) { + if (isIgnored(P) || P == "VerifierPass") + return; + const Function **FPtr = any_cast<const Function *>(&IR); + const Function *F = FPtr ? *FPtr : nullptr; + if (!F) { + if (const auto **L = any_cast<const Loop *>(&IR)) + F = (*L)->getHeader()->getParent(); + } + + if (F) { + if (DebugLogging) + dbgs() << "Verifying function " << F->getName() << "\n"; + + if (verifyFunction(*F, &errs())) + report_fatal_error("Broken function found, compilation aborted!"); + } else if (const ModuleBunch **MB = any_cast<const ModuleBunch *>(&IR)) { + for (Module &M : **MB) { + if (DebugLogging) + dbgs() << "Verifying module " << M.getName() << "\n"; + + if (verifyModule(M, &errs())) + report_fatal_error("Broken module found, compilation aborted!"); + } + } else { + const Module **MPtr = any_cast<const Module *>(&IR); + const Module *M = MPtr ?
*MPtr : nullptr; + if (!M) { + if (const auto **C = any_cast<const LazyCallGraph::SCC *>(&IR)) + M = (*C)->begin()->getFunction().getParent(); + } + + if (M) { + if (DebugLogging) + dbgs() << "Verifying module " << M->getName() << "\n"; + + if (verifyModule(*M, &errs())) + report_fatal_error("Broken module found, compilation aborted!"); + } + } + }); +} + +raw_ostream &MbPrintPassInstrumentation::print() { + if (Opts.Indent) { + assert(Indent >= 0); + dbgs().indent(Indent); + } + return dbgs(); +} + +void MbPrintPassInstrumentation::registerCallbacks(PassInstrumentationCallbacks &PIC) { + if (!Enabled) + return; + + std::vector<StringRef> SpecialPasses; + if (!Opts.Verbose) { + SpecialPasses.emplace_back("PassManager"); + SpecialPasses.emplace_back("PassAdaptor"); + } + + PIC.registerBeforeSkippedPassCallback([this, SpecialPasses](StringRef PassID, Any IR) { + assert(!isSpecialPass(PassID, SpecialPasses) && "Unexpectedly skipping special pass"); + + print() << "Skipping pass: " << PassID << " on " << getIRName(IR) << "\n"; + }); + PIC.registerBeforeNonSkippedPassCallback([this, SpecialPasses](StringRef PassID, Any IR) { + if (isSpecialPass(PassID, SpecialPasses)) + return; + + auto &OS = print(); + OS << "Running pass: " << PassID << " on " << getIRName(IR); + if (const auto **F = any_cast<const Function *>(&IR)) { + unsigned Count = (*F)->getInstructionCount(); + OS << " (" << Count << " instruction"; + if (Count != 1) + OS << 's'; + OS << ')'; + } else if (const auto **C = any_cast<const LazyCallGraph::SCC *>(&IR)) { + int Count = (*C)->size(); + OS << " (" << Count << " node"; + if (Count != 1) + OS << 's'; + OS << ')'; + } + OS << "\n"; + Indent += 2; + }); + PIC.registerAfterPassCallback([this, SpecialPasses](StringRef PassID, Any IR, const PreservedAnalyses &) { + if (isSpecialPass(PassID, SpecialPasses)) + return; + + Indent -= 2; + }); + PIC.registerAfterPassInvalidatedCallback([this, SpecialPasses](StringRef PassID, const PreservedAnalyses &) { + if (isSpecialPass(PassID, SpecialPasses)) + return; + + Indent -= 2; + }); + + if (!Opts.SkipAnalyses) { + PIC.registerBeforeAnalysisCallback([this](StringRef PassID, Any IR) { + print() << "Running analysis: " << PassID << " on " << getIRName(IR) << "\n"; + Indent += 2; + }); + PIC.registerAfterAnalysisCallback([this](StringRef PassID, Any IR) { Indent -= 2; }); + PIC.registerAnalysisInvalidatedCallback([this](StringRef PassID, Any IR) { + print() << "Invalidating analysis: " << PassID << " on " << getIRName(IR) << "\n"; + }); + PIC.registerAnalysesClearedCallback( + [this](StringRef IRName) { print() << "Clearing all analysis results for: " << IRName << "\n"; }); + } +} + +MbStandardInstrumentations::MbStandardInstrumentations(bool DebugLogging, bool VerifyEach, + PrintPassOptions PrintPassOpts) + : PrintPass(DebugLogging, PrintPassOpts), OptNone(DebugLogging), + PrintChangedIR(PrintChanged == ChangePrinter::Verbose), + PrintChangedDiff(PrintChanged == ChangePrinter::DiffVerbose || PrintChanged == ChangePrinter::ColourDiffVerbose, + PrintChanged == ChangePrinter::ColourDiffVerbose || + PrintChanged == ChangePrinter::ColourDiffQuiet), + WebsiteChangeReporter(PrintChanged == ChangePrinter::DotCfgVerbose), Verify(DebugLogging), + VerifyEach(VerifyEach) { +} + +// Copied from LLVM's StandardInstrumentations.cpp and edited.
+void MbStandardInstrumentations::registerCallbacks(PassInstrumentationCallbacks &PIC, ModuleAnalysisManager *MAM) { + PrintIR.registerCallbacks(PIC); + PrintPass.registerCallbacks(PIC); + TimePasses.registerCallbacks(PIC); + OptNone.registerCallbacks(PIC); + // OptPassGate.registerCallbacks(PIC); + if (MAM) + PreservedCFGChecker.registerCallbacks(PIC, *MAM); + PrintChangedIR.registerCallbacks(PIC); + PseudoProbeVerification.registerCallbacks(PIC); + if (VerifyEach) + Verify.registerCallbacks(PIC); + PrintChangedDiff.registerCallbacks(PIC); + WebsiteChangeReporter.registerCallbacks(PIC); + + ChangeTester.registerCallbacks(PIC); + + PrintCrashIR.registerCallbacks(PIC); + // TimeProfiling records the pass running time cost. + // Its 'BeforePassCallback' can be appended at the tail of all the + // BeforeCallbacks by calling `registerCallbacks` at the end. + // Its 'AfterPassCallback' is put at the front of all the + // AfterCallbacks by its `registerCallbacks`. This is necessary + // to ensure that other callbacks are not included in the timings. + TimeProfilingPasses.registerCallbacks(PIC); +} diff --git a/compilerutils/lib/ModuleBunch.cpp b/compilerutils/lib/ModuleBunch.cpp new file mode 100644 index 0000000000..a0ed6638f3 --- /dev/null +++ b/compilerutils/lib/ModuleBunch.cpp @@ -0,0 +1,410 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +// The ModuleBunch class, representing a bunch of modules, and a pass manager +// and analysis manager for it, allowing you to run passes on it. + +#include "compilerutils/ModuleBunch.h" +#include "llvm/IR/PassManagerImpl.h" +#include "llvm/IR/PrintPasses.h" +#include "llvm/Support/FormatVariadic.h" +#include <algorithm> + +namespace llvm { + +template class PassManager<ModuleBunch>; +template class AnalysisManager<ModuleBunch>; +template class AllAnalysesOn<ModuleBunch>; + +} // namespace llvm + +using namespace llvm; + +// Add Module to ModuleBunch, taking ownership. +void ModuleBunch::addModule(std::unique_ptr<Module> module) { + Modules.push_back(std::move(module)); +} + +// Remove Module from ModuleBunch, returning ownership to the caller. +// Returns empty unique_ptr if Module not found.
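+// Note: removal leaves a null hole in the internal list; call renormalize() +// before iterating the bunch again (isNormalized() checks there are no holes).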
+std::unique_ptr<Module> ModuleBunch::removeModule(const Module *moduleToRemove) { + for (std::unique_ptr<Module> &module : Modules) { + if (module && &*module == moduleToRemove) + return std::move(module); + } + return nullptr; +} + +// Renormalize ModuleBunch's array of Modules after manipulation by user. +// Invalidates existing iterators. +void ModuleBunch::renormalize() { + // Remove holes from where caller freed/released modules. + Modules.erase(std::remove(Modules.begin(), Modules.end(), nullptr), Modules.end()); +} + +// Check that Modules list has been renormalized since caller removed/freed modules. +// Checks that there are no holes. +bool ModuleBunch::isNormalized() const { + for (const std::unique_ptr<Module> &entry : Modules) { + if (!entry) + return false; + } + return true; +} + +/// Print the ModuleBunch to an output stream. The extra args are passed as-is +/// to Module::print for each module. +void ModuleBunch::print(raw_ostream &OS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, + bool IsForDebug) const { + for (const std::unique_ptr<Module> &M : Modules) { + if (!M) + OS << "<null module>\n"; + else + M->print(OS, AAW, ShouldPreserveUseListOrder, IsForDebug); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// Dump ModuleBunch to dbgs(). +LLVM_DUMP_METHOD +void ModuleBunch::dump() const { + print(dbgs(), nullptr, false, /*IsForDebug=*/true); +} +#endif + +// Copied from IRPrintingPasses.cpp and edited. +PreservedAnalyses PrintModuleBunchPass::run(ModuleBunch &MB, ModuleBunchAnalysisManager &AM) { + if (llvm::isFunctionInPrintList("*")) { + if (!Banner.empty()) + OS << Banner << "\n"; + MB.print(OS, nullptr, ShouldPreserveUseListOrder); + } else { + bool BannerPrinted = false; + for (const Module &M : MB) { + for (const auto &F : M.functions()) { + if (llvm::isFunctionInPrintList(F.getName())) { + if (!BannerPrinted && !Banner.empty()) { + OS << Banner << "\n"; + BannerPrinted = true; + } + F.print(OS); + } + } + } + } + + return PreservedAnalyses::all(); +} + +// Copied from FunctionAnalysisManagerModuleProxy in llvm/lib/IR/PassManager.cpp and edited. +template <> +bool ModuleAnalysisManagerModuleBunchProxy::Result::invalidate(ModuleBunch &Bunch, const PreservedAnalyses &PA, + ModuleBunchAnalysisManager::Invalidator &Inv) { + // If literally everything is preserved, we're done. + if (PA.areAllPreserved()) + return false; // This is still a valid proxy. + + // If this proxy isn't marked as preserved, then even if the result remains + // valid, the key itself may no longer be valid, so we clear everything. + // + // Note that in order to preserve this proxy, a ModuleBunch pass must ensure that + // the MAM has been completely updated to handle the deletion of modules. + // Specifically, any MAM-cached results for those modules need to have been + // forcibly cleared. When preserved, this proxy will only invalidate results + // cached on modules *still in the ModuleBunch* at the end of the ModuleBunch pass. + auto PAC = PA.getChecker<ModuleAnalysisManagerModuleBunchProxy>(); + if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<ModuleBunch>>()) { + InnerAM->clear(); + return true; + } + + // Directly check if the relevant set is preserved. + bool AreModuleAnalysesPreserved = PA.allAnalysesInSetPreserved<AllAnalysesOn<Module>>(); + + // Now walk all the modules to see if any inner analysis invalidation is + // necessary.
+  for (Module &M : Bunch) {
+    std::optional<PreservedAnalyses> ModulePA;
+
+    // Check to see whether the preserved set needs to be pruned based on
+    // module-level analysis invalidation that triggers deferred invalidation
+    // registered with the outer analysis manager proxy for this module.
+    if (auto *OuterProxy = InnerAM->getCachedResult<ModuleBunchAnalysisManagerModuleProxy>(M))
+      for (const auto &OuterInvalidationPair : OuterProxy->getOuterInvalidations()) {
+        AnalysisKey *OuterAnalysisID = OuterInvalidationPair.first;
+        const auto &InnerAnalysisIDs = OuterInvalidationPair.second;
+        if (Inv.invalidate(OuterAnalysisID, Bunch, PA)) {
+          if (!ModulePA)
+            ModulePA = PA;
+          for (AnalysisKey *InnerAnalysisID : InnerAnalysisIDs)
+            ModulePA->abandon(InnerAnalysisID);
+        }
+      }
+
+    // Check if we needed a custom PA set, and if so we'll need to run the
+    // inner invalidation.
+    if (ModulePA) {
+      InnerAM->invalidate(M, *ModulePA);
+      continue;
+    }
+
+    // Otherwise we only need to do invalidation if the original PA set didn't
+    // preserve all module analyses.
+    if (!AreModuleAnalysesPreserved)
+      InnerAM->invalidate(M, PA);
+  }
+
+  // Return false to indicate that this result is still a valid proxy.
+  return false;
+}
+
+// Copied from ModuleToFunctionPassAdaptor::printPipeline in llvm/lib/IR/PassManager.cpp and edited.
+void ModuleBunchToModulePassAdaptor::printPipeline(raw_ostream &OS,
+                                                   function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  OS << "module";
+  if (EagerlyInvalidate)
+    OS << "<eager-inv>";
+  OS << "(";
+  Pass->printPipeline(OS, MapClassName2PassName);
+  OS << ")";
+}
+
+// Copied from ModuleToFunctionPassAdaptor::run in llvm/lib/IR/PassManager.cpp and edited.
+PreservedAnalyses ModuleBunchToModulePassAdaptor::run(ModuleBunch &Bunch, ModuleBunchAnalysisManager &AM) {
+  ModuleAnalysisManager &MAM = AM.getResult<ModuleAnalysisManagerModuleBunchProxy>(Bunch).getManager();
+
+  // Request PassInstrumentation from analysis manager, will use it to run
+  // instrumenting callbacks for the passes later.
+  PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(Bunch);
+
+  PreservedAnalyses PA = PreservedAnalyses::all();
+
+  // TODO: Add real parallelism, with an API to provide threads to run module passes.
+  // For now, run each distinct LLVMContext in a separate copy of the module pass manager,
+  // so we can at least test users adding identical copies of the module pass manager.
+  SmallPtrSet<LLVMContext *, 4> DoneContexts;
+  for (unsigned StartIdx = 0; StartIdx != Bunch.size(); ++StartIdx) {
+    Module &module = Bunch.begin()[StartIdx];
+    LLVMContext *Context = &module.getContext();
+    if (!DoneContexts.insert(Context).second)
+      continue;
+
+    // Use the single Pass if it was set. Otherwise call PassMaker to create a Pass each time
+    // round the outer per-LLVMContext loop.
+    std::unique_ptr<PassConceptT> AllocatedPass;
+    PassConceptT *ThisPass = Pass.get();
+    if (!ThisPass) {
+      AllocatedPass = PassMaker();
+      ThisPass = &*AllocatedPass;
+    }
+
+    for (unsigned Idx = StartIdx; Idx != Bunch.size(); ++Idx) {
+      Module &M = Bunch.begin()[Idx];
+      if (&M.getContext() != Context)
+        continue;
+
+      // Check the PassInstrumentation's BeforePass callbacks before running the
+      // pass, skip its execution completely if asked to (callback returns
+      // false).
+      if (!PI.runBeforePass<Module>(*ThisPass, M))
+        continue;
+
+      PreservedAnalyses PassPA = ThisPass->run(M, MAM);
+      PI.runAfterPass<Module>(*ThisPass, M, PassPA);
+
+      // TODO: With real parallelism, the next two statements need to be under a mutex.
+      // We know that the module pass couldn't have invalidated any other
+      // module's analyses (that's the contract of a module pass), so
+      // directly handle the module analysis manager's invalidation here.
+      MAM.invalidate(M, EagerlyInvalidate ? PreservedAnalyses::none() : PassPA);
+
+      // Then intersect the preserved set so that invalidation of module
+      // analyses will eventually occur when the module pass completes.
+      PA.intersect(std::move(PassPA));
+    }
+  }
+
+  // The ModuleAnalysisManagerModuleBunchProxy is preserved because (we assume)
+  // the module passes we ran didn't add or remove any modules.
+  //
+  // We also preserve all analyses on Modules, because we did all the
+  // invalidation we needed to do above.
+  PA.preserveSet<AllAnalysesOn<Module>>();
+  PA.preserve<ModuleAnalysisManagerModuleBunchProxy>();
+  return PA;
+}
+
+// Copied from lib/Passes/PassBuilder.cpp because it is private there.
+std::optional<std::vector<PassBuilder::PipelineElement>> MbPassBuilder::parsePipelineText(StringRef Text) {
+  std::vector<PipelineElement> ResultPipeline;
+
+  SmallVector<std::vector<PipelineElement> *, 4> PipelineStack = {&ResultPipeline};
+  for (;;) {
+    std::vector<PipelineElement> &Pipeline = *PipelineStack.back();
+    size_t Pos = Text.find_first_of(",()");
+    Pipeline.push_back({Text.substr(0, Pos), {}});
+
+    // If we have a single terminating name, we're done.
+    if (Pos == Text.npos)
+      break;
+
+    char Sep = Text[Pos];
+    Text = Text.substr(Pos + 1);
+    if (Sep == ',')
+      // Just a name ending in a comma, continue.
+      continue;
+
+    if (Sep == '(') {
+      // Push the inner pipeline onto the stack to continue processing.
+      PipelineStack.push_back(&Pipeline.back().InnerPipeline);
+      continue;
+    }
+
+    assert(Sep == ')' && "Bogus separator!");
+    // When handling the close parenthesis, we greedily consume them to avoid
+    // empty strings in the pipeline.
+    do {
+      // If we try to pop the outer pipeline we have unbalanced parentheses.
+      if (PipelineStack.size() == 1)
+        return std::nullopt;
+
+      PipelineStack.pop_back();
+    } while (Text.consume_front(")"));
+
+    // Check if we've finished parsing.
+    if (Text.empty())
+      break;
+
+    // Otherwise, the end of an inner pipeline always has to be followed by
+    // a comma, and then we can continue.
+    if (!Text.consume_front(","))
+      return std::nullopt;
+  }
+
+  if (PipelineStack.size() > 1)
+    // Unbalanced parentheses.
+    return std::nullopt;
+
+  assert(PipelineStack.back() == &ResultPipeline && "Wrong pipeline at the bottom of the stack!");
+  return {std::move(ResultPipeline)};
+}
+
+// Copied from PassBuilder.cpp.
+/// Tests whether registered callbacks will accept a given pass name.
+///
+/// When parsing a pipeline text, the type of the outermost pipeline may be
+/// omitted, in which case the type is automatically determined from the first
+/// pass name in the text. This may be a name that is handled through one of the
+/// callbacks. We check this through the ordinary parsing callbacks by setting
+/// up a dummy PassManager in order to not force the client to also handle this
+/// type of query.
+template <typename PassManagerT, typename CallbacksT>
+static bool callbacksAcceptPassName(StringRef Name, CallbacksT &Callbacks) {
+  if (!Callbacks.empty()) {
+    PassManagerT DummyPM;
+    for (auto &CB : Callbacks)
+      if (CB(Name, DummyPM, {}))
+        return true;
+  }
+  return false;
+}
+
+// Copied from isModulePassName in PassBuilder.cpp and edited.
+template <typename CallbacksT> static bool isModuleBunchPassName(StringRef Name, CallbacksT &Callbacks) {
+  // Explicitly handle pass manager names.
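+  // Any name that is not one of the pass manager nestings below is accepted only if a registered
+  // modulebunch parsing callback claims it; see the callbacksAcceptPassName call at the end of this function.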
+  if (Name == "modulebunch")
+    return true;
+  if (Name == "module")
+    return true;
+  if (Name == "cgscc")
+    return true;
+  if (Name == "function" || Name == "function<eager-inv>")
+    return true;
+  if (Name == "coro-cond")
+    return true;
+
+  return callbacksAcceptPassName<ModuleBunchPassManager>(Name, Callbacks);
+}
+
+// Copied from the ModulePassManager overload in llvm/lib/Passes/PassBuilder.cpp and edited.
+// Primary pass pipeline description parsing routine for a \c ModuleBunchPassManager
+// FIXME: Should this routine accept a TargetMachine or require the caller to
+// pre-populate the analysis managers with target-specific stuff?
+Error MbPassBuilder::parsePassPipeline(ModuleBunchPassManager &MBPM, StringRef PipelineText) {
+  auto Pipeline = parsePipelineText(PipelineText);
+  if (!Pipeline || Pipeline->empty())
+    return make_error<StringError>(formatv("invalid pipeline '{0}'", PipelineText).str(), inconvertibleErrorCode());
+
+  // If the first name isn't at the modulebunch layer, wrap the pipeline up
+  // automatically.
+  StringRef FirstName = Pipeline->front().Name;
+
+  if (!isModuleBunchPassName(FirstName, ModuleBunchPipelineParsingCallbacks)) {
+    ModulePassManager MPM;
+    if (Error Err = PassBuilder::parsePassPipeline(MPM, PipelineText))
+      return Err;
+    MBPM.addPass(createModuleBunchToModulePassAdaptor(std::move(MPM)));
+    return Error::success();
+  }
+
+  if (auto Err = parseModuleBunchPassPipeline(MBPM, *Pipeline))
+    return Err;
+  return Error::success();
+}
+
+// Copied from PassBuilder::parseModulePassPipeline and edited.
+Error MbPassBuilder::parseModuleBunchPassPipeline(ModuleBunchPassManager &MBPM, ArrayRef<PipelineElement> Pipeline) {
+  for (const auto &Element : Pipeline) {
+    if (auto Err = parseModuleBunchPass(MBPM, Element))
+      return Err;
+  }
+  return Error::success();
+}
+
+// Copied from PassBuilder::parseModulePass and edited.
+Error MbPassBuilder::parseModuleBunchPass(ModuleBunchPassManager &MBPM, const PipelineElement &E) {
+  auto &Name = E.Name;
+  auto &InnerPipeline = E.InnerPipeline;
+
+  // First handle complex passes like the pass managers which carry pipelines.
+  if (!InnerPipeline.empty()) {
+    if (Name == "modulebunch") {
+      ModuleBunchPassManager NestedMBPM;
+      if (auto Err = parseModuleBunchPassPipeline(NestedMBPM, InnerPipeline))
+        return Err;
+      MBPM.addPass(std::move(NestedMBPM));
+      return Error::success();
+    }
+    // TODO:
+    // For any other nested pass manager ("module", "function" etc) we want to invoke
+    // parseModulePassPipeline etc, but we can't as it is private in PassBuilder. So
+    // instead we need to reconstruct a text string and call parsePipelineText.
+    report_fatal_error("Nested pipeline spec not handled yet");
+  }
+
+  for (auto &C : ModuleBunchPipelineParsingCallbacks)
+    if (C(Name, MBPM, InnerPipeline))
+      return Error::success();
+  return make_error<StringError>(formatv("unknown modulebunch pass '{0}'", Name).str(), inconvertibleErrorCode());
+}
diff --git a/imported/llvm-dialects b/imported/llvm-dialects
index c436594690..50e4ca3a5c 160000
--- a/imported/llvm-dialects
+++ b/imported/llvm-dialects
@@ -1 +1 @@
-Subproject commit c4365946902436063f872dbcf1a370fe73982a54
+Subproject commit 50e4ca3a5c365b0bde36b122cc34256406723049
diff --git a/include/khronos/spirv/GLSL.ext.AMD.h b/include/khronos/spirv/GLSL.ext.AMD.h
new file mode 100644
index 0000000000..ba7f636425
--- /dev/null
+++ b/include/khronos/spirv/GLSL.ext.AMD.h
@@ -0,0 +1,87 @@
+/*
+** Copyright (c) 2014-2016 The Khronos Group Inc.
+** +** Permission is hereby granted, free of charge, to any person obtaining a copy +** of this software and/or associated documentation files (the "Materials"), +** to deal in the Materials without restriction, including without limitation +** the rights to use, copy, modify, merge, publish, distribute, sublicense, +** and/or sell copies of the Materials, and to permit persons to whom the +** Materials are furnished to do so, subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in +** all copies or substantial portions of the Materials. +** +** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS +** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND +** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS +** IN THE MATERIALS. +*/ + +#ifndef GLSLextAMD_H +#define GLSLextAMD_H + +enum BuiltIn; +enum Capability; +enum Decoration; +enum Op; + +static const int GLSLextAMDVersion = 100; +static const int GLSLextAMDRevision = 8; + +// SPV_AMD_shader_ballot +enum ShaderBallotAMD { + ShaderBallotBadAMD = 0, // Don't use + + SwizzleInvocationsAMD = 1, + SwizzleInvocationsMaskedAMD = 2, + WriteInvocationAMD = 3, + MbcntAMD = 4, + + ShaderBallotCountAMD +}; + +// SPV_AMD_shader_trinary_minmax +enum ShaderTrinaryMinMaxAMD { + ShaderTrinaryMinMaxBadAMD = 0, // Don't use + + FMin3AMD = 1, + UMin3AMD = 2, + SMin3AMD = 3, + FMax3AMD = 4, + UMax3AMD = 5, + SMax3AMD = 6, + FMid3AMD = 7, + UMid3AMD = 8, + SMid3AMD = 9, + + ShaderTrinaryMinMaxCountAMD +}; + +// SPV_AMD_shader_explicit_vertex_parameter +enum ShaderExplicitVertexParameterAMD { + ShaderExplicitVertexParameterBadAMD = 0, // Don't use + + InterpolateAtVertexAMD = 1, + + ShaderExplicitVertexParameterCountAMD +}; + +// SPV_AMD_gcn_shader +enum GcnShaderAMD { + GcnShaderBadAMD = 0, // Don't use + + CubeFaceIndexAMD = 1, + CubeFaceCoordAMD = 2, + TimeAMD = 3, + + GcnShaderCountAMD +}; + +#endif // #ifndef GLSLextAMD_H diff --git a/lgc/include/lgc/patch/LgcLowering.h b/lgc/include/lgc/patch/LgcLowering.h new file mode 100644 index 0000000000..36687ed86f --- /dev/null +++ b/lgc/include/lgc/patch/LgcLowering.h @@ -0,0 +1,77 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file  LgcLowering.h
+ * @brief LLPC header file: contains declaration of class lgc::Patch.
+ ***********************************************************************************************************************
+ */
+#pragma once
+
+#include "lgc/Pipeline.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class PassBuilder;
+
+} // namespace llvm
+
+namespace lgc {
+
+class PipelineState;
+class PassManager;
+
+// =====================================================================================================================
+// Represents the pass of LLVM patching operations, as the base class.
+class Patch {
+public:
+  Patch() : m_module(nullptr), m_context(nullptr), m_entryPoint(nullptr) {}
+  virtual ~Patch() = default;
+
+  static void addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, llvm::Timer *patchTimer,
+                        llvm::Timer *optTimer, Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, uint32_t optLevel);
+
+  // Register all the patching passes into the given pass manager
+  static void registerPasses(lgc::PassManager &passMgr);
+
+  // Register all the patching passes into the given pass builder
+  static void registerPasses(llvm::PassBuilder &passBuilder);
+
+  static llvm::Constant *getLdsVariable(PipelineState *pipelineState, llvm::Function *func, bool rtStack = false);
+
+protected:
+  static void addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel);
+
+  void init(llvm::Module *module);
+
+  llvm::Module *m_module;                       // LLVM module to be run on
+  llvm::LLVMContext *m_context;                 // Associated LLVM context of the LLVM module that passes run on
+  std::optional<ShaderStageEnum> m_shaderStage; // Shader stage
+  llvm::Function *m_entryPoint;                 // Entry-point
+};
+
+} // namespace lgc
diff --git a/lgc/patch/LgcLowering.cpp b/lgc/patch/LgcLowering.cpp
new file mode 100644
index 0000000000..e7b79c7ddd
--- /dev/null
+++ b/lgc/patch/LgcLowering.cpp
@@ -0,0 +1,524 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file  LgcLowering.cpp
+ * @brief LLPC source file: contains implementation of class lgc::Patch.
+ ***********************************************************************************************************************
+ */
+#include "lgc/patch/LgcLowering.h"
+#include "GenerateNullFragmentShader.h"
+#include "LowerPopsInterlock.h"
+#include "LowerRayQueryWrapper.h"
+#include "llvmraytracing/Continuations.h"
+#include "lgc/LgcContext.h"
+#include "lgc/PassManager.h"
+#include "lgc/Pipeline.h"
+#include "lgc/builder/BuilderReplayer.h"
+#include "lgc/patch/AddLoopMetadata.h"
+#include "lgc/patch/ApplyWorkarounds.h"
+#include "lgc/patch/CheckShaderCache.h"
+#include "lgc/patch/CollectImageOperations.h"
+#include "lgc/patch/CollectResourceUsage.h"
+#include "lgc/patch/Continufy.h"
+#include "lgc/patch/FragmentColorExport.h"
+#include "lgc/patch/GenerateCopyShader.h"
+#include "lgc/patch/IncludeLlvmIr.h"
+#include "lgc/patch/LowerBufferOperations.h"
+#include "lgc/patch/LowerDebugPrintf.h"
+#include "lgc/patch/LowerDesc.h"
+#include "lgc/patch/LowerGpuRt.h"
+#include "lgc/patch/LowerImageDerivatives.h"
+#include "lgc/patch/LowerInOut.h"
+#include "lgc/patch/LowerInvariantLoads.h"
+#include "lgc/patch/LowerMulDx9Zero.h"
+#include "lgc/patch/LowerReadFirstLane.h"
+#include "lgc/patch/LowerSubgroupOps.h"
+#include "lgc/patch/MutateEntryPoint.h"
+#include "lgc/patch/PassthroughHullShader.h"
+#include "lgc/patch/PatchInitializeWorkgroupMemory.h"
+#include "lgc/patch/PeepholeOptimization.h"
+#include "lgc/patch/PreparePipelineAbi.h"
+#include "lgc/patch/ScalarizeLoads.h"
+#include "lgc/patch/SetupTargetFeatures.h"
+#include "lgc/patch/StructurizeBuffers.h"
+#include "lgc/patch/VertexFetch.h"
+
+#if LLPC_BUILD_STRIX1
+#include "lgc/patch/WorkaroundDsSubdwordWrite.h"
+#endif
+#include "lgc/patch/CombineCooperativeMatrix.h"
+#include "lgc/patch/LowerCooperativeMatrix.h"
+#include "lgc/state/AbiMetadata.h"
+#include "lgc/state/PipelineState.h"
+#include "lgc/state/TargetInfo.h"
+#include "lgc/util/Debug.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRPrinter/IRPrintingPasses.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ConstantMerge.h"
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/InferAlignment.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+
+#define DEBUG_TYPE "lgc-patch"
+
+using namespace llvm;
+
+static const char LdsGsName[] = "Lds.GS";
+static const char LdsHsName[] = "Lds.HS";
+
+namespace lgc {
+
+// =====================================================================================================================
+// Add whole-pipeline patch passes to pass manager
+//
+// @param pipelineState : Pipeline state
+// @param [in/out] passMgr : Pass manager to add passes to
+// @param patchTimer : Timer to time patch passes with, nullptr if not timing
+// @param optTimer : Timer to time LLVM optimization passes with, nullptr if not timing
+// @param checkShaderCacheFunc : Callback function to check shader cache
+// @param optLevel : The optimization level used to adjust the aggressiveness of
+//                   passes and which passes to add.
+void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, Timer *patchTimer, Timer *optTimer,
+                      Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, uint32_t optLevel) {
+  // Start timer for patching passes.
+  if (patchTimer)
+    LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, true);
+
+  if (pipelineState->getOptions().useGpurt) {
+    passMgr.addPass(LowerRayQueryWrapper());
+  }
+  const auto indirectMode = pipelineState->getOptions().rtIndirectMode;
+  if (indirectMode == RayTracingIndirectMode::ContinuationsContinufy ||
+      indirectMode == RayTracingIndirectMode::Continuations) {
+    if (indirectMode == RayTracingIndirectMode::ContinuationsContinufy) {
+      passMgr.addPass(Continufy());
+      // NOTE: LowerGpuRt needs to be run before continuation transform for continufy mode because some GPURT dialects
+      // that continuation transform does not support are used.
+      passMgr.addPass(LowerGpuRt());
+    } else {
+      // NOTE: LowerRaytracingPipelinePass should be run before getting into LGC because we will need to collect
+      // metadata added by the pass.
+      // Optimize away the allocas inserted during lower-raytracing pipeline, to avoid them being put in continuation
+      // state.
+      passMgr.addPass(createModuleToFunctionPassAdaptor(SROAPass(llvm::SROAOptions::ModifyCFG)));
+    }
+
+    addLgcContinuationTransform(passMgr);
+  }
+
+  if (pipelineState->getOptions().useGpurt) {
+    // NOTE: Lower GPURT operations and run InstCombinePass before builder replayer, because some ops are going to be
+    // turned into constant values, so that we can eliminate unused `@lgc.load.buffer.desc` before getting into the
+    // replayer. Otherwise, unnecessary `writes_uavs` and `uses_uav` may be set.
+ // NOTE: Lower GPURT operations after continuations transform, because we will inline some functions from GPURT + // library which may use gpurt dialect, and the library itself doesn't run any LGC passes. + passMgr.addPass(LowerGpuRt()); + passMgr.addPass(createModuleToFunctionPassAdaptor(InstCombinePass())); + } + + // NOTE: Replay after continuations transform, because we will inline some functions from GPURT library which may use + // lgc record ops, and the library itself doesn't run any LGC passes. + // We're using BuilderRecorder; replay the Builder calls now + passMgr.addPass(BuilderReplayer()); + passMgr.addPass(LowerSubgroupOps()); + + if (raw_ostream *outs = getLgcOuts()) { + passMgr.addPass(PrintModulePass(*outs, + "===============================================================================\n" + "// LLPC pipeline before-patching results\n")); + } + + passMgr.addPass(IPSCCPPass()); + passMgr.addPass(createModuleToFunctionPassAdaptor(CombineCooperativeMatrix())); + // Lower the cooperative matrix + passMgr.addPass(LowerCooperativeMatrix()); + + if (pipelineState->hasShaderStage(ShaderStage::Vertex) && !pipelineState->hasShaderStage(ShaderStage::TessControl) && + pipelineState->hasShaderStage(ShaderStage::TessEval)) + passMgr.addPass(TcsPassthroughShader()); + + passMgr.addPass(GenerateNullFragmentShader()); + passMgr.addPass(CollectResourceUsage()); // also removes inactive/unused resources + + // CheckShaderCache depends on CollectResourceUsage + passMgr.addPass(CheckShaderCache(std::move(checkShaderCacheFunc))); + + // First part of lowering to "AMDGCN-style" + passMgr.addPass(ApplyWorkarounds()); + passMgr.addPass(GenerateCopyShader()); + passMgr.addPass(LowerVertexFetch()); + passMgr.addPass(LowerFragColorExport()); + passMgr.addPass(LowerDebugPrintf()); + passMgr.addPass(LowerDesc()); + passMgr.addPass(MutateEntryPoint()); + passMgr.addPass(createModuleToFunctionPassAdaptor(LowerPopsInterlock())); + passMgr.addPass(PatchInitializeWorkgroupMemory()); + passMgr.addPass(LowerInOut()); + + // Patch invariant load and loop metadata. + passMgr.addPass(createModuleToFunctionPassAdaptor(LowerInvariantLoads())); + passMgr.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor(AddLoopMetadata()))); + +#if LLPC_BUILD_STRIX1 + passMgr.addPass(WorkaroundDsSubdwordWrite()); +#endif + + if (patchTimer) { + LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, false); + LgcContext::createAndAddStartStopTimer(passMgr, optTimer, true); + } + + addOptimizationPasses(passMgr, optLevel); + + if (patchTimer) { + LgcContext::createAndAddStartStopTimer(passMgr, optTimer, false); + LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, true); + } + + // Collect image operations + if (pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) + passMgr.addPass(CollectImageOperations()); + + // Second part of lowering to "AMDGCN-style" + passMgr.addPass(PreparePipelineAbi()); + + // Do inlining and global DCE to inline subfunctions that were introduced during preparing pipeline ABI. 
+ passMgr.addPass(AlwaysInlinerPass()); + passMgr.addPass(GlobalDCEPass()); + + const bool canUseNgg = pipelineState->isGraphics() && + ((pipelineState->getTargetInfo().getGfxIpVersion().major == 10 && + (pipelineState->getOptions().nggFlags & NggFlagDisable) == 0) || + pipelineState->getTargetInfo().getGfxIpVersion().major >= 11); // Must enable NGG on GFX11+ + if (canUseNgg) { + if (patchTimer) { + LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, false); + LgcContext::createAndAddStartStopTimer(passMgr, optTimer, true); + } + + // Extra optimizations after NGG primitive shader creation + FunctionPassManager fpm; + fpm.addPass(PromotePass()); + fpm.addPass(ADCEPass()); + fpm.addPass(StructurizeBuffers()); + fpm.addPass(PatchBufferOp()); + fpm.addPass(InstCombinePass()); + fpm.addPass(SimplifyCFGPass()); + passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); + + if (patchTimer) { + LgcContext::createAndAddStartStopTimer(passMgr, optTimer, false); + LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, true); + } + } else { + FunctionPassManager fpm; + fpm.addPass(StructurizeBuffers()); + fpm.addPass(PatchBufferOp()); + fpm.addPass(InstCombinePass()); + passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); + } + + passMgr.addPass(LowerImageDerivatives()); + + // Set up target features in shader entry-points. + // NOTE: Needs to be done after post-NGG function inlining, because LLVM refuses to inline something + // with conflicting attributes. Attributes could conflict on GFX10 because PatchSetupTargetFeatures + // adds a target feature to determine wave32 or wave64. + passMgr.addPass(PatchSetupTargetFeatures()); + + // Include LLVM IR as a separate section in the ELF binary + if (pipelineState->getOptions().includeIr) + passMgr.addPass(IncludeLlvmIr()); + + // Stop timer for patching passes. 
+  if (patchTimer)
+    LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, false);
+
+  // Dump the result
+  if (raw_ostream *outs = getLgcOuts()) {
+    passMgr.addPass(PrintModulePass(*outs,
+                                    "===============================================================================\n"
+                                    "// LLPC pipeline patching results\n"));
+  }
+}
+
+// =====================================================================================================================
+// Register all the patching passes into the given pass manager
+//
+// @param [in/out] passMgr : Pass manager
+void Patch::registerPasses(lgc::PassManager &passMgr) {
+#define LLPC_PASS(NAME, CLASS) passMgr.registerPass(NAME, CLASS::name());
+#define LLPC_MODULE_ANALYSIS(NAME, CLASS) passMgr.registerPass(NAME, CLASS::name());
+#include "PassRegistry.inc"
+}
+
+// =====================================================================================================================
+// Register all the patching passes into the given pass builder
+//
+// @param [in/out] passBuilder : Pass builder
+void Patch::registerPasses(PassBuilder &passBuilder) {
+#define HANDLE_PASS(NAME, CLASS)                                                                                       \
+  if (innerPipeline.empty() && name == NAME) {                                                                         \
+    passMgr.addPass(CLASS());                                                                                          \
+    return true;                                                                                                       \
+  }
+
+#define HANDLE_ANALYSIS(NAME, CLASS, IRUNIT)                                                                           \
+  if (innerPipeline.empty() && name == "require<" NAME ">") {                                                          \
+    passMgr.addPass(RequireAnalysisPass<CLASS, IRUNIT>());                                                             \
+    return true;                                                                                                       \
+  }                                                                                                                    \
+  if (innerPipeline.empty() && name == "invalidate<" NAME ">") {                                                       \
+    passMgr.addPass(InvalidateAnalysisPass<CLASS>());                                                                  \
+    return true;                                                                                                       \
+  }
+
+  auto checkNameWithParams = [](StringRef name, StringRef passName, StringRef &params) -> bool {
+    params = name;
+    if (!params.consume_front(passName))
+      return false;
+    if (params.empty())
+      return true;
+    if (!params.consume_front("<"))
+      return false;
+    if (!params.consume_back(">"))
+      return false;
+    return true;
+  };
+  (void)checkNameWithParams;
+
+#define HANDLE_PASS_WITH_PARSER(NAME, CLASS)                                                                           \
+  if (innerPipeline.empty() && checkNameWithParams(name, NAME, params))                                                \
+    return CLASS::parsePass(params, passMgr);
+
+  passBuilder.registerPipelineParsingCallback(
+      [=](StringRef name, ModulePassManager &passMgr, ArrayRef<PassBuilder::PipelineElement> innerPipeline) {
+        StringRef params;
+        (void)params;
+#define LLPC_PASS(NAME, CLASS) /* */
+#define LLPC_MODULE_PASS HANDLE_PASS
+#define LLPC_MODULE_PASS_WITH_PARSER HANDLE_PASS_WITH_PARSER
+#define LLPC_MODULE_ANALYSIS(NAME, CLASS) HANDLE_ANALYSIS(NAME, CLASS, Module)
+#include "PassRegistry.inc"
+
+        return false;
+      });
+
+  passBuilder.registerPipelineParsingCallback(
+      [=](StringRef name, FunctionPassManager &passMgr, ArrayRef<PassBuilder::PipelineElement> innerPipeline) {
+        StringRef params;
+        (void)params;
+#define LLPC_PASS(NAME, CLASS) /* */
+#define LLPC_FUNCTION_PASS HANDLE_PASS
+#define LLPC_FUNCTION_PASS_WITH_PARSER HANDLE_PASS_WITH_PARSER
+#include "PassRegistry.inc"
+
+        return false;
+      });
+
+  passBuilder.registerPipelineParsingCallback(
+      [=](StringRef name, LoopPassManager &passMgr, ArrayRef<PassBuilder::PipelineElement> innerPipeline) {
+        StringRef params;
+        (void)params;
+#define LLPC_PASS(NAME, CLASS) /* */
+#define LLPC_LOOP_PASS HANDLE_PASS
+#define LLPC_LOOP_PASS_WITH_PARSER HANDLE_PASS_WITH_PARSER
+#include "PassRegistry.inc"
+
+        return false;
+      });
+
+#undef HANDLE_PASS
+#undef HANDLE_PASS_WITH_PARSER
+}
+
+// =====================================================================================================================
+// Add optimization passes to pass manager
+//
+// @param [in/out] passMgr : Pass manager to add passes to
+// @param optLevel : The optimization level used to adjust the aggressiveness of
+//                   passes and which passes to add.
+void Patch::addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel) {
+  LLPC_OUTS("PassManager optimization level = " << optLevel << "\n");
+
+  passMgr.addPass(ForceFunctionAttrsPass());
+  FunctionPassManager fpm;
+  fpm.addPass(InstCombinePass());
+  fpm.addPass(SimplifyCFGPass());
+  fpm.addPass(SROAPass(SROAOptions::ModifyCFG));
+  fpm.addPass(EarlyCSEPass(true));
+  fpm.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget = */ true));
+  fpm.addPass(CorrelatedValuePropagationPass());
+  fpm.addPass(SimplifyCFGPass());
+  fpm.addPass(AggressiveInstCombinePass());
+  fpm.addPass(InstCombinePass());
+  fpm.addPass(PeepholeOptimization());
+  fpm.addPass(SimplifyCFGPass());
+  fpm.addPass(ReassociatePass());
+  LoopPassManager lpm;
+  lpm.addPass(LoopRotatePass());
+  lpm.addPass(LICMPass(LICMOptions()));
+  fpm.addPass(createFunctionToLoopPassAdaptor(std::move(lpm), true));
+  fpm.addPass(SimplifyCFGPass());
+  fpm.addPass(InstCombinePass());
+  LoopPassManager lpm2;
+  lpm2.addPass(IndVarSimplifyPass());
+  lpm2.addPass(LoopIdiomRecognizePass());
+  lpm2.addPass(LoopDeletionPass());
+  fpm.addPass(createFunctionToLoopPassAdaptor(std::move(lpm2), true));
+  fpm.addPass(LoopUnrollPass(
+      LoopUnrollOptions(optLevel).setPeeling(true).setRuntime(false).setUpperBound(false).setPartial(false)));
+  fpm.addPass(SROAPass(SROAOptions::ModifyCFG));
+  ScalarizerPassOptions scalarizerOptions;
+  scalarizerOptions.ScalarizeMinBits = 32;
+  fpm.addPass(ScalarizerPass(scalarizerOptions));
+  fpm.addPass(LowerMulDx9Zero());
+  fpm.addPass(ScalarizeLoads());
+  fpm.addPass(InstSimplifyPass());
+  fpm.addPass(NewGVNPass());
+  fpm.addPass(BDCEPass());
+  fpm.addPass(InstCombinePass());
+  fpm.addPass(CorrelatedValuePropagationPass());
+  fpm.addPass(ADCEPass());
+  fpm.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass()));
+  fpm.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                  .bonusInstThreshold(1)
+                                  .forwardSwitchCondToPhi(true)
+                                  .convertSwitchToLookupTable(true)
+                                  .needCanonicalLoops(true)
+                                  .hoistCommonInsts(true)
+                                  .sinkCommonInsts(true)));
+  fpm.addPass(LoopUnrollPass(LoopUnrollOptions(optLevel)));
+  fpm.addPass(SROAPass(SROAOptions::ModifyCFG));
+  // uses UniformityAnalysis
+  fpm.addPass(LowerReadFirstLane());
+  fpm.addPass(InferAlignmentPass());
+  fpm.addPass(InstCombinePass());
+  passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm)));
+  passMgr.addPass(ConstantMergePass());
+  FunctionPassManager fpm2;
+  fpm2.addPass(DivRemPairsPass());
+  fpm2.addPass(SimplifyCFGPass());
+  passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm2)));
+}
+
+// =====================================================================================================================
+// Initializes the pass according to the specified module.
+//
+// NOTE: This function should be called at the beginning of "runOnModule()".
+//
+// @param module : LLVM module
+void Patch::init(Module *module) {
+  m_module = module;
+  m_context = &m_module->getContext();
+  m_shaderStage = std::nullopt;
+  m_entryPoint = nullptr;
+}
+
+// =====================================================================================================================
+// Get or create global variable for LDS.
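+// The variable is shared for the hardware stage (GS or HS) and is sized as the pipeline's static LDS usage plus,
+// optionally, an extra region used as the ray-tracing stack.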
+//
+// @param pipelineState : Pipeline state
+// @param [in/out] func : Function to get or create the LDS for (in its parent module)
+// @param rtStack : Whether to return the portion of the LDS used for the ray-tracing stack
+Constant *Patch::getLdsVariable(PipelineState *pipelineState, Function *func, bool rtStack) {
+  auto module = func->getParent();
+  auto context = &module->getContext();
+
+  auto stage = getShaderStage(func);
+  assert(stage && "unable to determine stage for LDS usage");
+
+  unsigned hwStageMask = pipelineState->getShaderHwStageMask(*stage);
+
+  ShaderStageEnum ldsStage;
+  const char *ldsName;
+  if (hwStageMask & Util::Abi::HwShaderGs) {
+    ldsName = LdsGsName;
+    ldsStage = ShaderStage::Geometry;
+  } else if (hwStageMask & Util::Abi::HwShaderHs) {
+    ldsName = LdsHsName;
+    ldsStage = ShaderStage::TessControl;
+  } else {
+    assert(false && "requesting LDS variable for unknown shader type");
+    return nullptr;
+  }
+
+  const unsigned staticLdsSize = pipelineState->getShaderStaticLdsUsage(ldsStage, /*rtStack=*/false);
+  const unsigned rtLdsSize = pipelineState->getShaderStaticLdsUsage(ldsStage, /*rtStack=*/true);
+  const unsigned ldsSize = staticLdsSize + rtLdsSize;
+
+  // See if module already has LDS variable.
+  GlobalVariable *lds = nullptr;
+  auto oldLds = func->getParent()->getNamedValue(ldsName);
+  const auto i32Ty = Type::getInt32Ty(*context);
+  if (oldLds) {
+    lds = cast<GlobalVariable>(oldLds);
+  } else {
+    // Else create LDS variable for this function.
+    // LDS type: [ldsSize * i32], address space 3
+    const auto ldsTy = ArrayType::get(i32Ty, ldsSize);
+    lds = new GlobalVariable(*module, ldsTy, false, GlobalValue::ExternalLinkage, nullptr, Twine(ldsName), nullptr,
+                             GlobalValue::NotThreadLocal, ADDR_SPACE_LOCAL);
+    lds->setAlignment(MaybeAlign(sizeof(unsigned)));
+  }
+
+  if (rtStack) {
+    auto *offset = Constant::getIntegerValue(i32Ty, APInt(32, staticLdsSize));
+    return ConstantExpr::getGetElementPtr(i32Ty, lds, offset);
+  }
+
+  return lds;
+}
+
+} // namespace lgc
diff --git a/lgc/test/NggInPassthroughMode.lgc b/lgc/test/NggInPassthroughMode.lgc
new file mode 100644
index 0000000000..0903941e79
--- /dev/null
+++ b/lgc/test/NggInPassthroughMode.lgc
@@ -0,0 +1,21 @@
+; RUN: lgc -march=amdgcn--amdpal -mcpu=gfx1100 -o - <%s | FileCheck %s
+
+; Check that NGG passthrough mode is used for this shader. If an s_sendmsg
+; instruction is generated on GFX11, passthrough mode is not used.
+
+define dllexport spir_func void @lgc.shader.VS.main() !spirv.ExecutionModel !3 !lgc.shaderstage !4 {
+; CHECK-NOT: s_sendmsg
+  %1 = insertelement <4 x float> <float 0.0, float 0.0, float 0.0, float 1.0>, float 0.0, i64 0
+  call void (...) @lgc.create.write.builtin.output(<4 x float> %1, i32 0, i32 0, i32 poison, i32 poison)
+  ret void
+}
+
+declare void @lgc.create.write.builtin.output(...)
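+; The !lgc.* named metadata below supplies pipeline state (compile options and
+; input-assembly topology) that would normally come from the client driver.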
+
+!lgc.options = !{!1}
+!lgc.input.assembly.state = !{!2}
+
+!1 = !{i32 -291355731, i32 1941901057, i32 1881874640, i32 2004622469, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 272, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 1}
+!2 = !{i32 3}
+!3 = !{i32 0}
+!4 = !{i32 1}
diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.elf b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.elf
new file mode 100644
index 0000000000..6814fa96dd
Binary files /dev/null and b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.elf differ
diff --git a/llpc/test/shaderdb/general/CantOptimizePointSizeWrite.pipe b/llpc/test/shaderdb/general/CantOptimizePointSizeWrite.pipe
new file mode 100644
index 0000000000..bf2520339d
--- /dev/null
+++ b/llpc/test/shaderdb/general/CantOptimizePointSizeWrite.pipe
@@ -0,0 +1,102 @@
+; This test verifies that the optimization of the PointSize write cannot be performed here. When the value written to
+; PointSize is not uniformly 1.0, we must disable the optimization: PointSize then takes different values, so
+; omitting the writes of 1.0 would not be semantically correct.
+
+; RUN: amdllpc -emit-llvm -print-after=lgc-collect-resource-usage %gfxip %s 2>&1 | FileCheck -check-prefix=SHADERTEST %s
+
+; SHADERTEST-LABEL: @lgc.shader.GS.main()
+; SHADERTEST: call void @lgc.output.export.builtin.PointSize.i32.i32.f32(i32 1, i32 1, float 1.000000e+00)
+; SHADERTEST: call void @lgc.output.export.builtin.PointSize.i32.i32.f32(i32 1, i32 1, float 2.000000e+00)
+
+[Version]
+version = 75
+
+[VsGlsl]
+#version 450
+
+void main(void)
+{
+}
+
+[VsInfo]
+entryPoint = main
+
+[GsGlsl]
+#version 450
+
+layout(points) in;
+layout(points, max_vertices = 16) out;
+layout(stream = 1) out;
+layout(location = 0) out vec4 color;
+
+layout(stream = 1) out gl_PerVertex
+{
+  vec4 gl_Position;
+  float gl_PointSize;
+};
+
+void main(void)
+{
+  // Color constants
+  vec4 g = vec4(0.0, 1.0, 0.0, 1.0);
+  vec4 m = vec4(1.0, 0.0, 1.0, 1.0);
+  // Coordinate constants: leftmost column
+  vec4 a = vec4(-1.0,-1.0, 0.0, 1.0);
+  vec4 b = vec4(-1.0, 0.0, 0.0, 1.0);
+  vec4 c = vec4(-1.0, 1.0, 0.0, 1.0);
+  // Coordinate constants: middle column
+  vec4 i = vec4( 0.0,-1.0, 0.0, 1.0);
+  vec4 j = vec4( 0.0, 0.0, 0.0, 1.0);
+  vec4 k = vec4( 0.0, 1.0, 0.0, 1.0);
+  // Coordinate constants: rightmost column
+  vec4 x = vec4( 1.0,-1.0, 0.0, 1.0);
+  vec4 y = vec4( 1.0, 0.0, 0.0, 1.0);
+  vec4 z = vec4( 1.0, 1.0, 0.0, 1.0);
+
+  if (gl_PrimitiveIDIn == 0)
+  {
+    color = g; gl_Position = (a + j) / 2.0f; gl_PointSize = 1.0f; EmitStreamVertex(0);
+    EndStreamPrimitive(0);
+    color = m; gl_Position = (b + k) / 2.0f; gl_PointSize = 1.0f; EmitStreamVertex(1);
+    EndStreamPrimitive(1);
+  }
+  else
+  {
+    color = g; gl_Position = (j + x) / 2.0f; gl_PointSize = 2.0f; EmitStreamVertex(0);
+    EndStreamPrimitive(0);
+    color = m; gl_Position = (k + y) / 2.0f; gl_PointSize = 2.0f; EmitStreamVertex(1);
+    EndStreamPrimitive(1);
+  }
+}
+
+[GsInfo]
+entryPoint = main
+
+[FsGlsl]
+#version 450
+
+layout(location = 0) in vec4 i_color;
+layout(location = 0) out vec4 o_color;
+
+void main(void)
+{
+  o_color = i_color;
+}
+
+[FsInfo]
+entryPoint = main
+
+[ResourceMapping]
+[GraphicsPipelineState]
+topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST
+provokingVertexMode = VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT
+depthClipEnable = 1
+rasterStream = 1
+numSamples = 1
+colorBuffer[0].format = VK_FORMAT_R8G8B8A8_UNORM
+colorBuffer[0].channelWriteMask = 15
+colorBuffer[0].blendEnable = 0
+colorBuffer[0].blendSrcAlphaToColor = 0
+options.optimizationLevel = 2 +options.optimizePointSizeWrite = 1 diff --git a/llpc/test/shaderdb/general/NggInCullingMode.pipe b/llpc/test/shaderdb/general/NggInCullingMode.pipe new file mode 100644 index 0000000000..d2118773ac --- /dev/null +++ b/llpc/test/shaderdb/general/NggInCullingMode.pipe @@ -0,0 +1,106 @@ +; This test is to verify NGG culling mode is enabled as expected. + +; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s + +; SHADERTEST-LABEL: .vgt_shader_stages_en +; SHADERTEST: .primgen_en: 1 +; SHADERTEST: .primgen_passthru_en: 0 + +[Version] +version = 75 + +[VsGlsl] +#version 430 + +layout(location = 0) in vec4 in_position; +layout(location = 1) in vec4 in_color; +layout(location = 2) in int in_refVertexIndex; + +layout(location = 0) out vec4 out_color; + +void main() { + gl_Position = in_position; + if (gl_VertexIndex == in_refVertexIndex) + out_color = in_color; + else + out_color = vec4(1.0, 0.0, 0.0, 1.0); +} + +[VsInfo] +entryPoint = main + +[FsGlsl] +#version 430 + +layout(location = 0) in vec4 in_color; +layout(location = 0) out vec4 out_color; + +void main() { + out_color = in_color; +} + +[FsInfo] +entryPoint = main + +[ResourceMapping] +userDataNode[0].visibility = 2 +userDataNode[0].type = IndirectUserDataVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].indirectUserDataCount = 4 +userDataNode[1].visibility = 66 +userDataNode[1].type = DescriptorTableVaPtr +userDataNode[1].offsetInDwords = 6 +userDataNode[1].sizeInDwords = 1 +userDataNode[1].next[0].type = DescriptorConstBufferCompact +userDataNode[1].next[0].offsetInDwords = 0 +userDataNode[1].next[0].sizeInDwords = 2 +userDataNode[1].next[0].set = 0x0000005D +userDataNode[1].next[0].binding = 17 +userDataNode[1].next[0].strideInDwords = 0 +userDataNode[1].next[1].type = DescriptorConstBuffer +userDataNode[1].next[1].offsetInDwords = 2 +userDataNode[1].next[1].sizeInDwords = 8 +userDataNode[1].next[1].set = 0x0000005D +userDataNode[1].next[1].binding = 0 +userDataNode[1].next[1].strideInDwords = 0 +userDataNode[1].next[2].type = DescriptorBuffer +userDataNode[1].next[2].offsetInDwords = 10 +userDataNode[1].next[2].sizeInDwords = 8 +userDataNode[1].next[2].set = 0x0000005D +userDataNode[1].next[2].binding = 1 +userDataNode[1].next[2].strideInDwords = 0 +userDataNode[2].visibility = 4 +userDataNode[2].type = StreamOutTableVaPtr +userDataNode[2].offsetInDwords = 2 +userDataNode[2].sizeInDwords = 1 + +[GraphicsPipelineState] +topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST +colorBuffer[0].format = VK_FORMAT_R8G8B8A8_UNORM +colorBuffer[0].channelWriteMask = 15 +colorBuffer[0].blendEnable = 0 +colorBuffer[0].blendSrcAlphaToColor = 0 +nggState.enableNgg = 1 +nggState.forceCullingMode = 1 +nggState.compactVertex = 1 + +[VertexInputState] +binding[0].binding = 0 +binding[0].stride = 36 +binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX +attribute[0].location = 0 +attribute[0].binding = 0 +attribute[0].format = VK_FORMAT_R32G32B32A32_SFLOAT +attribute[0].offset = 0 +attribute[0].vbAddressLowBits = 0 +attribute[1].location = 1 +attribute[1].binding = 0 +attribute[1].format = VK_FORMAT_R32G32B32A32_SFLOAT +attribute[1].offset = 16 +attribute[1].vbAddressLowBits = 0 +attribute[2].location = 2 +attribute[2].binding = 0 +attribute[2].format = VK_FORMAT_R32_SINT +attribute[2].offset = 32 +attribute[2].vbAddressLowBits = 0 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestColorFormat_A8.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestColorFormat_A8.pipe new 
file mode 100644 index 0000000000..cd2a1d69e7 --- /dev/null +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestColorFormat_A8.pipe @@ -0,0 +1,264 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --check-pal-metadata +; Test color export format: VK_FORMAT_A8_UNORM_KHR which must contain alpha channel. + +; RUN: amdllpc -o - -filetype=asm %s | FileCheck -check-prefix=CHECK %s + +[Version] +version = 65 + +[VsGlsl] +#version 450 +layout(push_constant, std430) uniform PushConstants +{ + float texWidth; + float texHeight; +} pushc; + +layout(location = 0) in vec2 inPosition; +layout(location = 1) in vec2 inTexCoord; +layout(location = 0) out vec2 fragTexCoord; +void main() { + gl_Position = vec4(inPosition, 0.0, 1.0); + fragTexCoord = vec2(inTexCoord.x, inTexCoord.y); +} + +[VsInfo] +entryPoint = main + +[FsGlsl] +#version 450 +layout(set = 0, binding = 0) uniform sampler2D texSampler; +layout(location = 0) in vec2 fragTexCoord; +layout(location = 0) out vec4 outColor; +void main() { + outColor = texture(texSampler, fragTexCoord); +} + +[FsInfo] +entryPoint = main + +[ResourceMapping] +userDataNode[0].visibility = 64 +userDataNode[0].type = DescriptorTableVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].next[0].type = DescriptorCombinedTexture +userDataNode[0].next[0].offsetInDwords = 0 +userDataNode[0].next[0].sizeInDwords = 12 +userDataNode[0].next[0].set = 0x00000000 +userDataNode[0].next[0].binding = 0 +userDataNode[0].next[0].strideInDwords = 0 +userDataNode[1].visibility = 66 +userDataNode[1].type = PushConst +userDataNode[1].offsetInDwords = 1 +userDataNode[1].sizeInDwords = 2 +userDataNode[1].set = 0xFFFFFFFF +userDataNode[1].binding = 0 +userDataNode[1].strideInDwords = 0 +userDataNode[2].visibility = 4 +userDataNode[2].type = StreamOutTableVaPtr +userDataNode[2].offsetInDwords = 3 +userDataNode[2].sizeInDwords = 1 +userDataNode[3].visibility = 2 +userDataNode[3].type = IndirectUserDataVaPtr +userDataNode[3].offsetInDwords = 4 +userDataNode[3].sizeInDwords = 1 +userDataNode[3].indirectUserDataCount = 4 + +[GraphicsPipelineState] +topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST +colorBuffer[0].format = VK_FORMAT_A8_UNORM_KHR +colorBuffer[0].channelWriteMask = 15 +colorBuffer[0].blendEnable = 0 +colorBuffer[0].blendSrcAlphaToColor = 0 + +[VertexInputState] +binding[0].binding = 0 +binding[0].stride = 16 +binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX +attribute[0].location = 0 +attribute[0].binding = 0 +attribute[0].format = VK_FORMAT_R32G32_SFLOAT +attribute[0].offset = 0 +attribute[1].location = 1 +attribute[1].binding = 0 +attribute[1].format = VK_FORMAT_R32G32_SFLOAT +attribute[1].offset = 8 + +; CHECK-LABEL: amdgpu_vs_main: +; CHECK: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_mov_b32 s0, s1 +; CHECK-NEXT: s_mov_b32 s1, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[4:7], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: exp pos0 v0, v1, v5, v4 done +; CHECK-NEXT: exp param0 v2, v3, off, off +; CHECK-NEXT: s_endpgm +; +; CHECK-LABEL: amdgpu_ps_main: +; CHECK: s_mov_b64 s[12:13], exec +; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: s_mov_b32 s8, s1 +; CHECK-NEXT: s_getpc_b64 s[0:1] +; CHECK-NEXT: s_mov_b32 m0, s2 +; CHECK-NEXT: s_mov_b32 s9, s1 +; 
CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x20 +; CHECK-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x +; CHECK-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y +; CHECK-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x +; CHECK-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y +; CHECK-NEXT: s_and_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s12, s3, 0xfffffff +; CHECK-NEXT: s_cmp_lt_i32 s3, 0 +; CHECK-NEXT: s_cselect_b32 s3, s3, s12 +; CHECK-NEXT: image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 +; CHECK-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 +; CHECK-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; CHECK-NEXT: s_endpgm +; +; CHECK-LABEL: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .aa_coverage_to_shader_select: InputCoverage +; CHECK-NEXT: .cb_shader_mask: +; CHECK-NEXT: .output0_enable: 0xf +; CHECK-NEXT: .output1_enable: 0 +; CHECK-NEXT: .output2_enable: 0 +; CHECK-NEXT: .output3_enable: 0 +; CHECK-NEXT: .output4_enable: 0 +; CHECK-NEXT: .output5_enable: 0 +; CHECK-NEXT: .output6_enable: 0 +; CHECK-NEXT: .output7_enable: 0 +; CHECK-NEXT: .db_shader_control: +; CHECK-NEXT: .alpha_to_mask_disable: true +; CHECK-NEXT: .conservative_z_export: 0 +; CHECK-NEXT: .depth_before_shader: 0 +; CHECK-NEXT: .exec_on_hier_fail: false +; CHECK-NEXT: .exec_on_noop: false +; CHECK-NEXT: .kill_enable: false +; CHECK-NEXT: .mask_export_enable: false +; CHECK-NEXT: .pre_shader_depth_coverage_enable: 0 +; CHECK-NEXT: .primitive_ordered_pixel_shader: false +; CHECK-NEXT: .stencil_test_val_export_enable: 0 +; CHECK-NEXT: .z_export_enable: 0 +; CHECK-NEXT: .z_order: 0x1 +; CHECK-NEXT: .ia_multi_vgt_param: +; CHECK-NEXT: .primgroup_size: 0x7f +; CHECK-NEXT: .pa_cl_clip_cntl: +; CHECK-NEXT: .dx_linear_attr_clip_ena: true +; CHECK-NEXT: .rasterization_kill: false +; CHECK-NEXT: .vte_vport_provoke_disable: false +; CHECK-NEXT: .pa_cl_vte_cntl: +; CHECK-NEXT: .vtx_w0_fmt: true +; CHECK-NEXT: .x_offset_ena: true +; CHECK-NEXT: .x_scale_ena: true +; CHECK-NEXT: .y_offset_ena: true +; CHECK-NEXT: .y_scale_ena: true +; CHECK-NEXT: .z_offset_ena: true +; CHECK-NEXT: .z_scale_ena: true +; CHECK-NEXT: .pa_sc_shader_control: +; CHECK-NEXT: .wave_break_region_size: 0 +; CHECK-NEXT: .pa_su_vtx_cntl: +; CHECK-NEXT: .pix_center: 0x1 +; CHECK-NEXT: .quant_mode: 0x5 +; CHECK-NEXT: .round_mode: 0x2 +; CHECK-NEXT: .ps_extra_lds_size: 0 +; CHECK-NEXT: .ps_iter_sample: false +; CHECK-NEXT: .spi_baryc_cntl: +; CHECK-NEXT: .front_face_all_bits: true +; CHECK-NEXT: .pos_float_location: 0 +; CHECK-NEXT: .spi_ps_in_control: +; CHECK-NEXT: .num_interps: 0x1 +; CHECK-NEXT: .ps_w32_en: false +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: true +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: false +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: 
.sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_cntl: +; CHECK-NEXT: - .attr0_valid: 0 +; CHECK-NEXT: .attr1_valid: 0 +; CHECK-NEXT: .flat_shade: false +; CHECK-NEXT: .fp16_interp_mode: false +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .prim_attr: false +; CHECK-NEXT: .pt_sprite_tex: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: true +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: false +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_shader_col_format: +; CHECK-NEXT: .col_0_export_format: 0x4 +; CHECK-NEXT: .col_1_export_format: 0 +; CHECK-NEXT: .col_2_export_format: 0 +; CHECK-NEXT: .col_3_export_format: 0 +; CHECK-NEXT: .col_4_export_format: 0 +; CHECK-NEXT: .col_5_export_format: 0 +; CHECK-NEXT: .col_6_export_format: 0 +; CHECK-NEXT: .col_7_export_format: 0 +; CHECK-NEXT: .spi_shader_pos_format: +; CHECK-NEXT: - 0x4 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: .spi_vs_out_config: +; CHECK-NEXT: .vs_export_count: 0 +; CHECK-NEXT: .vgt_reuse_off: false +; CHECK-NEXT: .vgt_shader_stages_en: +; CHECK-NEXT: .max_primgroup_in_wave: 0x2 +; CHECK-NEXT: .vs_stage_en: 0 +; CHECK-NEXT: .vs_w32_en: true +; CHECK-NEXT: .vgt_strmout_buffer_config: +; CHECK-NEXT: .stream_0_buffer_en: 0 +; CHECK-NEXT: .stream_1_buffer_en: 0 +; CHECK-NEXT: .stream_2_buffer_en: 0 +; CHECK-NEXT: .stream_3_buffer_en: 0 +; CHECK-NEXT: .vgt_strmout_config: +; CHECK-NEXT: .streamout_0_en: false +; CHECK-NEXT: .streamout_1_en: false +; CHECK-NEXT: .streamout_2_en: false +; CHECK-NEXT: .streamout_3_en: false +; CHECK-NEXT: .vs_so_base0_en: false +; CHECK-NEXT: .vs_so_base1_en: false +; CHECK-NEXT: .vs_so_base2_en: false +; CHECK-NEXT: .vs_so_base3_en: false +; CHECK-NEXT: .vs_streamout_en: false +; diff --git a/llpc/test/shaderdb/general/PipelineVsFs_Test_unused_outputs.pipe b/llpc/test/shaderdb/general/PipelineVsFs_Test_unused_outputs.pipe new file mode 100644 index 0000000000..f8c5ef348a --- /dev/null +++ b/llpc/test/shaderdb/general/PipelineVsFs_Test_unused_outputs.pipe @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py +; RUN: amdllpc -v -gfxip 11.0 %s | FileCheck -check-prefix=CHECK %s + +[Version] +version = 72 + +[FsGlsl] +#version 450 + +layout(location = 0) out vec4 outColor[4]; + +void main() { + outColor[0] = vec4(1.0, 0.0, 1.0, 1.0); + outColor[3] = vec4(1.0, 0.0, 1.0, 1.0); +} + +[FsInfo] +entryPoint = main + +[GraphicsPipelineState] +colorBuffer[0].format = VK_FORMAT_R32G32B32A32_SFLOAT +colorBuffer[0].channelWriteMask = 15 +colorBuffer[0].blendEnable = 0 +colorBuffer[1].format = VK_FORMAT_R32G32B32A32_SFLOAT +colorBuffer[1].channelWriteMask = 15 +colorBuffer[1].blendEnable = 0 +colorBuffer[2].format = VK_FORMAT_R32G32B32A32_SFLOAT +colorBuffer[2].channelWriteMask = 15 +colorBuffer[2].blendEnable = 0 +colorBuffer[3].format = VK_FORMAT_R32G32B32A32_SFLOAT +colorBuffer[3].channelWriteMask = 15 +colorBuffer[3].blendEnable = 0 +enableColorExportShader = 1 +; CHECK-LABEL: 
@lgc.shader.FS.main(
+; CHECK-NEXT: .entry:
+; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 1, i32 0, <4 x float> poison) #[[ATTR1]]
+; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 2, i32 0, <4 x float> poison) #[[ATTR1]]
+; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 3, i32 0, <4 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>) #[[ATTR1]]
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: @_amdgpu_ps_main(
+; CHECK-NEXT: .entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[COMPOSITEDATA:%.*]], 7
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP0]], -4294967296
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[COLOREXPADDR:%.*]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4)
+; CHECK-NEXT: call amdgpu_gfx addrspace(4) void [[TMP6]](<4 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, i32 inreg [[TMP2]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: unreachable
+;
+;
+; CHECK-LABEL: @color_export_shader(
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], i1 false, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1:%.*]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 true, i1 true)
+; CHECK-NEXT: call void @llvm.amdgcn.endpgm()
+; CHECK-NEXT: unreachable
+;
diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe
new file mode 100644
index 0000000000..09db5cfe81
--- /dev/null
+++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe
@@ -0,0 +1,151 @@
+; Check that the ray tracing continuations mode is working.
+; Generating the instruction 'image_bvh64_intersect_ray' indicates the trace ray library is linked correctly.
+
+; TODO: Change this to ISA / assembly output checks once the LLVM backend has settled
+
+; RUN: amdllpc -gfxip 11.0 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK %s
+
+; CHECK-LABEL: @_amdgpu_cs_main(
+; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain.
+
+; CHECK-LABEL: @_rgen_1(
+; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain.
+
+; CHECK-LABEL: @_rgen_1.resume.0(
+; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain.
+; CHECK: unreachable
+; CHECK: ret void
+
+; CHECK-LABEL: @_chit_2(
+; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain.
+
+; CHECK-LABEL: @_cs_(
+; CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.
+; CHECK-NOT: ret void
+; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain.
+; CHECK-NOT: ret void + +[Version] +version = 69 + +[rgenGlsl] +#version 460 +#extension GL_EXT_ray_tracing : require + +struct RayPayload { + vec3 color; +}; + +layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; +layout(binding = 1, set = 0, rgba32f) uniform image2D g_dst; + +layout(location = 14) rayPayloadEXT RayPayload g_ray; + +void main() { + vec3 origin; + origin.x = gl_LaunchIDEXT.x; + origin.y = gl_LaunchIDEXT.y; + origin.z = 0; + + traceRayEXT(g_bvh, /* ray flags */ 0, /* cull mask */ 0xff, + /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0, + origin.xyz, /* tmin */ 0.0, /* direction */ vec3(1, 0, 0), + /* tmax */ 48.0, /* payload location */ 14); + + imageStore(g_dst, ivec2(gl_LaunchIDEXT.xy), vec4(g_ray.color, 0)); +} + +[rgenInfo] +entryPoint = main + +[chitGlsl] +#version 460 +#extension GL_EXT_ray_tracing : require + +struct RayPayload { + vec3 color; +}; + +layout(shaderRecordEXT, std430) buffer sbt { + float z; +}; + +hitAttributeEXT vec2 g_hit; +rayPayloadInEXT RayPayload g_ray; + +void main() { + g_ray.color.xy = g_hit; + g_ray.color.z = z; +} + +[chitInfo] +entryPoint = main + +[ResourceMapping] +userDataNode[0].visibility = 0xffffffff +userDataNode[0].type = DescriptorTableVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].next[0].type = DescriptorConstBuffer +userDataNode[0].next[0].offsetInDwords = 0 +userDataNode[0].next[0].sizeInDwords = 4 +userDataNode[0].next[0].set = 0x00000000 +userDataNode[0].next[0].binding = 0 +userDataNode[0].next[1].type = DescriptorImage +userDataNode[0].next[1].offsetInDwords = 4 +userDataNode[0].next[1].sizeInDwords = 8 +userDataNode[0].next[1].set = 0x00000000 +userDataNode[0].next[1].binding = 1 +userDataNode[1].visibility = 0xffffffff +userDataNode[1].type = DescriptorTableVaPtr +userDataNode[1].offsetInDwords = 1 +userDataNode[1].sizeInDwords = 1 +userDataNode[1].next[0].type = DescriptorConstBufferCompact +userDataNode[1].next[0].offsetInDwords = 0 +userDataNode[1].next[0].sizeInDwords = 2 +userDataNode[1].next[0].set = 0x0000005D +userDataNode[1].next[0].binding = 17 +userDataNode[1].next[1].type = DescriptorConstBuffer +userDataNode[1].next[1].offsetInDwords = 2 +userDataNode[1].next[1].sizeInDwords = 4 +userDataNode[1].next[1].set = 0x0000005D +userDataNode[1].next[1].binding = 0 +userDataNode[1].next[2].type = DescriptorBuffer +userDataNode[1].next[2].offsetInDwords = 6 +userDataNode[1].next[2].sizeInDwords = 4 +userDataNode[1].next[2].set = 0x0000005D +userDataNode[1].next[2].binding = 1 + +[RayTracingPipelineState] +groups[0].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR +groups[0].generalShader = 0 +groups[0].closestHitShader = -1 +groups[0].anyHitShader = -1 +groups[0].intersectionShader = -1 +groups[1].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR +groups[1].closestHitShader = 1 +maxRecursionDepth = 1 +indirectStageMask = 0xffffffff +mode = 3 +rtState.bvhResDescSize = 4 +rtState.bvhResDesc[0] = 0 +rtState.bvhResDesc[1] = 2197815296 +rtState.bvhResDesc[2] = 4294967295 +rtState.bvhResDesc[3] = 2164261887 +rtState.nodeStrideShift = 7 +rtState.threadGroupSizeX = 8 +rtState.threadGroupSizeY = 4 +rtState.threadGroupSizeZ = 1 +rtState.rayQueryCsSwizzle = 1 +rtState.ldsStackSize = 16 +rtState.dispatchRaysThreadGroupSize = 32 +rtState.ldsSizePerThreadGroup = 65536 +rtState.outerTileSize = 4 +rtState.dispatchDimSwizzleMode = 0 +rtState.enableDispatchRaysInnerSwizzle = 1 +rtState.enableDispatchRaysOuterSwizzle = 1 
+rtState.enableOptimalLdsStackSizeForIndirect = 1 +rtState.enableOptimalLdsStackSizeForUnified = 1 +payloadSizeMaxInLib = 12 +attributeSizeMaxInLib = 8 +hasPipelineLibrary = 1 diff --git a/llpc/test/shaderdb/ray_tracing/TestHitAttribute.rint b/llpc/test/shaderdb/ray_tracing/TestHitAttribute.rint new file mode 100644 index 0000000000..1fe4112fc3 --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/TestHitAttribute.rint @@ -0,0 +1,18 @@ +// BEGIN_SHADERTEST +/* +; Check that a hit attribute value that is not reported is never written out. +; RUN: amdllpc -filetype=asm %gfxip -o - %s | FileCheck -check-prefix=CHECK %s +; CHECK: 0x12345678 +; CHECK-NOT: 0x87654321 +*/ +// END_SHADERTEST + +#version 460 +#extension GL_EXT_ray_tracing : require +hitAttributeEXT int attr; +void main() +{ + attr = 0x12345678; + reportIntersectionEXT(1.0f, 0); + attr = 0x87654321; +} diff --git a/llpc/test/shaderdb/ray_tracing/TestProcessGpuRtLibrary.rgen b/llpc/test/shaderdb/ray_tracing/TestProcessGpuRtLibrary.rgen new file mode 100644 index 0000000000..9cc5a5e406 --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/TestProcessGpuRtLibrary.rgen @@ -0,0 +1,16 @@ +// RUN: amdllpc %gfxip --print-after=lower-gpurt-library 2>&1 %s | FileCheck -check-prefix=CHECK %s +#version 460 +#extension GL_EXT_ray_tracing : enable + +void main() +{ +} +// Check that the bodies of these _Amd intrinsics are deleted. +// CHECK: declare dso_local spir_func i32 @_AmdGetShaderKind() +// CHECK: declare dso_local spir_func i32 @_AmdGetResumePointAddr() +// CHECK: declare dso_local spir_func {{.*}} @_AmdAwait{{.*}}( + +// Check that functions starting with `_Amd` that are not intrinsics are preserved. +// CHECK-NOT: declare dso_local spir_func {{.*}} @_AmdSystemData.{{.*}}( +// CHECK-NOT: declare dso_local spir_func {{.*}} @_AmdDispatchSystemData.{{.*}}( +// CHECK-NOT: declare dso_local spir_func {{.*}} @_AmdPrimitiveSystemState.{{.*}}( diff --git a/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp b/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp new file mode 100644 index 0000000000..69a4a8f823 --- /dev/null +++ b/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp @@ -0,0 +1,226 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ * + **********************************************************************************************************************/ + +//===- DXILContPrepareGpurtLibrary.cpp - Change signature of functions -------===// +// +// A pass that prepares driver-implemented functions for later use. +// +// This pass unmangles function names and changes sret arguments back to +// return values. +// +//===----------------------------------------------------------------------===// + +#include "compilerutils/ArgPromotion.h" +#include "compilerutils/DxilUtils.h" +#include "llvmraytracing/Continuations.h" +#include "llvmraytracing/ContinuationsUtil.h" +#include "lgc/LgcRtDialect.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "dxil-cont-prepare-gpurt-library" + +DXILContPrepareGpurtLibraryPass::DXILContPrepareGpurtLibraryPass() { +} + +/// - Unmangle the function names to be more readable and to prevent confusion +/// with app-defined functions later. +/// - Convert sret arguments back to return values +/// - Convert struct pointer arguments to pass structs by value +static Function *transformFunction(Function &F) { + { + // Local scope for Name, which is invalidated at the end. + auto Name = F.getName(); + LLVM_DEBUG(dbgs() << "Transforming function " << Name << "\n"); + // Copy the name; otherwise it would be deleted before it is set. + std::string NewName = CompilerUtils::dxil::tryDemangleFunctionName(Name.str()).str(); + + LLVM_DEBUG(dbgs() << " Set new name " << NewName << "\n"); + F.setName(NewName); + + if (NewName == ContDriverFunc::TraversalName) + lgc::rt::setLgcRtShaderStage(&F, lgc::rt::RayTracingShaderStage::Traversal); + else if (NewName == ContDriverFunc::KernelEntryName) + lgc::rt::setLgcRtShaderStage(&F, lgc::rt::RayTracingShaderStage::KernelEntry); + } + + // Unpack the inner type of @class.matrix types. + Type *NewRetTy = F.getReturnType(); + Function *NewFn = &F; + if (NewRetTy->isStructTy() && NewRetTy->getStructNumElements() == 1) { + if (F.getName().contains("ObjectToWorld4x3") || F.getName().contains("WorldToObject4x3")) { + NewFn = CompilerUtils::unpackStructReturnType(NewFn); + } + } + + // Lower `StructRet` argument. + if (NewFn->hasStructRetAttr()) + NewFn = CompilerUtils::lowerStructRetArgument(NewFn); + + SmallBitVector PromotionMask(NewFn->arg_size()); + + StringRef NameStr = NewFn->getName(); + for (unsigned ArgNo = 0; ArgNo < NewFn->arg_size(); ArgNo++) { + auto *Arg = NewFn->getArg(ArgNo); + TypedArgTy ArgTy = TypedArgTy::get(Arg); + if (!ArgTy.isPointerTy()) + continue; + + if ((NameStr.contains("Await") || NameStr.contains("Enqueue") || NameStr.contains("Traversal") || + (NameStr == ContDriverFunc::SetTriangleHitAttributesName && ArgNo != 0))) + PromotionMask.set(ArgNo); + } + // Promote pointer arguments to their pointee value types.
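+ // As an illustration (taken from the dxil-cont-prepare-gpurt-library.ll test below), a + // driver declaration such as + // declare void @"\01?_AmdAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* sret(%struct.DispatchSystemData), i64, %struct.TraversalData*) + // ends up, after demangling, sret lowering, and this promotion, as + // declare %struct.DispatchSystemData @_AmdAwait(i64, %struct.TraversalData)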
+ NewFn = CompilerUtils::promotePointerArguments(NewFn, PromotionMask); + + NewFn->addFnAttr(Attribute::AlwaysInline); + // Set external linkage so the functions don't get removed, even if they are + // never referenced at this point. + NewFn->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); + return NewFn; +} + +static bool isGpuRtFuncName(StringRef Name) { + for (const auto &Intr : LgcRtGpuRtMap) { + if (Name.contains(Intr.second.Name)) + return true; + } + + return false; +} + +static bool isUtilFunction(StringRef Name) { + static const char *UtilNames[] = { + "AcceptHit", + "Await", + "Complete", + "ContinuationStackIsGlobal", + "ContStack", + "Enqueue", // To detect the mangled name of a declaration + "ExitRayGen", + "GetCandidateState", + "GetCommittedState", + "GetContinuationStackAddr", + "GetContinuationStackGlobalMemBase", + "GetCurrentFuncAddr", + "GetFuncAddr", + "GetI32", + "GetLocalRootIndex", + "GetResumePointAddr", + "GetRtip", + "GetSetting", + "GetShaderKind", + "GetTriangleHitAttributes", + "GetUninitialized", + "GpurtVersionFlags", + "I32Count", + "IsEndSearch", + "KernelEntry", + "ReportHit", + "RestoreSystemData", + "SetI32", + "SetTriangleHitAttributes", + "TraceRay", + "Traversal", + "ShaderStart", + }; + + for (const char *UtilName : UtilNames) { + if (Name.contains(UtilName)) + return true; + } + + return false; +} + +static void handleIsLlpc(Function &Func) { + assert(Func.arg_empty() + // bool + && Func.getFunctionType()->getReturnType()->isIntegerTy(1)); + + auto *FalseConst = ConstantInt::getFalse(Func.getContext()); + llvm::replaceCallsToFunction(Func, *FalseConst); +} + +static void handleGetShaderRecordIndex(llvm_dialects::Builder &B, Function &Func) { + assert(Func.arg_empty() + // i32 + && Func.getFunctionType()->getReturnType()->isIntegerTy(32)); + + llvm::forEachCall(Func, [&](CallInst &CInst) { + B.SetInsertPoint(&CInst); + auto *ShaderIndexCall = B.create<lgc::rt::ShaderIndexOp>(); + CInst.replaceAllUsesWith(ShaderIndexCall); + CInst.eraseFromParent(); + }); +} + +llvm::PreservedAnalyses DXILContPrepareGpurtLibraryPass::run(llvm::Module &M, + llvm::ModuleAnalysisManager &AnalysisManager) { + LLVM_DEBUG(dbgs() << "Run the dxil-cont-prepare-gpurt-library pass\n"); + + AnalysisManager.getResult<DialectContextAnalysis>(M); + + SmallVector<Function *> Funcs(make_pointer_range(M.functions())); + + llvm_dialects::Builder B{M.getContext()}; + + for (auto *F : Funcs) { + auto Name = F->getName(); + bool ShouldTransform = false; + + if (Name.contains("_cont_")) { + if (isGpuRtFuncName(Name)) + ShouldTransform = true; + else if (isUtilFunction(Name)) + ShouldTransform = true; + } else if (Name.contains("_Amd")) { + if (isUtilFunction(Name)) { + ShouldTransform = true; + } else if (Name.contains("IsLlpc")) { + ShouldTransform = false; + handleIsLlpc(*F); + } else if (Name.contains("GetShaderRecordIndex")) { + ShouldTransform = false; + handleGetShaderRecordIndex(B, *F); + } + } + + if (ShouldTransform) + transformFunction(*F); + } + + fixupDxilMetadata(M); + + earlyGpurtTransform(M); + + return PreservedAnalyses::none(); +} diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll b/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll new file mode 100644 index 0000000000..e8b6b2646e --- /dev/null +++ b/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt --verify-each
-passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +%struct.DispatchSystemData = type { i32 } +%struct.TraversalData = type { %struct.SystemData, i32, i64 } +%struct.SystemData = type { %struct.DispatchSystemData, float } + +; Function Attrs: nounwind memory(none) +define i32 @_cont_GetContinuationStackAddr() #0 { + ret i32 1 +} + +; Function Attrs: nounwind +define void @_cont_TraceRay(%struct.DispatchSystemData* noalias nocapture sret(%struct.DispatchSystemData) %agg.result, %struct.DispatchSystemData* nocapture readonly %data, i64 %accelStruct, i32 %rayFlags, i32 %instanceInclusioMask, i32 %rayContributionToHitGroupIndex, i32 %multiplierForGeometryContributionToShaderIndex, i32 %missShaderIndex, float %originX, float %originY, float %originZ, float %tMin, float %dirX, float %dirY, float %dirZ, float %tMax) #1 !pointeetys !2 { + %1 = alloca %struct.TraversalData, align 4 + %2 = alloca %struct.DispatchSystemData, align 4 + %3 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0 + %4 = load i32, i32* %3, align 4 + %5 = bitcast %struct.TraversalData* %1 to i8* + call void @llvm.lifetime.start.p0i8(i64 12, i8* %5) #3 + %6 = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 0, i32 0, i32 0 + store i32 %4, i32* %6, align 4 + %addr = call i64 @_AmdGetResumePointAddr() #3 + %a = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 2 + store i64 %addr, i64* %a, align 4 + call void @"\01?_AmdWaitAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* nonnull sret(%struct.DispatchSystemData) %2, i64 3, i64 -1, %struct.TraversalData* nonnull %1) #3 + %7 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %2, i32 0, i32 0 + %8 = load i32, i32* %7, align 4 + %9 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %agg.result, i32 0, i32 0 + store i32 %8, i32* %9, align 4 + call void @llvm.lifetime.end.p0i8(i64 12, i8* %5) #3 + ret void +} + +declare !pointeetys !3 void @"\01?_AmdWaitAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* sret(%struct.DispatchSystemData), i64, i64, %struct.TraversalData*) #2 + +; Function Attrs: nounwind +declare i64 @_AmdGetResumePointAddr() #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !5 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !5 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 + +attributes #0 = { nounwind memory(none) "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = 
{ "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!0 = !{%struct.DispatchSystemData poison} +!1 = !{i32 0, %struct.DispatchSystemData poison} +!2 = !{null, %struct.DispatchSystemData poison, %struct.DispatchSystemData poison} +!3 = !{null, %struct.DispatchSystemData poison, null, null, %struct.TraversalData poison} +!4 = !{i32 0, %struct.TraversalData poison} +!5 = !{i8 poison} +!6 = !{i32 0, i8 poison} +; CHECK-LABEL: define i32 @_cont_GetContinuationStackAddr( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret i32 1 +; +; +; CHECK-LABEL: define %struct.DispatchSystemData @_cont_TraceRay( +; CHECK-SAME: ptr nocapture readonly [[DATA:%.*]], i64 [[ACCELSTRUCT:%.*]], i32 [[RAYFLAGS:%.*]], i32 [[INSTANCEINCLUSIOMASK:%.*]], i32 [[RAYCONTRIBUTIONTOHITGROUPINDEX:%.*]], i32 [[MULTIPLIERFORGEOMETRYCONTRIBUTIONTOSHADERINDEX:%.*]], i32 [[MISSSHADERINDEX:%.*]], float [[ORIGINX:%.*]], float [[ORIGINY:%.*]], float [[ORIGINZ:%.*]], float [[TMIN:%.*]], float [[DIRX:%.*]], float [[DIRY:%.*]], float [[DIRZ:%.*]], float [[TMAX:%.*]]) #[[ATTR1:[0-9]+]] !pointeetys [[META0:![0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_TRAVERSALDATA:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[DATA]], i32 0, i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast ptr [[TMP1]] to ptr +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[TMP6]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 0, i32 0, i32 0 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR5]] +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 2 +; CHECK-NEXT: store i64 [[ADDR]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAIT:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) #[[ATTR5]], !waitmask [[META1:![0-9]+]] +; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], i32 0, i32 0 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP12]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[TMP6]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP13:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], align 4 +; CHECK-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]] +; diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll b/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll new file mode 100644 index 0000000000..29129961a8 --- /dev/null +++ 
b/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +%struct.DispatchSystemData = type { i32 } +%struct.TraversalData = type { %struct.SystemData, i32, i32 } +%struct.SystemData = type { %struct.DispatchSystemData, float } + +; Function Attrs: nounwind memory(none) +define i32 @_cont_GetContinuationStackAddr() #0 { + ret i32 1 +} + +; Function Attrs: nounwind +define void @_cont_TraceRay(%struct.DispatchSystemData* noalias nocapture sret(%struct.DispatchSystemData) %agg.result, %struct.DispatchSystemData* nocapture readonly %data, i64 %accelStruct, i32 %rayFlags, i32 %instanceInclusioMask, i32 %rayContributionToHitGroupIndex, i32 %multiplierForGeometryContributionToShaderIndex, i32 %missShaderIndex, float %originX, float %originY, float %originZ, float %tMin, float %dirX, float %dirY, float %dirZ, float %tMax) #1 !pointeetys !2 { + %1 = alloca %struct.TraversalData, align 4 + %2 = alloca %struct.DispatchSystemData, align 4 + %3 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0 + %4 = load i32, i32* %3, align 4 + %5 = bitcast %struct.TraversalData* %1 to i8* + call void @llvm.lifetime.start.p0i8(i64 12, i8* %5) #3 + %6 = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 0, i32 0, i32 0 + store i32 %4, i32* %6, align 4 + %addr = call i32 @_AmdGetResumePointAddr() #3 + %a = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 2 + store i32 %addr, i32* %a, align 4 + call void @"\01?_AmdAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* nonnull sret(%struct.DispatchSystemData) %2, i64 3, %struct.TraversalData* nonnull %1) #3 + %7 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %2, i32 0, i32 0 + %8 = load i32, i32* %7, align 4 + %9 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %agg.result, i32 0, i32 0 + store i32 %8, i32* %9, align 4 + call void @llvm.lifetime.end.p0i8(i64 12, i8* %5) #3 + ret void +} + +declare !pointeetys !3 void @"\01?_AmdAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* sret(%struct.DispatchSystemData), i64, %struct.TraversalData*) #2 + +; Function Attrs: nounwind +declare i32 @_AmdGetResumePointAddr() #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !5 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !5 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 + +attributes #0 = { nounwind memory(none) "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!0 = !{%struct.DispatchSystemData poison} +!1 = !{i32 0, %struct.DispatchSystemData poison} +!2 = !{null, %struct.DispatchSystemData poison, %struct.DispatchSystemData poison} +!3 = !{null, %struct.DispatchSystemData poison, null, %struct.TraversalData poison} +!4 = !{i32 0, %struct.TraversalData poison} +!5 = !{i8 poison} +!6 = !{i32 0, i8 poison} +; CHECK-LABEL: define i32 @_cont_GetContinuationStackAddr( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret i32 1 +; +; +; CHECK-LABEL: define %struct.DispatchSystemData @_cont_TraceRay( +; CHECK-SAME: ptr nocapture readonly [[DATA:%.*]], i64 [[ACCELSTRUCT:%.*]], i32 [[RAYFLAGS:%.*]], i32 [[INSTANCEINCLUSIOMASK:%.*]], i32 [[RAYCONTRIBUTIONTOHITGROUPINDEX:%.*]], i32 [[MULTIPLIERFORGEOMETRYCONTRIBUTIONTOSHADERINDEX:%.*]], i32 [[MISSSHADERINDEX:%.*]], float [[ORIGINX:%.*]], float [[ORIGINY:%.*]], float [[ORIGINZ:%.*]], float [[TMIN:%.*]], float [[DIRX:%.*]], float [[DIRY:%.*]], float [[DIRZ:%.*]], float [[TMAX:%.*]]) #[[ATTR1:[0-9]+]] !pointeetys [[META0:![0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_TRAVERSALDATA:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[DATA]], i32 0, i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast ptr [[TMP1]] to ptr +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[TMP6]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 0, i32 0, i32 0 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = call i32 @_AmdGetResumePointAddr() #[[ATTR5]] +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 2 +; CHECK-NEXT: store i32 [[ADDR]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAIT:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) +; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], i32 0, i32 0 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP12]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[TMP6]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP13:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], align 4 +; CHECK-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]] +; diff --git 
a/llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll b/llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll new file mode 100644 index 0000000000..82dce60286 --- /dev/null +++ b/llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll @@ -0,0 +1,26 @@ +; NOTE: Do not autogenerate +; RUN: opt --verify-each -passes='continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s + +; CHECK-NOT: Found a function with more than one call to setLocalRootIndex +; CHECK-LABEL: define void @RayGen( + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +%struct.DispatchSystemData = type { i32 } + +declare void @amd.dx.setLocalRootIndex(i32) +declare void @lgc.cps.await__void(...) + +define void @RayGen(i32 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !0 !continuation.entry !1 !continuation !2 { + call void @amd.dx.setLocalRootIndex(i32 0) + call void (...) @lgc.cps.await__void(i32 2, i32 3) + call void @amd.dx.setLocalRootIndex(i32 5) + ret void +} + +!continuation.stackAddrspace = !{!3} + +!0 = !{i32 0} +!1 = !{} +!2 = !{void ()* @RayGen} +!3 = !{i32 21} diff --git a/llvmraytracing/test/dx/remat-indirect-load.ll b/llvmraytracing/test/dx/remat-indirect-load.ll new file mode 100644 index 0000000000..73ccacc1f6 --- /dev/null +++ b/llvmraytracing/test/dx/remat-indirect-load.ll @@ -0,0 +1,46 @@ +; NOTE: Do not autogenerate +; RUN: opt -debug-only=dxil-coro-split -passes='dxil-coro-split' -S %s 2>&1 | FileCheck %s +; +; Test that an indirect handle load pattern does not produce a rematerialization +; warning. We know that remat in this case is not profitable. 
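+; (The handle for the second rawBufferLoad is created via dx.op.createHandleFromHeap from a +; value that is itself loaded from memory, so rematerializing the second load would also +; redo the first load and the handle creation.)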
+; +; REQUIRES: assertions + +; CHECK-NOT: Warning: isRematerializableDxilLoad unhandled pattern: {{.*}} = extractvalue %dx.types.ResRet.i32 + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target triple = "dxil-ms-dx" + +%dx.types.ResRet.i32 = type { i32, i32, i32, i32, i32 } +%dx.types.Handle = type { ptr } + +; Function Attrs: presplitcoroutine +define { ptr, ptr } @"indirect_handle_load"() #0 { +_cont_RayTCurrent.exit: + %0 = call token @llvm.coro.id.retcon(i32 0, i32 0, ptr null, ptr @"continuation.prototype.indirect_handle_load", ptr @continuation.malloc, ptr @continuation.free) + %1 = call ptr @llvm.coro.begin(token %0, ptr null) + %2 = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 0, %dx.types.Handle zeroinitializer) + %3 = extractvalue %dx.types.ResRet.i32 %2, 0 + %4 = call %dx.types.Handle @dx.op.createHandleFromHeap(i32 0, i32 %3) + %5 = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 0, %dx.types.Handle %4) + ret { ptr, ptr } zeroinitializer +} + +declare %dx.types.Handle @dx.op.createHandleFromHeap(i32, i32) + +declare %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32, %dx.types.Handle) + +declare ptr @continuation.malloc(i32) + +declare void @continuation.free(ptr) + +; Function Attrs: nounwind +declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #1 + +; Function Attrs: nounwind +declare ptr @llvm.coro.begin(token, ptr writeonly) #1 + +declare { ptr, ptr } @"continuation.prototype.indirect_handle_load"(ptr) + +attributes #0 = { presplitcoroutine } +attributes #1 = { nounwind } diff --git a/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll b/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll new file mode 100644 index 0000000000..71cfff438d --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +declare void @lgc.cps.jump(...) noreturn + +define void @test(i32 %arg, ptr %table) !lgc.cps !0 !lgc.shaderstage !{i32 7} !continuation !{ptr @test} { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) !lgc.cps [[META1:![0-9]+]] !lgc.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TABLE_0:%.*]] = getelementptr i32, ptr [[TABLE]], i32 0 +; CHECK-NEXT: [[CR_THEN:%.*]] = load i32, ptr [[TABLE_0]], align 4 +; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_THEN]], i32 2, i32 [[TMP0]], i32 [[THEN_ARG]]) +; CHECK-NEXT: unreachable +; +entry: + %table.0 = getelementptr i32, ptr %table, i32 0 + %cr.then = load i32, ptr %table.0 + %then.arg = add i32 %arg, 1 + call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 %then.arg) + unreachable +} +!continuation.stackAddrspace = !{!1} + +!0 = !{i32 1} ; level 1 +!1 = !{i32 5} +; +;. 
+; CHECK: [[META1]] = !{i32 1} +; CHECK: [[META2]] = !{i32 7} +; CHECK: [[META3]] = !{ptr @test} +; CHECK: [[META4]] = !{i32 0} +;. diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll new file mode 100644 index 0000000000..929f97d738 --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +declare void @lgc.cps.jump(...) #0 + +declare void @lgc.cps.set.vsp(ptr addrspace(32)) #1 + +declare ptr addrspace(32) @lgc.cps.get.vsp() #2 + +define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc.shaderstage !3 !lgc.rt.shaderstage !3 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.CS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !lgc.shaderstage [[META3:![0-9]+]] !lgc.rt.shaderstage [[META3]] { +; CHECK-NEXT: [[_ENTRY:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DESC:%.*]] = call <4 x i32> @lgc.load.user.data__v4i32(i32 0) +; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[DESC]]) +; CHECK-NEXT: [[P0:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 0 +; CHECK-NEXT: [[I_VSP:%.*]] = load i32, ptr addrspace(7) [[P0]], align 4 +; CHECK-NEXT: store i32 [[I_VSP]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[P1:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 1 +; CHECK-NEXT: [[CR:%.*]] = load i32, ptr addrspace(7) [[P1]], align 4 +; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 2 +; CHECK-NEXT: [[ARG:%.*]] = load i32, ptr addrspace(7) [[P2]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 1, i32 [[TMP1]], i32 poison, i32 [[ARG]], i32 [[TMP0]]) +; CHECK-NEXT: unreachable +; +.entry: + %desc = call <4 x i32> @lgc.load.user.data__v4i32(i32 0) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %p0 = getelementptr i32, ptr addrspace(7) %ptr, i32 0 + %i_vsp = load i32, ptr addrspace(7) %p0, align 4 + %vsp = inttoptr i32 %i_vsp to ptr addrspace(32) + call void @lgc.cps.set.vsp(ptr addrspace(32) %vsp) + + %p1 = getelementptr i32, ptr addrspace(7) %ptr, i32 1 + %cr = load i32, ptr addrspace(7) %p1, align 4 + + %p2 = getelementptr i32, ptr addrspace(7) %ptr, i32 2 + %arg = load i32, ptr addrspace(7) %p2, align 4 + + %p32 = call ptr addrspace(32) @lgc.cps.get.vsp() + + call void (...) @lgc.cps.jump(i32 %cr, i32 1, i32 poison, i32 poison, i32 %arg, ptr addrspace(32) %p32) + unreachable +} + +declare <4 x i32> @lgc.load.user.data__v4i32(i32) #4 + +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #5 + +attributes #0 = { nounwind } +attributes #1 = { nounwind willreturn memory(inaccessiblemem: write) } +attributes #2 = { nounwind willreturn memory(inaccessiblemem: read) } +attributes #4 = { nounwind memory(none) } +attributes #5 = { nounwind willreturn memory(none) } + +!lgc.user.data.nodes = !{!1} +!llpc.compute.mode = !{!2} +!continuation.stackAddrspace = !{!4} + +!1 = !{!"DescriptorBuffer", i32 6, i32 6, i32 0, i32 4, i64 0, i32 0, i32 4} +!2 = !{i32 8, i32 4, i32 1, i32 0, i32 0, i32 1} +!3 = !{i32 7} +!4 = !{i32 5} +;. 
+; CHECK: [[META3]] = !{i32 7} +;. diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll new file mode 100644 index 0000000000..51e44a2df8 --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll @@ -0,0 +1,252 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +%_rgen_1.Frame = type { ptr addrspace(7), ptr addrspace(7), i32 } + +define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shaderstage !16 !continuation !18 !lgc.cps !17 { +; CHECK-LABEL: define spir_func void @_rgen_1( +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[RCR:%.*]]) #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META16:![0-9]+]] !lgc.shaderstage [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation.state [[META20:![0-9]+]] { +; CHECK-NEXT: [[_ENTRY:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 96 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 20) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr addrspace(4) [[TMP12]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP15]], 65535 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 -1, i64 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 553734060, i64 3 +; CHECK-NEXT: [[TMP21:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i32> [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP25]], i32 32 +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP26]], align 16 +; CHECK-NEXT: [[TMP28:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP27]]) +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP29]], 
i32 0 +; CHECK-NEXT: store ptr addrspace(7) [[TMP28]], ptr addrspace(5) [[TMP30]], align 32 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP31]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i32> [[TMP32]] to i64 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP34]], i32 48 +; CHECK-NEXT: [[TMP36:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16 +; CHECK-NEXT: [[TMP37:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP39]], i32 0 +; CHECK-NEXT: store ptr addrspace(7) [[TMP37]], ptr addrspace(5) [[TMP40]], align 32 +; CHECK-NEXT: [[TMP41:%.*]] = load volatile i32, ptr addrspace(7) [[TMP37]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 16 +; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP43]], i32 0 +; CHECK-NEXT: store i32 [[TMP41]], ptr addrspace(5) [[TMP44]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP41]], -37 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP21]], i32 52 +; CHECK-NEXT: [[TMP47:%.*]] = load i64, ptr addrspace(7) [[TMP46]], align 8 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP21]], i32 60 +; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(7) [[TMP48]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = mul i32 [[TMP45]], [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = inttoptr i64 [[TMP47]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP52:%.*]] = sext i32 [[TMP50]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP51]], i64 [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = load i64, ptr addrspace(1) [[TMP53]], align 8 +; CHECK-NEXT: [[TMP55:%.*]] = inttoptr i64 [[TMP54]] to ptr +; CHECK-NEXT: [[TMP56:%.*]] = ptrtoint ptr [[TMP55]] to i32 +; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP56]], 1 +; CHECK-NEXT: [[TMP58:%.*]] = inttoptr i32 [[TMP57]] to ptr +; CHECK-NEXT: [[TMP59:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_rgen_1.resume.0) +; CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[TMP57]], i32 2, i32 [[TMP60]], i32 [[TMP59]], [1 x i32] undef, i32 [[TMP45]]) +; CHECK-NEXT: unreachable +; +.entry: + %0 = call ptr addrspace(32) @lgc.cps.alloc(i32 96) + %1 = call i64 @llvm.amdgcn.s.getpc() + %2 = bitcast i64 %1 to <2 x i32> + %3 = call i64 @llvm.amdgcn.s.getpc() + %4 = bitcast i64 %3 to <2 x i32> + %5 = call i64 @llvm.amdgcn.s.getpc() + %6 = bitcast i64 %5 to <2 x i32> + %7 = call i32 @lgc.load.user.data__i32(i32 20) + %8 = insertelement <2 x i32> %6, i32 %7, i64 0 + %9 = bitcast <2 x i32> %8 to i64 + %10 = inttoptr i64 %9 to ptr addrspace(4) + %11 = getelementptr i8, ptr addrspace(4) %10, i32 0 + %12 = load <2 x i32>, ptr addrspace(4) %11, align 8 + %13 = extractelement <2 x i32> %12, i64 0 + %14 = extractelement <2 x i32> %12, i64 1 + %15 = insertelement <4 x i32> poison, i32 %13, i64 0 + %16 = and i32 %14, 65535 + %17 = insertelement <4 x i32> %15, i32 %16, i64 1 + %18 = insertelement <4 x i32> %17, i32 -1, i64 2 + %19 = insertelement <4 x i32> %18, i32 553734060, i64 3 + %20 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %19) + %21 = call i32 @lgc.load.user.data__i32(i32 0) + %22 = insertelement <2 x i32> %4, i32 %21, i64 0 + %23 = bitcast <2 x i32> %22 to i64 + %24 = inttoptr i64 %23 to ptr addrspace(4) + %25 = getelementptr i8, ptr addrspace(4) %24, i32 32 + %26 = load <4 x i32>, ptr addrspace(4) %25, align 16 + %27 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %26) + %28 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %0, i32 0, i32 0 + store ptr addrspace(7) %27, ptr addrspace(32) %28, align 32 + %29 = call i32 @lgc.load.user.data__i32(i32 0) + %30 = insertelement <2 x i32> %2, i32 %29, i64 0 + %31 = bitcast <2 x i32> %30 to i64 + %32 = inttoptr i64 %31 to ptr addrspace(4) + %33 = getelementptr i8, ptr addrspace(4) %32, i32 48 + %34 = load <4 x i32>, ptr addrspace(4) %33, align 16 + %35 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %34) + %36 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %0, i32 0, i32 1 + store ptr addrspace(7) %35, ptr addrspace(32) %36, align 32 + %37 = load volatile i32, ptr addrspace(7) %35, align 4 + %38 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %0, i32 0, i32 2 + store i32 %37, ptr addrspace(32) %38, align 4 + %39 = add i32 %37, -37 + %40 = getelementptr inbounds i8, ptr addrspace(7) %20, i32 52 + %41 = load i64, ptr addrspace(7) %40, align 8 + %42 = getelementptr inbounds i8, ptr addrspace(7) %20, i32 60 + %43 = load i32, ptr addrspace(7) %42, align 4 + %44 = mul i32 %39, %43 + %45 = inttoptr i64 %41 to ptr addrspace(1) + %46 = sext i32 %44 to i64 + %47 = getelementptr i8, ptr addrspace(1) %45, i64 %46 + %48 = load i64, ptr addrspace(1) %47, align 8 + %49 = inttoptr i64 %48 to ptr + %50 = ptrtoint ptr %49 to i32 + %51 = or i32 %50, 1 + %52 = inttoptr i32 %51 to ptr + %53 = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_rgen_1.resume.0) + call void (...) 
@lgc.cps.jump(i32 %51, i32 2, i32 poison, i32 %53, [1 x i32] undef, i32 %39) + unreachable +} + +define void @_rgen_1.resume.0(i32 %1, [1 x i32] %2) !spirv.ExecutionModel !15 !lgc.shaderstage !16 !continuation !18 !lgc.cps !17 { +; CHECK-LABEL: define void @_rgen_1.resume.0( +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], [1 x i32] [[TMP1:%.*]]) !spirv.ExecutionModel [[META16]] !lgc.shaderstage [[META17]] !continuation [[META21:![0-9]+]] !lgc.cps [[META19]] !continuation.state [[META20]] { +; CHECK-NEXT: [[ENTRYRESUME_0:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -96 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTRELOAD6:%.*]] = load i32, ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP3]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 +; CHECK-NEXT: [[DOTRELOAD3:%.*]] = load ptr addrspace(7), ptr addrspace(5) [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP10]], i32 0 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load ptr addrspace(7), ptr addrspace(5) [[TMP11]], align 32 +; CHECK-NEXT: [[DUMMY_UDATA:%.*]] = call i32 @lgc.load.user.data__i32(i32 20) +; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[DUMMY_UDATA]], 24 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP13]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP14]], i32 0 +; CHECK-NEXT: [[DUMMY_RELOAD:%.*]] = load ptr addrspace(7), ptr addrspace(5) [[TMP15]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = load volatile i32, ptr addrspace(7) [[DOTRELOAD3]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[DOTRELOAD6]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i32 +; CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(7) [[DOTRELOAD]], align 4 +; CHECK-NEXT: ret void +; +entryresume.0: + %3 = call ptr addrspace(32) @lgc.cps.peek(i32 96) + %4 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %3, i32 0, i32 2 + %.reload6 = load i32, ptr addrspace(32) %4, align 4 + %5 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %3, i32 0, i32 1 + %.reload3 = load ptr addrspace(7), ptr addrspace(32) %5, align 32 + %6 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %3, i32 0, i32 0 + %.reload = load ptr addrspace(7), ptr addrspace(32) %6, align 32 + %dummy.udata = call i32 @lgc.load.user.data__i32(i32 20) + %dummy.gep = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %3, i32 %dummy.udata, i32 0 + %dummy.reload = load ptr addrspace(7), ptr addrspace(32) %dummy.gep, align 32 + %7 = load volatile i32, ptr addrspace(7) %.reload3, align 4 + %8 = icmp eq i32 %.reload6, %7 + %9 = zext i1 %8 to i32 + store i32 %9, ptr addrspace(7) %.reload, align 4 + call void @lgc.cps.complete() + unreachable +} + +declare i32 @lgc.load.user.data__i32(i32) #1 + +declare i64 @llvm.amdgcn.s.getpc() #2 + +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1 + +declare ptr addrspace(32) @lgc.cps.alloc(i32) #6 + +declare i32 
@lgc.cps.as.continuation.reference(...) #3 + +declare void @lgc.cps.jump(...) #5 + +declare ptr addrspace(32) @lgc.cps.peek(i32) #7 + +declare void @lgc.cps.complete() + +attributes #0 = { alwaysinline nounwind "target-features"=",+wavefrontsize32" } +attributes #1 = { nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind willreturn } +attributes #4 = { nounwind } +attributes #5 = { noreturn } +attributes #6 = { nounwind willreturn memory(inaccessiblemem: readwrite) } +attributes #7 = { nounwind willreturn memory(inaccessiblemem: read) } + +!llpc.compute.mode = !{!0} +!lgc.client = !{!1} +!lgc.options = !{!2} +!lgc.options.CS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13} +!amdgpu.pal.metadata.msgpack = !{!14} +!continuation.stackAddrspace = !{!19} + +!0 = !{i32 8, i32 4, i32 1} +!1 = !{!"Vulkan"} +!2 = !{i32 262875531, i32 502344192, i32 854861601, i32 -1595331954, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216, i32 0, i32 0, i32 2} +!3 = !{i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 192, i32 0, i32 0, i32 32, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 0, i32 1, i32 4} +!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i64 0, i32 0, i32 4} +!6 = !{!"DescriptorBuffer", i32 6, i32 0, i32 4, i32 4, i64 0, i32 1, i32 4} +!7 = !{!"DescriptorBuffer", i32 6, i32 0, i32 8, i32 4, i64 0, i32 2, i32 4} +!8 = !{!"DescriptorBuffer", i32 6, i32 0, i32 12, i32 4, i64 0, i32 3, i32 4} +!9 = !{!"StreamOutTableVaPtr", i32 11, i32 0, i32 1, i32 1, i32 0} +!10 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 5, i32 1, i32 3} +!11 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 0, i32 2, i64 93, i32 17, i32 2} +!12 = !{!"DescriptorBuffer", i32 6, i32 0, i32 2, i32 4, i64 93, i32 0, i32 4} +!13 = !{!"DescriptorBuffer", i32 6, i32 0, i32 6, i32 4, i64 93, i32 1, i32 4} +!14 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\C4jyX\05\E6M\0F\CF\03b\DD\05\C5\B6\DB\B9\AD.llpc_version\A467.0\AEamdpal.version\92\03\00"} +!15 = !{i32 5313} +!16 = !{i32 7} +!17 = !{i32 1} +!18 = !{ptr @_rgen_1} +!19 = !{i32 5} +;. +; CHECK: [[META16]] = !{i32 5313} +; CHECK: [[META17]] = !{i32 7} +; CHECK: [[META18]] = !{ptr @_rgen_1} +; CHECK: [[META19]] = !{i32 1} +; CHECK: [[META20]] = !{i32 0} +; CHECK: [[META21]] = !{ptr @_rgen_1.resume.0} +;. diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll new file mode 100644 index 0000000000..a14a7fe618 --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +%_rgen_1.Frame = type { ptr addrspace(22), ptr addrspace(22), i32 } + +declare void @lgc.cps.jump(...) 
#0 + +declare ptr addrspace(32) @lgc.cps.alloc(i32) + +declare void @lgc.cps.free(i32) + +declare i32 @lgc.cps.as.continuation.reference(ptr) + +declare ptr addrspace(32) @lgc.cps.peek(i32) + +declare ptr addrspace(32) @lgc.cps.get.vsp() + +declare i32 @lgc.cps.get.dummy.index(i32) + +declare void @lgc.cps.complete() + +declare i64 @_cont_GetContinuationStackGlobalMemBase() + +define { ptr, ptr } @test.0(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !3 { +; CHECK-LABEL: define void @test.0( +; CHECK-SAME: ) !lgc.cps [[META1:![0-9]+]] !lgc.rt.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP0]] +; CHECK-NEXT: store i32 333, ptr addrspace(22) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP4]] +; CHECK-NEXT: store i32 111, ptr addrspace(22) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 9 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: store i8 99, ptr addrspace(22) [[TMP8]], align 1 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP7]], i32 [[TMP4]]) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) + store i32 333, ptr addrspace(32) %mem, align 4 + %p1 = getelementptr i32, ptr addrspace(32) %mem, i32 1 + store i32 111, ptr addrspace(32) %p1, align 4 + %p2 = getelementptr i8, ptr addrspace(32) %mem, i32 9 + store i8 99, ptr addrspace(32) %p2, align 1 + %q1 = ptrtoint ptr addrspace(32) %p1 to i32 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %p2, i32 %q1) + unreachable +} + +define { ptr, ptr } @test.1(ptr addrspace(32) %p2, i32 %q1, ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !4 { +; CHECK-LABEL: define void @test.1( +; CHECK-SAME: i32 [[P2:%.*]], i32 [[Q1:%.*]]) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META5:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(22) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[Q1]] +; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(22) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[P2]] +; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(22) [[TMP3]], align 1 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %p1 = inttoptr i32 %q1 to ptr addrspace(32) + %n111 = load i32, ptr addrspace(32) %p1, align 4 + %n99 = load i8, ptr addrspace(32) %p2, align 1 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison) + unreachable +} + +define { ptr, ptr } @test.2(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !5 { +; CHECK-LABEL: define void @test.2( +; CHECK-SAME: ) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META6:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -12 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP1]] +; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(22) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -12 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 +; CHECK-NEXT: ret void +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.peek(i32 10) + %n333 = load i32, ptr addrspace(32) %mem, align 4 + call void @lgc.cps.free(i32 10) + call void @lgc.cps.complete() + unreachable +} + +define { ptr, ptr } @test.gep(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !6 { +; CHECK-LABEL: define void @test.gep( +; CHECK-SAME: ) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META7:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(22) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP6]], i32 [[TMP3]] +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(22) [[TMP11]], align 4 +; CHECK-NEXT: [[STACK_EL1:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 1) +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[STACK_EL1]], 24 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP6]], i32 [[TMP8]] +; CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(22) [[TMP12]], align 4 +; CHECK-NEXT: [[STACK_EL2:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 2) +; CHECK-NEXT: [[STACK_EL2_DIV:%.*]] = sdiv i32 [[STACK_EL2]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[STACK_EL2_DIV]], 24 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 
[[TMP16]], -8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP6]], i32 [[TMP15]] +; CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(22) [[TMP18]], align 4 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 [[TMP17]], i32 [[TMP17]]) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) + %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) + %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 + %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() + %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 + store i32 %vsp.i, ptr addrspace(32) %1, align 4 + %stack.el1 = call i32 @lgc.cps.get.dummy.index(i32 1) + %2 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el1 + %vsp.2 = call ptr addrspace(32) @lgc.cps.peek(i32 4) + %vsp.2.i = ptrtoint ptr addrspace(32) %vsp.2 to i32 + store i32 %vsp.2.i, ptr addrspace(32) %2, align 4 + %stack.el2 = call i32 @lgc.cps.get.dummy.index(i32 2) + %stack.el2.div = sdiv i32 %stack.el2, 2 + %3 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el2.div, i32 1 + %vsp.3 = call ptr addrspace(32) @lgc.cps.peek(i32 8) + %vsp.3.i = ptrtoint ptr addrspace(32) %vsp.3 to i32 + store i32 %vsp.3.i, ptr addrspace(32) %3, align 4 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) + unreachable +} + +define { ptr, ptr } @test.nested.gep(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !7 { +; CHECK-LABEL: define void @test.nested.gep( +; CHECK-SAME: ) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META8:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(22) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP7]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(22) [[TMP9]], align 4 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 [[TMP5]], i32 [[TMP5]]) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) + %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) + %gep.base = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 + %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %gep.base, i32 0, i32 2 + %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() + %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 + store i32 %vsp.i, ptr addrspace(32) %1, align 4 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i) + unreachable +} + +declare !continuation !3 { ptr, ptr } @continuation.prototype.test.0(ptr, i1) + +declare ptr @continuation.malloc(i32) + +declare void @continuation.free(ptr) + +declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #1 + +declare ptr @llvm.coro.begin(token, ptr writeonly) #1 + +declare !continuation !4 { ptr, ptr } @continuation.prototype.test.1(ptr, i1) + +declare !continuation !5 { ptr, ptr } @continuation.prototype.test.2(ptr, i1) + +declare !continuation !6 { ptr, ptr } @continuation.prototype.test.gep(ptr, i1) + +declare !continuation !7 { ptr, ptr } @continuation.prototype.test.nested.gep(ptr, i1) + +attributes #0 = { noreturn } +attributes #1 = { nounwind } + +!continuation.stackAddrspace = !{!0} + +!0 = !{i32 22} +!1 = !{i32 1} +!2 = !{i32 7} +!3 = !{ptr @test.0} +!4 = !{ptr @test.1} +!5 = !{ptr @test.2} +!6 = !{ptr @test.gep} +!7 = !{ptr @test.nested.gep} +;. +; CHECK: [[META1]] = !{i32 1} +; CHECK: [[META2]] = !{i32 7} +; CHECK: [[META3]] = !{ptr @test.0} +; CHECK: [[META4]] = !{i32 0} +; CHECK: [[META5]] = !{ptr @test.1} +; CHECK: [[META6]] = !{ptr @test.2} +; CHECK: [[META7]] = !{ptr @test.gep} +; CHECK: [[META8]] = !{ptr @test.nested.gep} +;. diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll new file mode 100644 index 0000000000..f8fe1ecdd8 --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll @@ -0,0 +1,247 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +%_rgen_1.Frame = type { ptr addrspace(21), ptr addrspace(21), i32 } + +declare void @lgc.cps.jump(...) 
#0 + +declare ptr addrspace(32) @lgc.cps.alloc(i32) + +declare void @lgc.cps.free(i32) + +declare i32 @lgc.cps.as.continuation.reference(ptr) + +declare ptr addrspace(32) @lgc.cps.peek(i32) + +declare ptr addrspace(32) @lgc.cps.get.vsp() + +declare i32 @lgc.cps.get.dummy.index(i32) + +declare void @lgc.cps.complete() + +define { ptr, ptr } @test.0(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !3 { +; CHECK-LABEL: define void @test.0( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1:![0-9]+]] !lgc.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 +; CHECK-NEXT: store i32 333, ptr addrspace(21) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 +; CHECK-NEXT: store i32 111, ptr addrspace(21) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 9 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP8]], i32 0 +; CHECK-NEXT: store i8 99, ptr addrspace(21) [[TMP9]], align 1 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP7]], i32 [[TMP4]]) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) + store i32 333, ptr addrspace(32) %mem, align 4 + %p1 = getelementptr i32, ptr addrspace(32) %mem, i32 1 + store i32 111, ptr addrspace(32) %p1, align 4 + %p2 = getelementptr i8, ptr addrspace(32) %mem, i32 9 + store i8 99, ptr addrspace(32) %p2, align 1 + %q1 = ptrtoint ptr addrspace(32) %p1 to i32 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %p2, i32 %q1) + unreachable +} + +define { ptr, ptr } @test.1(ptr addrspace(32) %p2, i32 %q1, ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !4 { +; CHECK-LABEL: define void @test.1( +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[P2:%.*]], i32 [[Q1:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META5:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[Q1]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP0]], i32 0 +; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(21) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[P2]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 +; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(21) [[TMP3]], align 1 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %p1 = inttoptr i32 %q1 to ptr addrspace(32) + %n111 = load i32, ptr addrspace(32) %p1, align 4 + %n99 = load i8, ptr addrspace(32) %p2, align 1 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison) + unreachable +} + +define { ptr, ptr } @test.2(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !5 { +; CHECK-LABEL: define void @test.2( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META6:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -12 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 +; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(21) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -12 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 +; CHECK-NEXT: ret void +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.peek(i32 10) + %n333 = load i32, ptr addrspace(32) %mem, align 4 + call void @lgc.cps.free(i32 10) + call void @lgc.cps.complete() + unreachable +} + +define { ptr, ptr } @test.gep(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !6 { +; CHECK-LABEL: define void @test.gep( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META7:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 
[[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(21) [[TMP6]], align 4 +; CHECK-NEXT: [[STACK_EL1:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 1) +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[STACK_EL1]], 24 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -4 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP11]], i32 0 +; CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(21) [[TMP12]], align 4 +; CHECK-NEXT: [[STACK_EL2:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 2) +; CHECK-NEXT: [[STACK_EL2_DIV:%.*]] = sdiv i32 [[STACK_EL2]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[STACK_EL2_DIV]], 24 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], -8 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP15]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP18]], i32 0 +; CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(21) [[TMP19]], align 4 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 [[TMP17]], i32 [[TMP17]]) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) + %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) + %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 + %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() + %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 + store i32 %vsp.i, ptr addrspace(32) %1, align 4 + %stack.el1 = call i32 @lgc.cps.get.dummy.index(i32 1) + %2 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el1 + %vsp.2 = call ptr addrspace(32) @lgc.cps.peek(i32 4) + %vsp.2.i = ptrtoint ptr addrspace(32) %vsp.2 to i32 + store i32 %vsp.2.i, ptr addrspace(32) %2, align 4 + %stack.el2 = call i32 @lgc.cps.get.dummy.index(i32 2) + %stack.el2.div = sdiv i32 %stack.el2, 2 + %3 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el2.div, i32 1 + %vsp.3 = call ptr addrspace(32) @lgc.cps.peek(i32 8) + %vsp.3.i = ptrtoint ptr addrspace(32) %vsp.3 to i32 + store i32 %vsp.3.i, ptr addrspace(32) %3, align 4 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) + unreachable +} + +define { ptr, ptr } @test.nested.gep(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !7 { +; CHECK-LABEL: define void @test.nested.gep( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META8:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0 +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(21) [[TMP7]], align 4 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 [[TMP5]], i32 [[TMP5]]) +; CHECK-NEXT: unreachable +; +AllocaSpillBB: + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) + %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) + %gep.base = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 + %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %gep.base, i32 0, i32 2 + %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() + %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 + store i32 %vsp.i, ptr addrspace(32) %1, align 4 + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i) + unreachable +} + +declare !continuation !3 { ptr, ptr } @continuation.prototype.test.0(ptr, i1) + +declare ptr @continuation.malloc(i32) + +declare void @continuation.free(ptr) + +declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #1 + +declare ptr @llvm.coro.begin(token, ptr writeonly) #1 + +declare !continuation !4 { ptr, ptr } @continuation.prototype.test.1(ptr, i1) + +declare !continuation !5 { ptr, ptr } @continuation.prototype.test.2(ptr, i1) + +declare !continuation !6 { ptr, ptr } @continuation.prototype.test.gep(ptr, i1) + +declare !continuation !7 { ptr, ptr } @continuation.prototype.test.nested.gep(ptr, i1) + +attributes #0 = { noreturn } +attributes #1 = { nounwind } + +!continuation.stackAddrspace = !{!0} + +!0 = !{i32 21} +!1 = !{i32 1} +!2 = !{i32 7} +!3 = !{ptr @test.0} +!4 = !{ptr @test.1} +!5 = !{ptr @test.2} +!6 = !{ptr @test.gep} +!7 = !{ptr @test.nested.gep} +;. +; CHECK: [[META1]] = !{i32 1} +; CHECK: [[META2]] = !{i32 7} +; CHECK: [[META3]] = !{ptr @test.0} +; CHECK: [[META4]] = !{i32 0} +; CHECK: [[META5]] = !{ptr @test.1} +; CHECK: [[META6]] = !{ptr @test.2} +; CHECK: [[META7]] = !{ptr @test.gep} +; CHECK: [[META8]] = !{ptr @test.nested.gep} +;. 
diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll new file mode 100644 index 0000000000..ae352d4b23 --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +declare void @lgc.cps.jump(...) noreturn +declare ptr addrspace(32) @lgc.cps.alloc(i32) +declare void @lgc.cps.free(i32) +declare i32 @lgc.cps.as.continuation.reference(ptr) +declare ptr addrspace(32) @lgc.cps.peek(i32) +declare ptr addrspace(32) @lgc.cps.get.vsp() +declare i32 @lgc.cps.get.dummy.index(i32) +declare void @lgc.cps.complete() + +%_rgen_1.Frame = type { ptr addrspace(5), ptr addrspace(5), i32 } + +define void @test.0() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation !{ptr @test.0} { +; CHECK-LABEL: define void @test.0( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1:![0-9]+]] !lgc.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP2]], i32 0 +; CHECK-NEXT: store i32 333, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 +; CHECK-NEXT: store i32 111, ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 9 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 +; CHECK-NEXT: store i8 99, ptr addrspace(5) [[TMP9]], align 1 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP7]], i32 [[TMP4]]) +; CHECK-NEXT: unreachable +; + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering + + store i32 333, ptr addrspace(32) %mem + + %p1 = getelementptr i32, ptr addrspace(32) %mem, i32 1 + store i32 111, ptr addrspace(32) %p1 + + %p2 = getelementptr i8, ptr addrspace(32) %mem, i32 9 + store i8 99, ptr addrspace(32) %p2 + + %q1 = ptrtoint ptr addrspace(32) %p1 to i32 + + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %p2, i32 %q1) + unreachable +} + +define void @test.1(ptr addrspace(32) %p2, i32 %q1) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation !{ptr @test.0} { +; CHECK-LABEL: define void @test.1( +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[P2:%.*]], i32 [[Q1:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META5:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[Q1]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP0]], i32 0 +; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[P2]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP2]], i32 0 +; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(5) [[TMP3]], align 1 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison) +; CHECK-NEXT: unreachable +; + %p1 = inttoptr i32 %q1 to ptr addrspace(32) + %n111 = load i32, ptr addrspace(32) %p1 + %n99 = load i8, ptr addrspace(32) %p2 + + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison) + unreachable +} + +define void @test.2() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation !{ptr @test.0} { +; CHECK-LABEL: define void @test.2( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META6:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -12 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP2]], i32 0 +; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -12 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 +; CHECK-NEXT: ret void +; + %mem = call ptr addrspace(32) @lgc.cps.peek(i32 10) ; round up to 12 during lowering + + %n333 = load i32, ptr addrspace(32) %mem + + call void @lgc.cps.free(i32 10) ; round up to 12 during lowering + + call void @lgc.cps.complete() + unreachable +} + +; Dummy test to show behavior with lowering of non-constant GEP indices. 
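+; The lowering treats each %_rgen_1.Frame element as 24 bytes: a dynamic index
+; %i becomes a byte offset of %i * 24 added to the frame base, and the constant
+; field index (i32 1) becomes a fixed byte offset of 8, as the mul/add
+; sequences in the checks below show.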
+define void @test.gep() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation !{ptr @test.0} { +; CHECK-LABEL: define void @test.gep( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META7:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[STACK_EL1:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 1) +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[STACK_EL1]], 24 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -4 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP11]], i32 0 +; CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(5) [[TMP12]], align 4 +; CHECK-NEXT: [[STACK_EL2:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 2) +; CHECK-NEXT: [[STACK_EL2_DIV:%.*]] = sdiv i32 [[STACK_EL2]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[STACK_EL2_DIV]], 24 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], -8 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP15]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP18]], i32 0 +; CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[TMP19]], align 4 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 [[TMP17]], i32 [[TMP17]]) +; CHECK-NEXT: unreachable +; + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering + + %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) + %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 + %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() + %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 + store i32 %vsp.i, ptr addrspace(32) %1 + + %stack.el1 = call i32 @lgc.cps.get.dummy.index(i32 1) + %2 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el1 + %vsp.2 = call ptr addrspace(32) @lgc.cps.peek(i32 4) + %vsp.2.i = ptrtoint ptr addrspace(32) %vsp.2 to i32 + store i32 %vsp.2.i, ptr addrspace(32) %2 + + %stack.el2 = call i32 @lgc.cps.get.dummy.index(i32 2) + %stack.el2.div = sdiv i32 %stack.el2, 2 + %3 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el2.div, i32 1 + %vsp.3 = call ptr addrspace(32) @lgc.cps.peek(i32 8) + %vsp.3.i = ptrtoint ptr addrspace(32) %vsp.3 to i32 + store i32 %vsp.3.i, ptr addrspace(32) %3 + + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) + unreachable +} + +; Dummy test to show behavior with lowering of nested GEPs. +define void @test.nested.gep() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation !{ptr @test.0} { +; CHECK-LABEL: define void @test.nested.gep( +; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META8:![0-9]+]] !continuation.state [[META4]] { +; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] +; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[TMP7]], align 4 +; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 [[TMP5]], i32 [[TMP5]]) +; CHECK-NEXT: unreachable +; + %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering + + %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) + %gep.base = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 + %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %gep.base, i32 0, i32 2 + %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() + %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 + store i32 %vsp.i, ptr addrspace(32) %1 + + %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i) + unreachable +} + +!continuation.stackAddrspace = !{!0} + +!0 = !{i32 5} +;. 
+; CHECK: [[META1]] = !{i32 1} +; CHECK: [[META2]] = !{i32 7} +; CHECK: [[META3]] = !{ptr @test.0} +; CHECK: [[META4]] = !{i32 0} +; CHECK: [[META5]] = !{ptr @test.1} +; CHECK: [[META6]] = !{ptr @test.2} +; CHECK: [[META7]] = !{ptr @test.gep} +; CHECK: [[META8]] = !{ptr @test.nested.gep} +;. diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-unify-exits.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-unify-exits.ll new file mode 100644 index 0000000000..7251051390 --- /dev/null +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-unify-exits.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s + +declare void @lgc.cps.jump(...) noreturn + +define void @unify_jumps(i32 %arg, ptr %table) !lgc.cps !0 !lgc.shaderstage !{i32 7} { +; CHECK-LABEL: define void @unify_jumps( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) !lgc.cps [[META1:![0-9]+]] !lgc.shaderstage [[META2:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[ARG]], 3 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[TABLE_0:%.*]] = getelementptr i32, ptr [[TABLE]], i32 0 +; CHECK-NEXT: [[CR_THEN:%.*]] = load i32, ptr [[TABLE_0]], align 4 +; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_THEN]], i32 2, i32 poison, i32 poison, i32 [[THEN_ARG]]) +; CHECK-NEXT: unreachable +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[TABLE_1:%.*]] = getelementptr i32, ptr [[TABLE]], i32 1 +; CHECK-NEXT: [[CR_ELSE:%.*]] = load i32, ptr [[TABLE_1]], align 4 +; CHECK-NEXT: [[ELSE_ARG:%.*]] = uitofp i32 [[ARG]] to float +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_ELSE]], i32 2, i32 poison, i32 poison, float [[ELSE_ARG]], i32 5) +; CHECK-NEXT: unreachable +; +entry: + %cond = icmp ult i32 %arg, 3 + br i1 %cond, label %then, label %else + +then: + %table.0 = getelementptr i32, ptr %table, i32 0 + %cr.then = load i32, ptr %table.0 + %then.arg = add i32 %arg, 1 + call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 %then.arg) + unreachable + +else: + %table.1 = getelementptr i32, ptr %table, i32 1 + %cr.else = load i32, ptr %table.1 + %else.arg = uitofp i32 %arg to float + call void (...) @lgc.cps.jump(i32 %cr.else, i32 2, i32 poison, i32 poison, float %else.arg, i32 5) + unreachable +} + +define void @unify_jump_ret(i32 %arg, ptr %table) !lgc.cps !0 !lgc.shaderstage !{i32 7} { +; CHECK-LABEL: define void @unify_jump_ret( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[ARG]], 3 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[TABLE_0:%.*]] = getelementptr i32, ptr [[TABLE]], i32 0 +; CHECK-NEXT: [[CR_THEN:%.*]] = load i32, ptr [[TABLE_0]], align 4 +; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_THEN]], i32 2, i32 poison, i32 poison, i32 [[THEN_ARG]]) +; CHECK-NEXT: unreachable +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret void +; +entry: + %cond = icmp ult i32 %arg, 3 + br i1 %cond, label %then, label %else + +then: + %table.0 = getelementptr i32, ptr %table, i32 0 + %cr.then = load i32, ptr %table.0 + %then.arg = add i32 %arg, 1 + call void (...) 
@lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 %then.arg) + unreachable + +else: + ret void +} + +!continuation.stackAddrspace = !{!1} + +!0 = !{i32 1} ; level 1 +!1 = !{i32 5} +;. +; CHECK: [[META1]] = !{i32 1} +; CHECK: [[META2]] = !{i32 7} +;.
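+; Note: unify_jumps exercises two lgc.cps.jump exits with different argument
+; types, while unify_jump_ret mixes a jump exit with a plain return; as the
+; checks above show, the pipeline leaves both exit kinds and the branch
+; structure intact here.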