Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions backends/qualcomm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -239,11 +239,28 @@ target_link_libraries(
shared_buffer
qnn_dlc_manager
)
target_link_libraries(
qnn_executorch_backend
PRIVATE qnn_executorch_header qnn_schema qnn_manager executorch_core
extension_tensor qnn_backend_options
)
# When targeting the Hexagon DSP, point the linker at the Hexagon toolchain
# runtime libraries and link libc/libc++ explicitly; other targets use the
# default toolchain runtimes.
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
  # NOTE(review): directory-scoped link_directories() leaks to every target
  # declared after this point; consider target_link_directories() instead.
  link_directories(
    $ENV{HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/$ENV{HEXAGON_ARCH}/G0/pic
  )
  target_link_libraries(
    qnn_executorch_backend
    PRIVATE qnn_executorch_header
            qnn_schema
            qnn_manager
            executorch_core
            extension_tensor
            qnn_backend_options
            # presumably the Hexagon toolchain does not link its C/C++
            # runtimes implicitly -- TODO confirm
            c
            c++
  )
else()
  target_link_libraries(
    qnn_executorch_backend
    PRIVATE qnn_executorch_header qnn_schema qnn_manager executorch_core
            extension_tensor qnn_backend_options
  )
endif()
set_target_properties(
qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
)
Expand Down Expand Up @@ -278,6 +295,13 @@ install(
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
)

# The fastrpc subproject needs the Hexagon SDK (qaic IDL compiler, FastRPC
# headers), so it is only added when the SDK environment variable is set.
# $ENV{} is read at configure time; set up the SDK before invoking CMake.
if(DEFINED ENV{HEXAGON_SDK_ROOT})
  add_subdirectory(
    ${QNN_EXECUTORCH_ROOT_DIR}/fastrpc
    ${CMAKE_CURRENT_BINARY_DIR}/qnn_executorch/fastrpc
  )
endif()

# QNN pybind
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
add_subdirectory(
Expand Down
17 changes: 11 additions & 6 deletions backends/qualcomm/_passes/build_quant_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
# LICENSE file in the root directory of this source tree.
import torch
from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO
from executorch.exir.delegate import executorch_call_delegate

from executorch.exir.pass_base import ExportPass, ProxyValue
from executorch.exir.pass_base import ExportPass
from executorch.exir.tensor import TensorSpec
from torch.utils import _pytree as pytree

Expand Down Expand Up @@ -39,11 +38,17 @@ def call_getitem(self, value, key: int, meta):
return super().call_getitem(value, key, meta)

def call_delegate(self, lowered_module, args, kwargs, meta):
    # Build the output spec for a delegated (lowered) submodule so the
    # annotated quantized IO dtypes are reflected in the exported program.
    #
    # NOTE(review): this span appears to interleave pre- and post-change
    # lines from a diff. `pytree.tree_map` below receives BOTH the
    # delegate-call result and `tuple(tensors)`; presumably only one of
    # the two trees was intended in the final version -- verify against
    # the merged file.
    args_data, _ = pytree.tree_map_only(
        ProxyValue, lambda x: x.data, (args, kwargs)
    )
    # Locate the single "output" node of the lowered module's graph.
    output_node = [
        node
        for node in lowered_module.original_module.graph.nodes
        if node.target == "output"
    ][0]
    # Cast each output's example value to its annotated quantized IO dtype
    # (stored under QCOM_QUANTIZED_IO in node metadata).
    tensors = [
        node.meta["val"].to(node.meta[QCOM_QUANTIZED_IO])
        for node in output_node.args[0]
    ]
    # Derive a TensorSpec (via self._make_spec) for every output leaf.
    meta["spec"] = pytree.tree_map(
        self._make_spec,
        executorch_call_delegate(lowered_module, *args_data),
        tuple(tensors),
    )
    return super().call_delegate(lowered_module, args, kwargs, meta)
10 changes: 10 additions & 0 deletions backends/qualcomm/aot/wrappers/TensorWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,17 @@ TensorWrapper::TensorWrapper(
Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) {
if (data != nullptr) {
QNN_TENSOR_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW;
#ifdef __hexagon__
// alignment is required
auto align_size = [](size_t alignment, size_t sz) {
return (sz + (alignment - 1)) & ~(alignment - 1);
};
const size_t alignment = 64;
QNN_TENSOR_VER_PTR(tensor_)->clientBuf.dataSize =
align_size(alignment, bytes_);
#else
QNN_TENSOR_VER_PTR(tensor_)->clientBuf.dataSize = bytes_;
#endif
if (copy_data) {
owned_data_ = std::make_unique<char[]>(bytes_);
const char* src_data = static_cast<const char*>(data);
Expand Down
78 changes: 78 additions & 0 deletions backends/qualcomm/fastrpc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# Copyright 2025 Arm Limited and/or its affiliates.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# C++17, and position-independent code everywhere since both the stub and
# skel objects end up in shared libraries.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# All qaic-generated files land in the build tree, never the source tree.
set(_qnn_fastrpc__dir ${CMAKE_BINARY_DIR}/backends/qualcomm/fastrpc)
# FastRPC interface definition; the single input to the qaic IDL compiler.
set(_qnn_fastrpc__srcs ${CMAKE_CURRENT_LIST_DIR}/qnn_executorch.idl)
# qaic emits one header plus host-side (stub) and DSP-side (skel) sources.
set(_qnn_fastrpc__outputs
    ${_qnn_fastrpc__dir}/qnn_executorch.h
    ${_qnn_fastrpc__dir}/qnn_executorch_stub.c
    ${_qnn_fastrpc__dir}/qnn_executorch_skel.c
)

# Run the Hexagon SDK `qaic` IDL compiler to generate the FastRPC header,
# host-side stub and DSP-side skel sources from qnn_executorch.idl.
# $ENV{HEXAGON_SDK_ROOT} is read at configure time, so the SDK environment
# must be set up before invoking CMake.
if(DEFINED ENV{HEXAGON_SDK_ROOT})
  add_custom_command(
    OUTPUT ${_qnn_fastrpc__outputs}
    # Portable directory creation instead of `mkdir -p`.
    COMMAND ${CMAKE_COMMAND} -E make_directory ${_qnn_fastrpc__dir}
    COMMAND
      $ENV{HEXAGON_SDK_ROOT}/ipc/fastrpc/qaic/bin/qaic -I
      $ENV{HEXAGON_SDK_ROOT}/incs -I $ENV{HEXAGON_SDK_ROOT}/incs/stddef -o
      ${_qnn_fastrpc__dir} ${_qnn_fastrpc__srcs}
    WORKING_DIRECTORY ${EXECUTORCH_SOURCE_DIR}
    # List the .idl input so codegen re-runs when the interface changes
    # (it was missing before, leaving stale stub/skel sources).
    DEPENDS ${_qnn_fastrpc__srcs} qnn_executorch_backend
    COMMENT "Codegen for fastrpc files"
    VERBATIM
  )
  # Named target other targets can depend on to force codegen to run.
  add_custom_target(fastrpc_codegen DEPENDS ${_qnn_fastrpc__outputs})
endif()

# DSP side: build the skel shared library that dispatches incoming FastRPC
# calls (qnn_executorch_skel.c, generated) into the ExecuTorch runtime
# (qnn_executorch_impl.cpp).
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
  add_library(
    qnn_executorch_skel SHARED
    ${_qnn_fastrpc__dir}/qnn_executorch.h
    ${_qnn_fastrpc__dir}/qnn_executorch_skel.c qnn_executorch_impl.cpp
  )
  # The generated header lives in the build tree.
  target_include_directories(qnn_executorch_skel PRIVATE ${_qnn_fastrpc__dir})
  target_link_libraries(
    qnn_executorch_skel PRIVATE extension_data_loader qnn_executorch_backend
    # presumably the Hexagon toolchain needs the C/C++ runtimes linked
    # explicitly -- TODO confirm
    c++ c
  )
  # Make sure IDL codegen has produced the skel sources before compiling.
  add_dependencies(qnn_executorch_skel fastrpc_codegen)
endif()

# Host (CPU) side: stub shared library that marshals calls to the DSP, plus
# a minimal example runner. Use target-scoped include/link directories
# instead of directory-scoped include_directories()/link_directories() so
# the SDK paths do not leak to unrelated targets.
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES aarch64)
  add_library(
    qnn_executorch_stub SHARED ${_qnn_fastrpc__dir}/qnn_executorch.h
                               ${_qnn_fastrpc__dir}/qnn_executorch_stub.c
  )
  # PUBLIC so consumers (the runner below) inherit the SDK and generated
  # header include paths and the adsprpc library search path.
  target_include_directories(
    qnn_executorch_stub
    PUBLIC $ENV{HEXAGON_SDK_ROOT}/incs $ENV{HEXAGON_SDK_ROOT}/incs/stddef
           ${_qnn_fastrpc__dir}
  )
  target_link_directories(
    qnn_executorch_stub
    PUBLIC $ENV{HEXAGON_SDK_ROOT}/ipc/fastrpc/remote/ship/android_aarch64
  )
  # TODO: support cdsp if necessary
  target_link_libraries(qnn_executorch_stub PRIVATE adsprpc)
  # Ensure IDL codegen has produced the stub sources before compiling.
  add_dependencies(qnn_executorch_stub fastrpc_codegen)

  # build minimum example app
  add_executable(qnn_executor_runner qnn_executor_runner.cpp)
  # adsprpc is linked once here (the previous duplicate PRIVATE link of
  # adsprpc was redundant and has been removed).
  target_link_libraries(
    qnn_executor_runner PRIVATE executorch_core gflags qnn_executorch_stub
                                adsprpc
  )
endif()
148 changes: 148 additions & 0 deletions backends/qualcomm/fastrpc/qnn_executor_runner.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Copyright (c) Qualcomm Innovation Center, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <chrono>
#include <cstdint> // uint8_t (was relied on transitively)
#include <fstream>
#include <memory>
#include <numeric>
#include <string> // std::string, std::to_string (was relied on transitively)
#include <vector> // std::vector (was relied on transitively)

#include <executorch/runtime/platform/assert.h>
#include <gflags/gflags.h>

// FastRPC interface generated from qnn_executorch.idl by qaic.
#include "qnn_executorch.h"

// Path to the serialized .pte program to load on the DSP.
DEFINE_string(
    model_path,
    "model.pte",
    "Model serialized in flatbuffer format.");
// Directory where raw output tensors are written.
DEFINE_string(
    output_folder_path,
    ".",
    "Executorch inference data output path.");
// Text file: one inference per line, space-separated raw input files.
DEFINE_string(input_list_path, "input_list.txt", "Model input list path.");

int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (argc != 1) {
std::string msg = "extra commandline args:";
for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) {
msg += std::string(" ") + argv[i];
}
ET_LOG(Error, "%s", msg.c_str());
return 1;
}

// fastrpc related
// adsp
const int adsp_domain_id = 0;
// signed PD
const int enable_unsigned_pd = 0;
// domain uri
std::string domain_uri(qnn_executorch_URI);
domain_uri += "&_dom=adsp";
// init session
struct remote_rpc_control_unsigned_module data;
data.domain = adsp_domain_id;
data.enable = enable_unsigned_pd;
int err = AEE_SUCCESS;
ET_CHECK_MSG(
AEE_SUCCESS ==
(err = remote_session_control(
DSPRPC_CONTROL_UNSIGNED_MODULE, (void*)&data, sizeof(data))),
"remote_session_control failed: 0x%x",
err);
// start session
remote_handle64 handle = -1;
ET_CHECK_MSG(
AEE_SUCCESS == (err = qnn_executorch_open(domain_uri.data(), &handle)),
"qnn_executorch_open failed: 0x%x",
err);
// load model
const char* model_path = FLAGS_model_path.c_str();
qnn_executorch_load(handle, model_path);

// prepare io
std::vector<std::vector<uint8_t>> input_data, output_data;
std::vector<tensor> input_tensor, output_tensor;
for (int i = 0;; ++i) {
int nbytes = 0;
qnn_executorch_get_input_size(handle, model_path, i, &nbytes);
if (nbytes == -1) {
break;
}
input_data.emplace_back(std::vector<uint8_t>(nbytes));
input_tensor.emplace_back(
tensor({input_data.back().data(), (int)input_data.back().size()}));
}
for (int i = 0;; ++i) {
int nbytes = 0;
qnn_executorch_get_output_size(handle, model_path, i, &nbytes);
if (nbytes == -1) {
break;
}
output_data.emplace_back(std::vector<uint8_t>(nbytes));
output_tensor.emplace_back(
tensor({output_data.back().data(), (int)output_data.back().size()}));
}

// prepare input data
std::ifstream input_list(FLAGS_input_list_path);
// TODO: should check IO info via fastrpc first
if (input_list.is_open()) {
auto split = [](std::string s, std::string delimiter) {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
std::string token;
std::vector<std::string> res;

while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
res.push_back(token);
}
res.push_back(s.substr(pos_start));
return res;
};

std::string file_path;
int inference_index = 0;
while (std::getline(input_list, file_path)) {
auto input_files = split(file_path, " ");
if (input_files.size() == 0) {
break;
}
size_t num_inputs = input_files.size();
for (int i = 0; i < num_inputs; ++i) {
std::ifstream fin(input_files[i], std::ios::binary);
fin.seekg(0, fin.end);
size_t file_size = fin.tellg();
fin.seekg(0, fin.beg);
fin.read((char*)input_data[i].data(), file_size);
fin.close();
}
qnn_executorch_set_input(
handle, model_path, input_tensor.data(), input_tensor.size());
qnn_executorch_execute(handle, model_path);
qnn_executorch_get_output(
handle, model_path, output_tensor.data(), output_tensor.size());
for (size_t i = 0; i < output_tensor.size(); i++) {
auto output_file_name = FLAGS_output_folder_path + "/output_" +
std::to_string(inference_index) + "_" + std::to_string(i) + ".raw";
std::ofstream fout(output_file_name.c_str(), std::ios::binary);
fout.write(
(const char*)output_tensor[i].data, output_tensor[i].dataLen);
fout.close();
}
}
}

// unload model
qnn_executorch_unload(handle, model_path);
// tear down
qnn_executorch_close(handle);
return 0;
}
27 changes: 27 additions & 0 deletions backends/qualcomm/fastrpc/qnn_executorch.idl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright (c) Qualcomm Innovation Center, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "AEEStdDef.idl"
#include "remote.idl"

/// Enabling stub-skel mismatch check feature in the auto-gen files.
/// Please refer to the IDL documentation for more details on the feature.
/// It is fully supported only on Kailua and later targets.
const string IDL_VERSION = "0.0.0";

/// Raw byte buffer used to marshal one tensor's data over FastRPC.
typedef sequence<uint8> tensor;

/// DSP-side ExecuTorch service. Every method keys off the .pte path, so a
/// single session can presumably host multiple loaded programs -- confirm
/// against the skel implementation.
interface qnn_executorch : remote_handle64 {
  /// Load the serialized program at pte_path on the DSP.
  long load(in string pte_path);
  /// Byte size of input tensor `index`; the host runner treats -1 in
  /// nbytes as "no tensor at this index" (see qnn_executor_runner.cpp).
  long get_input_size(in string pte_path, in long index, rout long nbytes);
  /// Copy host-provided input tensor data into the loaded program.
  long set_input(in string pte_path, in sequence<tensor> tensors);
  /// Run one inference of the loaded program.
  long execute(in string pte_path);
  /// Byte size of output tensor `index`; -1 signals no tensor at index,
  /// mirroring get_input_size.
  long get_output_size(in string pte_path, in long index, rout long nbytes);
  /// Copy output tensor data back into the host-provided buffers.
  long get_output(in string pte_path, rout sequence<tensor> tensors);
  /// Release the loaded program.
  long unload(in string pte_path);
};
Loading
Loading