From 4af507845a439c0d8395cb10476fdbe1317d6c3c Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 23 Dec 2025 17:07:52 -0800
Subject: [PATCH] [slimtensor] Add SlimTensor class with basic properties and CPU copy operation

**Key components:**

1. **`c10/core/Contiguity.h`** - Contiguity checking utility:
   - `_compute_contiguous()` - computes whether a tensor with given sizes/strides is contiguous in memory (row-major order)

2. **`core/SlimTensor.h`** - Main SlimTensor class with:
   - **Constructors**: Default (undefined tensor) and full constructor with storage, sizes, strides, dtype, and storage_offset
   - **Property accessors**:
     - `sizes()`, `size(dim)` - get tensor dimensions with negative indexing support
     - `strides()`, `stride(dim)` - get tensor strides with negative indexing support
     - `dtype()`, `device()`, `device_type()`, `device_index()`
     - `numel()`, `dim()`, `nbytes()`, `itemsize()`
     - `data_ptr()` - returns pointer to tensor data (adjusted for storage_offset)
     - `storage_offset()`, `storage()`
   - **State queries**: `defined()`, `is_cpu()`, `is_contiguous()`, `is_empty()`
   - **Copy operation**: `copy_(other)` - copies data from another tensor
     - Fast path: uses memcpy when both tensors are contiguous
     - Slow path: element-wise copy respecting strides for non-contiguous tensors
   - **Setters**: `reset()`, `set_storage()`, `set_sizes_and_strides()`

**Current constraints:**
- Only CPU device supported
- Only Float32 dtype tested
- copy_() only supports CPU-to-CPU copy

These constraints will be addressed in follow-up diffs.

Differential Revision: [D89750150](https://our.internmc.facebook.com/intern/diff/D89750150/)

[ghstack-poisoned]
---
 backends/aoti/slim/c10/core/Contiguity.h     |  54 +++
 backends/aoti/slim/c10/core/targets.bzl      |  13 +
 backends/aoti/slim/core/SlimTensor.h         | 365 ++++++++++++++++++
 backends/aoti/slim/core/targets.bzl          |  19 +
 backends/aoti/slim/core/test/targets.bzl     |  22 ++
 .../slim/core/test/test_slimtensor_basic.cpp | 334 ++++++++++++++++
 .../slim/core/test/test_slimtensor_copy.cpp  | 259 +++++++++++++
 7 files changed, 1066 insertions(+)
 create mode 100644 backends/aoti/slim/c10/core/Contiguity.h
 create mode 100644 backends/aoti/slim/core/SlimTensor.h
 create mode 100644 backends/aoti/slim/core/test/test_slimtensor_basic.cpp
 create mode 100644 backends/aoti/slim/core/test/test_slimtensor_copy.cpp

diff --git a/backends/aoti/slim/c10/core/Contiguity.h b/backends/aoti/slim/c10/core/Contiguity.h
new file mode 100644
index 00000000000..e3bcfb24341
--- /dev/null
+++ b/backends/aoti/slim/c10/core/Contiguity.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+#include
+
+namespace executorch::backends::aoti::slim::c10 {
+
+using ::executorch::runtime::ArrayRef;
+
+/**
+ * Compute whether a tensor with given sizes, strides, and numel is contiguous.
+ *
+ * A tensor is contiguous if its elements are laid out in memory in row-major
+ * order, i.e., the stride of the last dimension is 1, and each preceding
+ * dimension's stride equals the product of all following dimensions' sizes.
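+ *
+ * For example, a tensor with sizes [2, 3, 4] and strides [12, 4, 1] is
+ * contiguous: the last stride is 1, the middle stride (4) equals the last
+ * size (4), and the first stride (12) equals 3 * 4.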
+ *
+ * @param sizes The sizes of each dimension
+ * @param strides The strides of each dimension
+ * @param numel The total number of elements
+ * @return true if the tensor is contiguous, false otherwise
+ */
+template <typename T>
+bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
+  if (numel == 0) {
+    return true;
+  }
+
+  T expected_stride = 1;
+  // Iterate from last dimension to first
+  for (int64_t d = static_cast<int64_t>(sizes.size()) - 1; d >= 0; d--) {
+    const auto& size_d = sizes[d];
+    if (size_d == 1) {
+      // Size-1 dimensions don't affect contiguity
+      continue;
+    }
+
+    if (strides[d] != expected_stride) {
+      return false;
+    }
+    expected_stride *= size_d;
+  }
+  return true;
+}
+
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/targets.bzl b/backends/aoti/slim/c10/core/targets.bzl
index c421081f095..500620aecd1 100644
--- a/backends/aoti/slim/c10/core/targets.bzl
+++ b/backends/aoti/slim/c10/core/targets.bzl
@@ -54,11 +54,24 @@ def define_common_targets():
         ],
     )
 
+    # Header-only library for Contiguity
+    runtime.cxx_library(
+        name = "contiguity",
+        headers = [
+            "Contiguity.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+        ],
+    )
+
     # Combined c10 core library
     runtime.cxx_library(
         name = "core",
         visibility = ["@EXECUTORCH_CLIENTS"],
         exported_deps = [
+            ":contiguity",
             ":device",
             ":device_type",
             ":scalar_type",
diff --git a/backends/aoti/slim/core/SlimTensor.h b/backends/aoti/slim/core/SlimTensor.h
new file mode 100644
index 00000000000..f3ab9f3fec3
--- /dev/null
+++ b/backends/aoti/slim/core/SlimTensor.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace executorch::backends::aoti::slim {
+
+/**
+ * SlimTensor - a lightweight tensor class for the AOTI-driven ExecuTorch
+ * backend runtime.
+ */
+class SlimTensor {
+ public:
+  /**
+   * Construct a SlimTensor with the given storage, sizes, strides, and dtype.
+   *
+   * @param storage The underlying storage
+   * @param sizes The sizes of each dimension
+   * @param strides The strides of each dimension
+   * @param dtype The scalar type of tensor elements
+   * @param storage_offset Offset into storage in number of elements
+   */
+  SlimTensor(
+      Storage&& storage,
+      IntArrayRef sizes,
+      IntArrayRef strides,
+      c10::ScalarType dtype,
+      int64_t storage_offset = 0)
+      : storage_(std::move(storage)),
+        storage_offset_(storage_offset),
+        dtype_(dtype) {
+    set_sizes_and_strides(sizes, strides);
+  }
+
+  /**
+   * Default constructor - creates an undefined tensor.
+   */
+  SlimTensor()
+      : storage_(Storage()),
+        storage_offset_(0),
+        numel_(0),
+        dtype_(c10::ScalarType::Float),
+        is_contiguous_(true) {
+    sizes_and_strides_.set_sizes({0});
+    sizes_and_strides_.set_strides({1});
+  }
+
+  // Default copy/move operations
+  SlimTensor(const SlimTensor&) = default;
+  SlimTensor& operator=(const SlimTensor&) = default;
+  SlimTensor(SlimTensor&&) = default;
+  SlimTensor& operator=(SlimTensor&&) = default;
+  ~SlimTensor() = default;
+
+  /**
+   * Reset the tensor, releasing the storage reference.
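+   * After reset() the tensor is undefined, i.e. defined() returns false.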
+ */ + void reset() { + storage_.reset(); + } + + // ========================================================================= + // Property Accessors + // ========================================================================= + + /** + * Get the underlying storage. + */ + Storage storage() const { + return storage_; + } + + /** + * Get the total number of bytes for this tensor's data. + */ + size_t nbytes() const { + return numel() * itemsize(); + } + + /** + * Get the size of a single element in bytes. + */ + size_t itemsize() const { + return c10::elementSize(dtype_); + } + + /** + * Get the sizes of all dimensions. + */ + IntArrayRef sizes() const { + return sizes_and_strides_.sizes_arrayref(); + } + + /** + * Get the size of a specific dimension. + */ + int64_t size(int64_t dim) const { + int64_t ndim = static_cast(this->dim()); + ET_CHECK_MSG( + dim >= -ndim && dim < ndim, + "Dimension out of range (expected to be in range of [%ld, %ld], but got %ld)", + -ndim, + ndim - 1, + dim); + if (dim < 0) { + dim += ndim; + } + return sizes_and_strides_.size_at(static_cast(dim)); + } + + /** + * Get the strides of all dimensions. + */ + IntArrayRef strides() const { + return sizes_and_strides_.strides_arrayref(); + } + + /** + * Get the stride of a specific dimension. + */ + int64_t stride(int64_t dim) const { + int64_t ndim = static_cast(this->dim()); + ET_CHECK_MSG( + dim >= -ndim && dim < ndim, + "Dimension out of range (expected to be in range of [%ld, %ld], but got %ld)", + -ndim, + ndim - 1, + dim); + if (dim < 0) { + dim += ndim; + } + return sizes_and_strides_.stride_at(static_cast(dim)); + } + + /** + * Get the scalar type of tensor elements. + */ + c10::ScalarType dtype() const { + return dtype_; + } + + /** + * Get the device where the tensor data resides. + */ + const c10::Device& device() const { + return storage_->device(); + } + + /** + * Get the device type. + */ + c10::DeviceType device_type() const { + return storage_->device().type(); + } + + /** + * Get the device index. + */ + c10::DeviceIndex device_index() const { + return storage_->device().index(); + } + + /** + * Get the storage offset in number of elements. + */ + int64_t storage_offset() const { + return storage_offset_; + } + + /** + * Get the total number of elements. + */ + size_t numel() const { + return numel_; + } + + /** + * Get the number of dimensions. + */ + size_t dim() const { + return sizes_and_strides_.size(); + } + + /** + * Get a pointer to the tensor data, adjusted for storage offset. + */ + void* data_ptr() const { + return static_cast(storage_->data()) + storage_offset_ * itemsize(); + } + + /** + * Check if the tensor is contiguous in memory (row-major order). + */ + bool is_contiguous() const { + return is_contiguous_; + } + + /** + * Check if the tensor has zero elements. + */ + bool is_empty() const { + return numel_ == 0; + } + + /** + * Check if the tensor is on CPU. + */ + bool is_cpu() const { + return device().is_cpu(); + } + + /** + * Check if the tensor is defined (has valid storage). + */ + bool defined() const { + return storage_.get() != nullptr; + } + + // ========================================================================= + // Setters + // ========================================================================= + + /** + * Set the underlying storage. + */ + void set_storage(Storage&& new_storage) { + storage_ = std::move(new_storage); + } + + /** + * Set sizes and strides together. 
+ */ + void set_sizes_and_strides(IntArrayRef sizes, IntArrayRef strides) { + ET_CHECK_MSG( + sizes.size() == strides.size(), + "sizes (%zu) and strides (%zu) must have the same length", + sizes.size(), + strides.size()); + + sizes_and_strides_.set_sizes(sizes); + sizes_and_strides_.set_strides(strides); + + refresh_numel(); + refresh_contiguous(); + } + + // ========================================================================= + // Copy Operation + // ========================================================================= + + /** + * Copy data from another tensor to this tensor. + * + * Both tensors must have the same numel and dtype. + * Currently only supports CPU-to-CPU copy (contiguous tensors only). + * + * @param other The source tensor to copy from + * @return Reference to this tensor + */ + SlimTensor& copy_(const SlimTensor& other) { + ET_CHECK_MSG( + this->numel() == other.numel(), + "copy_: numel mismatch (dst=%zu, src=%zu)", + this->numel(), + other.numel()); + ET_CHECK_MSG(this->dtype() == other.dtype(), "copy_: dtype mismatch"); + + if (this->numel() == 0) { + return *this; + } + + // Current we only support CPU-only tensors + // TODO(gasoonjia): support other device types. + ET_CHECK_MSG( + this->is_cpu() && other.is_cpu(), "copy_: only CPU tensors supported"); + + if (this->is_contiguous() && other.is_contiguous()) { + // Fast path: both tensors are contiguous, use memcpy + std::memcpy(this->data_ptr(), other.data_ptr(), other.nbytes()); + } else { + // Slow path: element-wise copy for non-contiguous tensors + copy_strided_(other); + } + + return *this; + } + + private: + /** + * Element-wise copy for non-contiguous tensors. + */ + void copy_strided_(const SlimTensor& other) { + const size_t elem_size = c10::elementSize(dtype_); + char* dst_data = static_cast(this->data_ptr()); + const char* src_data = static_cast(other.data_ptr()); + + std::vector counter(this->dim(), 0); + for (size_t i = 0; i < this->numel(); i++) { + // Compute source offset + int64_t src_offset = 0; + for (size_t d = 0; d < other.dim(); d++) { + src_offset += counter[d] * other.stride(static_cast(d)); + } + + // Compute destination offset + int64_t dst_offset = 0; + for (size_t d = 0; d < this->dim(); d++) { + dst_offset += counter[d] * this->stride(static_cast(d)); + } + + // Copy single element + std::memcpy( + dst_data + dst_offset * static_cast(elem_size), + src_data + src_offset * static_cast(elem_size), + elem_size); + + // Increment multi-dimensional counter + for (int64_t d = static_cast(this->dim()) - 1; d >= 0; --d) { + counter[d]++; + if (counter[d] < this->size(d)) { + break; + } + counter[d] = 0; + } + } + } + + void refresh_numel() { + numel_ = compute_numel(sizes_and_strides_.sizes_arrayref()); + } + + void refresh_contiguous() { + is_contiguous_ = c10::_compute_contiguous( + sizes_and_strides_.sizes_arrayref(), + sizes_and_strides_.strides_arrayref(), + static_cast(numel_)); + } + + Storage storage_; + int64_t storage_offset_{0}; + c10::SizesAndStrides sizes_and_strides_; + size_t numel_{1}; + c10::ScalarType dtype_; + bool is_contiguous_{true}; +}; + +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index 12de67bf8b1..8c352b74c28 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -17,3 +17,22 @@ def define_common_targets(): "//executorch/runtime/platform:platform", ], ) + + # Header-only library for SlimTensor + runtime.cxx_library( + name = 
"slimtensor", + headers = [ + "SlimTensor.h", + ], + visibility = ["@EXECUTORCH_CLIENTS"], + exported_deps = [ + ":storage", + "//executorch/backends/aoti/slim/c10/core:contiguity", + "//executorch/backends/aoti/slim/c10/core:device", + "//executorch/backends/aoti/slim/c10/core:scalar_type", + "//executorch/backends/aoti/slim/c10/core:sizes_and_strides", + "//executorch/backends/aoti/slim/util:array_ref_util", + "//executorch/backends/aoti/slim/util:size_util", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/backends/aoti/slim/core/test/targets.bzl b/backends/aoti/slim/core/test/targets.bzl index 1bc6029bd2d..4d7ec4b0fbf 100644 --- a/backends/aoti/slim/core/test/targets.bzl +++ b/backends/aoti/slim/core/test/targets.bzl @@ -12,3 +12,25 @@ def define_common_targets(): "//executorch/backends/aoti/slim/core:storage", ], ) + + runtime.cxx_test( + name = "test_slimtensor_basic", + srcs = [ + "test_slimtensor_basic.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/core:storage", + ], + ) + + runtime.cxx_test( + name = "test_slimtensor_copy", + srcs = [ + "test_slimtensor_copy.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/core:storage", + ], + ) diff --git a/backends/aoti/slim/core/test/test_slimtensor_basic.cpp b/backends/aoti/slim/core/test/test_slimtensor_basic.cpp new file mode 100644 index 00000000000..d19a8678725 --- /dev/null +++ b/backends/aoti/slim/core/test/test_slimtensor_basic.cpp @@ -0,0 +1,334 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace executorch::backends::aoti::slim { + +// Helper function to create a CPU storage with given size +Storage make_cpu_storage(size_t nbytes) { + return Storage(new MaybeOwningStorage(CPU_DEVICE, nbytes)); +} + +// Helper function to create a simple 2x3 float tensor +SlimTensor make_2x3_tensor() { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + size_t nbytes = 6 * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + return SlimTensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); +} + +// ============================================================================= +// Constructor Tests +// ============================================================================= + +TEST(SlimTensorBasicTest, DefaultConstructor) { + SlimTensor tensor; + + EXPECT_FALSE(tensor.defined()); + EXPECT_EQ(tensor.numel(), 0u); + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float); + EXPECT_TRUE(tensor.is_contiguous()); +} + +TEST(SlimTensorBasicTest, ConstructWithStorage) { + std::vector sizes = {2, 3, 4}; + std::vector strides = {12, 4, 1}; + size_t nbytes = 24 * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_TRUE(tensor.defined()); + EXPECT_EQ(tensor.dim(), 3u); + EXPECT_EQ(tensor.numel(), 24u); + EXPECT_TRUE(tensor.is_cpu()); + EXPECT_TRUE(tensor.is_contiguous()); +} + +TEST(SlimTensorBasicTest, ConstructWithStorageOffset) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + size_t nbytes = 100 * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + 10); + + EXPECT_EQ(tensor.storage_offset(), 10); +} + +// ============================================================================= +// Property Accessor Tests +// ============================================================================= + +TEST(SlimTensorBasicTest, Sizes) { + SlimTensor tensor = make_2x3_tensor(); + + auto sizes = tensor.sizes(); + EXPECT_EQ(sizes.size(), 2u); + EXPECT_EQ(sizes[0], 2); + EXPECT_EQ(sizes[1], 3); +} + +TEST(SlimTensorBasicTest, SizeAtDim) { + SlimTensor tensor = make_2x3_tensor(); + + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.size(-1), 3); + EXPECT_EQ(tensor.size(-2), 2); +} + +TEST(SlimTensorBasicTest, Strides) { + SlimTensor tensor = make_2x3_tensor(); + + auto strides = tensor.strides(); + EXPECT_EQ(strides.size(), 2u); + EXPECT_EQ(strides[0], 3); + EXPECT_EQ(strides[1], 1); +} + +TEST(SlimTensorBasicTest, StrideAtDim) { + SlimTensor tensor = make_2x3_tensor(); + + EXPECT_EQ(tensor.stride(0), 3); + EXPECT_EQ(tensor.stride(1), 1); + EXPECT_EQ(tensor.stride(-1), 1); + EXPECT_EQ(tensor.stride(-2), 3); +} + +TEST(SlimTensorBasicTest, Dtype) { + SlimTensor tensor = make_2x3_tensor(); + + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float); + EXPECT_EQ(tensor.itemsize(), sizeof(float)); +} + +TEST(SlimTensorBasicTest, Device) { + SlimTensor tensor = make_2x3_tensor(); + + EXPECT_TRUE(tensor.is_cpu()); + EXPECT_EQ(tensor.device_type(), c10::DeviceType::CPU); + EXPECT_EQ(tensor.device_index(), 0); +} + +TEST(SlimTensorBasicTest, Numel) { + SlimTensor tensor = make_2x3_tensor(); + EXPECT_EQ(tensor.numel(), 6u); +} + +TEST(SlimTensorBasicTest, Dim) { + SlimTensor tensor = make_2x3_tensor(); + 
EXPECT_EQ(tensor.dim(), 2u); +} + +TEST(SlimTensorBasicTest, Nbytes) { + SlimTensor tensor = make_2x3_tensor(); + EXPECT_EQ(tensor.nbytes(), 6 * sizeof(float)); +} + +TEST(SlimTensorBasicTest, Itemsize) { + SlimTensor tensor = make_2x3_tensor(); + EXPECT_EQ(tensor.itemsize(), sizeof(float)); +} + +TEST(SlimTensorBasicTest, DataPtr) { + SlimTensor tensor = make_2x3_tensor(); + void* data = tensor.data_ptr(); + EXPECT_NE(data, nullptr); +} + +TEST(SlimTensorBasicTest, DataPtrWithOffset) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + size_t nbytes = 100 * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + void* base = storage->data(); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + 5); + + void* data = tensor.data_ptr(); + EXPECT_EQ(data, static_cast(base) + 5 * sizeof(float)); +} + +TEST(SlimTensorBasicTest, StorageOffset) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + size_t nbytes = 100 * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + 42); + + EXPECT_EQ(tensor.storage_offset(), 42); +} + +// ============================================================================= +// Contiguity Tests +// ============================================================================= + +TEST(SlimTensorBasicTest, IsContiguousTrue) { + SlimTensor tensor = make_2x3_tensor(); + EXPECT_TRUE(tensor.is_contiguous()); +} + +TEST(SlimTensorBasicTest, IsContiguousFalseTransposed) { + std::vector sizes = {3, 2}; + std::vector strides = {1, 3}; + size_t nbytes = 6 * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_FALSE(tensor.is_contiguous()); +} + +TEST(SlimTensorBasicTest, IsContiguousEmptyTensor) { + std::vector sizes = {0, 3}; + std::vector strides = {3, 1}; + size_t nbytes = 0; + Storage storage = make_cpu_storage(nbytes); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_TRUE(tensor.is_empty()); +} + +// ============================================================================= +// State Tests +// ============================================================================= + +TEST(SlimTensorBasicTest, Defined) { + SlimTensor tensor = make_2x3_tensor(); + EXPECT_TRUE(tensor.defined()); +} + +TEST(SlimTensorBasicTest, NotDefined) { + SlimTensor tensor; + EXPECT_FALSE(tensor.defined()); +} + +TEST(SlimTensorBasicTest, IsEmpty) { + std::vector sizes = {0}; + std::vector strides = {1}; + Storage storage = make_cpu_storage(0); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_TRUE(tensor.is_empty()); + EXPECT_EQ(tensor.numel(), 0u); +} + +TEST(SlimTensorBasicTest, Reset) { + SlimTensor tensor = make_2x3_tensor(); + EXPECT_TRUE(tensor.defined()); + + tensor.reset(); + EXPECT_FALSE(tensor.defined()); +} + +// ============================================================================= +// Copy/Move Tests +// ============================================================================= + +TEST(SlimTensorBasicTest, CopyConstructor) { + SlimTensor original = make_2x3_tensor(); + SlimTensor copy = original; + + 
EXPECT_TRUE(copy.defined()); + EXPECT_EQ(copy.dim(), 2u); + EXPECT_EQ(copy.numel(), 6u); + EXPECT_EQ(copy.dtype(), c10::ScalarType::Float); +} + +TEST(SlimTensorBasicTest, MoveConstructor) { + SlimTensor original = make_2x3_tensor(); + SlimTensor moved = std::move(original); + + EXPECT_TRUE(moved.defined()); + EXPECT_EQ(moved.dim(), 2u); + EXPECT_EQ(moved.numel(), 6u); +} + +// ============================================================================= +// Multi-dimensional Tests +// ============================================================================= + +TEST(SlimTensorBasicTest, OneDimensional) { + std::vector sizes = {10}; + std::vector strides = {1}; + Storage storage = make_cpu_storage(10 * sizeof(float)); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_EQ(tensor.dim(), 1u); + EXPECT_EQ(tensor.size(0), 10); + EXPECT_EQ(tensor.stride(0), 1); + EXPECT_TRUE(tensor.is_contiguous()); +} + +TEST(SlimTensorBasicTest, FourDimensional) { + std::vector sizes = {2, 3, 4, 5}; + std::vector strides = {60, 20, 5, 1}; + Storage storage = make_cpu_storage(120 * sizeof(float)); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_EQ(tensor.dim(), 4u); + EXPECT_EQ(tensor.numel(), 120u); + EXPECT_TRUE(tensor.is_contiguous()); +} + +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/test/test_slimtensor_copy.cpp b/backends/aoti/slim/core/test/test_slimtensor_copy.cpp new file mode 100644 index 00000000000..a0adb083808 --- /dev/null +++ b/backends/aoti/slim/core/test/test_slimtensor_copy.cpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace executorch::backends::aoti::slim { + +// Helper function to create a CPU storage with given size +Storage make_cpu_storage(size_t nbytes) { + return Storage(new MaybeOwningStorage(CPU_DEVICE, nbytes)); +} + +// Helper function to create a contiguous float tensor and fill with values +SlimTensor make_filled_tensor( + std::vector sizes, + std::vector strides, + const std::vector& values) { + size_t numel = 1; + for (auto s : sizes) { + numel *= static_cast(s); + } + size_t nbytes = numel * sizeof(float); + Storage storage = make_cpu_storage(nbytes); + + SlimTensor tensor( + std::move(storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + float* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < values.size() && i < numel; ++i) { + data[i] = values[i]; + } + + return tensor; +} + +// ============================================================================= +// Basic Copy Tests +// ============================================================================= + +TEST(SlimTensorCopyTest, CopyContiguousTensors) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + std::vector src_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + SlimTensor src = make_filled_tensor(sizes, strides, src_values); + SlimTensor dst = + make_filled_tensor(sizes, strides, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + dst.copy_(src); + + float* dst_data = static_cast(dst.data_ptr()); + EXPECT_FLOAT_EQ(dst_data[0], 1.0f); + EXPECT_FLOAT_EQ(dst_data[1], 2.0f); + EXPECT_FLOAT_EQ(dst_data[2], 3.0f); + EXPECT_FLOAT_EQ(dst_data[3], 4.0f); + EXPECT_FLOAT_EQ(dst_data[4], 5.0f); + EXPECT_FLOAT_EQ(dst_data[5], 6.0f); +} + +TEST(SlimTensorCopyTest, CopyOneDimensional) { + std::vector sizes = {5}; + std::vector strides = {1}; + std::vector src_values = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f}; + + SlimTensor src = make_filled_tensor(sizes, strides, src_values); + SlimTensor dst = + make_filled_tensor(sizes, strides, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + dst.copy_(src); + + float* dst_data = static_cast(dst.data_ptr()); + for (size_t i = 0; i < 5; ++i) { + EXPECT_FLOAT_EQ(dst_data[i], src_values[i]); + } +} + +TEST(SlimTensorCopyTest, CopyThreeDimensional) { + std::vector sizes = {2, 2, 2}; + std::vector strides = {4, 2, 1}; + std::vector src_values = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + + SlimTensor src = make_filled_tensor(sizes, strides, src_values); + SlimTensor dst = make_filled_tensor( + sizes, strides, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + dst.copy_(src); + + float* dst_data = static_cast(dst.data_ptr()); + for (size_t i = 0; i < 8; ++i) { + EXPECT_FLOAT_EQ(dst_data[i], src_values[i]); + } +} + +TEST(SlimTensorCopyTest, CopyEmptyTensor) { + std::vector sizes = {0, 3}; + std::vector strides = {3, 1}; + Storage storage1 = make_cpu_storage(0); + Storage storage2 = make_cpu_storage(0); + + SlimTensor src( + std::move(storage1), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + SlimTensor dst( + std::move(storage2), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + // Should not crash + dst.copy_(src); + + EXPECT_EQ(dst.numel(), 0u); +} + +TEST(SlimTensorCopyTest, CopyReturnsSelf) { + std::vector sizes = {2, 2}; + std::vector strides = {2, 1}; + std::vector values = {1.0f, 2.0f, 3.0f, 4.0f}; + + SlimTensor src = make_filled_tensor(sizes, strides, values); + SlimTensor dst = make_filled_tensor(sizes, strides, {0.0f, 0.0f, 0.0f, 0.0f}); + 
+ SlimTensor& result = dst.copy_(src); + + EXPECT_EQ(&result, &dst); +} + +// ============================================================================= +// Non-Contiguous Copy Tests +// ============================================================================= + +TEST(SlimTensorCopyTest, CopyNonContiguousSrc) { + // Source is transposed (non-contiguous) + std::vector src_sizes = {2, 3}; + std::vector src_strides = {1, 2}; + + // Allocate storage for 6 elements in transposed layout + Storage src_storage = make_cpu_storage(6 * sizeof(float)); + float* src_data = static_cast(src_storage->data()); + // Physical layout: [0,3] [1,4] [2,5] for logical [0,1,2; 3,4,5] + src_data[0] = 0.0f; + src_data[1] = 3.0f; + src_data[2] = 1.0f; + src_data[3] = 4.0f; + src_data[4] = 2.0f; + src_data[5] = 5.0f; + + SlimTensor src( + std::move(src_storage), + makeArrayRef(src_sizes), + makeArrayRef(src_strides), + c10::ScalarType::Float); + + // Destination is contiguous + std::vector dst_sizes = {2, 3}; + std::vector dst_strides = {3, 1}; + SlimTensor dst = make_filled_tensor( + dst_sizes, dst_strides, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + dst.copy_(src); + + float* dst_data = static_cast(dst.data_ptr()); + EXPECT_FLOAT_EQ(dst_data[0], 0.0f); + EXPECT_FLOAT_EQ(dst_data[1], 1.0f); + EXPECT_FLOAT_EQ(dst_data[2], 2.0f); + EXPECT_FLOAT_EQ(dst_data[3], 3.0f); + EXPECT_FLOAT_EQ(dst_data[4], 4.0f); + EXPECT_FLOAT_EQ(dst_data[5], 5.0f); +} + +TEST(SlimTensorCopyTest, CopyNonContiguousDst) { + // Source is contiguous + std::vector src_sizes = {2, 3}; + std::vector src_strides = {3, 1}; + std::vector values = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + SlimTensor src = make_filled_tensor(src_sizes, src_strides, values); + + // Destination is transposed (non-contiguous) + std::vector dst_sizes = {2, 3}; + std::vector dst_strides = {1, 2}; + Storage dst_storage = make_cpu_storage(6 * sizeof(float)); + + SlimTensor dst( + std::move(dst_storage), + makeArrayRef(dst_sizes), + makeArrayRef(dst_strides), + c10::ScalarType::Float); + + dst.copy_(src); + + float* dst_data = static_cast(dst.storage()->data()); + // After copy, physical layout should be: [0,3] [1,4] [2,5] + EXPECT_FLOAT_EQ(dst_data[0], 0.0f); + EXPECT_FLOAT_EQ(dst_data[1], 3.0f); + EXPECT_FLOAT_EQ(dst_data[2], 1.0f); + EXPECT_FLOAT_EQ(dst_data[3], 4.0f); + EXPECT_FLOAT_EQ(dst_data[4], 2.0f); + EXPECT_FLOAT_EQ(dst_data[5], 5.0f); +} + +// ============================================================================= +// Storage Offset Tests +// ============================================================================= + +TEST(SlimTensorCopyTest, CopyWithStorageOffset) { + // Create a larger storage and use offset + std::vector sizes = {2, 2}; + std::vector strides = {2, 1}; + size_t total_nbytes = 100 * sizeof(float); + + // Source with offset + Storage src_storage = make_cpu_storage(total_nbytes); + float* src_base = static_cast(src_storage->data()); + src_base[10] = 1.0f; + src_base[11] = 2.0f; + src_base[12] = 3.0f; + src_base[13] = 4.0f; + + SlimTensor src( + std::move(src_storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + 10); + + // Destination with different offset + Storage dst_storage = make_cpu_storage(total_nbytes); + SlimTensor dst( + std::move(dst_storage), + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + 20); + + dst.copy_(src); + + float* dst_base = static_cast(dst.storage()->data()); + EXPECT_FLOAT_EQ(dst_base[20], 1.0f); + EXPECT_FLOAT_EQ(dst_base[21], 2.0f); + 
EXPECT_FLOAT_EQ(dst_base[22], 3.0f); + EXPECT_FLOAT_EQ(dst_base[23], 4.0f); +} + +} // namespace executorch::backends::aoti::slim