diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index f0a7649a2b..c67f47cf08 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -16,6 +16,10 @@ concurrency:
   cancel-in-progress: true
 
 on:
+  push:
+    branches:
+      - "main"
+      - "pull-request/[0-9]+"
   schedule:
     # 2:17 AM UTC daily, after the midnight main CI build finishes.
     # Avoid minute 0 because GitHub documents high scheduled-workflow load
@@ -192,6 +196,82 @@ jobs:
       test-mode: nightly-numba-cuda
       matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda"))'
 
+  # ── numba-cuda-mlir tests ──
+
+  test-numba-cuda-mlir-linux-64:
+    name: "Nightly numba-cuda-mlir (linux-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-linux.yml
+    with:
+      build-type: nightly
+      host-platform: linux-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      test-mode: nightly-numba-cuda-mlir
+      matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda-mlir"))'
+
+  test-numba-cuda-mlir-windows:
+    name: "Nightly numba-cuda-mlir (win-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-windows.yml
+    with:
+      build-type: nightly
+      host-platform: win-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      test-mode: nightly-numba-cuda-mlir
+      matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda-mlir"))'
+
+  # ── Released cuda-core against main pathfinder/bindings ──
+
+  test-cuda-core-linux-64:
+    name: "Nightly cuda-core (linux-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-linux.yml
+    with:
+      build-type: nightly
+      host-platform: linux-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      test-mode: nightly-cuda-core
+      matrix_filter: 'map(select(.ENV.MODE == "nightly-cuda-core"))'
+
+  test-cuda-core-windows:
+    name: "Nightly cuda-core (win-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-windows.yml
+    with:
+      build-type: nightly
+      host-platform: win-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      test-mode: nightly-cuda-core
+      matrix_filter: 'map(select(.ENV.MODE == "nightly-cuda-core"))'
+
   # ── Standard tests on nightly-only runners ──
 
   test-standard-linux-aarch64:
@@ -226,6 +306,10 @@ jobs:
       - test-numba-cuda-linux-64
       - test-numba-cuda-linux-aarch64
       - test-numba-cuda-windows
+      - test-numba-cuda-mlir-linux-64
+      - test-numba-cuda-mlir-windows
+      - test-cuda-core-linux-64
+      - test-cuda-core-windows
       - test-standard-linux-aarch64
     steps:
       - name: Exit
@@ -250,6 +334,14 @@ jobs:
                  needs.test-numba-cuda-linux-aarch64.result == 'failure' ||
                  needs.test-numba-cuda-windows.result == 'cancelled' ||
                  needs.test-numba-cuda-windows.result == 'failure' ||
+                 needs.test-numba-cuda-mlir-linux-64.result == 'cancelled' ||
+                 needs.test-numba-cuda-mlir-linux-64.result == 'failure' ||
+                 needs.test-numba-cuda-mlir-windows.result == 'cancelled' ||
+                 needs.test-numba-cuda-mlir-windows.result == 'failure' ||
+                 needs.test-cuda-core-linux-64.result == 'cancelled' ||
+                 needs.test-cuda-core-linux-64.result == 'failure' ||
+                 needs.test-cuda-core-windows.result == 'cancelled' ||
+                 needs.test-cuda-core-windows.result == 'failure' ||
                  needs.test-standard-linux-aarch64.result == 'cancelled' ||
                  needs.test-standard-linux-aarch64.result == 'failure' }}; then
             exit 1
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 982d4d1c49..16af4b15d3 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -37,8 +37,9 @@ on:
         default: ''
       test-mode:
         description: >
-          Test mode: 'standard' (default), 'nightly-pytorch', or
-          'nightly-numba-cuda'.
+          Test mode: 'standard' (default), 'nightly-pytorch',
+          'nightly-numba-cuda', 'nightly-numba-cuda-mlir', or
+          'nightly-cuda-core'.
         type: string
         default: 'standard'
       sha:
@@ -409,6 +410,20 @@ jobs:
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: run-tests nightly-numba-cuda
 
+      - name: Install cuda-python wheels + numba-cuda-mlir
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: run-tests nightly-numba-cuda-mlir
+
+      - name: Install main pathfinder/bindings + released cuda-core
+        if: ${{ inputs.test-mode == 'nightly-cuda-core' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: run-tests nightly-cuda-core
+
       # ── Nightly: run tests ──
       - name: Run PyTorch interop tests
         if: ${{ inputs.test-mode == 'nightly-pytorch' }}
@@ -420,3 +435,98 @@ jobs:
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         run: python -m numba.runtests numba.cuda.tests
+
+      - name: Checkout numba-cuda-mlir tests at matching tag
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }}
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+        with:
+          repository: NVIDIA/numba-cuda-mlir
+          ref: v${{ env.NUMBA_CUDA_MLIR_VER }}
+          path: numba-cuda-mlir-released
+
+      - name: Run numba-cuda-mlir tests
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }}
+        run: |
+          pushd numba-cuda-mlir-released
+          # Install this tag's test deps (pytest + plugins + ml-dtypes + ...).
+          pip install --upgrade "pip>=25.1"
+          pip install --group test
+          # Skip tests/benchmarks/ and tests/doc_examples/ — they import the
+          # numba package at collection time, which cuSIMT intentionally does
+          # not depend on. See NVIDIA/numba-cuda-mlir#136.
+          #
+          # Version-gated deselects: when a newer numba-cuda-mlir release
+          # ships with the referenced fix, the guard evaluates false and the
+          # tests get run automatically. If they still fail on the newer
+          # version we hear about it loudly (rather than silently masking).
+          DESELECTS=()
+          if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${NUMBA_CUDA_MLIR_VER}') <= Version('0.4.0') else 1)"; then
+            # NVIDIA/numba-cuda-mlir#135: serial-pytest contamination of
+            # numba_cuda_mlir.cuda.cudadrv from an xfailed test in
+            # test_nrt_comprehensive.py contaminates any later test that
+            # touches cuda.cudadrv.driver. Upstream CI hides it via
+            # `-n auto --dist loadscope`. Which specific tests fail depends
+            # on collection order (we saw different subsets on linux-64 vs
+            # win-64 across runs), so we deselect the union of all tests
+            # #135 lists as vulnerable + test_fortran_contiguous (observed
+            # to hit the same contamination in our runs).
+            #
+            # test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn:
+            # subprocess-invokes `cuobjdump`, not on PATH in the base
+            # ubuntu:24.04 container. (Linux-only; Windows runners ship
+            # cuobjdump with the local CTK. No upstream fix yet — pending
+            # a skip-guard bug to be filed against NVIDIA/numba-cuda-mlir.)
+            DESELECTS+=(
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream'
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream'
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous'
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_nvjitlink.py::TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn'
+            )
+          fi
+          pytest -rxXs -v --durations=0 \
+            --ignore=tests/benchmarks \
+            --ignore=tests/doc_examples \
+            "${DESELECTS[@]}" \
+            tests/
+          popd
+
+      - name: Checkout released cuda-core tests at matching tag
+        if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }}
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+        with:
+          ref: cuda-core-v${{ env.CUDA_CORE_RELEASED_VER }}
+          path: cuda-core-released
+
+      - name: Run released cuda-core tests
+        if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }}
+        run: |
+          pushd cuda-core-released/cuda_core
+          # Install the released tag's test group so we exercise the exact deps
+          # that cuda-core version shipped with.
+          pip install --upgrade "pip>=25.1"
+          pip install --group "${CUDA_CORE_TEST_GROUP}"
+          # Cap pytest below 9.1: released cuda-core <=1.0.1 has parametrize
+          # patterns that pytest 9.1 rejects; the main-side fix (#2212) has
+          # not yet shipped in a cuda-core release.
+          pip install "pytest<9.1"
+          # Version-gated deselect: drops automatically when a newer
+          # cuda-core release with the wrapper-mapping update ships.
+          DESELECTS=()
+          if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${CUDA_CORE_RELEASED_VER}') <= Version('1.0.1') else 1)"; then
+            # NvlinkVersion: v1.0.1's wrapper mapping predates
+            # NvlinkVersion.VERSION_6_0 which main cuda-bindings adds.
+            # Expected drift on this mode until released cuda-core catches up.
+            DESELECTS+=(
+              --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]'
+            )
+          fi
+          pytest -rxXs -v --durations=0 --randomly-dont-reorganize \
+            "${DESELECTS[@]}" \
+            tests/
+          popd
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 2ad263297e..0d9dc78d5d 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -37,8 +37,9 @@ on:
         default: ''
       test-mode:
         description: >
-          Test mode: 'standard' (default), 'nightly-pytorch', or
-          'nightly-numba-cuda'.
+          Test mode: 'standard' (default), 'nightly-pytorch',
+          'nightly-numba-cuda', 'nightly-numba-cuda-mlir', or
+          'nightly-cuda-core'.
         type: string
         default: 'standard'
       sha:
@@ -388,6 +389,22 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests nightly-numba-cuda
 
+      - name: Install cuda-python wheels + numba-cuda-mlir
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: run-tests nightly-numba-cuda-mlir
+
+      - name: Install main pathfinder/bindings + released cuda-core
+        if: ${{ inputs.test-mode == 'nightly-cuda-core' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: run-tests nightly-cuda-core
+
       # ── Nightly: run tests ──
       - name: Run PyTorch interop tests
         if: ${{ inputs.test-mode == 'nightly-pytorch' }}
@@ -401,3 +418,85 @@ jobs:
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: python -m numba.runtests numba.cuda.tests
+
+      - name: Checkout numba-cuda-mlir tests at matching tag
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }}
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+        with:
+          repository: NVIDIA/numba-cuda-mlir
+          ref: v${{ env.NUMBA_CUDA_MLIR_VER }}
+          path: numba-cuda-mlir-released
+
+      - name: Run numba-cuda-mlir tests
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pushd numba-cuda-mlir-released
+          # `pip install --group` needs pip>=25.1. Upgrade via `python -m pip`:
+          # on Windows pip.exe refuses to replace itself while it is running.
+          python -m pip install --upgrade "pip>=25.1"
+          pip install --group test
+          # Version-gated deselects, dropped automatically once a release newer
+          # than 0.4.0 ships (see the linux step for the full rationale).
+          # NVIDIA/numba-cuda-mlir#135 poisons a subset of tests that varies
+          # with collection order, so deselect the full union of them.
+          DESELECTS=()
+          if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${NUMBA_CUDA_MLIR_VER}') <= Version('0.4.0') else 1)"; then
+            DESELECTS+=(
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream'
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream'
+              --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams'
+              --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous'
+            )
+          fi
+          pytest -rxXs -v --durations=0 \
+            --ignore=tests/benchmarks \
+            --ignore=tests/doc_examples \
+            "${DESELECTS[@]}" \
+            tests/
+          popd
+
+      - name: Checkout released cuda-core tests at matching tag
+        if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }}
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+        with:
+          ref: cuda-core-v${{ env.CUDA_CORE_RELEASED_VER }}
+          path: cuda-core-released
+
+      - name: Run released cuda-core tests
+        if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pushd cuda-core-released/cuda_core
+          # `pip install --group` needs pip>=25.1. Upgrade via `python -m pip`:
+          # on Windows pip.exe refuses to replace itself while it is running.
+          python -m pip install --upgrade "pip>=25.1"
+          pip install --group "${CUDA_CORE_TEST_GROUP}"
+          # Cap pytest below 9.1 — released cuda-core <=1.0.1 has parametrize
+          # patterns that pytest 9.1 rejects (see #2212).
+          pip install "pytest<9.1"
+          # Version-gated deselects, dropped automatically when a newer cuda-core
+          # release ships. NvlinkVersion: see the linux step. Windows-only ones:
+          # - test_rlcompleter_patch: env-dependent expectation that passes on
+          #   Linux but fails on Windows MCDM.
+          # - test_non_managed_resources_report_not_managed[pinned]: the same
+          #   MCDM mempool OOM that v1.0.1 already xfails in
+          #   test_pinned_memory_resource_initialization (TODO(#9999)); main
+          #   fixed the parametrized case via #2139 but v1.0.1 lacks the fix.
+          DESELECTS=()
+          if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${CUDA_CORE_RELEASED_VER}') <= Version('1.0.1') else 1)"; then
+            DESELECTS+=(
+              --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]'
+              --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive'
+              --deselect 'tests/test_memory.py::test_non_managed_resources_report_not_managed[pinned]'
+            )
+          fi
+          pytest -rxXs -v --durations=0 --randomly-dont-reorganize \
+            "${DESELECTS[@]}" \
+            tests/
+          popd
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index c9eafd4f52..cd0944f200 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -29,7 +29,7 @@
 #         subsequent steps (including the cuda.bindings and cuda.core test
 #         steps). Nightly rows also use ENV.MODE as a matrix-filter tag (see
 #         ci-nightly.yml). Examples:
-#           ENV: { CUDA_PYTHON_PER_THREAD_DEFAULT_STREAM: '1' }
+#           ENV: { CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM: '1' }
 #           ENV: { MODE: 'nightly-pytorch', TORCH_VER: '2.12.1', TORCH_CUDA: 'cu126' }
 
 linux:
@@ -41,7 +41,7 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.11',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.11',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.11',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', ENV: { CUDA_PYTHON_PER_THREAD_DEFAULT_STREAM: '1' } }
+    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', ENV: { CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM: '1' } }
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100',       GPU_COUNT: '1', DRIVER: 'latest' }
@@ -96,6 +96,11 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: '580.65.06', ENV: { MODE: 'nightly-numba-cuda' } }
     - { ARCH: 'arm64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-numba-cuda' } }
     - { ARCH: 'arm64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-numba-cuda' } }
+    # nightly-numba-cuda-mlir (MLIR backend; linux-64 rows only here, no linux-aarch64)
+    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-numba-cuda-mlir' } }
+    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-numba-cuda-mlir' } }
+    # nightly-cuda-core (released cuda-core from PyPI against main pathfinder/bindings)
+    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-cuda-core' } }
     # nightly-standard (arm64 nightly-only runners — per runner team request)
     # TODO: gh200 row disabled — currently hangs on stream-ordered memory
     #       allocator (cudaMallocAsync); runner pool needs fixing first.
@@ -117,7 +122,7 @@ windows:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
-    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { CUDA_PYTHON_PER_THREAD_DEFAULT_STREAM: '1' } }
+    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM: '1' } }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
@@ -136,3 +141,8 @@ windows:
     # nightly-numba-cuda
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest',  DRIVER_MODE: 'TCC', ENV: { MODE: 'nightly-numba-cuda' } }
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: '596.36',  DRIVER_MODE: 'TCC', ENV: { MODE: 'nightly-numba-cuda' } }
+    # nightly-numba-cuda-mlir (MLIR backend, win-64)
+    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest',  DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-numba-cuda-mlir' } }
+    - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest',  DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-numba-cuda-mlir' } }
+    # nightly-cuda-core (released cuda-core from PyPI against main pathfinder/bindings)
+    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest',  DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-cuda-core' } }
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 1ca54ba820..c5c0cc1233 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -93,10 +93,9 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
     PATHFINDER_WHL=($(realpath ./cuda_pathfinder/*.whl))
   fi
 
-  # pushd so --group reads test dependency groups from cuda_core/pyproject.toml.
-  pushd ./cuda_core
-
   if [[ "${test_module}" == "core" ]]; then
+    # pushd so --group reads test dependency groups from cuda_core/pyproject.toml.
+    pushd ./cuda_core
     echo "Installing bindings (source: ${BINDINGS_SOURCE})"
     pip install "${BINDINGS_ARGS[@]}"
     echo "Installing core wheel"
@@ -112,10 +111,28 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
     if [[ "${SKIP_CYTHON_TEST}" == 0 ]]; then
       ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
     fi
+    popd
+  elif [[ "${test_module}" == "nightly-cuda-core" ]]; then
+    # Test the *released* cuda-core (from PyPI) against *main*-built pathfinder
+    # and cuda-bindings. The workflow then checks out the matching
+    # cuda-core-v<X.Y.Z> tag to run that release's own test suite (not shipped
+    # in the wheel); its steps are skipped when the version is empty, so abort.
+    echo "Installing pathfinder + bindings from main + released cuda-core from PyPI"
+    pip install "${PATHFINDER_WHL[@]}" "${BINDINGS_ARGS[@]}" "cuda-core[cu${TEST_CUDA_MAJOR}]"
+    released_ver=$(pip show cuda-core | awk '/^Version:/{print $2}')
+    : "${released_ver:?cuda-core version not found; the workflow would silently skip its tests}"
+    if [[ -n "${GITHUB_ENV:-}" ]]; then
+      echo "CUDA_CORE_RELEASED_VER=${released_ver}" >> "${GITHUB_ENV}"
+      echo "CUDA_CORE_TEST_GROUP=test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" >> "${GITHUB_ENV}"
+    fi
+    echo "Installed packages before released cuda-core tests:"
+    pip list
   else
-    # Nightly optional-dependency testing.
-    # Install ALL wheels (pathfinder + bindings + core) and the optional dep
-    # in a single pip call so pip resolves version constraints in one shot.
+    # Nightly optional-dependency testing: nightly-pytorch, nightly-numba-cuda,
+    # nightly-numba-cuda-mlir. Install ALL cuda-python wheels (pathfinder +
+    # bindings + core) and the optional dep in a single pip call so pip resolves
+    # version constraints in one shot.
+    pushd ./cuda_core
     PIP_ARGS=(
       "${PATHFINDER_WHL[@]}"
       "${BINDINGS_ARGS[@]}"
@@ -144,12 +161,26 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
         "cupy-cuda${TEST_CUDA_MAJOR}x"
         psutil cffi pytest-xdist pytest-benchmark filecheck ml_dtypes statistics
       )
+    elif [[ "${test_module}" == "nightly-numba-cuda-mlir" ]]; then
+      echo "Installing pathfinder + bindings + core + numba-cuda-mlir"
+      # numpy<2.5: numba-cuda-mlir 0.4.0 registers np.row_stack, which was
+      # removed in NumPy 2.5. See NVIDIA/numba-cuda-mlir#154.
+      PIP_ARGS+=("numba-cuda-mlir[cu${TEST_CUDA_MAJOR}]" "numpy<2.5")
     fi
 
     pip install "${PIP_ARGS[@]}"
     echo "Nightly install complete — installed packages:"
     pip list
+    popd
+
+    if [[ "${test_module}" == "nightly-numba-cuda-mlir" ]]; then
+      # Export the installed version so the workflow can check out the matching
+      # NVIDIA/numba-cuda-mlir v<X.Y.Z> tag (the wheel ships no test_*.py files).
+      installed_ver=$(pip show numba-cuda-mlir | awk '/^Version:/{print $2}')
+      : "${installed_ver:?numba-cuda-mlir version not found; the workflow would silently skip its tests}"
+      if [[ -n "${GITHUB_ENV:-}" ]]; then
+        echo "NUMBA_CUDA_MLIR_VER=${installed_ver}" >> "${GITHUB_ENV}"
+      fi
+    fi
   fi
-
-  popd
 fi