From 05be7d20e58877ee951141840cb2de3765e14077 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 03:53:24 +0000 Subject: [PATCH 01/13] CI: add nightly-cuda-core and nightly-numba-cuda-mlir modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nightly-cuda-core: test the released cuda-core from PyPI against main-built pathfinder and cuda-bindings, catching the "core released × bindings main" gap documented in issue #1955. Runs on linux-64 (a100) and win-64 (a100 MCDM). nightly-numba-cuda-mlir: MLIR-backend companion to nightly-numba-cuda. Installs main pathfinder+bindings+core plus numba-cuda-mlir from PyPI, runs numba-cuda-mlir's own test suite from the matching git tag. Linux amd64/arm64 x CUDA 12.9.1 / 13.3.0. Both modes fetch the released version's tests from git tags because the respective wheels do not ship test_*.py files. Includes tag-not-found fallback (log warning + exit 0) to avoid red-lining the nightly on a freshly-cut PyPI release that hasn't been pushed to git yet. 
--- .github/workflows/ci-nightly.yml | 88 ++++++++++++++++++++++++ .github/workflows/test-wheel-linux.yml | 33 ++++++++- .github/workflows/test-wheel-windows.yml | 20 +++++- ci/test-matrix.yml | 9 +++ ci/tools/run-tests | 72 ++++++++++++++++--- 5 files changed, 210 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml index f0a7649a2be..8f752b23ba4 100644 --- a/.github/workflows/ci-nightly.yml +++ b/.github/workflows/ci-nightly.yml @@ -192,6 +192,82 @@ jobs: test-mode: nightly-numba-cuda matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda"))' + # ── numba-cuda-mlir tests ── + + test-numba-cuda-mlir-linux-64: + name: "Nightly numba-cuda-mlir (linux-64)" + if: ${{ github.repository_owner == 'nvidia' }} + needs: find-wheels + permissions: + contents: read + actions: read + secrets: inherit + uses: ./.github/workflows/test-wheel-linux.yml + with: + build-type: nightly + host-platform: linux-64 + build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }} + run-id: ${{ needs.find-wheels.outputs.RUN_ID }} + sha: ${{ needs.find-wheels.outputs.HEAD_SHA }} + test-mode: nightly-numba-cuda-mlir + matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda-mlir"))' + + test-numba-cuda-mlir-linux-aarch64: + name: "Nightly numba-cuda-mlir (linux-aarch64)" + if: ${{ github.repository_owner == 'nvidia' }} + needs: find-wheels + permissions: + contents: read + actions: read + secrets: inherit + uses: ./.github/workflows/test-wheel-linux.yml + with: + build-type: nightly + host-platform: linux-aarch64 + build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }} + run-id: ${{ needs.find-wheels.outputs.RUN_ID }} + sha: ${{ needs.find-wheels.outputs.HEAD_SHA }} + test-mode: nightly-numba-cuda-mlir + matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda-mlir"))' + + # ── Released cuda-core against main pathfinder/bindings ── + + test-cuda-core-linux-64: + name: "Nightly cuda-core (linux-64)" + if: ${{ 
github.repository_owner == 'nvidia' }} + needs: find-wheels + permissions: + contents: read + actions: read + secrets: inherit + uses: ./.github/workflows/test-wheel-linux.yml + with: + build-type: nightly + host-platform: linux-64 + build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }} + run-id: ${{ needs.find-wheels.outputs.RUN_ID }} + sha: ${{ needs.find-wheels.outputs.HEAD_SHA }} + test-mode: nightly-cuda-core + matrix_filter: 'map(select(.ENV.MODE == "nightly-cuda-core"))' + + test-cuda-core-windows: + name: "Nightly cuda-core (win-64)" + if: ${{ github.repository_owner == 'nvidia' }} + needs: find-wheels + permissions: + contents: read + actions: read + secrets: inherit + uses: ./.github/workflows/test-wheel-windows.yml + with: + build-type: nightly + host-platform: win-64 + build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }} + run-id: ${{ needs.find-wheels.outputs.RUN_ID }} + sha: ${{ needs.find-wheels.outputs.HEAD_SHA }} + test-mode: nightly-cuda-core + matrix_filter: 'map(select(.ENV.MODE == "nightly-cuda-core"))' + # ── Standard tests on nightly-only runners ── test-standard-linux-aarch64: @@ -226,6 +302,10 @@ jobs: - test-numba-cuda-linux-64 - test-numba-cuda-linux-aarch64 - test-numba-cuda-windows + - test-numba-cuda-mlir-linux-64 + - test-numba-cuda-mlir-linux-aarch64 + - test-cuda-core-linux-64 + - test-cuda-core-windows - test-standard-linux-aarch64 steps: - name: Exit @@ -250,6 +330,14 @@ jobs: needs.test-numba-cuda-linux-aarch64.result == 'failure' || needs.test-numba-cuda-windows.result == 'cancelled' || needs.test-numba-cuda-windows.result == 'failure' || + needs.test-numba-cuda-mlir-linux-64.result == 'cancelled' || + needs.test-numba-cuda-mlir-linux-64.result == 'failure' || + needs.test-numba-cuda-mlir-linux-aarch64.result == 'cancelled' || + needs.test-numba-cuda-mlir-linux-aarch64.result == 'failure' || + needs.test-cuda-core-linux-64.result == 'cancelled' || + needs.test-cuda-core-linux-64.result == 'failure' || + 
needs.test-cuda-core-windows.result == 'cancelled' || + needs.test-cuda-core-windows.result == 'failure' || needs.test-standard-linux-aarch64.result == 'cancelled' || needs.test-standard-linux-aarch64.result == 'failure' }}; then exit 1 diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 982d4d1c491..8b4899afccf 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -37,8 +37,9 @@ on: default: '' test-mode: description: > - Test mode: 'standard' (default), 'nightly-pytorch', or - 'nightly-numba-cuda'. + Test mode: 'standard' (default), 'nightly-pytorch', + 'nightly-numba-cuda', 'nightly-numba-cuda-mlir', or + 'nightly-cuda-core'. type: string default: 'standard' sha: @@ -409,6 +410,20 @@ jobs: LOCAL_CTK: ${{ matrix.LOCAL_CTK }} run: run-tests nightly-numba-cuda + - name: Install cuda-python wheels + numba-cuda-mlir + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + run: run-tests nightly-numba-cuda-mlir + + - name: Install main pathfinder/bindings + released cuda-core + if: ${{ inputs.test-mode == 'nightly-cuda-core' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + run: run-tests nightly-cuda-core + # ── Nightly: run tests ── - name: Run PyTorch interop tests if: ${{ inputs.test-mode == 'nightly-pytorch' }} @@ -420,3 +435,17 @@ jobs: - name: Run numba-cuda tests if: ${{ inputs.test-mode == 'nightly-numba-cuda' }} run: python -m numba.runtests numba.cuda.tests + + - name: Run numba-cuda-mlir tests + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_TESTS_DIR != '' }} + run: | + pushd "${NUMBA_CUDA_MLIR_TESTS_DIR}" + pytest -rxXs -v --durations=0 tests/ + popd + + - name: Run released cuda-core tests + if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_TESTS_DIR != '' }} + run: | + pushd 
"${CUDA_CORE_RELEASED_TESTS_DIR}/cuda_core" + pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ + popd diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 2ad263297eb..da80fbc20e6 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -37,8 +37,8 @@ on: default: '' test-mode: description: > - Test mode: 'standard' (default), 'nightly-pytorch', or - 'nightly-numba-cuda'. + Test mode: 'standard' (default), 'nightly-pytorch', + 'nightly-numba-cuda', or 'nightly-cuda-core'. type: string default: 'standard' sha: @@ -388,6 +388,14 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: run-tests nightly-numba-cuda + - name: Install main pathfinder/bindings + released cuda-core + if: ${{ inputs.test-mode == 'nightly-cuda-core' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: run-tests nightly-cuda-core + # ── Nightly: run tests ── - name: Run PyTorch interop tests if: ${{ inputs.test-mode == 'nightly-pytorch' }} @@ -401,3 +409,11 @@ jobs: if: ${{ inputs.test-mode == 'nightly-numba-cuda' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: python -m numba.runtests numba.cuda.tests + + - name: Run released cuda-core tests + if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_TESTS_DIR != '' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pushd "${CUDA_CORE_RELEASED_TESTS_DIR}/cuda_core" + pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ + popd diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index c9eafd4f521..f0e0668828c 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -96,6 +96,13 @@ linux: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: 
'12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda' } } + # nightly-numba-cuda-mlir (MLIR backend, mirrors nightly-numba-cuda coverage) + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + # nightly-cuda-core (released cuda-core from PyPI against main pathfinder/bindings) + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-cuda-core' } } # nightly-standard (arm64 nightly-only runners — per runner team request) # TODO: gh200 row disabled — currently hangs on stream-ordered memory # allocator (cudaMallocAsync); runner pool needs fixing first. 
@@ -136,3 +143,5 @@ windows: # nightly-numba-cuda - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC', ENV: { MODE: 'nightly-numba-cuda' } } + # nightly-cuda-core (released cuda-core from PyPI against main pathfinder/bindings) + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-cuda-core' } } diff --git a/ci/tools/run-tests b/ci/tools/run-tests index 1ca54ba8207..bcb487d23f8 100755 --- a/ci/tools/run-tests +++ b/ci/tools/run-tests @@ -93,10 +93,9 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then PATHFINDER_WHL=($(realpath ./cuda_pathfinder/*.whl)) fi - # pushd so --group reads test dependency groups from cuda_core/pyproject.toml. - pushd ./cuda_core - if [[ "${test_module}" == "core" ]]; then + # pushd so --group reads test dependency groups from cuda_core/pyproject.toml. + pushd ./cuda_core echo "Installing bindings (source: ${BINDINGS_SOURCE})" pip install "${BINDINGS_ARGS[@]}" echo "Installing core wheel" @@ -112,10 +111,44 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then if [[ "${SKIP_CYTHON_TEST}" == 0 ]]; then ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython fi + popd + elif [[ "${test_module}" == "nightly-cuda-core" ]]; then + # Test the *released* cuda-core (from PyPI) against *main*-built pathfinder + # and cuda-bindings. Fetches the released cuda-core's own test suite from + # the matching git tag so we exercise its full compiled surface, not the + # current main tests (which may reference APIs the released core doesn't + # have). 
+ echo "Installing pathfinder + bindings from main + released cuda-core from PyPI" + pip install "${PATHFINDER_WHL[@]}" "${BINDINGS_ARGS[@]}" "cuda-core[cu${TEST_CUDA_MAJOR}]" + + released_ver=$(pip show cuda-core | awk '/^Version:/{print $2}') + tag="cuda-core-v${released_ver}" + tests_dir="${TMPDIR:-/tmp}/cuda-core-released-${released_ver}" + echo "Fetching cuda-core tests from tag ${tag}" + if ! git fetch origin --tags "+refs/tags/${tag}:refs/tags/${tag}" >/dev/null 2>&1; then + echo "Warning: tag ${tag} not fetchable; skipping released cuda-core tests" + exit 0 + fi + mkdir -p "${tests_dir}" + git archive "${tag}" cuda_core | tar -x -C "${tests_dir}" + + # Install test deps from the released tag's pyproject.toml, not main's. + pushd "${tests_dir}/cuda_core" + echo "Installing cuda-core test deps from tag ${tag}" + pip install --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" + popd + + if [[ -n "${GITHUB_ENV:-}" ]]; then + echo "CUDA_CORE_RELEASED_TESTS_DIR=${tests_dir}" >> "${GITHUB_ENV}" + fi + echo "Installed packages before released cuda-core tests:" + pip list else - # Nightly optional-dependency testing. - # Install ALL wheels (pathfinder + bindings + core) and the optional dep - # in a single pip call so pip resolves version constraints in one shot. + # Nightly optional-dependency testing: nightly-pytorch, nightly-numba-cuda, + # nightly-numba-cuda-mlir. Install ALL cuda-python wheels (pathfinder + + # bindings + core) and the optional dep in a single pip call so pip resolves + # version constraints in one shot. 
+ pushd ./cuda_core PIP_ARGS=( "${PATHFINDER_WHL[@]}" "${BINDINGS_ARGS[@]}" @@ -144,12 +177,35 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then "cupy-cuda${TEST_CUDA_MAJOR}x" psutil cffi pytest-xdist pytest-benchmark filecheck ml_dtypes statistics ) + elif [[ "${test_module}" == "nightly-numba-cuda-mlir" ]]; then + echo "Installing pathfinder + bindings + core + numba-cuda-mlir" + PIP_ARGS+=("numba-cuda-mlir[cu${TEST_CUDA_MAJOR}]") fi pip install "${PIP_ARGS[@]}" echo "Nightly install complete — installed packages:" pip list + popd + + if [[ "${test_module}" == "nightly-numba-cuda-mlir" ]]; then + # Fetch numba-cuda-mlir's own test suite from the matching git tag — + # the wheel does not ship test_*.py files. + installed_ver=$(pip show numba-cuda-mlir | awk '/^Version:/{print $2}') + tag="v${installed_ver}" + tests_dir="${TMPDIR:-/tmp}/numba-cuda-mlir-${installed_ver}" + echo "Cloning numba-cuda-mlir tests at tag ${tag}" + if git clone --depth 1 --branch "${tag}" https://github.com/NVIDIA/numba-cuda-mlir "${tests_dir}"; then + pushd "${tests_dir}" + # --group requires pip 25.1+; Ubuntu 24.04 stock ships older. + pip install --upgrade "pip>=25.1" + pip install --group test + popd + if [[ -n "${GITHUB_ENV:-}" ]]; then + echo "NUMBA_CUDA_MLIR_TESTS_DIR=${tests_dir}" >> "${GITHUB_ENV}" + fi + else + echo "Warning: numba-cuda-mlir tag ${tag} not clonable; skipping tests" + fi + fi fi - - popd fi From c39bc71f514f4e1f6d25860fd047651295d37292 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 04:00:11 +0000 Subject: [PATCH 02/13] ci/test-matrix.yml: fix CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM typo The two ENV overrides intended to exercise the per-thread default stream code path were misspelled (missing the CUDA_ segment), so the env var was silently ignored and the PTDS coverage added in #1972 had no effect. Rename to the correct CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM. Refs #971. 
--- ci/test-matrix.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index f0e0668828c..ad036e5b8ca 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -29,7 +29,7 @@ # subsequent steps (including the cuda.bindings and cuda.core test # steps). Nightly rows also use ENV.MODE as a matrix-filter tag (see # ci-nightly.yml). Examples: -# ENV: { CUDA_PYTHON_PER_THREAD_DEFAULT_STREAM: '1' } +# ENV: { CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM: '1' } # ENV: { MODE: 'nightly-pytorch', TORCH_VER: '2.12.1', TORCH_CUDA: 'cu126' } linux: @@ -41,7 +41,7 @@ linux: - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { CUDA_PYTHON_PER_THREAD_DEFAULT_STREAM: '1' } } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM: '1' } } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } @@ -124,7 +124,7 @@ windows: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { 
ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { CUDA_PYTHON_PER_THREAD_DEFAULT_STREAM: '1' } } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM: '1' } } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } From 2a42aa7d0bba0a58b3fe849b7318d5ca4189e8a8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 04:00:11 +0000 Subject: [PATCH 03/13] cuda_pathfinder: pin nvshmem to <3.7 (was previously excluding only 3.7.0) nvidia-nvshmem-cu{12,13} 3.7.x breaks the main branch, not only 3.7.0. Widen the exclusion from an exact-version bump to <3.7 so 3.7.x and above are avoided until we can move forward. 
--- cuda_pathfinder/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index 87c9ca4205c..86f33b726e9 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -29,7 +29,7 @@ cu12 = [ "nvidia-cusparselt-cu12", "nvidia-libmathdx-cu12", "nvidia-nccl-cu12; sys_platform != 'win32'", - "nvidia-nvshmem-cu12!=3.7.0; sys_platform != 'win32'", + "nvidia-nvshmem-cu12<3.7; sys_platform != 'win32'", ] cu13 = [ "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,cupti,profiler,nvvm]==13.*", @@ -43,7 +43,7 @@ cu13 = [ "nvidia-cusparselt-cu13", "nvidia-libmathdx-cu13", "nvidia-nccl-cu13; sys_platform != 'win32'", - "nvidia-nvshmem-cu13!=3.7.0; sys_platform != 'win32'", + "nvidia-nvshmem-cu13<3.7; sys_platform != 'win32'", ] host = [ # TODO: remove the Python 3.15 guard once 3.15 is officially supported From 60dac9017fd8fdb8520d9c88d2775d6e968df7e8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 04:15:23 +0000 Subject: [PATCH 04/13] nightly-numba-cuda-mlir: swap arm64 for win-64 coverage, use rtxpro6000 Drop the linux-aarch64 rows and instead add win-64 coverage with the same CUDA 12.9.1 / 13.3.0 pair. Switch all four rows from GPU l4 to rtxpro6000. Windows rows use DRIVER_MODE MCDM, matching the existing rtxpro6000 CUDA 13.3.0 patterns. 
--- .github/workflows/ci-nightly.yml | 14 +++++++------- .github/workflows/test-wheel-windows.yml | 19 ++++++++++++++++++- ci/test-matrix.yml | 11 ++++++----- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml index 8f752b23ba4..f05318707ab 100644 --- a/.github/workflows/ci-nightly.yml +++ b/.github/workflows/ci-nightly.yml @@ -212,18 +212,18 @@ jobs: test-mode: nightly-numba-cuda-mlir matrix_filter: 'map(select(.ENV.MODE == "nightly-numba-cuda-mlir"))' - test-numba-cuda-mlir-linux-aarch64: - name: "Nightly numba-cuda-mlir (linux-aarch64)" + test-numba-cuda-mlir-windows: + name: "Nightly numba-cuda-mlir (win-64)" if: ${{ github.repository_owner == 'nvidia' }} needs: find-wheels permissions: contents: read actions: read secrets: inherit - uses: ./.github/workflows/test-wheel-linux.yml + uses: ./.github/workflows/test-wheel-windows.yml with: build-type: nightly - host-platform: linux-aarch64 + host-platform: win-64 build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }} run-id: ${{ needs.find-wheels.outputs.RUN_ID }} sha: ${{ needs.find-wheels.outputs.HEAD_SHA }} @@ -303,7 +303,7 @@ jobs: - test-numba-cuda-linux-aarch64 - test-numba-cuda-windows - test-numba-cuda-mlir-linux-64 - - test-numba-cuda-mlir-linux-aarch64 + - test-numba-cuda-mlir-windows - test-cuda-core-linux-64 - test-cuda-core-windows - test-standard-linux-aarch64 @@ -332,8 +332,8 @@ jobs: needs.test-numba-cuda-windows.result == 'failure' || needs.test-numba-cuda-mlir-linux-64.result == 'cancelled' || needs.test-numba-cuda-mlir-linux-64.result == 'failure' || - needs.test-numba-cuda-mlir-linux-aarch64.result == 'cancelled' || - needs.test-numba-cuda-mlir-linux-aarch64.result == 'failure' || + needs.test-numba-cuda-mlir-windows.result == 'cancelled' || + needs.test-numba-cuda-mlir-windows.result == 'failure' || needs.test-cuda-core-linux-64.result == 'cancelled' || needs.test-cuda-core-linux-64.result == 'failure' || 
needs.test-cuda-core-windows.result == 'cancelled' || diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index da80fbc20e6..5e88af78014 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -38,7 +38,8 @@ on: test-mode: description: > Test mode: 'standard' (default), 'nightly-pytorch', - 'nightly-numba-cuda', or 'nightly-cuda-core'. + 'nightly-numba-cuda', 'nightly-numba-cuda-mlir', or + 'nightly-cuda-core'. type: string default: 'standard' sha: @@ -388,6 +389,14 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: run-tests nightly-numba-cuda + - name: Install cuda-python wheels + numba-cuda-mlir + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: run-tests nightly-numba-cuda-mlir + - name: Install main pathfinder/bindings + released cuda-core if: ${{ inputs.test-mode == 'nightly-cuda-core' }} env: @@ -410,6 +419,14 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: python -m numba.runtests numba.cuda.tests + - name: Run numba-cuda-mlir tests + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_TESTS_DIR != '' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pushd "${NUMBA_CUDA_MLIR_TESTS_DIR}" + pytest -rxXs -v --durations=0 tests/ + popd + - name: Run released cuda-core tests if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_TESTS_DIR != '' }} shell: bash --noprofile --norc -xeuo pipefail {0} diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index ad036e5b8ca..cd0944f2000 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -96,11 +96,9 @@ linux: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'arm64', PY_VER: 
'3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda' } } - # nightly-numba-cuda-mlir (MLIR backend, mirrors nightly-numba-cuda coverage) - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + # nightly-numba-cuda-mlir (MLIR backend, linux-64 only) + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda-mlir' } } # nightly-cuda-core (released cuda-core from PyPI against main pathfinder/bindings) - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-cuda-core' } } # nightly-standard (arm64 nightly-only runners — per runner team request) @@ -143,5 +141,8 @@ windows: # nightly-numba-cuda - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', 
LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC', ENV: { MODE: 'nightly-numba-cuda' } } + # nightly-numba-cuda-mlir (MLIR backend, win-64) + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-numba-cuda-mlir' } } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-numba-cuda-mlir' } } # nightly-cuda-core (released cuda-core from PyPI against main pathfinder/bindings) - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM', ENV: { MODE: 'nightly-cuda-core' } } From a0ccd19bf6fb276ff54e6f3b4f863d6bc31d3138 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 04:15:34 +0000 Subject: [PATCH 05/13] Temporarily add push trigger to ci-nightly.yml for testing Remove before merging. --- .github/workflows/ci-nightly.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml index f05318707ab..c67f47cf085 100644 --- a/.github/workflows/ci-nightly.yml +++ b/.github/workflows/ci-nightly.yml @@ -16,6 +16,10 @@ concurrency: cancel-in-progress: true on: + push: + branches: + - "main" + - "pull-request/[0-9]+" schedule: # 2:17 AM UTC daily, after the midnight main CI build finishes. 
# Avoid minute 0 because GitHub documents high scheduled-workflow load From 9490bd3a4940fc0b82aa48eb7fa075ea0d332a21 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 04:35:12 +0000 Subject: [PATCH 06/13] CI: switch nightly-{cuda-core,numba-cuda-mlir} to actions/checkout for tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial approach used git inside the ubuntu:24.04 container to fetch the released version's test suite, but git is not installed on that container (install_unix_deps only pulls in jq/wget/g++/etc.) and its absence made the run steps silently skip via the tag-not-fetchable fallback. On Windows, git archive of just the cuda_core subtree also hit a dangling-symlink extraction failure (cuda_core/.git_archival.txt). Refactor to: - run-tests: just install wheels and expose the resolved release version (CUDA_CORE_RELEASED_VER / NUMBA_CUDA_MLIR_VER) and cuda-core test-group name via GITHUB_ENV. No more git operations. - test-wheel-{linux,windows}.yml: add an actions/checkout step per mode that pulls the matching release tag into a subdirectory (cuda-core-released / numba-cuda-mlir-released), then the follow-up test step installs that tag's test dep-group and runs pytest. For numba-cuda-mlir also pass --ignore=tests/benchmarks --ignore=tests/doc_examples to pytest: those directories import the `numba` package at module top and would fail collection, which is cuSIMT's expected behavior (see NVIDIA/numba-cuda-mlir#136 — cuSIMT intentionally does not depend on numba). 
--- .github/workflows/test-wheel-linux.yml | 38 ++++++++++++++++--- .github/workflows/test-wheel-windows.yml | 34 ++++++++++++++--- ci/tools/run-tests | 47 +++++------------------- 3 files changed, 72 insertions(+), 47 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 8b4899afccf..955cd845af6 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -436,16 +436,44 @@ jobs: if: ${{ inputs.test-mode == 'nightly-numba-cuda' }} run: python -m numba.runtests numba.cuda.tests + - name: Checkout numba-cuda-mlir tests at matching tag + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }} + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + repository: NVIDIA/numba-cuda-mlir + ref: v${{ env.NUMBA_CUDA_MLIR_VER }} + path: numba-cuda-mlir-released + - name: Run numba-cuda-mlir tests - if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_TESTS_DIR != '' }} + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }} run: | - pushd "${NUMBA_CUDA_MLIR_TESTS_DIR}" - pytest -rxXs -v --durations=0 tests/ + pushd numba-cuda-mlir-released + # Install this tag's test deps (pytest + plugins + ml-dtypes + ...). + pip install --upgrade "pip>=25.1" + pip install --group test + # Skip tests/benchmarks/ and tests/doc_examples/ — they import the + # numba package at collection time, which cuSIMT intentionally does + # not depend on. See NVIDIA/numba-cuda-mlir#136. 
+ pytest -rxXs -v --durations=0 \ + --ignore=tests/benchmarks \ + --ignore=tests/doc_examples \ + tests/ popd + - name: Checkout released cuda-core tests at matching tag + if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }} + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: cuda-core-v${{ env.CUDA_CORE_RELEASED_VER }} + path: cuda-core-released + - name: Run released cuda-core tests - if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_TESTS_DIR != '' }} + if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }} run: | - pushd "${CUDA_CORE_RELEASED_TESTS_DIR}/cuda_core" + pushd cuda-core-released/cuda_core + # Install the released tag's test group so we exercise the exact deps + # that cuda-core version shipped with. + pip install --upgrade "pip>=25.1" + pip install --group "${CUDA_CORE_TEST_GROUP}" pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ popd diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 5e88af78014..f8cea7fc0b9 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -419,18 +419,42 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: python -m numba.runtests numba.cuda.tests + - name: Checkout numba-cuda-mlir tests at matching tag + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }} + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + repository: NVIDIA/numba-cuda-mlir + ref: v${{ env.NUMBA_CUDA_MLIR_VER }} + path: numba-cuda-mlir-released + - name: Run numba-cuda-mlir tests - if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_TESTS_DIR != '' }} + if: ${{ inputs.test-mode == 'nightly-numba-cuda-mlir' && env.NUMBA_CUDA_MLIR_VER != '' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | - pushd 
"${NUMBA_CUDA_MLIR_TESTS_DIR}" - pytest -rxXs -v --durations=0 tests/ + pushd numba-cuda-mlir-released + pip install --upgrade "pip>=25.1" + pip install --group test + # Skip tests/benchmarks/ and tests/doc_examples/ — see + # NVIDIA/numba-cuda-mlir#136. + pytest -rxXs -v --durations=0 \ + --ignore=tests/benchmarks \ + --ignore=tests/doc_examples \ + tests/ popd + - name: Checkout released cuda-core tests at matching tag + if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }} + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: cuda-core-v${{ env.CUDA_CORE_RELEASED_VER }} + path: cuda-core-released + - name: Run released cuda-core tests - if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_TESTS_DIR != '' }} + if: ${{ inputs.test-mode == 'nightly-cuda-core' && env.CUDA_CORE_RELEASED_VER != '' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | - pushd "${CUDA_CORE_RELEASED_TESTS_DIR}/cuda_core" + pushd cuda-core-released/cuda_core + pip install --upgrade "pip>=25.1" + pip install --group "${CUDA_CORE_TEST_GROUP}" pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ popd diff --git a/ci/tools/run-tests b/ci/tools/run-tests index bcb487d23f8..783511b2f3c 100755 --- a/ci/tools/run-tests +++ b/ci/tools/run-tests @@ -114,32 +114,16 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then popd elif [[ "${test_module}" == "nightly-cuda-core" ]]; then # Test the *released* cuda-core (from PyPI) against *main*-built pathfinder - # and cuda-bindings. Fetches the released cuda-core's own test suite from - # the matching git tag so we exercise its full compiled surface, not the - # current main tests (which may reference APIs the released core doesn't - # have). + # and cuda-bindings. 
The workflow follows up with an actions/checkout of the + # matching cuda-core-v tag so the released version's own test suite + # (which is not shipped in the wheel) can be exercised. echo "Installing pathfinder + bindings from main + released cuda-core from PyPI" pip install "${PATHFINDER_WHL[@]}" "${BINDINGS_ARGS[@]}" "cuda-core[cu${TEST_CUDA_MAJOR}]" released_ver=$(pip show cuda-core | awk '/^Version:/{print $2}') - tag="cuda-core-v${released_ver}" - tests_dir="${TMPDIR:-/tmp}/cuda-core-released-${released_ver}" - echo "Fetching cuda-core tests from tag ${tag}" - if ! git fetch origin --tags "+refs/tags/${tag}:refs/tags/${tag}" >/dev/null 2>&1; then - echo "Warning: tag ${tag} not fetchable; skipping released cuda-core tests" - exit 0 - fi - mkdir -p "${tests_dir}" - git archive "${tag}" cuda_core | tar -x -C "${tests_dir}" - - # Install test deps from the released tag's pyproject.toml, not main's. - pushd "${tests_dir}/cuda_core" - echo "Installing cuda-core test deps from tag ${tag}" - pip install --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" - popd - if [[ -n "${GITHUB_ENV:-}" ]]; then - echo "CUDA_CORE_RELEASED_TESTS_DIR=${tests_dir}" >> "${GITHUB_ENV}" + echo "CUDA_CORE_RELEASED_VER=${released_ver}" >> "${GITHUB_ENV}" + echo "CUDA_CORE_TEST_GROUP=test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" >> "${GITHUB_ENV}" fi echo "Installed packages before released cuda-core tests:" pip list @@ -188,23 +172,12 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then popd if [[ "${test_module}" == "nightly-numba-cuda-mlir" ]]; then - # Fetch numba-cuda-mlir's own test suite from the matching git tag — - # the wheel does not ship test_*.py files. + # Expose the installed numba-cuda-mlir version so the workflow can + # actions/checkout the matching v tag from NVIDIA/numba-cuda-mlir + # (the wheel does not ship test_*.py files). 
installed_ver=$(pip show numba-cuda-mlir | awk '/^Version:/{print $2}') - tag="v${installed_ver}" - tests_dir="${TMPDIR:-/tmp}/numba-cuda-mlir-${installed_ver}" - echo "Cloning numba-cuda-mlir tests at tag ${tag}" - if git clone --depth 1 --branch "${tag}" https://github.com/NVIDIA/numba-cuda-mlir "${tests_dir}"; then - pushd "${tests_dir}" - # --group requires pip 25.1+; Ubuntu 24.04 stock ships older. - pip install --upgrade "pip>=25.1" - pip install --group test - popd - if [[ -n "${GITHUB_ENV:-}" ]]; then - echo "NUMBA_CUDA_MLIR_TESTS_DIR=${tests_dir}" >> "${GITHUB_ENV}" - fi - else - echo "Warning: numba-cuda-mlir tag ${tag} not clonable; skipping tests" + if [[ -n "${GITHUB_ENV:-}" ]]; then + echo "NUMBA_CUDA_MLIR_VER=${installed_ver}" >> "${GITHUB_ENV}" fi fi fi From f3770d9f472e664a2b9b3e503b8de5ad6fd3aef9 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 04:52:29 +0000 Subject: [PATCH 07/13] CI: pin numpy<2.5 (mlir) and pytest<9.1 (cuda-core released tests) Two nightly failure fixups after the first green iteration: nightly-numba-cuda-mlir: numba-cuda-mlir 0.4.0 has an inverted guard that registers an overload of np.row_stack on NumPy 2.x, and NumPy 2.5 removed that name entirely, so test collection fails with "AttributeError: module 'numpy' has no attribute 'row_stack'". Cap numpy to <2.5. See NVIDIA/numba-cuda-mlir#154. nightly-cuda-core: released cuda-core v1.0.1's test suite uses a parametrize argvalues pattern that pytest 9.1 rejects ("in parametrize the number of names (1)... must be equal to the number of values (3)"). The main-side fix was #2212 but it has not shipped in a cuda-core release yet. Cap pytest to <9.1 for the released-cuda-core test run only. 
--- .github/workflows/test-wheel-linux.yml | 4 ++++ .github/workflows/test-wheel-windows.yml | 3 +++ ci/tools/run-tests | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 955cd845af6..69575c5902d 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -475,5 +475,9 @@ jobs: # that cuda-core version shipped with. pip install --upgrade "pip>=25.1" pip install --group "${CUDA_CORE_TEST_GROUP}" + # Cap pytest below 9.1: released cuda-core <=1.0.1 has parametrize + # patterns that pytest 9.1 rejects; the main-side fix (#2212) has + # not yet shipped in a cuda-core release. + pip install "pytest<9.1" pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ popd diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index f8cea7fc0b9..f453a357bf0 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -456,5 +456,8 @@ jobs: pushd cuda-core-released/cuda_core pip install --upgrade "pip>=25.1" pip install --group "${CUDA_CORE_TEST_GROUP}" + # Cap pytest below 9.1 — released cuda-core <=1.0.1 has parametrize + # patterns that pytest 9.1 rejects (see #2212). + pip install "pytest<9.1" pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ popd diff --git a/ci/tools/run-tests b/ci/tools/run-tests index 783511b2f3c..c5c0cc12336 100755 --- a/ci/tools/run-tests +++ b/ci/tools/run-tests @@ -163,7 +163,9 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then ) elif [[ "${test_module}" == "nightly-numba-cuda-mlir" ]]; then echo "Installing pathfinder + bindings + core + numba-cuda-mlir" - PIP_ARGS+=("numba-cuda-mlir[cu${TEST_CUDA_MAJOR}]") + # numpy<2.5: numba-cuda-mlir 0.4.0 registers np.row_stack, which was + # removed in NumPy 2.5. See NVIDIA/numba-cuda-mlir#154. 
+ PIP_ARGS+=("numba-cuda-mlir[cu${TEST_CUDA_MAJOR}]" "numpy<2.5") fi pip install "${PIP_ARGS[@]}" From 7476a9f7feb0645324dbb65501620d458a006a0f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 06:35:06 +0000 Subject: [PATCH 08/13] CI: deselect known pre-existing failures in nightly-cuda-core and nightly-numba-cuda-mlir Applied only in the affected nightly-* pytest invocations; the released source trees under test are unmodified. nightly-numba-cuda-mlir (all 10 tests deselected are from cuSIMT): * CudaArraySetting::{test_no_sync_default_stream, test_no_sync_supplied_stream, test_sync} TestCudaArrayInterface::{test_consume_no_sync, test_consume_sync, test_launch_no_sync, test_launch_sync, test_launch_sync_two_streams, test_fortran_contiguous} Serial-pytest contamination of numba_cuda_mlir.cuda.cudadrv from an xfailed test in test_nrt_comprehensive.py. Upstream CI runs with `pytest -n auto --dist loadscope`, which isolates the offending side effect in a separate xdist worker; our nightly runs serially and hits the pollution. See NVIDIA/numba-cuda-mlir#135. * TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn Subprocess-invokes `cuobjdump`, which isn't on PATH in the base ubuntu:24.04 container. Filed as an upstream skip-guard bug. nightly-cuda-core (3 tests deselected are pre-existing v1.0.1 issues): * test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion] Expected drift: main cuda-bindings adds NvlinkVersion.VERSION_6_0 which v1.0.1's wrapper mapping predates. This mode intentionally pairs released core with main bindings, so this coverage-style test will stay red here until a cuda-core release catches up. * test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive Environment-dependent test: expects rlcompleter to crash without the tab-completion patch, but on Windows MCDM the pre-patch behavior is clean. Passes on Linux, fails on Windows MCDM. 
* test_memory.py::test_non_managed_resources_report_not_managed[pinned] Same underlying "Failed to allocate memory from pool" error that v1.0.1 already xfails in the sibling test_pinned_memory_resource_initialization (TODO(#9999)). cuda-python main has since fixed the parametrized case to route through _allocate_pinned_buffer_or_xfail(), but that fix hasn't shipped in a cuda-core release yet. --- .github/workflows/test-wheel-linux.yml | 40 +++++++++++++++++++++++- .github/workflows/test-wheel-windows.yml | 21 +++++++++++-- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 69575c5902d..f2d268a0581 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -454,9 +454,30 @@ jobs: # Skip tests/benchmarks/ and tests/doc_examples/ — they import the # numba package at collection time, which cuSIMT intentionally does # not depend on. See NVIDIA/numba-cuda-mlir#136. + # + # Deselects: + # - CudaArraySetting + TestCudaArrayInterface + test_fortran_contiguous: + # serial-pytest contamination of `numba_cuda_mlir.cuda.cudadrv` + # from an xfailed test in test_nrt_comprehensive.py. Upstream CI + # hides it by running with `-n auto --dist loadscope`. See + # NVIDIA/numba-cuda-mlir#135. + # - TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn: + # invokes `cuobjdump` subprocess, which is not on PATH in the + # base ubuntu:24.04 container. Will file an upstream skip-guard + # issue. 
pytest -rxXs -v --durations=0 \ --ignore=tests/benchmarks \ --ignore=tests/doc_examples \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_nvjitlink.py::TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn' \ tests/ popd @@ -479,5 +500,22 @@ jobs: # patterns that pytest 9.1 rejects; the main-side fix (#2212) has # not yet shipped in a cuda-core release. pip install "pytest<9.1" - pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ + # Deselects, all pre-existing in released cuda-core v1.0.1: + # - NvlinkVersion: expected drift; main's cuda-bindings adds + # VERSION_6_0, which the v1.0.1 wrapper mapping predates. The + # nightly-cuda-core mode intentionally exercises released-core + # against main-bindings, so this test is permanently red here. 
+ # - rlcompleter opt-out: the "should crash" expectation baked into + # v1.0.1 doesn't hold on all runners; passes on Linux, flakes on + # Windows MCDM. Environment-dependent, not a code signal. + # - test_non_managed_resources_report_not_managed[pinned]: same + # underlying "Failed to allocate memory from pool" that v1.0.1 + # already xfails in sibling test_pinned_memory_resource_initialization + # (TODO(#9999)). Main has since fixed this parametrization to + # xfail on the pinned branch too. + pytest -rxXs -v --durations=0 --randomly-dont-reorganize \ + --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' \ + --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive' \ + --deselect 'tests/test_memory.py::test_non_managed_resources_report_not_managed[pinned]' \ + tests/ popd diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index f453a357bf0..5ce84fe42fc 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -434,11 +434,21 @@ jobs: pushd numba-cuda-mlir-released pip install --upgrade "pip>=25.1" pip install --group test - # Skip tests/benchmarks/ and tests/doc_examples/ — see - # NVIDIA/numba-cuda-mlir#136. + # Deselects — see comments on the equivalent Linux step and + # NVIDIA/numba-cuda-mlir#135, #136. 
pytest -rxXs -v --durations=0 \ --ignore=tests/benchmarks \ --ignore=tests/doc_examples \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams' \ + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' \ + --deselect 'tests/numba_cuda_tests/cudadrv/test_nvjitlink.py::TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn' \ tests/ popd @@ -459,5 +469,10 @@ jobs: # Cap pytest below 9.1 — released cuda-core <=1.0.1 has parametrize # patterns that pytest 9.1 rejects (see #2212). pip install "pytest<9.1" - pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ + # Deselects — see comments on the equivalent Linux step. 
+ pytest -rxXs -v --durations=0 --randomly-dont-reorganize \ + --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' \ + --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive' \ + --deselect 'tests/test_memory.py::test_non_managed_resources_report_not_managed[pinned]' \ + tests/ popd From 456b5a95c323777fdfd62583e10d722ee66806bc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 06:48:20 +0000 Subject: [PATCH 09/13] CI: tighten deselects to per-platform failing sets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously applied the same list on both Linux and Windows workflows, which over-deselected — some tests only fail on one platform because the underlying issues (serial-pytest test-order in mlir, MCDM-only behavior in cuda-core) are platform-specific. Now: nightly-numba-cuda-mlir linux-64: TestCudaArrayInterface::{test_consume_no_sync, test_consume_sync, test_launch_no_sync, test_launch_sync, test_launch_sync_two_streams, test_fortran_contiguous} + TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn. win-64: CudaArraySetting::{test_no_sync_default_stream, test_no_sync_supplied_stream, test_sync} + TestCudaArrayInterface::test_fortran_contiguous. Test-order contamination in numba-cuda-mlir#135 surfaces different tests depending on collection order (linux-64 vs win-64 exercise different subsets), so the per-platform lists differ. cuobjdump-based TestLinkerDumpAssembly only fires on Linux because the ubuntu:24.04 container's PATH lacks cuobjdump; Windows runners ship it with the local CTK. nightly-cuda-core linux-64: test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]. 
win-64: NvlinkVersion (same as Linux) + test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive + test_memory.py::test_non_managed_resources_report_not_managed[pinned]. rlcompleter and pinned mempool tests only fail on Windows MCDM. NvlinkVersion fails on both (expected drift for the mode). --- .github/workflows/test-wheel-linux.yml | 38 +++++++----------------- .github/workflows/test-wheel-windows.yml | 21 +++++++------ 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index f2d268a0581..f96ba6eb09b 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -455,22 +455,18 @@ jobs: # numba package at collection time, which cuSIMT intentionally does # not depend on. See NVIDIA/numba-cuda-mlir#136. # - # Deselects: - # - CudaArraySetting + TestCudaArrayInterface + test_fortran_contiguous: - # serial-pytest contamination of `numba_cuda_mlir.cuda.cudadrv` - # from an xfailed test in test_nrt_comprehensive.py. Upstream CI - # hides it by running with `-n auto --dist loadscope`. See - # NVIDIA/numba-cuda-mlir#135. + # Deselects observed to fail on linux-64 only: + # - TestCudaArrayInterface::*: serial-pytest contamination of + # numba_cuda_mlir.cuda.cudadrv from an xfailed test in + # test_nrt_comprehensive.py. Upstream CI hides it via + # `-n auto --dist loadscope`. See NVIDIA/numba-cuda-mlir#135. # - TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn: - # invokes `cuobjdump` subprocess, which is not on PATH in the - # base ubuntu:24.04 container. Will file an upstream skip-guard - # issue. + # subprocess-invokes `cuobjdump`, which is not on PATH in the + # base ubuntu:24.04 container. Windows runners ship cuobjdump + # with the local CTK, so this doesn't repro there. 
pytest -rxXs -v --durations=0 \ --ignore=tests/benchmarks \ --ignore=tests/doc_examples \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' \ --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' \ --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' \ --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' \ @@ -500,22 +496,10 @@ jobs: # patterns that pytest 9.1 rejects; the main-side fix (#2212) has # not yet shipped in a cuda-core release. pip install "pytest<9.1" - # Deselects, all pre-existing in released cuda-core v1.0.1: - # - NvlinkVersion: expected drift; main's cuda-bindings adds - # VERSION_6_0, which the v1.0.1 wrapper mapping predates. The - # nightly-cuda-core mode intentionally exercises released-core - # against main-bindings, so this test is permanently red here. - # - rlcompleter opt-out: the "should crash" expectation baked into - # v1.0.1 doesn't hold on all runners; passes on Linux, flakes on - # Windows MCDM. Environment-dependent, not a code signal. - # - test_non_managed_resources_report_not_managed[pinned]: same - # underlying "Failed to allocate memory from pool" that v1.0.1 - # already xfails in sibling test_pinned_memory_resource_initialization - # (TODO(#9999)). Main has since fixed this parametrization to - # xfail on the pinned branch too. + # NvlinkVersion: expected drift on this mode. main cuda-bindings + # adds NvlinkVersion.VERSION_6_0 which v1.0.1's wrapper mapping + # predates. Fails on both platforms. 
pytest -rxXs -v --durations=0 --randomly-dont-reorganize \ --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' \ - --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive' \ - --deselect 'tests/test_memory.py::test_non_managed_resources_report_not_managed[pinned]' \ tests/ popd diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 5ce84fe42fc..a03c9401d9f 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -434,21 +434,16 @@ jobs: pushd numba-cuda-mlir-released pip install --upgrade "pip>=25.1" pip install --group test - # Deselects — see comments on the equivalent Linux step and - # NVIDIA/numba-cuda-mlir#135, #136. + # Deselects observed to fail on win-64 only (subset of the + # NVIDIA/numba-cuda-mlir#135 tests; different serial-pytest + # ordering surfaces different tests on Linux vs Windows). pytest -rxXs -v --durations=0 \ --ignore=tests/benchmarks \ --ignore=tests/doc_examples \ --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' \ --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' \ --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync' \ - --deselect 
'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams' \ --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_nvjitlink.py::TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn' \ tests/ popd @@ -469,7 +464,15 @@ jobs: # Cap pytest below 9.1 — released cuda-core <=1.0.1 has parametrize # patterns that pytest 9.1 rejects (see #2212). pip install "pytest<9.1" - # Deselects — see comments on the equivalent Linux step. + # NvlinkVersion: same expected drift as Linux (bindings adds + # VERSION_6_0 unknown to v1.0.1's wrapper mapping). + # rlcompleter opt-out: env-dependent assertion that only fails on + # Windows MCDM — passes on Linux. + # test_non_managed_resources_report_not_managed[pinned]: same + # MCDM mempool OOM that v1.0.1 already xfails in sibling + # test_pinned_memory_resource_initialization (TODO(#9999)). + # Main fixed the parametrized case via #2139 but v1.0.1 doesn't + # have the fix. pytest -rxXs -v --durations=0 --randomly-dont-reorganize \ --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' \ --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive' \ From 01ac84efeb1b7a49e2cc3757039ad026bf25d69e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 06:53:35 +0000 Subject: [PATCH 10/13] CI: version-gate the nightly-mode deselects so they auto-clean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each deselect is now wrapped in a bash conditional keyed on the installed release version. When a newer numba-cuda-mlir or cuda-core release ships with the referenced fix, the nightly picks it up automatically, the guard evaluates false, and the deselect drops — so the tests run against the new release. 
If they still fail we hear about it loudly rather than silently masking a regression. Current guards: - numba-cuda-mlir #135 tests + cuobjdump TestLinkerDumpAssembly: applied when installed numba-cuda-mlir version <= 0.4.0. - cuda-core NvlinkVersion / rlcompleter opt-out / pinned mempool: applied when installed cuda-core version <= 1.0.1. Structure keeps one conditional block per (mode, platform) with a comment above each deselect explaining the tracking issue. --- .github/workflows/test-wheel-linux.yml | 58 ++++++++++++++++-------- .github/workflows/test-wheel-windows.yml | 52 +++++++++++++-------- 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index f96ba6eb09b..25eb1f1c9f1 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -455,25 +455,35 @@ jobs: # numba package at collection time, which cuSIMT intentionally does # not depend on. See NVIDIA/numba-cuda-mlir#136. # - # Deselects observed to fail on linux-64 only: - # - TestCudaArrayInterface::*: serial-pytest contamination of - # numba_cuda_mlir.cuda.cudadrv from an xfailed test in - # test_nrt_comprehensive.py. Upstream CI hides it via - # `-n auto --dist loadscope`. See NVIDIA/numba-cuda-mlir#135. - # - TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn: - # subprocess-invokes `cuobjdump`, which is not on PATH in the - # base ubuntu:24.04 container. Windows runners ship cuobjdump - # with the local CTK, so this doesn't repro there. + # Version-gated deselects: when a newer numba-cuda-mlir release + # ships with the referenced fix, the guard evaluates false and the + # tests get run automatically. If they still fail on the newer + # version we hear about it loudly (rather than silently masking). 
+ DESELECTS=() + if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${NUMBA_CUDA_MLIR_VER}') <= Version('0.4.0') else 1)"; then + # NVIDIA/numba-cuda-mlir#135: serial-pytest contamination of + # numba_cuda_mlir.cuda.cudadrv from an xfailed test in + # test_nrt_comprehensive.py. Upstream CI hides it via + # `-n auto --dist loadscope`. + # + # test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn: + # subprocess-invokes `cuobjdump`, not on PATH in the base + # ubuntu:24.04 container. (No upstream fix yet — pending a + # skip-guard bug to be filed against NVIDIA/numba-cuda-mlir.) + DESELECTS+=( + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' + --deselect 'tests/numba_cuda_tests/cudadrv/test_nvjitlink.py::TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn' + ) + fi pytest -rxXs -v --durations=0 \ --ignore=tests/benchmarks \ --ignore=tests/doc_examples \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' \ - --deselect 
'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_nvjitlink.py::TestLinkerDumpAssembly::test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn' \ + "${DESELECTS[@]}" \ tests/ popd @@ -496,10 +506,18 @@ jobs: # patterns that pytest 9.1 rejects; the main-side fix (#2212) has # not yet shipped in a cuda-core release. pip install "pytest<9.1" - # NvlinkVersion: expected drift on this mode. main cuda-bindings - # adds NvlinkVersion.VERSION_6_0 which v1.0.1's wrapper mapping - # predates. Fails on both platforms. + # Version-gated deselect: drops automatically when a newer + # cuda-core release with the wrapper-mapping update ships. + DESELECTS=() + if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${CUDA_CORE_RELEASED_VER}') <= Version('1.0.1') else 1)"; then + # NvlinkVersion: v1.0.1's wrapper mapping predates + # NvlinkVersion.VERSION_6_0 which main cuda-bindings adds. + # Expected drift on this mode until released cuda-core catches up. 
+ DESELECTS+=( + --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' + ) + fi pytest -rxXs -v --durations=0 --randomly-dont-reorganize \ - --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' \ + "${DESELECTS[@]}" \ tests/ popd diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index a03c9401d9f..4a5e1ba14c6 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -434,16 +434,23 @@ jobs: pushd numba-cuda-mlir-released pip install --upgrade "pip>=25.1" pip install --group test - # Deselects observed to fail on win-64 only (subset of the - # NVIDIA/numba-cuda-mlir#135 tests; different serial-pytest - # ordering surfaces different tests on Linux vs Windows). + # Version-gated deselects — dropped automatically when newer + # cuSIMT release ships. See linux step for full rationale. + DESELECTS=() + if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${NUMBA_CUDA_MLIR_VER}') <= Version('0.4.0') else 1)"; then + # Subset of NVIDIA/numba-cuda-mlir#135 tests that surface on + # win-64 (different serial-pytest ordering than Linux). 
+ DESELECTS+=( + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' + ) + fi pytest -rxXs -v --durations=0 \ --ignore=tests/benchmarks \ --ignore=tests/doc_examples \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' \ - --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' \ - --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' \ + "${DESELECTS[@]}" \ tests/ popd @@ -464,18 +471,25 @@ jobs: # Cap pytest below 9.1 — released cuda-core <=1.0.1 has parametrize # patterns that pytest 9.1 rejects (see #2212). pip install "pytest<9.1" - # NvlinkVersion: same expected drift as Linux (bindings adds - # VERSION_6_0 unknown to v1.0.1's wrapper mapping). - # rlcompleter opt-out: env-dependent assertion that only fails on - # Windows MCDM — passes on Linux. - # test_non_managed_resources_report_not_managed[pinned]: same - # MCDM mempool OOM that v1.0.1 already xfails in sibling - # test_pinned_memory_resource_initialization (TODO(#9999)). - # Main fixed the parametrized case via #2139 but v1.0.1 doesn't - # have the fix. + # Version-gated deselects — dropped automatically when a newer + # cuda-core release ships. See linux step for full rationale on + # NvlinkVersion. 
The Windows-only tests are: + # - test_rlcompleter_patch: env-dependent expectation that + # passes on Linux, fails on Windows MCDM. + # - test_non_managed_resources_report_not_managed[pinned]: same + # MCDM mempool OOM v1.0.1 already xfails in + # test_pinned_memory_resource_initialization (TODO(#9999)); + # main fixed the parametrized case via #2139 but v1.0.1 lacks + # the fix. + DESELECTS=() + if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${CUDA_CORE_RELEASED_VER}') <= Version('1.0.1') else 1)"; then + DESELECTS+=( + --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' + --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive' + --deselect 'tests/test_memory.py::test_non_managed_resources_report_not_managed[pinned]' + ) + fi pytest -rxXs -v --durations=0 --randomly-dont-reorganize \ - --deselect 'tests/test_enum_coverage.py::test_wrapper_covers_all_binding_members[NvlinkVersion]' \ - --deselect 'tests/test_rlcompleter_patch.py::test_opt_out_env_var_disables_patch_even_when_interactive' \ - --deselect 'tests/test_memory.py::test_non_managed_resources_report_not_managed[pinned]' \ + "${DESELECTS[@]}" \ tests/ popd From bb8d9d9e930f2248415fa6adbd96b7cf66e8365b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 15:39:05 +0000 Subject: [PATCH 11/13] CI: broaden mlir deselect list to full #135 union across platforms The previous per-platform-tight lists were incomplete: NVIDIA/numba-cuda-mlir#135's import-time contamination poisons whichever tests reference cuda.cudadrv.driver AFTER the polluting xfail runs, and collection order varies between runs. Two consecutive Windows CI runs failed on different subsets (3 slicing tests one run, 5 interface tests the next). Deselect the full union of #135-listed tests + test_fortran_contiguous (observed to hit the same contamination) on both Linux and Windows. 
Same version guard (<= 0.4.0) still applies, so the whole block drops automatically when a newer numba-cuda-mlir release ships with the root-cause fix. Linux keeps the extra cuobjdump deselect (Linux-only environment issue). --- .github/workflows/test-wheel-linux.yml | 17 +++++++++++++---- .github/workflows/test-wheel-windows.yml | 11 +++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 25eb1f1c9f1..16af4b15d34 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -463,14 +463,23 @@ jobs: if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${NUMBA_CUDA_MLIR_VER}') <= Version('0.4.0') else 1)"; then # NVIDIA/numba-cuda-mlir#135: serial-pytest contamination of # numba_cuda_mlir.cuda.cudadrv from an xfailed test in - # test_nrt_comprehensive.py. Upstream CI hides it via - # `-n auto --dist loadscope`. + # test_nrt_comprehensive.py breaks any later test that + # touches cuda.cudadrv.driver. Upstream CI hides it via + # `-n auto --dist loadscope`. Which specific tests fail depends + # on collection order (we saw different subsets on linux-64 vs + # win-64 across runs), so we deselect the union of all tests + # #135 lists as vulnerable + test_fortran_contiguous (observed + # to hit the same contamination in our runs). # # test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn: # subprocess-invokes `cuobjdump`, not on PATH in the base - # ubuntu:24.04 container. (No upstream fix yet — pending a - # skip-guard bug to be filed against NVIDIA/numba-cuda-mlir.) + # ubuntu:24.04 container. (Linux-only; Windows runners ship + # cuobjdump with the local CTK. No upstream fix yet — pending + # a skip-guard bug to be filed against NVIDIA/numba-cuda-mlir.)
DESELECTS+=( + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' + --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 4a5e1ba14c6..0d9dc78d5da 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -436,14 +436,21 @@ jobs: pip install --group test # Version-gated deselects — dropped automatically when newer # cuSIMT release ships. See linux step for full rationale. + # NVIDIA/numba-cuda-mlir#135 poisons a subset of tests that + # varies across runs based on collection order, so we deselect + # the full union rather than trying to enumerate what happened + # to fail on the most recent nightly. DESELECTS=() if python -c "from packaging.version import Version; import sys; sys.exit(0 if Version('${NUMBA_CUDA_MLIR_VER}') <= Version('0.4.0') else 1)"; then - # Subset of NVIDIA/numba-cuda-mlir#135 tests that surface on - # win-64 (different serial-pytest ordering than Linux). 
DESELECTS+=( --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_default_stream' --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_no_sync_supplied_stream' --deselect 'tests/numba_cuda_tests/cudadrv/test_cuda_array_slicing.py::CudaArraySetting::test_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_no_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_consume_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_no_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync' + --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_launch_sync_two_streams' --deselect 'tests/numba_cuda_tests/cudapy/test_cuda_array_interface.py::TestCudaArrayInterface::test_fortran_contiguous' ) fi From 3b66391ef0fe05dab6c6d6503a749a965859ff65 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Jul 2026 15:56:40 +0000 Subject: [PATCH 12/13] Revert "cuda_pathfinder: pin nvshmem to <3.7 (was previously excluding only 3.7.0)" This reverts commit 2a42aa7d0bba0a58b3fe849b7318d5ca4189e8a8. 
--- cuda_pathfinder/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index 86f33b726e9..87c9ca4205c 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -29,7 +29,7 @@ cu12 = [ "nvidia-cusparselt-cu12", "nvidia-libmathdx-cu12", "nvidia-nccl-cu12; sys_platform != 'win32'", - "nvidia-nvshmem-cu12<3.7; sys_platform != 'win32'", + "nvidia-nvshmem-cu12!=3.7.0; sys_platform != 'win32'", ] cu13 = [ "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,cupti,profiler,nvvm]==13.*", @@ -43,7 +43,7 @@ cu13 = [ "nvidia-cusparselt-cu13", "nvidia-libmathdx-cu13", "nvidia-nccl-cu13; sys_platform != 'win32'", - "nvidia-nvshmem-cu13<3.7; sys_platform != 'win32'", + "nvidia-nvshmem-cu13!=3.7.0; sys_platform != 'win32'", ] host = [ # TODO: remove the Python 3.15 guard once 3.15 is officially supported From bd898645956ebf946f977272d47c01ce8da5493a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Jul 2026 05:46:18 +0000 Subject: [PATCH 13/13] Revert "Temporarily add push trigger to ci-nightly.yml for testing" This reverts commit a0ccd19bf6fb276ff54e6f3b4f863d6bc31d3138. --- .github/workflows/ci-nightly.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml index c67f47cf085..f05318707ab 100644 --- a/.github/workflows/ci-nightly.yml +++ b/.github/workflows/ci-nightly.yml @@ -16,10 +16,6 @@ concurrency: cancel-in-progress: true on: - push: - branches: - - "main" - - "pull-request/[0-9]+" schedule: # 2:17 AM UTC daily, after the midnight main CI build finishes. # Avoid minute 0 because GitHub documents high scheduled-workflow load